From 1f30306e2329e5a1f0c5dd39844d9bb0a0c04573 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vladim=C3=ADr=20Vondru=C5=A1?= Date: Tue, 19 Sep 2017 18:22:56 +0200 Subject: [PATCH 1/4] Make the internal link replacer function public. So it can be used from outside. --- pelican/contents.py | 126 ++++++++++++++++++++++---------------------- 1 file changed, 63 insertions(+), 63 deletions(-) diff --git a/pelican/contents.py b/pelican/contents.py index 15770fc87..a534dbaae 100644 --- a/pelican/contents.py +++ b/pelican/contents.py @@ -228,6 +228,68 @@ def get_url_setting(self, key): key = key if self.in_default_lang else 'lang_%s' % key return self._expand_settings(key) + def _link_replacer(self, siteurl, m): + what = m.group('what') + value = urlparse(m.group('value')) + path = value.path + origin = m.group('path') + + # XXX Put this in a different location. + if what in {'filename', 'attach'}: + if path.startswith('/'): + path = path[1:] + else: + # relative to the source path of this content + path = self.get_relative_source_path( + os.path.join(self.relative_dir, path) + ) + + if path not in self._context['filenames']: + unquoted_path = path.replace('%20', ' ') + + if unquoted_path in self._context['filenames']: + path = unquoted_path + + linked_content = self._context['filenames'].get(path) + if linked_content: + if what == 'attach': + if isinstance(linked_content, Static): + linked_content.attach_to(self) + else: + logger.warning( + "%s used {attach} link syntax on a " + "non-static file. Use {filename} instead.", + self.get_relative_source_path()) + origin = '/'.join((siteurl, linked_content.url)) + origin = origin.replace('\\', '/') # for Windows paths. 
+ else: + logger.warning( + "Unable to find '%s', skipping url replacement.", + value.geturl(), extra={ + 'limit_msg': ("Other resources were not found " + "and their urls not replaced")}) + elif what == 'category': + origin = '/'.join((siteurl, Category(path, self.settings).url)) + elif what == 'tag': + origin = '/'.join((siteurl, Tag(path, self.settings).url)) + elif what == 'index': + origin = '/'.join((siteurl, self.settings['INDEX_SAVE_AS'])) + elif what == 'author': + origin = '/'.join((siteurl, Author(path, self.settings).url)) + else: + logger.warning( + "Replacement Indicator '%s' not recognized, " + "skipping replacement", + what) + + # keep all other parts, such as query, fragment, etc. + parts = list(value) + parts[2] = origin + origin = urlunparse(parts) + + return ''.join((m.group('markup'), m.group('quote'), origin, + m.group('quote'))) + def _update_content(self, content, siteurl): """Update the content attribute. @@ -251,69 +313,7 @@ def _update_content(self, content, siteurl): \2""".format(instrasite_link_regex) hrefs = re.compile(regex, re.X) - def replacer(m): - what = m.group('what') - value = urlparse(m.group('value')) - path = value.path - origin = m.group('path') - - # XXX Put this in a different location. - if what in {'filename', 'attach'}: - if path.startswith('/'): - path = path[1:] - else: - # relative to the source path of this content - path = self.get_relative_source_path( - os.path.join(self.relative_dir, path) - ) - - if path not in self._context['filenames']: - unquoted_path = path.replace('%20', ' ') - - if unquoted_path in self._context['filenames']: - path = unquoted_path - - linked_content = self._context['filenames'].get(path) - if linked_content: - if what == 'attach': - if isinstance(linked_content, Static): - linked_content.attach_to(self) - else: - logger.warning( - "%s used {attach} link syntax on a " - "non-static file. 
Use {filename} instead.", - self.get_relative_source_path()) - origin = '/'.join((siteurl, linked_content.url)) - origin = origin.replace('\\', '/') # for Windows paths. - else: - logger.warning( - "Unable to find '%s', skipping url replacement.", - value.geturl(), extra={ - 'limit_msg': ("Other resources were not found " - "and their urls not replaced")}) - elif what == 'category': - origin = '/'.join((siteurl, Category(path, self.settings).url)) - elif what == 'tag': - origin = '/'.join((siteurl, Tag(path, self.settings).url)) - elif what == 'index': - origin = '/'.join((siteurl, self.settings['INDEX_SAVE_AS'])) - elif what == 'author': - origin = '/'.join((siteurl, Author(path, self.settings).url)) - else: - logger.warning( - "Replacement Indicator '%s' not recognized, " - "skipping replacement", - what) - - # keep all other parts, such as query, fragment, etc. - parts = list(value) - parts[2] = origin - origin = urlunparse(parts) - - return ''.join((m.group('markup'), m.group('quote'), origin, - m.group('quote'))) - - return hrefs.sub(replacer, content) + return hrefs.sub(lambda m: self._link_replacer(siteurl, m), content) def get_siteurl(self): return self._context.get('localsiteurl', '') From fb587e1ae636c1a2723c1a87e75f20ec121021fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vladim=C3=ADr=20Vondru=C5=A1?= Date: Sun, 10 Dec 2017 21:38:28 +0100 Subject: [PATCH 2/4] tests: avoid invalid HTML markup in the tests. 
--- pelican/tests/test_contents.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pelican/tests/test_contents.py b/pelican/tests/test_contents.py index d028c7a1a..af9b06bb1 100644 --- a/pelican/tests/test_contents.py +++ b/pelican/tests/test_contents.py @@ -294,28 +294,28 @@ def test_intrasite_link(self): args['content'] = ( 'A simple test, with a ' 'link' + '?utm_whatever=234&highlight=word">link' ) content = Page(**args).get_content('http://notmyidea.org') self.assertEqual( content, 'A simple test, with a ' 'link' + '?utm_whatever=234&highlight=word">link' ) # combination args['content'] = ( 'A simple test, with a ' 'link' + '?utm_whatever=234&highlight=word#section-2">link' ) content = Page(**args).get_content('http://notmyidea.org') self.assertEqual( content, 'A simple test, with a ' 'link' + '?utm_whatever=234&highlight=word#section-2">link' ) # also test for summary in metadata From 96eebd8ea37a226c1f9925b647d4c72affdae0e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vladim=C3=ADr=20Vondru=C5=A1?= Date: Sun, 10 Dec 2017 21:47:46 +0100 Subject: [PATCH 3/4] Test that SITEURL with special characters gets escaped in link replacement. 
--- pelican/tests/test_contents.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pelican/tests/test_contents.py b/pelican/tests/test_contents.py index af9b06bb1..d2385905c 100644 --- a/pelican/tests/test_contents.py +++ b/pelican/tests/test_contents.py @@ -331,6 +331,21 @@ def test_intrasite_link(self): 'link' ) + # SITEURL with characters that should be escaped + args['content'] = ( + 'A simple test, with a ' + 'link' + ) + content = Page(**args).get_content('http://notmyidea.org/' + '?app=blog&path=') + self.assertEqual( + content, + 'A simple test, with a ' + 'link' + ) + def test_intrasite_link_more(self): # type does not take unicode in PY2 and bytes in PY3, which in # combination with unicode literals leads to following insane line: From 4462d84461fc17700bfa59ab7a24d35388391948 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vladim=C3=ADr=20Vondru=C5=A1?= Date: Sun, 10 Dec 2017 21:48:50 +0100 Subject: [PATCH 4/4] Fix link replacer to properly escape special HTML characters. 
--- pelican/contents.py | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/pelican/contents.py b/pelican/contents.py index a534dbaae..e434d2f90 100644 --- a/pelican/contents.py +++ b/pelican/contents.py @@ -26,6 +26,25 @@ logger = logging.getLogger(__name__) +try: + import html +except ImportError: + # html.escape()/html.unescape() is since Python 3.2, do this for py2.7 + # https://wiki.python.org/moin/EscapingHtml + from xml.sax.saxutils import escape, unescape + + class html(object): + _html_escape_table = {'"': "&quot;", + "'": "&apos;"} + _html_unescape_table = {'&quot;': '"', + '&apos;': "'"} + + @classmethod + def escape(cls, v): return escape(v, cls._html_escape_table) + + @classmethod + def unescape(cls, v): return unescape(v, cls._html_unescape_table) + @python_2_unicode_compatible class Content(object): @@ -230,9 +249,9 @@ def get_url_setting(self, key): def _link_replacer(self, siteurl, m): what = m.group('what') - value = urlparse(m.group('value')) + value = urlparse(html.unescape(m.group('value'))) path = value.path - origin = m.group('path') + origin = html.unescape(m.group('path')) # XXX Put this in a different location. if what in {'filename', 'attach'}: @@ -285,7 +304,7 @@ def _link_replacer(self, siteurl, m): # keep all other parts, such as query, fragment, etc. parts = list(value) parts[2] = origin - origin = urlunparse(parts) + origin = html.escape(urlunparse(parts)) return ''.join((m.group('markup'), m.group('quote'), origin, m.group('quote')))