From 1f30306e2329e5a1f0c5dd39844d9bb0a0c04573 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Vladim=C3=ADr=20Vondru=C5=A1?= <mosra@centrum.cz>
Date: Tue, 19 Sep 2017 18:22:56 +0200
Subject: [PATCH 1/4] Make the internal link replacer function public.

Turn the nested replacer() closure into a _link_replacer() method so it
can be used from outside _update_content().
---
 pelican/contents.py | 126 ++++++++++++++++++++++----------------------
 1 file changed, 63 insertions(+), 63 deletions(-)

diff --git a/pelican/contents.py b/pelican/contents.py
index 15770fc87..a534dbaae 100644
--- a/pelican/contents.py
+++ b/pelican/contents.py
@@ -228,6 +228,68 @@ def get_url_setting(self, key):
         key = key if self.in_default_lang else 'lang_%s' % key
         return self._expand_settings(key)
 
+    def _link_replacer(self, siteurl, m):
+        what = m.group('what')
+        value = urlparse(m.group('value'))
+        path = value.path
+        origin = m.group('path')
+
+        # XXX Put this in a different location.
+        if what in {'filename', 'attach'}:
+            if path.startswith('/'):
+                path = path[1:]
+            else:
+                # relative to the source path of this content
+                path = self.get_relative_source_path(
+                    os.path.join(self.relative_dir, path)
+                )
+
+            if path not in self._context['filenames']:
+                unquoted_path = path.replace('%20', ' ')
+
+                if unquoted_path in self._context['filenames']:
+                    path = unquoted_path
+
+            linked_content = self._context['filenames'].get(path)
+            if linked_content:
+                if what == 'attach':
+                    if isinstance(linked_content, Static):
+                        linked_content.attach_to(self)
+                    else:
+                        logger.warning(
+                            "%s used {attach} link syntax on a "
+                            "non-static file. Use {filename} instead.",
+                            self.get_relative_source_path())
+                origin = '/'.join((siteurl, linked_content.url))
+                origin = origin.replace('\\', '/')  # for Windows paths.
+            else:
+                logger.warning(
+                    "Unable to find '%s', skipping url replacement.",
+                    value.geturl(), extra={
+                        'limit_msg': ("Other resources were not found "
+                                      "and their urls not replaced")})
+        elif what == 'category':
+            origin = '/'.join((siteurl, Category(path, self.settings).url))
+        elif what == 'tag':
+            origin = '/'.join((siteurl, Tag(path, self.settings).url))
+        elif what == 'index':
+            origin = '/'.join((siteurl, self.settings['INDEX_SAVE_AS']))
+        elif what == 'author':
+            origin = '/'.join((siteurl, Author(path, self.settings).url))
+        else:
+            logger.warning(
+                "Replacement Indicator '%s' not recognized, "
+                "skipping replacement",
+                what)
+
+        # keep all other parts, such as query, fragment, etc.
+        parts = list(value)
+        parts[2] = origin
+        origin = urlunparse(parts)
+
+        return ''.join((m.group('markup'), m.group('quote'), origin,
+                        m.group('quote')))
+
     def _update_content(self, content, siteurl):
         """Update the content attribute.
 
@@ -251,69 +313,7 @@ def _update_content(self, content, siteurl):
             \2""".format(instrasite_link_regex)
         hrefs = re.compile(regex, re.X)
 
-        def replacer(m):
-            what = m.group('what')
-            value = urlparse(m.group('value'))
-            path = value.path
-            origin = m.group('path')
-
-            # XXX Put this in a different location.
-            if what in {'filename', 'attach'}:
-                if path.startswith('/'):
-                    path = path[1:]
-                else:
-                    # relative to the source path of this content
-                    path = self.get_relative_source_path(
-                        os.path.join(self.relative_dir, path)
-                    )
-
-                if path not in self._context['filenames']:
-                    unquoted_path = path.replace('%20', ' ')
-
-                    if unquoted_path in self._context['filenames']:
-                        path = unquoted_path
-
-                linked_content = self._context['filenames'].get(path)
-                if linked_content:
-                    if what == 'attach':
-                        if isinstance(linked_content, Static):
-                            linked_content.attach_to(self)
-                        else:
-                            logger.warning(
-                                "%s used {attach} link syntax on a "
-                                "non-static file. Use {filename} instead.",
-                                self.get_relative_source_path())
-                    origin = '/'.join((siteurl, linked_content.url))
-                    origin = origin.replace('\\', '/')  # for Windows paths.
-                else:
-                    logger.warning(
-                        "Unable to find '%s', skipping url replacement.",
-                        value.geturl(), extra={
-                            'limit_msg': ("Other resources were not found "
-                                          "and their urls not replaced")})
-            elif what == 'category':
-                origin = '/'.join((siteurl, Category(path, self.settings).url))
-            elif what == 'tag':
-                origin = '/'.join((siteurl, Tag(path, self.settings).url))
-            elif what == 'index':
-                origin = '/'.join((siteurl, self.settings['INDEX_SAVE_AS']))
-            elif what == 'author':
-                origin = '/'.join((siteurl, Author(path, self.settings).url))
-            else:
-                logger.warning(
-                    "Replacement Indicator '%s' not recognized, "
-                    "skipping replacement",
-                    what)
-
-            # keep all other parts, such as query, fragment, etc.
-            parts = list(value)
-            parts[2] = origin
-            origin = urlunparse(parts)
-
-            return ''.join((m.group('markup'), m.group('quote'), origin,
-                            m.group('quote')))
-
-        return hrefs.sub(replacer, content)
+        return hrefs.sub(lambda m: self._link_replacer(siteurl, m), content)
 
     def get_siteurl(self):
         return self._context.get('localsiteurl', '')

From fb587e1ae636c1a2723c1a87e75f20ec121021fa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Vladim=C3=ADr=20Vondru=C5=A1?= <mosra@centrum.cz>
Date: Sun, 10 Dec 2017 21:38:28 +0100
Subject: [PATCH 2/4] tests: avoid invalid HTML markup in the tests.

---
 pelican/tests/test_contents.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pelican/tests/test_contents.py b/pelican/tests/test_contents.py
index d028c7a1a..af9b06bb1 100644
--- a/pelican/tests/test_contents.py
+++ b/pelican/tests/test_contents.py
@@ -294,28 +294,28 @@ def test_intrasite_link(self):
         args['content'] = (
             'A simple test, with a '
             '<a href="|filename|article.rst'
-            '?utm_whatever=234&highlight=word">link</a>'
+            '?utm_whatever=234&amp;highlight=word">link</a>'
         )
         content = Page(**args).get_content('http://notmyidea.org')
         self.assertEqual(
             content,
             'A simple test, with a '
             '<a href="http://notmyidea.org/article.html'
-            '?utm_whatever=234&highlight=word">link</a>'
+            '?utm_whatever=234&amp;highlight=word">link</a>'
         )
 
         # combination
         args['content'] = (
             'A simple test, with a '
             '<a href="|filename|article.rst'
-            '?utm_whatever=234&highlight=word#section-2">link</a>'
+            '?utm_whatever=234&amp;highlight=word#section-2">link</a>'
         )
         content = Page(**args).get_content('http://notmyidea.org')
         self.assertEqual(
             content,
             'A simple test, with a '
             '<a href="http://notmyidea.org/article.html'
-            '?utm_whatever=234&highlight=word#section-2">link</a>'
+            '?utm_whatever=234&amp;highlight=word#section-2">link</a>'
         )
 
         # also test for summary in metadata

From 96eebd8ea37a226c1f9925b647d4c72affdae0e4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Vladim=C3=ADr=20Vondru=C5=A1?= <mosra@centrum.cz>
Date: Sun, 10 Dec 2017 21:47:46 +0100
Subject: [PATCH 3/4] Test that SITEURL with special characters gets escaped in
 link replacement.

---
 pelican/tests/test_contents.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/pelican/tests/test_contents.py b/pelican/tests/test_contents.py
index af9b06bb1..d2385905c 100644
--- a/pelican/tests/test_contents.py
+++ b/pelican/tests/test_contents.py
@@ -331,6 +331,21 @@ def test_intrasite_link(self):
             '<a href="http://notmyidea.org/article.html">link</a>'
         )
 
+        # SITEURL with characters that should be escaped
+        args['content'] = (
+            'A simple test, with a '
+            '<a href="|filename|article.rst'
+            '#highlight=&quot;word&quot;">link</a>'
+        )
+        content = Page(**args).get_content('http://notmyidea.org/'
+                                           '?app=blog&path=')
+        self.assertEqual(
+            content,
+            'A simple test, with a '
+            '<a href="http://notmyidea.org/?app=blog&amp;path='
+            '/article.html#highlight=&quot;word&quot;">link</a>'
+        )
+
     def test_intrasite_link_more(self):
         # type does not take unicode in PY2 and bytes in PY3, which in
         # combination with unicode literals leads to following insane line:

From 4462d84461fc17700bfa59ab7a24d35388391948 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Vladim=C3=ADr=20Vondru=C5=A1?= <mosra@centrum.cz>
Date: Sun, 10 Dec 2017 21:48:50 +0100
Subject: [PATCH 4/4] Fix link replacer to properly escape special HTML
 characters.

---
 pelican/contents.py | 25 ++++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/pelican/contents.py b/pelican/contents.py
index a534dbaae..e434d2f90 100644
--- a/pelican/contents.py
+++ b/pelican/contents.py
@@ -26,6 +26,25 @@
 
 logger = logging.getLogger(__name__)
 
+try:
+    import html
+except ImportError:
+    # html.escape() (py3.2+) / html.unescape() (py3.4+) don't exist on py2.7
+    # https://wiki.python.org/moin/EscapingHtml
+    from xml.sax.saxutils import escape, unescape
+
+    class html(object):
+        _html_escape_table = {'"': "&quot;",
+                              "'": "&apos;"}
+        _html_unescape_table = {'&quot;': '"',
+                                '&apos;': "'"}
+
+        @classmethod
+        def escape(cls, v): return escape(v, cls._html_escape_table)
+
+        @classmethod
+        def unescape(cls, v): return unescape(v, cls._html_unescape_table)
+
 
 @python_2_unicode_compatible
 class Content(object):
@@ -230,9 +249,9 @@ def get_url_setting(self, key):
 
     def _link_replacer(self, siteurl, m):
         what = m.group('what')
-        value = urlparse(m.group('value'))
+        value = urlparse(html.unescape(m.group('value')))
         path = value.path
-        origin = m.group('path')
+        origin = html.unescape(m.group('path'))
 
         # XXX Put this in a different location.
         if what in {'filename', 'attach'}:
@@ -285,7 +304,7 @@ def _link_replacer(self, siteurl, m):
         # keep all other parts, such as query, fragment, etc.
         parts = list(value)
         parts[2] = origin
-        origin = urlunparse(parts)
+        origin = html.escape(urlunparse(parts))
 
         return ''.join((m.group('markup'), m.group('quote'), origin,
                         m.group('quote')))