Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Escape special HTML chars coming from SITEURL in the internal link replacer #2260

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
145 changes: 82 additions & 63 deletions pelican/contents.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,25 @@

logger = logging.getLogger(__name__)

try:
    import html
    # html.escape() has existed since Python 3.2, but html.unescape() was
    # only added in 3.4.  Treat a module lacking either one as unusable so
    # the fallback below is installed instead of failing later with an
    # AttributeError at the first call site.
    if not (hasattr(html, 'escape') and hasattr(html, 'unescape')):
        raise ImportError('html module lacks escape()/unescape()')
except ImportError:
    # No usable stdlib ``html`` module (e.g. py2.7); build an equivalent
    # from xml.sax.saxutils as described in
    # https://wiki.python.org/moin/EscapingHtml
    from xml.sax.saxutils import escape, unescape

    class html(object):
        """Minimal stand-in exposing ``html.escape`` / ``html.unescape``.

        Only the two functions used by this module are provided.  They
        delegate to ``xml.sax.saxutils`` and extend it with the two quote
        characters, so text can be placed safely inside a quoted HTML
        attribute value.
        """

        # saxutils handles ``&``, ``<`` and ``>`` by itself; the quote
        # characters have to be supplied as extra entities.
        _html_escape_table = {'"': "&quot;",
                              "'": "&apos;"}
        # Exact inverse of the escape table above.
        _html_unescape_table = {'&quot;': '"',
                                '&apos;': "'"}

        @classmethod
        def escape(cls, v):
            """Return ``v`` with ``&``, ``<``, ``>`` and quotes escaped."""
            return escape(v, cls._html_escape_table)

        @classmethod
        def unescape(cls, v):
            """Return ``v`` with the entities produced by escape() decoded.

            Unlike the stdlib ``html.unescape`` this only knows the five
            entities ``&amp;``, ``&lt;``, ``&gt;``, ``&quot;`` and
            ``&apos;``; other character references are left untouched.
            """
            return unescape(v, cls._html_unescape_table)


@python_2_unicode_compatible
class Content(object):
Expand Down Expand Up @@ -228,6 +247,68 @@ def get_url_setting(self, key):
key = key if self.in_default_lang else 'lang_%s' % key
return self._expand_settings(key)

def _link_replacer(self, siteurl, m):
    """Return the replacement text for one matched intra-site link.

    ``m`` is a regex match exposing the named groups ``markup``,
    ``quote``, ``path``, ``what`` and ``value``.  The link value is
    HTML-unescaped before it is parsed, its path component is swapped
    for the resolved target prefixed with ``siteurl``, and the rebuilt
    URL is HTML-escaped again before being placed back between the
    original quote characters.

    When the link cannot be resolved (unknown ``what`` or a file that is
    not in the ``filenames`` context) a warning is logged and the
    unescaped ``path`` group is used as the path component instead.
    """
    kind = m.group('what')
    parsed = urlparse(html.unescape(m.group('value')))
    target = parsed.path
    # Fallback used when none of the branches below resolves the link.
    resolved = html.unescape(m.group('path'))

    def on_site(tail):
        # Prefix a site-relative URL with the site URL.
        return '/'.join((siteurl, tail))

    # XXX Put this in a different location.
    if kind in ('filename', 'attach'):
        if target.startswith('/'):
            target = target[1:]
        else:
            # relative to the source path of this content
            joined = os.path.join(self.relative_dir, target)
            target = self.get_relative_source_path(joined)

        if target not in self._context['filenames']:
            # Second chance: look the path up with %20 turned into spaces.
            with_spaces = target.replace('%20', ' ')
            if with_spaces in self._context['filenames']:
                target = with_spaces

        found = self._context['filenames'].get(target)
        if not found:
            logger.warning(
                "Unable to find '%s', skipping url replacement.",
                parsed.geturl(), extra={
                    'limit_msg': ("Other resources were not found "
                                  "and their urls not replaced")})
        else:
            if kind == 'attach':
                if not isinstance(found, Static):
                    logger.warning(
                        "%s used {attach} link syntax on a "
                        "non-static file. Use {filename} instead.",
                        self.get_relative_source_path())
                else:
                    found.attach_to(self)
            # Backslashes are normalised for Windows paths.
            resolved = on_site(found.url).replace('\\', '/')
    elif kind == 'category':
        resolved = on_site(Category(target, self.settings).url)
    elif kind == 'tag':
        resolved = on_site(Tag(target, self.settings).url)
    elif kind == 'index':
        resolved = on_site(self.settings['INDEX_SAVE_AS'])
    elif kind == 'author':
        resolved = on_site(Author(target, self.settings).url)
    else:
        logger.warning(
            "Replacement Indicator '%s' not recognized, "
            "skipping replacement",
            kind)

    # keep all other parts, such as query, fragment, etc.
    components = list(parsed)
    components[2] = resolved
    escaped = html.escape(urlunparse(components))

    quote = m.group('quote')
    return ''.join((m.group('markup'), quote, escaped, quote))

def _update_content(self, content, siteurl):
"""Update the content attribute.

Expand All @@ -251,69 +332,7 @@ def _update_content(self, content, siteurl):
\2""".format(instrasite_link_regex)
hrefs = re.compile(regex, re.X)

def replacer(m):
    """Return the replacement text for one matched intra-site link.

    ``m`` is a regex match exposing the named groups ``markup``,
    ``quote``, ``path``, ``what`` and ``value``.  ``self`` and
    ``siteurl`` are taken from the enclosing scope.  The path component
    of the link value is swapped for the resolved target prefixed with
    ``siteurl``; query, fragment and the other URL parts are kept.

    When the link cannot be resolved (unknown ``what`` or a file that is
    not in the ``filenames`` context) a warning is logged and the
    ``path`` group is used as the path component instead.
    """
    kind = m.group('what')
    parsed = urlparse(m.group('value'))
    target = parsed.path
    # Fallback used when none of the branches below resolves the link.
    resolved = m.group('path')

    def on_site(tail):
        # Prefix a site-relative URL with the site URL.
        return '/'.join((siteurl, tail))

    # XXX Put this in a different location.
    if kind in ('filename', 'attach'):
        if target.startswith('/'):
            target = target[1:]
        else:
            # relative to the source path of this content
            joined = os.path.join(self.relative_dir, target)
            target = self.get_relative_source_path(joined)

        if target not in self._context['filenames']:
            # Second chance: look the path up with %20 turned into spaces.
            with_spaces = target.replace('%20', ' ')
            if with_spaces in self._context['filenames']:
                target = with_spaces

        found = self._context['filenames'].get(target)
        if not found:
            logger.warning(
                "Unable to find '%s', skipping url replacement.",
                parsed.geturl(), extra={
                    'limit_msg': ("Other resources were not found "
                                  "and their urls not replaced")})
        else:
            if kind == 'attach':
                if not isinstance(found, Static):
                    logger.warning(
                        "%s used {attach} link syntax on a "
                        "non-static file. Use {filename} instead.",
                        self.get_relative_source_path())
                else:
                    found.attach_to(self)
            # Backslashes are normalised for Windows paths.
            resolved = on_site(found.url).replace('\\', '/')
    elif kind == 'category':
        resolved = on_site(Category(target, self.settings).url)
    elif kind == 'tag':
        resolved = on_site(Tag(target, self.settings).url)
    elif kind == 'index':
        resolved = on_site(self.settings['INDEX_SAVE_AS'])
    elif kind == 'author':
        resolved = on_site(Author(target, self.settings).url)
    else:
        logger.warning(
            "Replacement Indicator '%s' not recognized, "
            "skipping replacement",
            kind)

    # keep all other parts, such as query, fragment, etc.
    components = list(parsed)
    components[2] = resolved
    rebuilt = urlunparse(components)

    quote = m.group('quote')
    return ''.join((m.group('markup'), quote, rebuilt, quote))

return hrefs.sub(replacer, content)
return hrefs.sub(lambda m: self._link_replacer(siteurl, m), content)

def get_siteurl(self):
return self._context.get('localsiteurl', '')
Expand Down
23 changes: 19 additions & 4 deletions pelican/tests/test_contents.py
Original file line number Diff line number Diff line change
Expand Up @@ -294,28 +294,28 @@ def test_intrasite_link(self):
args['content'] = (
'A simple test, with a '
'<a href="|filename|article.rst'
'?utm_whatever=234&highlight=word">link</a>'
'?utm_whatever=234&amp;highlight=word">link</a>'
)
content = Page(**args).get_content('http://notmyidea.org')
self.assertEqual(
content,
'A simple test, with a '
'<a href="http://notmyidea.org/article.html'
'?utm_whatever=234&highlight=word">link</a>'
'?utm_whatever=234&amp;highlight=word">link</a>'
)

# combination
args['content'] = (
'A simple test, with a '
'<a href="|filename|article.rst'
'?utm_whatever=234&highlight=word#section-2">link</a>'
'?utm_whatever=234&amp;highlight=word#section-2">link</a>'
)
content = Page(**args).get_content('http://notmyidea.org')
self.assertEqual(
content,
'A simple test, with a '
'<a href="http://notmyidea.org/article.html'
'?utm_whatever=234&highlight=word#section-2">link</a>'
'?utm_whatever=234&amp;highlight=word#section-2">link</a>'
)

# also test for summary in metadata
Expand All @@ -331,6 +331,21 @@ def test_intrasite_link(self):
'<a href="http://notmyidea.org/article.html">link</a>'
)

# SITEURL with characters that should be escaped
args['content'] = (
'A simple test, with a '
'<a href="|filename|article.rst'
'#highlight=&quot;word&quot;">link</a>'
)
content = Page(**args).get_content('http://notmyidea.org/'
'?app=blog&path=')
self.assertEqual(
content,
'A simple test, with a '
'<a href="http://notmyidea.org/?app=blog&amp;path='
'/article.html#highlight=&quot;word&quot;">link</a>'
)

def test_intrasite_link_more(self):
# type does not take unicode in PY2 and bytes in PY3, which in
# combination with unicode literals leads to the following insane line:
Expand Down