Rewrite URLs found in HTML attributes without unescaping them in the output.

html_rewriter.py: keep a reference to the stock html_parser.unescape (when the
module provides one) before replacing it with an identity function.  The
attribute value is unescaped only to compute the rewritten URL; if unescaping
changed the value and the URL was rewritten, the original escaped text is
substituted back into the result.  try_unescape no longer encodes its result
to UTF-8 bytes.

Review adjustments to the added lines (hunk line counts are unchanged):
  * module-level "except:" narrowed to "except AttributeError:", the error an
    absent module attribute raises;
  * unused "as e" binding dropped in try_unescape;
  * "if orig_unescape:" spelled as an explicit "is not None" check.

NOTE(review): the line structure of this patch was reconstructed from a
whitespace-collapsed copy; blank context lines were inferred from the hunk
line counts.  The doctest lines in the test_html_rewriter.py hunk appear to
have lost their HTML markup (the parse() arguments and expected outputs are
empty) -- restore them from the original commit before applying.

diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py
index 4689f74b..ccc8bf02 100644
--- a/pywb/rewrite/html_rewriter.py
+++ b/pywb/rewrite/html_rewriter.py
@@ -14,7 +14,13 @@ from pywb.rewrite.regex_rewriters import JSRewriter, CSSRewriter
 from pywb.rewrite.content_rewriter import StreamingRewriter

 import six.moves.html_parser
-six.moves.html_parser.unescape = lambda x: x
+
+try:
+    orig_unescape = six.moves.html_parser.unescape
+    six.moves.html_parser.unescape = lambda x: x
+except AttributeError:
+    orig_unescape = None
+
 from six import text_type


@@ -223,23 +229,26 @@ class HTMLRewriterMixin(StreamingRewriter):
         if not value:
             return ''

-        value = self.try_unescape(value)
-        return self.url_rewriter.rewrite(value, mod)
+        unesc_value = self.try_unescape(value)
+        rewritten_value = self.url_rewriter.rewrite(unesc_value, mod)
+
+        if unesc_value != value and rewritten_value != unesc_value:
+            rewritten_value = rewritten_value.replace(unesc_value, value)
+
+        return rewritten_value

     def try_unescape(self, value):
         if not value.startswith('http'):
             return value

         try:
-            new_value = HTMLParser.unescape(self, value)
-        except:
+            if orig_unescape is not None:
+                new_value = orig_unescape(value)
+            else:
+                new_value = HTMLParser.unescape(self, value)
+        except Exception:
             return value

-        if value != new_value:
-            # ensure utf-8 encoded to avoid %-encoding query here
-            if isinstance(new_value, text_type):
-                new_value = new_value.encode('utf-8')
-
         return new_value

     SRCSET_REGEX = re.compile('\s*(\S*\s+[\d\.]+[wx]),|(?:\s*,(?:\s+|(?=https?:)))')
diff --git a/pywb/rewrite/test/test_html_rewriter.py b/pywb/rewrite/test/test_html_rewriter.py
index 1276dca2..7c4ff26b 100644
--- a/pywb/rewrite/test/test_html_rewriter.py
+++ b/pywb/rewrite/test/test_html_rewriter.py
@@ -88,6 +88,10 @@ r"""
 >>> parse('X')
 X

+# Ensure url is rewritten, but is not unescaped
+>>> parse('')
+
+
 # Empty values should be ignored
 >>> parse('')
