HTML Unescape Improvements (#500)

* html-unescape fix:
  - unescape any url that contains '&#', as it may be html-encoded
  - unescape css blocks that contain '&#' as well, as they may contain css urls that need rewriting
* misc fixes:
  - Update CHANGES
  - Update to latest wombat
  - Update reqs to surt 0.3.1, fix tests
2025-03-15 00:03:28 +01:00 · 2019-08-22 18:35:32 -07:00 · 2019-08-22 18:35:32 -07:00 · cf5aceb4f5
commit cf5aceb4f5
parent bdf4a26807
7 changed files with 36 additions and 28 deletions
--- a/CHANGES.rst
+++ b/CHANGES.rst
@ -7,6 +7,7 @@ pywb 2.3.4 changelist
  - General auto-fetch fixes: queue messages if worker not yet inited (in proxy mode), only parse <link> stylesheet hrefs as sheets.

 * Cookie Rewriting Fix: don't update cookie cache on service worker (``sw_`` modifier) responses (#499)
+* Rewriting: HTML Unescape Fix: Attempt to HTML-entity-decode urls and inline styles that contain ``&#`` to get correct rewriting of encoded urls (#500)


 pywb 2.3.3 changelist
--- a/pywb/indexer/test/test_indexing.py
+++ b/pywb/indexer/test/test_indexing.py
@ -79,7 +79,7 @@ org,gnu)/software/wget/warc/wget.log 20140216012908 metadata://gnu.org/software/
 >>> print_cdx_index('example-wpull.warc.gz')
 CDX N b a m s k r M S V g
 com,example)/ 20150330235046 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1150 2031 example-wpull.warc.gz
-urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 example-wpull.warc.gz
+urn:x-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 example-wpull.warc.gz

 # bad arcs -- test error edge cases
 >>> print_cdx_index('bad.arc', include_all=True)
@ -151,19 +151,19 @@ StatusAndHeadersParserException: Expected Status Line starting with ['HTTP/1.0',
 # test sort, multiple inputs
 >>> cli_lines(['--sort', '-',  TEST_WARC_DIR])
 com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
-urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 example-wpull.warc.gz
+urn:x-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 example-wpull.warc.gz
 Total: 212

 # test sort, multiple inputs, recursive, from base test dir
 >>> cli_lines(['--sort', '-r', '-',  get_test_dir()])
 com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 warcs/example-url-agnostic-revisit.warc.gz
-urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 warcs/example-wpull.warc.gz
+urn:x-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 warcs/example-wpull.warc.gz
 Total: 212

 # test sort, 9-field, multiple inputs, all records + post query
 >>> cli_lines(['--sort', '-a', '-p', '-9', TEST_WARC_DIR])
 com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 355 example-url-agnostic-revisit.warc.gz
-urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - 3181 example-wpull.warc.gz
+urn:x-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - 3181 example-wpull.warc.gz
 Total: 407

 # test writing to stdout
@ -187,7 +187,7 @@ Total: 4
 # test custom root dir for cdx filenames, dir input
 >>> cli_lines(['--sort', '--dir-root', get_test_dir() + 'other/', TEST_WARC_DIR])
 com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 ../warcs/example-url-agnostic-revisit.warc.gz
-urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 ../warcs/example-wpull.warc.gz
+urn:x-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 ../warcs/example-wpull.warc.gz
 Total: 212

 # test writing to temp dir, also use unicode filename
--- a/pywb/rewrite/html_rewriter.py
+++ b/pywb/rewrite/html_rewriter.py
@ -88,10 +88,6 @@ class HTMLRewriterMixin(StreamingRewriter):

        return rewrite_tags

-    # tags allowed in the <head> of an html document
-    HEAD_TAGS = ['html', 'head', 'base', 'link', 'meta',
-                 'title', 'style', 'script', 'object', 'bgsound']
-
    BEFORE_HEAD_TAGS = ['html', 'head']

    DATA_RW_PROTOCOLS = ('http://', 'https://', '//')
@ -174,6 +170,16 @@ class HTMLRewriterMixin(StreamingRewriter):

    ADD_WINDOW = re.compile('(?<![.])(WB_wombat_)')

+    SRCSET_REGEX = re.compile('\s*(\S*\s+[\d\.]+[wx]),|(?:\s*,(?:\s+|(?=https?:)))')
+
+    def _rewrite_srcset(self, value, mod=''):
+        if not value:
+            return ''
+
+        values = (url.strip() for url in re.split(self.SRCSET_REGEX, value) if url)
+        values = [self._rewrite_url(v.strip()) for v in values]
+        return ', '.join(values)
+
    def _rewrite_meta_refresh(self, meta_refresh):
        if not meta_refresh:
            return ''
@ -272,7 +278,7 @@ class HTMLRewriterMixin(StreamingRewriter):
        return rewritten_value

    def try_unescape(self, value):
-        if not value.startswith('http'):
+        if '&#' not in value:
            return value

        try:
@ -285,22 +291,18 @@ class HTMLRewriterMixin(StreamingRewriter):

        return new_value

-    SRCSET_REGEX = re.compile('\s*(\S*\s+[\d\.]+[wx]),|(?:\s*,(?:\s+|(?=https?:)))')
-
-    def _rewrite_srcset(self, value, mod=''):
-        if not value:
-            return ''
-
-        values = (url.strip() for url in re.split(self.SRCSET_REGEX, value) if url)
-        values = [self._rewrite_url(v.strip()) for v in values]
-        return ', '.join(values)
-
    def _rewrite_css(self, css_content):
-        if css_content:
-            return self.css_rewriter.rewrite_complete(css_content)
-        else:
+        if not css_content:
            return ''

+        unesc_css = self.try_unescape(css_content)
+        rw_css = self.css_rewriter.rewrite_complete(unesc_css)
+
+        if unesc_css == rw_css:
+            return css_content
+        else:
+            return rw_css
+
    def _rewrite_script(self, script_content, inline_attr=False):
        if not script_content:
            return ''
--- a/pywb/rewrite/test/test_html_rewriter.py
+++ b/pywb/rewrite/test/test_html_rewriter.py
@ -107,9 +107,11 @@ r"""
 #<a href="/web/20131226101010/http://испытание.испытание/">испытание</a>

 # entity unescaping
-#>>> parse('<a href="http&#x3a;&#x2f;&#x2f;www&#x2e;example&#x2e;com&#x2f;path&#x2f;file.html">')
-<a href="/web/20131226101010/http://www.example.com/path/file.html">
+>>> parse('<a href="http&#x3a;&#x2f;&#x2f;www&#x2e;example&#x2e;com&#x2f;path&#x2f;file.html">')
+<a href="/web/20131226101010/http&#x3a;&#x2f;&#x2f;www&#x2e;example&#x2e;com&#x2f;path&#x2f;file.html">

+>>> parse('<a href="&#x2f;&#x2f;www&#x2e;example&#x2e;com&#x2f;path&#x2f;file.html">')
+<a href="/web/20131226101010/&#x2f;&#x2f;www&#x2e;example&#x2e;com&#x2f;path&#x2f;file.html">

 # Meta tag
 >>> parse('<META http-equiv="refresh" content="10; URL=/abc/def.html">')
@ -253,6 +255,9 @@ r"""
 >>> parse('<i style=\'background-image: url(&quot;http://foo.example.com/&quot;)\'></i>')
 <i style="background-image: url(&quot;/web/20131226101010/http://foo.example.com/&quot;)"></i>

+>>> parse('<i style=\'background-image: url(&#x27;http://foo.example.com/&#x27;)\'></i>')
+<i style="background-image: url('/web/20131226101010/http://foo.example.com/')"></i>
+
 >>> parse("<i style='background-image: url(&apos;http://foo.example.com/&apos;)'></i>")
 <i style="background-image: url(&apos;/web/20131226101010/http://foo.example.com/&apos;)"></i>

--- a/pywb/static/wombat.js
+++ b/pywb/static/wombat.js
--- a/requirements.txt
+++ b/requirements.txt
@ -3,7 +3,7 @@ warcio>=1.7.1
 requests
 redis<3.0
 jinja2
-surt>=0.3.0
+surt>=0.3.1
 brotlipy
 pyyaml
 werkzeug
--- a/2
+++ b/2
@ -1 +1 @@
-Subproject commit e647aa17a121bc9328809fc08b61b742c1357dd2
+Subproject commit 8b74896d94a71d9b97172ee5a1a798f04d65ec63