1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

HTML Unescape Improvements (#500)

* html-unescape fix:
- unescape any url that contains '&#' as it may be html-encoded
- unescape css blocks that contain '&#' as well, as they may contain css urls that need rewriting
* misc fixes:
- Update CHANGES
- Update to latest wombat
- Update reqs to surt 0.3.1, fix tests
This commit is contained in:
Ilya Kreymer 2019-08-22 18:35:32 -07:00 committed by GitHub
parent bdf4a26807
commit cf5aceb4f5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 36 additions and 28 deletions

View File

@ -7,6 +7,7 @@ pywb 2.3.4 changelist
- General auto-fetch fixes: queue messages if worker not yet inited (in proxy mode), only parse <link> stylesheet hrefs as sheets.
* Cookie Rewriting Fix: don't update cookie cache on service worker (``sw_`` modifier) responses (#499)
* Rewriting: HTML Unescape Fix: Attempt to HTML-entity-decode urls and innline styles that contain ``&#`` to get correct rewriting of encoded urls (#500)
pywb 2.3.3 changelist

View File

@ -79,7 +79,7 @@ org,gnu)/software/wget/warc/wget.log 20140216012908 metadata://gnu.org/software/
>>> print_cdx_index('example-wpull.warc.gz')
CDX N b a m s k r M S V g
com,example)/ 20150330235046 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1150 2031 example-wpull.warc.gz
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 example-wpull.warc.gz
urn:x-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 example-wpull.warc.gz
# bad arcs -- test error edge cases
>>> print_cdx_index('bad.arc', include_all=True)
@ -151,19 +151,19 @@ StatusAndHeadersParserException: Expected Status Line starting with ['HTTP/1.0',
# test sort, multiple inputs
>>> cli_lines(['--sort', '-', TEST_WARC_DIR])
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 example-wpull.warc.gz
urn:x-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 example-wpull.warc.gz
Total: 212
# test sort, multiple inputs, recursive, from base test dir
>>> cli_lines(['--sort', '-r', '-', get_test_dir()])
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 warcs/example-url-agnostic-revisit.warc.gz
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 warcs/example-wpull.warc.gz
urn:x-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 warcs/example-wpull.warc.gz
Total: 212
# test sort, 9-field, multiple inputs, all records + post query
>>> cli_lines(['--sort', '-a', '-p', '-9', TEST_WARC_DIR])
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 355 example-url-agnostic-revisit.warc.gz
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - 3181 example-wpull.warc.gz
urn:x-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - 3181 example-wpull.warc.gz
Total: 407
# test writing to stdout
@ -187,7 +187,7 @@ Total: 4
# test custom root dir for cdx filenames, dir input
>>> cli_lines(['--sort', '--dir-root', get_test_dir() + 'other/', TEST_WARC_DIR])
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 ../warcs/example-url-agnostic-revisit.warc.gz
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 ../warcs/example-wpull.warc.gz
urn:x-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 ../warcs/example-wpull.warc.gz
Total: 212
# test writing to temp dir, also use unicode filename

View File

@ -88,10 +88,6 @@ class HTMLRewriterMixin(StreamingRewriter):
return rewrite_tags
# tags allowed in the <head> of an html document
HEAD_TAGS = ['html', 'head', 'base', 'link', 'meta',
'title', 'style', 'script', 'object', 'bgsound']
BEFORE_HEAD_TAGS = ['html', 'head']
DATA_RW_PROTOCOLS = ('http://', 'https://', '//')
@ -174,6 +170,16 @@ class HTMLRewriterMixin(StreamingRewriter):
ADD_WINDOW = re.compile('(?<![.])(WB_wombat_)')
SRCSET_REGEX = re.compile('\s*(\S*\s+[\d\.]+[wx]),|(?:\s*,(?:\s+|(?=https?:)))')
def _rewrite_srcset(self, value, mod=''):
if not value:
return ''
values = (url.strip() for url in re.split(self.SRCSET_REGEX, value) if url)
values = [self._rewrite_url(v.strip()) for v in values]
return ', '.join(values)
def _rewrite_meta_refresh(self, meta_refresh):
if not meta_refresh:
return ''
@ -272,7 +278,7 @@ class HTMLRewriterMixin(StreamingRewriter):
return rewritten_value
def try_unescape(self, value):
if not value.startswith('http'):
if '&#' not in value:
return value
try:
@ -285,22 +291,18 @@ class HTMLRewriterMixin(StreamingRewriter):
return new_value
SRCSET_REGEX = re.compile('\s*(\S*\s+[\d\.]+[wx]),|(?:\s*,(?:\s+|(?=https?:)))')
def _rewrite_srcset(self, value, mod=''):
if not value:
return ''
values = (url.strip() for url in re.split(self.SRCSET_REGEX, value) if url)
values = [self._rewrite_url(v.strip()) for v in values]
return ', '.join(values)
def _rewrite_css(self, css_content):
if css_content:
return self.css_rewriter.rewrite_complete(css_content)
else:
if not css_content:
return ''
unesc_css = self.try_unescape(css_content)
rw_css = self.css_rewriter.rewrite_complete(unesc_css)
if unesc_css == rw_css:
return css_content
else:
return rw_css
def _rewrite_script(self, script_content, inline_attr=False):
if not script_content:
return ''

View File

@ -107,9 +107,11 @@ r"""
#<a href="/web/20131226101010/http://испытание.испытание/">испытание</a>
# entity unescaping
#>>> parse('<a href="http&#x3a;&#x2f;&#x2f;www&#x2e;example&#x2e;com&#x2f;path&#x2f;file.html">')
<a href="/web/20131226101010/http://www.example.com/path/file.html">
>>> parse('<a href="http&#x3a;&#x2f;&#x2f;www&#x2e;example&#x2e;com&#x2f;path&#x2f;file.html">')
<a href="/web/20131226101010/http&#x3a;&#x2f;&#x2f;www&#x2e;example&#x2e;com&#x2f;path&#x2f;file.html">
>>> parse('<a href="&#x2f;&#x2f;www&#x2e;example&#x2e;com&#x2f;path&#x2f;file.html">')
<a href="/web/20131226101010/&#x2f;&#x2f;www&#x2e;example&#x2e;com&#x2f;path&#x2f;file.html">
# Meta tag
>>> parse('<META http-equiv="refresh" content="10; URL=/abc/def.html">')
@ -253,6 +255,9 @@ r"""
>>> parse('<i style=\'background-image: url(&quot;http://foo.example.com/&quot;)\'></i>')
<i style="background-image: url(&quot;/web/20131226101010/http://foo.example.com/&quot;)"></i>
>>> parse('<i style=\'background-image: url(&#x27;http://foo.example.com/&#x27;)\'></i>')
<i style="background-image: url('/web/20131226101010/http://foo.example.com/')"></i>
>>> parse("<i style='background-image: url(&apos;http://foo.example.com/&apos;)'></i>")
<i style="background-image: url(&apos;/web/20131226101010/http://foo.example.com/&apos;)"></i>

File diff suppressed because one or more lines are too long

View File

@ -3,7 +3,7 @@ warcio>=1.7.1
requests
redis<3.0
jinja2
surt>=0.3.0
surt>=0.3.1
brotlipy
pyyaml
werkzeug

2
wombat

@ -1 +1 @@
Subproject commit e647aa17a121bc9328809fc08b61b742c1357dd2
Subproject commit 8b74896d94a71d9b97172ee5a1a798f04d65ec63