mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
HTML Unescape Improvements (#500)
* html-unescape fix: - unescape any url that contains '&#' as it may be html-encoded - unescape css blocks that contain '&#' as well, as they may contain css urls that need rewriting * misc fixes: - Update CHANGES - Update to latest wombat - Update reqs to surt 0.3.1, fix tests
This commit is contained in:
parent
bdf4a26807
commit
cf5aceb4f5
@ -7,6 +7,7 @@ pywb 2.3.4 changelist
|
||||
- General auto-fetch fixes: queue messages if worker not yet inited (in proxy mode), only parse <link> stylesheet hrefs as sheets.
|
||||
|
||||
* Cookie Rewriting Fix: don't update cookie cache on service worker (``sw_`` modifier) responses (#499)
|
||||
* Rewriting: HTML Unescape Fix: Attempt to HTML-entity-decode urls and inline styles that contain ``&#`` to get correct rewriting of encoded urls (#500)
|
||||
|
||||
|
||||
pywb 2.3.3 changelist
|
||||
|
@ -79,7 +79,7 @@ org,gnu)/software/wget/warc/wget.log 20140216012908 metadata://gnu.org/software/
|
||||
>>> print_cdx_index('example-wpull.warc.gz')
|
||||
CDX N b a m s k r M S V g
|
||||
com,example)/ 20150330235046 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1150 2031 example-wpull.warc.gz
|
||||
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 example-wpull.warc.gz
|
||||
urn:x-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 example-wpull.warc.gz
|
||||
|
||||
# bad arcs -- test error edge cases
|
||||
>>> print_cdx_index('bad.arc', include_all=True)
|
||||
@ -151,19 +151,19 @@ StatusAndHeadersParserException: Expected Status Line starting with ['HTTP/1.0',
|
||||
# test sort, multiple inputs
|
||||
>>> cli_lines(['--sort', '-', TEST_WARC_DIR])
|
||||
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
|
||||
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 example-wpull.warc.gz
|
||||
urn:x-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 example-wpull.warc.gz
|
||||
Total: 212
|
||||
|
||||
# test sort, multiple inputs, recursive, from base test dir
|
||||
>>> cli_lines(['--sort', '-r', '-', get_test_dir()])
|
||||
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 warcs/example-url-agnostic-revisit.warc.gz
|
||||
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 warcs/example-wpull.warc.gz
|
||||
urn:x-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 warcs/example-wpull.warc.gz
|
||||
Total: 212
|
||||
|
||||
# test sort, 9-field, multiple inputs, all records + post query
|
||||
>>> cli_lines(['--sort', '-a', '-p', '-9', TEST_WARC_DIR])
|
||||
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 355 example-url-agnostic-revisit.warc.gz
|
||||
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - 3181 example-wpull.warc.gz
|
||||
urn:x-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - 3181 example-wpull.warc.gz
|
||||
Total: 407
|
||||
|
||||
# test writing to stdout
|
||||
@ -187,7 +187,7 @@ Total: 4
|
||||
# test custom root dir for cdx filenames, dir input
|
||||
>>> cli_lines(['--sort', '--dir-root', get_test_dir() + 'other/', TEST_WARC_DIR])
|
||||
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 ../warcs/example-url-agnostic-revisit.warc.gz
|
||||
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 ../warcs/example-wpull.warc.gz
|
||||
urn:x-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 ../warcs/example-wpull.warc.gz
|
||||
Total: 212
|
||||
|
||||
# test writing to temp dir, also use unicode filename
|
||||
|
@ -88,10 +88,6 @@ class HTMLRewriterMixin(StreamingRewriter):
|
||||
|
||||
return rewrite_tags
|
||||
|
||||
# tags allowed in the <head> of an html document
|
||||
HEAD_TAGS = ['html', 'head', 'base', 'link', 'meta',
|
||||
'title', 'style', 'script', 'object', 'bgsound']
|
||||
|
||||
BEFORE_HEAD_TAGS = ['html', 'head']
|
||||
|
||||
DATA_RW_PROTOCOLS = ('http://', 'https://', '//')
|
||||
@ -174,6 +170,16 @@ class HTMLRewriterMixin(StreamingRewriter):
|
||||
|
||||
ADD_WINDOW = re.compile('(?<![.])(WB_wombat_)')
|
||||
|
||||
SRCSET_REGEX = re.compile('\s*(\S*\s+[\d\.]+[wx]),|(?:\s*,(?:\s+|(?=https?:)))')
|
||||
|
||||
def _rewrite_srcset(self, value, mod=''):
|
||||
if not value:
|
||||
return ''
|
||||
|
||||
values = (url.strip() for url in re.split(self.SRCSET_REGEX, value) if url)
|
||||
values = [self._rewrite_url(v.strip()) for v in values]
|
||||
return ', '.join(values)
|
||||
|
||||
def _rewrite_meta_refresh(self, meta_refresh):
|
||||
if not meta_refresh:
|
||||
return ''
|
||||
@ -272,7 +278,7 @@ class HTMLRewriterMixin(StreamingRewriter):
|
||||
return rewritten_value
|
||||
|
||||
def try_unescape(self, value):
|
||||
if not value.startswith('http'):
|
||||
if '&#' not in value:
|
||||
return value
|
||||
|
||||
try:
|
||||
@ -285,22 +291,18 @@ class HTMLRewriterMixin(StreamingRewriter):
|
||||
|
||||
return new_value
|
||||
|
||||
SRCSET_REGEX = re.compile('\s*(\S*\s+[\d\.]+[wx]),|(?:\s*,(?:\s+|(?=https?:)))')
|
||||
|
||||
def _rewrite_srcset(self, value, mod=''):
|
||||
if not value:
|
||||
return ''
|
||||
|
||||
values = (url.strip() for url in re.split(self.SRCSET_REGEX, value) if url)
|
||||
values = [self._rewrite_url(v.strip()) for v in values]
|
||||
return ', '.join(values)
|
||||
|
||||
def _rewrite_css(self, css_content):
|
||||
if css_content:
|
||||
return self.css_rewriter.rewrite_complete(css_content)
|
||||
else:
|
||||
if not css_content:
|
||||
return ''
|
||||
|
||||
unesc_css = self.try_unescape(css_content)
|
||||
rw_css = self.css_rewriter.rewrite_complete(unesc_css)
|
||||
|
||||
if unesc_css == rw_css:
|
||||
return css_content
|
||||
else:
|
||||
return rw_css
|
||||
|
||||
def _rewrite_script(self, script_content, inline_attr=False):
|
||||
if not script_content:
|
||||
return ''
|
||||
|
@ -107,9 +107,11 @@ r"""
|
||||
#<a href="/web/20131226101010/http://испытание.испытание/">испытание</a>
|
||||
|
||||
# entity unescaping
|
||||
#>>> parse('<a href="http://www.example.com/path/file.html">')
|
||||
<a href="/web/20131226101010/http://www.example.com/path/file.html">
|
||||
>>> parse('<a href="http://www.example.com/path/file.html">')
|
||||
<a href="/web/20131226101010/http://www.example.com/path/file.html">
|
||||
|
||||
>>> parse('<a href="//www.example.com/path/file.html">')
|
||||
<a href="/web/20131226101010///www.example.com/path/file.html">
|
||||
|
||||
# Meta tag
|
||||
>>> parse('<META http-equiv="refresh" content="10; URL=/abc/def.html">')
|
||||
@ -253,6 +255,9 @@ r"""
|
||||
>>> parse('<i style=\'background-image: url("http://foo.example.com/")\'></i>')
|
||||
<i style="background-image: url("/web/20131226101010/http://foo.example.com/")"></i>
|
||||
|
||||
>>> parse('<i style=\'background-image: url('http://foo.example.com/')\'></i>')
|
||||
<i style="background-image: url('/web/20131226101010/http://foo.example.com/')"></i>
|
||||
|
||||
>>> parse("<i style='background-image: url('http://foo.example.com/')'></i>")
|
||||
<i style="background-image: url('/web/20131226101010/http://foo.example.com/')"></i>
|
||||
|
||||
|
File diff suppressed because one or more lines are too long
@ -3,7 +3,7 @@ warcio>=1.7.1
|
||||
requests
|
||||
redis<3.0
|
||||
jinja2
|
||||
surt>=0.3.0
|
||||
surt>=0.3.1
|
||||
brotlipy
|
||||
pyyaml
|
||||
werkzeug
|
||||
|
2
wombat
2
wombat
@ -1 +1 @@
|
||||
Subproject commit e647aa17a121bc9328809fc08b61b742c1357dd2
|
||||
Subproject commit 8b74896d94a71d9b97172ee5a1a798f04d65ec63
|
Loading…
x
Reference in New Issue
Block a user