mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
HTML Unescape Improvements (#500)
* html-unescape fix:
  - unescape any url that contains '&#', as it may be html-encoded
  - unescape css blocks that contain '&#' as well, as they may contain css urls that need rewriting
* misc fixes:
  - Update CHANGES
  - Update to latest wombat
  - Update reqs to surt 0.3.1, fix tests
This commit is contained in:
parent
bdf4a26807
commit
cf5aceb4f5
@ -7,6 +7,7 @@ pywb 2.3.4 changelist
|
|||||||
- General auto-fetch fixes: queue messages if worker not yet inited (in proxy mode), only parse <link> stylesheet hrefs as sheets.
|
- General auto-fetch fixes: queue messages if worker not yet inited (in proxy mode), only parse <link> stylesheet hrefs as sheets.
|
||||||
|
|
||||||
* Cookie Rewriting Fix: don't update cookie cache on service worker (``sw_`` modifier) responses (#499)
|
* Cookie Rewriting Fix: don't update cookie cache on service worker (``sw_`` modifier) responses (#499)
|
||||||
|
* Rewriting: HTML Unescape Fix: Attempt to HTML-entity-decode urls and inline styles that contain ``&#`` to get correct rewriting of encoded urls (#500)
|
||||||
|
|
||||||
|
|
||||||
pywb 2.3.3 changelist
|
pywb 2.3.3 changelist
|
||||||
|
@ -79,7 +79,7 @@ org,gnu)/software/wget/warc/wget.log 20140216012908 metadata://gnu.org/software/
|
|||||||
>>> print_cdx_index('example-wpull.warc.gz')
|
>>> print_cdx_index('example-wpull.warc.gz')
|
||||||
CDX N b a m s k r M S V g
|
CDX N b a m s k r M S V g
|
||||||
com,example)/ 20150330235046 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1150 2031 example-wpull.warc.gz
|
com,example)/ 20150330235046 http://example.com/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1150 2031 example-wpull.warc.gz
|
||||||
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 example-wpull.warc.gz
|
urn:x-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 example-wpull.warc.gz
|
||||||
|
|
||||||
# bad arcs -- test error edge cases
|
# bad arcs -- test error edge cases
|
||||||
>>> print_cdx_index('bad.arc', include_all=True)
|
>>> print_cdx_index('bad.arc', include_all=True)
|
||||||
@ -151,19 +151,19 @@ StatusAndHeadersParserException: Expected Status Line starting with ['HTTP/1.0',
|
|||||||
# test sort, multiple inputs
|
# test sort, multiple inputs
|
||||||
>>> cli_lines(['--sort', '-', TEST_WARC_DIR])
|
>>> cli_lines(['--sort', '-', TEST_WARC_DIR])
|
||||||
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
|
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
|
||||||
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 example-wpull.warc.gz
|
urn:x-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 example-wpull.warc.gz
|
||||||
Total: 212
|
Total: 212
|
||||||
|
|
||||||
# test sort, multiple inputs, recursive, from base test dir
|
# test sort, multiple inputs, recursive, from base test dir
|
||||||
>>> cli_lines(['--sort', '-r', '-', get_test_dir()])
|
>>> cli_lines(['--sort', '-r', '-', get_test_dir()])
|
||||||
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 warcs/example-url-agnostic-revisit.warc.gz
|
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 warcs/example-url-agnostic-revisit.warc.gz
|
||||||
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 warcs/example-wpull.warc.gz
|
urn:x-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 warcs/example-wpull.warc.gz
|
||||||
Total: 212
|
Total: 212
|
||||||
|
|
||||||
# test sort, 9-field, multiple inputs, all records + post query
|
# test sort, 9-field, multiple inputs, all records + post query
|
||||||
>>> cli_lines(['--sort', '-a', '-p', '-9', TEST_WARC_DIR])
|
>>> cli_lines(['--sort', '-a', '-p', '-9', TEST_WARC_DIR])
|
||||||
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 355 example-url-agnostic-revisit.warc.gz
|
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 355 example-url-agnostic-revisit.warc.gz
|
||||||
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - 3181 example-wpull.warc.gz
|
urn:x-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - 3181 example-wpull.warc.gz
|
||||||
Total: 407
|
Total: 407
|
||||||
|
|
||||||
# test writing to stdout
|
# test writing to stdout
|
||||||
@ -187,7 +187,7 @@ Total: 4
|
|||||||
# test custom root dir for cdx filenames, dir input
|
# test custom root dir for cdx filenames, dir input
|
||||||
>>> cli_lines(['--sort', '--dir-root', get_test_dir() + 'other/', TEST_WARC_DIR])
|
>>> cli_lines(['--sort', '--dir-root', get_test_dir() + 'other/', TEST_WARC_DIR])
|
||||||
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 ../warcs/example-url-agnostic-revisit.warc.gz
|
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 ../warcs/example-url-agnostic-revisit.warc.gz
|
||||||
urn:X-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 ../warcs/example-wpull.warc.gz
|
urn:x-wpull:log 20150330235046 urn:X-wpull:log text/plain - Q32A3PBAN6S7I26HWZDX5CDCB6MN6UN6 - - 557 3181 ../warcs/example-wpull.warc.gz
|
||||||
Total: 212
|
Total: 212
|
||||||
|
|
||||||
# test writing to temp dir, also use unicode filename
|
# test writing to temp dir, also use unicode filename
|
||||||
|
@ -88,10 +88,6 @@ class HTMLRewriterMixin(StreamingRewriter):
|
|||||||
|
|
||||||
return rewrite_tags
|
return rewrite_tags
|
||||||
|
|
||||||
# tags allowed in the <head> of an html document
|
|
||||||
HEAD_TAGS = ['html', 'head', 'base', 'link', 'meta',
|
|
||||||
'title', 'style', 'script', 'object', 'bgsound']
|
|
||||||
|
|
||||||
BEFORE_HEAD_TAGS = ['html', 'head']
|
BEFORE_HEAD_TAGS = ['html', 'head']
|
||||||
|
|
||||||
DATA_RW_PROTOCOLS = ('http://', 'https://', '//')
|
DATA_RW_PROTOCOLS = ('http://', 'https://', '//')
|
||||||
@ -174,6 +170,16 @@ class HTMLRewriterMixin(StreamingRewriter):
|
|||||||
|
|
||||||
ADD_WINDOW = re.compile('(?<![.])(WB_wombat_)')
|
ADD_WINDOW = re.compile('(?<![.])(WB_wombat_)')
|
||||||
|
|
||||||
|
SRCSET_REGEX = re.compile('\s*(\S*\s+[\d\.]+[wx]),|(?:\s*,(?:\s+|(?=https?:)))')
|
||||||
|
|
||||||
|
def _rewrite_srcset(self, value, mod=''):
|
||||||
|
if not value:
|
||||||
|
return ''
|
||||||
|
|
||||||
|
values = (url.strip() for url in re.split(self.SRCSET_REGEX, value) if url)
|
||||||
|
values = [self._rewrite_url(v.strip()) for v in values]
|
||||||
|
return ', '.join(values)
|
||||||
|
|
||||||
def _rewrite_meta_refresh(self, meta_refresh):
|
def _rewrite_meta_refresh(self, meta_refresh):
|
||||||
if not meta_refresh:
|
if not meta_refresh:
|
||||||
return ''
|
return ''
|
||||||
@ -272,7 +278,7 @@ class HTMLRewriterMixin(StreamingRewriter):
|
|||||||
return rewritten_value
|
return rewritten_value
|
||||||
|
|
||||||
def try_unescape(self, value):
|
def try_unescape(self, value):
|
||||||
if not value.startswith('http'):
|
if '&#' not in value:
|
||||||
return value
|
return value
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@ -285,22 +291,18 @@ class HTMLRewriterMixin(StreamingRewriter):
|
|||||||
|
|
||||||
return new_value
|
return new_value
|
||||||
|
|
||||||
SRCSET_REGEX = re.compile('\s*(\S*\s+[\d\.]+[wx]),|(?:\s*,(?:\s+|(?=https?:)))')
|
|
||||||
|
|
||||||
def _rewrite_srcset(self, value, mod=''):
|
|
||||||
if not value:
|
|
||||||
return ''
|
|
||||||
|
|
||||||
values = (url.strip() for url in re.split(self.SRCSET_REGEX, value) if url)
|
|
||||||
values = [self._rewrite_url(v.strip()) for v in values]
|
|
||||||
return ', '.join(values)
|
|
||||||
|
|
||||||
def _rewrite_css(self, css_content):
|
def _rewrite_css(self, css_content):
|
||||||
if css_content:
|
if not css_content:
|
||||||
return self.css_rewriter.rewrite_complete(css_content)
|
|
||||||
else:
|
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
|
unesc_css = self.try_unescape(css_content)
|
||||||
|
rw_css = self.css_rewriter.rewrite_complete(unesc_css)
|
||||||
|
|
||||||
|
if unesc_css == rw_css:
|
||||||
|
return css_content
|
||||||
|
else:
|
||||||
|
return rw_css
|
||||||
|
|
||||||
def _rewrite_script(self, script_content, inline_attr=False):
|
def _rewrite_script(self, script_content, inline_attr=False):
|
||||||
if not script_content:
|
if not script_content:
|
||||||
return ''
|
return ''
|
||||||
|
@ -107,9 +107,11 @@ r"""
|
|||||||
#<a href="/web/20131226101010/http://испытание.испытание/">испытание</a>
|
#<a href="/web/20131226101010/http://испытание.испытание/">испытание</a>
|
||||||
|
|
||||||
# entity unescaping
|
# entity unescaping
|
||||||
#>>> parse('<a href="http://www.example.com/path/file.html">')
|
>>> parse('<a href="http://www.example.com/path/file.html">')
|
||||||
<a href="/web/20131226101010/http://www.example.com/path/file.html">
|
<a href="/web/20131226101010/http://www.example.com/path/file.html">
|
||||||
|
|
||||||
|
>>> parse('<a href="//www.example.com/path/file.html">')
|
||||||
|
<a href="/web/20131226101010///www.example.com/path/file.html">
|
||||||
|
|
||||||
# Meta tag
|
# Meta tag
|
||||||
>>> parse('<META http-equiv="refresh" content="10; URL=/abc/def.html">')
|
>>> parse('<META http-equiv="refresh" content="10; URL=/abc/def.html">')
|
||||||
@ -253,6 +255,9 @@ r"""
|
|||||||
>>> parse('<i style=\'background-image: url("http://foo.example.com/")\'></i>')
|
>>> parse('<i style=\'background-image: url("http://foo.example.com/")\'></i>')
|
||||||
<i style="background-image: url("/web/20131226101010/http://foo.example.com/")"></i>
|
<i style="background-image: url("/web/20131226101010/http://foo.example.com/")"></i>
|
||||||
|
|
||||||
|
>>> parse('<i style=\'background-image: url('http://foo.example.com/')\'></i>')
|
||||||
|
<i style="background-image: url('/web/20131226101010/http://foo.example.com/')"></i>
|
||||||
|
|
||||||
>>> parse("<i style='background-image: url('http://foo.example.com/')'></i>")
|
>>> parse("<i style='background-image: url('http://foo.example.com/')'></i>")
|
||||||
<i style="background-image: url('/web/20131226101010/http://foo.example.com/')"></i>
|
<i style="background-image: url('/web/20131226101010/http://foo.example.com/')"></i>
|
||||||
|
|
||||||
|
File diff suppressed because one or more lines are too long
@ -3,7 +3,7 @@ warcio>=1.7.1
|
|||||||
requests
|
requests
|
||||||
redis<3.0
|
redis<3.0
|
||||||
jinja2
|
jinja2
|
||||||
surt>=0.3.0
|
surt>=0.3.1
|
||||||
brotlipy
|
brotlipy
|
||||||
pyyaml
|
pyyaml
|
||||||
werkzeug
|
werkzeug
|
||||||
|
2
wombat
2
wombat
@ -1 +1 @@
|
|||||||
Subproject commit e647aa17a121bc9328809fc08b61b742c1357dd2
|
Subproject commit 8b74896d94a71d9b97172ee5a1a798f04d65ec63
|
Loading…
x
Reference in New Issue
Block a user