1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

html unescape: ensure escaped urls are rewritten (py2 and 3) (#337)

This commit is contained in:
Ilya Kreymer 2018-05-29 09:17:04 -07:00 committed by GitHub
parent a138fca5e3
commit bb1dbc0080
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 23 additions and 10 deletions

View File

@ -14,7 +14,13 @@ from pywb.rewrite.regex_rewriters import JSRewriter, CSSRewriter
from pywb.rewrite.content_rewriter import StreamingRewriter
import six.moves.html_parser
six.moves.html_parser.unescape = lambda x: x
# Save the parser module's ``unescape`` attribute in ``orig_unescape`` before
# overriding it with an identity function. If the attribute does not exist,
# ``orig_unescape`` is set to None and no override is installed, so callers
# must check it for truthiness before use.
# NOTE(review): only a missing attribute is expected to fail here, hence
# AttributeError rather than a bare except -- confirm no other error is relied on.
try:
    orig_unescape = six.moves.html_parser.unescape
    six.moves.html_parser.unescape = lambda x: x
except AttributeError:
    orig_unescape = None
from six import text_type
@ -223,23 +229,26 @@ class HTMLRewriterMixin(StreamingRewriter):
if not value:
return ''
value = self.try_unescape(value)
return self.url_rewriter.rewrite(value, mod)
unesc_value = self.try_unescape(value)
rewritten_value = self.url_rewriter.rewrite(unesc_value, mod)
if unesc_value != value and rewritten_value != unesc_value:
rewritten_value = rewritten_value.replace(unesc_value, value)
return rewritten_value
def try_unescape(self, value):
    """Best-effort HTML-entity unescape of a URL-like attribute value.

    Values that do not start with ``'http'`` are returned untouched.
    Otherwise the value is unescaped with the saved module-level
    ``orig_unescape`` when it is available, falling back to
    ``HTMLParser.unescape``. If unescaping raises, the original value is
    returned unchanged.

    :param value: attribute value to unescape
    :return: the original value, or the unescaped value; when unescaping
        changed the value and the result is a text string, it is returned
        UTF-8 encoded
    """
    if not value.startswith('http'):
        return value

    try:
        if orig_unescape:
            new_value = orig_unescape(value)
        else:
            new_value = HTMLParser.unescape(self, value)
    except Exception:
        # deliberate best-effort: any failure leaves the value as-is
        return value

    if value != new_value:
        # ensure utf-8 encoded to avoid %-encoding query here
        if isinstance(new_value, text_type):
            new_value = new_value.encode('utf-8')

    return new_value
# Raw string: \s, \S, \d and \. must reach the regex engine verbatim, and
# are not valid escapes in an ordinary string literal.
SRCSET_REGEX = re.compile(r'\s*(\S*\s+[\d\.]+[wx]),|(?:\s*,(?:\s+|(?=https?:)))')

View File

@ -88,6 +88,10 @@ r"""
>>> parse('<input value="&amp;X&amp;&quot;">X</input>')
<input value="&amp;X&amp;&quot;">X</input>
# Ensure url is rewritten, but is not unescaped
>>> parse('<a href="http&#x3a;&#x2f;&#x2f;example.com&#x2f;path&#x2f;">')
<a href="/web/20131226101010/http&#x3a;&#x2f;&#x2f;example.com&#x2f;path&#x2f;">
# Empty values should be ignored
>>> parse('<input name="foo" value>')
<input name="foo" value>