1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Updated html_rewriter.py to correctly handle self-closing <script> elements: (#392)

- added the 'xlink:href' attribute to the set of script element attributes that are rewritten
Updated html_rewriter.py to better handle self closing tags:
- added boolean set_parsing_context arg to _rewrite_tag_attrs to indicate if the parsing context is to be set
- the call to _rewrite_tag_attrs from handle_startendtag now sets set_parsing_context to false
Added a test to test_html_rewriter.py for rewriting SVGScriptElements
This commit is contained in:
John Berlin 2018-10-10 18:24:34 -04:00 committed by Ilya Kreymer
parent 1c7badf117
commit c28e38718c
2 changed files with 28 additions and 4 deletions

View File

@ -63,7 +63,7 @@ class HTMLRewriterMixin(StreamingRewriter):
'param': {'value': 'oe_'},
'q': {'cite': defmod},
'ref': {'href': 'oe_'},
'script': {'src': 'js_'},
'script': {'src': 'js_', 'xlink:href': 'js_'}, # covers both HTML and SVG script tags
'source': {'src': 'oe_'},
'video': {'src': 'oe_',
'poster': 'im_'},
@ -310,7 +310,22 @@ class HTMLRewriterMixin(StreamingRewriter):
return None
def _rewrite_tag_attrs(self, tag, tag_attrs):
def _rewrite_tag_attrs(self, tag, tag_attrs, set_parsing_context=True):
"""Rewrite a tag's attributes.
If set_parsing_context is false then the parsing context will not be set.
If the head insert has not been added to the HTML being rewritten, there
is no parsing context and the tag is not in BEFORE_HEAD_TAGS then the
head_insert will be "inserted" and set to None
:param str tag: The name of the tag to be rewritten
:param list[tuple[str, str]] tag_attrs: A list of tuples representing
the tag's attributes
:param bool set_parsing_context: Boolean indicating if the parsing
context should be set
:return: True
:rtype: bool
"""
# special case: head insertion, before-head tags
if (self.head_insert and
not self._wb_parse_context
@ -318,7 +333,8 @@ class HTMLRewriterMixin(StreamingRewriter):
self.out.write(self.head_insert)
self.head_insert = None
self._set_parse_context(tag, tag_attrs)
if set_parsing_context:
self._set_parse_context(tag, tag_attrs)
# attr rewriting
handler = self.rewrite_tags.get(tag)
@ -604,7 +620,7 @@ class HTMLRewriter(HTMLRewriterMixin, HTMLParser):
self.out.write('>')
def handle_startendtag(self, tag, attrs):
self._rewrite_tag_attrs(tag, attrs)
self._rewrite_tag_attrs(tag, attrs, False)
if tag != 'head' or not self._rewrite_head(True):
self.out.write('/>')

View File

@ -223,6 +223,14 @@ r"""
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</sc>')
<script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</sc></script>
# SVG Script tag
>>> parse('<script xlink:href="/js/scripts.js"/>')
<script xlink:href="/web/20131226101010js_/http://example.com/js/scripts.js"/>
# SVG Script tag with other elements
>>> parse('<svg><defs><script xlink:href="/js/scripts.js"/><defs/><title>I\'m a title tag in svg!</title></svg>')
<svg><defs><script xlink:href="/web/20131226101010js_/http://example.com/js/scripts.js"/><defs/><title>I'm a title tag in svg!</title></svg>
>>> parse('<script>/*<![CDATA[*/window.location = "http://example.com/a/b/c.html;/*]]>*/"</script>')
<script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html;/*]]>*/"</script>