mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
refactor: clean up HTMLRewriter/LXMLHTMLRewriter close path,
single close() in base class delegating to _internal_close(). Also, HTMLRewriter auto-terminates <script> and <style> tags for consistency with lxml.
This commit is contained in:
parent
52d99aef57
commit
d1ad9b5e69
@ -74,8 +74,6 @@ class HTMLRewriterMixin(object):
|
|||||||
|
|
||||||
self.url_rewriter = url_rewriter
|
self.url_rewriter = url_rewriter
|
||||||
self._wb_parse_context = None
|
self._wb_parse_context = None
|
||||||
#self.out = outstream if outstream else self.AccumBuff()
|
|
||||||
self.out = self.AccumBuff()
|
|
||||||
|
|
||||||
self.js_rewriter = js_rewriter_class(url_rewriter)
|
self.js_rewriter = js_rewriter_class(url_rewriter)
|
||||||
self.css_rewriter = css_rewriter_class(url_rewriter)
|
self.css_rewriter = css_rewriter_class(url_rewriter)
|
||||||
@ -218,17 +216,32 @@ class HTMLRewriterMixin(object):
|
|||||||
self.out.write(data)
|
self.out.write(data)
|
||||||
|
|
||||||
def rewrite(self, string):
|
def rewrite(self, string):
|
||||||
if not self.out:
|
self.out = self.AccumBuff()
|
||||||
self.out = self.AccumBuff()
|
|
||||||
|
|
||||||
self.feed(string)
|
self.feed(string)
|
||||||
|
|
||||||
result = self.out.getvalue()
|
result = self.out.getvalue()
|
||||||
|
|
||||||
# Clear buffer to create new one for next rewrite()
|
# Clear buffer to create new one for next rewrite()
|
||||||
self.out = None
|
self.out = None
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
def close(self):
|
||||||
|
self.out = self.AccumBuff()
|
||||||
|
|
||||||
|
self._internal_close()
|
||||||
|
|
||||||
|
result = self.out.getvalue()
|
||||||
|
|
||||||
|
# Clear buffer to create new one for next rewrite()
|
||||||
|
self.out = None
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
|
def _internal_close(self):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class HTMLRewriter(HTMLRewriterMixin, HTMLParser):
|
class HTMLRewriter(HTMLRewriterMixin, HTMLParser):
|
||||||
@ -243,30 +256,23 @@ class HTMLRewriter(HTMLRewriterMixin, HTMLParser):
|
|||||||
js_rewriter_class,
|
js_rewriter_class,
|
||||||
css_rewriter_class)
|
css_rewriter_class)
|
||||||
|
|
||||||
# HTMLParser overrides below
|
|
||||||
def feed(self, string):
|
def feed(self, string):
|
||||||
try:
|
try:
|
||||||
HTMLParser.feed(self, string)
|
HTMLParser.feed(self, string)
|
||||||
except HTMLParseError:
|
except HTMLParseError:
|
||||||
self.out.write(string)
|
self.out.write(string)
|
||||||
|
|
||||||
def close(self):
|
def _internal_close(self):
|
||||||
if (self._wb_parse_context):
|
if (self._wb_parse_context):
|
||||||
end_tag = '</' + self._wb_parse_context + '>'
|
end_tag = '</' + self._wb_parse_context + '>'
|
||||||
result = self.rewrite(end_tag)
|
self.feed(end_tag)
|
||||||
if result.endswith(end_tag):
|
|
||||||
result = result[:-len(end_tag)]
|
|
||||||
self._wb_parse_context = None
|
self._wb_parse_context = None
|
||||||
else:
|
|
||||||
result = ''
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
HTMLParser.close(self)
|
HTMLParser.close(self)
|
||||||
except HTMLParseError:
|
except HTMLParseError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
return result
|
|
||||||
|
|
||||||
# called to unescape attrs -- do not unescape!
|
# called to unescape attrs -- do not unescape!
|
||||||
def unescape(self, s):
|
def unescape(self, s):
|
||||||
return s
|
return s
|
||||||
|
@ -36,19 +36,9 @@ class LXMLHTMLRewriter(HTMLRewriterMixin):
|
|||||||
#string = string.replace(u'</html>', u'')
|
#string = string.replace(u'</html>', u'')
|
||||||
self.parser.feed(string)
|
self.parser.feed(string)
|
||||||
|
|
||||||
def close(self):
|
def _internal_close(self):
|
||||||
if not self.out:
|
|
||||||
self.out = self.AccumBuff()
|
|
||||||
|
|
||||||
self.is_closing = True
|
|
||||||
self.parser.close()
|
self.parser.close()
|
||||||
|
|
||||||
result = self.out.getvalue()
|
|
||||||
# Clear buffer to create new one for next rewrite()
|
|
||||||
self.out = None
|
|
||||||
|
|
||||||
return result
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class RewriterTarget(object):
|
class RewriterTarget(object):
|
||||||
|
@ -53,9 +53,9 @@ ur"""
|
|||||||
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</script>')
|
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</script>')
|
||||||
<script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</script>
|
<script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</script>
|
||||||
|
|
||||||
# Unterminated script tag, handle but don't auto-terminate
|
# Unterminated script tag, handle and auto-terminate
|
||||||
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</sc>')
|
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</sc>')
|
||||||
<script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</sc>
|
<script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</sc></script>
|
||||||
|
|
||||||
>>> parse('<script>/*<![CDATA[*/window.location = "http://example.com/a/b/c.html;/*]]>*/"</script>')
|
>>> parse('<script>/*<![CDATA[*/window.location = "http://example.com/a/b/c.html;/*]]>*/"</script>')
|
||||||
<script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html;/*]]>*/"</script>
|
<script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html;/*]]>*/"</script>
|
||||||
@ -66,9 +66,9 @@ ur"""
|
|||||||
>>> parse('<style>@import "styles.css" .a { font-face: url(\'myfont.ttf\') }</style>')
|
>>> parse('<style>@import "styles.css" .a { font-face: url(\'myfont.ttf\') }</style>')
|
||||||
<style>@import "/web/20131226101010/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010/http://example.com/some/path/myfont.ttf') }</style>
|
<style>@import "/web/20131226101010/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010/http://example.com/some/path/myfont.ttf') }</style>
|
||||||
|
|
||||||
# Unterminated style tag, handle but don't auto-terminate
|
# Unterminated style tag, handle and auto-terminate
|
||||||
>>> parse('<style>@import url(styles.css)')
|
>>> parse('<style>@import url(styles.css)')
|
||||||
<style>@import url(/web/20131226101010/http://example.com/some/path/styles.css)
|
<style>@import url(/web/20131226101010/http://example.com/some/path/styles.css)</style>
|
||||||
|
|
||||||
# Head Insertion
|
# Head Insertion
|
||||||
>>> parse('<html><head><script src="other.js"></script></head><body>Test</body></html>', head_insert = '<script src="cool.js"></script>')
|
>>> parse('<html><head><script src="other.js"></script></head><body>Test</body></html>', head_insert = '<script src="cool.js"></script>')
|
||||||
|
Loading…
x
Reference in New Issue
Block a user