1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

rewrite: HTMLRewriter should insert head_insert at end of stream, if it hasn't

been inserted by the end (and if there was some content written -- don't insert for 0-length responses)
Addresses the missing head insert when only head-level tags (e.g. <title>) are present without an enclosing <head> tag, as per hypothesis/via#9
This commit is contained in:
Ilya Kreymer 2015-04-27 00:46:58 -07:00
parent 48aa73df38
commit 33f247582f
3 changed files with 25 additions and 0 deletions

View File

@ -83,6 +83,7 @@ class HTMLRewriterMixin(object):
# Return every chunk accumulated in self.ls, joined into a single bytes object.
def getvalue(self):
return b''.join(self.ls)
# ===========================
def __init__(self, url_rewriter,
head_insert=None,
@ -105,6 +106,8 @@ class HTMLRewriterMixin(object):
# get opts from urlrewriter
self.opts = url_rewriter.rewrite_opts
self.parsed_any = False
# ===========================
META_REFRESH_REGEX = re.compile('^[\\d.]+\\s*;\\s*url\\s*=\\s*(.+?)\\s*$',
re.IGNORECASE | re.MULTILINE)
@ -288,6 +291,9 @@ class HTMLRewriterMixin(object):
result = self.out.getvalue()
# track that something was parsed
self.parsed_any = self.parsed_any or bool(string)
# Clear buffer to create new one for next rewrite()
self.out = None
@ -338,6 +344,12 @@ class HTMLRewriter(HTMLRewriterMixin, HTMLParser):
self.feed(end_tag)
self._wb_parse_context = None
# if head_insert has not been inserted yet, but some content
# was written out, then insert head_insert now
if self.head_insert and self.parsed_any:
self.out.write(self.head_insert)
self.head_insert = None
try:
HTMLParser.close(self)
except HTMLParseError: # pragma: no cover

View File

@ -98,6 +98,16 @@ def test_local_no_head():
# link rewritten
assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff
# Rewrite the text_content/sample_no_head_2.html fixture through get_rewritten()
# and check that the wombat.js script tag is present in the rewritten output.
def test_local_no_head_only_title():
status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample_no_head_2.html',
urlrewriter,
head_insert_func,
'com,example,test)/')
# wombat insert added
assert '<script src="/static/__pywb/wombat.js"> </script>' in buff
def test_local_no_head_banner_only():
status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample_no_head.html',
bn_urlrewriter,

View File

@ -0,0 +1,3 @@
<!DOCTYPE html>
<title>A title</title>
Some Text