From a0b53344f41ca2ff3ae236a7ba2c45ca1a6a7e03 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 27 Apr 2015 00:46:58 -0700 Subject: [PATCH] rewrite: HTMLRewriter should insert head_insert at end of stream, if it hasn't been inserted by the end (and if there was some content written -- don't insert for 0-length responses) Addresses the missing head insert when only head-level tags (e.g. a title) are present without an enclosing head element, as per hypothesis/via#9 --- pywb/rewrite/html_rewriter.py | 12 ++++++++++++ pywb/rewrite/test/test_rewrite_live.py | 10 ++++++++++ sample_archive/text_content/sample_no_head_2.html | 3 +++ 3 files changed, 25 insertions(+) create mode 100644 sample_archive/text_content/sample_no_head_2.html diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py index 4005293f..9d1310b1 100644 --- a/pywb/rewrite/html_rewriter.py +++ b/pywb/rewrite/html_rewriter.py @@ -83,6 +83,7 @@ class HTMLRewriterMixin(object): def getvalue(self): return b''.join(self.ls) + # =========================== def __init__(self, url_rewriter, head_insert=None, @@ -105,6 +106,8 @@ class HTMLRewriterMixin(object): # get opts from urlrewriter self.opts = url_rewriter.rewrite_opts + self.parsed_any = False + # =========================== META_REFRESH_REGEX = re.compile('^[\\d.]+\\s*;\\s*url\\s*=\\s*(.+?)\\s*$', re.IGNORECASE | re.MULTILINE) @@ -288,6 +291,9 @@ class HTMLRewriterMixin(object): result = self.out.getvalue() + # track that something was parsed + self.parsed_any = self.parsed_any or bool(string) + # Clear buffer to create new one for next rewrite() self.out = None @@ -338,6 +344,12 @@ class HTMLRewriter(HTMLRewriterMixin, HTMLParser): self.feed(end_tag) self._wb_parse_context = None + # if haven't insert head_insert, but wrote some content + # out, then insert head_insert now + if self.head_insert and self.parsed_any: + self.out.write(self.head_insert) + self.head_insert = None + try: HTMLParser.close(self) except HTMLParseError: # pragma: no cover diff --git 
a/pywb/rewrite/test/test_rewrite_live.py b/pywb/rewrite/test/test_rewrite_live.py index 53c952c7..ba0a6a58 100644 --- a/pywb/rewrite/test/test_rewrite_live.py +++ b/pywb/rewrite/test/test_rewrite_live.py @@ -98,6 +98,16 @@ def test_local_no_head(): # link rewritten assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff +def test_local_no_head_only_title(): + status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample_no_head_2.html', + urlrewriter, + head_insert_func, + 'com,example,test)/') + + # wombat insert added + assert '' in buff + + def test_local_no_head_banner_only(): status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample_no_head.html', bn_urlrewriter, diff --git a/sample_archive/text_content/sample_no_head_2.html b/sample_archive/text_content/sample_no_head_2.html new file mode 100644 index 00000000..56ec21b1 --- /dev/null +++ b/sample_archive/text_content/sample_no_head_2.html @@ -0,0 +1,3 @@ + +A title +Some Text