From bf9284fec5160f79c3a0fcfe7038df6ba584e5d2 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 3 Aug 2019 11:24:50 -0700 Subject: [PATCH] proxy mode HTMLInsertOnlyRewriter: (#496) - insert head-insert before first tag that is not <html> or <head> insert before - addresses issue with rewriting pages that have no <head> tag (already handled in full rewriter) - tests: add tests for HTMLInsertOnlyRewriter - bump version to 2.3.3, update changelist --- CHANGES.rst | 6 ++++ pywb/rewrite/html_insert_rewriter.py | 14 ++++----- .../rewrite/test/test_html_insert_rewriter.py | 30 +++++++++++++++++++ pywb/version.py | 2 +- 4 files changed, 44 insertions(+), 8 deletions(-) create mode 100644 pywb/rewrite/test/test_html_insert_rewriter.py diff --git a/CHANGES.rst b/CHANGES.rst index bb9277fd..91b62f1c 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,3 +1,9 @@ +pywb 2.3.3 changelist +~~~~~~~~~~~~~~~~~~~~~ + +* Proxy Mode: Ensure head insert added even if no ``<head>`` tag, insert after first tag that is not ``<html>`` or ``<head>`` (#496) + + pywb 2.3.2 changelist ~~~~~~~~~~~~~~~~~~~~~ diff --git a/pywb/rewrite/html_insert_rewriter.py b/pywb/rewrite/html_insert_rewriter.py index 053c1231..7a5dcf26 100644 --- a/pywb/rewrite/html_insert_rewriter.py +++ b/pywb/rewrite/html_insert_rewriter.py @@ -4,10 +4,10 @@ from pywb.rewrite.content_rewriter import StreamingRewriter # ============================================================================ class HTMLInsertOnlyRewriter(StreamingRewriter): - """ Insert custom string into HTML <head> tag + """ Insert custom string into HTML into the head, before any tag not <html> or <head> no other rewriting performed """ - HEAD_REGEX = re.compile('<\s*head\\b[^>]*[>]+', re.I) + NOT_HEAD_REGEX = re.compile(r'(<\s*\b)(?!(html|head))', re.I) def __init__(self, url_rewriter, **kwargs): super(HTMLInsertOnlyRewriter, self).__init__(url_rewriter, False) @@ -19,16 +19,16 @@ class HTMLInsertOnlyRewriter(StreamingRewriter): if self.done: return string - # only try to find in first buffer - self.done = True
- m = self.HEAD_REGEX.search(string) + m = self.NOT_HEAD_REGEX.search(string) if m: - inx = m.end() + inx = m.start() buff = string[:inx] buff += self.head_insert buff += string[inx:] + self.done = True return buff else: return string - + def final_read(self): + return '' if self.done else self.head_insert diff --git a/pywb/rewrite/test/test_html_insert_rewriter.py b/pywb/rewrite/test/test_html_insert_rewriter.py new file mode 100644 index 00000000..ed3607a4 --- /dev/null +++ b/pywb/rewrite/test/test_html_insert_rewriter.py @@ -0,0 +1,30 @@ + + + +r''' +>>> parse('') +'' + +>>> parse('Text') +'Text' + +>>> parse(' < head> ') +' < head> ' + +>>> parse('< head> ') +'< head> ' + +>>> parse('text') +'text' +''' + +from pywb.rewrite.url_rewriter import UrlRewriter +from pywb.rewrite.html_insert_rewriter import HTMLInsertOnlyRewriter + +def parse(html_text): + urlrewriter = UrlRewriter('20131226101010/https://example.com/some/path.html', '/web/') + + rewriter = HTMLInsertOnlyRewriter(urlrewriter, head_insert='') + + return rewriter.rewrite(html_text) + rewriter.final_read() + diff --git a/pywb/version.py b/pywb/version.py index 436bf7e1..578298e6 100644 --- a/pywb/version.py +++ b/pywb/version.py @@ -1,4 +1,4 @@ -__version__ = '2.3.2' +__version__ = '2.3.3' if __name__ == '__main__': print(__version__)