From 1249b41dbaba96ed3ba4b1d6042339959a9b5d36 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 31 Aug 2022 16:51:41 -0700 Subject: [PATCH] rewrite: detect edge-case where html starts with BOM characters followed by a '<!' declaration, and treat it as html (#758) tests: add test that now results in correct html rewriting fixes #756 --- pywb/rewrite/content_rewriter.py | 2 +- pywb/rewrite/test/test_content_rewriter.py | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/pywb/rewrite/content_rewriter.py b/pywb/rewrite/content_rewriter.py index ce7d7c74..ca0275bf 100644 --- a/pywb/rewrite/content_rewriter.py +++ b/pywb/rewrite/content_rewriter.py @@ -391,7 +391,7 @@ class StreamingRewriter(object): # ============================================================================ class RewriteInfo(object): TAG_REGEX = re.compile(b'^(\xef\xbb\xbf)?\s*\<') - TAG_REGEX2 = re.compile(b'^.*<\w+[\s>]') + TAG_REGEX2 = re.compile(b'^.*<[!]?\w+[\s>]') JSON_REGEX = re.compile(b'^\s*[{[][{"]') # if it starts with this then highly likely not HTML JSONP_CONTAINS = ['callback=jQuery', diff --git a/pywb/rewrite/test/test_content_rewriter.py b/pywb/rewrite/test/test_content_rewriter.py index 7f4298c1..4555291a 100644 --- a/pywb/rewrite/test/test_content_rewriter.py +++ b/pywb/rewrite/test/test_content_rewriter.py @@ -141,6 +141,17 @@ class TestContentRewriter(object): assert ('Content-Type', 'text/html; charset=utf-8') in headers.headers assert b''.join(gen).decode('utf-8') == exp + def test_rewrite_html_ignore_bom(self): + headers = {'Content-Type': 'text/html'} + content = u'\ufeff\ufeff\ufeff\n\n' + + headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701mp_') + + exp = '\ufeff\ufeff\ufeff\n\n' + assert is_rw + assert ('Content-Type', 'text/html') in headers.headers + assert b''.join(gen).decode('utf-8') == exp + def test_rewrite_html_utf_8_anchor(self): headers = {'Content-Type': 'text/html; charset=utf-8'} content = u''