diff --git a/pywb/rewrite/content_rewriter.py b/pywb/rewrite/content_rewriter.py index ce7d7c74..ca0275bf 100644 --- a/pywb/rewrite/content_rewriter.py +++ b/pywb/rewrite/content_rewriter.py @@ -391,7 +391,7 @@ class StreamingRewriter(object): # ============================================================================ class RewriteInfo(object): TAG_REGEX = re.compile(b'^(\xef\xbb\xbf)?\s*\<') - TAG_REGEX2 = re.compile(b'^.*<\w+[\s>]') + TAG_REGEX2 = re.compile(b'^.*<[!]?\w+[\s>]') JSON_REGEX = re.compile(b'^\s*[{[][{"]') # if it starts with this then highly likely not HTML JSONP_CONTAINS = ['callback=jQuery', diff --git a/pywb/rewrite/test/test_content_rewriter.py b/pywb/rewrite/test/test_content_rewriter.py index 7f4298c1..4555291a 100644 --- a/pywb/rewrite/test/test_content_rewriter.py +++ b/pywb/rewrite/test/test_content_rewriter.py @@ -141,6 +141,17 @@ class TestContentRewriter(object): assert ('Content-Type', 'text/html; charset=utf-8') in headers.headers assert b''.join(gen).decode('utf-8') == exp + def test_rewrite_html_ignore_bom(self): + headers = {'Content-Type': 'text/html'} + content = u'\ufeff\ufeff\ufeff\n\n' + + headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701mp_') + + exp = '\ufeff\ufeff\ufeff\n\n' + assert is_rw + assert ('Content-Type', 'text/html') in headers.headers + assert b''.join(gen).decode('utf-8') == exp + def test_rewrite_html_utf_8_anchor(self): headers = {'Content-Type': 'text/html; charset=utf-8'} content = u''