diff --git a/pywb/rewrite/content_rewriter.py b/pywb/rewrite/content_rewriter.py index ce7d7c74..ca0275bf 100644 --- a/pywb/rewrite/content_rewriter.py +++ b/pywb/rewrite/content_rewriter.py @@ -391,7 +391,7 @@ class StreamingRewriter(object): # ============================================================================ class RewriteInfo(object): TAG_REGEX = re.compile(b'^(\xef\xbb\xbf)?\s*\<') - TAG_REGEX2 = re.compile(b'^.*<\w+[\s>]') + TAG_REGEX2 = re.compile(b'^.*<[!]?\w+[\s>]') JSON_REGEX = re.compile(b'^\s*[{[][{"]') # if it starts with this then highly likely not HTML JSONP_CONTAINS = ['callback=jQuery', diff --git a/pywb/rewrite/test/test_content_rewriter.py b/pywb/rewrite/test/test_content_rewriter.py index 7f4298c1..4555291a 100644 --- a/pywb/rewrite/test/test_content_rewriter.py +++ b/pywb/rewrite/test/test_content_rewriter.py @@ -141,6 +141,17 @@ class TestContentRewriter(object): assert ('Content-Type', 'text/html; charset=utf-8') in headers.headers assert b''.join(gen).decode('utf-8') == exp + def test_rewrite_html_ignore_bom(self): + headers = {'Content-Type': 'text/html'} + content = u'\ufeff\ufeff\ufeff\n
\n