diff --git a/.travis.yml b/.travis.yml index 5d6a7359..65fd91ab 100644 --- a/.travis.yml +++ b/.travis.yml @@ -36,6 +36,9 @@ after_success: - codecov matrix: + allow_failures: + - env: WR_TEST=yes + exclude: - env: WR_TEST=yes python: "2.7" diff --git a/pywb/rewrite/content_rewriter.py b/pywb/rewrite/content_rewriter.py index 6887959e..8c98f2fa 100644 --- a/pywb/rewrite/content_rewriter.py +++ b/pywb/rewrite/content_rewriter.py @@ -347,7 +347,7 @@ class StreamingRewriter(object): # ============================================================================ class RewriteInfo(object): - TAG_REGEX = re.compile(b'^\s*\<') + TAG_REGEX = re.compile(b'^(\xef\xbb\xbf)?\s*\<') TAG_REGEX2 = re.compile(b'^.*<\w+[\s>]') JSON_REGEX = re.compile(b'^\s*[{[][{"]') # if it starts with this then highly likely not HTML diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py index 41be3364..91dbf986 100644 --- a/pywb/rewrite/html_rewriter.py +++ b/pywb/rewrite/html_rewriter.py @@ -13,6 +13,8 @@ from pywb.rewrite.regex_rewriters import JSRewriter, CSSRewriter from pywb.rewrite.content_rewriter import StreamingRewriter +from six import text_type + import six.moves.html_parser try: @@ -21,7 +23,15 @@ try: except: orig_unescape = None -from six import text_type + +try: + import _markupbase as markupbase +except: + import markupbase as markupbase + +# ensure invalid cond ending ']-->' closing decl +# is treated same as ']>' +markupbase._msmarkedsectionclose = re.compile(r']\s*-{0,2}>') #================================================================= diff --git a/pywb/rewrite/test/test_html_rewriter.py b/pywb/rewrite/test/test_html_rewriter.py index 371867a8..74bc1a99 100644 --- a/pywb/rewrite/test/test_html_rewriter.py +++ b/pywb/rewrite/test/test_html_rewriter.py @@ -286,6 +286,10 @@ r""" >>> parse('Some Text without any tags ', head_insert = '') Some Text without any tags +# UTF-8 BOM +>>> parse('\ufeffSome Text without any tags ', head_insert = '') +\ufeffSome Text without any tags + # no parse comments >>> parse('') @@ -395,6 +399,13 @@ r""" >>> parse('', js_proxy=True) +# IE conditional +>>> parse('') + + +# IE conditional with invalid ']-->' ending, rewritten as ']>' +>>> parse('') + # Test blank