Edge-case and HTML Rewrite Fixes (#441)

* recorder fix: ensure the Transfer-Encoding header is not passed through by RecorderApp, as it may result in a duplicate Transfer-Encoding header in py2.7, fixes #432 * html rewriter fixes: - html detection: allow for a UTF-8 BOM when detecting if text is html - html decl parsing: modify the base parser regex to allow an IE conditional declaration to also end with -->, e.g. support '<![endif]-->' in addition to '<![endif]>', fixes #425 * travis: add allow failure for integration tests (for now)
2025-03-15 00:03:28 +01:00 · 2019-02-18 10:11:29 -08:00 · 2019-02-18 10:11:29 -08:00 · 38c1b1cc3e
commit 38c1b1cc3e
parent 100c7f5509
4 changed files with 26 additions and 2 deletions
--- a/.travis.yml
+++ b/.travis.yml
@ -36,6 +36,9 @@ after_success:
  - codecov
 matrix:
  allow_failures:
    - env: WR_TEST=yes
  exclude:
    - env: WR_TEST=yes
      python: "2.7"
--- a/pywb/rewrite/content_rewriter.py
+++ b/pywb/rewrite/content_rewriter.py
@ -347,7 +347,7 @@ class StreamingRewriter(object):
 # ============================================================================
 class RewriteInfo(object):
-    TAG_REGEX = re.compile(b'^\s*\<')
+    TAG_REGEX = re.compile(b'^(\xef\xbb\xbf)?\s*\<')
    TAG_REGEX2 = re.compile(b'^.*<\w+[\s>]')
    JSON_REGEX = re.compile(b'^\s*[{[][{"]')  # if it starts with this then highly likely not HTML
--- a/pywb/rewrite/html_rewriter.py
+++ b/pywb/rewrite/html_rewriter.py
@ -13,6 +13,8 @@ from pywb.rewrite.regex_rewriters import JSRewriter, CSSRewriter
 from pywb.rewrite.content_rewriter import StreamingRewriter
 from six import text_type
 import six.moves.html_parser
 try:
@ -21,7 +23,15 @@ try:
 except:
    orig_unescape = None
-from six import text_type
+
 try:
    import _markupbase as markupbase
 except:
    import markupbase as markupbase
 # ensure that a conditional declaration closed with the invalid
 # ending ']-->' is treated the same as one closed with ']>'
 markupbase._msmarkedsectionclose = re.compile(r']\s*-{0,2}>')
 #=================================================================
--- a/pywb/rewrite/test/test_html_rewriter.py
+++ b/pywb/rewrite/test/test_html_rewriter.py
@ -286,6 +286,10 @@ r"""
 >>> parse('<!DOCTYPE html>Some Text without any tags <!-- comments -->', head_insert = '<script>load_stuff();</script>')
 <!DOCTYPE html>Some Text without any tags <!-- comments --><script>load_stuff();</script>
 # UTF-8 BOM
 >>> parse('\ufeff<!DOCTYPE html>Some Text without any tags <!-- comments -->', head_insert = '<script>load_stuff();</script>')
 \ufeff<!DOCTYPE html>Some Text without any tags <!-- comments --><script>load_stuff();</script>
 # no parse comments
 >>> parse('<html><!-- <a href="/foo.html"> --></html>')
 <html><!-- <a href="/foo.html"> --></html>
@ -395,6 +399,13 @@ r"""
 >>> parse('<html><a href="javascript:alert()"></a></html>', js_proxy=True)
 <html><a href="javascript:alert()"></a></html>
 # IE conditional
 >>> parse('<!--[if !IE]><html><![endif]--><a href="http://example.com/"><!--[if IE]><![endif]--><a href="http://example.com/"></html>')
 <!--[if !IE]><html><![endif]--><a href="/web/20131226101010/http://example.com/"><!--[if IE]><![endif]--><a href="/web/20131226101010/http://example.com/"></html>
 # IE conditional with invalid ']-->' ending, rewritten as ']>'
 >>> parse('<!--[if !IE]> --><html><![endif]--><a href="http://example.com/"><!--[if IE]><![endif]--><a href="http://example.com/"></html>')
 <!--[if !IE]> --><html><![endif]><a href="/web/20131226101010/http://example.com/"><!--[if IE]><![endif]--><a href="/web/20131226101010/http://example.com/"></html>
 # Test blank