Check text/html content to ensure actually html (#428)

* html rewrite: when encountering a 'text/html' content-type, add an html-detection check before assuming the content is html (similar to the handling of text/plain). Supersedes #426, fixes #424 -- binary files served under mp_/ as text/html should now be served as binary. - When guessing whether content is html, add an additional regex to handle text that does not start with '<' -- it may still be html that begins with plain text. This additional check is only applied for the text/html content-type and not for the js_/cs_ modifiers.
2025-03-15 00:03:28 +01:00 · 2018-12-05 15:32:38 -08:00 · 2018-12-05 15:32:38 -08:00 · 3235c382a5
commit 3235c382a5
parent 2b8bf76c9a
4 changed files with 24 additions and 9 deletions
--- a/pywb/rewrite/content_rewriter.py
+++ b/pywb/rewrite/content_rewriter.py
@ -348,6 +348,7 @@ class StreamingRewriter(object):
 # ============================================================================
 class RewriteInfo(object):
    TAG_REGEX = re.compile(b'^\s*\<')
+    TAG_REGEX2 = re.compile(b'^.*<\w+[\s>]')
    JSON_REGEX = re.compile(b'^\s*[{[][{"]')  # if it starts with this then highly likely not HTML

    JSONP_CONTAINS = ['callback=jQuery',
@ -391,7 +392,7 @@ class RewriteInfo(object):
        text_type = self._resolve_text_type(orig_text_type)
        url = self.url_rewriter.wburl.url

-        if text_type in ('guess-text', 'guess-bin'):
+        if text_type in ('guess-text', 'guess-bin', 'guess-html'):
            text_type = None

        if text_type == 'js':
@ -432,8 +433,8 @@ class RewriteInfo(object):

        # if html or no-content type, allow resolving on js, css,
        # or other templates
-        if text_type == 'guess-text':
-            if not is_js_or_css and mod not in ('if_', 'mp_', ''):
+        if text_type in ('guess-text', 'guess-html'):
+            if not is_js_or_css and mod not in ('if_', 'mp_', 'bn_', ''):
                return None

        # if application/octet-stream binary, only resolve if in js/css content
@ -449,6 +450,10 @@ class RewriteInfo(object):
        # check if doesn't start with a tag, then likely not html
        if self.TAG_REGEX.match(buff):
            return 'html'
+        # perform additional check to see if it has any html tags
+        elif text_type == 'guess-html' and not is_js_or_css:
+            if self.TAG_REGEX2.match(buff):
+                return 'html'

        if not is_js_or_css:
            return text_type
--- a/pywb/rewrite/default_rewriter.py
+++ b/pywb/rewrite/default_rewriter.py
@ -48,7 +48,7 @@ class DefaultRewriter(BaseContentRewriter):

    rewrite_types = {
        # HTML
-        'text/html': 'html',
+        'text/html': 'guess-html',
        'application/xhtml': 'html',
        'application/xhtml+xml': 'html',

--- a/pywb/rewrite/test/test_content_rewriter.py
+++ b/pywb/rewrite/test/test_content_rewriter.py
@ -126,7 +126,7 @@ class TestContentRewriter(object):

        headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701mp_')

-        assert is_rw
+        assert is_rw == False
        assert ('Content-Type', 'text/html; charset=utf-8') in headers.headers
        assert b''.join(gen).decode('utf-8') == exp

@ -333,9 +333,19 @@ class TestContentRewriter(object):

        assert ('Content-Type', 'text/html; charset=utf-8') in headers.headers

-        assert is_rw == True
+        assert is_rw == False
        assert b''.join(gen) == content

+    def test_binary_wrong_content_type_html_rw(self):
+        headers = {'Content-Type': 'text/html; charset=utf-8'}
+        content = b'Hello <a href="/foo.html">link</a>'
+        headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701mp_')
+
+        assert ('Content-Type', 'text/html; charset=utf-8') in headers.headers
+
+        assert is_rw
+        assert b''.join(gen) == b'Hello <a href="/prefix/201701/http://example.com/foo.html">link</a>'
+
    def test_binary_wrong_content_type_css(self):
        headers = {'Content-Type': 'text/css; charset=utf-8'}
        content = b'\xe9\x11\x12\x13\x14'
--- a/pywb/rewrite/test/test_header_rewriter.py
+++ b/pywb/rewrite/test/test_header_rewriter.py
@ -43,15 +43,15 @@ class TestHeaderRewriter(object):
        res = """\
 HTTP/1.0 200 OK\r\n\
 Date: Fri, 03 Jan 2014 03:03:21 GMT\r\n\
-Content-Length: 5\r\n\
+X-Archive-Orig-Content-Length: 5\r\n\
 Content-Type: text/html;charset=UTF-8\r\n\
 """
        rwinfo = self.do_rewrite('200 OK', headers)
        http_headers = DefaultHeaderRewriter(rwinfo)()
        assert str(http_headers) == res

-        assert rwinfo.text_type == 'html'
-        assert rwinfo.charset == 'utf-8'
+        assert rwinfo.text_type == None
+        assert rwinfo.charset == None

    def test_header_rewrite_redirect(self):
        headers = [('Connection', 'close'),