From 3235c382a585601ef399f206829a5689e223efff Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 5 Dec 2018 15:32:38 -0800 Subject: [PATCH] Check text/html content to ensure actually html (#428) * html rewrite: when encountering 'text/html' content-type, add html-detection check before assuming content is html (similar to text/plain) supersedes #426, fixes #424 -- binary files served under mp_/ as text/html should now be served as binary - when guessing if html, add additional regex to check if text does not start with < -- perhaps html but starting with plain text. only check for text/html content-type and not js_/cs_ mod --- pywb/rewrite/content_rewriter.py | 11 ++++++++--- pywb/rewrite/default_rewriter.py | 2 +- pywb/rewrite/test/test_content_rewriter.py | 14 ++++++++++++-- pywb/rewrite/test/test_header_rewriter.py | 6 +++--- 4 files changed, 24 insertions(+), 9 deletions(-) diff --git a/pywb/rewrite/content_rewriter.py b/pywb/rewrite/content_rewriter.py index a37281ae..ebf5c72b 100644 --- a/pywb/rewrite/content_rewriter.py +++ b/pywb/rewrite/content_rewriter.py @@ -348,6 +348,7 @@ class StreamingRewriter(object): # ============================================================================ class RewriteInfo(object): TAG_REGEX = re.compile(b'^\s*\<') + TAG_REGEX2 = re.compile(b'^.*<\w+[\s>]') JSON_REGEX = re.compile(b'^\s*[{[][{"]') # if it starts with this then highly likely not HTML JSONP_CONTAINS = ['callback=jQuery', @@ -391,7 +392,7 @@ class RewriteInfo(object): text_type = self._resolve_text_type(orig_text_type) url = self.url_rewriter.wburl.url - if text_type in ('guess-text', 'guess-bin'): + if text_type in ('guess-text', 'guess-bin', 'guess-html'): text_type = None if text_type == 'js': @@ -432,8 +433,8 @@ class RewriteInfo(object): # if html or no-content type, allow resolving on js, css, # or other templates - if text_type == 'guess-text': - if not is_js_or_css and mod not in ('if_', 'mp_', ''): + if text_type in ('guess-text', 'guess-html'): + if not is_js_or_css and mod not in ('if_', 'mp_', 'bn_', ''): return None # if application/octet-stream binary, only resolve if in js/css content @@ -449,6 +450,10 @@ class RewriteInfo(object): # check if doesn't start with a tag, then likely not html if self.TAG_REGEX.match(buff): return 'html' + # perform additional check to see if it has any html tags + elif text_type == 'guess-html' and not is_js_or_css: + if self.TAG_REGEX2.match(buff): + return 'html' if not is_js_or_css: return text_type diff --git a/pywb/rewrite/default_rewriter.py b/pywb/rewrite/default_rewriter.py index e2ce966f..82a9ebc3 100644 --- a/pywb/rewrite/default_rewriter.py +++ b/pywb/rewrite/default_rewriter.py @@ -48,7 +48,7 @@ class DefaultRewriter(BaseContentRewriter): rewrite_types = { # HTML - 'text/html': 'html', + 'text/html': 'guess-html', 'application/xhtml': 'html', 'application/xhtml+xml': 'html', diff --git a/pywb/rewrite/test/test_content_rewriter.py b/pywb/rewrite/test/test_content_rewriter.py index 385896e0..59068bc7 100644 --- a/pywb/rewrite/test/test_content_rewriter.py +++ b/pywb/rewrite/test/test_content_rewriter.py @@ -126,7 +126,7 @@ class TestContentRewriter(object): headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701mp_') - assert is_rw + assert is_rw == False assert ('Content-Type', 'text/html; charset=utf-8') in headers.headers assert b''.join(gen).decode('utf-8') == exp @@ -333,9 +333,19 @@ class TestContentRewriter(object): assert ('Content-Type', 'text/html; charset=utf-8') in headers.headers - assert is_rw == True + assert is_rw == False assert b''.join(gen) == content + def test_binary_wrong_content_type_html_rw(self): + headers = {'Content-Type': 'text/html; charset=utf-8'} + content = b'Hello link' + headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701mp_') + + assert ('Content-Type', 'text/html; charset=utf-8') in headers.headers + + assert is_rw + assert b''.join(gen) == b'Hello link' + def test_binary_wrong_content_type_css(self): headers = {'Content-Type': 'text/css; charset=utf-8'} content = b'\xe9\x11\x12\x13\x14' diff --git a/pywb/rewrite/test/test_header_rewriter.py b/pywb/rewrite/test/test_header_rewriter.py index 98870221..3755a842 100644 --- a/pywb/rewrite/test/test_header_rewriter.py +++ b/pywb/rewrite/test/test_header_rewriter.py @@ -43,15 +43,15 @@ class TestHeaderRewriter(object): res = """\ HTTP/1.0 200 OK\r\n\ Date: Fri, 03 Jan 2014 03:03:21 GMT\r\n\ -Content-Length: 5\r\n\ +X-Archive-Orig-Content-Length: 5\r\n\ Content-Type: text/html;charset=UTF-8\r\n\ """ rwinfo = self.do_rewrite('200 OK', headers) http_headers = DefaultHeaderRewriter(rwinfo)() assert str(http_headers) == res - assert rwinfo.text_type == 'html' - assert rwinfo.charset == 'utf-8' + assert rwinfo.text_type == None + assert rwinfo.charset == None def test_header_rewrite_redirect(self): headers = [('Connection', 'close'),