diff --git a/pywb/rewrite/content_rewriter.py b/pywb/rewrite/content_rewriter.py
index a37281ae..ebf5c72b 100644
--- a/pywb/rewrite/content_rewriter.py
+++ b/pywb/rewrite/content_rewriter.py
@@ -348,6 +348,7 @@ class StreamingRewriter(object):
# ============================================================================
class RewriteInfo(object):
TAG_REGEX = re.compile(b'^\s*\<')
+ TAG_REGEX2 = re.compile(br'^.*<\w+[\s>]')  # an opening tag such as '<a ' or '<p>' somewhere on the first line of the buffer
JSON_REGEX = re.compile(b'^\s*[{[][{"]') # if it starts with this then highly likely not HTML
JSONP_CONTAINS = ['callback=jQuery',
@@ -391,7 +392,7 @@ class RewriteInfo(object):
text_type = self._resolve_text_type(orig_text_type)
url = self.url_rewriter.wburl.url
- if text_type in ('guess-text', 'guess-bin'):
+ if text_type in ('guess-text', 'guess-bin', 'guess-html'):
text_type = None
if text_type == 'js':
@@ -432,8 +433,8 @@ class RewriteInfo(object):
# if html or no-content type, allow resolving on js, css,
# or other templates
- if text_type == 'guess-text':
- if not is_js_or_css and mod not in ('if_', 'mp_', ''):
+ if text_type in ('guess-text', 'guess-html'):
+ if not is_js_or_css and mod not in ('if_', 'mp_', 'bn_', ''):
return None
# if application/octet-stream binary, only resolve if in js/css content
@@ -449,6 +450,10 @@ class RewriteInfo(object):
# check if doesn't start with a tag, then likely not html
if self.TAG_REGEX.match(buff):
return 'html'
+ # otherwise, for 'guess-html' content, look for an opening html tag on the first line of the buffer
+ elif text_type == 'guess-html' and not is_js_or_css:
+ if self.TAG_REGEX2.match(buff):
+ return 'html'
if not is_js_or_css:
return text_type
diff --git a/pywb/rewrite/default_rewriter.py b/pywb/rewrite/default_rewriter.py
index e2ce966f..82a9ebc3 100644
--- a/pywb/rewrite/default_rewriter.py
+++ b/pywb/rewrite/default_rewriter.py
@@ -48,7 +48,7 @@ class DefaultRewriter(BaseContentRewriter):
rewrite_types = {
# HTML
- 'text/html': 'html',
+ 'text/html': 'guess-html',
'application/xhtml': 'html',
'application/xhtml+xml': 'html',
diff --git a/pywb/rewrite/test/test_content_rewriter.py b/pywb/rewrite/test/test_content_rewriter.py
index 385896e0..59068bc7 100644
--- a/pywb/rewrite/test/test_content_rewriter.py
+++ b/pywb/rewrite/test/test_content_rewriter.py
@@ -126,7 +126,7 @@ class TestContentRewriter(object):
headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701mp_')
- assert is_rw
+ assert is_rw == False
assert ('Content-Type', 'text/html; charset=utf-8') in headers.headers
assert b''.join(gen).decode('utf-8') == exp
@@ -333,9 +333,19 @@ class TestContentRewriter(object):
assert ('Content-Type', 'text/html; charset=utf-8') in headers.headers
- assert is_rw == True
+ assert is_rw == False
assert b''.join(gen) == content
+ def test_binary_wrong_content_type_html_rw(self):  # text/html response that is expected to be rewritten (is_rw truthy)
+ headers = {'Content-Type': 'text/html; charset=utf-8'}
+ content = b'Hello link'  # NOTE(review): contains no '<', so neither TAG_REGEX nor TAG_REGEX2 can match it -- confirm markup was not lost from this literal
+ headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701mp_')
+
+ assert ('Content-Type', 'text/html; charset=utf-8') in headers.headers  # Content-Type must come back unchanged
+
+ assert is_rw  # the record must be reported as rewritten
+ assert b''.join(gen) == b'Hello link'  # NOTE(review): identical to the input, so this shows no rewriting -- confirm the expected bytes
+
def test_binary_wrong_content_type_css(self):
headers = {'Content-Type': 'text/css; charset=utf-8'}
content = b'\xe9\x11\x12\x13\x14'
diff --git a/pywb/rewrite/test/test_header_rewriter.py b/pywb/rewrite/test/test_header_rewriter.py
index 98870221..3755a842 100644
--- a/pywb/rewrite/test/test_header_rewriter.py
+++ b/pywb/rewrite/test/test_header_rewriter.py
@@ -43,15 +43,15 @@ class TestHeaderRewriter(object):
res = """\
HTTP/1.0 200 OK\r\n\
Date: Fri, 03 Jan 2014 03:03:21 GMT\r\n\
-Content-Length: 5\r\n\
+X-Archive-Orig-Content-Length: 5\r\n\
Content-Type: text/html;charset=UTF-8\r\n\
"""
rwinfo = self.do_rewrite('200 OK', headers)
http_headers = DefaultHeaderRewriter(rwinfo)()
assert str(http_headers) == res
- assert rwinfo.text_type == 'html'
- assert rwinfo.charset == 'utf-8'
+ assert rwinfo.text_type == None
+ assert rwinfo.charset == None
def test_header_rewrite_redirect(self):
headers = [('Connection', 'close'),