mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
Check text/html content to ensure actually html (#428)
* html rewrite: when encountering a 'text/html' content-type, add an html-detection check before assuming the content is html (similar to text/plain). Supersedes #426, fixes #424 -- binary files served under mp_/ as text/html should now be served as binary. When guessing if the content is html, add an additional regex check for text that does not start with '<' -- it may still be html that begins with plain text. This additional check is applied only for the text/html content-type and not for the js_/cs_ mods.
This commit is contained in:
parent
2b8bf76c9a
commit
3235c382a5
@ -348,6 +348,7 @@ class StreamingRewriter(object):
|
|||||||
# ============================================================================
|
# ============================================================================
|
||||||
class RewriteInfo(object):
|
class RewriteInfo(object):
|
||||||
TAG_REGEX = re.compile(b'^\s*\<')
|
TAG_REGEX = re.compile(b'^\s*\<')
|
||||||
|
TAG_REGEX2 = re.compile(b'^.*<\w+[\s>]')
|
||||||
JSON_REGEX = re.compile(b'^\s*[{[][{"]') # if it starts with this then highly likely not HTML
|
JSON_REGEX = re.compile(b'^\s*[{[][{"]') # if it starts with this then highly likely not HTML
|
||||||
|
|
||||||
JSONP_CONTAINS = ['callback=jQuery',
|
JSONP_CONTAINS = ['callback=jQuery',
|
||||||
@ -391,7 +392,7 @@ class RewriteInfo(object):
|
|||||||
text_type = self._resolve_text_type(orig_text_type)
|
text_type = self._resolve_text_type(orig_text_type)
|
||||||
url = self.url_rewriter.wburl.url
|
url = self.url_rewriter.wburl.url
|
||||||
|
|
||||||
if text_type in ('guess-text', 'guess-bin'):
|
if text_type in ('guess-text', 'guess-bin', 'guess-html'):
|
||||||
text_type = None
|
text_type = None
|
||||||
|
|
||||||
if text_type == 'js':
|
if text_type == 'js':
|
||||||
@ -432,8 +433,8 @@ class RewriteInfo(object):
|
|||||||
|
|
||||||
# if html or no-content type, allow resolving on js, css,
|
# if html or no-content type, allow resolving on js, css,
|
||||||
# or other templates
|
# or other templates
|
||||||
if text_type == 'guess-text':
|
if text_type in ('guess-text', 'guess-html'):
|
||||||
if not is_js_or_css and mod not in ('if_', 'mp_', ''):
|
if not is_js_or_css and mod not in ('if_', 'mp_', 'bn_', ''):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# if application/octet-stream binary, only resolve if in js/css content
|
# if application/octet-stream binary, only resolve if in js/css content
|
||||||
@ -449,6 +450,10 @@ class RewriteInfo(object):
|
|||||||
# check if doesn't start with a tag, then likely not html
|
# check if doesn't start with a tag, then likely not html
|
||||||
if self.TAG_REGEX.match(buff):
|
if self.TAG_REGEX.match(buff):
|
||||||
return 'html'
|
return 'html'
|
||||||
|
# perform additional check to see if it has any html tags
|
||||||
|
elif text_type == 'guess-html' and not is_js_or_css:
|
||||||
|
if self.TAG_REGEX2.match(buff):
|
||||||
|
return 'html'
|
||||||
|
|
||||||
if not is_js_or_css:
|
if not is_js_or_css:
|
||||||
return text_type
|
return text_type
|
||||||
|
@ -48,7 +48,7 @@ class DefaultRewriter(BaseContentRewriter):
|
|||||||
|
|
||||||
rewrite_types = {
|
rewrite_types = {
|
||||||
# HTML
|
# HTML
|
||||||
'text/html': 'html',
|
'text/html': 'guess-html',
|
||||||
'application/xhtml': 'html',
|
'application/xhtml': 'html',
|
||||||
'application/xhtml+xml': 'html',
|
'application/xhtml+xml': 'html',
|
||||||
|
|
||||||
|
@ -126,7 +126,7 @@ class TestContentRewriter(object):
|
|||||||
|
|
||||||
headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701mp_')
|
headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701mp_')
|
||||||
|
|
||||||
assert is_rw
|
assert is_rw == False
|
||||||
assert ('Content-Type', 'text/html; charset=utf-8') in headers.headers
|
assert ('Content-Type', 'text/html; charset=utf-8') in headers.headers
|
||||||
assert b''.join(gen).decode('utf-8') == exp
|
assert b''.join(gen).decode('utf-8') == exp
|
||||||
|
|
||||||
@ -333,9 +333,19 @@ class TestContentRewriter(object):
|
|||||||
|
|
||||||
assert ('Content-Type', 'text/html; charset=utf-8') in headers.headers
|
assert ('Content-Type', 'text/html; charset=utf-8') in headers.headers
|
||||||
|
|
||||||
assert is_rw == True
|
assert is_rw == False
|
||||||
assert b''.join(gen) == content
|
assert b''.join(gen) == content
|
||||||
|
|
||||||
|
def test_binary_wrong_content_type_html_rw(self):
|
||||||
|
headers = {'Content-Type': 'text/html; charset=utf-8'}
|
||||||
|
content = b'Hello <a href="/foo.html">link</a>'
|
||||||
|
headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701mp_')
|
||||||
|
|
||||||
|
assert ('Content-Type', 'text/html; charset=utf-8') in headers.headers
|
||||||
|
|
||||||
|
assert is_rw
|
||||||
|
assert b''.join(gen) == b'Hello <a href="/prefix/201701/http://example.com/foo.html">link</a>'
|
||||||
|
|
||||||
def test_binary_wrong_content_type_css(self):
|
def test_binary_wrong_content_type_css(self):
|
||||||
headers = {'Content-Type': 'text/css; charset=utf-8'}
|
headers = {'Content-Type': 'text/css; charset=utf-8'}
|
||||||
content = b'\xe9\x11\x12\x13\x14'
|
content = b'\xe9\x11\x12\x13\x14'
|
||||||
|
@ -43,15 +43,15 @@ class TestHeaderRewriter(object):
|
|||||||
res = """\
|
res = """\
|
||||||
HTTP/1.0 200 OK\r\n\
|
HTTP/1.0 200 OK\r\n\
|
||||||
Date: Fri, 03 Jan 2014 03:03:21 GMT\r\n\
|
Date: Fri, 03 Jan 2014 03:03:21 GMT\r\n\
|
||||||
Content-Length: 5\r\n\
|
X-Archive-Orig-Content-Length: 5\r\n\
|
||||||
Content-Type: text/html;charset=UTF-8\r\n\
|
Content-Type: text/html;charset=UTF-8\r\n\
|
||||||
"""
|
"""
|
||||||
rwinfo = self.do_rewrite('200 OK', headers)
|
rwinfo = self.do_rewrite('200 OK', headers)
|
||||||
http_headers = DefaultHeaderRewriter(rwinfo)()
|
http_headers = DefaultHeaderRewriter(rwinfo)()
|
||||||
assert str(http_headers) == res
|
assert str(http_headers) == res
|
||||||
|
|
||||||
assert rwinfo.text_type == 'html'
|
assert rwinfo.text_type == None
|
||||||
assert rwinfo.charset == 'utf-8'
|
assert rwinfo.charset == None
|
||||||
|
|
||||||
def test_header_rewrite_redirect(self):
|
def test_header_rewrite_redirect(self):
|
||||||
headers = [('Connection', 'close'),
|
headers = [('Connection', 'close'),
|
||||||
|
Loading…
x
Reference in New Issue
Block a user