1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Add content sniffing to the html check of _fill_text_type_and_charset when the url ends with .json (#367)

Detect if .json urls served with mime text/html are actually json and not html.

Tests: updated test_content_rewriter.py to test for json sent as mime text/html
This commit is contained in:
John Berlin 2018-08-20 18:03:28 -04:00 committed by Ilya Kreymer
parent b4d4be8a64
commit d62ab14914
2 changed files with 14 additions and 1 deletions

View File

@ -312,6 +312,7 @@ class StreamingRewriter(object):
# ============================================================================
class RewriteInfo(object):
TAG_REGEX = re.compile(b'^\s*\<')
JSON_REGEX = re.compile(b'^\s*[{[][{"]') # if it starts with this then highly likely not HTML
JSONP_CONTAINS = ['callback=jQuery',
'callback=jsonp',
@ -363,6 +364,10 @@ class RewriteInfo(object):
text_type = 'json'
if (text_type and orig_text_type != text_type) or text_type == 'html':
if url.endswith('.json'):
buff = self.read_and_keep(56)
if self.JSON_REGEX.match(buff) is not None:
return 'json', charset
# check if default content_type that needs to be set
new_mime = content_rewriter.default_content_types.get(text_type)
@ -392,7 +397,7 @@ class RewriteInfo(object):
# if html or no-content type, allow resolving on js, css,
# or other templates
if text_type == 'guess-text':
if not is_js_or_css and not mod in ('if_', 'mp_', ''):
if not is_js_or_css and mod not in ('if_', 'mp_', ''):
return None
# if application/octet-stream binary, only resolve if in js/css content

View File

@ -617,3 +617,11 @@ http://example.com/video_4.m3u8
assert b''.join(gen).decode('utf-8') == filtered
def test_json_body_but_mime_html(self):
    """Check a JSON body served as ``text/html`` from a ``.json`` url.

    The record is rewritten with the ``mp_`` modifier; the test expects the
    Content-Type header to stay ``text/html`` and the body to come back
    exactly as it was supplied.
    """
    # NOTE(review): the payload has no final closing brace -- confirm this
    # is intentional and not a truncated fixture.
    body = '{"foo":"bar", "dash": {"on": "true"}'

    rewritten_headers, chunks, _ = self.rewrite_record(
        {'Content-Type': 'text/html'},
        body,
        ts='201701mp_',
        url='http://example.com/path/file.json')

    # the mime type reported to the client is left untouched
    assert rewritten_headers.headers == [('Content-Type', 'text/html')]

    # the body must be passed through unchanged
    assert b''.join(chunks).decode('utf-8') == body