mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
Add content sniffing to the html check of _fill_text_type_and_charset
when the url ends with .json (#367)
Detect whether .json URLs served with MIME type text/html are actually JSON and not HTML. Tests: updated test_content_rewriter.py to test for JSON sent with MIME type text/html.
This commit is contained in:
parent
b4d4be8a64
commit
d62ab14914
@ -312,6 +312,7 @@ class StreamingRewriter(object):
|
||||
# ============================================================================
|
||||
class RewriteInfo(object):
|
||||
TAG_REGEX = re.compile(b'^\s*\<')
|
||||
JSON_REGEX = re.compile(b'^\s*[{[][{"]') # if it starts with this then highly likely not HTML
|
||||
|
||||
JSONP_CONTAINS = ['callback=jQuery',
|
||||
'callback=jsonp',
|
||||
@ -363,6 +364,10 @@ class RewriteInfo(object):
|
||||
text_type = 'json'
|
||||
|
||||
if (text_type and orig_text_type != text_type) or text_type == 'html':
|
||||
if url.endswith('.json'):
|
||||
buff = self.read_and_keep(56)
|
||||
if self.JSON_REGEX.match(buff) is not None:
|
||||
return 'json', charset
|
||||
# check if default content_type that needs to be set
|
||||
new_mime = content_rewriter.default_content_types.get(text_type)
|
||||
|
||||
@ -392,7 +397,7 @@ class RewriteInfo(object):
|
||||
# if html or no-content type, allow resolving on js, css,
|
||||
# or other templates
|
||||
if text_type == 'guess-text':
|
||||
if not is_js_or_css and not mod in ('if_', 'mp_', ''):
|
||||
if not is_js_or_css and mod not in ('if_', 'mp_', ''):
|
||||
return None
|
||||
|
||||
# if application/octet-stream binary, only resolve if in js/css content
|
||||
|
@ -617,3 +617,11 @@ http://example.com/video_4.m3u8
|
||||
|
||||
assert b''.join(gen).decode('utf-8') == filtered
|
||||
|
||||
def test_json_body_but_mime_html(self):
|
||||
headers = {'Content-Type': 'text/html'}
|
||||
content = '{"foo":"bar", "dash": {"on": "true"}'
|
||||
headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701mp_',
|
||||
url='http://example.com/path/file.json')
|
||||
assert headers.headers == [('Content-Type', 'text/html')]
|
||||
result = b''.join(gen).decode('utf-8')
|
||||
assert result == content
|
||||
|
Loading…
x
Reference in New Issue
Block a user