mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
Add content sniffing to the html check of _fill_text_type_and_charset
when the url ends with .json (#367)
Detect whether .json URLs served with MIME type text/html are actually JSON and not HTML. Tests: updated test_content_rewriter.py to cover JSON sent as MIME text/html.
This commit is contained in:
parent
b4d4be8a64
commit
d62ab14914
@ -312,6 +312,7 @@ class StreamingRewriter(object):
|
|||||||
# ============================================================================
|
# ============================================================================
|
||||||
class RewriteInfo(object):
|
class RewriteInfo(object):
|
||||||
TAG_REGEX = re.compile(b'^\s*\<')
|
TAG_REGEX = re.compile(b'^\s*\<')
|
||||||
|
JSON_REGEX = re.compile(b'^\s*[{[][{"]') # if it starts with this then highly likely not HTML
|
||||||
|
|
||||||
JSONP_CONTAINS = ['callback=jQuery',
|
JSONP_CONTAINS = ['callback=jQuery',
|
||||||
'callback=jsonp',
|
'callback=jsonp',
|
||||||
@ -363,6 +364,10 @@ class RewriteInfo(object):
|
|||||||
text_type = 'json'
|
text_type = 'json'
|
||||||
|
|
||||||
if (text_type and orig_text_type != text_type) or text_type == 'html':
|
if (text_type and orig_text_type != text_type) or text_type == 'html':
|
||||||
|
if url.endswith('.json'):
|
||||||
|
buff = self.read_and_keep(56)
|
||||||
|
if self.JSON_REGEX.match(buff) is not None:
|
||||||
|
return 'json', charset
|
||||||
# check if default content_type that needs to be set
|
# check if default content_type that needs to be set
|
||||||
new_mime = content_rewriter.default_content_types.get(text_type)
|
new_mime = content_rewriter.default_content_types.get(text_type)
|
||||||
|
|
||||||
@ -392,7 +397,7 @@ class RewriteInfo(object):
|
|||||||
# if html or no-content type, allow resolving on js, css,
|
# if html or no-content type, allow resolving on js, css,
|
||||||
# or other templates
|
# or other templates
|
||||||
if text_type == 'guess-text':
|
if text_type == 'guess-text':
|
||||||
if not is_js_or_css and not mod in ('if_', 'mp_', ''):
|
if not is_js_or_css and mod not in ('if_', 'mp_', ''):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# if application/octet-stream binary, only resolve if in js/css content
|
# if application/octet-stream binary, only resolve if in js/css content
|
||||||
|
@ -617,3 +617,11 @@ http://example.com/video_4.m3u8
|
|||||||
|
|
||||||
assert b''.join(gen).decode('utf-8') == filtered
|
assert b''.join(gen).decode('utf-8') == filtered
|
||||||
|
|
||||||
|
def test_json_body_but_mime_html(self):
|
||||||
|
headers = {'Content-Type': 'text/html'}
|
||||||
|
content = '{"foo":"bar", "dash": {"on": "true"}'
|
||||||
|
headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701mp_',
|
||||||
|
url='http://example.com/path/file.json')
|
||||||
|
assert headers.headers == [('Content-Type', 'text/html')]
|
||||||
|
result = b''.join(gen).decode('utf-8')
|
||||||
|
assert result == content
|
||||||
|
Loading…
x
Reference in New Issue
Block a user