mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
wrong encoding fallback: don't rely on content-type charset=utf-8 as being accurate! (#380)
- only use the utf-8 decoding optimization for html
- when parsing as html, if utf-8 decoding fails, fall back to iso-8859-1/latin-1 for the remainder (this usually happens right away, e.g. if the content is actually binary)
- tests: add tests for rewriting css and html with a wrong charset
This commit is contained in:
parent
348e434bee
commit
adf34cdb35
@ -298,9 +298,10 @@ class StreamingRewriter(object):
|
|||||||
try:
|
try:
|
||||||
buff = self.first_buff
|
buff = self.first_buff
|
||||||
|
|
||||||
|
# for html rewriting:
|
||||||
# if charset is utf-8, use that, otherwise default to encode to ascii-compatible encoding
|
# if charset is utf-8, use that, otherwise default to encode to ascii-compatible encoding
|
||||||
# encoding only used for url rewriting, encoding back to bytes after rewriting
|
# encoding only used for url rewriting, encoding back to bytes after rewriting
|
||||||
if rwinfo.charset == 'utf-8':
|
if rwinfo.charset == 'utf-8' and rwinfo.text_type == 'html':
|
||||||
charset = 'utf-8'
|
charset = 'utf-8'
|
||||||
else:
|
else:
|
||||||
charset = 'iso-8859-1'
|
charset = 'iso-8859-1'
|
||||||
@ -318,7 +319,15 @@ class StreamingRewriter(object):
|
|||||||
if self.align_to_line:
|
if self.align_to_line:
|
||||||
buff += stream.readline()
|
buff += stream.readline()
|
||||||
|
|
||||||
buff = decoder.decode(buff)
|
try:
|
||||||
|
buff = decoder.decode(buff)
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
if charset == 'utf-8':
|
||||||
|
rwinfo.charset = 'iso-8859-1'
|
||||||
|
charset = rwinfo.charset
|
||||||
|
decoder = codecs.getincrementaldecoder(charset)()
|
||||||
|
buff = decoder.decode(buff)
|
||||||
|
|
||||||
buff = self.rewrite(buff)
|
buff = self.rewrite(buff)
|
||||||
|
|
||||||
yield buff.encode(charset)
|
yield buff.encode(charset)
|
||||||
|
@ -295,6 +295,26 @@ class TestContentRewriter(object):
|
|||||||
|
|
||||||
assert is_rw == False
|
assert is_rw == False
|
||||||
|
|
||||||
|
def test_binary_wrong_content_type_html(self):
|
||||||
|
headers = {'Content-Type': 'text/html; charset=utf-8'}
|
||||||
|
content = b'\xe9\x11\x12\x13\x14'
|
||||||
|
headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701mp_')
|
||||||
|
|
||||||
|
assert ('Content-Type', 'text/html; charset=utf-8') in headers.headers
|
||||||
|
|
||||||
|
assert is_rw == True
|
||||||
|
assert b''.join(gen) == content
|
||||||
|
|
||||||
|
def test_binary_wrong_content_type_css(self):
|
||||||
|
headers = {'Content-Type': 'text/css; charset=utf-8'}
|
||||||
|
content = b'\xe9\x11\x12\x13\x14'
|
||||||
|
headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701cs_')
|
||||||
|
|
||||||
|
assert ('Content-Type', 'text/css; charset=utf-8') in headers.headers
|
||||||
|
|
||||||
|
assert is_rw == True
|
||||||
|
assert b''.join(gen) == content
|
||||||
|
|
||||||
def test_binary_dechunk(self):
|
def test_binary_dechunk(self):
|
||||||
headers = {'Content-Type': 'application/octet-stream',
|
headers = {'Content-Type': 'application/octet-stream',
|
||||||
'Transfer-Encoding': 'chunked'}
|
'Transfer-Encoding': 'chunked'}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user