From adf34cdb3569ac3f5999cb8b07ff98f8ede168c1 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 11 Sep 2018 11:51:09 -0700 Subject: [PATCH] wrong encoding fallback: don't rely on content-type charset=utf-8 as being accurate! (#380) - only use utf-8 decoding optimization for html - when parsing as html, if utf-8 decoding fails, fall back to iso-8859-1/latin-1 for the remainder (this usually happens right away, e.g. if the content is actually binary) - tests: add tests rewriting css and html with wrong charset --- pywb/rewrite/content_rewriter.py | 13 +++++++++++-- pywb/rewrite/test/test_content_rewriter.py | 20 ++++++++++++++++++++ 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/pywb/rewrite/content_rewriter.py b/pywb/rewrite/content_rewriter.py index b6eb8f43..a37281ae 100644 --- a/pywb/rewrite/content_rewriter.py +++ b/pywb/rewrite/content_rewriter.py @@ -298,9 +298,10 @@ class StreamingRewriter(object): try: buff = self.first_buff + # for html rewriting: # if charset is utf-8, use that, otherwise default to encode to ascii-compatible encoding # encoding only used for url rewriting, encoding back to bytes after rewriting - if rwinfo.charset == 'utf-8': + if rwinfo.charset == 'utf-8' and rwinfo.text_type == 'html': charset = 'utf-8' else: charset = 'iso-8859-1' @@ -318,7 +319,15 @@ class StreamingRewriter(object): if self.align_to_line: buff += stream.readline() - buff = decoder.decode(buff) + try: + buff = decoder.decode(buff) + except UnicodeDecodeError: + if charset == 'utf-8': + rwinfo.charset = 'iso-8859-1' + charset = rwinfo.charset + decoder = codecs.getincrementaldecoder(charset)() + buff = decoder.decode(buff) + buff = self.rewrite(buff) yield buff.encode(charset) diff --git a/pywb/rewrite/test/test_content_rewriter.py b/pywb/rewrite/test/test_content_rewriter.py index b89c3959..60cbd265 100644 --- a/pywb/rewrite/test/test_content_rewriter.py +++ b/pywb/rewrite/test/test_content_rewriter.py @@ -295,6 +295,26 @@ class TestContentRewriter(object): assert 
is_rw == False + def test_binary_wrong_content_type_html(self): + headers = {'Content-Type': 'text/html; charset=utf-8'} + content = b'\xe9\x11\x12\x13\x14' + headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701mp_') + + assert ('Content-Type', 'text/html; charset=utf-8') in headers.headers + + assert is_rw == True + assert b''.join(gen) == content + + def test_binary_wrong_content_type_css(self): + headers = {'Content-Type': 'text/css; charset=utf-8'} + content = b'\xe9\x11\x12\x13\x14' + headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701cs_') + + assert ('Content-Type', 'text/css; charset=utf-8') in headers.headers + + assert is_rw == True + assert b''.join(gen) == content + def test_binary_dechunk(self): headers = {'Content-Type': 'application/octet-stream', 'Transfer-Encoding': 'chunked'}