wrong encoding fallback: don't rely on content-type charset=utf-8 as being accurate! (#380)

- only use the utf-8 decoding optimization for html
- when parsing as html, if utf-8 decoding fails, fall back to iso-8859-1/latin-1 for the remainder (this will usually happen right away, e.g. if the content is actually binary)
- tests: add tests rewriting css and html with a wrong charset
2025-03-24 06:59:52 +01:00 · 2018-09-11 11:51:09 -07:00 · 2018-09-11 11:51:09 -07:00 · adf34cdb35
commit adf34cdb35
parent 348e434bee
2 changed files with 31 additions and 2 deletions
--- a/pywb/rewrite/content_rewriter.py
+++ b/pywb/rewrite/content_rewriter.py
@ -298,9 +298,10 @@ class StreamingRewriter(object):
        try:
            buff = self.first_buff
            # for html rewriting:
            # if charset is utf-8, use that, otherwise default to encode to ascii-compatible encoding
            # encoding only used for url rewriting, encoding back to bytes after rewriting
-            if rwinfo.charset == 'utf-8':
+            if rwinfo.charset == 'utf-8' and rwinfo.text_type == 'html':
                charset = 'utf-8'
            else:
                charset = 'iso-8859-1'
@ -318,7 +319,15 @@ class StreamingRewriter(object):
                if self.align_to_line:
                    buff += stream.readline()
-                buff = decoder.decode(buff)
+                try:
                    buff = decoder.decode(buff)
                except UnicodeDecodeError:
                    if charset == 'utf-8':
                        rwinfo.charset = 'iso-8859-1'
                        charset = rwinfo.charset
                        decoder = codecs.getincrementaldecoder(charset)()
                        buff = decoder.decode(buff)
                buff = self.rewrite(buff)
                yield buff.encode(charset)
--- a/pywb/rewrite/test/test_content_rewriter.py
+++ b/pywb/rewrite/test/test_content_rewriter.py
@ -295,6 +295,26 @@ class TestContentRewriter(object):
        assert is_rw == False
    def test_binary_wrong_content_type_html(self):
        """Non-UTF-8 bytes labelled ``text/html; charset=utf-8`` pass through intact.

        The payload starts with 0xE9 followed by a non-continuation byte, so
        it is not valid UTF-8 even though the Content-Type header says it is.
        The record must still be reported as rewritten, keep its original
        Content-Type header, and yield exactly the original bytes.
        """
        declared_type = 'text/html; charset=utf-8'
        payload = b'\xe9\x11\x12\x13\x14'

        result_headers, body_iter, rewritten = self.rewrite_record(
            {'Content-Type': declared_type}, payload, ts='201701mp_')

        # The declared (incorrect) charset is left untouched in the response headers.
        assert ('Content-Type', declared_type) in result_headers.headers
        assert rewritten == True
        # Round-tripping through the rewriter must not alter a single byte.
        assert b''.join(body_iter) == payload
    def test_binary_wrong_content_type_css(self):
        """Non-UTF-8 bytes labelled ``text/css; charset=utf-8`` pass through intact.

        The payload is not valid UTF-8 despite the declared charset. The
        record must still be reported as rewritten, keep its original
        Content-Type header, and yield exactly the original bytes.
        """
        declared_type = 'text/css; charset=utf-8'
        payload = b'\xe9\x11\x12\x13\x14'

        result_headers, body_iter, rewritten = self.rewrite_record(
            {'Content-Type': declared_type}, payload, ts='201701cs_')

        # The declared (incorrect) charset is left untouched in the response headers.
        assert ('Content-Type', declared_type) in result_headers.headers
        assert rewritten == True
        # Round-tripping through the rewriter must not alter a single byte.
        assert b''.join(body_iter) == payload
    def test_binary_dechunk(self):
        headers = {'Content-Type': 'application/octet-stream',
                   'Transfer-Encoding': 'chunked'}