From cabb488f4ec39554502ec7ada5ed482d972f8363 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 6 Sep 2018 10:32:54 -0700 Subject: [PATCH] Encoding Fix (#376) * encoding fix: a better fix from #361: - when dealing with unicode urls, don't assume always %-encoded. if no change, (eg. anchor), then return url in original encoding - utf-8 optimization: if content is known to be in utf-8, use utf-8 directly, don't decode as iso-8859-1 and then re-encode to utf-8 for rewriting * content rewriter decoding fix: use incrementaldecoder for incrementally decoding utf-8 stream tests: add test which splits utf-8 char along 16k boundary to test incremental decoding --- pywb/rewrite/content_rewriter.py | 28 +++++++++++--- pywb/rewrite/html_rewriter.py | 14 +++++-- pywb/rewrite/test/test_content_rewriter.py | 44 ++++++++++++++++++++++ 3 files changed, 76 insertions(+), 10 deletions(-) diff --git a/pywb/rewrite/content_rewriter.py b/pywb/rewrite/content_rewriter.py index f77cc13e..18805772 100644 --- a/pywb/rewrite/content_rewriter.py +++ b/pywb/rewrite/content_rewriter.py @@ -9,6 +9,7 @@ import re import webencodings import tempfile import json +import codecs from pywb.utils.io import StreamIter, BUFF_SIZE @@ -277,7 +278,7 @@ class StreamingRewriter(object): self.first_buff = first_buff def __call__(self, rwinfo): - return self.rewrite_text_stream_to_gen(rwinfo.content_stream) + return self.rewrite_text_stream_to_gen(rwinfo.content_stream, rwinfo) def rewrite(self, string): return string @@ -288,7 +289,7 @@ class StreamingRewriter(object): def final_read(self): return '' - def rewrite_text_stream_to_gen(self, stream): + def rewrite_text_stream_to_gen(self, stream, rwinfo): """ Convert stream to generator using applying rewriting func to each portion of the stream. @@ -297,8 +298,17 @@ class StreamingRewriter(object): try: buff = self.first_buff + # if charset is utf-8, use that, otherwise default to encode to ascii-compatible encoding + # encoding only used for url rewriting, encoding back to bytes after rewriting + if rwinfo.charset == 'utf-8': + charset = 'utf-8' + else: + charset = 'iso-8859-1' + if buff: - yield buff.encode('iso-8859-1') + yield buff.encode(charset) + + decoder = codecs.getincrementaldecoder(charset)() while True: buff = stream.read(BUFF_SIZE) @@ -308,13 +318,19 @@ class StreamingRewriter(object): if self.align_to_line: buff += stream.readline() - buff = self.rewrite(buff.decode('iso-8859-1')) - yield buff.encode('iso-8859-1') + buff = decoder.decode(buff) + buff = self.rewrite(buff) + + yield buff.encode(charset) # For adding a tail/handling final buffer buff = self.final_read() + + # ensure decoder is marked as finished (final buffer already decoded) + decoder.decode(b'', final=True) + if buff: - yield buff.encode('iso-8859-1') + yield buff.encode(charset) finally: stream.close() diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py index 792fd716..f8848c2d 100644 --- a/pywb/rewrite/html_rewriter.py +++ b/pywb/rewrite/html_rewriter.py @@ -231,10 +231,12 @@ class HTMLRewriterMixin(StreamingRewriter): if not value: return '' - # if url is not ascii, ensure its reencoded in expected charset - try: - value.encode('ascii') - except: + + orig_value = value + + # if not utf-8, then stream was encoded as iso-8859-1, and need to reencode + # into correct charset + if self.charset != 'utf-8' and self.charset != 'iso-8859-1': try: value = value.encode('iso-8859-1').decode(self.charset) except: @@ -243,6 +245,10 @@ class HTMLRewriterMixin(StreamingRewriter): unesc_value = self.try_unescape(value) rewritten_value = self.url_rewriter.rewrite(unesc_value, mod, force_abs) + # if no rewriting has occured, ensure we return original, not reencoded value + if rewritten_value == value: + return orig_value + if unesc_value != value and rewritten_value != unesc_value: rewritten_value = rewritten_value.replace(unesc_value, value) diff --git a/pywb/rewrite/test/test_content_rewriter.py b/pywb/rewrite/test/test_content_rewriter.py index f7cd42bb..6d100a26 100644 --- a/pywb/rewrite/test/test_content_rewriter.py +++ b/pywb/rewrite/test/test_content_rewriter.py @@ -110,6 +110,17 @@ class TestContentRewriter(object): assert ('Content-Type', 'text/html; charset=UTF-8') in headers.headers assert b''.join(gen).decode('utf-8') == exp + def test_rewrite_text_utf_8_long(self): + headers = {'Content-Type': 'text/html; charset=utf-8'} + exp = u'éeé' * 3277 + content = exp.encode('utf-8') + + headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701mp_') + + assert is_rw + assert ('Content-Type', 'text/html; charset=utf-8') in headers.headers + assert b''.join(gen).decode('utf-8') == exp + def test_rewrite_html_utf_8(self): headers = {'Content-Type': 'text/html; charset=utf-8'} content = u'' @@ -121,6 +132,39 @@ class TestContentRewriter(object): assert ('Content-Type', 'text/html; charset=utf-8') in headers.headers assert b''.join(gen).decode('utf-8') == exp + def test_rewrite_html_utf_8_anchor(self): + headers = {'Content-Type': 'text/html; charset=utf-8'} + content = u'' + + headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701mp_') + + exp = u'' + assert is_rw + assert ('Content-Type', 'text/html; charset=utf-8') in headers.headers + assert b''.join(gen).decode('utf-8') == exp + + def test_rewrite_html_other_encoding(self): + headers = {'Content-Type': 'text/html; charset=latin-1'} + content = b'' + + headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701mp_') + + exp = '' + assert is_rw + assert ('Content-Type', 'text/html; charset=latin-1') in headers.headers + assert b''.join(gen).decode('latin-1') == exp + + def test_rewrite_html_other_encoding_anchor(self): + headers = {'Content-Type': 'text/html; charset=latin-1'} + content = b'' + + headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701mp_') + + exp = u'' + assert is_rw + assert ('Content-Type', 'text/html; charset=latin-1') in headers.headers + assert b''.join(gen).decode('latin-1') == exp + def test_rewrite_html_js_mod(self, headers): content = ''