1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Encoding Fix (#376)

* encoding fix: a better fix from #361:
- when dealing with unicode urls, don't assume they are always %-encoded. If rewriting makes no change (e.g. an anchor), then return the url in its original encoding
- utf-8 optimization: if content is known to be in utf-8, use utf-8 directly, don't decode as iso-8859-1 and then re-encode to utf-8 for rewriting

* content rewriter decoding fix: use incrementaldecoder for incrementally decoding utf-8 stream
tests: add test which splits utf-8 char along 16k boundary to test incremental decoding
This commit is contained in:
Ilya Kreymer 2018-09-06 10:32:54 -07:00 committed by John Berlin
parent 5c00743bdd
commit cabb488f4e
3 changed files with 76 additions and 10 deletions

View File

@ -9,6 +9,7 @@ import re
import webencodings
import tempfile
import json
import codecs
from pywb.utils.io import StreamIter, BUFF_SIZE
@ -277,7 +278,7 @@ class StreamingRewriter(object):
self.first_buff = first_buff
def __call__(self, rwinfo):
return self.rewrite_text_stream_to_gen(rwinfo.content_stream)
return self.rewrite_text_stream_to_gen(rwinfo.content_stream, rwinfo)
def rewrite(self, string):
return string
@ -288,7 +289,7 @@ class StreamingRewriter(object):
def final_read(self):
return ''
def rewrite_text_stream_to_gen(self, stream):
def rewrite_text_stream_to_gen(self, stream, rwinfo):
"""
Convert stream to generator by applying the rewriting func
to each portion of the stream.
@ -297,8 +298,17 @@ class StreamingRewriter(object):
try:
buff = self.first_buff
# if charset is utf-8, use that, otherwise default to encode to ascii-compatible encoding
# encoding only used for url rewriting, encoding back to bytes after rewriting
if rwinfo.charset == 'utf-8':
charset = 'utf-8'
else:
charset = 'iso-8859-1'
if buff:
yield buff.encode('iso-8859-1')
yield buff.encode(charset)
decoder = codecs.getincrementaldecoder(charset)()
while True:
buff = stream.read(BUFF_SIZE)
@ -308,13 +318,19 @@ class StreamingRewriter(object):
if self.align_to_line:
buff += stream.readline()
buff = self.rewrite(buff.decode('iso-8859-1'))
yield buff.encode('iso-8859-1')
buff = decoder.decode(buff)
buff = self.rewrite(buff)
yield buff.encode(charset)
# For adding a tail/handling final buffer
buff = self.final_read()
# ensure decoder is marked as finished (final buffer already decoded)
decoder.decode(b'', final=True)
if buff:
yield buff.encode('iso-8859-1')
yield buff.encode(charset)
finally:
stream.close()

View File

@ -231,10 +231,12 @@ class HTMLRewriterMixin(StreamingRewriter):
if not value:
return ''
# if url is not ascii, ensure its reencoded in expected charset
try:
value.encode('ascii')
except:
orig_value = value
# if not utf-8, then stream was encoded as iso-8859-1, and need to reencode
# into correct charset
if self.charset != 'utf-8' and self.charset != 'iso-8859-1':
try:
value = value.encode('iso-8859-1').decode(self.charset)
except:
@ -243,6 +245,10 @@ class HTMLRewriterMixin(StreamingRewriter):
unesc_value = self.try_unescape(value)
rewritten_value = self.url_rewriter.rewrite(unesc_value, mod, force_abs)
# if no rewriting has occurred, ensure we return original, not reencoded value
if rewritten_value == value:
return orig_value
if unesc_value != value and rewritten_value != unesc_value:
rewritten_value = rewritten_value.replace(unesc_value, value)

View File

@ -110,6 +110,17 @@ class TestContentRewriter(object):
assert ('Content-Type', 'text/html; charset=UTF-8') in headers.headers
assert b''.join(gen).decode('utf-8') == exp
def test_rewrite_text_utf_8_long(self):
    """
    A long utf-8 body must come back from the rewriter unchanged.

    The body is 3277 repetitions of a 5-byte utf-8 sequence, i.e. 16385
    bytes, so a two-byte character straddles the 16384-byte mark.
    NOTE(review): presumably this lines up with the rewriter's read
    buffer size (BUFF_SIZE) -- confirm against pywb.utils.io.
    """
    expected = u'éeé' * 3277

    resp_headers, body_iter, rewritten = self.rewrite_record(
        {'Content-Type': 'text/html; charset=utf-8'},
        expected.encode('utf-8'),
        ts='201701mp_')

    assert rewritten
    # the declared charset must be passed through untouched
    assert ('Content-Type', 'text/html; charset=utf-8') in resp_headers.headers
    assert b''.join(body_iter).decode('utf-8') == expected
def test_rewrite_html_utf_8(self):
headers = {'Content-Type': 'text/html; charset=utf-8'}
content = u'<html><body><a href="http://éxample.com/tésté"></a></body></html>'
@ -121,6 +132,39 @@ class TestContentRewriter(object):
assert ('Content-Type', 'text/html; charset=utf-8') in headers.headers
assert b''.join(gen).decode('utf-8') == exp
def test_rewrite_html_utf_8_anchor(self):
    """
    A non-ascii, anchor-only href in a utf-8 page is expected to be
    emitted exactly as it was received (no prefixing, no re-encoding).
    """
    page = u'<html><body><a href="#éxample-tésté"></a></body></html>'

    resp_headers, body_iter, rewritten = self.rewrite_record(
        {'Content-Type': 'text/html; charset=utf-8'},
        page,
        ts='201701mp_')

    # output is expected to be identical to the input markup
    expected = u'<html><body><a href="#éxample-tésté"></a></body></html>'

    assert rewritten
    assert ('Content-Type', 'text/html; charset=utf-8') in resp_headers.headers
    assert b''.join(body_iter).decode('utf-8') == expected
def test_rewrite_html_other_encoding(self):
    """
    An absolute url containing latin-1 encoded bytes, in a page declared
    as latin-1, is expected to be rewritten with the replay prefix and
    with its non-ascii characters percent-encoded as utf-8 (%C3%A9).
    """
    raw_page = b'<html><body><a href="http://\xe9xample.com/t\xe9st\xe9"></a></body></html>'

    resp_headers, body_iter, rewritten = self.rewrite_record(
        {'Content-Type': 'text/html; charset=latin-1'},
        raw_page,
        ts='201701mp_')

    expected = '<html><body><a href="http://localhost:8080/prefix/201701/http://%C3%A9xample.com/t%C3%A9st%C3%A9"></a></body></html>'

    assert rewritten
    # the original charset declaration is kept on the response
    assert ('Content-Type', 'text/html; charset=latin-1') in resp_headers.headers
    assert b''.join(body_iter).decode('latin-1') == expected
def test_rewrite_html_other_encoding_anchor(self):
    """
    A non-ascii, anchor-only href in a latin-1 page is expected to come
    back unchanged: decoding the output as latin-1 yields the same markup.
    """
    raw_page = b'<html><body><a href="#\xe9xample-t\xe9st\xe9"></a></body></html>'

    resp_headers, body_iter, rewritten = self.rewrite_record(
        {'Content-Type': 'text/html; charset=latin-1'},
        raw_page,
        ts='201701mp_')

    expected = u'<html><body><a href="#éxample-tésté"></a></body></html>'

    assert rewritten
    assert ('Content-Type', 'text/html; charset=latin-1') in resp_headers.headers
    assert b''.join(body_iter).decode('latin-1') == expected
def test_rewrite_html_js_mod(self, headers):
content = '<html><body><a href="http://example.com/"></a></body></html>'