mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
Encoding Fix (#376)
* encoding fix: a better fix from #361: - when dealing with unicode urls, don't assume always %-encoded. if no change, (eg. anchor), then return url in original encoding - utf-8 optimization: if content is known to be in utf-8, use utf-8 directly, don't decode as iso-8859-1 and then re-encode to utf-8 for rewriting * content rewriter decoding fix: use incrementaldecoder for incrementally decoding utf-8 stream tests: add test which splits utf-8 char along 16k boundary to test incremental decoding
This commit is contained in:
parent
5c00743bdd
commit
cabb488f4e
@ -9,6 +9,7 @@ import re
|
||||
import webencodings
|
||||
import tempfile
|
||||
import json
|
||||
import codecs
|
||||
|
||||
from pywb.utils.io import StreamIter, BUFF_SIZE
|
||||
|
||||
@ -277,7 +278,7 @@ class StreamingRewriter(object):
|
||||
self.first_buff = first_buff
|
||||
|
||||
def __call__(self, rwinfo):
|
||||
return self.rewrite_text_stream_to_gen(rwinfo.content_stream)
|
||||
return self.rewrite_text_stream_to_gen(rwinfo.content_stream, rwinfo)
|
||||
|
||||
def rewrite(self, string):
|
||||
return string
|
||||
@ -288,7 +289,7 @@ class StreamingRewriter(object):
|
||||
def final_read(self):
|
||||
return ''
|
||||
|
||||
def rewrite_text_stream_to_gen(self, stream):
|
||||
def rewrite_text_stream_to_gen(self, stream, rwinfo):
|
||||
"""
|
||||
Convert stream to generator by applying rewriting func
|
||||
to each portion of the stream.
|
||||
@ -297,8 +298,17 @@ class StreamingRewriter(object):
|
||||
try:
|
||||
buff = self.first_buff
|
||||
|
||||
# if charset is utf-8, use that, otherwise default to encode to ascii-compatible encoding
|
||||
# encoding only used for url rewriting, encoding back to bytes after rewriting
|
||||
if rwinfo.charset == 'utf-8':
|
||||
charset = 'utf-8'
|
||||
else:
|
||||
charset = 'iso-8859-1'
|
||||
|
||||
if buff:
|
||||
yield buff.encode('iso-8859-1')
|
||||
yield buff.encode(charset)
|
||||
|
||||
decoder = codecs.getincrementaldecoder(charset)()
|
||||
|
||||
while True:
|
||||
buff = stream.read(BUFF_SIZE)
|
||||
@ -308,13 +318,19 @@ class StreamingRewriter(object):
|
||||
if self.align_to_line:
|
||||
buff += stream.readline()
|
||||
|
||||
buff = self.rewrite(buff.decode('iso-8859-1'))
|
||||
yield buff.encode('iso-8859-1')
|
||||
buff = decoder.decode(buff)
|
||||
buff = self.rewrite(buff)
|
||||
|
||||
yield buff.encode(charset)
|
||||
|
||||
# For adding a tail/handling final buffer
|
||||
buff = self.final_read()
|
||||
|
||||
# ensure decoder is marked as finished (final buffer already decoded)
|
||||
decoder.decode(b'', final=True)
|
||||
|
||||
if buff:
|
||||
yield buff.encode('iso-8859-1')
|
||||
yield buff.encode(charset)
|
||||
|
||||
finally:
|
||||
stream.close()
|
||||
|
@ -231,10 +231,12 @@ class HTMLRewriterMixin(StreamingRewriter):
|
||||
if not value:
|
||||
return ''
|
||||
|
||||
# if url is not ascii, ensure it's re-encoded in expected charset
|
||||
try:
|
||||
value.encode('ascii')
|
||||
except:
|
||||
|
||||
orig_value = value
|
||||
|
||||
# if not utf-8, then stream was encoded as iso-8859-1, and need to reencode
|
||||
# into correct charset
|
||||
if self.charset != 'utf-8' and self.charset != 'iso-8859-1':
|
||||
try:
|
||||
value = value.encode('iso-8859-1').decode(self.charset)
|
||||
except:
|
||||
@ -243,6 +245,10 @@ class HTMLRewriterMixin(StreamingRewriter):
|
||||
unesc_value = self.try_unescape(value)
|
||||
rewritten_value = self.url_rewriter.rewrite(unesc_value, mod, force_abs)
|
||||
|
||||
# if no rewriting has occurred, ensure we return original, not re-encoded value
|
||||
if rewritten_value == value:
|
||||
return orig_value
|
||||
|
||||
if unesc_value != value and rewritten_value != unesc_value:
|
||||
rewritten_value = rewritten_value.replace(unesc_value, value)
|
||||
|
||||
|
@ -110,6 +110,17 @@ class TestContentRewriter(object):
|
||||
assert ('Content-Type', 'text/html; charset=UTF-8') in headers.headers
|
||||
assert b''.join(gen).decode('utf-8') == exp
|
||||
|
||||
def test_rewrite_text_utf_8_long(self):
|
||||
headers = {'Content-Type': 'text/html; charset=utf-8'}
|
||||
exp = u'éeé' * 3277
|
||||
content = exp.encode('utf-8')
|
||||
|
||||
headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701mp_')
|
||||
|
||||
assert is_rw
|
||||
assert ('Content-Type', 'text/html; charset=utf-8') in headers.headers
|
||||
assert b''.join(gen).decode('utf-8') == exp
|
||||
|
||||
def test_rewrite_html_utf_8(self):
|
||||
headers = {'Content-Type': 'text/html; charset=utf-8'}
|
||||
content = u'<html><body><a href="http://éxample.com/tésté"></a></body></html>'
|
||||
@ -121,6 +132,39 @@ class TestContentRewriter(object):
|
||||
assert ('Content-Type', 'text/html; charset=utf-8') in headers.headers
|
||||
assert b''.join(gen).decode('utf-8') == exp
|
||||
|
||||
def test_rewrite_html_utf_8_anchor(self):
|
||||
headers = {'Content-Type': 'text/html; charset=utf-8'}
|
||||
content = u'<html><body><a href="#éxample-tésté"></a></body></html>'
|
||||
|
||||
headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701mp_')
|
||||
|
||||
exp = u'<html><body><a href="#éxample-tésté"></a></body></html>'
|
||||
assert is_rw
|
||||
assert ('Content-Type', 'text/html; charset=utf-8') in headers.headers
|
||||
assert b''.join(gen).decode('utf-8') == exp
|
||||
|
||||
def test_rewrite_html_other_encoding(self):
|
||||
headers = {'Content-Type': 'text/html; charset=latin-1'}
|
||||
content = b'<html><body><a href="http://\xe9xample.com/t\xe9st\xe9"></a></body></html>'
|
||||
|
||||
headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701mp_')
|
||||
|
||||
exp = '<html><body><a href="http://localhost:8080/prefix/201701/http://%C3%A9xample.com/t%C3%A9st%C3%A9"></a></body></html>'
|
||||
assert is_rw
|
||||
assert ('Content-Type', 'text/html; charset=latin-1') in headers.headers
|
||||
assert b''.join(gen).decode('latin-1') == exp
|
||||
|
||||
def test_rewrite_html_other_encoding_anchor(self):
|
||||
headers = {'Content-Type': 'text/html; charset=latin-1'}
|
||||
content = b'<html><body><a href="#\xe9xample-t\xe9st\xe9"></a></body></html>'
|
||||
|
||||
headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701mp_')
|
||||
|
||||
exp = u'<html><body><a href="#éxample-tésté"></a></body></html>'
|
||||
assert is_rw
|
||||
assert ('Content-Type', 'text/html; charset=latin-1') in headers.headers
|
||||
assert b''.join(gen).decode('latin-1') == exp
|
||||
|
||||
def test_rewrite_html_js_mod(self, headers):
|
||||
content = '<html><body><a href="http://example.com/"></a></body></html>'
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user