1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

rewrite: fix rewriting encoding -- for best rewriting, keep strategy of encoding

insert to match page, then use latin-1 for rewriting. Support for non-ASCII-based
encodings is still needed.
This commit is contained in:
Ilya Kreymer 2016-02-23 18:07:34 -08:00
parent 3a584a1ec3
commit cebd6b6239

View File

@ -175,11 +175,13 @@ class RewriteContent:
charset = 'utf-8' charset = 'utf-8'
head_insert_str = head_insert_orig.encode(charset) head_insert_str = head_insert_orig.encode(charset)
head_insert_str = to_native_str(head_insert_str, 'utf-8') head_insert_buf = head_insert_str
#head_insert_str = to_native_str(head_insert_str)
head_insert_str = head_insert_str.decode('iso-8859-1')
if wb_url.is_banner_only: if wb_url.is_banner_only:
gen = self._head_insert_only_gen(head_insert_str, gen = self._head_insert_only_gen(head_insert_buf,
stream, stream,
first_buff) first_buff)
@ -241,7 +243,8 @@ class RewriteContent:
m = RewriteContent.CHARSET_REGEX.search(buff) m = RewriteContent.CHARSET_REGEX.search(buff)
if m: if m:
charset = m.group(1) charset = m.group(1)
content_type = 'text/html; charset=' + to_native_str(charset, 'utf-8') charset = to_native_str(charset)
content_type = 'text/html; charset=' + charset
status_headers.replace_header('content-type', content_type) status_headers.replace_header('content-type', content_type)
return charset return charset
@ -279,10 +282,10 @@ class RewriteContent:
if matcher: if matcher:
yield buff[:matcher.end()] yield buff[:matcher.end()]
yield insert_str.encode('utf-8') yield insert_str
yield buff[matcher.end():] yield buff[matcher.end():]
else: else:
yield insert_str.encode('utf-8') yield insert_str
yield buff yield buff
for buff in self.stream_to_gen(stream): for buff in self.stream_to_gen(stream):
@ -336,8 +339,8 @@ class RewriteContent:
while True: while True:
if buff: if buff:
buff = rewrite_func(to_native_str(buff, 'utf-8')) buff = rewrite_func(buff.decode('iso-8859-1'))
yield buff.encode('utf-8') yield buff.encode('iso-8859-1')
buff = stream.read(RewriteContent.BUFF_SIZE) buff = stream.read(RewriteContent.BUFF_SIZE)
# on 2.6, readline() (but not read()) throws an exception # on 2.6, readline() (but not read()) throws an exception
@ -352,7 +355,7 @@ class RewriteContent:
# For adding a tail/handling final buffer # For adding a tail/handling final buffer
buff = final_read_func() buff = final_read_func()
if buff: if buff:
yield buff.encode('utf-8') yield buff.encode('iso-8859-1')
finally: finally:
stream.close() stream.close()