mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
rewrite: fix rewriting encoding -- for best results, keep the strategy of encoding the head
insert to match the page's charset, then use latin-1 (iso-8859-1) for rewriting. Support for non-ASCII-based encodings is still needed.
This commit is contained in:
parent
3a584a1ec3
commit
cebd6b6239
@ -175,11 +175,13 @@ class RewriteContent:
|
|||||||
charset = 'utf-8'
|
charset = 'utf-8'
|
||||||
head_insert_str = head_insert_orig.encode(charset)
|
head_insert_str = head_insert_orig.encode(charset)
|
||||||
|
|
||||||
head_insert_str = to_native_str(head_insert_str, 'utf-8')
|
head_insert_buf = head_insert_str
|
||||||
|
#head_insert_str = to_native_str(head_insert_str)
|
||||||
|
head_insert_str = head_insert_str.decode('iso-8859-1')
|
||||||
|
|
||||||
|
|
||||||
if wb_url.is_banner_only:
|
if wb_url.is_banner_only:
|
||||||
gen = self._head_insert_only_gen(head_insert_str,
|
gen = self._head_insert_only_gen(head_insert_buf,
|
||||||
stream,
|
stream,
|
||||||
first_buff)
|
first_buff)
|
||||||
|
|
||||||
@ -241,7 +243,8 @@ class RewriteContent:
|
|||||||
m = RewriteContent.CHARSET_REGEX.search(buff)
|
m = RewriteContent.CHARSET_REGEX.search(buff)
|
||||||
if m:
|
if m:
|
||||||
charset = m.group(1)
|
charset = m.group(1)
|
||||||
content_type = 'text/html; charset=' + to_native_str(charset, 'utf-8')
|
charset = to_native_str(charset)
|
||||||
|
content_type = 'text/html; charset=' + charset
|
||||||
status_headers.replace_header('content-type', content_type)
|
status_headers.replace_header('content-type', content_type)
|
||||||
return charset
|
return charset
|
||||||
|
|
||||||
@ -279,10 +282,10 @@ class RewriteContent:
|
|||||||
|
|
||||||
if matcher:
|
if matcher:
|
||||||
yield buff[:matcher.end()]
|
yield buff[:matcher.end()]
|
||||||
yield insert_str.encode('utf-8')
|
yield insert_str
|
||||||
yield buff[matcher.end():]
|
yield buff[matcher.end():]
|
||||||
else:
|
else:
|
||||||
yield insert_str.encode('utf-8')
|
yield insert_str
|
||||||
yield buff
|
yield buff
|
||||||
|
|
||||||
for buff in self.stream_to_gen(stream):
|
for buff in self.stream_to_gen(stream):
|
||||||
@ -336,8 +339,8 @@ class RewriteContent:
|
|||||||
|
|
||||||
while True:
|
while True:
|
||||||
if buff:
|
if buff:
|
||||||
buff = rewrite_func(to_native_str(buff, 'utf-8'))
|
buff = rewrite_func(buff.decode('iso-8859-1'))
|
||||||
yield buff.encode('utf-8')
|
yield buff.encode('iso-8859-1')
|
||||||
|
|
||||||
buff = stream.read(RewriteContent.BUFF_SIZE)
|
buff = stream.read(RewriteContent.BUFF_SIZE)
|
||||||
# on 2.6, readline() (but not read()) throws an exception
|
# on 2.6, readline() (but not read()) throws an exception
|
||||||
@ -352,7 +355,7 @@ class RewriteContent:
|
|||||||
# For adding a tail/handling final buffer
|
# For adding a tail/handling final buffer
|
||||||
buff = final_read_func()
|
buff = final_read_func()
|
||||||
if buff:
|
if buff:
|
||||||
yield buff.encode('utf-8')
|
yield buff.encode('iso-8859-1')
|
||||||
|
|
||||||
finally:
|
finally:
|
||||||
stream.close()
|
stream.close()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user