1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

encoding fix: additional fix to #376 for banner encoding: (#377)

- if no encoding is detected, don't default to utf-8
- if no encoding is known, encode the banner as 'ascii' with 'xmlcharrefreplace', converting non-ASCII characters to XML numeric character references
- tests: add tests for rewriting with no known encoding
This commit is contained in:
Ilya Kreymer 2018-09-06 14:09:30 -07:00 committed by John Berlin
parent cabb488f4e
commit d3e66b581a
2 changed files with 15 additions and 7 deletions

View File

@ -153,9 +153,9 @@ class BaseContentRewriter(object):
except:
pass
# no charset detected, encode banner as ascii html entities
if not head_insert_str:
rwinfo.charset = 'utf-8'
head_insert_str = head_insert_orig.encode(rwinfo.charset)
head_insert_str = head_insert_orig.encode('ascii', 'xmlcharrefreplace')
head_insert_str = head_insert_str.decode('iso-8859-1')

View File

@ -74,10 +74,18 @@ class TestContentRewriter(object):
cdx['is_fuzzy'] = '1'
cdx['is_live'] = is_live
def insert_func(rule, cdx):
return ''
if use_js_proxy:
return self.js_proxy_content_rewriter(record, url_rewriter, None, cdx=cdx, environ=environ)
rewriter = self.js_proxy_content_rewriter
else:
return self.content_rewriter(record, url_rewriter, None, cdx=cdx, environ=environ)
rewriter = self.content_rewriter
return rewriter(record, url_rewriter, cookie_rewriter=None,
head_insert_func=insert_func,
cdx=cdx,
environ=environ)
def test_rewrite_html(self, headers):
content = '<html><body><a href="http://example.com/"></a></body></html>'
@ -154,15 +162,15 @@ class TestContentRewriter(object):
assert ('Content-Type', 'text/html; charset=latin-1') in headers.headers
assert b''.join(gen).decode('latin-1') == exp
def test_rewrite_html_other_encoding_anchor(self):
headers = {'Content-Type': 'text/html; charset=latin-1'}
def test_rewrite_html_no_encoding_anchor(self):
headers = {'Content-Type': 'text/html'}
content = b'<html><body><a href="#\xe9xample-t\xe9st\xe9"></a></body></html>'
headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701mp_')
exp = u'<html><body><a href="#éxample-tésté"></a></body></html>'
assert is_rw
assert ('Content-Type', 'text/html; charset=latin-1') in headers.headers
assert ('Content-Type', 'text/html') in headers.headers
assert b''.join(gen).decode('latin-1') == exp
def test_rewrite_html_js_mod(self, headers):