mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
- if no encoding is detected, don't default to UTF-8
- if no encoding known, encode banner as 'ascii' with 'xmlcharrefreplace', converting to XML entities
- tests: add tests for rewriting with no known encoding
This commit is contained in:
parent
cabb488f4e
commit
d3e66b581a
@ -153,9 +153,9 @@ class BaseContentRewriter(object):
|
||||
except:
|
||||
pass
|
||||
|
||||
# no charset detected, encode banner as ascii html entities
|
||||
if not head_insert_str:
|
||||
rwinfo.charset = 'utf-8'
|
||||
head_insert_str = head_insert_orig.encode(rwinfo.charset)
|
||||
head_insert_str = head_insert_orig.encode('ascii', 'xmlcharrefreplace')
|
||||
|
||||
head_insert_str = head_insert_str.decode('iso-8859-1')
|
||||
|
||||
|
@ -74,10 +74,18 @@ class TestContentRewriter(object):
|
||||
cdx['is_fuzzy'] = '1'
|
||||
cdx['is_live'] = is_live
|
||||
|
||||
def insert_func(rule, cdx):
|
||||
return ''
|
||||
|
||||
if use_js_proxy:
|
||||
return self.js_proxy_content_rewriter(record, url_rewriter, None, cdx=cdx, environ=environ)
|
||||
rewriter = self.js_proxy_content_rewriter
|
||||
else:
|
||||
return self.content_rewriter(record, url_rewriter, None, cdx=cdx, environ=environ)
|
||||
rewriter = self.content_rewriter
|
||||
|
||||
return rewriter(record, url_rewriter, cookie_rewriter=None,
|
||||
head_insert_func=insert_func,
|
||||
cdx=cdx,
|
||||
environ=environ)
|
||||
|
||||
def test_rewrite_html(self, headers):
|
||||
content = '<html><body><a href="http://example.com/"></a></body></html>'
|
||||
@ -154,15 +162,15 @@ class TestContentRewriter(object):
|
||||
assert ('Content-Type', 'text/html; charset=latin-1') in headers.headers
|
||||
assert b''.join(gen).decode('latin-1') == exp
|
||||
|
||||
def test_rewrite_html_other_encoding_anchor(self):
|
||||
headers = {'Content-Type': 'text/html; charset=latin-1'}
|
||||
def test_rewrite_html_no_encoding_anchor(self):
|
||||
headers = {'Content-Type': 'text/html'}
|
||||
content = b'<html><body><a href="#\xe9xample-t\xe9st\xe9"></a></body></html>'
|
||||
|
||||
headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701mp_')
|
||||
|
||||
exp = u'<html><body><a href="#éxample-tésté"></a></body></html>'
|
||||
assert is_rw
|
||||
assert ('Content-Type', 'text/html; charset=latin-1') in headers.headers
|
||||
assert ('Content-Type', 'text/html') in headers.headers
|
||||
assert b''.join(gen).decode('latin-1') == exp
|
||||
|
||||
def test_rewrite_html_js_mod(self, headers):
|
||||
|
Loading…
x
Reference in New Issue
Block a user