mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
- if no encoding is detected, don't default to utf-8
- if no encoding is known, encode the banner as 'ascii' with 'xmlcharrefreplace', converting non-ASCII characters to XML entities
- tests: add tests for rewriting with no known encoding
This commit is contained in:
parent
cabb488f4e
commit
d3e66b581a
@ -153,9 +153,9 @@ class BaseContentRewriter(object):
|
|||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
# no charset detected, encode banner as ascii html entities
|
||||||
if not head_insert_str:
|
if not head_insert_str:
|
||||||
rwinfo.charset = 'utf-8'
|
head_insert_str = head_insert_orig.encode('ascii', 'xmlcharrefreplace')
|
||||||
head_insert_str = head_insert_orig.encode(rwinfo.charset)
|
|
||||||
|
|
||||||
head_insert_str = head_insert_str.decode('iso-8859-1')
|
head_insert_str = head_insert_str.decode('iso-8859-1')
|
||||||
|
|
||||||
|
@ -74,10 +74,18 @@ class TestContentRewriter(object):
|
|||||||
cdx['is_fuzzy'] = '1'
|
cdx['is_fuzzy'] = '1'
|
||||||
cdx['is_live'] = is_live
|
cdx['is_live'] = is_live
|
||||||
|
|
||||||
|
def insert_func(rule, cdx):
|
||||||
|
return ''
|
||||||
|
|
||||||
if use_js_proxy:
|
if use_js_proxy:
|
||||||
return self.js_proxy_content_rewriter(record, url_rewriter, None, cdx=cdx, environ=environ)
|
rewriter = self.js_proxy_content_rewriter
|
||||||
else:
|
else:
|
||||||
return self.content_rewriter(record, url_rewriter, None, cdx=cdx, environ=environ)
|
rewriter = self.content_rewriter
|
||||||
|
|
||||||
|
return rewriter(record, url_rewriter, cookie_rewriter=None,
|
||||||
|
head_insert_func=insert_func,
|
||||||
|
cdx=cdx,
|
||||||
|
environ=environ)
|
||||||
|
|
||||||
def test_rewrite_html(self, headers):
|
def test_rewrite_html(self, headers):
|
||||||
content = '<html><body><a href="http://example.com/"></a></body></html>'
|
content = '<html><body><a href="http://example.com/"></a></body></html>'
|
||||||
@ -154,15 +162,15 @@ class TestContentRewriter(object):
|
|||||||
assert ('Content-Type', 'text/html; charset=latin-1') in headers.headers
|
assert ('Content-Type', 'text/html; charset=latin-1') in headers.headers
|
||||||
assert b''.join(gen).decode('latin-1') == exp
|
assert b''.join(gen).decode('latin-1') == exp
|
||||||
|
|
||||||
def test_rewrite_html_other_encoding_anchor(self):
|
def test_rewrite_html_no_encoding_anchor(self):
|
||||||
headers = {'Content-Type': 'text/html; charset=latin-1'}
|
headers = {'Content-Type': 'text/html'}
|
||||||
content = b'<html><body><a href="#\xe9xample-t\xe9st\xe9"></a></body></html>'
|
content = b'<html><body><a href="#\xe9xample-t\xe9st\xe9"></a></body></html>'
|
||||||
|
|
||||||
headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701mp_')
|
headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701mp_')
|
||||||
|
|
||||||
exp = u'<html><body><a href="#éxample-tésté"></a></body></html>'
|
exp = u'<html><body><a href="#éxample-tésté"></a></body></html>'
|
||||||
assert is_rw
|
assert is_rw
|
||||||
assert ('Content-Type', 'text/html; charset=latin-1') in headers.headers
|
assert ('Content-Type', 'text/html') in headers.headers
|
||||||
assert b''.join(gen).decode('latin-1') == exp
|
assert b''.join(gen).decode('latin-1') == exp
|
||||||
|
|
||||||
def test_rewrite_html_js_mod(self, headers):
|
def test_rewrite_html_js_mod(self, headers):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user