From d3e66b581a29ebedcb595d5bbabac35a147ca19f Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 6 Sep 2018 14:09:30 -0700 Subject: [PATCH] encoding fix: additional fix to #376 for banner encoding: (#377) - if no encoding is detected, don't default to utf-8 - if no encoding known, encode banner as 'ascii' with 'xmlcharrefreplace', converting to xml entities - tests: add tests for rewriting with no known encoding --- pywb/rewrite/content_rewriter.py | 4 ++-- pywb/rewrite/test/test_content_rewriter.py | 18 +++++++++++++----- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/pywb/rewrite/content_rewriter.py b/pywb/rewrite/content_rewriter.py index 18805772..b6eb8f43 100644 --- a/pywb/rewrite/content_rewriter.py +++ b/pywb/rewrite/content_rewriter.py @@ -153,9 +153,9 @@ class BaseContentRewriter(object): except: pass + # no charset detected, encode banner as ascii html entities if not head_insert_str: - rwinfo.charset = 'utf-8' - head_insert_str = head_insert_orig.encode(rwinfo.charset) + head_insert_str = head_insert_orig.encode('ascii', 'xmlcharrefreplace') head_insert_str = head_insert_str.decode('iso-8859-1') diff --git a/pywb/rewrite/test/test_content_rewriter.py b/pywb/rewrite/test/test_content_rewriter.py index 6d100a26..b89c3959 100644 --- a/pywb/rewrite/test/test_content_rewriter.py +++ b/pywb/rewrite/test/test_content_rewriter.py @@ -74,10 +74,18 @@ class TestContentRewriter(object): cdx['is_fuzzy'] = '1' cdx['is_live'] = is_live + def insert_func(rule, cdx): + return '' + if use_js_proxy: - return self.js_proxy_content_rewriter(record, url_rewriter, None, cdx=cdx, environ=environ) + rewriter = self.js_proxy_content_rewriter else: - return self.content_rewriter(record, url_rewriter, None, cdx=cdx, environ=environ) + rewriter = self.content_rewriter + + return rewriter(record, url_rewriter, cookie_rewriter=None, + head_insert_func=insert_func, + cdx=cdx, + environ=environ) def test_rewrite_html(self, headers): content = '' @@ -154,15 +162,15 @@ class TestContentRewriter(object): assert ('Content-Type', 'text/html; charset=latin-1') in headers.headers assert b''.join(gen).decode('latin-1') == exp - def test_rewrite_html_other_encoding_anchor(self): - headers = {'Content-Type': 'text/html; charset=latin-1'} + def test_rewrite_html_no_encoding_anchor(self): + headers = {'Content-Type': 'text/html'} content = b'' headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701mp_') exp = u'' assert is_rw - assert ('Content-Type', 'text/html; charset=latin-1') in headers.headers + assert ('Content-Type', 'text/html') in headers.headers assert b''.join(gen).decode('latin-1') == exp def test_rewrite_html_js_mod(self, headers):