From 5476d75294414015cd5e61ce746b84cf095fccc0 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 6 Aug 2018 22:42:24 -0700 Subject: [PATCH] htmlrewriter: if urls contain non-ascii chars, ensure the url is reencoded with expected charset, using same charset as for banner insert (#361) (instead of default iso-8859-1) before %-encoding and rewriting tests: add test to ensure correct %-encoding of utf-8 urls --- pywb/rewrite/content_rewriter.py | 16 ++++++++-------- pywb/rewrite/html_rewriter.py | 13 ++++++++++++- pywb/rewrite/test/test_content_rewriter.py | 14 ++++++++++++++ 3 files changed, 34 insertions(+), 9 deletions(-) diff --git a/pywb/rewrite/content_rewriter.py b/pywb/rewrite/content_rewriter.py index b0bbdda4..ebbdf4ea 100644 --- a/pywb/rewrite/content_rewriter.py +++ b/pywb/rewrite/content_rewriter.py @@ -130,31 +130,31 @@ class BaseContentRewriter(object): head_insert=head_insert_str, url=cdx['url'], defmod=self.replay_mod, - parse_comments=rule.get('parse_comments', False)) + parse_comments=rule.get('parse_comments', False), + charset=rwinfo.charset) return rw def get_head_insert(self, rwinfo, rule, head_insert_func, cdx): head_insert_str = '' - charset = rwinfo.charset # if no charset set, attempt to extract from first 1024 - if not charset: + if not rwinfo.charset: first_buff = rwinfo.read_and_keep(1024) - charset = self.extract_html_charset(first_buff) + rwinfo.charset = self.extract_html_charset(first_buff) if head_insert_func: head_insert_orig = head_insert_func(rule, cdx) - if charset: + if rwinfo.charset: try: - head_insert_str = webencodings.encode(head_insert_orig, charset) + head_insert_str = webencodings.encode(head_insert_orig, rwinfo.charset) except: pass if not head_insert_str: - charset = 'utf-8' - head_insert_str = head_insert_orig.encode(charset) + rwinfo.charset = 'utf-8' + head_insert_str = head_insert_orig.encode(rwinfo.charset) head_insert_str = head_insert_str.decode('iso-8859-1') diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py index a77e02b1..792fd716 100644 --- a/pywb/rewrite/html_rewriter.py +++ b/pywb/rewrite/html_rewriter.py @@ -115,9 +115,11 @@ class HTMLRewriterMixin(StreamingRewriter): css_rewriter_class=None, url = '', defmod='', - parse_comments=False): + parse_comments=False, + charset='utf-8'): super(HTMLRewriterMixin, self).__init__(url_rewriter, False) + self.charset = charset self._wb_parse_context = None if js_rewriter: @@ -229,6 +231,15 @@ class HTMLRewriterMixin(StreamingRewriter): if not value: return '' + # if url is not ascii, ensure its reencoded in expected charset + try: + value.encode('ascii') + except: + try: + value = value.encode('iso-8859-1').decode(self.charset) + except: + pass + unesc_value = self.try_unescape(value) rewritten_value = self.url_rewriter.rewrite(unesc_value, mod, force_abs) diff --git a/pywb/rewrite/test/test_content_rewriter.py b/pywb/rewrite/test/test_content_rewriter.py index 6e09dfb1..e43051cf 100644 --- a/pywb/rewrite/test/test_content_rewriter.py +++ b/pywb/rewrite/test/test_content_rewriter.py @@ -1,3 +1,6 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + from warcio.warcwriter import BufferWARCWriter, GzippingWrapper from warcio.statusandheaders import StatusAndHeaders @@ -105,6 +108,17 @@ class TestContentRewriter(object): assert ('Content-Type', 'text/html; charset=UTF-8') in headers.headers assert b''.join(gen).decode('utf-8') == exp + def test_rewrite_html_utf_8(self): + headers = {'Content-Type': 'text/html; charset=utf-8'} + content = u'' + + headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701mp_') + + exp = '' + assert is_rw + assert ('Content-Type', 'text/html; charset=utf-8') in headers.headers + assert b''.join(gen).decode('utf-8') == exp + def test_rewrite_html_js_mod(self, headers): content = ''