1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

htmlrewriter: if URLs contain non-ASCII chars, ensure each URL is re-encoded with the expected charset, using the same charset as for the banner insert (#361)

(instead of the default iso-8859-1) before %-encoding and rewriting
tests: add test to ensure correct %-encoding of utf-8 urls
This commit is contained in:
Ilya Kreymer 2018-08-06 22:42:24 -07:00 committed by GitHub
parent 1156032e0e
commit 5476d75294
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 34 additions and 9 deletions

View File

@ -130,31 +130,31 @@ class BaseContentRewriter(object):
head_insert=head_insert_str, head_insert=head_insert_str,
url=cdx['url'], url=cdx['url'],
defmod=self.replay_mod, defmod=self.replay_mod,
parse_comments=rule.get('parse_comments', False)) parse_comments=rule.get('parse_comments', False),
charset=rwinfo.charset)
return rw return rw
def get_head_insert(self, rwinfo, rule, head_insert_func, cdx): def get_head_insert(self, rwinfo, rule, head_insert_func, cdx):
head_insert_str = '' head_insert_str = ''
charset = rwinfo.charset
# if no charset set, attempt to extract from first 1024 # if no charset set, attempt to extract from first 1024
if not charset: if not rwinfo.charset:
first_buff = rwinfo.read_and_keep(1024) first_buff = rwinfo.read_and_keep(1024)
charset = self.extract_html_charset(first_buff) rwinfo.charset = self.extract_html_charset(first_buff)
if head_insert_func: if head_insert_func:
head_insert_orig = head_insert_func(rule, cdx) head_insert_orig = head_insert_func(rule, cdx)
if charset: if rwinfo.charset:
try: try:
head_insert_str = webencodings.encode(head_insert_orig, charset) head_insert_str = webencodings.encode(head_insert_orig, rwinfo.charset)
except: except:
pass pass
if not head_insert_str: if not head_insert_str:
charset = 'utf-8' rwinfo.charset = 'utf-8'
head_insert_str = head_insert_orig.encode(charset) head_insert_str = head_insert_orig.encode(rwinfo.charset)
head_insert_str = head_insert_str.decode('iso-8859-1') head_insert_str = head_insert_str.decode('iso-8859-1')

View File

@ -115,9 +115,11 @@ class HTMLRewriterMixin(StreamingRewriter):
css_rewriter_class=None, css_rewriter_class=None,
url = '', url = '',
defmod='', defmod='',
parse_comments=False): parse_comments=False,
charset='utf-8'):
super(HTMLRewriterMixin, self).__init__(url_rewriter, False) super(HTMLRewriterMixin, self).__init__(url_rewriter, False)
self.charset = charset
self._wb_parse_context = None self._wb_parse_context = None
if js_rewriter: if js_rewriter:
@ -229,6 +231,15 @@ class HTMLRewriterMixin(StreamingRewriter):
if not value: if not value:
return '' return ''
# if url is not ascii, ensure it's re-encoded in the expected charset
try:
value.encode('ascii')
except:
try:
value = value.encode('iso-8859-1').decode(self.charset)
except:
pass
unesc_value = self.try_unescape(value) unesc_value = self.try_unescape(value)
rewritten_value = self.url_rewriter.rewrite(unesc_value, mod, force_abs) rewritten_value = self.url_rewriter.rewrite(unesc_value, mod, force_abs)

View File

@ -1,3 +1,6 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from warcio.warcwriter import BufferWARCWriter, GzippingWrapper from warcio.warcwriter import BufferWARCWriter, GzippingWrapper
from warcio.statusandheaders import StatusAndHeaders from warcio.statusandheaders import StatusAndHeaders
@ -105,6 +108,17 @@ class TestContentRewriter(object):
assert ('Content-Type', 'text/html; charset=UTF-8') in headers.headers assert ('Content-Type', 'text/html; charset=UTF-8') in headers.headers
assert b''.join(gen).decode('utf-8') == exp assert b''.join(gen).decode('utf-8') == exp
def test_rewrite_html_utf_8(self):
# Checks that non-ASCII characters in an href of a utf-8 HTML page are
# %-encoded as utf-8 bytes when the URL is rewritten.
# The response declares charset=utf-8 in its Content-Type header.
headers = {'Content-Type': 'text/html; charset=utf-8'}
# Link whose host and path both contain the non-ASCII character 'é'.
content = u'<html><body><a href="http://éxample.com/tésté"></a></body></html>'
# rewrite_record is defined outside this view; here it returns the response
# headers, an iterable of body byte chunks, and a was-rewritten flag.
headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701mp_')
# Expected: the link is prefixed with the replay URL and every 'é' appears
# as %C3%A9 (its utf-8 byte sequence).
exp = '<html><body><a href="http://localhost:8080/prefix/201701/http://%C3%A9xample.com/t%C3%A9st%C3%A9"></a></body></html>'
assert is_rw
# The Content-Type header must be passed through unchanged.
assert ('Content-Type', 'text/html; charset=utf-8') in headers.headers
assert b''.join(gen).decode('utf-8') == exp
def test_rewrite_html_js_mod(self, headers): def test_rewrite_html_js_mod(self, headers):
content = '<html><body><a href="http://example.com/"></a></body></html>' content = '<html><body><a href="http://example.com/"></a></body></html>'