diff --git a/pywb/rewrite/test/test_html_rewriter.py b/pywb/rewrite/test/test_html_rewriter.py index 277753f4..3938efda 100644 --- a/pywb/rewrite/test/test_html_rewriter.py +++ b/pywb/rewrite/test/test_html_rewriter.py @@ -53,7 +53,9 @@ ur""" # Unicode -- default with %-encoding >>> parse(u'испытание') -испытание +испытание + +#испытание >>> parse(u'испытание', urlrewriter=urlrewriter_pencode) испытание @@ -171,6 +173,7 @@ from pywb.rewrite.url_rewriter import UrlRewriter from pywb.rewrite.html_rewriter import HTMLRewriter import pprint +import urllib urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/web/', @@ -187,9 +190,12 @@ no_base_canon_rewriter = UrlRewriter('20131226101010/http://example.com/some/pat rewrite_base=False)) def parse(data, head_insert=None, urlrewriter=urlrewriter): - data = data.encode('utf-8') parser = HTMLRewriter(urlrewriter, head_insert = head_insert) - #data = data.decode('utf-8') + + if isinstance(data, unicode): + data = data.encode('utf-8') + #data = urllib.quote(data, ':" =/-\\<>') + result = parser.rewrite(data) + parser.close() # decode only for printing print result.decode('utf-8') diff --git a/pywb/rewrite/test/test_wburl.py b/pywb/rewrite/test/test_wburl.py index c6ac76ef..077bba6d 100644 --- a/pywb/rewrite/test/test_wburl.py +++ b/pywb/rewrite/test/test_wburl.py @@ -88,6 +88,9 @@ https://%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B >>> print(to_uri_pencode('http://' + quote_plus(u'пример.испытание'.encode('utf-8'))[1:])) http://d0%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5 +# invalid +>>> print(to_uri_pencode('http://xn--abcd')) +http://xn--abcd # IRI representation >>> repr(WbUrl(u'http://пример.испытание')) diff --git a/pywb/rewrite/wburl.py b/pywb/rewrite/wburl.py index dac119f5..e43bab37 100644 --- a/pywb/rewrite/wburl.py +++ b/pywb/rewrite/wburl.py @@ -97,6 +97,10 @@ class WbUrl(BaseWbUrl): The rest of url should be unchanged """ + # only continue if punycode encoded + if 'xn--' not in url: + return url + parts = urlparse.urlsplit(url) domain = parts.netloc try: @@ -108,10 +112,6 @@ class WbUrl(BaseWbUrl): domain = urllib.quote(domain)#, safe=r':\/') - # no changes - if parts.netloc == domain: - return url - return urlparse.urlunsplit((parts[0], domain, parts[2], parts[3], parts[4]))