diff --git a/pywb/rewrite/test/test_html_rewriter.py b/pywb/rewrite/test/test_html_rewriter.py
index 277753f4..3938efda 100644
--- a/pywb/rewrite/test/test_html_rewriter.py
+++ b/pywb/rewrite/test/test_html_rewriter.py
@@ -53,7 +53,9 @@ ur"""
# Unicode -- default with %-encoding
>>> parse(u'испытание')
-испытание
+испытание
+
+#испытание
>>> parse(u'испытание', urlrewriter=urlrewriter_pencode)
испытание
@@ -171,6 +173,7 @@ from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.rewrite.html_rewriter import HTMLRewriter
import pprint
+import urllib
urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html',
'/web/',
@@ -187,9 +190,12 @@ no_base_canon_rewriter = UrlRewriter('20131226101010/http://example.com/some/pat
rewrite_base=False))
def parse(data, head_insert=None, urlrewriter=urlrewriter):
- data = data.encode('utf-8')
parser = HTMLRewriter(urlrewriter, head_insert = head_insert)
- #data = data.decode('utf-8')
+
+ if isinstance(data, unicode):
+ data = data.encode('utf-8')
+ #data = urllib.quote(data, ':" =/-\\<>')
+
result = parser.rewrite(data) + parser.close()
# decode only for printing
print result.decode('utf-8')
diff --git a/pywb/rewrite/test/test_wburl.py b/pywb/rewrite/test/test_wburl.py
index c6ac76ef..077bba6d 100644
--- a/pywb/rewrite/test/test_wburl.py
+++ b/pywb/rewrite/test/test_wburl.py
@@ -88,6 +88,9 @@ https://%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B
>>> print(to_uri_pencode('http://' + quote_plus(u'пример.испытание'.encode('utf-8'))[1:]))
http://d0%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5
+# invalid punycode host -- url is returned unchanged
+>>> print(to_uri_pencode('http://xn--abcd'))
+http://xn--abcd
# IRI representation
>>> repr(WbUrl(u'http://пример.испытание'))
diff --git a/pywb/rewrite/wburl.py b/pywb/rewrite/wburl.py
index dac119f5..e43bab37 100644
--- a/pywb/rewrite/wburl.py
+++ b/pywb/rewrite/wburl.py
@@ -97,6 +97,10 @@ class WbUrl(BaseWbUrl):
The rest of url should be unchanged
"""
+ # fast path: no 'xn--' (punycode prefix) anywhere in the url, so return it unchanged
+ if 'xn--' not in url:
+ return url
+
parts = urlparse.urlsplit(url)
domain = parts.netloc
try:
@@ -108,10 +112,6 @@ class WbUrl(BaseWbUrl):
domain = urllib.quote(domain)#, safe=r':\/')
- # no changes
- if parts.netloc == domain:
- return url
-
return urlparse.urlunsplit((parts[0], domain, parts[2], parts[3], parts[4]))