rewrite: optimization / sanity check: only %-encode URLs that are actually IDNA (punycode) encoded,

otherwise return the URL as is, #66
2025-03-15 00:03:28 +01:00 · 2015-02-15 10:34:56 -08:00 · 2015-02-15 10:34:56 -08:00 · c4d5dd4690
commit c4d5dd4690
parent afe49a91f4
3 changed files with 16 additions and 7 deletions
--- a/pywb/rewrite/test/test_html_rewriter.py
+++ b/pywb/rewrite/test/test_html_rewriter.py
@ -53,7 +53,9 @@ ur"""

 # Unicode -- default with %-encoding
 >>> parse(u'<a href="http://испытание.испытание/">испытание</a>')
-<a href="/web/20131226101010/http://%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5/">испытание</a>
+<a href="/web/20131226101010/http://испытание.испытание/">испытание</a>
+
+#<a href="/web/20131226101010/http://%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5/">испытание</a>

 >>> parse(u'<a href="http://испытание.испытание/">испытание</a>', urlrewriter=urlrewriter_pencode)
 <a href="/web/20131226101010/http://испытание.испытание/">испытание</a>
@ -171,6 +173,7 @@ from pywb.rewrite.url_rewriter import UrlRewriter
 from pywb.rewrite.html_rewriter import HTMLRewriter

 import pprint
+import urllib

 urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html',
                          '/web/',
@ -187,9 +190,12 @@ no_base_canon_rewriter = UrlRewriter('20131226101010/http://example.com/some/pat
                                                       rewrite_base=False))

 def parse(data, head_insert=None, urlrewriter=urlrewriter):
-    data = data.encode('utf-8')
    parser = HTMLRewriter(urlrewriter, head_insert = head_insert)
-    #data = data.decode('utf-8')
+
+    if isinstance(data, unicode):
+        data = data.encode('utf-8')
+        #data = urllib.quote(data, ':" =/-\\<>')
+
    result = parser.rewrite(data) + parser.close()
    # decode only for printing
    print result.decode('utf-8')
--- a/pywb/rewrite/test/test_wburl.py
+++ b/pywb/rewrite/test/test_wburl.py
@ -88,6 +88,9 @@ https://%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B
 >>> print(to_uri_pencode('http://' + quote_plus(u'пример.испытание'.encode('utf-8'))[1:]))
 http://d0%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5

+# invalid punycode label -- url is returned unchanged
+>>> print(to_uri_pencode('http://xn--abcd'))
+http://xn--abcd

 # IRI representation
 >>> repr(WbUrl(u'http://пример.испытание'))
--- a/pywb/rewrite/wburl.py
+++ b/pywb/rewrite/wburl.py
@ -97,6 +97,10 @@ class WbUrl(BaseWbUrl):
        The rest of url should be unchanged
        """

+        # only continue if punycode encoded
+        if 'xn--' not in url:
+            return url
+
        parts = urlparse.urlsplit(url)
        domain = parts.netloc
        try:
@ -108,10 +112,6 @@ class WbUrl(BaseWbUrl):

        domain = urllib.quote(domain)#, safe=r':\/')

-        # no changes
-        if parts.netloc == domain:
-            return url
-
        return urlparse.urlunsplit((parts[0], domain, parts[2], parts[3], parts[4]))