From 695245d9e89ae31862a6ceeeae916d935c2bac8e Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 26 Jan 2015 09:52:04 -0800 Subject: [PATCH] wburl idn: more complete support for idn urls (#66) add distinct to_iri() and to_uri() functions in WbUrl internal representation is always as ascii uri for rewriting, defaults to iri representation unless 'rewrite_ascii_only_urls' is set to true per collection add wbrequest.get_url() to get url as either iri or uri to be passed to templates --- pywb/framework/wbrequestresponse.py | 12 ++ pywb/rewrite/test/test_wburl.py | 163 +++++++++++++++++++--------- pywb/rewrite/url_rewriter.py | 9 +- pywb/rewrite/wburl.py | 100 +++++++++++++---- pywb/static/wb.js | 3 + pywb/ui/head_insert.html | 4 +- pywb/ui/query.html | 14 +-- pywb/webapp/handlers.py | 5 +- pywb/webapp/views.py | 16 ++- tests/test_integration.py | 2 +- 10 files changed, 235 insertions(+), 93 deletions(-) diff --git a/pywb/framework/wbrequestresponse.py b/pywb/framework/wbrequestresponse.py index 95a07e66..43680545 100644 --- a/pywb/framework/wbrequestresponse.py +++ b/pywb/framework/wbrequestresponse.py @@ -105,6 +105,18 @@ class WbRequest(object): self._parse_extra() + def get_url(self, url=None): + if not self.wb_url: + return None + + if not url: + url = self.wb_url.url + + if self.urlrewriter.rewrite_opts.get('rewrite_ascii_urls_only'): + return self.wb_url.url + else: + return self.wb_url.to_iri(url) + def _is_ajax(self): value = self.env.get('HTTP_X_REQUESTED_WITH') if value and value.lower() == 'xmlhttprequest': diff --git a/pywb/rewrite/test/test_wburl.py b/pywb/rewrite/test/test_wburl.py index 2e7475af..78bf9764 100644 --- a/pywb/rewrite/test/test_wburl.py +++ b/pywb/rewrite/test/test_wburl.py @@ -4,94 +4,138 @@ ur""" # Replay Urls # ====================== ->>> repr(WbUrl('20131010000506/example.com')) -"('replay', '20131010000506', '', 'http://example.com', '20131010000506/http://example.com')" +>>> repr_unicode(WbUrl('20131010000506/example.com')) +('replay', '20131010000506', '', 'http://example.com', '20131010000506/http://example.com') ->>> repr(WbUrl('20130102im_/https://example.com')) -"('replay', '20130102', 'im_', 'https://example.com', '20130102im_/https://example.com')" +>>> repr_unicode(WbUrl('20130102im_/https://example.com')) +('replay', '20130102', 'im_', 'https://example.com', '20130102im_/https://example.com') ->>> repr(WbUrl('20130102im_/https:/example.com')) -"('replay', '20130102', 'im_', 'https://example.com', '20130102im_/https://example.com')" +>>> repr_unicode(WbUrl('20130102im_/https:/example.com')) +('replay', '20130102', 'im_', 'https://example.com', '20130102im_/https://example.com') # Protocol agnostic convert to http ->>> repr(WbUrl('20130102im_///example.com')) -"('replay', '20130102', 'im_', 'http://example.com', '20130102im_/http://example.com')" +>>> repr_unicode(WbUrl('20130102im_///example.com')) +('replay', '20130102', 'im_', 'http://example.com', '20130102im_/http://example.com') ->>> repr(WbUrl('cs_/example.com')) -"('latest_replay', '', 'cs_', 'http://example.com', 'cs_/http://example.com')" +>>> repr_unicode(WbUrl('cs_/example.com')) +('latest_replay', '', 'cs_', 'http://example.com', 'cs_/http://example.com') ->>> repr(WbUrl('https://example.com/xyz')) -"('latest_replay', '', '', 'https://example.com/xyz', 'https://example.com/xyz')" +>>> repr_unicode(WbUrl('https://example.com/xyz')) +('latest_replay', '', '', 'https://example.com/xyz', 'https://example.com/xyz') ->>> repr(WbUrl('https:/example.com/xyz')) -"('latest_replay', '', '', 'https://example.com/xyz', 'https://example.com/xyz')" +>>> repr_unicode(WbUrl('https:/example.com/xyz')) +('latest_replay', '', '', 'https://example.com/xyz', 'https://example.com/xyz') ->>> repr(WbUrl('https://example.com/xyz?a=%2f&b=%2E')) -"('latest_replay', '', '', 'https://example.com/xyz?a=%2f&b=%2E', 'https://example.com/xyz?a=%2f&b=%2E')" +>>> repr_unicode(WbUrl('https://example.com/xyz?a=%2f&b=%2E')) +('latest_replay', '', '', 'https://example.com/xyz?a=/&b=.', 'https://example.com/xyz?a=/&b=.') # Test scheme partially encoded urls ->>> repr(WbUrl('https%3A//example.com/')) -"('latest_replay', '', '', 'https://example.com/', 'https://example.com/')" +>>> repr_unicode(WbUrl('https%3A//example.com/')) +('latest_replay', '', '', 'https://example.com/', 'https://example.com/') ->>> repr(WbUrl('2014/http%3A%2F%2Fexample.com/')) -"('replay', '2014', '', 'http://example.com/', '2014/http://example.com/')" +>>> repr_unicode(WbUrl('2014/http%3A%2F%2Fexample.com/')) +('replay', '2014', '', 'http://example.com/', '2014/http://example.com/') # Test IDNs ->>> repr(WbUrl(u'http://пример.испытание')) -"('latest_replay', '', '', 'http://xn--e1afmkfd.xn--80akhbyknj4f', 'http://xn--e1afmkfd.xn--80akhbyknj4f')" ->>> repr(WbUrl(u'https://пример.испытание/abc/')) -"('latest_replay', '', '', 'https://xn--e1afmkfd.xn--80akhbyknj4f/abc/', 'https://xn--e1afmkfd.xn--80akhbyknj4f/abc/')" +To IRI +>>> print(WbUrl.to_iri(u'https://пример.испытание')) +https://пример.испытание ->>> repr(WbUrl(u'//пример.испытание/abc/')) -"('latest_replay', '', '', 'http://xn--e1afmkfd.xn--80akhbyknj4f/abc/', 'http://xn--e1afmkfd.xn--80akhbyknj4f/abc/')" +>>> print(WbUrl.to_iri(u'пример.испытание')) +пример.испытание ->>> repr(WbUrl(u'2014id_/https://пример.испытание/abc')) -"('replay', '2014', 'id_', 'https://xn--e1afmkfd.xn--80akhbyknj4f/abc', '2014id_/https://xn--e1afmkfd.xn--80akhbyknj4f/abc')" +>>> print(WbUrl.to_iri('http://' + quote_plus(u'пример.испытание'.encode('utf-8')))) +http://пример.испытание + +>>> print(WbUrl.to_iri(u'//пример.испытание/abc/испытание')) +//пример.испытание/abc/испытание + +>>> print(WbUrl.to_iri(quote_plus(u'пример.испытание'.encode('utf-8')) + '/abc/' + quote_plus(u'пример'.encode('utf-8')))) +пример.испытание/abc/пример + +>>> print(WbUrl.to_iri('https://xn--e1afmkfd.xn--80akhbyknj4f')) +https://пример.испытание + + +To URI +>>> print(WbUrl.to_uri(u'https://пример.испытание')) +https://xn--e1afmkfd.xn--80akhbyknj4f + +>>> print(WbUrl.to_uri(u'пример.испытание')) +xn--e1afmkfd.xn--80akhbyknj4f + +>>> print(WbUrl.to_uri('http://' + quote_plus(u'пример.испытание'.encode('utf-8')))) +http://xn--e1afmkfd.xn--80akhbyknj4f + +>>> print(WbUrl.to_uri(u'//пример.испытание/abc/испытание')) +//xn--e1afmkfd.xn--80akhbyknj4f/abc%2F%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5 + +>>> print(WbUrl.to_uri('//' + quote_plus(u'пример.испытание'.encode('utf-8')) + '/abc/' + quote_plus(u'пример'.encode('utf-8')))) +//xn--e1afmkfd.xn--80akhbyknj4f/abc/%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80 + +>>> print(WbUrl.to_uri('https://xn--e1afmkfd.xn--80akhbyknj4f/abc/')) +https://xn--e1afmkfd.xn--80akhbyknj4f/abc/ + +>>> print(WbUrl.to_uri('http://' + quote_plus(u'пример.испытание'.encode('utf-8'))[1:])) +http://xn--d0-olcluwd.xn--80akhbyknj4f + +# IRI representation +>>> repr_unicode(WbUrl(u'http://пример.испытание')) +('latest_replay', '', '', 'http://пример.испытание', 'http://пример.испытание') + +>>> repr_unicode(WbUrl(u'https://пример.испытание/abc/')) +('latest_replay', '', '', 'https://пример.испытание/abc/', 'https://пример.испытание/abc/') + +>>> repr_unicode(WbUrl(u'//пример.испытание/abc/')) +('latest_replay', '', '', 'http://пример.испытание/abc/', 'http://пример.испытание/abc/') + +>>> repr_unicode(WbUrl(u'2014id_/https://пример.испытание/abc')) +('replay', '2014', 'id_', 'https://пример.испытание/abc', '2014id_/https://пример.испытание/abc') # percent-encoded form (as sent by browser usually) ->>> repr(WbUrl('2014id_/http://' + quote_plus(u'пример.испытание'.encode('utf-8')) + '/abc')) -"('replay', '2014', 'id_', 'http://xn--e1afmkfd.xn--80akhbyknj4f/abc', '2014id_/http://xn--e1afmkfd.xn--80akhbyknj4f/abc')" +>>> repr_unicode(WbUrl('2014id_/http://' + quote_plus(u'пример.испытание'.encode('utf-8')) + '/abc')) +('replay', '2014', 'id_', 'http://пример.испытание/abc', '2014id_/http://пример.испытание/abc') # percent-encoded form -- scheme relative ->>> repr(WbUrl('2014id_///' + quote_plus(u'пример.испытание'.encode('utf-8')) + '/abc')) -"('replay', '2014', 'id_', 'http://xn--e1afmkfd.xn--80akhbyknj4f/abc', '2014id_/http://xn--e1afmkfd.xn--80akhbyknj4f/abc')" +>>> repr_unicode(WbUrl('2014id_///' + quote_plus(u'пример.испытание'.encode('utf-8')) + '/abc')) +('replay', '2014', 'id_', 'http://пример.испытание/abc', '2014id_/http://пример.испытание/abc') # invalid: truncated and superfluous '%', ignore invalid (no exception) ->>> repr(WbUrl('2014id_/http://' + quote_plus(u'пример.испытание'.encode('utf-8'))[1:] + '%' + '/abc')) -"('replay', '2014', 'id_', 'http://xn--d0-olcluwd.xn--%-7sbpkb3ampk3g/abc', '2014id_/http://xn--d0-olcluwd.xn--%-7sbpkb3ampk3g/abc')" +>>> repr_unicode(WbUrl('2014id_/http://' + quote_plus(u'пример.испытание'.encode('utf-8'))[1:] + '%' + '/abc')) +('replay', '2014', 'id_', 'http://d0ример.испытание%/abc', '2014id_/http://d0ример.испытание%/abc') # Query Urls # ====================== ->>> repr(WbUrl('*/http://example.com/abc?def=a')) -"('query', '', '', 'http://example.com/abc?def=a', '*/http://example.com/abc?def=a')" +>>> repr_unicode(WbUrl('*/http://example.com/abc?def=a')) +('query', '', '', 'http://example.com/abc?def=a', '*/http://example.com/abc?def=a') ->>> repr(WbUrl('*/http://example.com/abc?def=a*')) -"('url_query', '', '', 'http://example.com/abc?def=a', '*/http://example.com/abc?def=a*')" +>>> repr_unicode(WbUrl('*/http://example.com/abc?def=a*')) +('url_query', '', '', 'http://example.com/abc?def=a', '*/http://example.com/abc?def=a*') ->>> repr(WbUrl('2010*/http://example.com/abc?def=a')) -"('query', '2010', '', 'http://example.com/abc?def=a', '2010*/http://example.com/abc?def=a')" +>>> repr_unicode(WbUrl('2010*/http://example.com/abc?def=a')) +('query', '2010', '', 'http://example.com/abc?def=a', '2010*/http://example.com/abc?def=a') # timestamp range query ->>> repr(WbUrl('2009-2015*/http://example.com/abc?def=a')) -"('query', '2009', '', 'http://example.com/abc?def=a', '2009-2015*/http://example.com/abc?def=a')" +>>> repr_unicode(WbUrl('2009-2015*/http://example.com/abc?def=a')) +('query', '2009', '', 'http://example.com/abc?def=a', '2009-2015*/http://example.com/abc?def=a') ->>> repr(WbUrl('json/*/http://example.com/abc?def=a')) -"('query', '', 'json', 'http://example.com/abc?def=a', 'json/*/http://example.com/abc?def=a')" +>>> repr_unicode(WbUrl('json/*/http://example.com/abc?def=a')) +('query', '', 'json', 'http://example.com/abc?def=a', 'json/*/http://example.com/abc?def=a') ->>> repr(WbUrl('timemap-link/2011*/http://example.com/abc?def=a')) -"('query', '2011', 'timemap-link', 'http://example.com/abc?def=a', 'timemap-link/2011*/http://example.com/abc?def=a')" +>>> repr_unicode(WbUrl('timemap-link/2011*/http://example.com/abc?def=a')) +('query', '2011', 'timemap-link', 'http://example.com/abc?def=a', 'timemap-link/2011*/http://example.com/abc?def=a') # strip off repeated, likely scheme-agnostic, slashes altogether ->>> repr(WbUrl('///example.com')) -"('latest_replay', '', '', 'http://example.com', 'http://example.com')" +>>> repr_unicode(WbUrl('///example.com')) +('latest_replay', '', '', 'http://example.com', 'http://example.com') ->>> repr(WbUrl('//example.com/')) -"('latest_replay', '', '', 'http://example.com/', 'http://example.com/')" +>>> repr_unicode(WbUrl('//example.com/')) +('latest_replay', '', '', 'http://example.com/', 'http://example.com/') ->>> repr(WbUrl('/example.com/')) -"('latest_replay', '', '', 'http://example.com/', 'http://example.com/')" +>>> repr_unicode(WbUrl('/example.com/')) +('latest_replay', '', '', 'http://example.com/', 'http://example.com/') # Is_ Tests >>> u = WbUrl('*/http://example.com/abc?def=a*') @@ -131,7 +175,20 @@ Exception: ('Invalid WbUrl: ', '') """ from pywb.rewrite.wburl import WbUrl -from urllib import quote_plus +from urllib import quote_plus, unquote_plus + +from StringIO import StringIO + + +def repr_unicode(wburl): + buff = StringIO() + buff.write("('{0}', '{1}', '{2}', '".format(wburl.type, wburl.timestamp, wburl.mod)) + buff.write(WbUrl.to_iri(wburl.url)) + buff.write("', '") + buff.write(wburl.to_str(iri=True)) + buff.write("')") + print(buff.getvalue()) + if __name__ == "__main__": import doctest diff --git a/pywb/rewrite/url_rewriter.py b/pywb/rewrite/url_rewriter.py index cae44b73..201b6016 100644 --- a/pywb/rewrite/url_rewriter.py +++ b/pywb/rewrite/url_rewriter.py @@ -52,6 +52,9 @@ class UrlRewriter(object): is_abs = True url = 'http:' + url + # always convert any unicode urls to punycode + ascii_urls_only = self.rewrite_opts.get('rewrite_ascii_urls_only', False) + # Optimized rewriter for # -rel urls that don't start with / and # do not contain ../ and no special mod @@ -68,7 +71,11 @@ class UrlRewriter(object): if mod is None: mod = wburl.mod - final_url = self.prefix + wburl.to_str(mod=mod, url=new_url) + final_url = self.prefix + wburl.to_str(mod=mod, + url=new_url, + iri=not ascii_urls_only) + if not ascii_urls_only: + final_url = final_url.encode('utf-8') return final_url diff --git a/pywb/rewrite/wburl.py b/pywb/rewrite/wburl.py index 9436dc20..859e9582 100644 --- a/pywb/rewrite/wburl.py +++ b/pywb/rewrite/wburl.py @@ -90,6 +90,79 @@ class WbUrl(BaseWbUrl): #PARTIAL_ENC_RX = re.compile('(https?%3A)?(%2F%2F)?', re.I) FIRST_PATH = re.compile('(? 1: + url = scheme_dom[0] + u'/' + dom + else: + url = dom + + if len(parts) > 1: + url += u'/' + parts[1] + + return url + + + @staticmethod + def to_uri(url, was_uni=False): + #if not was_uni: + # if isinstance(url, unicode): + # was_uni = True + + #if not was_uni and not '%' in url: + # return url + + parts = WbUrl.FIRST_PATH.split(url, 1) + + #if not was_uni and not '%' in parts[0]: + # return url + + scheme_dom = urllib.unquote_plus(parts[0]) + + if isinstance(scheme_dom, str): + if scheme_dom == parts[0]: + return url + + scheme_dom = scheme_dom.decode('utf-8', 'ignore') + + scheme_dom = scheme_dom.rsplit('/', 1) + dom = scheme_dom[-1] + + dom = dom.encode('idna') + + if len(scheme_dom) > 1: + url = scheme_dom[0] + '/' + dom + else: + url = dom + + if len(parts) > 1: + if isinstance(parts[1], unicode): + url += '/' + urllib.quote_plus(parts[1].encode('utf-8')) + else: + url += '/' + parts[1] + + return url + # ====================== def __init__(self, orig_url): @@ -106,30 +179,7 @@ class WbUrl(BaseWbUrl): if not self._init_replay(orig_url): raise Exception('Invalid WbUrl: ', orig_url) - if was_uni or '%' in self.url: - parts = self.FIRST_PATH.split(self.url, 1) - - if was_uni or '%' in parts[0]: - if not was_uni: - scheme_dom = urllib.unquote_plus(parts[0]) - else: - scheme_dom = parts[0] - - scheme_dom = scheme_dom.rsplit('/', 1) - - dom = scheme_dom[-1] - - dom = dom.decode('utf-8', 'ignore') - dom = dom.encode('idna') - - if len(scheme_dom) > 1: - self.url = scheme_dom[0] + '/' + dom - else: - self.url = dom - - if len(parts) > 1: - self.url += '/' + parts[1] - + self.url = WbUrl.to_uri(self.url, was_uni) # protocol agnostic url -> http:// # no protocol -> http:// @@ -208,6 +258,8 @@ class WbUrl(BaseWbUrl): timestamp = overrides.get('timestamp', self.timestamp) end_timestamp = overrides.get('end_timestamp', self.end_timestamp) url = overrides.get('url', self.url) + if overrides.get('iri'): + url = WbUrl.to_iri(url) return self.to_wburl_str(url=url, type=type_, diff --git a/pywb/static/wb.js b/pywb/static/wb.js index ea14c9fa..0b48ae97 100644 --- a/pywb/static/wb.js +++ b/pywb/static/wb.js @@ -132,9 +132,12 @@ this.load = function() { var hash = window.location.hash; var loc = window.location.href.replace(window.location.hash, ""); + loc = decodeURI(loc); if (wbinfo.top_url && (loc != wbinfo.top_url) && wbinfo.mod != "bn_") { // Auto-redirect to top frame + console.log(wbinfo.top_url); + console.log(loc); window.location.replace(wbinfo.top_url + hash); } else { // Init Banner (no frame or top frame) diff --git a/pywb/ui/head_insert.html b/pywb/ui/head_insert.html index 218e7259..0ebf2fba 100644 --- a/pywb/ui/head_insert.html +++ b/pywb/ui/head_insert.html @@ -2,7 +2,7 @@ {% if rule.js_rewrite_location != 'urls' and include_wombat %} - {{ cdx['statuscode'] }} - {{ cdx['original'] }} - {{ cdx['filename'] }} + {{ cdx.statuscode }} + {{ cdx.url }} + {{ cdx.filename }} {% endfor %} diff --git a/pywb/webapp/handlers.py b/pywb/webapp/handlers.py index ed5a5af4..7aa25e5d 100644 --- a/pywb/webapp/handlers.py +++ b/pywb/webapp/handlers.py @@ -72,7 +72,8 @@ class SearchPageWbUrlHandler(WbUrlHandler): return self.handle_request(wbrequest) def get_top_frame_params(self, wbrequest, mod=''): - embed_url = wbrequest.wb_url.to_str(mod=mod) + embed_url = wbrequest.wb_url.to_str(mod=mod, url='') + embed_url += wbrequest.get_url() if wbrequest.wb_url.timestamp: timestamp = wbrequest.wb_url.timestamp @@ -82,7 +83,7 @@ class SearchPageWbUrlHandler(WbUrlHandler): params = dict(embed_url=embed_url, wbrequest=wbrequest, timestamp=timestamp, - url=wbrequest.wb_url.url, + url=wbrequest.get_url(), banner_html=self.banner_html) return params diff --git a/pywb/webapp/views.py b/pywb/webapp/views.py index 9f2dd1e7..c900aae5 100644 --- a/pywb/webapp/views.py +++ b/pywb/webapp/views.py @@ -3,6 +3,7 @@ from pywb.framework.wbrequestresponse import WbResponse from pywb.framework.memento import make_timemap, LINK_FORMAT import urlparse +import urllib import logging from os import path @@ -128,12 +129,16 @@ class HeadInsertView(J2TemplateView): def create_insert_func(self, wbrequest, include_ts=True): + url = wbrequest.get_url() + top_url = wbrequest.wb_prefix - top_url += wbrequest.wb_url.to_str(mod=wbrequest.final_mod) + top_url += wbrequest.wb_url.to_str(mod=wbrequest.final_mod, url='') + top_url += url include_wombat = not wbrequest.wb_url.is_banner_only def make_head_insert(rule, cdx): + cdx['url'] = url return (self.render_to_string(wbrequest=wbrequest, cdx=cdx, top_url=top_url, @@ -165,9 +170,14 @@ class HeadInsertView(J2TemplateView): #================================================================= class J2HtmlCapturesView(J2TemplateView): def render_response(self, wbrequest, cdx_lines, **kwargs): + def format_cdx_lines(): + for cdx in cdx_lines: + cdx['url'] = wbrequest.get_url(url=cdx['original']) + yield cdx + return J2TemplateView.render_response(self, - cdx_lines=list(cdx_lines), - url=wbrequest.wb_url.url, + cdx_lines=list(format_cdx_lines()), + url=wbrequest.get_url(), type=wbrequest.wb_url.type, prefix=wbrequest.wb_prefix, **kwargs) diff --git a/tests/test_integration.py b/tests/test_integration.py index 17161ae3..ef06cf91 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -97,7 +97,7 @@ class TestWb: resp = self.testapp.get('/pywb/20140127171238tf_/http://www.iana.org/') assert '