diff --git a/pywb/framework/wbrequestresponse.py b/pywb/framework/wbrequestresponse.py index 95a07e66..43680545 100644 --- a/pywb/framework/wbrequestresponse.py +++ b/pywb/framework/wbrequestresponse.py @@ -105,6 +105,18 @@ class WbRequest(object): self._parse_extra() + def get_url(self, url=None): + if not self.wb_url: + return None + + if not url: + url = self.wb_url.url + + if self.urlrewriter.rewrite_opts.get('rewrite_ascii_urls_only'): + return self.wb_url.url + else: + return self.wb_url.to_iri(url) + def _is_ajax(self): value = self.env.get('HTTP_X_REQUESTED_WITH') if value and value.lower() == 'xmlhttprequest': diff --git a/pywb/rewrite/test/test_wburl.py b/pywb/rewrite/test/test_wburl.py index 2e7475af..78bf9764 100644 --- a/pywb/rewrite/test/test_wburl.py +++ b/pywb/rewrite/test/test_wburl.py @@ -4,94 +4,138 @@ ur""" # Replay Urls # ====================== ->>> repr(WbUrl('20131010000506/example.com')) -"('replay', '20131010000506', '', 'http://example.com', '20131010000506/http://example.com')" +>>> repr_unicode(WbUrl('20131010000506/example.com')) +('replay', '20131010000506', '', 'http://example.com', '20131010000506/http://example.com') ->>> repr(WbUrl('20130102im_/https://example.com')) -"('replay', '20130102', 'im_', 'https://example.com', '20130102im_/https://example.com')" +>>> repr_unicode(WbUrl('20130102im_/https://example.com')) +('replay', '20130102', 'im_', 'https://example.com', '20130102im_/https://example.com') ->>> repr(WbUrl('20130102im_/https:/example.com')) -"('replay', '20130102', 'im_', 'https://example.com', '20130102im_/https://example.com')" +>>> repr_unicode(WbUrl('20130102im_/https:/example.com')) +('replay', '20130102', 'im_', 'https://example.com', '20130102im_/https://example.com') # Protocol agnostic convert to http ->>> repr(WbUrl('20130102im_///example.com')) -"('replay', '20130102', 'im_', 'http://example.com', '20130102im_/http://example.com')" +>>> repr_unicode(WbUrl('20130102im_///example.com')) +('replay', '20130102', 'im_', 'http://example.com', '20130102im_/http://example.com') ->>> repr(WbUrl('cs_/example.com')) -"('latest_replay', '', 'cs_', 'http://example.com', 'cs_/http://example.com')" +>>> repr_unicode(WbUrl('cs_/example.com')) +('latest_replay', '', 'cs_', 'http://example.com', 'cs_/http://example.com') ->>> repr(WbUrl('https://example.com/xyz')) -"('latest_replay', '', '', 'https://example.com/xyz', 'https://example.com/xyz')" +>>> repr_unicode(WbUrl('https://example.com/xyz')) +('latest_replay', '', '', 'https://example.com/xyz', 'https://example.com/xyz') ->>> repr(WbUrl('https:/example.com/xyz')) -"('latest_replay', '', '', 'https://example.com/xyz', 'https://example.com/xyz')" +>>> repr_unicode(WbUrl('https:/example.com/xyz')) +('latest_replay', '', '', 'https://example.com/xyz', 'https://example.com/xyz') ->>> repr(WbUrl('https://example.com/xyz?a=%2f&b=%2E')) -"('latest_replay', '', '', 'https://example.com/xyz?a=%2f&b=%2E', 'https://example.com/xyz?a=%2f&b=%2E')" +>>> repr_unicode(WbUrl('https://example.com/xyz?a=%2f&b=%2E')) +('latest_replay', '', '', 'https://example.com/xyz?a=/&b=.', 'https://example.com/xyz?a=/&b=.') # Test scheme partially encoded urls ->>> repr(WbUrl('https%3A//example.com/')) -"('latest_replay', '', '', 'https://example.com/', 'https://example.com/')" +>>> repr_unicode(WbUrl('https%3A//example.com/')) +('latest_replay', '', '', 'https://example.com/', 'https://example.com/') ->>> repr(WbUrl('2014/http%3A%2F%2Fexample.com/')) -"('replay', '2014', '', 'http://example.com/', '2014/http://example.com/')" +>>> repr_unicode(WbUrl('2014/http%3A%2F%2Fexample.com/')) +('replay', '2014', '', 'http://example.com/', '2014/http://example.com/') # Test IDNs ->>> repr(WbUrl(u'http://пример.испытание')) -"('latest_replay', '', '', 'http://xn--e1afmkfd.xn--80akhbyknj4f', 'http://xn--e1afmkfd.xn--80akhbyknj4f')" ->>> repr(WbUrl(u'https://пример.испытание/abc/')) -"('latest_replay', '', '', 'https://xn--e1afmkfd.xn--80akhbyknj4f/abc/', 'https://xn--e1afmkfd.xn--80akhbyknj4f/abc/')" +To IRI +>>> print(WbUrl.to_iri(u'https://пример.испытание')) +https://пример.испытание ->>> repr(WbUrl(u'//пример.испытание/abc/')) -"('latest_replay', '', '', 'http://xn--e1afmkfd.xn--80akhbyknj4f/abc/', 'http://xn--e1afmkfd.xn--80akhbyknj4f/abc/')" +>>> print(WbUrl.to_iri(u'пример.испытание')) +пример.испытание ->>> repr(WbUrl(u'2014id_/https://пример.испытание/abc')) -"('replay', '2014', 'id_', 'https://xn--e1afmkfd.xn--80akhbyknj4f/abc', '2014id_/https://xn--e1afmkfd.xn--80akhbyknj4f/abc')" +>>> print(WbUrl.to_iri('http://' + quote_plus(u'пример.испытание'.encode('utf-8')))) +http://пример.испытание + +>>> print(WbUrl.to_iri(u'//пример.испытание/abc/испытание')) +//пример.испытание/abc/испытание + +>>> print(WbUrl.to_iri(quote_plus(u'пример.испытание'.encode('utf-8')) + '/abc/' + quote_plus(u'пример'.encode('utf-8')))) +пример.испытание/abc/пример + +>>> print(WbUrl.to_iri('https://xn--e1afmkfd.xn--80akhbyknj4f')) +https://пример.испытание + + +To URI +>>> print(WbUrl.to_uri(u'https://пример.испытание')) +https://xn--e1afmkfd.xn--80akhbyknj4f + +>>> print(WbUrl.to_uri(u'пример.испытание')) +xn--e1afmkfd.xn--80akhbyknj4f + +>>> print(WbUrl.to_uri('http://' + quote_plus(u'пример.испытание'.encode('utf-8')))) +http://xn--e1afmkfd.xn--80akhbyknj4f + +>>> print(WbUrl.to_uri(u'//пример.испытание/abc/испытание')) +//xn--e1afmkfd.xn--80akhbyknj4f/abc%2F%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5 + +>>> print(WbUrl.to_uri('//' + quote_plus(u'пример.испытание'.encode('utf-8')) + '/abc/' + quote_plus(u'пример'.encode('utf-8')))) +//xn--e1afmkfd.xn--80akhbyknj4f/abc/%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80 + +>>> print(WbUrl.to_uri('https://xn--e1afmkfd.xn--80akhbyknj4f/abc/')) +https://xn--e1afmkfd.xn--80akhbyknj4f/abc/ + +>>> print(WbUrl.to_uri('http://' + quote_plus(u'пример.испытание'.encode('utf-8'))[1:])) +http://xn--d0-olcluwd.xn--80akhbyknj4f + +# IRI representation +>>> repr_unicode(WbUrl(u'http://пример.испытание')) +('latest_replay', '', '', 'http://пример.испытание', 'http://пример.испытание') + +>>> repr_unicode(WbUrl(u'https://пример.испытание/abc/')) +('latest_replay', '', '', 'https://пример.испытание/abc/', 'https://пример.испытание/abc/') + +>>> repr_unicode(WbUrl(u'//пример.испытание/abc/')) +('latest_replay', '', '', 'http://пример.испытание/abc/', 'http://пример.испытание/abc/') + +>>> repr_unicode(WbUrl(u'2014id_/https://пример.испытание/abc')) +('replay', '2014', 'id_', 'https://пример.испытание/abc', '2014id_/https://пример.испытание/abc') # percent-encoded form (as sent by browser usually) ->>> repr(WbUrl('2014id_/http://' + quote_plus(u'пример.испытание'.encode('utf-8')) + '/abc')) -"('replay', '2014', 'id_', 'http://xn--e1afmkfd.xn--80akhbyknj4f/abc', '2014id_/http://xn--e1afmkfd.xn--80akhbyknj4f/abc')" +>>> repr_unicode(WbUrl('2014id_/http://' + quote_plus(u'пример.испытание'.encode('utf-8')) + '/abc')) +('replay', '2014', 'id_', 'http://пример.испытание/abc', '2014id_/http://пример.испытание/abc') # percent-encoded form -- scheme relative ->>> repr(WbUrl('2014id_///' + quote_plus(u'пример.испытание'.encode('utf-8')) + '/abc')) -"('replay', '2014', 'id_', 'http://xn--e1afmkfd.xn--80akhbyknj4f/abc', '2014id_/http://xn--e1afmkfd.xn--80akhbyknj4f/abc')" +>>> repr_unicode(WbUrl('2014id_///' + quote_plus(u'пример.испытание'.encode('utf-8')) + '/abc')) +('replay', '2014', 'id_', 'http://пример.испытание/abc', '2014id_/http://пример.испытание/abc') # invalid: truncated and superfluous '%', ignore invalid (no exception) ->>> repr(WbUrl('2014id_/http://' + quote_plus(u'пример.испытание'.encode('utf-8'))[1:] + '%' + '/abc')) -"('replay', '2014', 'id_', 'http://xn--d0-olcluwd.xn--%-7sbpkb3ampk3g/abc', '2014id_/http://xn--d0-olcluwd.xn--%-7sbpkb3ampk3g/abc')" +>>> repr_unicode(WbUrl('2014id_/http://' + quote_plus(u'пример.испытание'.encode('utf-8'))[1:] + '%' + '/abc')) +('replay', '2014', 'id_', 'http://d0ример.испытание%/abc', '2014id_/http://d0ример.испытание%/abc') # Query Urls # ====================== ->>> repr(WbUrl('*/http://example.com/abc?def=a')) -"('query', '', '', 'http://example.com/abc?def=a', '*/http://example.com/abc?def=a')" +>>> repr_unicode(WbUrl('*/http://example.com/abc?def=a')) +('query', '', '', 'http://example.com/abc?def=a', '*/http://example.com/abc?def=a') ->>> repr(WbUrl('*/http://example.com/abc?def=a*')) -"('url_query', '', '', 'http://example.com/abc?def=a', '*/http://example.com/abc?def=a*')" +>>> repr_unicode(WbUrl('*/http://example.com/abc?def=a*')) +('url_query', '', '', 'http://example.com/abc?def=a', '*/http://example.com/abc?def=a*') ->>> repr(WbUrl('2010*/http://example.com/abc?def=a')) -"('query', '2010', '', 'http://example.com/abc?def=a', '2010*/http://example.com/abc?def=a')" +>>> repr_unicode(WbUrl('2010*/http://example.com/abc?def=a')) +('query', '2010', '', 'http://example.com/abc?def=a', '2010*/http://example.com/abc?def=a') # timestamp range query ->>> repr(WbUrl('2009-2015*/http://example.com/abc?def=a')) -"('query', '2009', '', 'http://example.com/abc?def=a', '2009-2015*/http://example.com/abc?def=a')" +>>> repr_unicode(WbUrl('2009-2015*/http://example.com/abc?def=a')) +('query', '2009', '', 'http://example.com/abc?def=a', '2009-2015*/http://example.com/abc?def=a') ->>> repr(WbUrl('json/*/http://example.com/abc?def=a')) -"('query', '', 'json', 'http://example.com/abc?def=a', 'json/*/http://example.com/abc?def=a')" +>>> repr_unicode(WbUrl('json/*/http://example.com/abc?def=a')) +('query', '', 'json', 'http://example.com/abc?def=a', 'json/*/http://example.com/abc?def=a') ->>> repr(WbUrl('timemap-link/2011*/http://example.com/abc?def=a')) -"('query', '2011', 'timemap-link', 'http://example.com/abc?def=a', 'timemap-link/2011*/http://example.com/abc?def=a')" +>>> repr_unicode(WbUrl('timemap-link/2011*/http://example.com/abc?def=a')) +('query', '2011', 'timemap-link', 'http://example.com/abc?def=a', 'timemap-link/2011*/http://example.com/abc?def=a') # strip off repeated, likely scheme-agnostic, slashes altogether ->>> repr(WbUrl('///example.com')) -"('latest_replay', '', '', 'http://example.com', 'http://example.com')" +>>> repr_unicode(WbUrl('///example.com')) +('latest_replay', '', '', 'http://example.com', 'http://example.com') ->>> repr(WbUrl('//example.com/')) -"('latest_replay', '', '', 'http://example.com/', 'http://example.com/')" +>>> repr_unicode(WbUrl('//example.com/')) +('latest_replay', '', '', 'http://example.com/', 'http://example.com/') ->>> repr(WbUrl('/example.com/')) -"('latest_replay', '', '', 'http://example.com/', 'http://example.com/')" +>>> repr_unicode(WbUrl('/example.com/')) +('latest_replay', '', '', 'http://example.com/', 'http://example.com/') # Is_ Tests >>> u = WbUrl('*/http://example.com/abc?def=a*') @@ -131,7 +175,20 @@ Exception: ('Invalid WbUrl: ', '') """ from pywb.rewrite.wburl import WbUrl -from urllib import quote_plus +from urllib import quote_plus, unquote_plus + +from StringIO import StringIO + + +def repr_unicode(wburl): + buff = StringIO() + buff.write("('{0}', '{1}', '{2}', '".format(wburl.type, wburl.timestamp, wburl.mod)) + buff.write(WbUrl.to_iri(wburl.url)) + buff.write("', '") + buff.write(wburl.to_str(iri=True)) + buff.write("')") + print(buff.getvalue()) + if __name__ == "__main__": import doctest diff --git a/pywb/rewrite/url_rewriter.py b/pywb/rewrite/url_rewriter.py index cae44b73..201b6016 100644 --- a/pywb/rewrite/url_rewriter.py +++ b/pywb/rewrite/url_rewriter.py @@ -52,6 +52,9 @@ class UrlRewriter(object): is_abs = True url = 'http:' + url + # always convert any unicode urls to punycode + ascii_urls_only = self.rewrite_opts.get('rewrite_ascii_urls_only', False) + # Optimized rewriter for # -rel urls that don't start with / and # do not contain ../ and no special mod @@ -68,7 +71,11 @@ class UrlRewriter(object): if mod is None: mod = wburl.mod - final_url = self.prefix + wburl.to_str(mod=mod, url=new_url) + final_url = self.prefix + wburl.to_str(mod=mod, + url=new_url, + iri=not ascii_urls_only) + if not ascii_urls_only: + final_url = final_url.encode('utf-8') return final_url diff --git a/pywb/rewrite/wburl.py b/pywb/rewrite/wburl.py index 9436dc20..859e9582 100644 --- a/pywb/rewrite/wburl.py +++ b/pywb/rewrite/wburl.py @@ -90,6 +90,79 @@ class WbUrl(BaseWbUrl): #PARTIAL_ENC_RX = re.compile('(https?%3A)?(%2F%2F)?', re.I) FIRST_PATH = re.compile('(? 1: + url = scheme_dom[0] + u'/' + dom + else: + url = dom + + if len(parts) > 1: + url += u'/' + parts[1] + + return url + + + @staticmethod + def to_uri(url, was_uni=False): + #if not was_uni: + # if isinstance(url, unicode): + # was_uni = True + + #if not was_uni and not '%' in url: + # return url + + parts = WbUrl.FIRST_PATH.split(url, 1) + + #if not was_uni and not '%' in parts[0]: + # return url + + scheme_dom = urllib.unquote_plus(parts[0]) + + if isinstance(scheme_dom, str): + if scheme_dom == parts[0]: + return url + + scheme_dom = scheme_dom.decode('utf-8', 'ignore') + + scheme_dom = scheme_dom.rsplit('/', 1) + dom = scheme_dom[-1] + + dom = dom.encode('idna') + + if len(scheme_dom) > 1: + url = scheme_dom[0] + '/' + dom + else: + url = dom + + if len(parts) > 1: + if isinstance(parts[1], unicode): + url += '/' + urllib.quote_plus(parts[1].encode('utf-8')) + else: + url += '/' + parts[1] + + return url + # ====================== def __init__(self, orig_url): @@ -106,30 +179,7 @@ class WbUrl(BaseWbUrl): if not self._init_replay(orig_url): raise Exception('Invalid WbUrl: ', orig_url) - if was_uni or '%' in self.url: - parts = self.FIRST_PATH.split(self.url, 1) - - if was_uni or '%' in parts[0]: - if not was_uni: - scheme_dom = urllib.unquote_plus(parts[0]) - else: - scheme_dom = parts[0] - - scheme_dom = scheme_dom.rsplit('/', 1) - - dom = scheme_dom[-1] - - dom = dom.decode('utf-8', 'ignore') - dom = dom.encode('idna') - - if len(scheme_dom) > 1: - self.url = scheme_dom[0] + '/' + dom - else: - self.url = dom - - if len(parts) > 1: - self.url += '/' + parts[1] - + self.url = WbUrl.to_uri(self.url, was_uni) # protocol agnostic url -> http:// # no protocol -> http:// @@ -208,6 +258,8 @@ class WbUrl(BaseWbUrl): timestamp = overrides.get('timestamp', self.timestamp) end_timestamp = overrides.get('end_timestamp', self.end_timestamp) url = overrides.get('url', self.url) + if overrides.get('iri'): + url = WbUrl.to_iri(url) return self.to_wburl_str(url=url, type=type_, diff --git a/pywb/static/wb.js b/pywb/static/wb.js index ea14c9fa..0b48ae97 100644 --- a/pywb/static/wb.js +++ b/pywb/static/wb.js @@ -132,9 +132,12 @@ this.load = function() { var hash = window.location.hash; var loc = window.location.href.replace(window.location.hash, ""); + loc = decodeURI(loc); if (wbinfo.top_url && (loc != wbinfo.top_url) && wbinfo.mod != "bn_") { // Auto-redirect to top frame + console.log(wbinfo.top_url); + console.log(loc); window.location.replace(wbinfo.top_url + hash); } else { // Init Banner (no frame or top frame) diff --git a/pywb/ui/head_insert.html b/pywb/ui/head_insert.html index 218e7259..0ebf2fba 100644 --- a/pywb/ui/head_insert.html +++ b/pywb/ui/head_insert.html @@ -2,7 +2,7 @@ {% if rule.js_rewrite_location != 'urls' and include_wombat %} -