From afe49a91f4fac1a05f8344cace6685da4626256e Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 14 Feb 2015 20:55:36 -0800 Subject: [PATCH] rewrite: more fixes for IDN #66 - add _do_percent_encode field to wburl itself defaults to true, may be disabled with 'punycode_links' remove wbrequest and urlrewriter from get_url path, simply call wb_url.get_url() to get properly formatted url --- pywb/framework/wbrequestresponse.py | 8 ----- pywb/rewrite/test/test_html_rewriter.py | 4 +-- pywb/rewrite/test/test_wburl.py | 26 ++++++++++------- pywb/rewrite/url_rewriter.py | 9 +++--- pywb/rewrite/wburl.py | 39 ++++++++++++------------- pywb/webapp/handlers.py | 5 ++-- pywb/webapp/replay_views.py | 1 - pywb/webapp/views.py | 9 +++--- 8 files changed, 46 insertions(+), 55 deletions(-) diff --git a/pywb/framework/wbrequestresponse.py b/pywb/framework/wbrequestresponse.py index 4a638382..95a07e66 100644 --- a/pywb/framework/wbrequestresponse.py +++ b/pywb/framework/wbrequestresponse.py @@ -105,14 +105,6 @@ class WbRequest(object): self._parse_extra() - def get_url(self, url=None): - if not self.wb_url: - return None - - # pencode urls to force actual urls to appear, unless ascii_links_only set to true - pencode = self.urlrewriter.rewrite_opts.get('punycode_link_only', False) - return self.wb_url.get_url(url, pencode) - def _is_ajax(self): value = self.env.get('HTTP_X_REQUESTED_WITH') if value and value.lower() == 'xmlhttprequest': diff --git a/pywb/rewrite/test/test_html_rewriter.py b/pywb/rewrite/test/test_html_rewriter.py index 7137b6fa..277753f4 100644 --- a/pywb/rewrite/test/test_html_rewriter.py +++ b/pywb/rewrite/test/test_html_rewriter.py @@ -174,11 +174,11 @@ import pprint urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/web/', - rewrite_opts=dict(punycode_links_only=False)) + rewrite_opts=dict(punycode_links=False)) urlrewriter_pencode = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/web/', - rewrite_opts=dict(punycode_links_only=True)) + rewrite_opts=dict(punycode_links=True)) no_base_canon_rewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', diff --git a/pywb/rewrite/test/test_wburl.py b/pywb/rewrite/test/test_wburl.py index 0665d6d4..c6ac76ef 100644 --- a/pywb/rewrite/test/test_wburl.py +++ b/pywb/rewrite/test/test_wburl.py @@ -29,6 +29,9 @@ ur""" >>> repr(WbUrl('https://example.com/xyz?a=%2f&b=%2E')) "('latest_replay', '', '', 'https://example.com/xyz?a=%2f&b=%2E', 'https://example.com/xyz?a=%2f&b=%2E')" +>>> repr(WbUrl('http://example.com?example=2')) +"('latest_replay', '', '', 'http://example.com?example=2', 'http://example.com?example=2')" + # Test scheme partially encoded urls >>> repr(WbUrl('https%3A//example.com/')) "('latest_replay', '', '', 'https://example.com/', 'https://example.com/')" @@ -68,16 +71,16 @@ http://xn--d0-olcluwd.xn--80akhbyknj4f https://%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5 >>> print(to_uri_pencode(u'пример.испытание')) -%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5 +http://%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5 >>> print(to_uri_pencode('http://' + quote_plus(u'пример.испытание'.encode('utf-8')))) http://%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5 >>> print(to_uri_pencode(u'//пример.испытание/abc/испытание')) -//%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5/abc/%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5 +http://%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5/abc/%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5 >>> print(to_uri_pencode(quote_plus(u'пример.испытание'.encode('utf-8')) + '/abc/' + quote_plus(u'пример'.encode('utf-8')))) -%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5/abc/%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80 +http://%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5/abc/%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80 >>> print(to_uri_pencode('https://xn--e1afmkfd.xn--80akhbyknj4f/foo/bar?abc=def')) https://%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5/foo/bar?abc=def @@ -88,28 +91,31 @@ http://d0%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0% # IRI representation >>> repr(WbUrl(u'http://пример.испытание')) +"('latest_replay', '', '', 'http://xn--e1afmkfd.xn--80akhbyknj4f', 'http://%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5')" + +>>> x = WbUrl(u'http://пример.испытание'); x._do_percent_encode = False; repr(x) "('latest_replay', '', '', 'http://xn--e1afmkfd.xn--80akhbyknj4f', 'http://xn--e1afmkfd.xn--80akhbyknj4f')" >>> repr(WbUrl(u'https://пример.испытание/abc/def_ghi/')) -"('latest_replay', '', '', 'https://xn--e1afmkfd.xn--80akhbyknj4f/abc/def_ghi/', 'https://xn--e1afmkfd.xn--80akhbyknj4f/abc/def_ghi/')" +"('latest_replay', '', '', 'https://xn--e1afmkfd.xn--80akhbyknj4f/abc/def_ghi/', 'https://%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5/abc/def_ghi/')" >>> repr(WbUrl(u'//пример.испытание/abc/')) -"('latest_replay', '', '', 'http://xn--e1afmkfd.xn--80akhbyknj4f/abc/', 'http://xn--e1afmkfd.xn--80akhbyknj4f/abc/')" +"('latest_replay', '', '', 'http://xn--e1afmkfd.xn--80akhbyknj4f/abc/', 'http://%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5/abc/')" >>> repr(WbUrl(u'2014id_/https://пример.испытание/abc')) -"('replay', '2014', 'id_', 'https://xn--e1afmkfd.xn--80akhbyknj4f/abc', '2014id_/https://xn--e1afmkfd.xn--80akhbyknj4f/abc')" +"('replay', '2014', 'id_', 'https://xn--e1afmkfd.xn--80akhbyknj4f/abc', '2014id_/https://%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5/abc')" # percent-encoded form (as sent by browser usually) >>> repr(WbUrl('2014id_/http://' + quote_plus(u'пример.испытание'.encode('utf-8')) + '/abc')) -"('replay', '2014', 'id_', 'http://xn--e1afmkfd.xn--80akhbyknj4f/abc', '2014id_/http://xn--e1afmkfd.xn--80akhbyknj4f/abc')" +"('replay', '2014', 'id_', 'http://xn--e1afmkfd.xn--80akhbyknj4f/abc', '2014id_/http://%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5/abc')" # percent-encoded form -- scheme relative >>> repr(WbUrl('2014id_///' + quote_plus(u'пример.испытание'.encode('utf-8')) + '/abc')) -"('replay', '2014', 'id_', 'http://xn--e1afmkfd.xn--80akhbyknj4f/abc', '2014id_/http://xn--e1afmkfd.xn--80akhbyknj4f/abc')" +"('replay', '2014', 'id_', 'http://xn--e1afmkfd.xn--80akhbyknj4f/abc', '2014id_/http://%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5/abc')" # invalid: truncated and superfluous '%', ignore invalid (no exception) >>> repr(WbUrl('2014id_/http://' + quote_plus(u'пример.испытание'.encode('utf-8'))[1:] + '%' + '/abc')) -"('replay', '2014', 'id_', 'http://xn--d0-olcluwd.xn--%-7sbpkb3ampk3g/abc', '2014id_/http://xn--d0-olcluwd.xn--%-7sbpkb3ampk3g/abc')" +"('replay', '2014', 'id_', 'http://xn--d0-olcluwd.xn--%-7sbpkb3ampk3g/abc', '2014id_/http://d0%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5%25/abc')" # Query Urls @@ -187,7 +193,7 @@ from StringIO import StringIO def to_uri_pencode(url): - return WbUrl.percent_encode_host(WbUrl.to_uri(url)) + return WbUrl(url).get_url() if __name__ == "__main__": diff --git a/pywb/rewrite/url_rewriter.py b/pywb/rewrite/url_rewriter.py index 37a6e095..8bc87b43 100644 --- a/pywb/rewrite/url_rewriter.py +++ b/pywb/rewrite/url_rewriter.py @@ -29,6 +29,9 @@ class UrlRewriter(object): self.cookie_scope = cookie_scope self.rewrite_opts = rewrite_opts + if rewrite_opts.get('punycode_links'): + self.wburl._do_percent_encode = False + def rewrite(self, url, mod=None): # if special protocol, no rewriting at all if any(url.startswith(x) for x in self.NO_REWRITE_URI_PREFIX): @@ -52,9 +55,6 @@ class UrlRewriter(object): is_abs = True url = 'http:' + url - # convert host to %-encoding instead of default punycode - peh = not self.rewrite_opts.get('punycode_links_only', False) - # Optimized rewriter for # -rel urls that don't start with / and # do not contain ../ and no special mod @@ -72,8 +72,7 @@ class UrlRewriter(object): mod = wburl.mod final_url = self.prefix + wburl.to_str(mod=mod, - url=new_url, - percent_encode=peh) + url=new_url) return final_url def get_new_url(self, **kwargs): diff --git a/pywb/rewrite/wburl.py b/pywb/rewrite/wburl.py index 50907791..dac119f5 100644 --- a/pywb/rewrite/wburl.py +++ b/pywb/rewrite/wburl.py @@ -40,7 +40,7 @@ wayback url format. import re import urllib - +import urlparse #================================================================= class BaseWbUrl(object): @@ -96,30 +96,24 @@ class WbUrl(BaseWbUrl): to have a %-encoded host instead of punycode host The rest of url should be unchanged """ - parts = WbUrl.FIRST_PATH.split(url, 1) - - scheme_dom = parts[0].rsplit('/', 1) - - dom = scheme_dom[-1] + parts = urlparse.urlsplit(url) + domain = parts.netloc try: - dom = dom.decode('idna') - dom = dom.encode('utf-8', 'ignore') + domain = domain.decode('idna') + domain = domain.encode('utf-8', 'ignore') except: # likely already encoded, so use as is pass - dom = urllib.quote(dom, safe=r':\/') + domain = urllib.quote(domain)#, safe=r':\/') - if len(scheme_dom) > 1: - url = scheme_dom[0] + '/' + dom - else: - url = dom + # no changes + if parts.netloc == domain: + return url - if len(parts) > 1: - url += '/' + parts[1] + return urlparse.urlunsplit((parts[0], domain, parts[2], parts[3], parts[4])) - return url @staticmethod def to_uri(url): @@ -174,7 +168,11 @@ class WbUrl(BaseWbUrl): if not self._init_replay(orig_url): raise Exception('Invalid WbUrl: ', orig_url) - self.url = WbUrl.to_uri(self.url) + new_uri = WbUrl.to_uri(self.url) + + self._do_percent_encode = True + + self.url = new_uri # protocol agnostic url -> http:// # no protocol -> http:// @@ -244,13 +242,13 @@ class WbUrl(BaseWbUrl): self.url = new_url return self.url - def get_url(self, url=None, percent_encode=False): + def get_url(self, url=None): if url is not None: url = WbUrl.to_uri(url) else: url = self.url - if percent_encode: + if self._do_percent_encode: url = WbUrl.percent_encode_host(url) return url @@ -264,8 +262,7 @@ class WbUrl(BaseWbUrl): timestamp = overrides.get('timestamp', self.timestamp) end_timestamp = overrides.get('end_timestamp', self.end_timestamp) - url = self.get_url(overrides.get('url'), - overrides.get('percent_encode', False)) + url = self.get_url(overrides.get('url', self.url)) return self.to_wburl_str(url=url, type=type_, diff --git a/pywb/webapp/handlers.py b/pywb/webapp/handlers.py index d71296ed..a8e72118 100644 --- a/pywb/webapp/handlers.py +++ b/pywb/webapp/handlers.py @@ -75,8 +75,7 @@ class SearchPageWbUrlHandler(WbUrlHandler): return self.handle_not_found(wbrequest, nfe) def get_top_frame_params(self, wbrequest, mod=''): - embed_url = wbrequest.wb_url.to_str(mod=mod, url='') - embed_url += wbrequest.get_url() + embed_url = wbrequest.wb_url.to_str(mod=mod) if wbrequest.wb_url.timestamp: timestamp = wbrequest.wb_url.timestamp @@ -86,7 +85,7 @@ class SearchPageWbUrlHandler(WbUrlHandler): params = dict(embed_url=embed_url, wbrequest=wbrequest, timestamp=timestamp, - url=wbrequest.get_url(), + url=wbrequest.wb_url.get_url(), banner_html=self.banner_html) return params diff --git a/pywb/webapp/replay_views.py b/pywb/webapp/replay_views.py index 2bdcf730..5fe22d26 100644 --- a/pywb/webapp/replay_views.py +++ b/pywb/webapp/replay_views.py @@ -241,7 +241,6 @@ class ReplayView(object): else: statusline = '302 Internal Redirect' - new_url = new_url.encode('utf-8') status_headers = StatusAndHeaders(statusline, [('Location', new_url)]) diff --git a/pywb/webapp/views.py b/pywb/webapp/views.py index c900aae5..11624a22 100644 --- a/pywb/webapp/views.py +++ b/pywb/webapp/views.py @@ -129,11 +129,10 @@ class HeadInsertView(J2TemplateView): def create_insert_func(self, wbrequest, include_ts=True): - url = wbrequest.get_url() + url = wbrequest.wb_url.get_url() top_url = wbrequest.wb_prefix - top_url += wbrequest.wb_url.to_str(mod=wbrequest.final_mod, url='') - top_url += url + top_url += wbrequest.wb_url.to_str(mod=wbrequest.final_mod) include_wombat = not wbrequest.wb_url.is_banner_only @@ -172,12 +171,12 @@ class J2HtmlCapturesView(J2TemplateView): def render_response(self, wbrequest, cdx_lines, **kwargs): def format_cdx_lines(): for cdx in cdx_lines: - cdx['url'] = wbrequest.get_url(url=cdx['original']) + cdx['url'] = wbrequest.wb_url.get_url(url=cdx['original']) yield cdx return J2TemplateView.render_response(self, cdx_lines=list(format_cdx_lines()), - url=wbrequest.get_url(), + url=wbrequest.wb_url.get_url(), type=wbrequest.wb_url.type, prefix=wbrequest.wb_prefix, **kwargs)