1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

rewrite: more fixes for IDN #66 - add _do_percent_encode field to wburl itself

Percent-encoding of the host defaults to true; it may be disabled with the 'punycode_links' rewrite option.
Remove wbrequest and urlrewriter from the get_url path; simply call wb_url.get_url() to get a properly formatted url.
This commit is contained in:
Ilya Kreymer 2015-02-14 20:55:36 -08:00
parent f9452bf48e
commit afe49a91f4
8 changed files with 46 additions and 55 deletions

View File

@ -105,14 +105,6 @@ class WbRequest(object):
self._parse_extra()
def get_url(self, url=None):
if not self.wb_url:
return None
# pencode urls to force actual urls to appear, unless ascii_links_only set to true
pencode = self.urlrewriter.rewrite_opts.get('punycode_link_only', False)
return self.wb_url.get_url(url, pencode)
def _is_ajax(self):
value = self.env.get('HTTP_X_REQUESTED_WITH')
if value and value.lower() == 'xmlhttprequest':

View File

@ -174,11 +174,11 @@ import pprint
urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html',
'/web/',
rewrite_opts=dict(punycode_links_only=False))
rewrite_opts=dict(punycode_links=False))
urlrewriter_pencode = UrlRewriter('20131226101010/http://example.com/some/path/index.html',
'/web/',
rewrite_opts=dict(punycode_links_only=True))
rewrite_opts=dict(punycode_links=True))
no_base_canon_rewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html',

View File

@ -29,6 +29,9 @@ ur"""
>>> repr(WbUrl('https://example.com/xyz?a=%2f&b=%2E'))
"('latest_replay', '', '', 'https://example.com/xyz?a=%2f&b=%2E', 'https://example.com/xyz?a=%2f&b=%2E')"
>>> repr(WbUrl('http://example.com?example=2'))
"('latest_replay', '', '', 'http://example.com?example=2', 'http://example.com?example=2')"
# Test scheme partially encoded urls
>>> repr(WbUrl('https%3A//example.com/'))
"('latest_replay', '', '', 'https://example.com/', 'https://example.com/')"
@ -68,16 +71,16 @@ http://xn--d0-olcluwd.xn--80akhbyknj4f
https://%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5
>>> print(to_uri_pencode(u'пример.испытание'))
%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5
http://%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5
>>> print(to_uri_pencode('http://' + quote_plus(u'пример.испытание'.encode('utf-8'))))
http://%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5
>>> print(to_uri_pencode(u'//пример.испытание/abc/испытание'))
//%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5/abc/%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5
http://%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5/abc/%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5
>>> print(to_uri_pencode(quote_plus(u'пример.испытание'.encode('utf-8')) + '/abc/' + quote_plus(u'пример'.encode('utf-8'))))
%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5/abc/%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80
http://%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5/abc/%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80
>>> print(to_uri_pencode('https://xn--e1afmkfd.xn--80akhbyknj4f/foo/bar?abc=def'))
https://%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5/foo/bar?abc=def
@ -88,28 +91,31 @@ http://d0%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%
# IRI representation
>>> repr(WbUrl(u'http://пример.испытание'))
"('latest_replay', '', '', 'http://xn--e1afmkfd.xn--80akhbyknj4f', 'http://%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5')"
>>> x = WbUrl(u'http://пример.испытание'); x._do_percent_encode = False; repr(x)
"('latest_replay', '', '', 'http://xn--e1afmkfd.xn--80akhbyknj4f', 'http://xn--e1afmkfd.xn--80akhbyknj4f')"
>>> repr(WbUrl(u'https://пример.испытание/abc/def_ghi/'))
"('latest_replay', '', '', 'https://xn--e1afmkfd.xn--80akhbyknj4f/abc/def_ghi/', 'https://xn--e1afmkfd.xn--80akhbyknj4f/abc/def_ghi/')"
"('latest_replay', '', '', 'https://xn--e1afmkfd.xn--80akhbyknj4f/abc/def_ghi/', 'https://%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5/abc/def_ghi/')"
>>> repr(WbUrl(u'//пример.испытание/abc/'))
"('latest_replay', '', '', 'http://xn--e1afmkfd.xn--80akhbyknj4f/abc/', 'http://xn--e1afmkfd.xn--80akhbyknj4f/abc/')"
"('latest_replay', '', '', 'http://xn--e1afmkfd.xn--80akhbyknj4f/abc/', 'http://%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5/abc/')"
>>> repr(WbUrl(u'2014id_/https://пример.испытание/abc'))
"('replay', '2014', 'id_', 'https://xn--e1afmkfd.xn--80akhbyknj4f/abc', '2014id_/https://xn--e1afmkfd.xn--80akhbyknj4f/abc')"
"('replay', '2014', 'id_', 'https://xn--e1afmkfd.xn--80akhbyknj4f/abc', '2014id_/https://%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5/abc')"
# percent-encoded form (as sent by browser usually)
>>> repr(WbUrl('2014id_/http://' + quote_plus(u'пример.испытание'.encode('utf-8')) + '/abc'))
"('replay', '2014', 'id_', 'http://xn--e1afmkfd.xn--80akhbyknj4f/abc', '2014id_/http://xn--e1afmkfd.xn--80akhbyknj4f/abc')"
"('replay', '2014', 'id_', 'http://xn--e1afmkfd.xn--80akhbyknj4f/abc', '2014id_/http://%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5/abc')"
# percent-encoded form -- scheme relative
>>> repr(WbUrl('2014id_///' + quote_plus(u'пример.испытание'.encode('utf-8')) + '/abc'))
"('replay', '2014', 'id_', 'http://xn--e1afmkfd.xn--80akhbyknj4f/abc', '2014id_/http://xn--e1afmkfd.xn--80akhbyknj4f/abc')"
"('replay', '2014', 'id_', 'http://xn--e1afmkfd.xn--80akhbyknj4f/abc', '2014id_/http://%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5/abc')"
# invalid: truncated and superfluous '%', ignore invalid (no exception)
>>> repr(WbUrl('2014id_/http://' + quote_plus(u'пример.испытание'.encode('utf-8'))[1:] + '%' + '/abc'))
"('replay', '2014', 'id_', 'http://xn--d0-olcluwd.xn--%-7sbpkb3ampk3g/abc', '2014id_/http://xn--d0-olcluwd.xn--%-7sbpkb3ampk3g/abc')"
"('replay', '2014', 'id_', 'http://xn--d0-olcluwd.xn--%-7sbpkb3ampk3g/abc', '2014id_/http://d0%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5%25/abc')"
# Query Urls
@ -187,7 +193,7 @@ from StringIO import StringIO
def to_uri_pencode(url):
return WbUrl.percent_encode_host(WbUrl.to_uri(url))
return WbUrl(url).get_url()
if __name__ == "__main__":

View File

@ -29,6 +29,9 @@ class UrlRewriter(object):
self.cookie_scope = cookie_scope
self.rewrite_opts = rewrite_opts
if rewrite_opts.get('punycode_links'):
self.wburl._do_percent_encode = False
def rewrite(self, url, mod=None):
# if special protocol, no rewriting at all
if any(url.startswith(x) for x in self.NO_REWRITE_URI_PREFIX):
@ -52,9 +55,6 @@ class UrlRewriter(object):
is_abs = True
url = 'http:' + url
# convert host to %-encoding instead of default punycode
peh = not self.rewrite_opts.get('punycode_links_only', False)
# Optimized rewriter for
# -rel urls that don't start with / and
# do not contain ../ and no special mod
@ -72,8 +72,7 @@ class UrlRewriter(object):
mod = wburl.mod
final_url = self.prefix + wburl.to_str(mod=mod,
url=new_url,
percent_encode=peh)
url=new_url)
return final_url
def get_new_url(self, **kwargs):

View File

@ -40,7 +40,7 @@ wayback url format.
import re
import urllib
import urlparse
#=================================================================
class BaseWbUrl(object):
@ -96,30 +96,24 @@ class WbUrl(BaseWbUrl):
to have a %-encoded host instead of punycode host
The rest of url should be unchanged
"""
parts = WbUrl.FIRST_PATH.split(url, 1)
scheme_dom = parts[0].rsplit('/', 1)
dom = scheme_dom[-1]
parts = urlparse.urlsplit(url)
domain = parts.netloc
try:
dom = dom.decode('idna')
dom = dom.encode('utf-8', 'ignore')
domain = domain.decode('idna')
domain = domain.encode('utf-8', 'ignore')
except:
# likely already encoded, so use as is
pass
dom = urllib.quote(dom, safe=r':\/')
domain = urllib.quote(domain)#, safe=r':\/')
if len(scheme_dom) > 1:
url = scheme_dom[0] + '/' + dom
else:
url = dom
# no changes
if parts.netloc == domain:
return url
if len(parts) > 1:
url += '/' + parts[1]
return urlparse.urlunsplit((parts[0], domain, parts[2], parts[3], parts[4]))
return url
@staticmethod
def to_uri(url):
@ -174,7 +168,11 @@ class WbUrl(BaseWbUrl):
if not self._init_replay(orig_url):
raise Exception('Invalid WbUrl: ', orig_url)
self.url = WbUrl.to_uri(self.url)
new_uri = WbUrl.to_uri(self.url)
self._do_percent_encode = True
self.url = new_uri
# protocol agnostic url -> http://
# no protocol -> http://
@ -244,13 +242,13 @@ class WbUrl(BaseWbUrl):
self.url = new_url
return self.url
def get_url(self, url=None, percent_encode=False):
def get_url(self, url=None):
if url is not None:
url = WbUrl.to_uri(url)
else:
url = self.url
if percent_encode:
if self._do_percent_encode:
url = WbUrl.percent_encode_host(url)
return url
@ -264,8 +262,7 @@ class WbUrl(BaseWbUrl):
timestamp = overrides.get('timestamp', self.timestamp)
end_timestamp = overrides.get('end_timestamp', self.end_timestamp)
url = self.get_url(overrides.get('url'),
overrides.get('percent_encode', False))
url = self.get_url(overrides.get('url', self.url))
return self.to_wburl_str(url=url,
type=type_,

View File

@ -75,8 +75,7 @@ class SearchPageWbUrlHandler(WbUrlHandler):
return self.handle_not_found(wbrequest, nfe)
def get_top_frame_params(self, wbrequest, mod=''):
embed_url = wbrequest.wb_url.to_str(mod=mod, url='')
embed_url += wbrequest.get_url()
embed_url = wbrequest.wb_url.to_str(mod=mod)
if wbrequest.wb_url.timestamp:
timestamp = wbrequest.wb_url.timestamp
@ -86,7 +85,7 @@ class SearchPageWbUrlHandler(WbUrlHandler):
params = dict(embed_url=embed_url,
wbrequest=wbrequest,
timestamp=timestamp,
url=wbrequest.get_url(),
url=wbrequest.wb_url.get_url(),
banner_html=self.banner_html)
return params

View File

@ -241,7 +241,6 @@ class ReplayView(object):
else:
statusline = '302 Internal Redirect'
new_url = new_url.encode('utf-8')
status_headers = StatusAndHeaders(statusline,
[('Location', new_url)])

View File

@ -129,11 +129,10 @@ class HeadInsertView(J2TemplateView):
def create_insert_func(self, wbrequest,
include_ts=True):
url = wbrequest.get_url()
url = wbrequest.wb_url.get_url()
top_url = wbrequest.wb_prefix
top_url += wbrequest.wb_url.to_str(mod=wbrequest.final_mod, url='')
top_url += url
top_url += wbrequest.wb_url.to_str(mod=wbrequest.final_mod)
include_wombat = not wbrequest.wb_url.is_banner_only
@ -172,12 +171,12 @@ class J2HtmlCapturesView(J2TemplateView):
def render_response(self, wbrequest, cdx_lines, **kwargs):
def format_cdx_lines():
for cdx in cdx_lines:
cdx['url'] = wbrequest.get_url(url=cdx['original'])
cdx['url'] = wbrequest.wb_url.get_url(url=cdx['original'])
yield cdx
return J2TemplateView.render_response(self,
cdx_lines=list(format_cdx_lines()),
url=wbrequest.get_url(),
url=wbrequest.wb_url.get_url(),
type=wbrequest.wb_url.type,
prefix=wbrequest.wb_prefix,
**kwargs)