mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
rewrite: more fixes for IDN #66 - add _do_percent_encode field to wburl itself
Defaults to true; may be disabled with the 'punycode_links' rewrite option. Remove wbrequest and urlrewriter from the get_url path; simply call wb_url.get_url() to get a properly formatted url.
This commit is contained in:
parent
f9452bf48e
commit
afe49a91f4
@ -105,14 +105,6 @@ class WbRequest(object):
|
||||
|
||||
self._parse_extra()
|
||||
|
||||
def get_url(self, url=None):
|
||||
if not self.wb_url:
|
||||
return None
|
||||
|
||||
# pencode urls to force actual urls to appear, unless ascii_links_only set to true
|
||||
pencode = self.urlrewriter.rewrite_opts.get('punycode_link_only', False)
|
||||
return self.wb_url.get_url(url, pencode)
|
||||
|
||||
def _is_ajax(self):
|
||||
value = self.env.get('HTTP_X_REQUESTED_WITH')
|
||||
if value and value.lower() == 'xmlhttprequest':
|
||||
|
@ -174,11 +174,11 @@ import pprint
|
||||
|
||||
urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html',
|
||||
'/web/',
|
||||
rewrite_opts=dict(punycode_links_only=False))
|
||||
rewrite_opts=dict(punycode_links=False))
|
||||
|
||||
urlrewriter_pencode = UrlRewriter('20131226101010/http://example.com/some/path/index.html',
|
||||
'/web/',
|
||||
rewrite_opts=dict(punycode_links_only=True))
|
||||
rewrite_opts=dict(punycode_links=True))
|
||||
|
||||
|
||||
no_base_canon_rewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html',
|
||||
|
@ -29,6 +29,9 @@ ur"""
|
||||
>>> repr(WbUrl('https://example.com/xyz?a=%2f&b=%2E'))
|
||||
"('latest_replay', '', '', 'https://example.com/xyz?a=%2f&b=%2E', 'https://example.com/xyz?a=%2f&b=%2E')"
|
||||
|
||||
>>> repr(WbUrl('http://example.com?example=2'))
|
||||
"('latest_replay', '', '', 'http://example.com?example=2', 'http://example.com?example=2')"
|
||||
|
||||
# Test scheme partially encoded urls
|
||||
>>> repr(WbUrl('https%3A//example.com/'))
|
||||
"('latest_replay', '', '', 'https://example.com/', 'https://example.com/')"
|
||||
@ -68,16 +71,16 @@ http://xn--d0-olcluwd.xn--80akhbyknj4f
|
||||
https://%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5
|
||||
|
||||
>>> print(to_uri_pencode(u'пример.испытание'))
|
||||
%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5
|
||||
http://%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5
|
||||
|
||||
>>> print(to_uri_pencode('http://' + quote_plus(u'пример.испытание'.encode('utf-8'))))
|
||||
http://%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5
|
||||
|
||||
>>> print(to_uri_pencode(u'//пример.испытание/abc/испытание'))
|
||||
//%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5/abc/%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5
|
||||
http://%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5/abc/%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5
|
||||
|
||||
>>> print(to_uri_pencode(quote_plus(u'пример.испытание'.encode('utf-8')) + '/abc/' + quote_plus(u'пример'.encode('utf-8'))))
|
||||
%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5/abc/%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80
|
||||
http://%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5/abc/%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80
|
||||
|
||||
>>> print(to_uri_pencode('https://xn--e1afmkfd.xn--80akhbyknj4f/foo/bar?abc=def'))
|
||||
https://%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5/foo/bar?abc=def
|
||||
@ -88,28 +91,31 @@ http://d0%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%
|
||||
|
||||
# IRI representation
|
||||
>>> repr(WbUrl(u'http://пример.испытание'))
|
||||
"('latest_replay', '', '', 'http://xn--e1afmkfd.xn--80akhbyknj4f', 'http://%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5')"
|
||||
|
||||
>>> x = WbUrl(u'http://пример.испытание'); x._do_percent_encode = False; repr(x)
|
||||
"('latest_replay', '', '', 'http://xn--e1afmkfd.xn--80akhbyknj4f', 'http://xn--e1afmkfd.xn--80akhbyknj4f')"
|
||||
|
||||
>>> repr(WbUrl(u'https://пример.испытание/abc/def_ghi/'))
|
||||
"('latest_replay', '', '', 'https://xn--e1afmkfd.xn--80akhbyknj4f/abc/def_ghi/', 'https://xn--e1afmkfd.xn--80akhbyknj4f/abc/def_ghi/')"
|
||||
"('latest_replay', '', '', 'https://xn--e1afmkfd.xn--80akhbyknj4f/abc/def_ghi/', 'https://%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5/abc/def_ghi/')"
|
||||
|
||||
>>> repr(WbUrl(u'//пример.испытание/abc/'))
|
||||
"('latest_replay', '', '', 'http://xn--e1afmkfd.xn--80akhbyknj4f/abc/', 'http://xn--e1afmkfd.xn--80akhbyknj4f/abc/')"
|
||||
"('latest_replay', '', '', 'http://xn--e1afmkfd.xn--80akhbyknj4f/abc/', 'http://%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5/abc/')"
|
||||
|
||||
>>> repr(WbUrl(u'2014id_/https://пример.испытание/abc'))
|
||||
"('replay', '2014', 'id_', 'https://xn--e1afmkfd.xn--80akhbyknj4f/abc', '2014id_/https://xn--e1afmkfd.xn--80akhbyknj4f/abc')"
|
||||
"('replay', '2014', 'id_', 'https://xn--e1afmkfd.xn--80akhbyknj4f/abc', '2014id_/https://%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5/abc')"
|
||||
|
||||
# percent-encoded form (as sent by browser usually)
|
||||
>>> repr(WbUrl('2014id_/http://' + quote_plus(u'пример.испытание'.encode('utf-8')) + '/abc'))
|
||||
"('replay', '2014', 'id_', 'http://xn--e1afmkfd.xn--80akhbyknj4f/abc', '2014id_/http://xn--e1afmkfd.xn--80akhbyknj4f/abc')"
|
||||
"('replay', '2014', 'id_', 'http://xn--e1afmkfd.xn--80akhbyknj4f/abc', '2014id_/http://%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5/abc')"
|
||||
|
||||
# percent-encoded form -- scheme relative
|
||||
>>> repr(WbUrl('2014id_///' + quote_plus(u'пример.испытание'.encode('utf-8')) + '/abc'))
|
||||
"('replay', '2014', 'id_', 'http://xn--e1afmkfd.xn--80akhbyknj4f/abc', '2014id_/http://xn--e1afmkfd.xn--80akhbyknj4f/abc')"
|
||||
"('replay', '2014', 'id_', 'http://xn--e1afmkfd.xn--80akhbyknj4f/abc', '2014id_/http://%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5/abc')"
|
||||
|
||||
# invalid: truncated and superfluous '%', ignore invalid (no exception)
|
||||
>>> repr(WbUrl('2014id_/http://' + quote_plus(u'пример.испытание'.encode('utf-8'))[1:] + '%' + '/abc'))
|
||||
"('replay', '2014', 'id_', 'http://xn--d0-olcluwd.xn--%-7sbpkb3ampk3g/abc', '2014id_/http://xn--d0-olcluwd.xn--%-7sbpkb3ampk3g/abc')"
|
||||
"('replay', '2014', 'id_', 'http://xn--d0-olcluwd.xn--%-7sbpkb3ampk3g/abc', '2014id_/http://d0%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5%25/abc')"
|
||||
|
||||
|
||||
# Query Urls
|
||||
@ -187,7 +193,7 @@ from StringIO import StringIO
|
||||
|
||||
|
||||
def to_uri_pencode(url):
|
||||
return WbUrl.percent_encode_host(WbUrl.to_uri(url))
|
||||
return WbUrl(url).get_url()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
@ -29,6 +29,9 @@ class UrlRewriter(object):
|
||||
self.cookie_scope = cookie_scope
|
||||
self.rewrite_opts = rewrite_opts
|
||||
|
||||
if rewrite_opts.get('punycode_links'):
|
||||
self.wburl._do_percent_encode = False
|
||||
|
||||
def rewrite(self, url, mod=None):
|
||||
# if special protocol, no rewriting at all
|
||||
if any(url.startswith(x) for x in self.NO_REWRITE_URI_PREFIX):
|
||||
@ -52,9 +55,6 @@ class UrlRewriter(object):
|
||||
is_abs = True
|
||||
url = 'http:' + url
|
||||
|
||||
# convert host to %-encoding instead of default punycode
|
||||
peh = not self.rewrite_opts.get('punycode_links_only', False)
|
||||
|
||||
# Optimized rewriter for
|
||||
# -rel urls that don't start with / and
|
||||
# do not contain ../ and no special mod
|
||||
@ -72,8 +72,7 @@ class UrlRewriter(object):
|
||||
mod = wburl.mod
|
||||
|
||||
final_url = self.prefix + wburl.to_str(mod=mod,
|
||||
url=new_url,
|
||||
percent_encode=peh)
|
||||
url=new_url)
|
||||
return final_url
|
||||
|
||||
def get_new_url(self, **kwargs):
|
||||
|
@ -40,7 +40,7 @@ wayback url format.
|
||||
|
||||
import re
|
||||
import urllib
|
||||
|
||||
import urlparse
|
||||
|
||||
#=================================================================
|
||||
class BaseWbUrl(object):
|
||||
@ -96,30 +96,24 @@ class WbUrl(BaseWbUrl):
|
||||
to have a %-encoded host instead of punycode host
|
||||
The rest of url should be unchanged
|
||||
"""
|
||||
parts = WbUrl.FIRST_PATH.split(url, 1)
|
||||
|
||||
scheme_dom = parts[0].rsplit('/', 1)
|
||||
|
||||
dom = scheme_dom[-1]
|
||||
|
||||
parts = urlparse.urlsplit(url)
|
||||
domain = parts.netloc
|
||||
try:
|
||||
dom = dom.decode('idna')
|
||||
dom = dom.encode('utf-8', 'ignore')
|
||||
domain = domain.decode('idna')
|
||||
domain = domain.encode('utf-8', 'ignore')
|
||||
except:
|
||||
# likely already encoded, so use as is
|
||||
pass
|
||||
|
||||
dom = urllib.quote(dom, safe=r':\/')
|
||||
domain = urllib.quote(domain)#, safe=r':\/')
|
||||
|
||||
if len(scheme_dom) > 1:
|
||||
url = scheme_dom[0] + '/' + dom
|
||||
else:
|
||||
url = dom
|
||||
# no changes
|
||||
if parts.netloc == domain:
|
||||
return url
|
||||
|
||||
if len(parts) > 1:
|
||||
url += '/' + parts[1]
|
||||
return urlparse.urlunsplit((parts[0], domain, parts[2], parts[3], parts[4]))
|
||||
|
||||
return url
|
||||
|
||||
@staticmethod
|
||||
def to_uri(url):
|
||||
@ -174,7 +168,11 @@ class WbUrl(BaseWbUrl):
|
||||
if not self._init_replay(orig_url):
|
||||
raise Exception('Invalid WbUrl: ', orig_url)
|
||||
|
||||
self.url = WbUrl.to_uri(self.url)
|
||||
new_uri = WbUrl.to_uri(self.url)
|
||||
|
||||
self._do_percent_encode = True
|
||||
|
||||
self.url = new_uri
|
||||
|
||||
# protocol agnostic url -> http://
|
||||
# no protocol -> http://
|
||||
@ -244,13 +242,13 @@ class WbUrl(BaseWbUrl):
|
||||
self.url = new_url
|
||||
return self.url
|
||||
|
||||
def get_url(self, url=None, percent_encode=False):
|
||||
def get_url(self, url=None):
|
||||
if url is not None:
|
||||
url = WbUrl.to_uri(url)
|
||||
else:
|
||||
url = self.url
|
||||
|
||||
if percent_encode:
|
||||
if self._do_percent_encode:
|
||||
url = WbUrl.percent_encode_host(url)
|
||||
|
||||
return url
|
||||
@ -264,8 +262,7 @@ class WbUrl(BaseWbUrl):
|
||||
timestamp = overrides.get('timestamp', self.timestamp)
|
||||
end_timestamp = overrides.get('end_timestamp', self.end_timestamp)
|
||||
|
||||
url = self.get_url(overrides.get('url'),
|
||||
overrides.get('percent_encode', False))
|
||||
url = self.get_url(overrides.get('url', self.url))
|
||||
|
||||
return self.to_wburl_str(url=url,
|
||||
type=type_,
|
||||
|
@ -75,8 +75,7 @@ class SearchPageWbUrlHandler(WbUrlHandler):
|
||||
return self.handle_not_found(wbrequest, nfe)
|
||||
|
||||
def get_top_frame_params(self, wbrequest, mod=''):
|
||||
embed_url = wbrequest.wb_url.to_str(mod=mod, url='')
|
||||
embed_url += wbrequest.get_url()
|
||||
embed_url = wbrequest.wb_url.to_str(mod=mod)
|
||||
|
||||
if wbrequest.wb_url.timestamp:
|
||||
timestamp = wbrequest.wb_url.timestamp
|
||||
@ -86,7 +85,7 @@ class SearchPageWbUrlHandler(WbUrlHandler):
|
||||
params = dict(embed_url=embed_url,
|
||||
wbrequest=wbrequest,
|
||||
timestamp=timestamp,
|
||||
url=wbrequest.get_url(),
|
||||
url=wbrequest.wb_url.get_url(),
|
||||
banner_html=self.banner_html)
|
||||
|
||||
return params
|
||||
|
@ -241,7 +241,6 @@ class ReplayView(object):
|
||||
else:
|
||||
statusline = '302 Internal Redirect'
|
||||
|
||||
new_url = new_url.encode('utf-8')
|
||||
status_headers = StatusAndHeaders(statusline,
|
||||
[('Location', new_url)])
|
||||
|
||||
|
@ -129,11 +129,10 @@ class HeadInsertView(J2TemplateView):
|
||||
def create_insert_func(self, wbrequest,
|
||||
include_ts=True):
|
||||
|
||||
url = wbrequest.get_url()
|
||||
url = wbrequest.wb_url.get_url()
|
||||
|
||||
top_url = wbrequest.wb_prefix
|
||||
top_url += wbrequest.wb_url.to_str(mod=wbrequest.final_mod, url='')
|
||||
top_url += url
|
||||
top_url += wbrequest.wb_url.to_str(mod=wbrequest.final_mod)
|
||||
|
||||
include_wombat = not wbrequest.wb_url.is_banner_only
|
||||
|
||||
@ -172,12 +171,12 @@ class J2HtmlCapturesView(J2TemplateView):
|
||||
def render_response(self, wbrequest, cdx_lines, **kwargs):
|
||||
def format_cdx_lines():
|
||||
for cdx in cdx_lines:
|
||||
cdx['url'] = wbrequest.get_url(url=cdx['original'])
|
||||
cdx['url'] = wbrequest.wb_url.get_url(url=cdx['original'])
|
||||
yield cdx
|
||||
|
||||
return J2TemplateView.render_response(self,
|
||||
cdx_lines=list(format_cdx_lines()),
|
||||
url=wbrequest.get_url(),
|
||||
url=wbrequest.wb_url.get_url(),
|
||||
type=wbrequest.wb_url.type,
|
||||
prefix=wbrequest.wb_prefix,
|
||||
**kwargs)
|
||||
|
Loading…
x
Reference in New Issue
Block a user