mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
rewrite: refactor IDN support: instead of returning IRI, return utf-8 %-encoded url
Remove support for returning IRI, as that requires detecting the charset; instead, just use the %-encoded form and let the browser decode it. Should address #66. Add rewrite option 'punycode_links_only' (defaults to false) to skip the %-encoded conversion of the host and just return punycode. wombat: use getAttribute('href') on <a> tag to get the original url, not the punycode version. replay: add an extra sanity check on the Location header to ensure utf-8.
This commit is contained in:
parent
79cfdd6a08
commit
f9452bf48e
@ -109,13 +109,9 @@ class WbRequest(object):
|
|||||||
if not self.wb_url:
|
if not self.wb_url:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
if not url:
|
# pencode urls to force actual urls to appear, unless punycode_links_only set to true
|
||||||
url = self.wb_url.url
|
pencode = self.urlrewriter.rewrite_opts.get('punycode_link_only', False)
|
||||||
|
return self.wb_url.get_url(url, pencode)
|
||||||
if self.urlrewriter.rewrite_opts.get('rewrite_ascii_urls_only'):
|
|
||||||
return self.wb_url.url
|
|
||||||
else:
|
|
||||||
return self.wb_url.to_iri(url)
|
|
||||||
|
|
||||||
def _is_ajax(self):
|
def _is_ajax(self):
|
||||||
value = self.env.get('HTTP_X_REQUESTED_WITH')
|
value = self.env.get('HTTP_X_REQUESTED_WITH')
|
||||||
|
@ -51,10 +51,14 @@ ur"""
|
|||||||
>>> parse('<input value=""X"">X</input>')
|
>>> parse('<input value=""X"">X</input>')
|
||||||
<input value=""X"">X</input>
|
<input value=""X"">X</input>
|
||||||
|
|
||||||
# Unicode
|
# Unicode -- default with %-encoding
|
||||||
>>> parse('<a href="http://испытание.испытание/">испытание</a>')
|
>>> parse(u'<a href="http://испытание.испытание/">испытание</a>')
|
||||||
|
<a href="/web/20131226101010/http://%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5/">испытание</a>
|
||||||
|
|
||||||
|
>>> parse(u'<a href="http://испытание.испытание/">испытание</a>', urlrewriter=urlrewriter_pencode)
|
||||||
<a href="/web/20131226101010/http://испытание.испытание/">испытание</a>
|
<a href="/web/20131226101010/http://испытание.испытание/">испытание</a>
|
||||||
|
|
||||||
|
|
||||||
# Meta tag
|
# Meta tag
|
||||||
>>> parse('<META http-equiv="refresh" content="10; URL=/abc/def.html">')
|
>>> parse('<META http-equiv="refresh" content="10; URL=/abc/def.html">')
|
||||||
<meta http-equiv="refresh" content="10; URL=/web/20131226101010/http://example.com/abc/def.html">
|
<meta http-equiv="refresh" content="10; URL=/web/20131226101010/http://example.com/abc/def.html">
|
||||||
@ -168,7 +172,14 @@ from pywb.rewrite.html_rewriter import HTMLRewriter
|
|||||||
|
|
||||||
import pprint
|
import pprint
|
||||||
|
|
||||||
urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/web/')
|
urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html',
|
||||||
|
'/web/',
|
||||||
|
rewrite_opts=dict(punycode_links_only=False))
|
||||||
|
|
||||||
|
urlrewriter_pencode = UrlRewriter('20131226101010/http://example.com/some/path/index.html',
|
||||||
|
'/web/',
|
||||||
|
rewrite_opts=dict(punycode_links_only=True))
|
||||||
|
|
||||||
|
|
||||||
no_base_canon_rewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html',
|
no_base_canon_rewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html',
|
||||||
'/web/',
|
'/web/',
|
||||||
@ -176,6 +187,7 @@ no_base_canon_rewriter = UrlRewriter('20131226101010/http://example.com/some/pat
|
|||||||
rewrite_base=False))
|
rewrite_base=False))
|
||||||
|
|
||||||
def parse(data, head_insert=None, urlrewriter=urlrewriter):
|
def parse(data, head_insert=None, urlrewriter=urlrewriter):
|
||||||
|
data = data.encode('utf-8')
|
||||||
parser = HTMLRewriter(urlrewriter, head_insert = head_insert)
|
parser = HTMLRewriter(urlrewriter, head_insert = head_insert)
|
||||||
#data = data.decode('utf-8')
|
#data = data.decode('utf-8')
|
||||||
result = parser.rewrite(data) + parser.close()
|
result = parser.rewrite(data) + parser.close()
|
||||||
|
@ -33,6 +33,9 @@ r"""
|
|||||||
>>> _test_js(r'location = /http:\/\/example.com/abc.html/')
|
>>> _test_js(r'location = /http:\/\/example.com/abc.html/')
|
||||||
'WB_wombat_location = /http:\\/\\/example.com/abc.html/'
|
'WB_wombat_location = /http:\\/\\/example.com/abc.html/'
|
||||||
|
|
||||||
|
>>> _test_js(r'location = \/http:\/\/example.com\/abc.html\/')
|
||||||
|
'WB_wombat_location = \\/http:\\/\\/example.com\\/abc.html\\/'
|
||||||
|
|
||||||
>>> _test_js('"/location" == some_location_val; locations = location;')
|
>>> _test_js('"/location" == some_location_val; locations = location;')
|
||||||
'"/location" == some_location_val; locations = WB_wombat_location;'
|
'"/location" == some_location_val; locations = WB_wombat_location;'
|
||||||
|
|
||||||
|
@ -4,59 +4,39 @@
|
|||||||
ur"""
|
ur"""
|
||||||
# Replay Urls
|
# Replay Urls
|
||||||
# ======================
|
# ======================
|
||||||
>>> repr_unicode(WbUrl('20131010000506/example.com'))
|
>>> repr(WbUrl('20131010000506/example.com'))
|
||||||
('replay', '20131010000506', '', 'http://example.com', '20131010000506/http://example.com')
|
"('replay', '20131010000506', '', 'http://example.com', '20131010000506/http://example.com')"
|
||||||
|
|
||||||
>>> repr_unicode(WbUrl('20130102im_/https://example.com'))
|
>>> repr(WbUrl('20130102im_/https://example.com'))
|
||||||
('replay', '20130102', 'im_', 'https://example.com', '20130102im_/https://example.com')
|
"('replay', '20130102', 'im_', 'https://example.com', '20130102im_/https://example.com')"
|
||||||
|
|
||||||
>>> repr_unicode(WbUrl('20130102im_/https:/example.com'))
|
>>> repr(WbUrl('20130102im_/https:/example.com'))
|
||||||
('replay', '20130102', 'im_', 'https://example.com', '20130102im_/https://example.com')
|
"('replay', '20130102', 'im_', 'https://example.com', '20130102im_/https://example.com')"
|
||||||
|
|
||||||
# Protocol agnostic convert to http
|
# Protocol agnostic convert to http
|
||||||
>>> repr_unicode(WbUrl('20130102im_///example.com'))
|
>>> repr(WbUrl('20130102im_///example.com'))
|
||||||
('replay', '20130102', 'im_', 'http://example.com', '20130102im_/http://example.com')
|
"('replay', '20130102', 'im_', 'http://example.com', '20130102im_/http://example.com')"
|
||||||
|
|
||||||
>>> repr_unicode(WbUrl('cs_/example.com'))
|
>>> repr(WbUrl('cs_/example.com'))
|
||||||
('latest_replay', '', 'cs_', 'http://example.com', 'cs_/http://example.com')
|
"('latest_replay', '', 'cs_', 'http://example.com', 'cs_/http://example.com')"
|
||||||
|
|
||||||
>>> repr_unicode(WbUrl('https://example.com/xyz'))
|
>>> repr(WbUrl('https://example.com/xyz'))
|
||||||
('latest_replay', '', '', 'https://example.com/xyz', 'https://example.com/xyz')
|
"('latest_replay', '', '', 'https://example.com/xyz', 'https://example.com/xyz')"
|
||||||
|
|
||||||
>>> repr_unicode(WbUrl('https:/example.com/xyz'))
|
>>> repr(WbUrl('https:/example.com/xyz'))
|
||||||
('latest_replay', '', '', 'https://example.com/xyz', 'https://example.com/xyz')
|
"('latest_replay', '', '', 'https://example.com/xyz', 'https://example.com/xyz')"
|
||||||
|
|
||||||
>>> repr_unicode(WbUrl('https://example.com/xyz?a=%2f&b=%2E'))
|
>>> repr(WbUrl('https://example.com/xyz?a=%2f&b=%2E'))
|
||||||
('latest_replay', '', '', 'https://example.com/xyz?a=/&b=.', 'https://example.com/xyz?a=/&b=.')
|
"('latest_replay', '', '', 'https://example.com/xyz?a=%2f&b=%2E', 'https://example.com/xyz?a=%2f&b=%2E')"
|
||||||
|
|
||||||
# Test scheme partially encoded urls
|
# Test scheme partially encoded urls
|
||||||
>>> repr_unicode(WbUrl('https%3A//example.com/'))
|
>>> repr(WbUrl('https%3A//example.com/'))
|
||||||
('latest_replay', '', '', 'https://example.com/', 'https://example.com/')
|
"('latest_replay', '', '', 'https://example.com/', 'https://example.com/')"
|
||||||
|
|
||||||
>>> repr_unicode(WbUrl('2014/http%3A%2F%2Fexample.com/'))
|
>>> repr(WbUrl('2014/http%3A%2F%2Fexample.com/'))
|
||||||
('replay', '2014', '', 'http://example.com/', '2014/http://example.com/')
|
"('replay', '2014', '', 'http://example.com/', '2014/http://example.com/')"
|
||||||
|
|
||||||
# Test IDNs
|
|
||||||
|
|
||||||
To IRI
|
|
||||||
>>> print(WbUrl.to_iri(u'https://пример.испытание'))
|
|
||||||
https://пример.испытание
|
|
||||||
|
|
||||||
>>> print(WbUrl.to_iri(u'пример.испытание'))
|
|
||||||
пример.испытание
|
|
||||||
|
|
||||||
>>> print(WbUrl.to_iri('http://' + quote_plus(u'пример.испытание'.encode('utf-8'))))
|
|
||||||
http://пример.испытание
|
|
||||||
|
|
||||||
>>> print(WbUrl.to_iri(u'//пример.испытание/abc/испытание'))
|
|
||||||
//пример.испытание/abc/испытание
|
|
||||||
|
|
||||||
>>> print(WbUrl.to_iri(quote_plus(u'пример.испытание'.encode('utf-8')) + '/abc/' + quote_plus(u'пример'.encode('utf-8'))))
|
|
||||||
пример.испытание/abc/пример
|
|
||||||
|
|
||||||
>>> print(WbUrl.to_iri('https://xn--e1afmkfd.xn--80akhbyknj4f'))
|
|
||||||
https://пример.испытание
|
|
||||||
|
|
||||||
|
# ===== Test IDNs
|
||||||
|
|
||||||
To URI
|
To URI
|
||||||
>>> print(WbUrl.to_uri(u'https://пример.испытание'))
|
>>> print(WbUrl.to_uri(u'https://пример.испытание'))
|
||||||
@ -69,73 +49,99 @@ xn--e1afmkfd.xn--80akhbyknj4f
|
|||||||
http://xn--e1afmkfd.xn--80akhbyknj4f
|
http://xn--e1afmkfd.xn--80akhbyknj4f
|
||||||
|
|
||||||
>>> print(WbUrl.to_uri(u'//пример.испытание/abc/испытание'))
|
>>> print(WbUrl.to_uri(u'//пример.испытание/abc/испытание'))
|
||||||
//xn--e1afmkfd.xn--80akhbyknj4f/abc%2F%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5
|
//xn--e1afmkfd.xn--80akhbyknj4f/abc/%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5
|
||||||
|
|
||||||
>>> print(WbUrl.to_uri('//' + quote_plus(u'пример.испытание'.encode('utf-8')) + '/abc/' + quote_plus(u'пример'.encode('utf-8'))))
|
>>> print(WbUrl.to_uri('//' + quote_plus(u'пример.испытание'.encode('utf-8')) + '/abc/' + quote_plus(u'пример'.encode('utf-8'))))
|
||||||
//xn--e1afmkfd.xn--80akhbyknj4f/abc/%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80
|
//xn--e1afmkfd.xn--80akhbyknj4f/abc/%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80
|
||||||
|
|
||||||
>>> print(WbUrl.to_uri('https://xn--e1afmkfd.xn--80akhbyknj4f/abc/'))
|
>>> print(WbUrl.to_uri('https://xn--e1afmkfd.xn--80akhbyknj4f/foo/bar?abc=def'))
|
||||||
https://xn--e1afmkfd.xn--80akhbyknj4f/abc/
|
https://xn--e1afmkfd.xn--80akhbyknj4f/foo/bar?abc=def
|
||||||
|
|
||||||
|
# truncated
|
||||||
>>> print(WbUrl.to_uri('http://' + quote_plus(u'пример.испытание'.encode('utf-8'))[1:]))
|
>>> print(WbUrl.to_uri('http://' + quote_plus(u'пример.испытание'.encode('utf-8'))[1:]))
|
||||||
http://xn--d0-olcluwd.xn--80akhbyknj4f
|
http://xn--d0-olcluwd.xn--80akhbyknj4f
|
||||||
|
|
||||||
|
|
||||||
|
# To %-encoded host uri -- instead of punycode, %-encode host
|
||||||
|
|
||||||
|
>>> print(to_uri_pencode(u'https://пример.испытание'))
|
||||||
|
https://%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5
|
||||||
|
|
||||||
|
>>> print(to_uri_pencode(u'пример.испытание'))
|
||||||
|
%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5
|
||||||
|
|
||||||
|
>>> print(to_uri_pencode('http://' + quote_plus(u'пример.испытание'.encode('utf-8'))))
|
||||||
|
http://%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5
|
||||||
|
|
||||||
|
>>> print(to_uri_pencode(u'//пример.испытание/abc/испытание'))
|
||||||
|
//%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5/abc/%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5
|
||||||
|
|
||||||
|
>>> print(to_uri_pencode(quote_plus(u'пример.испытание'.encode('utf-8')) + '/abc/' + quote_plus(u'пример'.encode('utf-8'))))
|
||||||
|
%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5/abc/%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80
|
||||||
|
|
||||||
|
>>> print(to_uri_pencode('https://xn--e1afmkfd.xn--80akhbyknj4f/foo/bar?abc=def'))
|
||||||
|
https://%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5/foo/bar?abc=def
|
||||||
|
|
||||||
|
>>> print(to_uri_pencode('http://' + quote_plus(u'пример.испытание'.encode('utf-8'))[1:]))
|
||||||
|
http://d0%D1%80%D0%B8%D0%BC%D0%B5%D1%80.%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5
|
||||||
|
|
||||||
|
|
||||||
# IRI representation
|
# IRI representation
|
||||||
>>> repr_unicode(WbUrl(u'http://пример.испытание'))
|
>>> repr(WbUrl(u'http://пример.испытание'))
|
||||||
('latest_replay', '', '', 'http://пример.испытание', 'http://пример.испытание')
|
"('latest_replay', '', '', 'http://xn--e1afmkfd.xn--80akhbyknj4f', 'http://xn--e1afmkfd.xn--80akhbyknj4f')"
|
||||||
|
|
||||||
>>> repr_unicode(WbUrl(u'https://пример.испытание/abc/'))
|
>>> repr(WbUrl(u'https://пример.испытание/abc/def_ghi/'))
|
||||||
('latest_replay', '', '', 'https://пример.испытание/abc/', 'https://пример.испытание/abc/')
|
"('latest_replay', '', '', 'https://xn--e1afmkfd.xn--80akhbyknj4f/abc/def_ghi/', 'https://xn--e1afmkfd.xn--80akhbyknj4f/abc/def_ghi/')"
|
||||||
|
|
||||||
>>> repr_unicode(WbUrl(u'//пример.испытание/abc/'))
|
>>> repr(WbUrl(u'//пример.испытание/abc/'))
|
||||||
('latest_replay', '', '', 'http://пример.испытание/abc/', 'http://пример.испытание/abc/')
|
"('latest_replay', '', '', 'http://xn--e1afmkfd.xn--80akhbyknj4f/abc/', 'http://xn--e1afmkfd.xn--80akhbyknj4f/abc/')"
|
||||||
|
|
||||||
>>> repr_unicode(WbUrl(u'2014id_/https://пример.испытание/abc'))
|
>>> repr(WbUrl(u'2014id_/https://пример.испытание/abc'))
|
||||||
('replay', '2014', 'id_', 'https://пример.испытание/abc', '2014id_/https://пример.испытание/abc')
|
"('replay', '2014', 'id_', 'https://xn--e1afmkfd.xn--80akhbyknj4f/abc', '2014id_/https://xn--e1afmkfd.xn--80akhbyknj4f/abc')"
|
||||||
|
|
||||||
# percent-encoded form (as sent by browser usually)
|
# percent-encoded form (as sent by browser usually)
|
||||||
>>> repr_unicode(WbUrl('2014id_/http://' + quote_plus(u'пример.испытание'.encode('utf-8')) + '/abc'))
|
>>> repr(WbUrl('2014id_/http://' + quote_plus(u'пример.испытание'.encode('utf-8')) + '/abc'))
|
||||||
('replay', '2014', 'id_', 'http://пример.испытание/abc', '2014id_/http://пример.испытание/abc')
|
"('replay', '2014', 'id_', 'http://xn--e1afmkfd.xn--80akhbyknj4f/abc', '2014id_/http://xn--e1afmkfd.xn--80akhbyknj4f/abc')"
|
||||||
|
|
||||||
# percent-encoded form -- scheme relative
|
# percent-encoded form -- scheme relative
|
||||||
>>> repr_unicode(WbUrl('2014id_///' + quote_plus(u'пример.испытание'.encode('utf-8')) + '/abc'))
|
>>> repr(WbUrl('2014id_///' + quote_plus(u'пример.испытание'.encode('utf-8')) + '/abc'))
|
||||||
('replay', '2014', 'id_', 'http://пример.испытание/abc', '2014id_/http://пример.испытание/abc')
|
"('replay', '2014', 'id_', 'http://xn--e1afmkfd.xn--80akhbyknj4f/abc', '2014id_/http://xn--e1afmkfd.xn--80akhbyknj4f/abc')"
|
||||||
|
|
||||||
# invalid: truncated and superfluous '%', ignore invalid (no exception)
|
# invalid: truncated and superfluous '%', ignore invalid (no exception)
|
||||||
>>> repr_unicode(WbUrl('2014id_/http://' + quote_plus(u'пример.испытание'.encode('utf-8'))[1:] + '%' + '/abc'))
|
>>> repr(WbUrl('2014id_/http://' + quote_plus(u'пример.испытание'.encode('utf-8'))[1:] + '%' + '/abc'))
|
||||||
('replay', '2014', 'id_', 'http://d0ример.испытание%/abc', '2014id_/http://d0ример.испытание%/abc')
|
"('replay', '2014', 'id_', 'http://xn--d0-olcluwd.xn--%-7sbpkb3ampk3g/abc', '2014id_/http://xn--d0-olcluwd.xn--%-7sbpkb3ampk3g/abc')"
|
||||||
|
|
||||||
|
|
||||||
# Query Urls
|
# Query Urls
|
||||||
# ======================
|
# ======================
|
||||||
>>> repr_unicode(WbUrl('*/http://example.com/abc?def=a'))
|
>>> repr(WbUrl('*/http://example.com/abc?def=a'))
|
||||||
('query', '', '', 'http://example.com/abc?def=a', '*/http://example.com/abc?def=a')
|
"('query', '', '', 'http://example.com/abc?def=a', '*/http://example.com/abc?def=a')"
|
||||||
|
|
||||||
>>> repr_unicode(WbUrl('*/http://example.com/abc?def=a*'))
|
>>> repr(WbUrl('*/http://example.com/abc?def=a*'))
|
||||||
('url_query', '', '', 'http://example.com/abc?def=a', '*/http://example.com/abc?def=a*')
|
"('url_query', '', '', 'http://example.com/abc?def=a', '*/http://example.com/abc?def=a*')"
|
||||||
|
|
||||||
>>> repr_unicode(WbUrl('2010*/http://example.com/abc?def=a'))
|
>>> repr(WbUrl('2010*/http://example.com/abc?def=a'))
|
||||||
('query', '2010', '', 'http://example.com/abc?def=a', '2010*/http://example.com/abc?def=a')
|
"('query', '2010', '', 'http://example.com/abc?def=a', '2010*/http://example.com/abc?def=a')"
|
||||||
|
|
||||||
# timestamp range query
|
# timestamp range query
|
||||||
>>> repr_unicode(WbUrl('2009-2015*/http://example.com/abc?def=a'))
|
>>> repr(WbUrl('2009-2015*/http://example.com/abc?def=a'))
|
||||||
('query', '2009', '', 'http://example.com/abc?def=a', '2009-2015*/http://example.com/abc?def=a')
|
"('query', '2009', '', 'http://example.com/abc?def=a', '2009-2015*/http://example.com/abc?def=a')"
|
||||||
|
|
||||||
>>> repr_unicode(WbUrl('json/*/http://example.com/abc?def=a'))
|
>>> repr(WbUrl('json/*/http://example.com/abc?def=a'))
|
||||||
('query', '', 'json', 'http://example.com/abc?def=a', 'json/*/http://example.com/abc?def=a')
|
"('query', '', 'json', 'http://example.com/abc?def=a', 'json/*/http://example.com/abc?def=a')"
|
||||||
|
|
||||||
>>> repr_unicode(WbUrl('timemap-link/2011*/http://example.com/abc?def=a'))
|
>>> repr(WbUrl('timemap-link/2011*/http://example.com/abc?def=a'))
|
||||||
('query', '2011', 'timemap-link', 'http://example.com/abc?def=a', 'timemap-link/2011*/http://example.com/abc?def=a')
|
"('query', '2011', 'timemap-link', 'http://example.com/abc?def=a', 'timemap-link/2011*/http://example.com/abc?def=a')"
|
||||||
|
|
||||||
# strip off repeated, likely scheme-agnostic, slashes altogether
|
# strip off repeated, likely scheme-agnostic, slashes altogether
|
||||||
>>> repr_unicode(WbUrl('///example.com'))
|
>>> repr(WbUrl('///example.com'))
|
||||||
('latest_replay', '', '', 'http://example.com', 'http://example.com')
|
"('latest_replay', '', '', 'http://example.com', 'http://example.com')"
|
||||||
|
|
||||||
>>> repr_unicode(WbUrl('//example.com/'))
|
>>> repr(WbUrl('//example.com/'))
|
||||||
('latest_replay', '', '', 'http://example.com/', 'http://example.com/')
|
"('latest_replay', '', '', 'http://example.com/', 'http://example.com/')"
|
||||||
|
|
||||||
>>> repr_unicode(WbUrl('/example.com/'))
|
>>> repr(WbUrl('/example.com/'))
|
||||||
('latest_replay', '', '', 'http://example.com/', 'http://example.com/')
|
"('latest_replay', '', '', 'http://example.com/', 'http://example.com/')"
|
||||||
|
|
||||||
# Is_ Tests
|
# Is_ Tests
|
||||||
>>> u = WbUrl('*/http://example.com/abc?def=a*')
|
>>> u = WbUrl('*/http://example.com/abc?def=a*')
|
||||||
@ -156,7 +162,7 @@ True
|
|||||||
# Error Urls
|
# Error Urls
|
||||||
# ======================
|
# ======================
|
||||||
# no longer rejecting this here
|
# no longer rejecting this here
|
||||||
#>>> x = WbUrl('/#$%#/')
|
#>>> x = WbUrl('/#$%#/')
|
||||||
Traceback (most recent call last):
|
Traceback (most recent call last):
|
||||||
Exception: Bad Request Url: http://#$%#/
|
Exception: Bad Request Url: http://#$%#/
|
||||||
|
|
||||||
@ -180,14 +186,8 @@ from urllib import quote_plus, unquote_plus
|
|||||||
from StringIO import StringIO
|
from StringIO import StringIO
|
||||||
|
|
||||||
|
|
||||||
def repr_unicode(wburl):
|
def to_uri_pencode(url):
|
||||||
buff = StringIO()
|
return WbUrl.percent_encode_host(WbUrl.to_uri(url))
|
||||||
buff.write("('{0}', '{1}', '{2}', '".format(wburl.type, wburl.timestamp, wburl.mod))
|
|
||||||
buff.write(WbUrl.to_iri(wburl.url))
|
|
||||||
buff.write("', '")
|
|
||||||
buff.write(wburl.to_str(iri=True))
|
|
||||||
buff.write("')")
|
|
||||||
print(buff.getvalue())
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
@ -52,8 +52,8 @@ class UrlRewriter(object):
|
|||||||
is_abs = True
|
is_abs = True
|
||||||
url = 'http:' + url
|
url = 'http:' + url
|
||||||
|
|
||||||
# always convert any unicode urls to punycode
|
# convert host to %-encoding instead of default punycode
|
||||||
ascii_urls_only = self.rewrite_opts.get('rewrite_ascii_urls_only', False)
|
peh = not self.rewrite_opts.get('punycode_links_only', False)
|
||||||
|
|
||||||
# Optimized rewriter for
|
# Optimized rewriter for
|
||||||
# -rel urls that don't start with / and
|
# -rel urls that don't start with / and
|
||||||
@ -73,13 +73,7 @@ class UrlRewriter(object):
|
|||||||
|
|
||||||
final_url = self.prefix + wburl.to_str(mod=mod,
|
final_url = self.prefix + wburl.to_str(mod=mod,
|
||||||
url=new_url,
|
url=new_url,
|
||||||
iri=not ascii_urls_only)
|
percent_encode=peh)
|
||||||
if not ascii_urls_only:
|
|
||||||
try:
|
|
||||||
final_url = final_url.encode('utf-8')
|
|
||||||
except UnicodeDecodeError:
|
|
||||||
pass
|
|
||||||
|
|
||||||
return final_url
|
return final_url
|
||||||
|
|
||||||
def get_new_url(self, **kwargs):
|
def get_new_url(self, **kwargs):
|
||||||
|
@ -87,56 +87,52 @@ class WbUrl(BaseWbUrl):
|
|||||||
|
|
||||||
DEFAULT_SCHEME = 'http://'
|
DEFAULT_SCHEME = 'http://'
|
||||||
|
|
||||||
#PARTIAL_ENC_RX = re.compile('(https?%3A)?(%2F%2F)?', re.I)
|
|
||||||
FIRST_PATH = re.compile('(?<![:/])/(?![/])')
|
FIRST_PATH = re.compile('(?<![:/])/(?![/])')
|
||||||
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def to_iri(url):
|
def percent_encode_host(url):
|
||||||
if isinstance(url, str):
|
""" Convert the host of uri formatted with to_uri()
|
||||||
url = urllib.unquote_plus(url)
|
to have a %-encoded host instead of punycode host
|
||||||
url = url.decode('utf-8')
|
The rest of url should be unchanged
|
||||||
|
"""
|
||||||
parts = WbUrl.FIRST_PATH.split(url, 1)
|
parts = WbUrl.FIRST_PATH.split(url, 1)
|
||||||
scheme_dom = parts[0]
|
|
||||||
|
|
||||||
#scheme_dom = urllib.unquote_plus(parts[0])
|
scheme_dom = parts[0].rsplit('/', 1)
|
||||||
|
|
||||||
#if isinstance(scheme_dom, str):
|
|
||||||
# scheme_dom = scheme_dom.decode('utf-8', 'ignore')
|
|
||||||
|
|
||||||
scheme_dom = scheme_dom.rsplit(u'/', 1)
|
|
||||||
dom = scheme_dom[-1]
|
dom = scheme_dom[-1]
|
||||||
|
|
||||||
try:
|
try:
|
||||||
dom = dom.decode('idna')
|
dom = dom.decode('idna')
|
||||||
|
dom = dom.encode('utf-8', 'ignore')
|
||||||
except:
|
except:
|
||||||
|
# likely already encoded, so use as is
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
dom = urllib.quote(dom, safe=r':\/')
|
||||||
|
|
||||||
if len(scheme_dom) > 1:
|
if len(scheme_dom) > 1:
|
||||||
url = scheme_dom[0] + u'/' + dom
|
url = scheme_dom[0] + '/' + dom
|
||||||
else:
|
else:
|
||||||
url = dom
|
url = dom
|
||||||
|
|
||||||
if len(parts) > 1:
|
if len(parts) > 1:
|
||||||
url += u'/' + parts[1]
|
url += '/' + parts[1]
|
||||||
|
|
||||||
return url
|
return url
|
||||||
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def to_uri(url, was_uni=False):
|
def to_uri(url):
|
||||||
#if not was_uni:
|
""" Converts a url to an ascii %-encoded form
|
||||||
# if isinstance(url, unicode):
|
where:
|
||||||
# was_uni = True
|
- scheme is ascii,
|
||||||
|
- host is punycode,
|
||||||
#if not was_uni and not '%' in url:
|
- and remainder is %-encoded
|
||||||
# return url
|
Not using urlsplit to also decode partially encoded
|
||||||
|
scheme urls
|
||||||
|
"""
|
||||||
parts = WbUrl.FIRST_PATH.split(url, 1)
|
parts = WbUrl.FIRST_PATH.split(url, 1)
|
||||||
|
|
||||||
#if not was_uni and not '%' in parts[0]:
|
|
||||||
# return url
|
|
||||||
|
|
||||||
scheme_dom = urllib.unquote_plus(parts[0])
|
scheme_dom = urllib.unquote_plus(parts[0])
|
||||||
|
|
||||||
if isinstance(scheme_dom, str):
|
if isinstance(scheme_dom, str):
|
||||||
@ -146,18 +142,18 @@ class WbUrl(BaseWbUrl):
|
|||||||
scheme_dom = scheme_dom.decode('utf-8', 'ignore')
|
scheme_dom = scheme_dom.decode('utf-8', 'ignore')
|
||||||
|
|
||||||
scheme_dom = scheme_dom.rsplit('/', 1)
|
scheme_dom = scheme_dom.rsplit('/', 1)
|
||||||
dom = scheme_dom[-1]
|
domain = scheme_dom[-1]
|
||||||
|
|
||||||
dom = dom.encode('idna')
|
domain = domain.encode('idna')
|
||||||
|
|
||||||
if len(scheme_dom) > 1:
|
if len(scheme_dom) > 1:
|
||||||
url = scheme_dom[0] + '/' + dom
|
url = scheme_dom[0].encode('utf-8') + '/' + domain
|
||||||
else:
|
else:
|
||||||
url = dom
|
url = domain
|
||||||
|
|
||||||
if len(parts) > 1:
|
if len(parts) > 1:
|
||||||
if isinstance(parts[1], unicode):
|
if isinstance(parts[1], unicode):
|
||||||
url += '/' + urllib.quote_plus(parts[1].encode('utf-8'))
|
url += '/' + urllib.quote(parts[1].encode('utf-8'))
|
||||||
else:
|
else:
|
||||||
url += '/' + parts[1]
|
url += '/' + parts[1]
|
||||||
|
|
||||||
@ -168,10 +164,9 @@ class WbUrl(BaseWbUrl):
|
|||||||
def __init__(self, orig_url):
|
def __init__(self, orig_url):
|
||||||
super(WbUrl, self).__init__()
|
super(WbUrl, self).__init__()
|
||||||
|
|
||||||
was_uni = False
|
|
||||||
if isinstance(orig_url, unicode):
|
if isinstance(orig_url, unicode):
|
||||||
orig_url = orig_url.encode('utf-8')
|
orig_url = orig_url.encode('utf-8')
|
||||||
was_uni = True
|
orig_url = urllib.quote(orig_url)
|
||||||
|
|
||||||
self.original_url = orig_url
|
self.original_url = orig_url
|
||||||
|
|
||||||
@ -179,7 +174,7 @@ class WbUrl(BaseWbUrl):
|
|||||||
if not self._init_replay(orig_url):
|
if not self._init_replay(orig_url):
|
||||||
raise Exception('Invalid WbUrl: ', orig_url)
|
raise Exception('Invalid WbUrl: ', orig_url)
|
||||||
|
|
||||||
self.url = WbUrl.to_uri(self.url, was_uni)
|
self.url = WbUrl.to_uri(self.url)
|
||||||
|
|
||||||
# protocol agnostic url -> http://
|
# protocol agnostic url -> http://
|
||||||
# no protocol -> http://
|
# no protocol -> http://
|
||||||
@ -249,6 +244,18 @@ class WbUrl(BaseWbUrl):
|
|||||||
self.url = new_url
|
self.url = new_url
|
||||||
return self.url
|
return self.url
|
||||||
|
|
||||||
|
def get_url(self, url=None, percent_encode=False):
|
||||||
|
if url is not None:
|
||||||
|
url = WbUrl.to_uri(url)
|
||||||
|
else:
|
||||||
|
url = self.url
|
||||||
|
|
||||||
|
if percent_encode:
|
||||||
|
url = WbUrl.percent_encode_host(url)
|
||||||
|
|
||||||
|
return url
|
||||||
|
|
||||||
|
|
||||||
# Str Representation
|
# Str Representation
|
||||||
# ====================
|
# ====================
|
||||||
def to_str(self, **overrides):
|
def to_str(self, **overrides):
|
||||||
@ -256,9 +263,9 @@ class WbUrl(BaseWbUrl):
|
|||||||
mod = overrides.get('mod', self.mod)
|
mod = overrides.get('mod', self.mod)
|
||||||
timestamp = overrides.get('timestamp', self.timestamp)
|
timestamp = overrides.get('timestamp', self.timestamp)
|
||||||
end_timestamp = overrides.get('end_timestamp', self.end_timestamp)
|
end_timestamp = overrides.get('end_timestamp', self.end_timestamp)
|
||||||
url = overrides.get('url', self.url)
|
|
||||||
if overrides.get('iri'):
|
url = self.get_url(overrides.get('url'),
|
||||||
url = WbUrl.to_iri(url)
|
overrides.get('percent_encode', False))
|
||||||
|
|
||||||
return self.to_wburl_str(url=url,
|
return self.to_wburl_str(url=url,
|
||||||
type=type_,
|
type=type_,
|
||||||
|
@ -323,7 +323,7 @@ _WBWombat = (function() {
|
|||||||
return url + this._orig_loc.hash;
|
return url + this._orig_loc.hash;
|
||||||
}
|
}
|
||||||
|
|
||||||
href = parser.href;
|
href = parser.getAttribute("href");
|
||||||
var hash = parser.hash;
|
var hash = parser.hash;
|
||||||
|
|
||||||
if (hash) {
|
if (hash) {
|
||||||
|
@ -241,6 +241,7 @@ class ReplayView(object):
|
|||||||
else:
|
else:
|
||||||
statusline = '302 Internal Redirect'
|
statusline = '302 Internal Redirect'
|
||||||
|
|
||||||
|
new_url = new_url.encode('utf-8')
|
||||||
status_headers = StatusAndHeaders(statusline,
|
status_headers = StatusAndHeaders(statusline,
|
||||||
[('Location', new_url)])
|
[('Location', new_url)])
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user