1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 08:04:49 +01:00

wburl: convert %-encoded hostnames or unicode urls to punycode for better IDN support (#66)
This commit is contained in:
Ilya Kreymer 2015-01-25 12:21:50 -08:00
parent 933343fa01
commit edff3f17fb
2 changed files with 75 additions and 15 deletions

View File

@ -1,4 +1,7 @@
"""
#!/usr/bin/env python
# -*- coding: utf-8 -*-
ur"""
# Replay Urls
# ======================
>>> repr(WbUrl('20131010000506/example.com'))
@ -33,6 +36,32 @@
>>> repr(WbUrl('2014/http%3A%2F%2Fexample.com/'))
"('replay', '2014', '', 'http://example.com/', '2014/http://example.com/')"
# Test IDNs
>>> repr(WbUrl(u'http://пример.испытание'))
"('latest_replay', '', '', 'http://xn--e1afmkfd.xn--80akhbyknj4f', 'http://xn--e1afmkfd.xn--80akhbyknj4f')"
>>> repr(WbUrl(u'https://пример.испытание/abc/'))
"('latest_replay', '', '', 'https://xn--e1afmkfd.xn--80akhbyknj4f/abc/', 'https://xn--e1afmkfd.xn--80akhbyknj4f/abc/')"
>>> repr(WbUrl(u'//пример.испытание/abc/'))
"('latest_replay', '', '', 'http://xn--e1afmkfd.xn--80akhbyknj4f/abc/', 'http://xn--e1afmkfd.xn--80akhbyknj4f/abc/')"
>>> repr(WbUrl(u'2014id_/https://пример.испытание/abc'))
"('replay', '2014', 'id_', 'https://xn--e1afmkfd.xn--80akhbyknj4f/abc', '2014id_/https://xn--e1afmkfd.xn--80akhbyknj4f/abc')"
# percent-encoded form (as sent by browser usually)
>>> repr(WbUrl('2014id_/http://' + quote_plus(u'пример.испытание'.encode('utf-8')) + '/abc'))
"('replay', '2014', 'id_', 'http://xn--e1afmkfd.xn--80akhbyknj4f/abc', '2014id_/http://xn--e1afmkfd.xn--80akhbyknj4f/abc')"
# percent-encoded form -- scheme relative
>>> repr(WbUrl('2014id_///' + quote_plus(u'пример.испытание'.encode('utf-8')) + '/abc'))
"('replay', '2014', 'id_', 'http://xn--e1afmkfd.xn--80akhbyknj4f/abc', '2014id_/http://xn--e1afmkfd.xn--80akhbyknj4f/abc')"
# invalid: truncated and superfluous '%', ignore invalid (no exception)
>>> repr(WbUrl('2014id_/http://' + quote_plus(u'пример.испытание'.encode('utf-8'))[1:] + '%' + '/abc'))
"('replay', '2014', 'id_', 'http://xn--d0-olcluwd.xn--%-7sbpkb3ampk3g/abc', '2014id_/http://xn--d0-olcluwd.xn--%-7sbpkb3ampk3g/abc')"
# Query Urls
# ======================
>>> repr(WbUrl('*/http://example.com/abc?def=a'))
@ -102,7 +131,7 @@ Exception: ('Invalid WbUrl: ', '')
"""
from pywb.rewrite.wburl import WbUrl
from urllib import quote_plus
if __name__ == "__main__":
import doctest

View File

@ -87,30 +87,61 @@ class WbUrl(BaseWbUrl):
DEFAULT_SCHEME = 'http://'
PARTIAL_ENC_RX = re.compile('(https?%3A)?(%2F%2F)?', re.I)
#PARTIAL_ENC_RX = re.compile('(https?%3A)?(%2F%2F)?', re.I)
FIRST_PATH = re.compile('(?<![:/])/(?![/])')
# ======================
def __init__(self, url):
def __init__(self, orig_url):
super(WbUrl, self).__init__()
self.original_url = url
was_uni = False
if isinstance(orig_url, unicode):
orig_url = orig_url.encode('utf-8')
was_uni = True
self.original_url = orig_url
if not self._init_query(orig_url):
if not self._init_replay(orig_url):
raise Exception('Invalid WbUrl: ', orig_url)
if was_uni or '%' in self.url:
parts = self.FIRST_PATH.split(self.url, 1)
if was_uni or '%' in parts[0]:
if not was_uni:
scheme_dom = urllib.unquote_plus(parts[0])
else:
scheme_dom = parts[0]
scheme_dom = scheme_dom.rsplit('/', 1)
dom = scheme_dom[-1]
dom = dom.decode('utf-8', 'ignore')
dom = dom.encode('idna')
if len(scheme_dom) > 1:
self.url = scheme_dom[0] + '/' + dom
else:
self.url = dom
if len(parts) > 1:
self.url += '/' + parts[1]
if not self._init_query(url):
if not self._init_replay(url):
raise Exception('Invalid WbUrl: ', url)
# protocol agnostic url -> http://
# no protocol -> http://
inx = self.url.find(':/')
if inx < 0:
#if inx < 0:
# check for other partially encoded variants
m = self.PARTIAL_ENC_RX.match(self.url)
if m:
len_ = len(m.group(0))
self.url = (urllib.unquote_plus(self.url[:len_]) +
self.url[len_:])
inx = self.url.find(':/')
# m = self.PARTIAL_ENC_RX.match(self.url)
# if m:
# len_ = len(m.group(0))
# self.url = (urllib.unquote_plus(self.url[:len_]) +
# self.url[len_:])
# inx = self.url.find(':/')
if inx < 0:
self.url = self.DEFAULT_SCHEME + self.url