mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 08:04:49 +01:00
wburl: convert %-encoded hostnames or unicode urls to punycode for
better IDN support (#66)
This commit is contained in:
parent
933343fa01
commit
edff3f17fb
@@ -1,4 +1,7 @@
|
||||
"""
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
ur"""
|
||||
# Replay Urls
|
||||
# ======================
|
||||
>>> repr(WbUrl('20131010000506/example.com'))
|
||||
@@ -33,6 +36,32 @@
|
||||
>>> repr(WbUrl('2014/http%3A%2F%2Fexample.com/'))
|
||||
"('replay', '2014', '', 'http://example.com/', '2014/http://example.com/')"
|
||||
|
||||
# Test IDNs
|
||||
>>> repr(WbUrl(u'http://пример.испытание'))
|
||||
"('latest_replay', '', '', 'http://xn--e1afmkfd.xn--80akhbyknj4f', 'http://xn--e1afmkfd.xn--80akhbyknj4f')"
|
||||
|
||||
>>> repr(WbUrl(u'https://пример.испытание/abc/'))
|
||||
"('latest_replay', '', '', 'https://xn--e1afmkfd.xn--80akhbyknj4f/abc/', 'https://xn--e1afmkfd.xn--80akhbyknj4f/abc/')"
|
||||
|
||||
>>> repr(WbUrl(u'//пример.испытание/abc/'))
|
||||
"('latest_replay', '', '', 'http://xn--e1afmkfd.xn--80akhbyknj4f/abc/', 'http://xn--e1afmkfd.xn--80akhbyknj4f/abc/')"
|
||||
|
||||
>>> repr(WbUrl(u'2014id_/https://пример.испытание/abc'))
|
||||
"('replay', '2014', 'id_', 'https://xn--e1afmkfd.xn--80akhbyknj4f/abc', '2014id_/https://xn--e1afmkfd.xn--80akhbyknj4f/abc')"
|
||||
|
||||
# percent-encoded form (as sent by browser usually)
|
||||
>>> repr(WbUrl('2014id_/http://' + quote_plus(u'пример.испытание'.encode('utf-8')) + '/abc'))
|
||||
"('replay', '2014', 'id_', 'http://xn--e1afmkfd.xn--80akhbyknj4f/abc', '2014id_/http://xn--e1afmkfd.xn--80akhbyknj4f/abc')"
|
||||
|
||||
# percent-encoded form -- scheme relative
|
||||
>>> repr(WbUrl('2014id_///' + quote_plus(u'пример.испытание'.encode('utf-8')) + '/abc'))
|
||||
"('replay', '2014', 'id_', 'http://xn--e1afmkfd.xn--80akhbyknj4f/abc', '2014id_/http://xn--e1afmkfd.xn--80akhbyknj4f/abc')"
|
||||
|
||||
# invalid: truncated and superfluous '%', ignore invalid (no exception)
|
||||
>>> repr(WbUrl('2014id_/http://' + quote_plus(u'пример.испытание'.encode('utf-8'))[1:] + '%' + '/abc'))
|
||||
"('replay', '2014', 'id_', 'http://xn--d0-olcluwd.xn--%-7sbpkb3ampk3g/abc', '2014id_/http://xn--d0-olcluwd.xn--%-7sbpkb3ampk3g/abc')"
|
||||
|
||||
|
||||
# Query Urls
|
||||
# ======================
|
||||
>>> repr(WbUrl('*/http://example.com/abc?def=a'))
|
||||
@@ -102,7 +131,7 @@ Exception: ('Invalid WbUrl: ', '')
|
||||
"""
|
||||
|
||||
from pywb.rewrite.wburl import WbUrl
|
||||
|
||||
from urllib import quote_plus
|
||||
|
||||
if __name__ == "__main__":
|
||||
import doctest
|
||||
|
@@ -87,30 +87,61 @@ class WbUrl(BaseWbUrl):
|
||||
|
||||
DEFAULT_SCHEME = 'http://'
|
||||
|
||||
PARTIAL_ENC_RX = re.compile('(https?%3A)?(%2F%2F)?', re.I)
|
||||
#PARTIAL_ENC_RX = re.compile('(https?%3A)?(%2F%2F)?', re.I)
|
||||
FIRST_PATH = re.compile('(?<![:/])/(?![/])')
|
||||
|
||||
# ======================
|
||||
|
||||
def __init__(self, url):
|
||||
def __init__(self, orig_url):
|
||||
super(WbUrl, self).__init__()
|
||||
|
||||
self.original_url = url
|
||||
was_uni = False
|
||||
if isinstance(orig_url, unicode):
|
||||
orig_url = orig_url.encode('utf-8')
|
||||
was_uni = True
|
||||
|
||||
self.original_url = orig_url
|
||||
|
||||
if not self._init_query(orig_url):
|
||||
if not self._init_replay(orig_url):
|
||||
raise Exception('Invalid WbUrl: ', orig_url)
|
||||
|
||||
if was_uni or '%' in self.url:
|
||||
parts = self.FIRST_PATH.split(self.url, 1)
|
||||
|
||||
if was_uni or '%' in parts[0]:
|
||||
if not was_uni:
|
||||
scheme_dom = urllib.unquote_plus(parts[0])
|
||||
else:
|
||||
scheme_dom = parts[0]
|
||||
|
||||
scheme_dom = scheme_dom.rsplit('/', 1)
|
||||
|
||||
dom = scheme_dom[-1]
|
||||
|
||||
dom = dom.decode('utf-8', 'ignore')
|
||||
dom = dom.encode('idna')
|
||||
|
||||
if len(scheme_dom) > 1:
|
||||
self.url = scheme_dom[0] + '/' + dom
|
||||
else:
|
||||
self.url = dom
|
||||
|
||||
if len(parts) > 1:
|
||||
self.url += '/' + parts[1]
|
||||
|
||||
if not self._init_query(url):
|
||||
if not self._init_replay(url):
|
||||
raise Exception('Invalid WbUrl: ', url)
|
||||
|
||||
# protocol agnostic url -> http://
|
||||
# no protocol -> http://
|
||||
inx = self.url.find(':/')
|
||||
if inx < 0:
|
||||
#if inx < 0:
|
||||
# check for other partially encoded variants
|
||||
m = self.PARTIAL_ENC_RX.match(self.url)
|
||||
if m:
|
||||
len_ = len(m.group(0))
|
||||
self.url = (urllib.unquote_plus(self.url[:len_]) +
|
||||
self.url[len_:])
|
||||
inx = self.url.find(':/')
|
||||
# m = self.PARTIAL_ENC_RX.match(self.url)
|
||||
# if m:
|
||||
# len_ = len(m.group(0))
|
||||
# self.url = (urllib.unquote_plus(self.url[:len_]) +
|
||||
# self.url[len_:])
|
||||
# inx = self.url.find(':/')
|
||||
|
||||
if inx < 0:
|
||||
self.url = self.DEFAULT_SCHEME + self.url
|
||||
|
Loading…
x
Reference in New Issue
Block a user