mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
wburl: convert %-encoded hostnames or unicode urls to punycode for
better IDN support (#66)
This commit is contained in:
parent
933343fa01
commit
edff3f17fb
@ -1,4 +1,7 @@
|
|||||||
"""
|
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
ur"""
|
||||||
# Replay Urls
|
# Replay Urls
|
||||||
# ======================
|
# ======================
|
||||||
>>> repr(WbUrl('20131010000506/example.com'))
|
>>> repr(WbUrl('20131010000506/example.com'))
|
||||||
@ -33,6 +36,32 @@
|
|||||||
>>> repr(WbUrl('2014/http%3A%2F%2Fexample.com/'))
|
>>> repr(WbUrl('2014/http%3A%2F%2Fexample.com/'))
|
||||||
"('replay', '2014', '', 'http://example.com/', '2014/http://example.com/')"
|
"('replay', '2014', '', 'http://example.com/', '2014/http://example.com/')"
|
||||||
|
|
||||||
|
# Test IDNs
|
||||||
|
>>> repr(WbUrl(u'http://пример.испытание'))
|
||||||
|
"('latest_replay', '', '', 'http://xn--e1afmkfd.xn--80akhbyknj4f', 'http://xn--e1afmkfd.xn--80akhbyknj4f')"
|
||||||
|
|
||||||
|
>>> repr(WbUrl(u'https://пример.испытание/abc/'))
|
||||||
|
"('latest_replay', '', '', 'https://xn--e1afmkfd.xn--80akhbyknj4f/abc/', 'https://xn--e1afmkfd.xn--80akhbyknj4f/abc/')"
|
||||||
|
|
||||||
|
>>> repr(WbUrl(u'//пример.испытание/abc/'))
|
||||||
|
"('latest_replay', '', '', 'http://xn--e1afmkfd.xn--80akhbyknj4f/abc/', 'http://xn--e1afmkfd.xn--80akhbyknj4f/abc/')"
|
||||||
|
|
||||||
|
>>> repr(WbUrl(u'2014id_/https://пример.испытание/abc'))
|
||||||
|
"('replay', '2014', 'id_', 'https://xn--e1afmkfd.xn--80akhbyknj4f/abc', '2014id_/https://xn--e1afmkfd.xn--80akhbyknj4f/abc')"
|
||||||
|
|
||||||
|
# percent-encoded form (as sent by browser usually)
|
||||||
|
>>> repr(WbUrl('2014id_/http://' + quote_plus(u'пример.испытание'.encode('utf-8')) + '/abc'))
|
||||||
|
"('replay', '2014', 'id_', 'http://xn--e1afmkfd.xn--80akhbyknj4f/abc', '2014id_/http://xn--e1afmkfd.xn--80akhbyknj4f/abc')"
|
||||||
|
|
||||||
|
# percent-encoded form -- scheme relative
|
||||||
|
>>> repr(WbUrl('2014id_///' + quote_plus(u'пример.испытание'.encode('utf-8')) + '/abc'))
|
||||||
|
"('replay', '2014', 'id_', 'http://xn--e1afmkfd.xn--80akhbyknj4f/abc', '2014id_/http://xn--e1afmkfd.xn--80akhbyknj4f/abc')"
|
||||||
|
|
||||||
|
# invalid: truncated and superfluous '%', ignore invalid (no exception)
|
||||||
|
>>> repr(WbUrl('2014id_/http://' + quote_plus(u'пример.испытание'.encode('utf-8'))[1:] + '%' + '/abc'))
|
||||||
|
"('replay', '2014', 'id_', 'http://xn--d0-olcluwd.xn--%-7sbpkb3ampk3g/abc', '2014id_/http://xn--d0-olcluwd.xn--%-7sbpkb3ampk3g/abc')"
|
||||||
|
|
||||||
|
|
||||||
# Query Urls
|
# Query Urls
|
||||||
# ======================
|
# ======================
|
||||||
>>> repr(WbUrl('*/http://example.com/abc?def=a'))
|
>>> repr(WbUrl('*/http://example.com/abc?def=a'))
|
||||||
@ -102,7 +131,7 @@ Exception: ('Invalid WbUrl: ', '')
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
from pywb.rewrite.wburl import WbUrl
|
from pywb.rewrite.wburl import WbUrl
|
||||||
|
from urllib import quote_plus
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
import doctest
|
import doctest
|
||||||
|
@ -87,30 +87,61 @@ class WbUrl(BaseWbUrl):
|
|||||||
|
|
||||||
DEFAULT_SCHEME = 'http://'
|
DEFAULT_SCHEME = 'http://'
|
||||||
|
|
||||||
PARTIAL_ENC_RX = re.compile('(https?%3A)?(%2F%2F)?', re.I)
|
#PARTIAL_ENC_RX = re.compile('(https?%3A)?(%2F%2F)?', re.I)
|
||||||
|
FIRST_PATH = re.compile('(?<![:/])/(?![/])')
|
||||||
|
|
||||||
# ======================
|
# ======================
|
||||||
|
|
||||||
def __init__(self, url):
|
def __init__(self, orig_url):
|
||||||
super(WbUrl, self).__init__()
|
super(WbUrl, self).__init__()
|
||||||
|
|
||||||
self.original_url = url
|
was_uni = False
|
||||||
|
if isinstance(orig_url, unicode):
|
||||||
|
orig_url = orig_url.encode('utf-8')
|
||||||
|
was_uni = True
|
||||||
|
|
||||||
|
self.original_url = orig_url
|
||||||
|
|
||||||
|
if not self._init_query(orig_url):
|
||||||
|
if not self._init_replay(orig_url):
|
||||||
|
raise Exception('Invalid WbUrl: ', orig_url)
|
||||||
|
|
||||||
|
if was_uni or '%' in self.url:
|
||||||
|
parts = self.FIRST_PATH.split(self.url, 1)
|
||||||
|
|
||||||
|
if was_uni or '%' in parts[0]:
|
||||||
|
if not was_uni:
|
||||||
|
scheme_dom = urllib.unquote_plus(parts[0])
|
||||||
|
else:
|
||||||
|
scheme_dom = parts[0]
|
||||||
|
|
||||||
|
scheme_dom = scheme_dom.rsplit('/', 1)
|
||||||
|
|
||||||
|
dom = scheme_dom[-1]
|
||||||
|
|
||||||
|
dom = dom.decode('utf-8', 'ignore')
|
||||||
|
dom = dom.encode('idna')
|
||||||
|
|
||||||
|
if len(scheme_dom) > 1:
|
||||||
|
self.url = scheme_dom[0] + '/' + dom
|
||||||
|
else:
|
||||||
|
self.url = dom
|
||||||
|
|
||||||
|
if len(parts) > 1:
|
||||||
|
self.url += '/' + parts[1]
|
||||||
|
|
||||||
if not self._init_query(url):
|
|
||||||
if not self._init_replay(url):
|
|
||||||
raise Exception('Invalid WbUrl: ', url)
|
|
||||||
|
|
||||||
# protocol agnostic url -> http://
|
# protocol agnostic url -> http://
|
||||||
# no protocol -> http://
|
# no protocol -> http://
|
||||||
inx = self.url.find(':/')
|
inx = self.url.find(':/')
|
||||||
if inx < 0:
|
#if inx < 0:
|
||||||
# check for other partially encoded variants
|
# check for other partially encoded variants
|
||||||
m = self.PARTIAL_ENC_RX.match(self.url)
|
# m = self.PARTIAL_ENC_RX.match(self.url)
|
||||||
if m:
|
# if m:
|
||||||
len_ = len(m.group(0))
|
# len_ = len(m.group(0))
|
||||||
self.url = (urllib.unquote_plus(self.url[:len_]) +
|
# self.url = (urllib.unquote_plus(self.url[:len_]) +
|
||||||
self.url[len_:])
|
# self.url[len_:])
|
||||||
inx = self.url.find(':/')
|
# inx = self.url.find(':/')
|
||||||
|
|
||||||
if inx < 0:
|
if inx < 0:
|
||||||
self.url = self.DEFAULT_SCHEME + self.url
|
self.url = self.DEFAULT_SCHEME + self.url
|
||||||
|
Loading…
x
Reference in New Issue
Block a user