diff --git a/pywb/rewrite/test/test_wburl.py b/pywb/rewrite/test/test_wburl.py index b4d15b5d..2e7475af 100644 --- a/pywb/rewrite/test/test_wburl.py +++ b/pywb/rewrite/test/test_wburl.py @@ -1,4 +1,7 @@ -""" +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +ur""" # Replay Urls # ====================== >>> repr(WbUrl('20131010000506/example.com')) @@ -33,6 +36,32 @@ >>> repr(WbUrl('2014/http%3A%2F%2Fexample.com/')) "('replay', '2014', '', 'http://example.com/', '2014/http://example.com/')" +# Test IDNs +>>> repr(WbUrl(u'http://пример.испытание')) +"('latest_replay', '', '', 'http://xn--e1afmkfd.xn--80akhbyknj4f', 'http://xn--e1afmkfd.xn--80akhbyknj4f')" + +>>> repr(WbUrl(u'https://пример.испытание/abc/')) +"('latest_replay', '', '', 'https://xn--e1afmkfd.xn--80akhbyknj4f/abc/', 'https://xn--e1afmkfd.xn--80akhbyknj4f/abc/')" + +>>> repr(WbUrl(u'//пример.испытание/abc/')) +"('latest_replay', '', '', 'http://xn--e1afmkfd.xn--80akhbyknj4f/abc/', 'http://xn--e1afmkfd.xn--80akhbyknj4f/abc/')" + +>>> repr(WbUrl(u'2014id_/https://пример.испытание/abc')) +"('replay', '2014', 'id_', 'https://xn--e1afmkfd.xn--80akhbyknj4f/abc', '2014id_/https://xn--e1afmkfd.xn--80akhbyknj4f/abc')" + +# percent-encoded form (as sent by browser usually) +>>> repr(WbUrl('2014id_/http://' + quote_plus(u'пример.испытание'.encode('utf-8')) + '/abc')) +"('replay', '2014', 'id_', 'http://xn--e1afmkfd.xn--80akhbyknj4f/abc', '2014id_/http://xn--e1afmkfd.xn--80akhbyknj4f/abc')" + +# percent-encoded form -- scheme relative +>>> repr(WbUrl('2014id_///' + quote_plus(u'пример.испытание'.encode('utf-8')) + '/abc')) +"('replay', '2014', 'id_', 'http://xn--e1afmkfd.xn--80akhbyknj4f/abc', '2014id_/http://xn--e1afmkfd.xn--80akhbyknj4f/abc')" + +# invalid: truncated and superfluous '%', ignore invalid (no exception) +>>> repr(WbUrl('2014id_/http://' + quote_plus(u'пример.испытание'.encode('utf-8'))[1:] + '%' + '/abc')) +"('replay', '2014', 'id_', 'http://xn--d0-olcluwd.xn--%-7sbpkb3ampk3g/abc', '2014id_/http://xn--d0-olcluwd.xn--%-7sbpkb3ampk3g/abc')" + + # Query Urls # ====================== >>> repr(WbUrl('*/http://example.com/abc?def=a')) @@ -102,7 +131,7 @@ Exception: ('Invalid WbUrl: ', '') """ from pywb.rewrite.wburl import WbUrl - +from urllib import quote_plus if __name__ == "__main__": import doctest diff --git a/pywb/rewrite/wburl.py b/pywb/rewrite/wburl.py index c612f8c2..9436dc20 100644 --- a/pywb/rewrite/wburl.py +++ b/pywb/rewrite/wburl.py @@ -87,30 +87,61 @@ class WbUrl(BaseWbUrl): DEFAULT_SCHEME = 'http://' - PARTIAL_ENC_RX = re.compile('(https?%3A)?(%2F%2F)?', re.I) + #PARTIAL_ENC_RX = re.compile('(https?%3A)?(%2F%2F)?', re.I) + FIRST_PATH = re.compile('(? 1: + self.url = scheme_dom[0] + '/' + dom + else: + self.url = dom + + if len(parts) > 1: + self.url += '/' + parts[1] - if not self._init_query(url): - if not self._init_replay(url): - raise Exception('Invalid WbUrl: ', url) # protocol agnostic url -> http:// # no protocol -> http:// inx = self.url.find(':/') - if inx < 0: + #if inx < 0: # check for other partially encoded variants - m = self.PARTIAL_ENC_RX.match(self.url) - if m: - len_ = len(m.group(0)) - self.url = (urllib.unquote_plus(self.url[:len_]) + - self.url[len_:]) - inx = self.url.find(':/') + # m = self.PARTIAL_ENC_RX.match(self.url) + # if m: + # len_ = len(m.group(0)) + # self.url = (urllib.unquote_plus(self.url[:len_]) + + # self.url[len_:]) + # inx = self.url.find(':/') if inx < 0: self.url = self.DEFAULT_SCHEME + self.url