1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

wburl: improved scheme detection. Use a regex to match an acceptable scheme before ':/', so that something like 'a.com/?x=http://' is not treated as having a scheme; update tests to check for this.

This commit is contained in:
Ilya Kreymer 2016-09-20 15:40:05 -07:00
parent 9a3017bfcd
commit 1bb7aa01ce
2 changed files with 11 additions and 1 deletions

View File

@ -20,6 +20,9 @@ u"""
>>> repr(WbUrl('cs_/example.com'))
"('latest_replay', '', 'cs_', 'http://example.com', 'cs_/http://example.com')"
>>> repr(WbUrl('cs_/example.com/?foo=http://example.com/'))
"('latest_replay', '', 'cs_', 'http://example.com/?foo=http://example.com/', 'cs_/http://example.com/?foo=http://example.com/')"
>>> repr(WbUrl('im_/20130102.org'))
"('latest_replay', '', 'im_', 'http://20130102.org', 'im_/http://20130102.org')"

View File

@ -98,6 +98,8 @@ class WbUrl(BaseWbUrl):
# Matches a '/' or '?' that is neither preceded by ':' or '/' nor followed by
# '/', so the slashes of a 'scheme://' prefix are never selected.
FIRST_PATH = re.compile(r'(?<![:/])[/?](?![/])')
# Matches a leading run of scheme characters (letters, digits, '+', '.', '-')
# immediately followed by ':/'; group 1 captures the ':/' so callers can read
# its offset via m.span(1).
# The '-' is deliberately placed last in the character class so it is a
# literal hyphen: written as '+-.' it forms the range '+'..'.', which also
# admits ',' and would accept strings such as 'a,b:/x' as having a scheme.
SCHEME_RX = re.compile(r'[a-zA-Z0-9+.-]+(:/)')
@staticmethod
def percent_encode_host(url):
@ -200,7 +202,12 @@ class WbUrl(BaseWbUrl):
# protocol agnostic url -> http://
# no protocol -> http://
inx = self.url.find(':/')
#inx = self.url.find('://')
inx = -1
m = self.SCHEME_RX.match(self.url)
if m:
inx = m.span(1)[0]
#if inx < 0:
# check for other partially encoded variants
# m = self.PARTIAL_ENC_RX.match(self.url)