mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
wburl improved scheme detection: use regex to match acceptable scheme before :/, don't treat something like 'a.com/?x=http://' as having a scheme, update tests to check for this
This commit is contained in:
parent
9a3017bfcd
commit
1bb7aa01ce
@ -20,6 +20,9 @@ u"""
|
||||
>>> repr(WbUrl('cs_/example.com'))
|
||||
"('latest_replay', '', 'cs_', 'http://example.com', 'cs_/http://example.com')"
|
||||
|
||||
>>> repr(WbUrl('cs_/example.com/?foo=http://example.com/'))
|
||||
"('latest_replay', '', 'cs_', 'http://example.com/?foo=http://example.com/', 'cs_/http://example.com/?foo=http://example.com/')"
|
||||
|
||||
>>> repr(WbUrl('im_/20130102.org'))
|
||||
"('latest_replay', '', 'im_', 'http://20130102.org', 'im_/http://20130102.org')"
|
||||
|
||||
|
@ -98,6 +98,8 @@ class WbUrl(BaseWbUrl):
|
||||
|
||||
# Matches a '/' or '?' that is not preceded by ':' or '/' and is not
# immediately followed by '/'.
FIRST_PATH = re.compile(r'(?<![:/])[/?](?![/])')
|
||||
|
||||
# Matches a leading URL scheme followed by ':/'. Group 1 captures the ':/'
# so that its start offset is available via m.span(1)[0].
# The '-' is escaped on purpose: an unescaped '+-.' inside a character class
# is the range '+'..'.', which also admits ',' -- not a valid scheme character.
SCHEME_RX = re.compile(r'[a-zA-Z0-9+\-.]+(:/)')
|
||||
|
||||
|
||||
@staticmethod
|
||||
def percent_encode_host(url):
|
||||
@ -200,7 +202,12 @@ class WbUrl(BaseWbUrl):
|
||||
|
||||
# protocol agnostic url -> http://
|
||||
# no protocol -> http://
|
||||
inx = self.url.find(':/')
|
||||
#inx = self.url.find('://')
|
||||
inx = -1
|
||||
m = self.SCHEME_RX.match(self.url)
|
||||
if m:
|
||||
inx = m.span(1)[0]
|
||||
|
||||
#if inx < 0:
|
||||
# check for other partially encoded variants
|
||||
# m = self.PARTIAL_ENC_RX.match(self.url)
|
||||
|
Loading…
x
Reference in New Issue
Block a user