mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
wburl: detect and decode partially encoded schemes in url, such as http%3A//,
https%3A%2F%2F, before handling further. Add additional tests for wburl.
This commit is contained in:
parent
d7eb40af20
commit
c996e70a6e
@ -131,7 +131,7 @@ class WbRequest(object):
|
|||||||
if not self.wb_url:
|
if not self.wb_url:
|
||||||
return
|
return
|
||||||
|
|
||||||
mime = self.env.get('CONTENT_TYPE').split(';')[0]
|
mime = self.env.get('CONTENT_TYPE', '').split(';')[0]
|
||||||
length = self.env.get('CONTENT_LENGTH')
|
length = self.env.get('CONTENT_LENGTH')
|
||||||
stream = self.env['wsgi.input']
|
stream = self.env['wsgi.input']
|
||||||
|
|
||||||
|
@ -26,6 +26,13 @@
|
|||||||
>>> repr(WbUrl('https://example.com/xyz?a=%2f&b=%2E'))
|
>>> repr(WbUrl('https://example.com/xyz?a=%2f&b=%2E'))
|
||||||
"('latest_replay', '', '', 'https://example.com/xyz?a=%2f&b=%2E', 'https://example.com/xyz?a=%2f&b=%2E')"
|
"('latest_replay', '', '', 'https://example.com/xyz?a=%2f&b=%2E', 'https://example.com/xyz?a=%2f&b=%2E')"
|
||||||
|
|
||||||
|
# Test scheme partially encoded urls
|
||||||
|
>>> repr(WbUrl('https%3A//example.com/'))
|
||||||
|
"('latest_replay', '', '', 'https://example.com/', 'https://example.com/')"
|
||||||
|
|
||||||
|
>>> repr(WbUrl('2014/http%3A%2F%2Fexample.com/'))
|
||||||
|
"('replay', '2014', '', 'http://example.com/', '2014/http://example.com/')"
|
||||||
|
|
||||||
# Query Urls
|
# Query Urls
|
||||||
# ======================
|
# ======================
|
||||||
>>> repr(WbUrl('*/http://example.com/abc?def=a'))
|
>>> repr(WbUrl('*/http://example.com/abc?def=a'))
|
||||||
@ -57,6 +64,21 @@
|
|||||||
>>> repr(WbUrl('/example.com/'))
|
>>> repr(WbUrl('/example.com/'))
|
||||||
"('latest_replay', '', '', 'http://example.com/', 'http://example.com/')"
|
"('latest_replay', '', '', 'http://example.com/', 'http://example.com/')"
|
||||||
|
|
||||||
|
# Is_ Tests
|
||||||
|
>>> u = WbUrl('*/http://example.com/abc?def=a*')
|
||||||
|
>>> u.is_url_query()
|
||||||
|
True
|
||||||
|
|
||||||
|
>>> u.is_query()
|
||||||
|
True
|
||||||
|
|
||||||
|
>>> u2 = WbUrl('20130102im_/https:/example.com')
|
||||||
|
>>> u2.is_embed
|
||||||
|
True
|
||||||
|
|
||||||
|
>>> u2.is_replay()
|
||||||
|
True
|
||||||
|
|
||||||
|
|
||||||
# Error Urls
|
# Error Urls
|
||||||
# ======================
|
# ======================
|
||||||
|
@ -85,6 +85,9 @@ class WbUrl(BaseWbUrl):
|
|||||||
REPLAY_REGEX = re.compile('^(\d*)([a-z]+_)?/{0,3}(.+)$')
|
REPLAY_REGEX = re.compile('^(\d*)([a-z]+_)?/{0,3}(.+)$')
|
||||||
|
|
||||||
DEFAULT_SCHEME = 'http://'
|
DEFAULT_SCHEME = 'http://'
|
||||||
|
|
||||||
|
PARTIAL_ENC_RX = re.compile('(https?%3A)?(%2F%2F)?', re.I)
|
||||||
|
|
||||||
# ======================
|
# ======================
|
||||||
|
|
||||||
def __init__(self, url):
|
def __init__(self, url):
|
||||||
@ -99,6 +102,14 @@ class WbUrl(BaseWbUrl):
|
|||||||
# protocol agnostic url -> http://
|
# protocol agnostic url -> http://
|
||||||
# no protocol -> http://
|
# no protocol -> http://
|
||||||
inx = self.url.find(':/')
|
inx = self.url.find(':/')
|
||||||
|
if inx < 0:
|
||||||
|
# check for other partially encoded variants
|
||||||
|
m = self.PARTIAL_ENC_RX.match(self.url)
|
||||||
|
if m:
|
||||||
|
len_ = len(m.group(0))
|
||||||
|
self.url = urllib.unquote_plus(self.url[:len_]) + self.url[len_:]
|
||||||
|
inx = self.url.find(':/')
|
||||||
|
|
||||||
if inx < 0:
|
if inx < 0:
|
||||||
self.url = self.DEFAULT_SCHEME + self.url
|
self.url = self.DEFAULT_SCHEME + self.url
|
||||||
else:
|
else:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user