1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

wburl: detect and decode partially encoded schemes in url, such as http%3A//,

https%3A%2F%2F, before handling further
add additional tests for wburl
This commit is contained in:
Ilya Kreymer 2014-11-29 11:13:57 -08:00
parent d7eb40af20
commit c996e70a6e
3 changed files with 34 additions and 1 deletions

View File

@ -131,7 +131,7 @@ class WbRequest(object):
if not self.wb_url:
return
mime = self.env.get('CONTENT_TYPE').split(';')[0]
mime = self.env.get('CONTENT_TYPE', '').split(';')[0]
length = self.env.get('CONTENT_LENGTH')
stream = self.env['wsgi.input']

View File

@ -26,6 +26,13 @@
>>> repr(WbUrl('https://example.com/xyz?a=%2f&b=%2E'))
"('latest_replay', '', '', 'https://example.com/xyz?a=%2f&b=%2E', 'https://example.com/xyz?a=%2f&b=%2E')"
# Test scheme partially encoded urls
>>> repr(WbUrl('https%3A//example.com/'))
"('latest_replay', '', '', 'https://example.com/', 'https://example.com/')"
>>> repr(WbUrl('2014/http%3A%2F%2Fexample.com/'))
"('replay', '2014', '', 'http://example.com/', '2014/http://example.com/')"
# Query Urls
# ======================
>>> repr(WbUrl('*/http://example.com/abc?def=a'))
@ -57,6 +64,21 @@
>>> repr(WbUrl('/example.com/'))
"('latest_replay', '', '', 'http://example.com/', 'http://example.com/')"
# Is_ Tests
>>> u = WbUrl('*/http://example.com/abc?def=a*')
>>> u.is_url_query()
True
>>> u.is_query()
True
>>> u2 = WbUrl('20130102im_/https:/example.com')
>>> u2.is_embed
True
>>> u2.is_replay()
True
# Error Urls
# ======================

View File

@ -85,6 +85,9 @@ class WbUrl(BaseWbUrl):
REPLAY_REGEX = re.compile('^(\d*)([a-z]+_)?/{0,3}(.+)$')
DEFAULT_SCHEME = 'http://'
PARTIAL_ENC_RX = re.compile('(https?%3A)?(%2F%2F)?', re.I)
# ======================
def __init__(self, url):
@ -99,6 +102,14 @@ class WbUrl(BaseWbUrl):
# protocol agnostic url -> http://
# no protocol -> http://
inx = self.url.find(':/')
if inx < 0:
# check for other partially encoded variants
m = self.PARTIAL_ENC_RX.match(self.url)
if m:
len_ = len(m.group(0))
self.url = urllib.unquote_plus(self.url[:len_]) + self.url[len_:]
inx = self.url.find(':/')
if inx < 0:
self.url = self.DEFAULT_SCHEME + self.url
else: