From c996e70a6e9c89cbf5bb0f7e29ff9c3ac043aff5 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 29 Nov 2014 11:13:57 -0800 Subject: [PATCH] wburl: detect and decode partially encoded schemes in url, such as http%3A//, https%3A%2F%2F before handling further add additional tests for wburl --- pywb/framework/wbrequestresponse.py | 2 +- pywb/rewrite/test/test_wburl.py | 22 ++++++++++++++++++++++ pywb/rewrite/wburl.py | 11 +++++++++++ 3 files changed, 34 insertions(+), 1 deletion(-) diff --git a/pywb/framework/wbrequestresponse.py b/pywb/framework/wbrequestresponse.py index 808563ea..06970316 100644 --- a/pywb/framework/wbrequestresponse.py +++ b/pywb/framework/wbrequestresponse.py @@ -131,7 +131,7 @@ class WbRequest(object): if not self.wb_url: return - mime = self.env.get('CONTENT_TYPE').split(';')[0] + mime = self.env.get('CONTENT_TYPE', '').split(';')[0] length = self.env.get('CONTENT_LENGTH') stream = self.env['wsgi.input'] diff --git a/pywb/rewrite/test/test_wburl.py b/pywb/rewrite/test/test_wburl.py index bcad948e..b4d15b5d 100644 --- a/pywb/rewrite/test/test_wburl.py +++ b/pywb/rewrite/test/test_wburl.py @@ -26,6 +26,13 @@ >>> repr(WbUrl('https://example.com/xyz?a=%2f&b=%2E')) "('latest_replay', '', '', 'https://example.com/xyz?a=%2f&b=%2E', 'https://example.com/xyz?a=%2f&b=%2E')" +# Test scheme partially encoded urls +>>> repr(WbUrl('https%3A//example.com/')) +"('latest_replay', '', '', 'https://example.com/', 'https://example.com/')" + +>>> repr(WbUrl('2014/http%3A%2F%2Fexample.com/')) +"('replay', '2014', '', 'http://example.com/', '2014/http://example.com/')" + # Query Urls # ====================== >>> repr(WbUrl('*/http://example.com/abc?def=a')) @@ -57,6 +64,21 @@ >>> repr(WbUrl('/example.com/')) "('latest_replay', '', '', 'http://example.com/', 'http://example.com/')" +# Is_ Tests +>>> u = WbUrl('*/http://example.com/abc?def=a*') +>>> u.is_url_query() +True + +>>> u.is_query() +True + +>>> u2 = WbUrl('20130102im_/https:/example.com') +>>> u2.is_embed 
+True + +>>> u2.is_replay() +True + # Error Urls # ====================== diff --git a/pywb/rewrite/wburl.py b/pywb/rewrite/wburl.py index 91d36455..5421a1de 100644 --- a/pywb/rewrite/wburl.py +++ b/pywb/rewrite/wburl.py @@ -85,6 +85,9 @@ class WbUrl(BaseWbUrl): REPLAY_REGEX = re.compile('^(\d*)([a-z]+_)?/{0,3}(.+)$') DEFAULT_SCHEME = 'http://' + + PARTIAL_ENC_RX = re.compile('(https?%3A)?(%2F%2F)?', re.I) + # ====================== def __init__(self, url): @@ -99,6 +102,14 @@ class WbUrl(BaseWbUrl): # protocol agnostic url -> http:// # no protocol -> http:// inx = self.url.find(':/') + if inx < 0: + # check for other partially encoded variants + m = self.PARTIAL_ENC_RX.match(self.url) + if m: + len_ = len(m.group(0)) + self.url = urllib.unquote_plus(self.url[:len_]) + self.url[len_:] + inx = self.url.find(':/') + if inx < 0: self.url = self.DEFAULT_SCHEME + self.url else: