wburl: detect and decode partially encoded schemes in url, such as http%3A//,

https%3A%2F%2F, before handling further; add additional tests for wburl
2025-03-15 00:03:28 +01:00 · 2014-11-29 11:13:57 -08:00 · 2014-11-29 11:13:57 -08:00 · c996e70a6e
commit c996e70a6e
parent d7eb40af20
3 changed files with 34 additions and 1 deletions
--- a/pywb/framework/wbrequestresponse.py
+++ b/pywb/framework/wbrequestresponse.py
@@ -131,7 +131,7 @@ class WbRequest(object):
        if not self.wb_url:
            return

-        mime = self.env.get('CONTENT_TYPE').split(';')[0]
+        mime = self.env.get('CONTENT_TYPE', '').split(';')[0]
        length = self.env.get('CONTENT_LENGTH')
        stream = self.env['wsgi.input']

--- a/pywb/rewrite/test/test_wburl.py
+++ b/pywb/rewrite/test/test_wburl.py
@@ -26,6 +26,13 @@
 >>> repr(WbUrl('https://example.com/xyz?a=%2f&b=%2E'))
 "('latest_replay', '', '', 'https://example.com/xyz?a=%2f&b=%2E', 'https://example.com/xyz?a=%2f&b=%2E')"

+# Test scheme partially encoded urls
+>>> repr(WbUrl('https%3A//example.com/'))
+"('latest_replay', '', '', 'https://example.com/', 'https://example.com/')"
+
+>>> repr(WbUrl('2014/http%3A%2F%2Fexample.com/'))
+"('replay', '2014', '', 'http://example.com/', '2014/http://example.com/')"
+
 # Query Urls
 # ======================
 >>> repr(WbUrl('*/http://example.com/abc?def=a'))
@@ -57,6 +64,21 @@
 >>> repr(WbUrl('/example.com/'))
 "('latest_replay', '', '', 'http://example.com/', 'http://example.com/')"

+# Is_ Tests
+>>> u = WbUrl('*/http://example.com/abc?def=a*')
+>>> u.is_url_query()
+True
+
+>>> u.is_query()
+True
+
+>>> u2 = WbUrl('20130102im_/https:/example.com')
+>>> u2.is_embed
+True
+
+>>> u2.is_replay()
+True
+

 # Error Urls
 # ======================
--- a/pywb/rewrite/wburl.py
+++ b/pywb/rewrite/wburl.py
@@ -85,6 +85,9 @@ class WbUrl(BaseWbUrl):
    REPLAY_REGEX = re.compile('^(\d*)([a-z]+_)?/{0,3}(.+)$')

    DEFAULT_SCHEME = 'http://'
+
+    PARTIAL_ENC_RX = re.compile('(https?%3A)?(%2F%2F)?', re.I)
+
    # ======================

    def __init__(self, url):
@@ -99,6 +102,14 @@ class WbUrl(BaseWbUrl):
        # protocol agnostic url -> http://
        # no protocol -> http://
        inx = self.url.find(':/')
+        if inx < 0:
+            # check for other partially encoded variants
+            m = self.PARTIAL_ENC_RX.match(self.url)
+            if m:
+                len_ = len(m.group(0))
+                self.url = urllib.unquote_plus(self.url[:len_]) + self.url[len_:]
+                inx = self.url.find(':/')
+
        if inx < 0:
            self.url = self.DEFAULT_SCHEME + self.url
        else: