rename aurl -> archiveurl, add default scheme, test for empty url

2025-03-15 00:03:28 +01:00 · 2013-12-13 15:43:07 -08:00 · 2013-12-13 15:43:07 -08:00 · 5d42cc0cac
commit 5d42cc0cac
parent 6b78f59e49
4 changed files with 43 additions and 34 deletions
--- a/pywb/archiveurl.py
+++ b/pywb/archiveurl.py
@@ -5,50 +5,56 @@ import rfc3987

 import wbexceptions

-# aurl : ArchivalUrl representation for WB
+# archiveurl : archivalurl representation for WB

-class aurl:
+class archiveurl:
    """
    # Replay Urls
    # ======================
-    >>> repr(aurl('/20131010000506/example.com'))
-    "('replay', '20131010000506', '', 'example.com', '/20131010000506/example.com')"
+    >>> repr(archiveurl('/20131010000506/example.com'))
+    "('replay', '20131010000506', '', 'http://example.com', '/20131010000506/http://example.com')"

-    >>> repr(aurl('/20130102im_/example.com'))
-    "('replay', '20130102', 'im_', 'example.com', '/20130102im_/example.com')"
+    >>> repr(archiveurl('/20130102im_/https://example.com'))
+    "('replay', '20130102', 'im_', 'https://example.com', '/20130102im_/https://example.com')"

-    >>> repr(aurl('/cs_/example.com'))
-    "('latest_replay', '', 'cs_', 'example.com', '/cs_/example.com')"
+    >>> repr(archiveurl('/cs_/example.com'))
+    "('latest_replay', '', 'cs_', 'http://example.com', '/cs_/http://example.com')"

-    >>> repr(aurl('/https://example.com/xyz'))
+    >>> repr(archiveurl('/https://example.com/xyz'))
    "('latest_replay', '', '', 'https://example.com/xyz', '/https://example.com/xyz')"


    # Query Urls
    # ======================
-    >>> repr(aurl('/*/http://example.com/abc?def=a'))
+    >>> repr(archiveurl('/*/http://example.com/abc?def=a'))
    "('query', '', '', 'http://example.com/abc?def=a', '/*/http://example.com/abc?def=a')"


    # Error Urls
    # ======================
-    >>> x = aurl('abc')
+    >>> x = archiveurl('abc')
    Traceback (most recent call last):
    RequestParseException: Invalid WB Request Url: abc

-    >>> x = aurl('/#$%#/')
+    >>> x = archiveurl('/#$%#/')
    Traceback (most recent call last):
-    BadUrlException: Bad Request Url: #$%#/
+    BadUrlException: Bad Request Url: http://#$%#/

-    >>> x = aurl('/http://example.com:abc/')
+    >>> x = archiveurl('/http://example.com:abc/')
    Traceback (most recent call last):
    BadUrlException: Bad Request Url: http://example.com:abc/
    """

    # Regexs
    # ======================
-    QUERY_REGEX = re.compile('^/(\d{1,14})?\*/(.*)$')
-    REPLAY_REGEX = re.compile('^/(\d{1,14})?([a-z]{2}_)?/?(.*)$')
+    QUERY_REGEX = re.compile('^/(\d*)\*/(.*)$')
+    REPLAY_REGEX = re.compile('^/(\d*)([a-z]{2}_)?/?(.*)$')
+
+    QUERY = 'query'
+    REPLAY = 'replay'
+    LATEST_REPLAY = 'latest_replay'
+
+    DEFAULT_SCHEME = 'http://'
    # ======================


@@ -59,10 +65,16 @@ class aurl:
        self.timestamp = ''
        self.mod = ''

-        if not any (f(self, url) for f in [aurl._init_query, aurl._init_replay]):
+        if not any (f(self, url) for f in [archiveurl._init_query, archiveurl._init_replay]):
            raise wbexceptions.RequestParseException('Invalid WB Request Url: ' + url)

-        matcher = rfc3987.match(self.url, 'IRI_reference')
+        if len(self.url) == 0:
+            raise wbexceptions.RequestParseException('Invalid WB Request Url: ' + url)
+
+        if not self.url.startswith('//') and not '://' in self.url:
+            self.url = archiveurl.DEFAULT_SCHEME + self.url
+
+        matcher = rfc3987.match(self.url, 'IRI')

        if not matcher:
            raise wbexceptions.BadUrlException('Bad Request Url: ' + self.url)
@@ -70,7 +82,7 @@ class aurl:
    # Match query regex
    # ======================
    def _init_query(self, url):
-        query = aurl.QUERY_REGEX.match(url)
+        query = archiveurl.QUERY_REGEX.match(url)
        if not query:
            return None

@@ -78,13 +90,13 @@ class aurl:

        self.timestamp = res[0]
        self.url = res[1]
-        self.type = 'query'
+        self.type = archiveurl.QUERY
        return True

    # Match replay regex
    # ======================
    def _init_replay(self, url):
-        replay = aurl.REPLAY_REGEX.match(url)
+        replay = archiveurl.REPLAY_REGEX.match(url)
        if not replay:
            return None

@@ -94,16 +106,16 @@ class aurl:
        self.mod = res[1]
        self.url = res[2]
        if self.timestamp:
-            self.type = 'replay'
+            self.type = archiveurl.REPLAY
        else:
-            self.type = 'latest_replay'
+            self.type = archiveurl.LATEST_REPLAY

        return True

    # Str Representation
    # ====================
    def __str__(self):
-        if self.type == 'query':
+        if self.type == archiveurl.QUERY:
            return "/*/" + self.url
        else:
            tsmod = self.timestamp + self.mod
@@ -117,8 +129,4 @@ class aurl:

 if __name__ == "__main__":
    import doctest
-
-    #def print_test(self):
-    #    return self.type, self.timestamp, self.mod, self.url, str(self)
-
    doctest.testmod()
--- a/pywb/refer_redirect.py
+++ b/pywb/refer_redirect.py
@@ -1,6 +1,7 @@
-import aurl
 import urlparse
 from wbrequestresponse import WbRequest, WbResponse
+from archiveurl import archiveurl
+

 # Redirect urls that have 'fallen through' based on the referrer
 # settings
@@ -43,7 +44,7 @@ class ReferRedirect:
            ref_split = urlparse.urlsplit(wbrequest.referrer)
            ref_path = ref_split.path[1:].split('/', 1)

-            ref_wb_url = aurl.aurl('/' + ref_path[1])
+            ref_wb_url = archiveurl('/' + ref_path[1])

            ref_wb_url.url = urlparse.urljoin(ref_wb_url.url, wbrequest.request_uri[1:])
            ref_wb_url.url = ref_wb_url.url.replace('../', '')
--- a/pywb/wbapp.py
+++ b/pywb/wbapp.py
@@ -1,10 +1,10 @@
 from wbrequestresponse import WbRequest, WbResponse
 from refer_redirect import ReferRedirect
-import aurl
+from archiveurl import archiveurl

 class WBHandler:
    def run(self, wbrequest):
-        wburl = aurl.aurl(wbrequest.wb_url)
+        wburl = archiveurl(wbrequest.wb_url)
        return WbResponse.text_response(repr(wburl))

 class ArchivalParser:
--- a/pywb/wbrequestresponse.py
+++ b/pywb/wbrequestresponse.py
@@ -1,6 +1,6 @@
-#from werkzeug.wrappers import BaseRequest, BaseResponse, CommonRequestDescriptorsMixin, CommonResponseDescriptorsMixin

-# WbRequest
+#WB Request and Response
+
 class WbRequest:
    def __init__(self, env):
        self.env = env