add wburlrewriter, ReferRedirect uses the rewriter

more refactoring: ReferRedirect moved into the archivalrouter module; wbrequest: parses from the uri directly and keeps track of wburl and prefix
2025-03-15 00:03:28 +01:00 · 2013-12-20 14:54:41 -08:00 · 2013-12-20 14:54:41 -08:00 · 4cf4bf3bbb
commit 4cf4bf3bbb
parent 0a2b16407d
7 changed files with 289 additions and 160 deletions
--- a/pywb/archivalrouter.py
+++ b/pywb/archivalrouter.py
@ -1,22 +1,122 @@
-from refer_redirect import ReferRedirect
+import urlparse
 from wbrequestresponse import WbRequest, WbResponse
 from wbrequestresponse import WbRequest, WbResponse
 from wburlrewriter import ArchivalUrlRewriter
 #=================================================================
 # ArchivalRequestRouter -- route WB requests in archival mode
 #=================================================================
 class ArchivalRequestRouter:
-    def __init__(self, mappings, hostpaths=None):
+    def __init__(self, mappings, hostpaths = None, abs_path = True):
        self.mappings = mappings
        self.fallback = ReferRedirect(hostpaths)
        self.abs_path = abs_path
-    def parse_request(self, env):
+    def _parseRequest(self, env):
        request_uri = env['REQUEST_URI']
-        for key, value in self.mappings.iteritems():
+        for coll, handler in self.mappings.iteritems():
-            if request_uri.startswith(key):
+            rel_prefix = '/' + coll + '/'
-                return value, WbRequest.prefix_request(env, key, request_uri)
+            if request_uri.startswith(rel_prefix):
                #return value, ArchivalRequestRouter._prefix_request(env, key, request_uri)
                req = WbRequest(env,
                                request_uri = request_uri,
                                coll = coll,
                                wb_url = request_uri[len(coll) + 1:],
                                wb_prefix = self.getPrefix(env, rel_prefix))
                return handler, req
        return self.fallback, WbRequest(env)
-    def handle_request(self, env):
+    def handleRequest(self, env):
-        handler, wbrequest = self.parse_request(env)
+        handler, wbrequest = self._parseRequest(env)
        return handler.run(wbrequest)
    def getPrefix(self, env, rel_prefix):
        if self.abs_path:
            try:
                return env['wsgi.url_scheme'] + '://' + env['HTTP_HOST'] + rel_prefix
            except KeyError:
                return rel_prefix
        else:
            return rel_prefix
 #=================================================================
 # ReferRedirect -- redirect urls that have 'fallen through' based on the referrer settings
 #=================================================================
 class ReferRedirect:
    """
    Fallback handler: redirect urls that have 'fallen through' (matched no
    collection) by resolving the request uri against the archival url
    found in the referrer.

    matchPrefixs is a single prefix or a list of prefixes; only referrers
    starting with one of them are considered. None means "no prefixes",
    so nothing is ever redirected.

    >>> ReferRedirect('http://localhost:8080/').matchPrefixs
    ['http://localhost:8080/']
    >>> ReferRedirect(['http://example:9090/']).matchPrefixs
    ['http://example:9090/']
    >>> test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
    'http://localhost:8080/coll/20131010/http://example.com/path/other.html'
    >>> test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
    'http://localhost:8080/coll/20131010/http://example.com/other.html'
    >>> test_redir('http://localhost:8080/', '/../../other.html', 'http://localhost:8080/coll/20131010/http://example.com/index.html')
    'http://localhost:8080/coll/20131010/http://example.com/other.html'
    >>> test_redir('http://example:8080/', '/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
    False
    """

    def __init__(self, matchPrefixs):
        if matchPrefixs is None:
            # No prefixes configured: storing [None] would make
            # referrer.startswith(None) raise TypeError in run()
            self.matchPrefixs = []
        elif isinstance(matchPrefixs, list):
            self.matchPrefixs = matchPrefixs
        else:
            self.matchPrefixs = [matchPrefixs]

    def run(self, wbrequest):
        """
        Return a redirect response for wbrequest, or None when the request
        has no referrer, the referrer matches none of the configured
        prefixes, or the referrer path holds no archival url.

        Errors raised while rewriting the url propagate to the caller
        with their original traceback.
        """
        referrer = wbrequest.referrer
        if referrer is None:
            return None

        if not any(referrer.startswith(prefix) for prefix in self.matchPrefixs):
            return None

        ref_split = urlparse.urlsplit(referrer)

        # Referrer path is expected to look like '/<first part>/<archival url>'
        ref_path = ref_split.path[1:].split('/', 1)
        if len(ref_path) < 2:
            # e.g. the referrer is the host root: nothing to resolve against
            return None

        rewriter = ArchivalUrlRewriter('/' + ref_path[1], '/' + ref_path[0])

        # Drop the leading '/' so the uri is resolved relative to the referrer url
        rel_request_uri = wbrequest.request_uri[1:]

        final_url = urlparse.urlunsplit((ref_split.scheme,
                                         ref_split.netloc,
                                         rewriter.rewrite(rel_request_uri),
                                         '',
                                         ''))

        return WbResponse.redir_response(final_url)
 if __name__ == "__main__":
    import doctest

    # Helper used by the ReferRedirect doctests: runs a redirect for the
    # given request uri and referrer and returns the Location header,
    # or False when no redirect response was produced.
    def test_redir(matchHost, request_uri, referrer):
        redirector = ReferRedirect(matchHost)
        wbrequest = WbRequest.parse({'REQUEST_URI': request_uri,
                                     'HTTP_REFERER': referrer})
        response = redirector.run(wbrequest)
        return response.get_header('Location') if response else False

    doctest.testmod()
--- a/pywb/indexreader.py
+++ b/pywb/indexreader.py
@ -2,6 +2,8 @@ import urllib
 import urllib2
 import wbexceptions
 from wbarchivalurl import ArchivalUrl
 class RemoteCDXServer:
    """
    >>> x = cdxserver.load('example.com', parse_cdx = True, limit = '2')
@ -45,6 +47,27 @@ class RemoteCDXServer:
        else:
            return response
    @staticmethod
    def getQueryParams(wburl):
        """
        Return the dict of cdx query params for the given wburl,
        selected by wburl.type (one of the ArchivalUrl QUERY, URL_QUERY,
        REPLAY or LATEST_REPLAY constants). Any other type raises KeyError.
        """
        query_params = {'collapseTime': '10', 'filter': '!statuscode:(500|502|504)', 'limit': '150000'}

        url_query_params = {'collapse': 'urlkey',
                            'matchType': 'prefix',
                            'showGroupCount': True,
                            'showUniqCount': True,
                            'lastSkipTimestamp': True,
                            'limit': '100',
                            'fl': 'urlkey,original,timestamp,endtimestamp,groupcount,uniqcount'}

        # 'closest' is taken from the timestamp of the requested wburl
        replay_params = {'sort': 'closest',
                         'filter': '!statuscode:(500|502|504)',
                         'limit': '10',
                         'closest': wburl.timestamp,
                         'resolveRevisits': True}

        latest_replay_params = {'sort': 'reverse', 'filter': 'statuscode:[23]..', 'limit': '1', 'resolveRevisits': True}

        params_by_type = {
            ArchivalUrl.QUERY: query_params,
            ArchivalUrl.URL_QUERY: url_query_params,
            ArchivalUrl.REPLAY: replay_params,
            ArchivalUrl.LATEST_REPLAY: latest_replay_params,
        }

        return params_by_type[wburl.type]
 class CDXCaptureResult:
    CDX_FORMATS = [["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],
                   ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename"]]
--- a/pywb/refer_redirect.py
+++ b/pywb/refer_redirect.py
@ -1,76 +0,0 @@
 import urlparse
 from wbrequestresponse import WbRequest, WbResponse
 from archiveurl import archiveurl
 # Redirect urls that have 'fallen through' based on the referrer
 # settings
 class ReferRedirect:
    """
    >>> ReferRedirect('http://localhost:8080/').matchPrefixs
    ['http://localhost:8080/']
    >>> ReferRedirect(['http://example:9090/']).matchPrefixs
    ['http://example:9090/']
    >>> test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
    'http://localhost:8080/coll/20131010/http://example.com/path/other.html'
    >>> test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
    'http://localhost:8080/coll/20131010/http://example.com/other.html'
    >>> test_redir('http://localhost:8080/', '/../../other.html', 'http://localhost:8080/coll/20131010/http://example.com/index.html')
    'http://localhost:8080/coll/20131010/http://example.com/other.html'
    >>> test_redir('http://example:8080/', '/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
    False
    """
    def __init__(self, matchPrefixs):
        if isinstance(matchPrefixs, list):
            self.matchPrefixs = matchPrefixs
        else:
            self.matchPrefixs = [matchPrefixs]
    def run(self, wbrequest):
        if wbrequest.referrer is None:
            return None
        if not any (wbrequest.referrer.startswith(i) for i in self.matchPrefixs):
            return None
        try:
            ref_split = urlparse.urlsplit(wbrequest.referrer)
            ref_path = ref_split.path[1:].split('/', 1)
            ref_wb_url = archiveurl('/' + ref_path[1])
            ref_wb_url.url = urlparse.urljoin(ref_wb_url.url, wbrequest.request_uri[1:])
            ref_wb_url.url = ref_wb_url.url.replace('../', '')
            final_url = urlparse.urlunsplit((ref_split.scheme, ref_split.netloc, ref_path[0] + str(ref_wb_url), '', ''))
        except Exception as e:
            return None
        return WbResponse.redir_response(final_url)
 if __name__ == "__main__":
    import doctest
    def test_redir(matchHost, request_uri, referrer):
        env = {'REQUEST_URI': request_uri, 'HTTP_REFERER': referrer}
        redir = ReferRedirect(matchHost)
        req = WbRequest(env)
        rep = redir.run(req)
        if not rep:
            return False
        return rep.get_header('Location')
    doctest.testmod()
--- a/pywb/wbapp.py
+++ b/pywb/wbapp.py
@ -1,47 +1,28 @@
 from wbrequestresponse import WbResponse
 from archiveurl import archiveurl
 from archivalrouter import ArchivalRequestRouter
 import indexreader
 import json
 import wbexceptions
 import utils
 from wbrequestresponse import WbResponse
 from archivalrouter import ArchivalRequestRouter
 class EchoEnv:
    """Handler that responds with the text form of the request's env."""

    def run(self, wbrequest):
        env_text = str(wbrequest.env)
        return WbResponse.text_response(env_text)
 class WBHandler:
    def run(self, wbrequest):
-        wburl = archiveurl(wbrequest.wb_url)
+        return WbResponse.text_response(str(wbrequest))
        wbrequest.parsed_url = wburl
        return WbResponse.text_stream(str(vars(wburl)))
 class QueryHandler:
    def __init__(self):
        self.cdxserver = indexreader.RemoteCDXServer('http://web.archive.org/cdx/search/cdx')
    @staticmethod
    def get_query_params(wburl):
        return {
            archiveurl.QUERY:
                {'collapseTime': '10', 'filter': '!statuscode:(500|502|504)', 'limit': '150000'},
            archiveurl.URL_QUERY:
                {'collapse': 'urlkey', 'matchType': 'prefix', 'showGroupCount': True, 'showUniqCount': True, 'lastSkipTimestamp': True, 'limit': '100',
                 'fl': 'urlkey,original,timestamp,endtimestamp,groupcount,uniqcount',
                },
            archiveurl.REPLAY:
                {'sort': 'closest', 'filter': '!statuscode:(500|502|504)', 'limit': '10', 'closest': wburl.timestamp, 'resolveRevisits': True},
            archiveurl.LATEST_REPLAY:
                {'sort': 'reverse', 'filter': 'statuscode:[23]..', 'limit': '1', 'resolveRevisits': True}
        }[wburl.type]
    def run(self, wbrequest):
-        wburl = archiveurl(wbrequest.wb_url)
+        wburl = wbrequest.wb_url
        #wburl = wbresponse.body.parsed_url
-        params = QueryHandler.get_query_params(wburl)
+        params = self.cdxserver.getQueryParams(wburl)
        cdxlines = self.cdxserver.load(wburl.url, params)
@ -56,8 +37,10 @@ class QueryHandler:
 ## ===========
 parser = ArchivalRequestRouter(
-    {'/t1/' : WBHandler(),
+    {
-     '/t2/' : QueryHandler()
+     't0' : EchoEnv(),
     't1' : WBHandler(),
     't2' : QueryHandler()
    },
    hostpaths = ['http://localhost:9090/'])
 ## ===========
@ -67,7 +50,7 @@ def application(env, start_response):
    response = None
    try:
-        response = parser.handle_request(env)
+        response = parser.handleRequest(env)
        if not response:
            raise wbexceptions.NotFoundException(env['REQUEST_URI'] + ' was not found')
@ -76,11 +59,11 @@ def application(env, start_response):
        last_exc = e
        import traceback
        traceback.print_exc()
-        response = handle_exception(env, e)
+        response = handleException(env, e)
    return response(env, start_response)
-def handle_exception(env, exc):
+def handleException(env, exc):
    if hasattr(exc, 'status'):
        status = exc.status()
    else:
--- a/pywb/wbarchivalurl.py
+++ b/pywb/wbarchivalurl.py
@ -5,51 +5,51 @@ import rfc3987
 import wbexceptions
-# archiveurl : archivalurl representation for WB
+# ArchivalUrl : archivalurl representation for WB
-class archiveurl:
+class ArchivalUrl:
    """
    # Replay Urls
    # ======================
-    >>> repr(archiveurl('/20131010000506/example.com'))
+    >>> repr(ArchivalUrl('/20131010000506/example.com'))
    "('replay', '20131010000506', '', 'http://example.com', '/20131010000506/http://example.com')"
-    >>> repr(archiveurl('/20130102im_/https://example.com'))
+    >>> repr(ArchivalUrl('/20130102im_/https://example.com'))
    "('replay', '20130102', 'im_', 'https://example.com', '/20130102im_/https://example.com')"
-    >>> repr(archiveurl('/cs_/example.com'))
+    >>> repr(ArchivalUrl('/cs_/example.com'))
    "('latest_replay', '', 'cs_', 'http://example.com', '/cs_/http://example.com')"
-    >>> repr(archiveurl('/https://example.com/xyz'))
+    >>> repr(ArchivalUrl('/https://example.com/xyz'))
    "('latest_replay', '', '', 'https://example.com/xyz', '/https://example.com/xyz')"
    # Query Urls
    # ======================
-    >>> repr(archiveurl('/*/http://example.com/abc?def=a'))
+    >>> repr(ArchivalUrl('/*/http://example.com/abc?def=a'))
    "('query', '', '', 'http://example.com/abc?def=a', '/*/http://example.com/abc?def=a')"
-    >>> repr(archiveurl('/*/http://example.com/abc?def=a*'))
+    >>> repr(ArchivalUrl('/*/http://example.com/abc?def=a*'))
    "('url_query', '', '', 'http://example.com/abc?def=a', '/*/http://example.com/abc?def=a*')"
-    >>> repr(archiveurl('/json/*/http://example.com/abc?def=a'))
+    >>> repr(ArchivalUrl('/json/*/http://example.com/abc?def=a'))
    "('query', '', 'json', 'http://example.com/abc?def=a', '/json/*/http://example.com/abc?def=a')"
-    >>> repr(archiveurl('/timemap-link/2011*/http://example.com/abc?def=a'))
+    >>> repr(ArchivalUrl('/timemap-link/2011*/http://example.com/abc?def=a'))
    "('query', '2011', 'timemap-link', 'http://example.com/abc?def=a', '/timemap-link/2011*/http://example.com/abc?def=a')"
    # Error Urls
    # ======================
-    >>> x = archiveurl('abc')
+    >>> x = ArchivalUrl('abc')
    Traceback (most recent call last):
    RequestParseException: Invalid WB Request Url: abc
-    >>> x = archiveurl('/#$%#/')
+    >>> x = ArchivalUrl('/#$%#/')
    Traceback (most recent call last):
    BadUrlException: Bad Request Url: http://#$%#/
-    >>> x = archiveurl('/http://example.com:abc/')
+    >>> x = ArchivalUrl('/http://example.com:abc/')
    Traceback (most recent call last):
    BadUrlException: Bad Request Url: http://example.com:abc/
    """
@ -75,14 +75,14 @@ class archiveurl:
        self.timestamp = ''
        self.mod = ''
-        if not any (f(self, url) for f in [archiveurl._init_query, archiveurl._init_replay]):
+        if not any (f(self, url) for f in [ArchivalUrl._init_query, ArchivalUrl._init_replay]):
            raise wbexceptions.RequestParseException('Invalid WB Request Url: ' + url)
        if len(self.url) == 0:
            raise wbexceptions.RequestParseException('Invalid WB Request Url: ' + url)
        if not self.url.startswith('//') and not '://' in self.url:
-            self.url = archiveurl.DEFAULT_SCHEME + self.url
+            self.url = ArchivalUrl.DEFAULT_SCHEME + self.url
        matcher = rfc3987.match(self.url, 'IRI')
@ -92,7 +92,7 @@ class archiveurl:
    # Match query regex
    # ======================
    def _init_query(self, url):
-        query = archiveurl.QUERY_REGEX.match(url)
+        query = ArchivalUrl.QUERY_REGEX.match(url)
        if not query:
            return None
@ -102,16 +102,16 @@ class archiveurl:
        self.timestamp = res[1]
        self.url = res[2]
        if self.url.endswith('*'):
-            self.type = archiveurl.URL_QUERY
+            self.type = ArchivalUrl.URL_QUERY
            self.url = self.url[:-1]
        else:
-            self.type = archiveurl.QUERY
+            self.type = ArchivalUrl.QUERY
        return True
    # Match replay regex
    # ======================
    def _init_replay(self, url):
-        replay = archiveurl.REPLAY_REGEX.match(url)
+        replay = ArchivalUrl.REPLAY_REGEX.match(url)
        if not replay:
            return None
@ -121,16 +121,16 @@ class archiveurl:
        self.mod = res[1]
        self.url = res[2]
        if self.timestamp:
-            self.type = archiveurl.REPLAY
+            self.type = ArchivalUrl.REPLAY
        else:
-            self.type = archiveurl.LATEST_REPLAY
+            self.type = ArchivalUrl.LATEST_REPLAY
        return True
    # Str Representation
    # ====================
    def __str__(self):
-        if self.type == archiveurl.QUERY or self.type == archiveurl.URL_QUERY:
+        if self.type == ArchivalUrl.QUERY or self.type == ArchivalUrl.URL_QUERY:
            tsmod = "/"
            if self.mod:
                tsmod += self.mod + "/"
@ -138,7 +138,7 @@ class archiveurl:
                tsmod += self.timestamp
            tsmod += "*/" + self.url
-            if self.type == archiveurl.URL_QUERY:
+            if self.type == ArchivalUrl.URL_QUERY:
                tsmod += "*"
            return tsmod
        else:
--- a/pywb/wbrequestresponse.py
+++ b/pywb/wbrequestresponse.py
@ -1,32 +1,79 @@
 from wbarchivalurl import ArchivalUrl
 #WB Request and Response
 class WbRequest:
    """
-    >>> WbRequest.prefix_request({'REQUEST_URI': '/save/_embed/example.com/?a=b'}, '/save/')
+    >>> WbRequest.parse({'REQUEST_URI': '/save/_embed/example.com/?a=b'})
-    WbRequest(env, '/_embed/example.com/?a=b', 'save')
+    {'wb_url': ('latest_replay', '', '', 'http://_embed/example.com/?a=b', '/http://_embed/example.com/?a=b'), 'coll': 'save', 'wb_prefix': '/save/', 'request_uri': '/save/_embed/example.com/?a=b'}
    >>> WbRequest.parse({'REQUEST_URI': '/2345/20101024101112im_/example.com/?b=c'})
    {'wb_url': ('replay', '20101024101112', 'im_', 'http://example.com/?b=c', '/20101024101112im_/http://example.com/?b=c'), 'coll': '2345', 'wb_prefix': '/2345/', 'request_uri': '/2345/20101024101112im_/example.com/?b=c'}
    >>> WbRequest.parse({'REQUEST_URI': '/2010/example.com'})
    {'wb_url': ('latest_replay', '', '', 'http://example.com', '/http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'}
    >>> WbRequest.parse({'REQUEST_URI': '../example.com'})
    {'wb_url': ('latest_replay', '', '', 'http://example.com', '/http://example.com'), 'coll': '', 'wb_prefix': '/', 'request_uri': '../example.com'}
    """
    def __init__(self, env, request_uri = '', wb_url = '', coll = ''):
        self.env = env
 #       if len(wb_url) == 0:
 #           wb_url = request_uri
        setattr(self, 'wb_url', wb_url)
        setattr(self, 'coll', coll)
        setattr(self, 'request_uri', request_uri)
        setattr(self, 'referrer', env.get('HTTP_REFERER'))
    @staticmethod
-    def prefix_request(env, prefix, request_uri = ''):
+    def parse(env, request_uri = ''):
        if not request_uri:
            request_uri = env.get('REQUEST_URI')
-        return WbRequest(env, request_uri, request_uri[len(prefix)-1:], coll = prefix[1:-1])
+
        parts = request_uri.split('/', 2)
        # Has coll prefix
        if len(parts) == 3:
            wb_prefix = '/' + parts[1] + '/'
            wb_url = '/' + parts[2]
            coll = parts[1]
        # No Coll Prefix
        elif len(parts) == 2:
            wb_prefix = '/'
            wb_url = '/' + parts[1]
            coll = ''
        else:
            wb_prefix = '/'
            wb_url = parts[0]
            coll = ''
        return WbRequest(env, request_uri, wb_prefix, wb_url, coll)
    def __init__(self, env, request_uri, wb_prefix, wb_url, coll):
        """
        Store the request state: the env, the request uri (taken from
        env['REQUEST_URI'] when not passed in), the prefix, the collection
        and the wb_url string parsed into an ArchivalUrl.
        """
        self.env = env
        self.referrer = env.get('HTTP_REFERER')

        self.request_uri = request_uri or env.get('REQUEST_URI')
        self.coll = coll
        self.wb_prefix = wb_prefix
        self.wb_url = ArchivalUrl(wb_url)

        # _is_ajax() reads self.env and self.referrer, so it must run after both are set
        self.is_ajax = self._is_ajax()
    def _is_ajax(self):
        value = self.env.get('HTTP_X_REQUESTED_WITH')
        if not value:
            return False
        if value.lower() == 'xmlhttprequest':
            return True
        if self.referrer and ('ajaxpipe' in self.env.get('QUERY_STRING')):
            return True
        return False
    def __repr__(self):
-        return "WbRequest(env, '" + (self.wb_url) + "', '" + (self.coll) + "')"
+        #return "WbRequest(env, '" + (self.wb_url) + "', '" + (self.coll) + "')"
        #return str(vars(self))
        varlist = vars(self)
        return str({k: varlist[k] for k in ('request_uri', 'wb_prefix', 'wb_url', 'coll')})
 class WbResponse:
--- a/pywb/wburlrewriter.py
+++ b/pywb/wburlrewriter.py
@ -0,0 +1,52 @@
 import copy
 import urlparse
 from wbarchivalurl import ArchivalUrl
 class ArchivalUrlRewriter:
    """
    Rewrite urls relative to an archival url (wburl_str) so that the
    result stays under the given prefix.

    >>> test_rewrite('other.html', '/20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
    'https://web.archive.org/web/20131010/http://example.com/path/other.html'
    >>> test_rewrite('./other.html', '/20130907*/http://example.com/path/page.html', '/coll/')
    '/coll/20130907*/http://example.com/path/other.html'
    >>> test_rewrite('../other.html', '/20131112im_/http://example.com/path/page.html', '/coll/')
    '/coll/20131112im_/http://example.com/other.html'
    >>> test_rewrite('../../other.html', '/*/http://example.com/index.html', 'localhost:8080/')
    'localhost:8080/*/http://example.com/other.html'
    >>> test_rewrite('../../other.html', '/2020/http://example.com/index.html', '/')
    '/2020/http://example.com/other.html'
    >>> test_rewrite('../../other.html', '/2020/http://example.com/index.html', '')
    '/2020/http://example.com/other.html'
    """

    def __init__(self, wburl_str, prefix):
        self.wburl_str = wburl_str
        # wburl_str supplies the separating '/', so drop one trailing slash from the prefix
        self.prefix = prefix[:-1] if prefix.endswith('/') else prefix

    def rewrite(self, rel_url):
        """Return rel_url resolved against the archival url, with the prefix prepended."""
        if '../' not in rel_url:
            # No parent-dir references: join against the full prefixed url directly
            return urlparse.urljoin(self.prefix + self.wburl_str, rel_url)

        # Parent-dir references are resolved against the original url only,
        # and any '../' left over after the join is removed
        wburl = ArchivalUrl(self.wburl_str)
        joined_url = urlparse.urljoin(wburl.url, rel_url)
        wburl.url = joined_url.replace('../', '')
        return self.prefix + str(wburl)
 if __name__ == "__main__":
    import doctest

    # Helper used by the ArchivalUrlRewriter doctests: rewrites rel_url
    # against base_url under the given prefix.
    def test_rewrite(rel_url, base_url, prefix):
        return ArchivalUrlRewriter(base_url, prefix).rewrite(rel_url)

    doctest.testmod()