From c8d2271e8a748df1f137fe5872252f447287b47a Mon Sep 17 00:00:00 2001
From: Ilya Kreymer <ilya@archive.org>
Date: Wed, 18 Dec 2013 18:52:52 -0800
Subject: [PATCH] archiveurl: add support for url_query, format modifier for
 more unit tests archivalrouter: flesh out router separately indexreader:
 RemoteCDXServer reader unit tests for req/resp wbapp -- cdx output for query,
 urlquery, replay and latest_replay!

---
 pywb/archivalrouter.py    | 28 +++++++++++++++
 pywb/archiveurl.py        | 38 ++++++++++++++++----
 pywb/indexreader.py       | 74 +++++++++++++++++++++++++++++++++++++++
 pywb/wbapp.py             | 62 ++++++++++++++++++++------------
 pywb/wbrequestresponse.py | 49 ++++++++++++++++++++++----
 5 files changed, 215 insertions(+), 36 deletions(-)
 create mode 100644 pywb/archivalrouter.py
 create mode 100644 pywb/indexreader.py

diff --git a/pywb/archivalrouter.py b/pywb/archivalrouter.py
new file mode 100644
index 00000000..7f885f66
--- /dev/null
+++ b/pywb/archivalrouter.py
@@ -0,0 +1,28 @@
+from refer_redirect import ReferRedirect
+from wbrequestresponse import WbRequest, WbResponse
+
+class ArchivalRequestRouter:
+    def __init__(self, mappings, hostpaths=None):
+        self.mappings = mappings                  # path prefix -> handler exposing run(wbrequest)
+        self.fallback = ReferRedirect(hostpaths)  # used when no prefix matches
+
+    def parse_request(self, env):
+        request_uri = env['REQUEST_URI']
+        # Longest prefix first, so overlapping mappings resolve deterministically.
+        for key in sorted(self.mappings, key = len, reverse = True):
+            if request_uri.startswith(key):
+                return self.mappings[key], WbRequest.prefix_request(env, key, request_uri)
+        # No match: pass request_uri explicitly, WbRequest does not read it from env.
+        return self.fallback, WbRequest(env, request_uri)
+
+    def handle_request(self, env):
+        handler, wbrequest = self.parse_request(env)
+        return handler.run(wbrequest)
+
+    def handle_exception(self, env, exc):
+        return WbResponse.text_response('Error: ' + str(exc), status = '400 Bad Request')
+
+    def handle_not_found(self, env):
+        return WbResponse.text_response('Not Found: ' + env['REQUEST_URI'], status = '404 Not Found')
+
+
diff --git a/pywb/archiveurl.py b/pywb/archiveurl.py
index 626df774..5cdd1fe9 100644
--- a/pywb/archiveurl.py
+++ b/pywb/archiveurl.py
@@ -29,6 +29,15 @@ class archiveurl:
     >>> repr(archiveurl('/*/http://example.com/abc?def=a'))
     "('query', '', '', 'http://example.com/abc?def=a', '/*/http://example.com/abc?def=a')"
 
+    >>> repr(archiveurl('/*/http://example.com/abc?def=a*'))
+    "('url_query', '', '', 'http://example.com/abc?def=a', '/*/http://example.com/abc?def=a*')"
+
+    >>> repr(archiveurl('/json/*/http://example.com/abc?def=a'))
+    "('query', '', 'json', 'http://example.com/abc?def=a', '/json/*/http://example.com/abc?def=a')"
+
+    >>> repr(archiveurl('/timemap-link/2011*/http://example.com/abc?def=a'))
+    "('query', '2011', 'timemap-link', 'http://example.com/abc?def=a', '/timemap-link/2011*/http://example.com/abc?def=a')"
+
 
     # Error Urls
     # ======================
@@ -47,10 +56,11 @@ class archiveurl:
 
     # Regexs
     # ======================
-    QUERY_REGEX = re.compile('^/(\d*)\*/(.*)$')
-    REPLAY_REGEX = re.compile('^/(\d*)([a-z]{2}_)?/?(.*)$')
+    QUERY_REGEX = re.compile('^/?([\w\-:]+)?/(\d*)\*/(.*)$')
+    REPLAY_REGEX = re.compile('^/(\d*)([a-z]+_)?/?(.*)$')
 
     QUERY = 'query'
+    URL_QUERY = 'url_query'
     REPLAY = 'replay'
     LATEST_REPLAY = 'latest_replay'
 
@@ -88,9 +98,14 @@ class archiveurl:
 
         res = query.groups('')
 
-        self.timestamp = res[0]
-        self.url = res[1]
-        self.type = archiveurl.QUERY
+        self.mod = res[0]
+        self.timestamp = res[1]
+        self.url = res[2]
+        if self.url.endswith('*'):
+            self.type = archiveurl.URL_QUERY
+            self.url = self.url[:-1]
+        else:
+            self.type = archiveurl.QUERY
         return True
 
     # Match replay regex
@@ -115,8 +130,17 @@ class archiveurl:
     # Str Representation
     # ====================
     def __str__(self):
-        if self.type == archiveurl.QUERY:
-            return "/*/" + self.url
+        if self.type == archiveurl.QUERY or self.type == archiveurl.URL_QUERY:
+            tsmod = "/"
+            if self.mod:
+                tsmod += self.mod + "/"
+            if self.timestamp:
+                tsmod += self.timestamp
+
+            tsmod += "*/" + self.url
+            if self.type == archiveurl.URL_QUERY:
+                tsmod += "*"
+            return tsmod
         else:
             tsmod = self.timestamp + self.mod
             if len(tsmod) > 0:
diff --git a/pywb/indexreader.py b/pywb/indexreader.py
new file mode 100644
index 00000000..4ad8acb5
--- /dev/null
+++ b/pywb/indexreader.py
@@ -0,0 +1,74 @@
+import urllib
+import urllib2
+
+class RemoteCDXServer:
+    """Client that queries the CDX server at serverUrl over HTTP.
+    >>> x = cdxserver.load('example.com', parse_cdx = True, limit = '2')
+    >>> pprint(vars(x[0]))
+    {'digest': 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA',
+     'filename': 'DJ_crawl2.20020401123359-c/DJ_crawl3.20020120141301.arc.gz',
+     'length': '1792',
+     'mimetype': 'text/html',
+     'offset': '49482198',
+     'original': 'http://example.com:80/',
+     'redirect': '-',
+     'robotflags': '-',
+     'statuscode': '200',
+     'timestamp': '20020120142510',
+     'urlkey': 'com,example)/'}
+    """
+
+    def __init__(self, serverUrl):
+        self.serverUrl = serverUrl
+
+    def load(self, url, params = None, parse_cdx = False, **kwvalues):
+        # url is required and always set explicitly. Work on a copy so neither the
+        # caller's dict nor a shared default picks up 'url' or the kwvalues overrides.
+        params = dict(params or {}, url = url)
+        params.update(**kwvalues)
+        urlparams = urllib.urlencode(params)
+        request = urllib2.Request(self.serverUrl, urlparams)  # data given, so urllib2 sends a POST
+        response = urllib2.urlopen(request)
+        # parse_cdx: one CDXCaptureResult per response line; otherwise the raw response.
+        if parse_cdx:
+            return map(CDXCaptureResult, response)
+        else:
+            return response
+
+class InvalidCDXException(Exception):
+    """Raised when a CDX line matches none of the known field layouts."""
+
+class CDXCaptureResult:
+    # Known CDX field layouts; a line is matched to one by its number of fields.
+    CDX_FORMATS = [["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],
+                   ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename"]]
+
+    def __init__(self, cdxline):
+        # Split the space-separated line and expose each value as an attribute
+        # named after the corresponding entry of the matching layout.
+        values = cdxline.rstrip().split(' ')
+        count = len(values)
+
+        matching = [fmt for fmt in CDXCaptureResult.CDX_FORMATS if len(fmt) == count]
+        if not matching:
+            raise InvalidCDXException('unknown %d-field cdx format' % count)
+
+        # The last layout of that length wins, as with a scan over CDX_FORMATS.
+        for name, value in zip(matching[-1], values):
+            setattr(self, name, value)
+
+    def __repr__(self):
+        return str(vars(self))
+
+
+
+# Testing
+# Running this module as a script executes the doctests above.
+
+if __name__ == "__main__":
+    import doctest
+    from pprint import pprint
+    # pprint and cdxserver are module globals used by the RemoteCDXServer doctest.
+    cdxserver = RemoteCDXServer('http://web.archive.org/cdx/search/cdx')
+    # NOTE: the doctest queries the live server above, so it needs network access.
+    doctest.testmod()
diff --git a/pywb/wbapp.py b/pywb/wbapp.py
index f3e464a3..b5139585 100644
--- a/pywb/wbapp.py
+++ b/pywb/wbapp.py
@@ -1,43 +1,59 @@
-from wbrequestresponse import WbRequest, WbResponse
-from refer_redirect import ReferRedirect
+from wbrequestresponse import WbResponse
 from archiveurl import archiveurl
+from archivalrouter import ArchivalRequestRouter
+import indexreader
+import json
 
 class WBHandler:
     def run(self, wbrequest):
         wburl = archiveurl(wbrequest.wb_url)
         return WbResponse.text_response(repr(wburl))
 
-class ArchivalParser:
-    def __init__(self, mappings, hostpaths=None):
-        self.mappings = mappings
-        self.fallback = ReferRedirect(hostpaths)
+class QueryHandler:
+    def __init__(self):
+        self.cdxserver = indexreader.RemoteCDXServer('http://web.archive.org/cdx/search/cdx')
 
-    def find_handler(self, env):
-        request_uri = env['REQUEST_URI']
+    @staticmethod
+    def get_query_params(wburl):
+        """Return the CDX query parameters for wburl.type (KeyError if the type is unknown)."""
+        return {
 
-        for key, value in self.mappings.iteritems():
-            if request_uri.startswith(key):
-                env['WB_URL'] = request_uri[len(key)-1:]
-                env['WB_COLL'] = key[1:-1]
-                #print "Found: " + str(value) + " for " + key
-                return value
+            archiveurl.QUERY:
+                {'collapseTime': '10', 'filter': '!statuscode:(500|502|504)', 'limit': '150000'},
 
-        return self.fallback
+            archiveurl.URL_QUERY:
+                {'collapse': 'urlkey', 'matchType': 'prefix', 'showGroupCount': True, 'showUniqCount': True, 'lastSkipTimestamp': True, 'limit': '100',
+                 'fl': 'urlkey,original,timestamp,endtimestamp,groupcount,uniqcount',
+                },
 
-    def handle_request(self, env):
-        handler = self.find_handler(env)
-        return handler.run(WbRequest(env))
+            archiveurl.REPLAY:
+                {'sort': 'closest', 'filter': '!statuscode:(500|502|504)', 'limit': '10', 'closest': wburl.timestamp, 'resolveRevisits': True},
 
-    def handle_exception(self, env, exc):
-        return WbResponse.text_response('Error: ' + str(exc), status = '400 Bad Request')
+            archiveurl.LATEST_REPLAY:
+                {'sort': 'reverse', 'filter': 'statuscode:[23]..', 'limit': '1', 'resolveRevisits': True}
 
-    def handle_not_found(self, env):
-        return WbResponse.text_response('Not Found: ' + env['REQUEST_URI'], status = '404 Not Found')
+        }[wburl.type]
 
 
+    def run(self, wbrequest):
+        wburl = archiveurl(wbrequest.wb_url)
+        # Look up the CDX parameters for this request type and stream the server's reply.
+        params = QueryHandler.get_query_params(wburl)
+
+        #parse_cdx = (wburl.mod == 'json')
+        cdxlines = self.cdxserver.load(wburl.url, params)
+
+        return WbResponse.text_stream(cdxlines)
+
+        #if parse_cdx:
+        #    text = str("\n".join(map(str, cdxlines)))
+        #    text = json.dumps(cdxlines, default=lambda o: o.__dict__)
+        #else:
+        #    text = cdxlines
+
 
 ## ===========
-parser = ArchivalParser({'/web/': WBHandler()}, hostpaths = ['http://localhost:9090/'])
+parser = ArchivalRequestRouter({'/web/': QueryHandler()}, hostpaths = ['http://localhost:9090/'])
 ## ===========
 
 
diff --git a/pywb/wbrequestresponse.py b/pywb/wbrequestresponse.py
index 20321c47..d9189dd8 100644
--- a/pywb/wbrequestresponse.py
+++ b/pywb/wbrequestresponse.py
@@ -2,24 +2,56 @@
 #WB Request and Response
 
 class WbRequest:
-    def __init__(self, env):
-        self.env = env
-        self.wb_url = env.get('WB_URL')
-        self.coll = env.get('WB_COLL')
+    """Request wrapper holding env, request_uri, wb_url, coll and referrer.
+    >>> WbRequest.prefix_request({'REQUEST_URI': '/save/_embed/example.com/?a=b'}, '/save/')
+    WbRequest(env, '/_embed/example.com/?a=b', 'save')
+    """
 
-        setattr(self, 'request_uri', env.get('REQUEST_URI'))
+    def __init__(self, env, request_uri = '', wb_url = '', coll = ''):
+        self.env = env
+
+ #       if len(wb_url) == 0:
+ #           wb_url = request_uri
+
+        self.wb_url = wb_url
+        self.coll = coll
+
+        self.request_uri = request_uri
         setattr(self, 'referrer', env.get('HTTP_REFERER'))
+
+
+    @staticmethod
+    def prefix_request(env, prefix, request_uri = ''):
+        uri = request_uri or env.get('REQUEST_URI')
+        # wb_url starts at the prefix's last character; coll drops its first and last.
+        return WbRequest(env, uri, uri[len(prefix)-1:], coll = prefix[1:-1])
+
     def __repr__(self):
-        return self.coll + " " + self.wb_url
+        return "WbRequest(env, '" + self.wb_url + "', '" + self.coll + "')"
 
 
 class WbResponse:
+    """
+    >>> WbResponse.text_response('Test')
+    {'status': '200 OK', 'body': ['Test'], 'headersList': [('Content-Type', 'text/plain')]}
+
+    >>> WbResponse.text_stream(['Test', 'Another'], '404')
+    {'status': '404', 'body': ['Test', 'Another'], 'headersList': [('Content-Type', 'text/plain')]}
+
+    >>> WbResponse.redir_response('http://example.com/otherfile')
+    {'status': '302 Redirect', 'body': [], 'headersList': [('Location', 'http://example.com/otherfile')]}
+
+    """
+
     def __init__(self, status, value = [], headersList = []):
         self.status = status
         self.body = value
         self.headersList = headersList
 
+    @staticmethod
+    def text_stream(text, status = '200 OK'):
+        return WbResponse(status, value = text, headersList = [('Content-Type', 'text/plain')])
+
     @staticmethod
     def text_response(text, status = '200 OK'):
         return WbResponse(status, value = [text], headersList = [('Content-Type', 'text/plain')])
@@ -42,7 +74,12 @@ class WbResponse:
         start_response(self.status, self.headersList)
         return self.body
 
+    def __repr__(self):
+        return str(vars(self))
 
 
 
+if __name__ == "__main__":
+    import doctest
+    doctest.testmod()  # runs the doctest examples in this module's docstrings