From c8d2271e8a748df1f137fe5872252f447287b47a Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 18 Dec 2013 18:52:52 -0800 Subject: [PATCH] archiveurl: add support for url_query, format modifier for more unit tests archivalrouter: flesh out router separately indexreader: RemoteCDXServer reader unit tests for req/resp wbapp -- cdx output for query, urlquery, replay and latest_replay! --- pywb/archivalrouter.py | 28 +++++++++++++++ pywb/archiveurl.py | 38 ++++++++++++++++---- pywb/indexreader.py | 74 +++++++++++++++++++++++++++++++++++++++ pywb/wbapp.py | 62 ++++++++++++++++++++------------ pywb/wbrequestresponse.py | 49 ++++++++++++++++++++++---- 5 files changed, 215 insertions(+), 36 deletions(-) create mode 100644 pywb/archivalrouter.py create mode 100644 pywb/indexreader.py diff --git a/pywb/archivalrouter.py b/pywb/archivalrouter.py new file mode 100644 index 00000000..7f885f66 --- /dev/null +++ b/pywb/archivalrouter.py @@ -0,0 +1,28 @@ +from refer_redirect import ReferRedirect +from wbrequestresponse import WbRequest, WbResponse + +class ArchivalRequestRouter: + def __init__(self, mappings, hostpaths=None): + self.mappings = mappings + self.fallback = ReferRedirect(hostpaths) + + def parse_request(self, env): + request_uri = env['REQUEST_URI'] + + for key, value in self.mappings.iteritems(): + if request_uri.startswith(key): + return value, WbRequest.prefix_request(env, key, request_uri) + + return self.fallback, WbRequest(env) + + def handle_request(self, env): + handler, wbrequest = self.parse_request(env) + return handler.run(wbrequest) + + def handle_exception(self, env, exc): + return WbResponse.text_response('Error: ' + str(exc), status = '400 Bad Request') + + def handle_not_found(self, env): + return WbResponse.text_response('Not Found: ' + env['REQUEST_URI'], status = '404 Not Found') + + diff --git a/pywb/archiveurl.py b/pywb/archiveurl.py index 626df774..5cdd1fe9 100644 --- a/pywb/archiveurl.py +++ b/pywb/archiveurl.py @@ -29,6 +29,15 @@ class 
archiveurl: >>> repr(archiveurl('/*/http://example.com/abc?def=a')) "('query', '', '', 'http://example.com/abc?def=a', '/*/http://example.com/abc?def=a')" + >>> repr(archiveurl('/*/http://example.com/abc?def=a*')) + "('url_query', '', '', 'http://example.com/abc?def=a', '/*/http://example.com/abc?def=a*')" + + >>> repr(archiveurl('/json/*/http://example.com/abc?def=a')) + "('query', '', 'json', 'http://example.com/abc?def=a', '/json/*/http://example.com/abc?def=a')" + + >>> repr(archiveurl('/timemap-link/2011*/http://example.com/abc?def=a')) + "('query', '2011', 'timemap-link', 'http://example.com/abc?def=a', '/timemap-link/2011*/http://example.com/abc?def=a')" + # Error Urls # ====================== @@ -47,10 +56,11 @@ class archiveurl: # Regexs # ====================== - QUERY_REGEX = re.compile('^/(\d*)\*/(.*)$') - REPLAY_REGEX = re.compile('^/(\d*)([a-z]{2}_)?/?(.*)$') + QUERY_REGEX = re.compile('^/?([\w\-:]+)?/(\d*)\*/(.*)$') + REPLAY_REGEX = re.compile('^/(\d*)([a-z]+_)?/?(.*)$') QUERY = 'query' + URL_QUERY = 'url_query' REPLAY = 'replay' LATEST_REPLAY = 'latest_replay' @@ -88,9 +98,14 @@ class archiveurl: res = query.groups('') - self.timestamp = res[0] - self.url = res[1] - self.type = archiveurl.QUERY + self.mod = res[0] + self.timestamp = res[1] + self.url = res[2] + if self.url.endswith('*'): + self.type = archiveurl.URL_QUERY + self.url = self.url[:-1] + else: + self.type = archiveurl.QUERY return True # Match replay regex @@ -115,8 +130,17 @@ class archiveurl: # Str Representation # ==================== def __str__(self): - if self.type == archiveurl.QUERY: - return "/*/" + self.url + if self.type == archiveurl.QUERY or self.type == archiveurl.URL_QUERY: + tsmod = "/" + if self.mod: + tsmod += self.mod + "/" + if self.timestamp: + tsmod += self.timestamp + + tsmod += "*/" + self.url + if self.type == archiveurl.URL_QUERY: + tsmod += "*" + return tsmod else: tsmod = self.timestamp + self.mod if len(tsmod) > 0: diff --git a/pywb/indexreader.py 
b/pywb/indexreader.py new file mode 100644 index 00000000..4ad8acb5 --- /dev/null +++ b/pywb/indexreader.py @@ -0,0 +1,74 @@ +import urllib +import urllib2 + +class RemoteCDXServer: + """ + >>> x = cdxserver.load('example.com', parse_cdx = True, limit = '2') + >>> pprint(vars(x[0])) + {'digest': 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA', + 'filename': 'DJ_crawl2.20020401123359-c/DJ_crawl3.20020120141301.arc.gz', + 'length': '1792', + 'mimetype': 'text/html', + 'offset': '49482198', + 'original': 'http://example.com:80/', + 'redirect': '-', + 'robotflags': '-', + 'statuscode': '200', + 'timestamp': '20020120142510', + 'urlkey': 'com,example)/'} + """ + + def __init__(self, serverUrl): + self.serverUrl = serverUrl + + def load(self, url, params = {}, parse_cdx = False, **kwvalues): + #url is required, must be passed explicitly! + params['url'] = url + params.update(**kwvalues) + + urlparams = urllib.urlencode(params) + request = urllib2.Request(self.serverUrl, urlparams) + response = urllib2.urlopen(request) + + if parse_cdx: + return map(CDXCaptureResult, response) + else: + return response + +class InvalidCDXException(Exception): + pass + +class CDXCaptureResult: + CDX_FORMATS = [["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"], + ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename"]] + + def __init__(self, cdxline): + cdxline = cdxline.rstrip() + fields = cdxline.split(' ') + + cdxformat = None + for i in CDXCaptureResult.CDX_FORMATS: + if len(i) == len(fields): + cdxformat = i + + if not cdxformat: + raise InvalidCDXException('unknown %d-field cdx format' % len(fields)) + + for header, field in zip(cdxformat, fields): + setattr(self, header, field) + + def __repr__(self): + return str(vars(self)) + + + +# Testing + + +if __name__ == "__main__": + import doctest + from pprint import pprint + + cdxserver = 
RemoteCDXServer('http://web.archive.org/cdx/search/cdx') + + doctest.testmod() diff --git a/pywb/wbapp.py b/pywb/wbapp.py index f3e464a3..b5139585 100644 --- a/pywb/wbapp.py +++ b/pywb/wbapp.py @@ -1,43 +1,59 @@ -from wbrequestresponse import WbRequest, WbResponse -from refer_redirect import ReferRedirect +from wbrequestresponse import WbResponse from archiveurl import archiveurl +from archivalrouter import ArchivalRequestRouter +import indexreader +import json class WBHandler: def run(self, wbrequest): wburl = archiveurl(wbrequest.wb_url) return WbResponse.text_response(repr(wburl)) -class ArchivalParser: - def __init__(self, mappings, hostpaths=None): - self.mappings = mappings - self.fallback = ReferRedirect(hostpaths) +class QueryHandler: + def __init__(self): + self.cdxserver = indexreader.RemoteCDXServer('http://web.archive.org/cdx/search/cdx') - def find_handler(self, env): - request_uri = env['REQUEST_URI'] + @staticmethod + def get_query_params(wburl): + print wburl.type + return { - for key, value in self.mappings.iteritems(): - if request_uri.startswith(key): - env['WB_URL'] = request_uri[len(key)-1:] - env['WB_COLL'] = key[1:-1] - #print "Found: " + str(value) + " for " + key - return value + archiveurl.QUERY: + {'collapseTime': '10', 'filter': '!statuscode:(500|502|504)', 'limit': '150000'}, - return self.fallback + archiveurl.URL_QUERY: + {'collapse': 'urlkey', 'matchType': 'prefix', 'showGroupCount': True, 'showUniqCount': True, 'lastSkipTimestamp': True, 'limit': '100', + 'fl': 'urlkey,original,timestamp,endtimestamp,groupcount,uniqcount', + }, - def handle_request(self, env): - handler = self.find_handler(env) - return handler.run(WbRequest(env)) + archiveurl.REPLAY: + {'sort': 'closest', 'filter': '!statuscode:(500|502|504)', 'limit': '10', 'closest': wburl.timestamp, 'resolveRevisits': True}, - def handle_exception(self, env, exc): - return WbResponse.text_response('Error: ' + str(exc), status = '400 Bad Request') + archiveurl.LATEST_REPLAY: + 
{'sort': 'reverse', 'filter': 'statuscode:[23]..', 'limit': '1', 'resolveRevisits': True} - def handle_not_found(self, env): - return WbResponse.text_response('Not Found: ' + env['REQUEST_URI'], status = '404 Not Found') + }[wburl.type] + def run(self, wbrequest): + wburl = archiveurl(wbrequest.wb_url) + + params = QueryHandler.get_query_params(wburl) + + #parse_cdx = (wburl.mod == 'json') + cdxlines = self.cdxserver.load(wburl.url, params) + + return WbResponse.text_stream(cdxlines) + + #if parse_cdx: + # text = str("\n".join(map(str, cdxlines))) + # text = json.dumps(cdxlines, default=lambda o: o.__dict__) + #else: + # text = cdxlines + ## =========== -parser = ArchivalParser({'/web/': WBHandler()}, hostpaths = ['http://localhost:9090/']) +parser = ArchivalRequestRouter({'/web/': QueryHandler()}, hostpaths = ['http://localhost:9090/']) ## =========== diff --git a/pywb/wbrequestresponse.py b/pywb/wbrequestresponse.py index 20321c47..d9189dd8 100644 --- a/pywb/wbrequestresponse.py +++ b/pywb/wbrequestresponse.py @@ -2,24 +2,56 @@ #WB Request and Response class WbRequest: - def __init__(self, env): - self.env = env - self.wb_url = env.get('WB_URL') - self.coll = env.get('WB_COLL') + """ + >>> WbRequest.prefix_request({'REQUEST_URI': '/save/_embed/example.com/?a=b'}, '/save/') + WbRequest(env, '/_embed/example.com/?a=b', 'save') + """ - setattr(self, 'request_uri', env.get('REQUEST_URI')) + def __init__(self, env, request_uri = '', wb_url = '', coll = ''): + self.env = env + + # if len(wb_url) == 0: + # wb_url = request_uri + + setattr(self, 'wb_url', wb_url) + setattr(self, 'coll', coll) + + setattr(self, 'request_uri', request_uri) setattr(self, 'referrer', env.get('HTTP_REFERER')) + + @staticmethod + def prefix_request(env, prefix, request_uri = ''): + if not request_uri: + request_uri = env.get('REQUEST_URI') + return WbRequest(env, request_uri, request_uri[len(prefix)-1:], coll = prefix[1:-1]) + def __repr__(self): - return self.coll + " " + self.wb_url + return 
"WbRequest(env, '" + (self.wb_url) + "', '" + (self.coll) + "')" class WbResponse: + """ + >>> WbResponse.text_response('Test') + {'status': '200 OK', 'body': ['Test'], 'headersList': [('Content-Type', 'text/plain')]} + + >>> WbResponse.text_stream(['Test', 'Another'], '404') + {'status': '404', 'body': ['Test', 'Another'], 'headersList': [('Content-Type', 'text/plain')]} + + >>> WbResponse.redir_response('http://example.com/otherfile') + {'status': '302 Redirect', 'body': [], 'headersList': [('Location', 'http://example.com/otherfile')]} + + """ + def __init__(self, status, value = [], headersList = []): self.status = status self.body = value self.headersList = headersList + @staticmethod + def text_stream(text, status = '200 OK'): + return WbResponse(status, value = text, headersList = [('Content-Type', 'text/plain')]) + @staticmethod def text_response(text, status = '200 OK'): return WbResponse(status, value = [text], headersList = [('Content-Type', 'text/plain')]) @@ -42,7 +74,12 @@ class WbResponse: start_response(self.status, self.headersList) return self.body + def __repr__(self): + return str(vars(self)) +if __name__ == "__main__": + import doctest + doctest.testmod()