Mirror of https://github.com/webrecorder/pywb.git (synced 2025-03-15 00:03:28 +01:00)
archiveurl: add support for url_query, format modifier for more unit tests
archivalrouter: flesh out router separately
indexreader: RemoteCDXServer reader
unit tests for req/resp
wbapp -- cdx output for query, urlquery, replay and latest_replay!
commit c8d2271e8a
parent 5d42cc0cac
pywb/archivalrouter.py (new file, 28 lines)
@@ -0,0 +1,28 @@
from refer_redirect import ReferRedirect
from wbrequestresponse import WbRequest, WbResponse


class ArchivalRequestRouter:
    def __init__(self, mappings, hostpaths=None):
        self.mappings = mappings
        self.fallback = ReferRedirect(hostpaths)

    def parse_request(self, env):
        request_uri = env['REQUEST_URI']

        for key, value in self.mappings.iteritems():
            if request_uri.startswith(key):
                return value, WbRequest.prefix_request(env, key, request_uri)

        return self.fallback, WbRequest(env)

    def handle_request(self, env):
        handler, wbrequest = self.parse_request(env)
        return handler.run(wbrequest)

    def handle_exception(self, env, exc):
        return WbResponse.text_response('Error: ' + str(exc), status = '400 Bad Request')

    def handle_not_found(self, env):
        return WbResponse.text_response('Not Found: ' + env['REQUEST_URI'], status = '404 Not Found')
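A small usage sketch (not part of the commit) of how this router resolves a collection prefix. It assumes the pywb modules are importable and borrows the '/web/' -> WBHandler mapping that wbapp.py sets up further below.

from archivalrouter import ArchivalRequestRouter
from wbapp import WBHandler

router = ArchivalRequestRouter({'/web/': WBHandler()},
                               hostpaths = ['http://localhost:9090/'])

# Any REQUEST_URI starting with a mapped prefix is routed to that handler;
# anything else falls through to the ReferRedirect fallback.
env = {'REQUEST_URI': '/web/*/http://example.com/'}
handler, wbrequest = router.parse_request(env)

print wbrequest.coll      # prints: web   (prefix with slashes stripped)
print wbrequest.wb_url    # prints: /*/http://example.com/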
pywb/archiveurl.py
@@ -29,6 +29,15 @@ class archiveurl:
     >>> repr(archiveurl('/*/http://example.com/abc?def=a'))
     "('query', '', '', 'http://example.com/abc?def=a', '/*/http://example.com/abc?def=a')"
+
+    >>> repr(archiveurl('/*/http://example.com/abc?def=a*'))
+    "('url_query', '', '', 'http://example.com/abc?def=a', '/*/http://example.com/abc?def=a*')"
+
+    >>> repr(archiveurl('/json/*/http://example.com/abc?def=a'))
+    "('query', '', 'json', 'http://example.com/abc?def=a', '/json/*/http://example.com/abc?def=a')"
+
+    >>> repr(archiveurl('/timemap-link/2011*/http://example.com/abc?def=a'))
+    "('query', '2011', 'timemap-link', 'http://example.com/abc?def=a', '/timemap-link/2011*/http://example.com/abc?def=a')"
 
     # Error Urls
     # ======================
@@ -47,10 +56,11 @@ class archiveurl:
 
     # Regexs
     # ======================
-    QUERY_REGEX = re.compile('^/(\d*)\*/(.*)$')
-    REPLAY_REGEX = re.compile('^/(\d*)([a-z]{2}_)?/?(.*)$')
+    QUERY_REGEX = re.compile('^/?([\w\-:]+)?/(\d*)\*/(.*)$')
+    REPLAY_REGEX = re.compile('^/(\d*)([a-z]+_)?/?(.*)$')
 
     QUERY = 'query'
+    URL_QUERY = 'url_query'
     REPLAY = 'replay'
     LATEST_REPLAY = 'latest_replay'
 
@@ -88,9 +98,14 @@ class archiveurl:
 
         res = query.groups('')
 
-        self.timestamp = res[0]
-        self.url = res[1]
-        self.type = archiveurl.QUERY
+        self.mod = res[0]
+        self.timestamp = res[1]
+        self.url = res[2]
+        if self.url.endswith('*'):
+            self.type = archiveurl.URL_QUERY
+            self.url = self.url[:-1]
+        else:
+            self.type = archiveurl.QUERY
         return True
 
     # Match replay regex
@@ -115,8 +130,17 @@ class archiveurl:
     # Str Representation
     # ====================
     def __str__(self):
-        if self.type == archiveurl.QUERY:
-            return "/*/" + self.url
+        if self.type == archiveurl.QUERY or self.type == archiveurl.URL_QUERY:
+            tsmod = "/"
+            if self.mod:
+                tsmod += self.mod + "/"
+            if self.timestamp:
+                tsmod += self.timestamp
+
+            tsmod += "*/" + self.url
+            if self.type == archiveurl.URL_QUERY:
+                tsmod += "*"
+            return tsmod
         else:
             tsmod = self.timestamp + self.mod
             if len(tsmod) > 0:
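A quick illustration (mine, not from the diff) of what the widened QUERY_REGEX and the reworked __str__ yield for the new cases; the expected output follows directly from the doctests and code above.

from archiveurl import archiveurl

u = archiveurl('/timemap-link/2011*/http://example.com/abc?def=a')
print u.type, u.mod, u.timestamp    # prints: query timemap-link 2011
print str(u)                        # prints: /timemap-link/2011*/http://example.com/abc?def=a

q = archiveurl('/*/http://example.com/abc?def=a*')
print q.type, q.url                 # prints: url_query http://example.com/abc?def=a
print str(q)                        # prints: /*/http://example.com/abc?def=a*  (trailing '*' restored)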
pywb/indexreader.py (new file, 74 lines)
@@ -0,0 +1,74 @@
import urllib
import urllib2


class RemoteCDXServer:
    """
    >>> x = cdxserver.load('example.com', parse_cdx = True, limit = '2')
    >>> pprint(vars(x[0]))
    {'digest': 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA',
     'filename': 'DJ_crawl2.20020401123359-c/DJ_crawl3.20020120141301.arc.gz',
     'length': '1792',
     'mimetype': 'text/html',
     'offset': '49482198',
     'original': 'http://example.com:80/',
     'redirect': '-',
     'robotflags': '-',
     'statuscode': '200',
     'timestamp': '20020120142510',
     'urlkey': 'com,example)/'}
    """

    def __init__(self, serverUrl):
        self.serverUrl = serverUrl

    def load(self, url, params = {}, parse_cdx = False, **kwvalues):
        #url is required, must be passed explicitly!
        params['url'] = url
        params.update(**kwvalues)

        urlparams = urllib.urlencode(params)
        request = urllib2.Request(self.serverUrl, urlparams)
        response = urllib2.urlopen(request)

        if parse_cdx:
            return map(CDXCaptureResult, response)
        else:
            return response


class InvalidCDXException(Exception):
    pass


class CDXCaptureResult:
    CDX_FORMATS = [["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],
                   ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename"]]

    def __init__(self, cdxline):
        cdxline = cdxline.rstrip()
        fields = cdxline.split(' ')

        cdxformat = None
        for i in CDXCaptureResult.CDX_FORMATS:
            if len(i) == len(fields):
                cdxformat = i

        if not cdxformat:
            raise InvalidCDXException('unknown %d-field cdx format' % len(fields))

        for header, field in zip(cdxformat, fields):
            setattr(self, header, field)

    def __repr__(self):
        return str(vars(self))


# Testing

if __name__ == "__main__":
    import doctest
    from pprint import pprint

    cdxserver = RemoteCDXServer('http://web.archive.org/cdx/search/cdx')

    doctest.testmod()
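A quick offline check (not in the commit) of CDXCaptureResult, reusing the sample capture from the doctest above so no request to the CDX server is needed.

from indexreader import CDXCaptureResult

# 11 space-separated fields -> matches the first entry in CDX_FORMATS
line = ('com,example)/ 20020120142510 http://example.com:80/ text/html 200 '
        'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA - - 1792 49482198 '
        'DJ_crawl2.20020401123359-c/DJ_crawl3.20020120141301.arc.gz')

capture = CDXCaptureResult(line)
print capture.timestamp, capture.statuscode, capture.mimetype
# prints: 20020120142510 200 text/html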
pywb/wbapp.py
@@ -1,43 +1,59 @@
-from wbrequestresponse import WbRequest, WbResponse
-from refer_redirect import ReferRedirect
+from wbrequestresponse import WbResponse
 from archiveurl import archiveurl
+from archivalrouter import ArchivalRequestRouter
+import indexreader
+import json
 
 class WBHandler:
     def run(self, wbrequest):
         wburl = archiveurl(wbrequest.wb_url)
         return WbResponse.text_response(repr(wburl))
 
-class ArchivalParser:
-    def __init__(self, mappings, hostpaths=None):
-        self.mappings = mappings
-        self.fallback = ReferRedirect(hostpaths)
-
-    def find_handler(self, env):
-        request_uri = env['REQUEST_URI']
-
-        for key, value in self.mappings.iteritems():
-            if request_uri.startswith(key):
-                env['WB_URL'] = request_uri[len(key)-1:]
-                env['WB_COLL'] = key[1:-1]
-                #print "Found: " + str(value) + " for " + key
-                return value
-
-        return self.fallback
-
-    def handle_request(self, env):
-        handler = self.find_handler(env)
-        return handler.run(WbRequest(env))
-
-    def handle_exception(self, env, exc):
-        return WbResponse.text_response('Error: ' + str(exc), status = '400 Bad Request')
-
-    def handle_not_found(self, env):
-        return WbResponse.text_response('Not Found: ' + env['REQUEST_URI'], status = '404 Not Found')
+class QueryHandler:
+    def __init__(self):
+        self.cdxserver = indexreader.RemoteCDXServer('http://web.archive.org/cdx/search/cdx')
+
+    @staticmethod
+    def get_query_params(wburl):
+        print wburl.type
+        return {
+            archiveurl.QUERY:
+                {'collapseTime': '10', 'filter': '!statuscode:(500|502|504)', 'limit': '150000'},
+
+            archiveurl.URL_QUERY:
+                {'collapse': 'urlkey', 'matchType': 'prefix', 'showGroupCount': True, 'showUniqCount': True, 'lastSkipTimestamp': True, 'limit': '100',
+                 'fl': 'urlkey,original,timestamp,endtimestamp,groupcount,uniqcount',
+                },
+
+            archiveurl.REPLAY:
+                {'sort': 'closest', 'filter': '!statuscode:(500|502|504)', 'limit': '10', 'closest': wburl.timestamp, 'resolveRevisits': True},
+
+            archiveurl.LATEST_REPLAY:
+                {'sort': 'reverse', 'filter': 'statuscode:[23]..', 'limit': '1', 'resolveRevisits': True}
+
+        }[wburl.type]
+
+    def run(self, wbrequest):
+        wburl = archiveurl(wbrequest.wb_url)
+
+        params = QueryHandler.get_query_params(wburl)
+
+        #parse_cdx = (wburl.mod == 'json')
+        cdxlines = self.cdxserver.load(wburl.url, params)
+
+        return WbResponse.text_stream(cdxlines)
+
+        #if parse_cdx:
+        #    text = str("\n".join(map(str, cdxlines)))
+        #    text = json.dumps(cdxlines, default=lambda o: o.__dict__)
+        #else:
+        #    text = cdxlines
 
 ## ===========
-parser = ArchivalParser({'/web/': WBHandler()}, hostpaths = ['http://localhost:9090/'])
+parser = ArchivalRequestRouter({'/web/': QueryHandler()}, hostpaths = ['http://localhost:9090/'])
 ## ===========
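A short sketch (not part of the commit, assuming the modules above are importable) showing which CDX parameter set get_query_params picks for the two query types; note that it also prints the type, as in the source.

from archiveurl import archiveurl
from wbapp import QueryHandler

# A trailing '*' on the url makes it a url_query (prefix) lookup:
params = QueryHandler.get_query_params(archiveurl('/*/http://example.com/*'))
# -> {'matchType': 'prefix', 'collapse': 'urlkey', 'limit': '100', ...}

# Without it, a plain '/*/' url is an ordinary capture query:
params = QueryHandler.get_query_params(archiveurl('/*/http://example.com/'))
# -> {'collapseTime': '10', 'filter': '!statuscode:(500|502|504)', 'limit': '150000'}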
pywb/wbrequestresponse.py
@@ -2,24 +2,56 @@
 #WB Request and Response
 
 class WbRequest:
-    def __init__(self, env):
-        self.env = env
-        self.wb_url = env.get('WB_URL')
-        self.coll = env.get('WB_COLL')
-
-        setattr(self, 'request_uri', env.get('REQUEST_URI'))
+    """
+    >>> WbRequest.prefix_request({'REQUEST_URI': '/save/_embed/example.com/?a=b'}, '/save/')
+    WbRequest(env, '/_embed/example.com/?a=b', 'save')
+    """
+
+    def __init__(self, env, request_uri = '', wb_url = '', coll = ''):
+        self.env = env
+
+        # if len(wb_url) == 0:
+        #    wb_url = request_uri
+
+        setattr(self, 'wb_url', wb_url)
+        setattr(self, 'coll', coll)
+
+        setattr(self, 'request_uri', request_uri)
         setattr(self, 'referrer', env.get('HTTP_REFERER'))
 
+    @staticmethod
+    def prefix_request(env, prefix, request_uri = ''):
+        if not request_uri:
+            request_uri = env.get('REQUEST_URI')
+        return WbRequest(env, request_uri, request_uri[len(prefix)-1:], coll = prefix[1:-1])
+
     def __repr__(self):
-        return self.coll + " " + self.wb_url
+        return "WbRequest(env, '" + (self.wb_url) + "', '" + (self.coll) + "')"
 
 
 class WbResponse:
+    """
+    >>> WbResponse.text_response('Test')
+    {'status': '200 OK', 'body': ['Test'], 'headersList': [('Content-Type', 'text/plain')]}
+
+    >>> WbResponse.text_stream(['Test', 'Another'], '404')
+    {'status': '404', 'body': ['Test', 'Another'], 'headersList': [('Content-Type', 'text/plain')]}
+
+    >>> WbResponse.redir_response('http://example.com/otherfile')
+    {'status': '302 Redirect', 'body': [], 'headersList': [('Location', 'http://example.com/otherfile')]}
+    """
+
     def __init__(self, status, value = [], headersList = []):
         self.status = status
         self.body = value
         self.headersList = headersList
 
+    @staticmethod
+    def text_stream(text, status = '200 OK'):
+        return WbResponse(status, value = text, headersList = [('Content-Type', 'text/plain')])
+
     @staticmethod
     def text_response(text, status = '200 OK'):
         return WbResponse(status, value = [text], headersList = [('Content-Type', 'text/plain')])
@@ -42,7 +74,12 @@ class WbResponse:
         start_response(self.status, self.headersList)
         return self.body
 
+    def __repr__(self):
+        return str(vars(self))
+
+
+if __name__ == "__main__":
+    import doctest
+    doctest.testmod()
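Finally, a sketch (mine, not in this diff) of how these pieces are presumably meant to be wired into a WSGI app: the router picks a handler, the handler returns a WbResponse, and the WbResponse carries the status, headers and body that WSGI needs. The try/except wiring and the not-found path are assumptions; the actual entry point is not part of this commit, and the WbResponse method wrapping start_response is only partially visible above, so its two lines are reproduced here directly.

from wbapp import parser

def application(env, start_response):
    try:
        wbresponse = parser.handle_request(env)
        if not wbresponse:
            wbresponse = parser.handle_not_found(env)
    except Exception, exc:
        wbresponse = parser.handle_exception(env, exc)

    # Same two steps as the WbResponse method shown in the hunk above
    start_response(wbresponse.status, wbresponse.headersList)
    return wbresponse.body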