mirror of https://github.com/webrecorder/pywb.git

errors and timeouts reported back to the user via ResErrors header

add new /index, /resource access point system
Ilya Kreymer 2016-03-02 18:13:13 -08:00
parent 1f3763d02c
commit 65e969a492
10 changed files with 333 additions and 143 deletions
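
For orientation, here is a rough sketch of how a client might exercise the new access points introduced below. The /index and /resource paths and the ResErrors header come from this commit; the host/port and the use of the requests library are illustrative assumptions, not part of the change:

import json
import requests  # assumed client library; the server address below is hypothetical

# Query the new index access point for a URL.
resp = requests.get('http://localhost:8080/many/index',
                    params={'url': 'http://iana.org/', 'output': 'json'})

# Sources that failed or timed out no longer abort the request;
# they are reported back in the ResErrors response header instead.
errs = json.loads(resp.headers.get('ResErrors', '{}'))
for source, err in errs.items():
    print('failed:', source, err)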

View File

@@ -13,9 +13,10 @@ from pywb.cdx.query import CDXQuery
 from heapq import merge
 from collections import deque
+from itertools import chain
 
 from rezag.indexsource import FileIndexSource
-from pywb.utils.wbexception import NotFoundException
+from pywb.utils.wbexception import NotFoundException, WbException
 
 import six
 import glob

@@ -29,13 +30,10 @@ class BaseAggregator(object):
         query = CDXQuery(params)
         self._set_src_params(params)
-        try:
-            cdx_iter = self.load_index(query.params)
-        except NotFoundException as nf:
-            cdx_iter = iter([])
+        cdx_iter, errs = self.load_index(query.params)
 
         cdx_iter = process_cdx(cdx_iter, query)
-        return cdx_iter
+        return cdx_iter, dict(errs)
 
     def _set_src_params(self, params):
         src_params = {}

@@ -60,16 +58,23 @@ class BaseAggregator(object):
         params['_all_src_params'] = src_params
 
     def load_child_source_list(self, name, source, params):
-        return list(self.load_child_source(name, source, params))
+        res = self.load_child_source(name, source, params)
+        return list(res[0]), res[1]
 
     def load_child_source(self, name, source, params):
         try:
             _src_params = params['_all_src_params'].get(name)
             params['_src_params'] = _src_params
-            cdx_iter = source.load_index(params)
-        except NotFoundException as nf:
-            print('Not found in ' + name)
+            res = source.load_index(params)
+            if isinstance(res, tuple):
+                cdx_iter, err_list = res
+            else:
+                cdx_iter = res
+                err_list = []
+        except WbException as wbe:
+            #print('Not found in ' + name)
             cdx_iter = iter([])
+            err_list = [(name, repr(wbe))]
 
         def add_name(cdx):
             if cdx.get('source'):

@@ -78,10 +83,13 @@ class BaseAggregator(object):
                 cdx['source'] = name
             return cdx
 
-        return (add_name(cdx) for cdx in cdx_iter)
+        return (add_name(cdx) for cdx in cdx_iter), err_list
 
     def load_index(self, params):
-        iter_list = self._load_all(params)
+        res_list = self._load_all(params)
+        iter_list = [res[0] for res in res_list]
+        err_list = chain(*[res[1] for res in res_list])
 
         #optimization: if only a single entry (or empty) just load directly
         if len(iter_list) <= 1:

@@ -89,7 +97,7 @@ class BaseAggregator(object):
         else:
             cdx_iter = merge(*(iter_list))
 
-        return cdx_iter
+        return cdx_iter, err_list
 
     def _on_source_error(self, name): #pragma: no cover
         pass

@@ -207,6 +215,7 @@ class GeventMixin(object):
             if job.value is not None:
                 results.append(job.value)
             else:
+                results.append((iter([]), [(name, 'timeout')]))
                 self._on_source_error(name)
 
         return results

@@ -247,7 +256,9 @@ class ConcurrentMixin(object):
             results.append(job.result())
 
         for job in res_not_done:
-            self._on_source_error(jobs[job])
+            name = jobs[job]
+            results.append((iter([]), [(name, 'timeout')]))
+            self._on_source_error(name)
 
         return results
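Taken together, the aggregator changes above replace the old raise-on-failure flow with a (cdx_iter, errors) pair: each child source contributes either results or a (name, repr(exception)) entry, and a timed-out source contributes (name, 'timeout'). A minimal sketch of the new convention, assuming the rezag package from this commit is importable; the FailingSource class is invented for illustration:

from pywb.utils.wbexception import NotFoundException
from rezag.aggindexsource import SimpleAggregator

class FailingSource(object):
    # toy stand-in for an index source whose backend is unavailable
    def load_index(self, params):
        raise NotFoundException(params['url'])

agg = SimpleAggregator({'bad': FailingSource()})
cdx_iter, errs = agg({'url': 'http://example.com/'})

assert list(cdx_iter) == []
assert 'bad' in errs  # source name -> repr() of the caught WbException
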
View File

@@ -2,25 +2,49 @@ from rezag.inputrequest import DirectWSGIInputRequest, POSTInputRequest
 from bottle import route, request, response, default_app, abort
 import bottle
 
-from pywb.utils.wbexception import WbException
-
 import traceback
 import json
 
+JSON_CT = 'application/json; charset=utf-8'
+
 
 def err_handler(exc):
     response.status = exc.status_code
-    response.content_type = 'application/json'
-    return json.dumps({'message': exc.body})
+    response.content_type = JSON_CT
+    err_msg = json.dumps({'message': exc.body})
+    response.headers['ResErrors'] = err_msg
+    return err_msg
 
 
 def wrap_error(func):
     def wrap_func(*args, **kwargs):
         try:
-            return func(*args, **kwargs)
-        except WbException as exc:
-            if bottle.debug:
-                traceback.print_exc()
-            abort(exc.status(), exc.msg)
+            res, errs = func(*args, **kwargs)
+
+            if res:
+                if errs:
+                    response.headers['ResErrors'] = json.dumps(errs)
+                return res
+
+            last_exc = errs.pop('last_exc', None)
+            if last_exc:
+                if bottle.debug:
+                    traceback.print_exc()
+                response.status = last_exc.status()
+                message = last_exc.msg
+            else:
+                response.status = 404
+                message = 'No Resource Found'
+
+            response.content_type = JSON_CT
+            res = {'message': message}
+            if errs:
+                res['errors'] = errs
+
+            err_msg = json.dumps(res)
+            response.headers['ResErrors'] = err_msg
+            return err_msg
+
         except Exception as e:
             if bottle.debug:
                 traceback.print_exc()

@@ -32,35 +56,33 @@ def wrap_error(func):
 route_dict = {}
 
 
 def add_route(path, handler):
-    @route(path, 'ANY')
+    @route([path, path + '/<mode:path>'], 'ANY')
     @wrap_error
-    def direct_input_request():
+    def direct_input_request(mode=''):
         params = dict(request.query)
+        params['mode'] = mode
         params['_input_req'] = DirectWSGIInputRequest(request.environ)
         return handler(params)
 
-    @route(path + '/postreq', 'POST')
+    @route([path + '/postreq', path + '/<mode:path>/postreq'], 'POST')
     @wrap_error
-    def post_fullrequest():
+    def post_fullrequest(mode=''):
         params = dict(request.query)
+        params['mode'] = mode
         params['_input_req'] = POSTInputRequest(request.environ)
        return handler(params)
 
     global route_dict
-    handler_dict = {'handler': handler.get_supported_modes()}
+    handler_dict = handler.get_supported_modes()
     route_dict[path] = handler_dict
     route_dict[path + '/postreq'] = handler_dict
 
 
 @route('/')
 def list_routes():
     return route_dict
 
 
 application = default_app()
 application.default_error_handler = err_handler
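Under the new wrap_error contract above, a routed handler returns a (result, errors) pair instead of raising for expected failures: wrap_error serializes the errors dict into the ResErrors header, and the reserved 'last_exc' entry (popped before serialization) supplies the HTTP status and message when there is no result. A hypothetical minimal handler following that contract (real handlers also expose get_supported_modes() for the '/' route listing):

from pywb.utils.wbexception import BadRequestException

def echo_handler(params):
    url = params.get('url')
    if not url:
        # wrap_error pops 'last_exc' and uses its status() and msg
        return None, dict(last_exc=BadRequestException('The "url" param is required'))

    # a truthy result is returned as-is; any non-fatal errors passed
    # alongside it would surface in the ResErrors header
    return 'echo: ' + url, {}
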
View File

@@ -39,12 +39,13 @@ class IndexHandler(object):
         self.opts = opts or {}
 
     def get_supported_modes(self):
-        return dict(modes=['list_modes', 'list_sources', 'index'])
+        return dict(modes=['list_sources', 'index'])
 
     def _load_index_source(self, params):
         url = params.get('url')
         if not url:
-            raise BadRequestException('The "url" param is required')
+            errs = dict(last_exc=BadRequestException('The "url" param is required'))
+            return None, errs
 
         input_req = params.get('_input_req')
         if input_req:

@@ -55,21 +56,25 @@ class IndexHandler(object):
     def __call__(self, params):
         mode = params.get('mode', 'index')
         if mode == 'list_sources':
-            return self.index_source.get_source_list(params)
+            return self.index_source.get_source_list(params), {}
 
-        if mode == 'list_modes' or mode != 'index':
-            return self.get_supported_modes()
+        if mode != 'index':
+            return self.get_supported_modes(), {}
 
         output = params.get('output', self.DEF_OUTPUT)
         fields = params.get('fields')
 
         handler = self.OUTPUTS.get(output)
         if not handler:
-            raise BadRequestException('output={0} not supported'.format(output))
+            errs = dict(last_exc=BadRequestException('output={0} not supported'.format(output)))
+            return None, errs
+
+        cdx_iter, errs = self._load_index_source(params)
+        if not cdx_iter:
+            return None, errs
 
-        cdx_iter = self._load_index_source(params)
         res = handler(cdx_iter, fields)
-        return res
+        return res, errs
 
 
 #=============================================================================

@@ -87,7 +92,10 @@ class ResourceHandler(IndexHandler):
         if params.get('mode', 'resource') != 'resource':
             return super(ResourceHandler, self).__call__(params)
 
-        cdx_iter = self._load_index_source(params)
+        cdx_iter, errs = self._load_index_source(params)
+        if not cdx_iter:
+            return None, errs
 
         last_exc = None
         for cdx in cdx_iter:

@@ -95,15 +103,15 @@ class ResourceHandler(IndexHandler):
             try:
                 resp = loader(cdx, params)
                 if resp is not None:
-                    return resp
+                    return resp, errs
 
             except WbException as e:
                 last_exc = e
+                errs[str(loader)] = repr(e)
 
         if last_exc:
-            raise last_exc
-        else:
-            raise NotFoundException('No Resource Found')
+            errs['last_exc'] = last_exc
+            #raise ArchiveLoadFailed('Resource Found, could not be Loaded')
+
+        return None, errs
 
 
 #=============================================================================

@@ -121,20 +129,19 @@ class HandlerSeq(object):
         self.handlers = handlers
 
     def get_supported_modes(self):
-        return []
-        # return zip([self.handlers.get_supported_modes()]
+        if self.handlers:
+            return self.handlers[0].get_supported_modes()
+        else:
+            return {}
 
     def __call__(self, params):
-        last_exc = None
+        all_errs = {}
         for handler in self.handlers:
-            try:
-                res = handler(params)
-                if res is not None:
-                    return res
-            except WbException as e:
-                last_exc = e
-
-        if last_exc:
-            raise last_exc
-        else:
-            raise NotFoundException('No Resource Found')
+            res, errs = handler(params)
+            all_errs.update(errs)
+            if res is not None:
+                return res, all_errs
+
+        return None, all_errs
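HandlerSeq now falls through on None results and merges each handler's error dict instead of re-raising the last exception. A toy illustration of that accumulation (both handlers below are stand-ins, not from the codebase):

def miss(params):
    # simulates a handler whose sources all failed
    return None, {'archive': "NotFoundException('http://example.com/')"}

def hit(params):
    return 'payload', {}

def call_seq(handlers, params):
    # mirrors HandlerSeq.__call__ from the diff above
    all_errs = {}
    for handler in handlers:
        res, errs = handler(params)
        all_errs.update(errs)
        if res is not None:
            return res, all_errs
    return None, all_errs

res, errs = call_seq([miss, hit], {})
assert res == 'payload' and 'archive' in errs
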
View File

@@ -58,6 +58,10 @@ class WARCPathLoader(object):
                                                  no_record_parse=True)
         self.cdx_source = cdx_source
 
+    def cdx_index_source(self, *args, **kwargs):
+        cdx_iter, errs = self.cdx_source(*args, **kwargs)
+        return cdx_iter
+
     def warc_paths(self):
         for path in self.paths:
             def check(filename, cdx):

@@ -83,7 +87,7 @@ class WARCPathLoader(object):
             headers, payload = (self.resolve_loader.
                                 load_headers_and_payload(cdx,
                                                          failed_files,
-                                                         self.cdx_source))
+                                                         self.cdx_index_source))
 
             record = payload

@@ -102,6 +106,9 @@ class WARCPathLoader(object):
         res = StreamIter(record.stream)
         return res
 
+    def __str__(self):
+        return 'WARCPathLoader'
+
 
 #=============================================================================
 class HeaderRecorder(BaseRecorder):

@@ -200,3 +207,7 @@ class LiveWebLoader(object):
         if not id_:
             id_ = uuid.uuid1()
         return '<urn:uuid:{0}>'.format(id_)
+
+    def __str__(self):
+        return 'LiveWebLoader'
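Two details in the loader changes above are easy to miss: cdx_index_source adapts the new (cdx_iter, errs) return shape back to the plain iterator that the resolving loader expects, and the __str__ overrides give each loader a stable name that ResourceHandler uses as a key when it records errs[str(loader)] = repr(e). A toy demonstration of that keying (FakeLoader is invented for illustration):

class FakeLoader(object):
    # stand-in for WARCPathLoader / LiveWebLoader
    def __str__(self):
        return 'FakeLoader'

errs = {}
loader = FakeLoader()
try:
    raise ValueError('simulated load failure')  # stand-in for a WbException
except ValueError as e:
    errs[str(loader)] = repr(e)

assert 'FakeLoader' in errs
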
View File

@@ -2,7 +2,7 @@ import re
 import six
 
 from pywb.utils.timeutils import timestamp_to_http_date
+from pywb.utils.wbexception import BadRequestException
 
 LINK_SPLIT = re.compile(',\s*(?=[<])')
 LINK_SEG_SPLIT = re.compile(';\s*')

@@ -10,6 +10,11 @@ LINK_URL = re.compile('<(.*)>')
 LINK_PROP = re.compile('([\w]+)="([^"]+)')
 
 
+#=================================================================
+class MementoException(BadRequestException):
+    pass
+
+
 #=================================================================
 class MementoUtils(object):
     @staticmethod

@@ -22,7 +27,7 @@ class MementoUtils(object):
         props = LINK_SEG_SPLIT.split(link)
         m = LINK_URL.match(props[0])
         if not m:
-            raise Exception('Invalid Link Url: ' + props[0])
+            raise MementoException('Invalid Link Url: ' + props[0])
 
         result = dict(url=m.group(1))
         key = ''

@@ -31,7 +36,7 @@ class MementoUtils(object):
         for prop in props[1:]:
             m = LINK_PROP.match(prop)
             if not m:
-                raise Exception('Invalid prop ' + prop)
+                raise MementoException('Invalid prop ' + prop)
 
             name = m.group(1)
             value = m.group(2)
View File

@ -59,43 +59,47 @@ def to_json_list(cdxlist, fields=['timestamp', 'load_url', 'filename', 'source']
def test_agg_no_coll_set(): def test_agg_no_coll_set():
res = dir_loader(dict(url='example.com/')) res, errs = dir_loader(dict(url='example.com/'))
assert(to_json_list(res) == []) assert(to_json_list(res) == [])
assert(errs == {})
def test_agg_collA_found(): def test_agg_collA_found():
res = dir_loader({'url': 'example.com/', 'param.coll': 'A'}) res, errs = dir_loader({'url': 'example.com/', 'param.coll': 'A'})
exp = [{'source': 'colls/A/indexes', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'}] exp = [{'source': 'colls/A/indexes', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'}]
assert(to_json_list(res) == exp) assert(to_json_list(res) == exp)
assert(errs == {})
def test_agg_collB(): def test_agg_collB():
res = dir_loader({'url': 'example.com/', 'param.coll': 'B'}) res, errs = dir_loader({'url': 'example.com/', 'param.coll': 'B'})
exp = [] exp = []
assert(to_json_list(res) == exp) assert(to_json_list(res) == exp)
assert(errs == {})
def test_agg_collB_found(): def test_agg_collB_found():
res = dir_loader({'url': 'iana.org/', 'param.coll': 'B'}) res, errs = dir_loader({'url': 'iana.org/', 'param.coll': 'B'})
exp = [{'source': 'colls/B/indexes', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}] exp = [{'source': 'colls/B/indexes', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}]
assert(to_json_list(res) == exp) assert(to_json_list(res) == exp)
assert(errs == {})
def test_extra_agg_collB(): def test_extra_agg_collB():
agg_source = SimpleAggregator({'dir': dir_loader}) agg_source = SimpleAggregator({'dir': dir_loader})
res = agg_source({'url': 'iana.org/', 'param.coll': 'B'}) res, errs = agg_source({'url': 'iana.org/', 'param.coll': 'B'})
exp = [{'source': 'dir:colls/B/indexes', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}] exp = [{'source': 'dir:colls/B/indexes', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}]
assert(to_json_list(res) == exp) assert(to_json_list(res) == exp)
assert(errs == {})
def test_agg_all_found_1(): def test_agg_all_found_1():
res = dir_loader({'url': 'iana.org/', 'param.coll': '*'}) res, errs = dir_loader({'url': 'iana.org/', 'param.coll': '*'})
exp = [ exp = [
{'source': 'colls/B/indexes', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}, {'source': 'colls/B/indexes', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'},
@ -104,10 +108,11 @@ def test_agg_all_found_1():
] ]
assert(to_json_list(res) == exp) assert(to_json_list(res) == exp)
assert(errs == {})
def test_agg_all_found_2(): def test_agg_all_found_2():
res = dir_loader({'url': 'example.com/', 'param.coll': '*'}) res, errs = dir_loader({'url': 'example.com/', 'param.coll': '*'})
exp = [ exp = [
{'source': 'colls/C/indexes', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'}, {'source': 'colls/C/indexes', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
@ -116,6 +121,7 @@ def test_agg_all_found_2():
] ]
assert(to_json_list(res) == exp) assert(to_json_list(res) == exp)
assert(errs == {})
@ -124,7 +130,7 @@ def test_agg_dir_and_memento():
'local': dir_loader} 'local': dir_loader}
agg_source = SimpleAggregator(sources) agg_source = SimpleAggregator(sources)
res = agg_source({'url': 'example.com/', 'param.local.coll': '*', 'closest': '20100512', 'limit': 6}) res, errs = agg_source({'url': 'example.com/', 'param.local.coll': '*', 'closest': '20100512', 'limit': 6})
exp = [ exp = [
{'source': 'ia', 'timestamp': '20100513052358', 'load_url': 'http://web.archive.org/web/20100513052358id_/http://example.com/'}, {'source': 'ia', 'timestamp': '20100513052358', 'load_url': 'http://web.archive.org/web/20100513052358id_/http://example.com/'},
@ -136,23 +142,26 @@ def test_agg_dir_and_memento():
] ]
assert(to_json_list(res) == exp) assert(to_json_list(res) == exp)
assert(errs == {})
def test_agg_no_dir_1(): def test_agg_no_dir_1():
res = dir_loader({'url': 'example.com/', 'param.coll': 'X'}) res, errs = dir_loader({'url': 'example.com/', 'param.coll': 'X'})
exp = [] exp = []
assert(to_json_list(res) == exp) assert(to_json_list(res) == exp)
assert(errs == {})
def test_agg_no_dir_2(): def test_agg_no_dir_2():
loader = DirectoryIndexSource(root_dir, '') loader = DirectoryIndexSource(root_dir, '')
res = loader({'url': 'example.com/', 'param.coll': 'X'}) res, errs = loader({'url': 'example.com/', 'param.coll': 'X'})
exp = [] exp = []
assert(to_json_list(res) == exp) assert(to_json_list(res) == exp)
assert(errs == {})
def test_agg_dir_sources_1(): def test_agg_dir_sources_1():
View File

@@ -50,7 +50,7 @@ def setup_module(self):
                                handler2]))
 
     add_route('/empty', HandlerSeq([]))
-    add_route('/invalid', HandlerSeq(['foo']))
+    add_route('/invalid', DefaultResourceHandler([SimpleAggregator({'invalid': 'should not be a callable'})]))
 
     application.debug = True
     global testapp

@@ -65,23 +65,49 @@ class TestResAgg(object):
     def setup(self):
         self.testapp = testapp
 
+    def test_list_routes(self):
+        resp = self.testapp.get('/')
+        res = resp.json
+        assert set(res.keys()) == set(['/empty', '/empty/postreq',
+                                       '/fallback', '/fallback/postreq',
+                                       '/live', '/live/postreq',
+                                       '/many', '/many/postreq',
+                                       '/posttest', '/posttest/postreq',
+                                       '/seq', '/seq/postreq',
+                                       '/invalid', '/invalid/postreq'])
+
+        assert res['/fallback'] == {'modes': ['list_sources', 'index', 'resource']}
+
     def test_list_handlers(self):
-        resp = self.testapp.get('/many?mode=list_modes')
-        assert resp.json == {'modes': ['list_modes', 'list_sources', 'index', 'resource']}
+        resp = self.testapp.get('/many')
+        assert resp.json == {'modes': ['list_sources', 'index', 'resource']}
+        assert 'ResErrors' not in resp.headers
 
-        resp = self.testapp.get('/many?mode=other')
-        assert resp.json == {'modes': ['list_modes', 'list_sources', 'index', 'resource']}
+        resp = self.testapp.get('/many/other')
+        assert resp.json == {'modes': ['list_sources', 'index', 'resource']}
+        assert 'ResErrors' not in resp.headers
 
-        # defaults to resource, must specify url
-        resp = self.testapp.get('/many', status=400)
+    def test_list_errors(self):
+        # must specify url for index or resource
+        resp = self.testapp.get('/many/index', status=400)
         assert resp.json == {'message': 'The "url" param is required'}
+        assert resp.text == resp.headers['ResErrors']
+
+        resp = self.testapp.get('/many/index', status=400)
+        assert resp.json == {'message': 'The "url" param is required'}
+        assert resp.text == resp.headers['ResErrors']
+
+        resp = self.testapp.get('/many/resource', status=400)
+        assert resp.json == {'message': 'The "url" param is required'}
+        assert resp.text == resp.headers['ResErrors']
 
     def test_list_sources(self):
-        resp = self.testapp.get('/many?mode=list_sources')
+        resp = self.testapp.get('/many/list_sources')
         assert resp.json == {'sources': {'local': 'file_dir', 'ia': 'memento', 'rhiz': 'memento', 'live': 'live'}}
+        assert 'ResErrors' not in resp.headers
 
     def test_live_index(self):
-        resp = self.testapp.get('/live?mode=index&url=http://httpbin.org/get&output=json')
+        resp = self.testapp.get('/live/index?url=http://httpbin.org/get&output=json')
         resp.charset = 'utf-8'
         res = to_json_list(resp.text)

@@ -91,7 +117,7 @@ class TestResAgg(object):
 
     def test_live_resource(self):
         headers = {'foo': 'bar'}
-        resp = self.testapp.get('/live?url=http://httpbin.org/get?foo=bar', headers=headers)
+        resp = self.testapp.get('/live/resource?url=http://httpbin.org/get?foo=bar', headers=headers)
 
         assert resp.headers['WARC-Coll'] == 'live'
         assert resp.headers['WARC-Target-URI'] == 'http://httpbin.org/get?foo=bar'

@@ -100,9 +126,10 @@ class TestResAgg(object):
         assert b'HTTP/1.1 200 OK' in resp.body
         assert b'"foo": "bar"' in resp.body
+        assert 'ResErrors' not in resp.headers
 
     def test_live_post_resource(self):
-        resp = self.testapp.post('/live?url=http://httpbin.org/post',
+        resp = self.testapp.post('/live/resource?url=http://httpbin.org/post',
                                  OrderedDict([('foo', 'bar')]))
 
         assert resp.headers['WARC-Coll'] == 'live'

@@ -112,38 +139,45 @@ class TestResAgg(object):
         assert b'HTTP/1.1 200 OK' in resp.body
         assert b'"foo": "bar"' in resp.body
+        assert 'ResErrors' not in resp.headers
 
     def test_agg_select_mem_1(self):
-        resp = self.testapp.get('/many?url=http://vvork.com/&closest=20141001')
+        resp = self.testapp.get('/many/resource?url=http://vvork.com/&closest=20141001')
 
         assert resp.headers['WARC-Coll'] == 'rhiz'
         assert resp.headers['WARC-Target-URI'] == 'http://www.vvork.com/'
         assert resp.headers['WARC-Date'] == '2014-10-06T18:43:57Z'
         assert b'HTTP/1.1 200 OK' in resp.body
+        assert 'ResErrors' not in resp.headers
 
     def test_agg_select_mem_2(self):
-        resp = self.testapp.get('/many?url=http://vvork.com/&closest=20151231')
+        resp = self.testapp.get('/many/resource?url=http://vvork.com/&closest=20151231')
 
         assert resp.headers['WARC-Coll'] == 'ia'
         assert resp.headers['WARC-Target-URI'] == 'http://vvork.com/'
         assert resp.headers['WARC-Date'] == '2016-01-10T13:48:55Z'
         assert b'HTTP/1.1 200 OK' in resp.body
+        assert 'ResErrors' not in resp.headers
 
     def test_agg_select_live(self):
-        resp = self.testapp.get('/many?url=http://vvork.com/&closest=2016')
+        resp = self.testapp.get('/many/resource?url=http://vvork.com/&closest=2016')
 
         assert resp.headers['WARC-Coll'] == 'live'
         assert resp.headers['WARC-Target-URI'] == 'http://vvork.com/'
         assert resp.headers['WARC-Date'] != ''
+        assert 'ResErrors' not in resp.headers
 
     def test_agg_select_local(self):
-        resp = self.testapp.get('/many?url=http://iana.org/&closest=20140126200624')
+        resp = self.testapp.get('/many/resource?url=http://iana.org/&closest=20140126200624')
 
         assert resp.headers['WARC-Coll'] == 'local'
         assert resp.headers['WARC-Target-URI'] == 'http://www.iana.org/'
         assert resp.headers['WARC-Date'] == '2014-01-26T20:06:24Z'
+        assert json.loads(resp.headers['ResErrors']) == {"rhiz": "NotFoundException('http://webenact.rhizome.org/vvork/http://iana.org/',)"}
 
     def test_agg_select_local_postreq(self):
         req_data = """\

@@ -153,12 +187,13 @@ User-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (
 Host: iana.org
 """
 
-        resp = self.testapp.post('/many/postreq?url=http://iana.org/&closest=20140126200624', req_data)
+        resp = self.testapp.post('/many/resource/postreq?url=http://iana.org/&closest=20140126200624', req_data)
 
         assert resp.headers['WARC-Coll'] == 'local'
         assert resp.headers['WARC-Target-URI'] == 'http://www.iana.org/'
         assert resp.headers['WARC-Date'] == '2014-01-26T20:06:24Z'
+        assert json.loads(resp.headers['ResErrors']) == {"rhiz": "NotFoundException('http://webenact.rhizome.org/vvork/http://iana.org/',)"}
 
     def test_agg_live_postreq(self):
         req_data = """\

@@ -168,7 +203,7 @@ User-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (
 Host: httpbin.org
 """
 
-        resp = self.testapp.post('/many/postreq?url=http://httpbin.org/get?foo=bar&closest=now', req_data)
+        resp = self.testapp.post('/many/resource/postreq?url=http://httpbin.org/get?foo=bar&closest=now', req_data)
 
         assert resp.headers['WARC-Coll'] == 'live'
         assert resp.headers['WARC-Target-URI'] == 'http://httpbin.org/get?foo=bar'

@@ -177,6 +212,8 @@ Host: httpbin.org
         assert b'HTTP/1.1 200 OK' in resp.body
         assert b'"foo": "bar"' in resp.body
+        assert json.loads(resp.headers['ResErrors']) == {"rhiz": "NotFoundException('http://webenact.rhizome.org/vvork/http://httpbin.org/get?foo=bar',)"}
 
     def test_agg_post_resolve_postreq(self):
         req_data = """\
 POST /post HTTP/1.1

@@ -188,7 +225,7 @@ content-type: application/x-www-form-urlencoded
 
 foo=bar&test=abc"""
 
-        resp = self.testapp.post('/posttest/postreq?url=http://httpbin.org/post', req_data)
+        resp = self.testapp.post('/posttest/resource/postreq?url=http://httpbin.org/post', req_data)
 
         assert resp.headers['WARC-Coll'] == 'post'
         assert resp.headers['WARC-Target-URI'] == 'http://httpbin.org/post'

@@ -197,10 +234,12 @@ foo=bar&test=abc"""
         assert b'"test": "abc"' in resp.body
         assert b'"url": "http://httpbin.org/post"' in resp.body
+        assert 'ResErrors' not in resp.headers
 
     def test_agg_post_resolve_fallback(self):
         req_data = OrderedDict([('foo', 'bar'), ('test', 'abc')])
 
-        resp = self.testapp.post('/fallback?url=http://httpbin.org/post', req_data)
+        resp = self.testapp.post('/fallback/resource?url=http://httpbin.org/post', req_data)
 
         assert resp.headers['WARC-Coll'] == 'post'
         assert resp.headers['WARC-Target-URI'] == 'http://httpbin.org/post'

@@ -209,28 +248,37 @@ foo=bar&test=abc"""
         assert b'"test": "abc"' in resp.body
         assert b'"url": "http://httpbin.org/post"' in resp.body
+        assert 'ResErrors' not in resp.headers
 
     def test_agg_seq_fallback_1(self):
-        resp = self.testapp.get('/fallback?url=http://www.iana.org/')
+        resp = self.testapp.get('/fallback/resource?url=http://www.iana.org/')
 
         assert resp.headers['WARC-Coll'] == 'live'
         assert resp.headers['WARC-Target-URI'] == 'http://www.iana.org/'
         assert b'HTTP/1.1 200 OK' in resp.body
+        assert 'ResErrors' not in resp.headers
 
     def test_agg_seq_fallback_2(self):
-        resp = self.testapp.get('/fallback?url=http://www.example.com/')
+        resp = self.testapp.get('/fallback/resource?url=http://www.example.com/')
 
         assert resp.headers['WARC-Coll'] == 'example'
         assert resp.headers['WARC-Date'] == '2016-02-25T04:23:29Z'
         assert resp.headers['WARC-Target-URI'] == 'http://example.com/'
         assert b'HTTP/1.1 200 OK' in resp.body
+        assert 'ResErrors' not in resp.headers
 
     def test_error_fallback_live_not_found(self):
-        resp = self.testapp.get('/fallback?url=http://invalid.url-not-found', status=400)
-        assert resp.json == {'message': 'http://invalid.url-not-found'}
+        resp = self.testapp.get('/fallback/resource?url=http://invalid.url-not-found', status=400)
+        assert resp.json == {'message': 'http://invalid.url-not-found',
+                             'errors': {'LiveWebLoader': "LiveResourceException('http://invalid.url-not-found',)"}}
+        assert resp.text == resp.headers['ResErrors']
 
     def test_agg_local_revisit(self):
-        resp = self.testapp.get('/many?url=http://www.example.com/&closest=20140127171251&sources=local')
+        resp = self.testapp.get('/many/resource?url=http://www.example.com/&closest=20140127171251&sources=local')
 
         assert resp.headers['WARC-Coll'] == 'local'
         assert resp.headers['WARC-Target-URI'] == 'http://example.com'

@@ -240,23 +288,30 @@ foo=bar&test=abc"""
         assert b'HTTP/1.1 200 OK' in resp.body
         assert b'<!doctype html>' in resp.body
+        assert 'ResErrors' not in resp.headers
 
     def test_error_invalid_index_output(self):
-        resp = self.testapp.get('/live?mode=index&url=http://httpbin.org/get&output=foobar', status=400)
+        resp = self.testapp.get('/live/index?url=http://httpbin.org/get&output=foobar', status=400)
         assert resp.json == {'message': 'output=foobar not supported'}
+        assert resp.text == resp.headers['ResErrors']
 
     def test_error_local_not_found(self):
-        resp = self.testapp.get('/many?url=http://not-found.error/&sources=local', status=404)
+        resp = self.testapp.get('/many/resource?url=http://not-found.error/&sources=local', status=404)
         assert resp.json == {'message': 'No Resource Found'}
+        assert resp.text == resp.headers['ResErrors']
 
     def test_error_empty(self):
-        resp = self.testapp.get('/empty?url=http://example.com/', status=404)
+        resp = self.testapp.get('/empty/resource?url=http://example.com/', status=404)
         assert resp.json == {'message': 'No Resource Found'}
+        assert resp.text == resp.headers['ResErrors']
 
     def test_error_invalid(self):
-        resp = self.testapp.get('/invalid?url=http://example.com/', status=500)
-        assert resp.json == {'message': "Internal Error: 'list' object is not callable"}
+        resp = self.testapp.get('/invalid/resource?url=http://example.com/', status=500)
+        assert resp.text == resp.headers['ResErrors']
+        assert resp.json['message'].startswith('Internal Error')
View File

@@ -55,7 +55,7 @@ def query_single_source(source, params):
 @pytest.mark.parametrize("source", local_sources, ids=["file", "redis"])
 def test_local_cdxj_loader(source):
     url = 'http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf'
-    res = query_single_source(source, dict(url=url, limit=3))
+    res, errs = query_single_source(source, dict(url=url, limit=3))
 
     expected = """\
 org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 iana.warc.gz

@@ -63,6 +63,7 @@ org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200912 iana.warc.gz
 org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200930 iana.warc.gz"""
 
     assert(key_ts_res(res) == expected)
+    assert(errs == {})
 
 
 # Closest -- Local Loaders

@@ -70,7 +71,7 @@ org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200930 iana.warc.gz"""
 @pytest.mark.parametrize("source", local_sources, ids=["file", "redis"])
 def test_local_closest_loader(source):
     url = 'http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf'
-    res = query_single_source(source, dict(url=url,
+    res, errs = query_single_source(source, dict(url=url,
                                            closest='20140126200930',
                                            limit=3))

@@ -80,13 +81,14 @@ org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200912 iana.warc.gz
 org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 iana.warc.gz"""
 
     assert(key_ts_res(res) == expected)
+    assert(errs == {})
 
 
 # Prefix -- Local Loaders
 # ============================================================================
 @pytest.mark.parametrize("source", local_sources, ids=["file", "redis"])
 def test_file_prefix_loader(source):
-    res = query_single_source(source, dict(url='http://iana.org/domains/root/*'))
+    res, errs = query_single_source(source, dict(url='http://iana.org/domains/root/*'))
 
     expected = """\
 org,iana)/domains/root/db 20140126200927 iana.warc.gz

@@ -94,6 +96,7 @@ org,iana)/domains/root/db 20140126200928 iana.warc.gz
 org,iana)/domains/root/servers 20140126201227 iana.warc.gz"""
 
     assert(key_ts_res(res) == expected)
+    assert(errs == {})
 
 
 # Url Match -- Remote Loaders

@@ -101,7 +104,7 @@ org,iana)/domains/root/servers 20140126201227 iana.warc.gz"""
 @pytest.mark.parametrize("source", remote_sources, ids=["remote_cdx", "memento"])
 def test_remote_loader(source):
     url = 'http://instagram.com/amaliaulman'
-    res = query_single_source(source, dict(url=url))
+    res, errs = query_single_source(source, dict(url=url))
 
     expected = """\
 com,instagram)/amaliaulman 20141014150552 http://webenact.rhizome.org/all/20141014150552id_/http://instagram.com/amaliaulman

@@ -110,6 +113,7 @@ com,instagram)/amaliaulman 20141014162333 http://webenact.rhizome.org/all/201410
 com,instagram)/amaliaulman 20141014171636 http://webenact.rhizome.org/all/20141014171636id_/http://instagram.com/amaliaulman"""
 
     assert(key_ts_res(res, 'load_url') == expected)
+    assert(errs == {})
 
 
 # Url Match -- Remote Loaders

@@ -117,12 +121,13 @@ com,instagram)/amaliaulman 20141014171636 http://webenact.rhizome.org/all/201410
 @pytest.mark.parametrize("source", remote_sources, ids=["remote_cdx", "memento"])
 def test_remote_closest_loader(source):
     url = 'http://instagram.com/amaliaulman'
-    res = query_single_source(source, dict(url=url, closest='20141014162332', limit=1))
+    res, errs = query_single_source(source, dict(url=url, closest='20141014162332', limit=1))
 
     expected = """\
 com,instagram)/amaliaulman 20141014162333 http://webenact.rhizome.org/all/20141014162333id_/http://instagram.com/amaliaulman"""
 
     assert(key_ts_res(res, 'load_url') == expected)
+    assert(errs == {})
 
 
 # Url Match -- Memento

@@ -130,25 +135,26 @@ com,instagram)/amaliaulman 20141014162333 http://webenact.rhizome.org/all/201410
 @pytest.mark.parametrize("source", remote_sources, ids=["remote_cdx", "memento"])
 def test_remote_closest_loader(source):
     url = 'http://instagram.com/amaliaulman'
-    res = query_single_source(source, dict(url=url, closest='20141014162332', limit=1))
+    res, errs = query_single_source(source, dict(url=url, closest='20141014162332', limit=1))
 
     expected = """\
 com,instagram)/amaliaulman 20141014162333 http://webenact.rhizome.org/all/20141014162333id_/http://instagram.com/amaliaulman"""
 
     assert(key_ts_res(res, 'load_url') == expected)
+    assert(errs == {})
 
 
 # Live Index -- No Load!
 # ============================================================================
 def test_live():
     url = 'http://example.com/'
     source = LiveIndexSource()
-    res = query_single_source(source, dict(url=url))
+    res, errs = query_single_source(source, dict(url=url))
 
     expected = 'com,example)/ {0} http://example.com/'.format(timestamp_now())
 
     assert(key_ts_res(res, 'load_url') == expected)
+    assert(errs == {})
 
 
 # Errors -- Not Found All

@@ -156,31 +162,36 @@ def test_live():
 @pytest.mark.parametrize("source", local_sources + remote_sources, ids=["file", "redis", "remote_cdx", "memento"])
 def test_all_not_found(source):
     url = 'http://x-not-found-x.notfound/'
-    res = query_single_source(source, dict(url=url, limit=3))
+    res, errs = query_single_source(source, dict(url=url, limit=3))
 
     expected = ''
 
     assert(key_ts_res(res) == expected)
+    if source == remote_sources[0]:
+        assert('http://x-not-found-x.notfound/' in errs['source'])
+    else:
+        assert(errs == {})
 
 
 # ============================================================================
 def test_another_remote_not_found():
     source = MementoIndexSource.from_timegate_url('http://www.webarchive.org.uk/wayback/archive/')
     url = 'http://x-not-found-x.notfound/'
-    res = query_single_source(source, dict(url=url, limit=3))
+    res, errs = query_single_source(source, dict(url=url, limit=3))
 
     expected = ''
 
     assert(key_ts_res(res) == expected)
+    assert(errs['source'] == "NotFoundException('http://www.webarchive.org.uk/wayback/archive/timemap/link/http://x-not-found-x.notfound/',)")
 
 
 # ============================================================================
 def test_file_not_found():
     source = FileIndexSource('testdata/not-found-x')
     url = 'http://x-not-found-x.notfound/'
-    res = query_single_source(source, dict(url=url, limit=3))
+    res, errs = query_single_source(source, dict(url=url, limit=3))
 
     expected = ''
 
     assert(key_ts_res(res) == expected)
+    assert(errs['source'] == "NotFoundException('testdata/not-found-x',)"), errs

@@ -188,7 +199,7 @@ def test_ait_filters():
     ait_source = RemoteIndexSource('http://wayback.archive-it.org/cdx/search/cdx?url={url}&filter=filename:ARCHIVEIT-({colls})-.*',
                                    'http://wayback.archive-it.org/all/{timestamp}id_/{url}')
 
-    cdxlist = query_single_source(ait_source, {'url': 'http://iana.org/', 'param.source.colls': '5610|933'})
+    cdxlist, errs = query_single_source(ait_source, {'url': 'http://iana.org/', 'param.source.colls': '5610|933'})
     filenames = [cdx['filename'] for cdx in cdxlist]
 
     prefix = ('ARCHIVEIT-5610-', 'ARCHIVEIT-933-')

@@ -196,7 +207,7 @@ def test_ait_filters():
     assert(all([x.startswith(prefix) for x in filenames]))
 
-    cdxlist = query_single_source(ait_source, {'url': 'http://iana.org/', 'param.source.colls': '1883|366|905'})
+    cdxlist, errs = query_single_source(ait_source, {'url': 'http://iana.org/', 'param.source.colls': '1883|366|905'})
     filenames = [cdx['filename'] for cdx in cdxlist]
 
     prefix = ('ARCHIVEIT-1883-', 'ARCHIVEIT-366-', 'ARCHIVEIT-905-')
View File

@@ -1,13 +1,14 @@
 from gevent import monkey; monkey.patch_all(thread=False)
 
 from rezag.aggindexsource import SimpleAggregator, GeventTimeoutAggregator
-from rezag.aggindexsource import ThreadedTimeoutAggregator
+from rezag.aggindexsource import ThreadedTimeoutAggregator, BaseAggregator
 
 from rezag.indexsource import FileIndexSource, RemoteIndexSource, MementoIndexSource
 from .testutils import json_list, to_path
 
 import json
 import pytest
+import time
 
 from rezag.handlers import IndexHandler

@@ -27,6 +28,10 @@ aggs = {'simple': SimpleAggregator(sources),
         'processes': ThreadedTimeoutAggregator(sources, timeout=5.0, use_processes=True),
        }
 
+agg_tm = {'gevent': GeventTimeoutAggregator(sources, timeout=0.0),
+          'threaded': ThreadedTimeoutAggregator(sources, timeout=0.0),
+          'processes': ThreadedTimeoutAggregator(sources, timeout=0.0, use_processes=True)}
+
 nf = {'notfound': FileIndexSource(to_path('testdata/not-found-x'))}
 agg_nf = {'simple': SimpleAggregator(nf),
           'gevent': GeventTimeoutAggregator(nf, timeout=5.0),

@@ -41,7 +46,7 @@ agg_nf = {'simple': SimpleAggregator(nf),
 @pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
 def test_mem_agg_index_1(agg):
     url = 'http://iana.org/'
-    res = agg(dict(url=url, closest='20140126000000', limit=5))
+    res, errs = agg(dict(url=url, closest='20140126000000', limit=5))
 
     exp = [{"timestamp": "20140126093743", "load_url": "http://web.archive.org/web/20140126093743id_/http://iana.org/", "source": "ia"},

@@ -52,12 +57,13 @@ def test_mem_agg_index_1(agg):
           ]
 
     assert(json_list(res) == exp)
+    assert(errs == {'bl': "NotFoundException('http://www.webarchive.org.uk/wayback/archive/http://iana.org/',)",
+                    'rhiz': "NotFoundException('http://webenact.rhizome.org/vvork/http://iana.org/',)"})
 
 
 @pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
 def test_mem_agg_index_2(agg):
     url = 'http://example.com/'
-    res = agg(dict(url=url, closest='20100512', limit=6))
+    res, errs = agg(dict(url=url, closest='20100512', limit=6))
 
     exp = [{"timestamp": "20100513010014", "load_url": "http://www.webarchive.org.uk/wayback/archive/20100513010014id_/http://example.com/", "source": "bl"},
            {"timestamp": "20100512204410", "load_url": "http://www.webarchive.org.uk/wayback/archive/20100512204410id_/http://example.com/", "source": "bl"},

@@ -67,12 +73,13 @@ def test_mem_agg_index_2(agg):
           {"timestamp": "20100514231857", "load_url": "http://web.archive.org/web/20100514231857id_/http://example.com/", "source": "ia"}]
 
     assert(json_list(res) == exp)
+    assert(errs == {'rhiz': "NotFoundException('http://webenact.rhizome.org/vvork/http://example.com/',)"})
 
 
 @pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
 def test_mem_agg_index_3(agg):
     url = 'http://vvork.com/'
-    res = agg(dict(url=url, closest='20141001', limit=5))
+    res, errs = agg(dict(url=url, closest='20141001', limit=5))
 
     exp = [{"timestamp": "20141006184357", "load_url": "http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"},
            {"timestamp": "20141018133107", "load_url": "http://web.archive.org/web/20141018133107id_/http://vvork.com/", "source": "ia"},

@@ -81,32 +88,53 @@ def test_mem_agg_index_3(agg):
           {"timestamp": "20131004231540", "load_url": "http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/", "source": "ait"}]
 
     assert(json_list(res) == exp)
+    assert(errs == {})
 
 
 @pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
 def test_mem_agg_index_4(agg):
     url = 'http://vvork.com/'
-    res = agg(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait'))
+    res, errs = agg(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait'))
 
     exp = [{"timestamp": "20141006184357", "load_url": "http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"},
            {"timestamp": "20131004231540", "load_url": "http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/", "source": "ait"}]
 
     assert(json_list(res) == exp)
+    assert(errs == {})
 
 
 @pytest.mark.parametrize("agg", list(agg_nf.values()), ids=list(agg_nf.keys()))
 def test_mem_agg_not_found(agg):
     url = 'http://vvork.com/'
-    res = agg(dict(url=url, closest='20141001', limit=2))
+    res, errs = agg(dict(url=url, closest='20141001', limit=2))
 
     assert(json_list(res) == [])
+    assert(errs == {'notfound': "NotFoundException('testdata/not-found-x',)"})
+
+
+@pytest.mark.parametrize("agg", list(agg_tm.values()), ids=list(agg_tm.keys()))
+def test_mem_agg_timeout(agg):
+    url = 'http://vvork.com/'
+    orig_source = BaseAggregator.load_child_source
+    def load_child_source(self, name, source, params):
+        time.sleep(0.1)
+        return orig_source(name, source, params)
+
+    BaseAggregator.load_child_source = load_child_source
+    res, errs = agg(dict(url=url, closest='20141001', limit=2))
+    BaseAggregator.load_child_source = orig_source
+
+    assert(json_list(res) == [])
+    assert(errs == {'local': 'timeout',
+                    'ait': 'timeout', 'bl': 'timeout', 'ia': 'timeout', 'rhiz': 'timeout'})
 
 
 def test_handler_output_cdxj():
     agg = GeventTimeoutAggregator(sources, timeout=5.0)
     handler = IndexHandler(agg)
     url = 'http://vvork.com/'
-    res = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait'))
+    res, errs = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait'))
 
     exp = """\
 com,vvork)/ 20141006184357 {"url": "http://www.vvork.com/", "mem_rel": "memento", "memento_url": "http://webenact.rhizome.org/vvork/20141006184357/http://www.vvork.com/", "load_url": "http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"}

@@ -114,13 +142,14 @@ com,vvork)/ 20131004231540 {"url": "http://vvork.com/", "mem_rel": "last memento
 """
 
     assert(''.join(res) == exp)
+    assert(errs == {})
 
 
 def test_handler_output_json():
     agg = GeventTimeoutAggregator(sources, timeout=5.0)
     handler = IndexHandler(agg)
     url = 'http://vvork.com/'
-    res = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='json'))
+    res, errs = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='json'))
 
     exp = """\
 {"urlkey": "com,vvork)/", "timestamp": "20141006184357", "url": "http://www.vvork.com/", "mem_rel": "memento", "memento_url": "http://webenact.rhizome.org/vvork/20141006184357/http://www.vvork.com/", "load_url": "http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"}

@@ -128,26 +157,27 @@ def test_handler_output_json():
 """
 
     assert(''.join(res) == exp)
+    assert(errs == {})
 
 
 def test_handler_output_link():
     agg = GeventTimeoutAggregator(sources, timeout=5.0)
     handler = IndexHandler(agg)
     url = 'http://vvork.com/'
-    res = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='link'))
+    res, errs = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='link'))
 
     exp = """\
 <http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/>; rel="memento"; datetime="Mon, 06 Oct 2014 18:43:57 GMT"; src="rhiz",
 <http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/>; rel="memento"; datetime="Fri, 04 Oct 2013 23:15:40 GMT"; src="ait"
 """
 
     assert(''.join(res) == exp)
+    assert(errs == {})
 
 
 def test_handler_output_link_2():
     agg = GeventTimeoutAggregator(sources, timeout=5.0)
     handler = IndexHandler(agg)
     url = 'http://iana.org/'
-    res = handler(dict(url=url, closest='20140126000000', limit=5, output='link'))
+    res, errs = handler(dict(url=url, closest='20140126000000', limit=5, output='link'))
 
     exp = """\
 <http://web.archive.org/web/20140126093743id_/http://iana.org/>; rel="memento"; datetime="Sun, 26 Jan 2014 09:37:43 GMT"; src="ia",

@@ -158,38 +188,54 @@ def test_handler_output_link_2():
 """
 
     assert(''.join(res) == exp)
+    exp_errs = {'bl': "NotFoundException('http://www.webarchive.org.uk/wayback/archive/http://iana.org/',)",
+                'rhiz': "NotFoundException('http://webenact.rhizome.org/vvork/http://iana.org/',)"}
+
+    assert(errs == exp_errs)
 
 
 def test_handler_output_link_3():
     agg = GeventTimeoutAggregator(sources, timeout=5.0)
     handler = IndexHandler(agg)
     url = 'http://foo.bar.non-existent'
-    res = handler(dict(url=url, closest='20140126000000', limit=5, output='link'))
+    res, errs = handler(dict(url=url, closest='20140126000000', limit=5, output='link'))
 
     exp = ''
 
     assert(''.join(res) == exp)
+    exp_errs = {'ait': "NotFoundException('http://wayback.archive-it.org/all/http://foo.bar.non-existent',)",
+                'bl': "NotFoundException('http://www.webarchive.org.uk/wayback/archive/http://foo.bar.non-existent',)",
+                'ia': "NotFoundException('http://web.archive.org/web/http://foo.bar.non-existent',)",
+                'rhiz': "NotFoundException('http://webenact.rhizome.org/vvork/http://foo.bar.non-existent',)"}
+
+    assert(errs == exp_errs)
 
 
 def test_handler_output_text():
     agg = GeventTimeoutAggregator(sources, timeout=5.0)
     handler = IndexHandler(agg)
     url = 'http://vvork.com/'
-    res = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='text'))
+    res, errs = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='text'))
 
     exp = """\
 com,vvork)/ 20141006184357 http://www.vvork.com/ memento http://webenact.rhizome.org/vvork/20141006184357/http://www.vvork.com/ http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/ rhiz
 com,vvork)/ 20131004231540 http://vvork.com/ last memento http://wayback.archive-it.org/all/20131004231540/http://vvork.com/ http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/ ait
 """
 
     assert(''.join(res) == exp)
+    assert(errs == {})
 
 
 def test_handler_list_sources():
     agg = GeventTimeoutAggregator(sources, timeout=5.0)
     handler = IndexHandler(agg)
-    res = handler(dict(mode='list_sources'))
+    res, errs = handler(dict(mode='list_sources'))
 
     assert(res == {'sources': {'bl': 'memento',
                                'ait': 'memento',
                                'ia': 'memento',
                                'rhiz': 'memento',
                                'local': 'file'}})
+    assert(errs == {})
View File

@@ -35,7 +35,7 @@ def setup_module():
 def test_timeout_long_all_pass():
     agg = TimeoutAggregator(sources, timeout=1.0)
 
-    res = agg(dict(url='http://example.com/'))
+    res, errs = agg(dict(url='http://example.com/'))
 
     exp = [{'source': 'slower', 'timestamp': '20140127171200'},
            {'source': 'slower', 'timestamp': '20140127171251'},

@@ -43,27 +43,31 @@ def test_timeout_long_all_pass():
 
     assert(json_list(res, fields=['source', 'timestamp']) == exp)
+    assert(errs == {})
 
 
 def test_timeout_slower_skipped_1():
     agg = GeventTimeoutAggregator(sources, timeout=0.49)
 
-    res = agg(dict(url='http://example.com/'))
+    res, errs = agg(dict(url='http://example.com/'))
 
     exp = [{'source': 'slow', 'timestamp': '20160225042329'}]
 
     assert(json_list(res, fields=['source', 'timestamp']) == exp)
+    assert(errs == {'slower': 'timeout'})
 
 
 def test_timeout_slower_skipped_2():
     agg = GeventTimeoutAggregator(sources, timeout=0.19)
 
-    res = agg(dict(url='http://example.com/'))
+    res, errs = agg(dict(url='http://example.com/'))
 
     exp = []
 
     assert(json_list(res, fields=['source', 'timestamp']) == exp)
+    assert(errs == {'slower': 'timeout', 'slow': 'timeout'})
 
 
 def test_timeout_skipping():

@@ -75,31 +79,40 @@ def test_timeout_skipping():
     exp = [{'source': 'slow', 'timestamp': '20160225042329'}]
 
-    res = agg(dict(url='http://example.com/'))
+    res, errs = agg(dict(url='http://example.com/'))
     assert(json_list(res, fields=['source', 'timestamp']) == exp)
     assert(sources['slow'].calls == 4)
     assert(sources['slower'].calls == 4)
+    assert(errs == {'slower': 'timeout'})
 
-    res = agg(dict(url='http://example.com/'))
+    res, errs = agg(dict(url='http://example.com/'))
     assert(json_list(res, fields=['source', 'timestamp']) == exp)
     assert(sources['slow'].calls == 5)
     assert(sources['slower'].calls == 5)
+    assert(errs == {'slower': 'timeout'})
 
-    res = agg(dict(url='http://example.com/'))
+    res, errs = agg(dict(url='http://example.com/'))
     assert(json_list(res, fields=['source', 'timestamp']) == exp)
     assert(sources['slow'].calls == 6)
     assert(sources['slower'].calls == 5)
+    assert(errs == {})
 
-    res = agg(dict(url='http://example.com/'))
+    res, errs = agg(dict(url='http://example.com/'))
     assert(json_list(res, fields=['source', 'timestamp']) == exp)
     assert(sources['slow'].calls == 7)
     assert(sources['slower'].calls == 5)
+    assert(errs == {})
 
     time.sleep(2.01)
 
-    res = agg(dict(url='http://example.com/'))
+    res, errs = agg(dict(url='http://example.com/'))
     assert(json_list(res, fields=['source', 'timestamp']) == exp)
     assert(sources['slow'].calls == 8)
     assert(sources['slower'].calls == 6)
+    assert(errs == {'slower': 'timeout'})