mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 08:04:49 +01:00
errors and timeouts reported back to the user via ResErrors header
add new /index, /resource access point system
This commit is contained in:
parent
1f3763d02c
commit
65e969a492
@ -13,9 +13,10 @@ from pywb.cdx.query import CDXQuery
|
||||
|
||||
from heapq import merge
|
||||
from collections import deque
|
||||
from itertools import chain
|
||||
|
||||
from rezag.indexsource import FileIndexSource
|
||||
from pywb.utils.wbexception import NotFoundException
|
||||
from pywb.utils.wbexception import NotFoundException, WbException
|
||||
import six
|
||||
import glob
|
||||
|
||||
@ -29,13 +30,10 @@ class BaseAggregator(object):
|
||||
query = CDXQuery(params)
|
||||
self._set_src_params(params)
|
||||
|
||||
try:
|
||||
cdx_iter = self.load_index(query.params)
|
||||
except NotFoundException as nf:
|
||||
cdx_iter = iter([])
|
||||
cdx_iter, errs = self.load_index(query.params)
|
||||
|
||||
cdx_iter = process_cdx(cdx_iter, query)
|
||||
return cdx_iter
|
||||
return cdx_iter, dict(errs)
|
||||
|
||||
def _set_src_params(self, params):
|
||||
src_params = {}
|
||||
@ -60,16 +58,23 @@ class BaseAggregator(object):
|
||||
params['_all_src_params'] = src_params
|
||||
|
||||
def load_child_source_list(self, name, source, params):
|
||||
return list(self.load_child_source(name, source, params))
|
||||
res = self.load_child_source(name, source, params)
|
||||
return list(res[0]), res[1]
|
||||
|
||||
def load_child_source(self, name, source, params):
|
||||
try:
|
||||
_src_params = params['_all_src_params'].get(name)
|
||||
params['_src_params'] = _src_params
|
||||
cdx_iter = source.load_index(params)
|
||||
except NotFoundException as nf:
|
||||
print('Not found in ' + name)
|
||||
res = source.load_index(params)
|
||||
if isinstance(res, tuple):
|
||||
cdx_iter, err_list = res
|
||||
else:
|
||||
cdx_iter = res
|
||||
err_list = []
|
||||
except WbException as wbe:
|
||||
#print('Not found in ' + name)
|
||||
cdx_iter = iter([])
|
||||
err_list = [(name, repr(wbe))]
|
||||
|
||||
def add_name(cdx):
|
||||
if cdx.get('source'):
|
||||
@ -78,10 +83,13 @@ class BaseAggregator(object):
|
||||
cdx['source'] = name
|
||||
return cdx
|
||||
|
||||
return (add_name(cdx) for cdx in cdx_iter)
|
||||
return (add_name(cdx) for cdx in cdx_iter), err_list
|
||||
|
||||
def load_index(self, params):
|
||||
iter_list = self._load_all(params)
|
||||
res_list = self._load_all(params)
|
||||
|
||||
iter_list = [res[0] for res in res_list]
|
||||
err_list = chain(*[res[1] for res in res_list])
|
||||
|
||||
#optimization: if only a single entry (or empty) just load directly
|
||||
if len(iter_list) <= 1:
|
||||
@ -89,7 +97,7 @@ class BaseAggregator(object):
|
||||
else:
|
||||
cdx_iter = merge(*(iter_list))
|
||||
|
||||
return cdx_iter
|
||||
return cdx_iter, err_list
|
||||
|
||||
def _on_source_error(self, name): #pragma: no cover
|
||||
pass
|
||||
@ -207,6 +215,7 @@ class GeventMixin(object):
|
||||
if job.value is not None:
|
||||
results.append(job.value)
|
||||
else:
|
||||
results.append((iter([]), [(name, 'timeout')]))
|
||||
self._on_source_error(name)
|
||||
|
||||
return results
|
||||
@ -247,7 +256,9 @@ class ConcurrentMixin(object):
|
||||
results.append(job.result())
|
||||
|
||||
for job in res_not_done:
|
||||
self._on_source_error(jobs[job])
|
||||
name = jobs[job]
|
||||
results.append((iter([]), [(name, 'timeout')]))
|
||||
self._on_source_error(name)
|
||||
|
||||
return results
|
||||
|
||||
|
60
rezag/app.py
60
rezag/app.py
@ -2,25 +2,49 @@ from rezag.inputrequest import DirectWSGIInputRequest, POSTInputRequest
|
||||
from bottle import route, request, response, default_app, abort
|
||||
import bottle
|
||||
|
||||
from pywb.utils.wbexception import WbException
|
||||
|
||||
import traceback
|
||||
import json
|
||||
|
||||
JSON_CT = 'application/json; charset=utf-8'
|
||||
|
||||
def err_handler(exc):
|
||||
response.status = exc.status_code
|
||||
response.content_type = 'application/json'
|
||||
return json.dumps({'message': exc.body})
|
||||
response.content_type = JSON_CT
|
||||
err_msg = json.dumps({'message': exc.body})
|
||||
response.headers['ResErrors'] = err_msg
|
||||
return err_msg
|
||||
|
||||
|
||||
def wrap_error(func):
|
||||
def wrap_func(*args, **kwargs):
|
||||
try:
|
||||
return func(*args, **kwargs)
|
||||
except WbException as exc:
|
||||
if bottle.debug:
|
||||
traceback.print_exc()
|
||||
abort(exc.status(), exc.msg)
|
||||
res, errs = func(*args, **kwargs)
|
||||
|
||||
if res:
|
||||
if errs:
|
||||
response.headers['ResErrors'] = json.dumps(errs)
|
||||
return res
|
||||
|
||||
last_exc = errs.pop('last_exc', None)
|
||||
if last_exc:
|
||||
if bottle.debug:
|
||||
traceback.print_exc()
|
||||
|
||||
response.status = last_exc.status()
|
||||
message = last_exc.msg
|
||||
else:
|
||||
response.status = 404
|
||||
message = 'No Resource Found'
|
||||
|
||||
response.content_type = JSON_CT
|
||||
res = {'message': message}
|
||||
if errs:
|
||||
res['errors'] = errs
|
||||
|
||||
err_msg = json.dumps(res)
|
||||
response.headers['ResErrors'] = err_msg
|
||||
return err_msg
|
||||
|
||||
except Exception as e:
|
||||
if bottle.debug:
|
||||
traceback.print_exc()
|
||||
@ -32,35 +56,33 @@ def wrap_error(func):
|
||||
route_dict = {}
|
||||
|
||||
def add_route(path, handler):
|
||||
@route(path, 'ANY')
|
||||
@route([path, path + '/<mode:path>'], 'ANY')
|
||||
@wrap_error
|
||||
def direct_input_request():
|
||||
def direct_input_request(mode=''):
|
||||
params = dict(request.query)
|
||||
params['mode'] = mode
|
||||
params['_input_req'] = DirectWSGIInputRequest(request.environ)
|
||||
return handler(params)
|
||||
|
||||
@route(path + '/postreq', 'POST')
|
||||
@route([path + '/postreq', path + '/<mode:path>/postreq'], 'POST')
|
||||
@wrap_error
|
||||
def post_fullrequest():
|
||||
def post_fullrequest(mode=''):
|
||||
params = dict(request.query)
|
||||
params['mode'] = mode
|
||||
params['_input_req'] = POSTInputRequest(request.environ)
|
||||
return handler(params)
|
||||
|
||||
global route_dict
|
||||
handler_dict = {'handler': handler.get_supported_modes()}
|
||||
handler_dict = handler.get_supported_modes()
|
||||
route_dict[path] = handler_dict
|
||||
route_dict[path + '/postreq'] = handler_dict
|
||||
|
||||
|
||||
@route('/')
|
||||
def list_routes():
|
||||
return route_dict
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
application = default_app()
|
||||
application.default_error_handler = err_handler
|
||||
|
||||
|
@ -39,12 +39,13 @@ class IndexHandler(object):
|
||||
self.opts = opts or {}
|
||||
|
||||
def get_supported_modes(self):
|
||||
return dict(modes=['list_modes', 'list_sources', 'index'])
|
||||
return dict(modes=['list_sources', 'index'])
|
||||
|
||||
def _load_index_source(self, params):
|
||||
url = params.get('url')
|
||||
if not url:
|
||||
raise BadRequestException('The "url" param is required')
|
||||
errs = dict(last_exc=BadRequestException('The "url" param is required'))
|
||||
return None, errs
|
||||
|
||||
input_req = params.get('_input_req')
|
||||
if input_req:
|
||||
@ -55,21 +56,25 @@ class IndexHandler(object):
|
||||
def __call__(self, params):
|
||||
mode = params.get('mode', 'index')
|
||||
if mode == 'list_sources':
|
||||
return self.index_source.get_source_list(params)
|
||||
return self.index_source.get_source_list(params), {}
|
||||
|
||||
if mode == 'list_modes' or mode != 'index':
|
||||
return self.get_supported_modes()
|
||||
if mode != 'index':
|
||||
return self.get_supported_modes(), {}
|
||||
|
||||
output = params.get('output', self.DEF_OUTPUT)
|
||||
fields = params.get('fields')
|
||||
|
||||
handler = self.OUTPUTS.get(output)
|
||||
if not handler:
|
||||
raise BadRequestException('output={0} not supported'.format(output))
|
||||
errs = dict(last_exc=BadRequestException('output={0} not supported'.format(output)))
|
||||
return None, errs
|
||||
|
||||
cdx_iter, errs = self._load_index_source(params)
|
||||
if not cdx_iter:
|
||||
return None, errs
|
||||
|
||||
cdx_iter = self._load_index_source(params)
|
||||
res = handler(cdx_iter, fields)
|
||||
return res
|
||||
return res, errs
|
||||
|
||||
|
||||
#=============================================================================
|
||||
@ -87,7 +92,10 @@ class ResourceHandler(IndexHandler):
|
||||
if params.get('mode', 'resource') != 'resource':
|
||||
return super(ResourceHandler, self).__call__(params)
|
||||
|
||||
cdx_iter = self._load_index_source(params)
|
||||
cdx_iter, errs = self._load_index_source(params)
|
||||
if not cdx_iter:
|
||||
return None, errs
|
||||
|
||||
last_exc = None
|
||||
|
||||
for cdx in cdx_iter:
|
||||
@ -95,15 +103,15 @@ class ResourceHandler(IndexHandler):
|
||||
try:
|
||||
resp = loader(cdx, params)
|
||||
if resp is not None:
|
||||
return resp
|
||||
return resp, errs
|
||||
except WbException as e:
|
||||
last_exc = e
|
||||
errs[str(loader)] = repr(e)
|
||||
|
||||
if last_exc:
|
||||
raise last_exc
|
||||
#raise ArchiveLoadFailed('Resource Found, could not be Loaded')
|
||||
else:
|
||||
raise NotFoundException('No Resource Found')
|
||||
errs['last_exc'] = last_exc
|
||||
|
||||
return None, errs
|
||||
|
||||
|
||||
#=============================================================================
|
||||
@ -121,20 +129,19 @@ class HandlerSeq(object):
|
||||
self.handlers = handlers
|
||||
|
||||
def get_supported_modes(self):
|
||||
return []
|
||||
# return zip([self.handlers.get_supported_modes()]
|
||||
if self.handlers:
|
||||
return self.handlers[0].get_supported_modes()
|
||||
else:
|
||||
return {}
|
||||
|
||||
def __call__(self, params):
|
||||
last_exc = None
|
||||
all_errs = {}
|
||||
for handler in self.handlers:
|
||||
try:
|
||||
res = handler(params)
|
||||
if res is not None:
|
||||
return res
|
||||
except WbException as e:
|
||||
last_exc = e
|
||||
res, errs = handler(params)
|
||||
all_errs.update(errs)
|
||||
if res is not None:
|
||||
return res, all_errs
|
||||
|
||||
return None, all_errs
|
||||
|
||||
|
||||
if last_exc:
|
||||
raise last_exc
|
||||
else:
|
||||
raise NotFoundException('No Resource Found')
|
||||
|
@ -58,6 +58,10 @@ class WARCPathLoader(object):
|
||||
no_record_parse=True)
|
||||
self.cdx_source = cdx_source
|
||||
|
||||
def cdx_index_source(self, *args, **kwargs):
|
||||
cdx_iter, errs = self.cdx_source(*args, **kwargs)
|
||||
return cdx_iter
|
||||
|
||||
def warc_paths(self):
|
||||
for path in self.paths:
|
||||
def check(filename, cdx):
|
||||
@ -83,7 +87,7 @@ class WARCPathLoader(object):
|
||||
headers, payload = (self.resolve_loader.
|
||||
load_headers_and_payload(cdx,
|
||||
failed_files,
|
||||
self.cdx_source))
|
||||
self.cdx_index_source))
|
||||
|
||||
record = payload
|
||||
|
||||
@ -102,6 +106,9 @@ class WARCPathLoader(object):
|
||||
res = StreamIter(record.stream)
|
||||
return res
|
||||
|
||||
def __str__(self):
|
||||
return 'WARCPathLoader'
|
||||
|
||||
|
||||
#=============================================================================
|
||||
class HeaderRecorder(BaseRecorder):
|
||||
@ -200,3 +207,7 @@ class LiveWebLoader(object):
|
||||
if not id_:
|
||||
id_ = uuid.uuid1()
|
||||
return '<urn:uuid:{0}>'.format(id_)
|
||||
|
||||
def __str__(self):
|
||||
return 'LiveWebLoader'
|
||||
|
||||
|
@ -2,7 +2,7 @@ import re
|
||||
import six
|
||||
|
||||
from pywb.utils.timeutils import timestamp_to_http_date
|
||||
|
||||
from pywb.utils.wbexception import BadRequestException
|
||||
|
||||
LINK_SPLIT = re.compile(',\s*(?=[<])')
|
||||
LINK_SEG_SPLIT = re.compile(';\s*')
|
||||
@ -10,6 +10,11 @@ LINK_URL = re.compile('<(.*)>')
|
||||
LINK_PROP = re.compile('([\w]+)="([^"]+)')
|
||||
|
||||
|
||||
#=================================================================
|
||||
class MementoException(BadRequestException):
|
||||
pass
|
||||
|
||||
|
||||
#=================================================================
|
||||
class MementoUtils(object):
|
||||
@staticmethod
|
||||
@ -22,7 +27,7 @@ class MementoUtils(object):
|
||||
props = LINK_SEG_SPLIT.split(link)
|
||||
m = LINK_URL.match(props[0])
|
||||
if not m:
|
||||
raise Exception('Invalid Link Url: ' + props[0])
|
||||
raise MementoException('Invalid Link Url: ' + props[0])
|
||||
|
||||
result = dict(url=m.group(1))
|
||||
key = ''
|
||||
@ -31,7 +36,7 @@ class MementoUtils(object):
|
||||
for prop in props[1:]:
|
||||
m = LINK_PROP.match(prop)
|
||||
if not m:
|
||||
raise Exception('Invalid prop ' + prop)
|
||||
raise MementoException('Invalid prop ' + prop)
|
||||
|
||||
name = m.group(1)
|
||||
value = m.group(2)
|
||||
|
@ -59,43 +59,47 @@ def to_json_list(cdxlist, fields=['timestamp', 'load_url', 'filename', 'source']
|
||||
|
||||
|
||||
def test_agg_no_coll_set():
|
||||
res = dir_loader(dict(url='example.com/'))
|
||||
res, errs = dir_loader(dict(url='example.com/'))
|
||||
assert(to_json_list(res) == [])
|
||||
|
||||
assert(errs == {})
|
||||
|
||||
def test_agg_collA_found():
|
||||
res = dir_loader({'url': 'example.com/', 'param.coll': 'A'})
|
||||
res, errs = dir_loader({'url': 'example.com/', 'param.coll': 'A'})
|
||||
|
||||
exp = [{'source': 'colls/A/indexes', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'}]
|
||||
|
||||
assert(to_json_list(res) == exp)
|
||||
assert(errs == {})
|
||||
|
||||
def test_agg_collB():
|
||||
res = dir_loader({'url': 'example.com/', 'param.coll': 'B'})
|
||||
res, errs = dir_loader({'url': 'example.com/', 'param.coll': 'B'})
|
||||
|
||||
exp = []
|
||||
|
||||
assert(to_json_list(res) == exp)
|
||||
assert(errs == {})
|
||||
|
||||
def test_agg_collB_found():
|
||||
res = dir_loader({'url': 'iana.org/', 'param.coll': 'B'})
|
||||
res, errs = dir_loader({'url': 'iana.org/', 'param.coll': 'B'})
|
||||
|
||||
exp = [{'source': 'colls/B/indexes', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}]
|
||||
|
||||
assert(to_json_list(res) == exp)
|
||||
assert(errs == {})
|
||||
|
||||
|
||||
def test_extra_agg_collB():
|
||||
agg_source = SimpleAggregator({'dir': dir_loader})
|
||||
res = agg_source({'url': 'iana.org/', 'param.coll': 'B'})
|
||||
res, errs = agg_source({'url': 'iana.org/', 'param.coll': 'B'})
|
||||
|
||||
exp = [{'source': 'dir:colls/B/indexes', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}]
|
||||
|
||||
assert(to_json_list(res) == exp)
|
||||
assert(errs == {})
|
||||
|
||||
|
||||
def test_agg_all_found_1():
|
||||
res = dir_loader({'url': 'iana.org/', 'param.coll': '*'})
|
||||
res, errs = dir_loader({'url': 'iana.org/', 'param.coll': '*'})
|
||||
|
||||
exp = [
|
||||
{'source': 'colls/B/indexes', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'},
|
||||
@ -104,10 +108,11 @@ def test_agg_all_found_1():
|
||||
]
|
||||
|
||||
assert(to_json_list(res) == exp)
|
||||
assert(errs == {})
|
||||
|
||||
|
||||
def test_agg_all_found_2():
|
||||
res = dir_loader({'url': 'example.com/', 'param.coll': '*'})
|
||||
res, errs = dir_loader({'url': 'example.com/', 'param.coll': '*'})
|
||||
|
||||
exp = [
|
||||
{'source': 'colls/C/indexes', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
|
||||
@ -116,6 +121,7 @@ def test_agg_all_found_2():
|
||||
]
|
||||
|
||||
assert(to_json_list(res) == exp)
|
||||
assert(errs == {})
|
||||
|
||||
|
||||
|
||||
@ -124,7 +130,7 @@ def test_agg_dir_and_memento():
|
||||
'local': dir_loader}
|
||||
agg_source = SimpleAggregator(sources)
|
||||
|
||||
res = agg_source({'url': 'example.com/', 'param.local.coll': '*', 'closest': '20100512', 'limit': 6})
|
||||
res, errs = agg_source({'url': 'example.com/', 'param.local.coll': '*', 'closest': '20100512', 'limit': 6})
|
||||
|
||||
exp = [
|
||||
{'source': 'ia', 'timestamp': '20100513052358', 'load_url': 'http://web.archive.org/web/20100513052358id_/http://example.com/'},
|
||||
@ -136,23 +142,26 @@ def test_agg_dir_and_memento():
|
||||
]
|
||||
|
||||
assert(to_json_list(res) == exp)
|
||||
assert(errs == {})
|
||||
|
||||
|
||||
def test_agg_no_dir_1():
|
||||
res = dir_loader({'url': 'example.com/', 'param.coll': 'X'})
|
||||
res, errs = dir_loader({'url': 'example.com/', 'param.coll': 'X'})
|
||||
|
||||
exp = []
|
||||
|
||||
assert(to_json_list(res) == exp)
|
||||
assert(errs == {})
|
||||
|
||||
|
||||
def test_agg_no_dir_2():
|
||||
loader = DirectoryIndexSource(root_dir, '')
|
||||
res = loader({'url': 'example.com/', 'param.coll': 'X'})
|
||||
res, errs = loader({'url': 'example.com/', 'param.coll': 'X'})
|
||||
|
||||
exp = []
|
||||
|
||||
assert(to_json_list(res) == exp)
|
||||
assert(errs == {})
|
||||
|
||||
|
||||
def test_agg_dir_sources_1():
|
||||
|
@ -50,7 +50,7 @@ def setup_module(self):
|
||||
handler2]))
|
||||
|
||||
add_route('/empty', HandlerSeq([]))
|
||||
add_route('/invalid', HandlerSeq(['foo']))
|
||||
add_route('/invalid', DefaultResourceHandler([SimpleAggregator({'invalid': 'should not be a callable'})]))
|
||||
|
||||
application.debug = True
|
||||
global testapp
|
||||
@ -65,23 +65,49 @@ class TestResAgg(object):
|
||||
def setup(self):
|
||||
self.testapp = testapp
|
||||
|
||||
def test_list_routes(self):
|
||||
resp = self.testapp.get('/')
|
||||
res = resp.json
|
||||
assert set(res.keys()) == set(['/empty', '/empty/postreq',
|
||||
'/fallback', '/fallback/postreq',
|
||||
'/live', '/live/postreq',
|
||||
'/many', '/many/postreq',
|
||||
'/posttest', '/posttest/postreq',
|
||||
'/seq', '/seq/postreq',
|
||||
'/invalid', '/invalid/postreq'])
|
||||
|
||||
assert res['/fallback'] == {'modes': ['list_sources', 'index', 'resource']}
|
||||
|
||||
def test_list_handlers(self):
|
||||
resp = self.testapp.get('/many?mode=list_modes')
|
||||
assert resp.json == {'modes': ['list_modes', 'list_sources', 'index', 'resource']}
|
||||
resp = self.testapp.get('/many')
|
||||
assert resp.json == {'modes': ['list_sources', 'index', 'resource']}
|
||||
assert 'ResErrors' not in resp.headers
|
||||
|
||||
resp = self.testapp.get('/many?mode=other')
|
||||
assert resp.json == {'modes': ['list_modes', 'list_sources', 'index', 'resource']}
|
||||
resp = self.testapp.get('/many/other')
|
||||
assert resp.json == {'modes': ['list_sources', 'index', 'resource']}
|
||||
assert 'ResErrors' not in resp.headers
|
||||
|
||||
# defaults to resource, must specify url
|
||||
resp = self.testapp.get('/many', status=400)
|
||||
def test_list_errors(self):
|
||||
# must specify url for index or resource
|
||||
resp = self.testapp.get('/many/index', status=400)
|
||||
assert resp.json == {'message': 'The "url" param is required'}
|
||||
assert resp.text == resp.headers['ResErrors']
|
||||
|
||||
resp = self.testapp.get('/many/index', status=400)
|
||||
assert resp.json == {'message': 'The "url" param is required'}
|
||||
assert resp.text == resp.headers['ResErrors']
|
||||
|
||||
resp = self.testapp.get('/many/resource', status=400)
|
||||
assert resp.json == {'message': 'The "url" param is required'}
|
||||
assert resp.text == resp.headers['ResErrors']
|
||||
|
||||
def test_list_sources(self):
|
||||
resp = self.testapp.get('/many?mode=list_sources')
|
||||
resp = self.testapp.get('/many/list_sources')
|
||||
assert resp.json == {'sources': {'local': 'file_dir', 'ia': 'memento', 'rhiz': 'memento', 'live': 'live'}}
|
||||
assert 'ResErrors' not in resp.headers
|
||||
|
||||
def test_live_index(self):
|
||||
resp = self.testapp.get('/live?mode=index&url=http://httpbin.org/get&output=json')
|
||||
resp = self.testapp.get('/live/index?url=http://httpbin.org/get&output=json')
|
||||
resp.charset = 'utf-8'
|
||||
|
||||
res = to_json_list(resp.text)
|
||||
@ -91,7 +117,7 @@ class TestResAgg(object):
|
||||
|
||||
def test_live_resource(self):
|
||||
headers = {'foo': 'bar'}
|
||||
resp = self.testapp.get('/live?url=http://httpbin.org/get?foo=bar', headers=headers)
|
||||
resp = self.testapp.get('/live/resource?url=http://httpbin.org/get?foo=bar', headers=headers)
|
||||
|
||||
assert resp.headers['WARC-Coll'] == 'live'
|
||||
assert resp.headers['WARC-Target-URI'] == 'http://httpbin.org/get?foo=bar'
|
||||
@ -100,9 +126,10 @@ class TestResAgg(object):
|
||||
assert b'HTTP/1.1 200 OK' in resp.body
|
||||
assert b'"foo": "bar"' in resp.body
|
||||
|
||||
assert 'ResErrors' not in resp.headers
|
||||
|
||||
def test_live_post_resource(self):
|
||||
resp = self.testapp.post('/live?url=http://httpbin.org/post',
|
||||
resp = self.testapp.post('/live/resource?url=http://httpbin.org/post',
|
||||
OrderedDict([('foo', 'bar')]))
|
||||
|
||||
assert resp.headers['WARC-Coll'] == 'live'
|
||||
@ -112,38 +139,45 @@ class TestResAgg(object):
|
||||
assert b'HTTP/1.1 200 OK' in resp.body
|
||||
assert b'"foo": "bar"' in resp.body
|
||||
|
||||
assert 'ResErrors' not in resp.headers
|
||||
|
||||
def test_agg_select_mem_1(self):
|
||||
resp = self.testapp.get('/many?url=http://vvork.com/&closest=20141001')
|
||||
resp = self.testapp.get('/many/resource?url=http://vvork.com/&closest=20141001')
|
||||
|
||||
assert resp.headers['WARC-Coll'] == 'rhiz'
|
||||
assert resp.headers['WARC-Target-URI'] == 'http://www.vvork.com/'
|
||||
assert resp.headers['WARC-Date'] == '2014-10-06T18:43:57Z'
|
||||
assert b'HTTP/1.1 200 OK' in resp.body
|
||||
|
||||
assert 'ResErrors' not in resp.headers
|
||||
|
||||
def test_agg_select_mem_2(self):
|
||||
resp = self.testapp.get('/many?url=http://vvork.com/&closest=20151231')
|
||||
resp = self.testapp.get('/many/resource?url=http://vvork.com/&closest=20151231')
|
||||
|
||||
assert resp.headers['WARC-Coll'] == 'ia'
|
||||
assert resp.headers['WARC-Target-URI'] == 'http://vvork.com/'
|
||||
assert resp.headers['WARC-Date'] == '2016-01-10T13:48:55Z'
|
||||
assert b'HTTP/1.1 200 OK' in resp.body
|
||||
|
||||
assert 'ResErrors' not in resp.headers
|
||||
|
||||
def test_agg_select_live(self):
|
||||
resp = self.testapp.get('/many?url=http://vvork.com/&closest=2016')
|
||||
resp = self.testapp.get('/many/resource?url=http://vvork.com/&closest=2016')
|
||||
|
||||
assert resp.headers['WARC-Coll'] == 'live'
|
||||
assert resp.headers['WARC-Target-URI'] == 'http://vvork.com/'
|
||||
assert resp.headers['WARC-Date'] != ''
|
||||
|
||||
assert 'ResErrors' not in resp.headers
|
||||
|
||||
def test_agg_select_local(self):
|
||||
resp = self.testapp.get('/many?url=http://iana.org/&closest=20140126200624')
|
||||
resp = self.testapp.get('/many/resource?url=http://iana.org/&closest=20140126200624')
|
||||
|
||||
assert resp.headers['WARC-Coll'] == 'local'
|
||||
assert resp.headers['WARC-Target-URI'] == 'http://www.iana.org/'
|
||||
assert resp.headers['WARC-Date'] == '2014-01-26T20:06:24Z'
|
||||
|
||||
assert json.loads(resp.headers['ResErrors']) == {"rhiz": "NotFoundException('http://webenact.rhizome.org/vvork/http://iana.org/',)"}
|
||||
|
||||
def test_agg_select_local_postreq(self):
|
||||
req_data = """\
|
||||
@ -153,12 +187,13 @@ User-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (
|
||||
Host: iana.org
|
||||
"""
|
||||
|
||||
resp = self.testapp.post('/many/postreq?url=http://iana.org/&closest=20140126200624', req_data)
|
||||
resp = self.testapp.post('/many/resource/postreq?url=http://iana.org/&closest=20140126200624', req_data)
|
||||
|
||||
assert resp.headers['WARC-Coll'] == 'local'
|
||||
assert resp.headers['WARC-Target-URI'] == 'http://www.iana.org/'
|
||||
assert resp.headers['WARC-Date'] == '2014-01-26T20:06:24Z'
|
||||
|
||||
assert json.loads(resp.headers['ResErrors']) == {"rhiz": "NotFoundException('http://webenact.rhizome.org/vvork/http://iana.org/',)"}
|
||||
|
||||
def test_agg_live_postreq(self):
|
||||
req_data = """\
|
||||
@ -168,7 +203,7 @@ User-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (
|
||||
Host: httpbin.org
|
||||
"""
|
||||
|
||||
resp = self.testapp.post('/many/postreq?url=http://httpbin.org/get?foo=bar&closest=now', req_data)
|
||||
resp = self.testapp.post('/many/resource/postreq?url=http://httpbin.org/get?foo=bar&closest=now', req_data)
|
||||
|
||||
assert resp.headers['WARC-Coll'] == 'live'
|
||||
assert resp.headers['WARC-Target-URI'] == 'http://httpbin.org/get?foo=bar'
|
||||
@ -177,6 +212,8 @@ Host: httpbin.org
|
||||
assert b'HTTP/1.1 200 OK' in resp.body
|
||||
assert b'"foo": "bar"' in resp.body
|
||||
|
||||
assert json.loads(resp.headers['ResErrors']) == {"rhiz": "NotFoundException('http://webenact.rhizome.org/vvork/http://httpbin.org/get?foo=bar',)"}
|
||||
|
||||
def test_agg_post_resolve_postreq(self):
|
||||
req_data = """\
|
||||
POST /post HTTP/1.1
|
||||
@ -188,7 +225,7 @@ content-type: application/x-www-form-urlencoded
|
||||
|
||||
foo=bar&test=abc"""
|
||||
|
||||
resp = self.testapp.post('/posttest/postreq?url=http://httpbin.org/post', req_data)
|
||||
resp = self.testapp.post('/posttest/resource/postreq?url=http://httpbin.org/post', req_data)
|
||||
|
||||
assert resp.headers['WARC-Coll'] == 'post'
|
||||
assert resp.headers['WARC-Target-URI'] == 'http://httpbin.org/post'
|
||||
@ -197,10 +234,12 @@ foo=bar&test=abc"""
|
||||
assert b'"test": "abc"' in resp.body
|
||||
assert b'"url": "http://httpbin.org/post"' in resp.body
|
||||
|
||||
assert 'ResErrors' not in resp.headers
|
||||
|
||||
def test_agg_post_resolve_fallback(self):
|
||||
req_data = OrderedDict([('foo', 'bar'), ('test', 'abc')])
|
||||
|
||||
resp = self.testapp.post('/fallback?url=http://httpbin.org/post', req_data)
|
||||
resp = self.testapp.post('/fallback/resource?url=http://httpbin.org/post', req_data)
|
||||
|
||||
assert resp.headers['WARC-Coll'] == 'post'
|
||||
assert resp.headers['WARC-Target-URI'] == 'http://httpbin.org/post'
|
||||
@ -209,28 +248,37 @@ foo=bar&test=abc"""
|
||||
assert b'"test": "abc"' in resp.body
|
||||
assert b'"url": "http://httpbin.org/post"' in resp.body
|
||||
|
||||
assert 'ResErrors' not in resp.headers
|
||||
|
||||
def test_agg_seq_fallback_1(self):
|
||||
resp = self.testapp.get('/fallback?url=http://www.iana.org/')
|
||||
resp = self.testapp.get('/fallback/resource?url=http://www.iana.org/')
|
||||
|
||||
assert resp.headers['WARC-Coll'] == 'live'
|
||||
assert resp.headers['WARC-Target-URI'] == 'http://www.iana.org/'
|
||||
assert b'HTTP/1.1 200 OK' in resp.body
|
||||
|
||||
assert 'ResErrors' not in resp.headers
|
||||
|
||||
def test_agg_seq_fallback_2(self):
|
||||
resp = self.testapp.get('/fallback?url=http://www.example.com/')
|
||||
resp = self.testapp.get('/fallback/resource?url=http://www.example.com/')
|
||||
|
||||
assert resp.headers['WARC-Coll'] == 'example'
|
||||
assert resp.headers['WARC-Date'] == '2016-02-25T04:23:29Z'
|
||||
assert resp.headers['WARC-Target-URI'] == 'http://example.com/'
|
||||
assert b'HTTP/1.1 200 OK' in resp.body
|
||||
|
||||
def test_error_fallback_live_not_found(self):
|
||||
resp = self.testapp.get('/fallback?url=http://invalid.url-not-found', status=400)
|
||||
assert 'ResErrors' not in resp.headers
|
||||
|
||||
assert resp.json == {'message': 'http://invalid.url-not-found'}
|
||||
def test_error_fallback_live_not_found(self):
|
||||
resp = self.testapp.get('/fallback/resource?url=http://invalid.url-not-found', status=400)
|
||||
|
||||
assert resp.json == {'message': 'http://invalid.url-not-found',
|
||||
'errors': {'LiveWebLoader': "LiveResourceException('http://invalid.url-not-found',)"}}
|
||||
|
||||
assert resp.text == resp.headers['ResErrors']
|
||||
|
||||
def test_agg_local_revisit(self):
|
||||
resp = self.testapp.get('/many?url=http://www.example.com/&closest=20140127171251&sources=local')
|
||||
resp = self.testapp.get('/many/resource?url=http://www.example.com/&closest=20140127171251&sources=local')
|
||||
|
||||
assert resp.headers['WARC-Coll'] == 'local'
|
||||
assert resp.headers['WARC-Target-URI'] == 'http://example.com'
|
||||
@ -240,23 +288,30 @@ foo=bar&test=abc"""
|
||||
assert b'HTTP/1.1 200 OK' in resp.body
|
||||
assert b'<!doctype html>' in resp.body
|
||||
|
||||
assert 'ResErrors' not in resp.headers
|
||||
|
||||
def test_error_invalid_index_output(self):
|
||||
resp = self.testapp.get('/live?mode=index&url=http://httpbin.org/get&output=foobar', status=400)
|
||||
resp = self.testapp.get('/live/index?url=http://httpbin.org/get&output=foobar', status=400)
|
||||
|
||||
assert resp.json == {'message': 'output=foobar not supported'}
|
||||
assert resp.text == resp.headers['ResErrors']
|
||||
|
||||
def test_error_local_not_found(self):
|
||||
resp = self.testapp.get('/many?url=http://not-found.error/&sources=local', status=404)
|
||||
resp = self.testapp.get('/many/resource?url=http://not-found.error/&sources=local', status=404)
|
||||
|
||||
assert resp.json == {'message': 'No Resource Found'}
|
||||
assert resp.text == resp.headers['ResErrors']
|
||||
|
||||
def test_error_empty(self):
|
||||
resp = self.testapp.get('/empty?url=http://example.com/', status=404)
|
||||
resp = self.testapp.get('/empty/resource?url=http://example.com/', status=404)
|
||||
|
||||
assert resp.json == {'message': 'No Resource Found'}
|
||||
assert resp.text == resp.headers['ResErrors']
|
||||
|
||||
def test_error_invalid(self):
|
||||
resp = self.testapp.get('/invalid?url=http://example.com/', status=500)
|
||||
resp = self.testapp.get('/invalid/resource?url=http://example.com/', status=500)
|
||||
|
||||
assert resp.json == {'message': "Internal Error: 'list' object is not callable"}
|
||||
assert resp.text == resp.headers['ResErrors']
|
||||
|
||||
assert resp.json['message'].startswith('Internal Error')
|
||||
|
||||
|
@ -55,7 +55,7 @@ def query_single_source(source, params):
|
||||
@pytest.mark.parametrize("source", local_sources, ids=["file", "redis"])
|
||||
def test_local_cdxj_loader(source):
|
||||
url = 'http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf'
|
||||
res = query_single_source(source, dict(url=url, limit=3))
|
||||
res, errs = query_single_source(source, dict(url=url, limit=3))
|
||||
|
||||
expected = """\
|
||||
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 iana.warc.gz
|
||||
@ -63,6 +63,7 @@ org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200912 iana.warc.gz
|
||||
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200930 iana.warc.gz"""
|
||||
|
||||
assert(key_ts_res(res) == expected)
|
||||
assert(errs == {})
|
||||
|
||||
|
||||
# Closest -- Local Loaders
|
||||
@ -70,7 +71,7 @@ org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200930 iana.warc.gz"""
|
||||
@pytest.mark.parametrize("source", local_sources, ids=["file", "redis"])
|
||||
def test_local_closest_loader(source):
|
||||
url = 'http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf'
|
||||
res = query_single_source(source, dict(url=url,
|
||||
res, errs = query_single_source(source, dict(url=url,
|
||||
closest='20140126200930',
|
||||
limit=3))
|
||||
|
||||
@ -80,13 +81,14 @@ org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200912 iana.warc.gz
|
||||
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 iana.warc.gz"""
|
||||
|
||||
assert(key_ts_res(res) == expected)
|
||||
assert(errs == {})
|
||||
|
||||
|
||||
# Prefix -- Local Loaders
|
||||
# ============================================================================
|
||||
@pytest.mark.parametrize("source", local_sources, ids=["file", "redis"])
|
||||
def test_file_prefix_loader(source):
|
||||
res = query_single_source(source, dict(url='http://iana.org/domains/root/*'))
|
||||
res, errs = query_single_source(source, dict(url='http://iana.org/domains/root/*'))
|
||||
|
||||
expected = """\
|
||||
org,iana)/domains/root/db 20140126200927 iana.warc.gz
|
||||
@ -94,6 +96,7 @@ org,iana)/domains/root/db 20140126200928 iana.warc.gz
|
||||
org,iana)/domains/root/servers 20140126201227 iana.warc.gz"""
|
||||
|
||||
assert(key_ts_res(res) == expected)
|
||||
assert(errs == {})
|
||||
|
||||
|
||||
# Url Match -- Remote Loaders
|
||||
@ -101,7 +104,7 @@ org,iana)/domains/root/servers 20140126201227 iana.warc.gz"""
|
||||
@pytest.mark.parametrize("source", remote_sources, ids=["remote_cdx", "memento"])
|
||||
def test_remote_loader(source):
|
||||
url = 'http://instagram.com/amaliaulman'
|
||||
res = query_single_source(source, dict(url=url))
|
||||
res, errs = query_single_source(source, dict(url=url))
|
||||
|
||||
expected = """\
|
||||
com,instagram)/amaliaulman 20141014150552 http://webenact.rhizome.org/all/20141014150552id_/http://instagram.com/amaliaulman
|
||||
@ -110,6 +113,7 @@ com,instagram)/amaliaulman 20141014162333 http://webenact.rhizome.org/all/201410
|
||||
com,instagram)/amaliaulman 20141014171636 http://webenact.rhizome.org/all/20141014171636id_/http://instagram.com/amaliaulman"""
|
||||
|
||||
assert(key_ts_res(res, 'load_url') == expected)
|
||||
assert(errs == {})
|
||||
|
||||
|
||||
# Url Match -- Remote Loaders
|
||||
@ -117,12 +121,13 @@ com,instagram)/amaliaulman 20141014171636 http://webenact.rhizome.org/all/201410
|
||||
@pytest.mark.parametrize("source", remote_sources, ids=["remote_cdx", "memento"])
|
||||
def test_remote_closest_loader(source):
|
||||
url = 'http://instagram.com/amaliaulman'
|
||||
res = query_single_source(source, dict(url=url, closest='20141014162332', limit=1))
|
||||
res, errs = query_single_source(source, dict(url=url, closest='20141014162332', limit=1))
|
||||
|
||||
expected = """\
|
||||
com,instagram)/amaliaulman 20141014162333 http://webenact.rhizome.org/all/20141014162333id_/http://instagram.com/amaliaulman"""
|
||||
|
||||
assert(key_ts_res(res, 'load_url') == expected)
|
||||
assert(errs == {})
|
||||
|
||||
|
||||
# Url Match -- Memento
|
||||
@ -130,25 +135,26 @@ com,instagram)/amaliaulman 20141014162333 http://webenact.rhizome.org/all/201410
|
||||
@pytest.mark.parametrize("source", remote_sources, ids=["remote_cdx", "memento"])
|
||||
def test_remote_closest_loader(source):
|
||||
url = 'http://instagram.com/amaliaulman'
|
||||
res = query_single_source(source, dict(url=url, closest='20141014162332', limit=1))
|
||||
res, errs = query_single_source(source, dict(url=url, closest='20141014162332', limit=1))
|
||||
|
||||
expected = """\
|
||||
com,instagram)/amaliaulman 20141014162333 http://webenact.rhizome.org/all/20141014162333id_/http://instagram.com/amaliaulman"""
|
||||
|
||||
assert(key_ts_res(res, 'load_url') == expected)
|
||||
assert(errs == {})
|
||||
|
||||
|
||||
# Live Index -- No Load!
|
||||
# ============================================================================
|
||||
def test_live():
|
||||
url = 'http://example.com/'
|
||||
source = LiveIndexSource()
|
||||
res = query_single_source(source, dict(url=url))
|
||||
res, errs = query_single_source(source, dict(url=url))
|
||||
|
||||
expected = 'com,example)/ {0} http://example.com/'.format(timestamp_now())
|
||||
|
||||
assert(key_ts_res(res, 'load_url') == expected)
|
||||
|
||||
|
||||
assert(errs == {})
|
||||
|
||||
|
||||
# Errors -- Not Found All
|
||||
@ -156,31 +162,36 @@ def test_live():
|
||||
@pytest.mark.parametrize("source", local_sources + remote_sources, ids=["file", "redis", "remote_cdx", "memento"])
|
||||
def test_all_not_found(source):
|
||||
url = 'http://x-not-found-x.notfound/'
|
||||
res = query_single_source(source, dict(url=url, limit=3))
|
||||
res, errs = query_single_source(source, dict(url=url, limit=3))
|
||||
|
||||
expected = ''
|
||||
assert(key_ts_res(res) == expected)
|
||||
if source == remote_sources[0]:
|
||||
assert('http://x-not-found-x.notfound/' in errs['source'])
|
||||
else:
|
||||
assert(errs == {})
|
||||
|
||||
|
||||
# ============================================================================
|
||||
def test_another_remote_not_found():
|
||||
source = MementoIndexSource.from_timegate_url('http://www.webarchive.org.uk/wayback/archive/')
|
||||
url = 'http://x-not-found-x.notfound/'
|
||||
res = query_single_source(source, dict(url=url, limit=3))
|
||||
res, errs = query_single_source(source, dict(url=url, limit=3))
|
||||
|
||||
|
||||
expected = ''
|
||||
assert(key_ts_res(res) == expected)
|
||||
|
||||
assert(errs['source'] == "NotFoundException('http://www.webarchive.org.uk/wayback/archive/timemap/link/http://x-not-found-x.notfound/',)")
|
||||
|
||||
# ============================================================================
|
||||
def test_file_not_found():
|
||||
source = FileIndexSource('testdata/not-found-x')
|
||||
url = 'http://x-not-found-x.notfound/'
|
||||
res = query_single_source(source, dict(url=url, limit=3))
|
||||
res, errs = query_single_source(source, dict(url=url, limit=3))
|
||||
|
||||
expected = ''
|
||||
assert(key_ts_res(res) == expected)
|
||||
assert(errs['source'] == "NotFoundException('testdata/not-found-x',)"), errs
|
||||
|
||||
|
||||
# ============================================================================
|
||||
@ -188,7 +199,7 @@ def test_ait_filters():
|
||||
ait_source = RemoteIndexSource('http://wayback.archive-it.org/cdx/search/cdx?url={url}&filter=filename:ARCHIVEIT-({colls})-.*',
|
||||
'http://wayback.archive-it.org/all/{timestamp}id_/{url}')
|
||||
|
||||
cdxlist = query_single_source(ait_source, {'url': 'http://iana.org/', 'param.source.colls': '5610|933'})
|
||||
cdxlist, errs = query_single_source(ait_source, {'url': 'http://iana.org/', 'param.source.colls': '5610|933'})
|
||||
filenames = [cdx['filename'] for cdx in cdxlist]
|
||||
|
||||
prefix = ('ARCHIVEIT-5610-', 'ARCHIVEIT-933-')
|
||||
@ -196,7 +207,7 @@ def test_ait_filters():
|
||||
assert(all([x.startswith(prefix) for x in filenames]))
|
||||
|
||||
|
||||
cdxlist = query_single_source(ait_source, {'url': 'http://iana.org/', 'param.source.colls': '1883|366|905'})
|
||||
cdxlist, errs = query_single_source(ait_source, {'url': 'http://iana.org/', 'param.source.colls': '1883|366|905'})
|
||||
filenames = [cdx['filename'] for cdx in cdxlist]
|
||||
|
||||
prefix = ('ARCHIVEIT-1883-', 'ARCHIVEIT-366-', 'ARCHIVEIT-905-')
|
||||
|
@ -1,13 +1,14 @@
|
||||
from gevent import monkey; monkey.patch_all(thread=False)
|
||||
|
||||
from rezag.aggindexsource import SimpleAggregator, GeventTimeoutAggregator
|
||||
from rezag.aggindexsource import ThreadedTimeoutAggregator
|
||||
from rezag.aggindexsource import ThreadedTimeoutAggregator, BaseAggregator
|
||||
|
||||
from rezag.indexsource import FileIndexSource, RemoteIndexSource, MementoIndexSource
|
||||
from .testutils import json_list, to_path
|
||||
|
||||
import json
|
||||
import pytest
|
||||
import time
|
||||
|
||||
from rezag.handlers import IndexHandler
|
||||
|
||||
@ -27,6 +28,10 @@ aggs = {'simple': SimpleAggregator(sources),
|
||||
'processes': ThreadedTimeoutAggregator(sources, timeout=5.0, use_processes=True),
|
||||
}
|
||||
|
||||
agg_tm = {'gevent': GeventTimeoutAggregator(sources, timeout=0.0),
|
||||
'threaded': ThreadedTimeoutAggregator(sources, timeout=0.0),
|
||||
'processes': ThreadedTimeoutAggregator(sources, timeout=0.0, use_processes=True)}
|
||||
|
||||
nf = {'notfound': FileIndexSource(to_path('testdata/not-found-x'))}
|
||||
agg_nf = {'simple': SimpleAggregator(nf),
|
||||
'gevent': GeventTimeoutAggregator(nf, timeout=5.0),
|
||||
@ -41,7 +46,7 @@ agg_nf = {'simple': SimpleAggregator(nf),
|
||||
@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
|
||||
def test_mem_agg_index_1(agg):
|
||||
url = 'http://iana.org/'
|
||||
res = agg(dict(url=url, closest='20140126000000', limit=5))
|
||||
res, errs = agg(dict(url=url, closest='20140126000000', limit=5))
|
||||
|
||||
|
||||
exp = [{"timestamp": "20140126093743", "load_url": "http://web.archive.org/web/20140126093743id_/http://iana.org/", "source": "ia"},
|
||||
@ -52,12 +57,13 @@ def test_mem_agg_index_1(agg):
|
||||
]
|
||||
|
||||
assert(json_list(res) == exp)
|
||||
|
||||
assert(errs == {'bl': "NotFoundException('http://www.webarchive.org.uk/wayback/archive/http://iana.org/',)",
|
||||
'rhiz': "NotFoundException('http://webenact.rhizome.org/vvork/http://iana.org/',)"})
|
||||
|
||||
@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
|
||||
def test_mem_agg_index_2(agg):
|
||||
url = 'http://example.com/'
|
||||
res = agg(dict(url=url, closest='20100512', limit=6))
|
||||
res, errs = agg(dict(url=url, closest='20100512', limit=6))
|
||||
|
||||
exp = [{"timestamp": "20100513010014", "load_url": "http://www.webarchive.org.uk/wayback/archive/20100513010014id_/http://example.com/", "source": "bl"},
|
||||
{"timestamp": "20100512204410", "load_url": "http://www.webarchive.org.uk/wayback/archive/20100512204410id_/http://example.com/", "source": "bl"},
|
||||
@ -67,12 +73,13 @@ def test_mem_agg_index_2(agg):
|
||||
{"timestamp": "20100514231857", "load_url": "http://web.archive.org/web/20100514231857id_/http://example.com/", "source": "ia"}]
|
||||
|
||||
assert(json_list(res) == exp)
|
||||
assert(errs == {'rhiz': "NotFoundException('http://webenact.rhizome.org/vvork/http://example.com/',)"})
|
||||
|
||||
|
||||
@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
|
||||
def test_mem_agg_index_3(agg):
|
||||
url = 'http://vvork.com/'
|
||||
res = agg(dict(url=url, closest='20141001', limit=5))
|
||||
res, errs = agg(dict(url=url, closest='20141001', limit=5))
|
||||
|
||||
exp = [{"timestamp": "20141006184357", "load_url": "http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"},
|
||||
{"timestamp": "20141018133107", "load_url": "http://web.archive.org/web/20141018133107id_/http://vvork.com/", "source": "ia"},
|
||||
@ -81,32 +88,53 @@ def test_mem_agg_index_3(agg):
|
||||
{"timestamp": "20131004231540", "load_url": "http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/", "source": "ait"}]
|
||||
|
||||
assert(json_list(res) == exp)
|
||||
assert(errs == {})
|
||||
|
||||
|
||||
@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
|
||||
def test_mem_agg_index_4(agg):
|
||||
url = 'http://vvork.com/'
|
||||
res = agg(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait'))
|
||||
res, errs = agg(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait'))
|
||||
|
||||
exp = [{"timestamp": "20141006184357", "load_url": "http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"},
|
||||
{"timestamp": "20131004231540", "load_url": "http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/", "source": "ait"}]
|
||||
|
||||
assert(json_list(res) == exp)
|
||||
assert(errs == {})
|
||||
|
||||
|
||||
@pytest.mark.parametrize("agg", list(agg_nf.values()), ids=list(agg_nf.keys()))
|
||||
def test_mem_agg_not_found(agg):
|
||||
url = 'http://vvork.com/'
|
||||
res = agg(dict(url=url, closest='20141001', limit=2))
|
||||
res, errs = agg(dict(url=url, closest='20141001', limit=2))
|
||||
|
||||
assert(json_list(res) == [])
|
||||
assert(errs == {'notfound': "NotFoundException('testdata/not-found-x',)"})
|
||||
|
||||
|
||||
@pytest.mark.parametrize("agg", list(agg_tm.values()), ids=list(agg_tm.keys()))
|
||||
def test_mem_agg_timeout(agg):
|
||||
url = 'http://vvork.com/'
|
||||
|
||||
orig_source = BaseAggregator.load_child_source
|
||||
def load_child_source(self, name, source, params):
|
||||
time.sleep(0.1)
|
||||
return orig_source(name, source, params)
|
||||
|
||||
BaseAggregator.load_child_source = load_child_source
|
||||
res, errs = agg(dict(url=url, closest='20141001', limit=2))
|
||||
BaseAggregator.load_child_source = orig_source
|
||||
|
||||
assert(json_list(res) == [])
|
||||
assert(errs == {'local': 'timeout',
|
||||
'ait': 'timeout', 'bl': 'timeout', 'ia': 'timeout', 'rhiz': 'timeout'})
|
||||
|
||||
|
||||
def test_handler_output_cdxj():
|
||||
agg = GeventTimeoutAggregator(sources, timeout=5.0)
|
||||
handler = IndexHandler(agg)
|
||||
url = 'http://vvork.com/'
|
||||
res = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait'))
|
||||
res, errs = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait'))
|
||||
|
||||
exp = """\
|
||||
com,vvork)/ 20141006184357 {"url": "http://www.vvork.com/", "mem_rel": "memento", "memento_url": "http://webenact.rhizome.org/vvork/20141006184357/http://www.vvork.com/", "load_url": "http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"}
|
||||
@ -114,13 +142,14 @@ com,vvork)/ 20131004231540 {"url": "http://vvork.com/", "mem_rel": "last memento
|
||||
"""
|
||||
|
||||
assert(''.join(res) == exp)
|
||||
assert(errs == {})
|
||||
|
||||
|
||||
def test_handler_output_json():
|
||||
agg = GeventTimeoutAggregator(sources, timeout=5.0)
|
||||
handler = IndexHandler(agg)
|
||||
url = 'http://vvork.com/'
|
||||
res = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='json'))
|
||||
res, errs = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='json'))
|
||||
|
||||
exp = """\
|
||||
{"urlkey": "com,vvork)/", "timestamp": "20141006184357", "url": "http://www.vvork.com/", "mem_rel": "memento", "memento_url": "http://webenact.rhizome.org/vvork/20141006184357/http://www.vvork.com/", "load_url": "http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"}
|
||||
@ -128,26 +157,27 @@ def test_handler_output_json():
|
||||
"""
|
||||
|
||||
assert(''.join(res) == exp)
|
||||
|
||||
assert(errs == {})
|
||||
|
||||
def test_handler_output_link():
|
||||
agg = GeventTimeoutAggregator(sources, timeout=5.0)
|
||||
handler = IndexHandler(agg)
|
||||
url = 'http://vvork.com/'
|
||||
res = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='link'))
|
||||
res, errs = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='link'))
|
||||
|
||||
exp = """\
|
||||
<http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/>; rel="memento"; datetime="Mon, 06 Oct 2014 18:43:57 GMT"; src="rhiz",
|
||||
<http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/>; rel="memento"; datetime="Fri, 04 Oct 2013 23:15:40 GMT"; src="ait"
|
||||
"""
|
||||
assert(''.join(res) == exp)
|
||||
assert(errs == {})
|
||||
|
||||
|
||||
def test_handler_output_link_2():
|
||||
agg = GeventTimeoutAggregator(sources, timeout=5.0)
|
||||
handler = IndexHandler(agg)
|
||||
url = 'http://iana.org/'
|
||||
res = handler(dict(url=url, closest='20140126000000', limit=5, output='link'))
|
||||
res, errs = handler(dict(url=url, closest='20140126000000', limit=5, output='link'))
|
||||
|
||||
exp = """\
|
||||
<http://web.archive.org/web/20140126093743id_/http://iana.org/>; rel="memento"; datetime="Sun, 26 Jan 2014 09:37:43 GMT"; src="ia",
|
||||
@ -158,38 +188,54 @@ def test_handler_output_link_2():
|
||||
"""
|
||||
assert(''.join(res) == exp)
|
||||
|
||||
exp_errs = {'bl': "NotFoundException('http://www.webarchive.org.uk/wayback/archive/http://iana.org/',)",
|
||||
'rhiz': "NotFoundException('http://webenact.rhizome.org/vvork/http://iana.org/',)"}
|
||||
|
||||
assert(errs == exp_errs)
|
||||
|
||||
|
||||
|
||||
def test_handler_output_link_3():
|
||||
agg = GeventTimeoutAggregator(sources, timeout=5.0)
|
||||
handler = IndexHandler(agg)
|
||||
url = 'http://foo.bar.non-existent'
|
||||
res = handler(dict(url=url, closest='20140126000000', limit=5, output='link'))
|
||||
res, errs = handler(dict(url=url, closest='20140126000000', limit=5, output='link'))
|
||||
|
||||
exp = ''
|
||||
|
||||
assert(''.join(res) == exp)
|
||||
|
||||
exp_errs = {'ait': "NotFoundException('http://wayback.archive-it.org/all/http://foo.bar.non-existent',)",
|
||||
'bl': "NotFoundException('http://www.webarchive.org.uk/wayback/archive/http://foo.bar.non-existent',)",
|
||||
'ia': "NotFoundException('http://web.archive.org/web/http://foo.bar.non-existent',)",
|
||||
'rhiz': "NotFoundException('http://webenact.rhizome.org/vvork/http://foo.bar.non-existent',)"}
|
||||
|
||||
assert(errs == exp_errs)
|
||||
|
||||
def test_handler_output_text():
|
||||
agg = GeventTimeoutAggregator(sources, timeout=5.0)
|
||||
handler = IndexHandler(agg)
|
||||
url = 'http://vvork.com/'
|
||||
res = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='text'))
|
||||
res, errs = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='text'))
|
||||
|
||||
exp = """\
|
||||
com,vvork)/ 20141006184357 http://www.vvork.com/ memento http://webenact.rhizome.org/vvork/20141006184357/http://www.vvork.com/ http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/ rhiz
|
||||
com,vvork)/ 20131004231540 http://vvork.com/ last memento http://wayback.archive-it.org/all/20131004231540/http://vvork.com/ http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/ ait
|
||||
"""
|
||||
assert(''.join(res) == exp)
|
||||
assert(errs == {})
|
||||
|
||||
|
||||
def test_handler_list_sources():
|
||||
agg = GeventTimeoutAggregator(sources, timeout=5.0)
|
||||
handler = IndexHandler(agg)
|
||||
res = handler(dict(mode='list_sources'))
|
||||
res, errs = handler(dict(mode='list_sources'))
|
||||
|
||||
assert(res == {'sources': {'bl': 'memento',
|
||||
'ait': 'memento',
|
||||
'ia': 'memento',
|
||||
'rhiz': 'memento',
|
||||
'local': 'file'}})
|
||||
assert(errs == {})
|
||||
|
||||
|
||||
|
@ -35,7 +35,7 @@ def setup_module():
|
||||
def test_timeout_long_all_pass():
|
||||
agg = TimeoutAggregator(sources, timeout=1.0)
|
||||
|
||||
res = agg(dict(url='http://example.com/'))
|
||||
res, errs = agg(dict(url='http://example.com/'))
|
||||
|
||||
exp = [{'source': 'slower', 'timestamp': '20140127171200'},
|
||||
{'source': 'slower', 'timestamp': '20140127171251'},
|
||||
@ -43,27 +43,31 @@ def test_timeout_long_all_pass():
|
||||
|
||||
assert(json_list(res, fields=['source', 'timestamp']) == exp)
|
||||
|
||||
assert(errs == {})
|
||||
|
||||
|
||||
def test_timeout_slower_skipped_1():
|
||||
agg = GeventTimeoutAggregator(sources, timeout=0.49)
|
||||
|
||||
res = agg(dict(url='http://example.com/'))
|
||||
res, errs = agg(dict(url='http://example.com/'))
|
||||
|
||||
exp = [{'source': 'slow', 'timestamp': '20160225042329'}]
|
||||
|
||||
assert(json_list(res, fields=['source', 'timestamp']) == exp)
|
||||
|
||||
assert(errs == {'slower': 'timeout'})
|
||||
|
||||
|
||||
def test_timeout_slower_skipped_2():
|
||||
agg = GeventTimeoutAggregator(sources, timeout=0.19)
|
||||
|
||||
res = agg(dict(url='http://example.com/'))
|
||||
res, errs = agg(dict(url='http://example.com/'))
|
||||
|
||||
exp = []
|
||||
|
||||
assert(json_list(res, fields=['source', 'timestamp']) == exp)
|
||||
|
||||
assert(errs == {'slower': 'timeout', 'slow': 'timeout'})
|
||||
|
||||
|
||||
def test_timeout_skipping():
|
||||
@ -75,31 +79,40 @@ def test_timeout_skipping():
|
||||
|
||||
exp = [{'source': 'slow', 'timestamp': '20160225042329'}]
|
||||
|
||||
res = agg(dict(url='http://example.com/'))
|
||||
res, errs = agg(dict(url='http://example.com/'))
|
||||
assert(json_list(res, fields=['source', 'timestamp']) == exp)
|
||||
assert(sources['slow'].calls == 4)
|
||||
assert(sources['slower'].calls == 4)
|
||||
|
||||
res = agg(dict(url='http://example.com/'))
|
||||
assert(errs == {'slower': 'timeout'})
|
||||
|
||||
res, errs = agg(dict(url='http://example.com/'))
|
||||
assert(json_list(res, fields=['source', 'timestamp']) == exp)
|
||||
assert(sources['slow'].calls == 5)
|
||||
assert(sources['slower'].calls == 5)
|
||||
|
||||
res = agg(dict(url='http://example.com/'))
|
||||
assert(errs == {'slower': 'timeout'})
|
||||
|
||||
res, errs = agg(dict(url='http://example.com/'))
|
||||
assert(json_list(res, fields=['source', 'timestamp']) == exp)
|
||||
assert(sources['slow'].calls == 6)
|
||||
assert(sources['slower'].calls == 5)
|
||||
|
||||
res = agg(dict(url='http://example.com/'))
|
||||
assert(errs == {})
|
||||
|
||||
res, errs = agg(dict(url='http://example.com/'))
|
||||
assert(json_list(res, fields=['source', 'timestamp']) == exp)
|
||||
assert(sources['slow'].calls == 7)
|
||||
assert(sources['slower'].calls == 5)
|
||||
|
||||
assert(errs == {})
|
||||
|
||||
time.sleep(2.01)
|
||||
|
||||
res = agg(dict(url='http://example.com/'))
|
||||
res, errs = agg(dict(url='http://example.com/'))
|
||||
assert(json_list(res, fields=['source', 'timestamp']) == exp)
|
||||
assert(sources['slow'].calls == 8)
|
||||
assert(sources['slower'].calls == 6)
|
||||
|
||||
assert(errs == {'slower': 'timeout'})
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user