1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 08:04:49 +01:00

Errors and timeouts are reported back to the user via the ResErrors header

Add new /index and /resource access point system
This commit is contained in:
Ilya Kreymer 2016-03-02 18:13:13 -08:00
parent 1f3763d02c
commit 65e969a492
10 changed files with 333 additions and 143 deletions

View File

@ -13,9 +13,10 @@ from pywb.cdx.query import CDXQuery
from heapq import merge
from collections import deque
from itertools import chain
from rezag.indexsource import FileIndexSource
from pywb.utils.wbexception import NotFoundException
from pywb.utils.wbexception import NotFoundException, WbException
import six
import glob
@ -29,13 +30,10 @@ class BaseAggregator(object):
query = CDXQuery(params)
self._set_src_params(params)
try:
cdx_iter = self.load_index(query.params)
except NotFoundException as nf:
cdx_iter = iter([])
cdx_iter, errs = self.load_index(query.params)
cdx_iter = process_cdx(cdx_iter, query)
return cdx_iter
return cdx_iter, dict(errs)
def _set_src_params(self, params):
src_params = {}
@ -60,16 +58,23 @@ class BaseAggregator(object):
params['_all_src_params'] = src_params
def load_child_source_list(self, name, source, params):
return list(self.load_child_source(name, source, params))
res = self.load_child_source(name, source, params)
return list(res[0]), res[1]
def load_child_source(self, name, source, params):
try:
_src_params = params['_all_src_params'].get(name)
params['_src_params'] = _src_params
cdx_iter = source.load_index(params)
except NotFoundException as nf:
print('Not found in ' + name)
res = source.load_index(params)
if isinstance(res, tuple):
cdx_iter, err_list = res
else:
cdx_iter = res
err_list = []
except WbException as wbe:
#print('Not found in ' + name)
cdx_iter = iter([])
err_list = [(name, repr(wbe))]
def add_name(cdx):
if cdx.get('source'):
@ -78,10 +83,13 @@ class BaseAggregator(object):
cdx['source'] = name
return cdx
return (add_name(cdx) for cdx in cdx_iter)
return (add_name(cdx) for cdx in cdx_iter), err_list
def load_index(self, params):
iter_list = self._load_all(params)
res_list = self._load_all(params)
iter_list = [res[0] for res in res_list]
err_list = chain(*[res[1] for res in res_list])
#optimization: if only a single entry (or empty) just load directly
if len(iter_list) <= 1:
@ -89,7 +97,7 @@ class BaseAggregator(object):
else:
cdx_iter = merge(*(iter_list))
return cdx_iter
return cdx_iter, err_list
def _on_source_error(self, name): #pragma: no cover
pass
@ -207,6 +215,7 @@ class GeventMixin(object):
if job.value is not None:
results.append(job.value)
else:
results.append((iter([]), [(name, 'timeout')]))
self._on_source_error(name)
return results
@ -247,7 +256,9 @@ class ConcurrentMixin(object):
results.append(job.result())
for job in res_not_done:
self._on_source_error(jobs[job])
name = jobs[job]
results.append((iter([]), [(name, 'timeout')]))
self._on_source_error(name)
return results

View File

@ -2,25 +2,49 @@ from rezag.inputrequest import DirectWSGIInputRequest, POSTInputRequest
from bottle import route, request, response, default_app, abort
import bottle
from pywb.utils.wbexception import WbException
import traceback
import json
JSON_CT = 'application/json; charset=utf-8'
def err_handler(exc):
response.status = exc.status_code
response.content_type = 'application/json'
return json.dumps({'message': exc.body})
response.content_type = JSON_CT
err_msg = json.dumps({'message': exc.body})
response.headers['ResErrors'] = err_msg
return err_msg
def wrap_error(func):
def wrap_func(*args, **kwargs):
try:
return func(*args, **kwargs)
except WbException as exc:
if bottle.debug:
traceback.print_exc()
abort(exc.status(), exc.msg)
res, errs = func(*args, **kwargs)
if res:
if errs:
response.headers['ResErrors'] = json.dumps(errs)
return res
last_exc = errs.pop('last_exc', None)
if last_exc:
if bottle.debug:
traceback.print_exc()
response.status = last_exc.status()
message = last_exc.msg
else:
response.status = 404
message = 'No Resource Found'
response.content_type = JSON_CT
res = {'message': message}
if errs:
res['errors'] = errs
err_msg = json.dumps(res)
response.headers['ResErrors'] = err_msg
return err_msg
except Exception as e:
if bottle.debug:
traceback.print_exc()
@ -32,35 +56,33 @@ def wrap_error(func):
route_dict = {}
def add_route(path, handler):
@route(path, 'ANY')
@route([path, path + '/<mode:path>'], 'ANY')
@wrap_error
def direct_input_request():
def direct_input_request(mode=''):
params = dict(request.query)
params['mode'] = mode
params['_input_req'] = DirectWSGIInputRequest(request.environ)
return handler(params)
@route(path + '/postreq', 'POST')
@route([path + '/postreq', path + '/<mode:path>/postreq'], 'POST')
@wrap_error
def post_fullrequest():
def post_fullrequest(mode=''):
params = dict(request.query)
params['mode'] = mode
params['_input_req'] = POSTInputRequest(request.environ)
return handler(params)
global route_dict
handler_dict = {'handler': handler.get_supported_modes()}
handler_dict = handler.get_supported_modes()
route_dict[path] = handler_dict
route_dict[path + '/postreq'] = handler_dict
@route('/')
def list_routes():
return route_dict
application = default_app()
application.default_error_handler = err_handler

View File

@ -39,12 +39,13 @@ class IndexHandler(object):
self.opts = opts or {}
def get_supported_modes(self):
return dict(modes=['list_modes', 'list_sources', 'index'])
return dict(modes=['list_sources', 'index'])
def _load_index_source(self, params):
url = params.get('url')
if not url:
raise BadRequestException('The "url" param is required')
errs = dict(last_exc=BadRequestException('The "url" param is required'))
return None, errs
input_req = params.get('_input_req')
if input_req:
@ -55,21 +56,25 @@ class IndexHandler(object):
def __call__(self, params):
mode = params.get('mode', 'index')
if mode == 'list_sources':
return self.index_source.get_source_list(params)
return self.index_source.get_source_list(params), {}
if mode == 'list_modes' or mode != 'index':
return self.get_supported_modes()
if mode != 'index':
return self.get_supported_modes(), {}
output = params.get('output', self.DEF_OUTPUT)
fields = params.get('fields')
handler = self.OUTPUTS.get(output)
if not handler:
raise BadRequestException('output={0} not supported'.format(output))
errs = dict(last_exc=BadRequestException('output={0} not supported'.format(output)))
return None, errs
cdx_iter, errs = self._load_index_source(params)
if not cdx_iter:
return None, errs
cdx_iter = self._load_index_source(params)
res = handler(cdx_iter, fields)
return res
return res, errs
#=============================================================================
@ -87,7 +92,10 @@ class ResourceHandler(IndexHandler):
if params.get('mode', 'resource') != 'resource':
return super(ResourceHandler, self).__call__(params)
cdx_iter = self._load_index_source(params)
cdx_iter, errs = self._load_index_source(params)
if not cdx_iter:
return None, errs
last_exc = None
for cdx in cdx_iter:
@ -95,15 +103,15 @@ class ResourceHandler(IndexHandler):
try:
resp = loader(cdx, params)
if resp is not None:
return resp
return resp, errs
except WbException as e:
last_exc = e
errs[str(loader)] = repr(e)
if last_exc:
raise last_exc
#raise ArchiveLoadFailed('Resource Found, could not be Loaded')
else:
raise NotFoundException('No Resource Found')
errs['last_exc'] = last_exc
return None, errs
#=============================================================================
@ -121,20 +129,19 @@ class HandlerSeq(object):
self.handlers = handlers
def get_supported_modes(self):
return []
# return zip([self.handlers.get_supported_modes()]
if self.handlers:
return self.handlers[0].get_supported_modes()
else:
return {}
def __call__(self, params):
last_exc = None
all_errs = {}
for handler in self.handlers:
try:
res = handler(params)
if res is not None:
return res
except WbException as e:
last_exc = e
res, errs = handler(params)
all_errs.update(errs)
if res is not None:
return res, all_errs
return None, all_errs
if last_exc:
raise last_exc
else:
raise NotFoundException('No Resource Found')

View File

@ -58,6 +58,10 @@ class WARCPathLoader(object):
no_record_parse=True)
self.cdx_source = cdx_source
def cdx_index_source(self, *args, **kwargs):
cdx_iter, errs = self.cdx_source(*args, **kwargs)
return cdx_iter
def warc_paths(self):
for path in self.paths:
def check(filename, cdx):
@ -83,7 +87,7 @@ class WARCPathLoader(object):
headers, payload = (self.resolve_loader.
load_headers_and_payload(cdx,
failed_files,
self.cdx_source))
self.cdx_index_source))
record = payload
@ -102,6 +106,9 @@ class WARCPathLoader(object):
res = StreamIter(record.stream)
return res
def __str__(self):
return 'WARCPathLoader'
#=============================================================================
class HeaderRecorder(BaseRecorder):
@ -200,3 +207,7 @@ class LiveWebLoader(object):
if not id_:
id_ = uuid.uuid1()
return '<urn:uuid:{0}>'.format(id_)
def __str__(self):
return 'LiveWebLoader'

View File

@ -2,7 +2,7 @@ import re
import six
from pywb.utils.timeutils import timestamp_to_http_date
from pywb.utils.wbexception import BadRequestException
LINK_SPLIT = re.compile(',\s*(?=[<])')
LINK_SEG_SPLIT = re.compile(';\s*')
@ -10,6 +10,11 @@ LINK_URL = re.compile('<(.*)>')
LINK_PROP = re.compile('([\w]+)="([^"]+)')
#=================================================================
class MementoException(BadRequestException):
pass
#=================================================================
class MementoUtils(object):
@staticmethod
@ -22,7 +27,7 @@ class MementoUtils(object):
props = LINK_SEG_SPLIT.split(link)
m = LINK_URL.match(props[0])
if not m:
raise Exception('Invalid Link Url: ' + props[0])
raise MementoException('Invalid Link Url: ' + props[0])
result = dict(url=m.group(1))
key = ''
@ -31,7 +36,7 @@ class MementoUtils(object):
for prop in props[1:]:
m = LINK_PROP.match(prop)
if not m:
raise Exception('Invalid prop ' + prop)
raise MementoException('Invalid prop ' + prop)
name = m.group(1)
value = m.group(2)

View File

@ -59,43 +59,47 @@ def to_json_list(cdxlist, fields=['timestamp', 'load_url', 'filename', 'source']
def test_agg_no_coll_set():
res = dir_loader(dict(url='example.com/'))
res, errs = dir_loader(dict(url='example.com/'))
assert(to_json_list(res) == [])
assert(errs == {})
def test_agg_collA_found():
res = dir_loader({'url': 'example.com/', 'param.coll': 'A'})
res, errs = dir_loader({'url': 'example.com/', 'param.coll': 'A'})
exp = [{'source': 'colls/A/indexes', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'}]
assert(to_json_list(res) == exp)
assert(errs == {})
def test_agg_collB():
res = dir_loader({'url': 'example.com/', 'param.coll': 'B'})
res, errs = dir_loader({'url': 'example.com/', 'param.coll': 'B'})
exp = []
assert(to_json_list(res) == exp)
assert(errs == {})
def test_agg_collB_found():
res = dir_loader({'url': 'iana.org/', 'param.coll': 'B'})
res, errs = dir_loader({'url': 'iana.org/', 'param.coll': 'B'})
exp = [{'source': 'colls/B/indexes', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}]
assert(to_json_list(res) == exp)
assert(errs == {})
def test_extra_agg_collB():
agg_source = SimpleAggregator({'dir': dir_loader})
res = agg_source({'url': 'iana.org/', 'param.coll': 'B'})
res, errs = agg_source({'url': 'iana.org/', 'param.coll': 'B'})
exp = [{'source': 'dir:colls/B/indexes', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}]
assert(to_json_list(res) == exp)
assert(errs == {})
def test_agg_all_found_1():
res = dir_loader({'url': 'iana.org/', 'param.coll': '*'})
res, errs = dir_loader({'url': 'iana.org/', 'param.coll': '*'})
exp = [
{'source': 'colls/B/indexes', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'},
@ -104,10 +108,11 @@ def test_agg_all_found_1():
]
assert(to_json_list(res) == exp)
assert(errs == {})
def test_agg_all_found_2():
res = dir_loader({'url': 'example.com/', 'param.coll': '*'})
res, errs = dir_loader({'url': 'example.com/', 'param.coll': '*'})
exp = [
{'source': 'colls/C/indexes', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
@ -116,6 +121,7 @@ def test_agg_all_found_2():
]
assert(to_json_list(res) == exp)
assert(errs == {})
@ -124,7 +130,7 @@ def test_agg_dir_and_memento():
'local': dir_loader}
agg_source = SimpleAggregator(sources)
res = agg_source({'url': 'example.com/', 'param.local.coll': '*', 'closest': '20100512', 'limit': 6})
res, errs = agg_source({'url': 'example.com/', 'param.local.coll': '*', 'closest': '20100512', 'limit': 6})
exp = [
{'source': 'ia', 'timestamp': '20100513052358', 'load_url': 'http://web.archive.org/web/20100513052358id_/http://example.com/'},
@ -136,23 +142,26 @@ def test_agg_dir_and_memento():
]
assert(to_json_list(res) == exp)
assert(errs == {})
def test_agg_no_dir_1():
res = dir_loader({'url': 'example.com/', 'param.coll': 'X'})
res, errs = dir_loader({'url': 'example.com/', 'param.coll': 'X'})
exp = []
assert(to_json_list(res) == exp)
assert(errs == {})
def test_agg_no_dir_2():
loader = DirectoryIndexSource(root_dir, '')
res = loader({'url': 'example.com/', 'param.coll': 'X'})
res, errs = loader({'url': 'example.com/', 'param.coll': 'X'})
exp = []
assert(to_json_list(res) == exp)
assert(errs == {})
def test_agg_dir_sources_1():

View File

@ -50,7 +50,7 @@ def setup_module(self):
handler2]))
add_route('/empty', HandlerSeq([]))
add_route('/invalid', HandlerSeq(['foo']))
add_route('/invalid', DefaultResourceHandler([SimpleAggregator({'invalid': 'should not be a callable'})]))
application.debug = True
global testapp
@ -65,23 +65,49 @@ class TestResAgg(object):
def setup(self):
self.testapp = testapp
def test_list_routes(self):
resp = self.testapp.get('/')
res = resp.json
assert set(res.keys()) == set(['/empty', '/empty/postreq',
'/fallback', '/fallback/postreq',
'/live', '/live/postreq',
'/many', '/many/postreq',
'/posttest', '/posttest/postreq',
'/seq', '/seq/postreq',
'/invalid', '/invalid/postreq'])
assert res['/fallback'] == {'modes': ['list_sources', 'index', 'resource']}
def test_list_handlers(self):
resp = self.testapp.get('/many?mode=list_modes')
assert resp.json == {'modes': ['list_modes', 'list_sources', 'index', 'resource']}
resp = self.testapp.get('/many')
assert resp.json == {'modes': ['list_sources', 'index', 'resource']}
assert 'ResErrors' not in resp.headers
resp = self.testapp.get('/many?mode=other')
assert resp.json == {'modes': ['list_modes', 'list_sources', 'index', 'resource']}
resp = self.testapp.get('/many/other')
assert resp.json == {'modes': ['list_sources', 'index', 'resource']}
assert 'ResErrors' not in resp.headers
# defaults to resource, must specify url
resp = self.testapp.get('/many', status=400)
def test_list_errors(self):
# must specify url for index or resource
resp = self.testapp.get('/many/index', status=400)
assert resp.json == {'message': 'The "url" param is required'}
assert resp.text == resp.headers['ResErrors']
resp = self.testapp.get('/many/index', status=400)
assert resp.json == {'message': 'The "url" param is required'}
assert resp.text == resp.headers['ResErrors']
resp = self.testapp.get('/many/resource', status=400)
assert resp.json == {'message': 'The "url" param is required'}
assert resp.text == resp.headers['ResErrors']
def test_list_sources(self):
resp = self.testapp.get('/many?mode=list_sources')
resp = self.testapp.get('/many/list_sources')
assert resp.json == {'sources': {'local': 'file_dir', 'ia': 'memento', 'rhiz': 'memento', 'live': 'live'}}
assert 'ResErrors' not in resp.headers
def test_live_index(self):
resp = self.testapp.get('/live?mode=index&url=http://httpbin.org/get&output=json')
resp = self.testapp.get('/live/index?url=http://httpbin.org/get&output=json')
resp.charset = 'utf-8'
res = to_json_list(resp.text)
@ -91,7 +117,7 @@ class TestResAgg(object):
def test_live_resource(self):
headers = {'foo': 'bar'}
resp = self.testapp.get('/live?url=http://httpbin.org/get?foo=bar', headers=headers)
resp = self.testapp.get('/live/resource?url=http://httpbin.org/get?foo=bar', headers=headers)
assert resp.headers['WARC-Coll'] == 'live'
assert resp.headers['WARC-Target-URI'] == 'http://httpbin.org/get?foo=bar'
@ -100,9 +126,10 @@ class TestResAgg(object):
assert b'HTTP/1.1 200 OK' in resp.body
assert b'"foo": "bar"' in resp.body
assert 'ResErrors' not in resp.headers
def test_live_post_resource(self):
resp = self.testapp.post('/live?url=http://httpbin.org/post',
resp = self.testapp.post('/live/resource?url=http://httpbin.org/post',
OrderedDict([('foo', 'bar')]))
assert resp.headers['WARC-Coll'] == 'live'
@ -112,38 +139,45 @@ class TestResAgg(object):
assert b'HTTP/1.1 200 OK' in resp.body
assert b'"foo": "bar"' in resp.body
assert 'ResErrors' not in resp.headers
def test_agg_select_mem_1(self):
resp = self.testapp.get('/many?url=http://vvork.com/&closest=20141001')
resp = self.testapp.get('/many/resource?url=http://vvork.com/&closest=20141001')
assert resp.headers['WARC-Coll'] == 'rhiz'
assert resp.headers['WARC-Target-URI'] == 'http://www.vvork.com/'
assert resp.headers['WARC-Date'] == '2014-10-06T18:43:57Z'
assert b'HTTP/1.1 200 OK' in resp.body
assert 'ResErrors' not in resp.headers
def test_agg_select_mem_2(self):
resp = self.testapp.get('/many?url=http://vvork.com/&closest=20151231')
resp = self.testapp.get('/many/resource?url=http://vvork.com/&closest=20151231')
assert resp.headers['WARC-Coll'] == 'ia'
assert resp.headers['WARC-Target-URI'] == 'http://vvork.com/'
assert resp.headers['WARC-Date'] == '2016-01-10T13:48:55Z'
assert b'HTTP/1.1 200 OK' in resp.body
assert 'ResErrors' not in resp.headers
def test_agg_select_live(self):
resp = self.testapp.get('/many?url=http://vvork.com/&closest=2016')
resp = self.testapp.get('/many/resource?url=http://vvork.com/&closest=2016')
assert resp.headers['WARC-Coll'] == 'live'
assert resp.headers['WARC-Target-URI'] == 'http://vvork.com/'
assert resp.headers['WARC-Date'] != ''
assert 'ResErrors' not in resp.headers
def test_agg_select_local(self):
resp = self.testapp.get('/many?url=http://iana.org/&closest=20140126200624')
resp = self.testapp.get('/many/resource?url=http://iana.org/&closest=20140126200624')
assert resp.headers['WARC-Coll'] == 'local'
assert resp.headers['WARC-Target-URI'] == 'http://www.iana.org/'
assert resp.headers['WARC-Date'] == '2014-01-26T20:06:24Z'
assert json.loads(resp.headers['ResErrors']) == {"rhiz": "NotFoundException('http://webenact.rhizome.org/vvork/http://iana.org/',)"}
def test_agg_select_local_postreq(self):
req_data = """\
@ -153,12 +187,13 @@ User-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (
Host: iana.org
"""
resp = self.testapp.post('/many/postreq?url=http://iana.org/&closest=20140126200624', req_data)
resp = self.testapp.post('/many/resource/postreq?url=http://iana.org/&closest=20140126200624', req_data)
assert resp.headers['WARC-Coll'] == 'local'
assert resp.headers['WARC-Target-URI'] == 'http://www.iana.org/'
assert resp.headers['WARC-Date'] == '2014-01-26T20:06:24Z'
assert json.loads(resp.headers['ResErrors']) == {"rhiz": "NotFoundException('http://webenact.rhizome.org/vvork/http://iana.org/',)"}
def test_agg_live_postreq(self):
req_data = """\
@ -168,7 +203,7 @@ User-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (
Host: httpbin.org
"""
resp = self.testapp.post('/many/postreq?url=http://httpbin.org/get?foo=bar&closest=now', req_data)
resp = self.testapp.post('/many/resource/postreq?url=http://httpbin.org/get?foo=bar&closest=now', req_data)
assert resp.headers['WARC-Coll'] == 'live'
assert resp.headers['WARC-Target-URI'] == 'http://httpbin.org/get?foo=bar'
@ -177,6 +212,8 @@ Host: httpbin.org
assert b'HTTP/1.1 200 OK' in resp.body
assert b'"foo": "bar"' in resp.body
assert json.loads(resp.headers['ResErrors']) == {"rhiz": "NotFoundException('http://webenact.rhizome.org/vvork/http://httpbin.org/get?foo=bar',)"}
def test_agg_post_resolve_postreq(self):
req_data = """\
POST /post HTTP/1.1
@ -188,7 +225,7 @@ content-type: application/x-www-form-urlencoded
foo=bar&test=abc"""
resp = self.testapp.post('/posttest/postreq?url=http://httpbin.org/post', req_data)
resp = self.testapp.post('/posttest/resource/postreq?url=http://httpbin.org/post', req_data)
assert resp.headers['WARC-Coll'] == 'post'
assert resp.headers['WARC-Target-URI'] == 'http://httpbin.org/post'
@ -197,10 +234,12 @@ foo=bar&test=abc"""
assert b'"test": "abc"' in resp.body
assert b'"url": "http://httpbin.org/post"' in resp.body
assert 'ResErrors' not in resp.headers
def test_agg_post_resolve_fallback(self):
req_data = OrderedDict([('foo', 'bar'), ('test', 'abc')])
resp = self.testapp.post('/fallback?url=http://httpbin.org/post', req_data)
resp = self.testapp.post('/fallback/resource?url=http://httpbin.org/post', req_data)
assert resp.headers['WARC-Coll'] == 'post'
assert resp.headers['WARC-Target-URI'] == 'http://httpbin.org/post'
@ -209,28 +248,37 @@ foo=bar&test=abc"""
assert b'"test": "abc"' in resp.body
assert b'"url": "http://httpbin.org/post"' in resp.body
assert 'ResErrors' not in resp.headers
def test_agg_seq_fallback_1(self):
resp = self.testapp.get('/fallback?url=http://www.iana.org/')
resp = self.testapp.get('/fallback/resource?url=http://www.iana.org/')
assert resp.headers['WARC-Coll'] == 'live'
assert resp.headers['WARC-Target-URI'] == 'http://www.iana.org/'
assert b'HTTP/1.1 200 OK' in resp.body
assert 'ResErrors' not in resp.headers
def test_agg_seq_fallback_2(self):
resp = self.testapp.get('/fallback?url=http://www.example.com/')
resp = self.testapp.get('/fallback/resource?url=http://www.example.com/')
assert resp.headers['WARC-Coll'] == 'example'
assert resp.headers['WARC-Date'] == '2016-02-25T04:23:29Z'
assert resp.headers['WARC-Target-URI'] == 'http://example.com/'
assert b'HTTP/1.1 200 OK' in resp.body
def test_error_fallback_live_not_found(self):
resp = self.testapp.get('/fallback?url=http://invalid.url-not-found', status=400)
assert 'ResErrors' not in resp.headers
assert resp.json == {'message': 'http://invalid.url-not-found'}
def test_error_fallback_live_not_found(self):
resp = self.testapp.get('/fallback/resource?url=http://invalid.url-not-found', status=400)
assert resp.json == {'message': 'http://invalid.url-not-found',
'errors': {'LiveWebLoader': "LiveResourceException('http://invalid.url-not-found',)"}}
assert resp.text == resp.headers['ResErrors']
def test_agg_local_revisit(self):
resp = self.testapp.get('/many?url=http://www.example.com/&closest=20140127171251&sources=local')
resp = self.testapp.get('/many/resource?url=http://www.example.com/&closest=20140127171251&sources=local')
assert resp.headers['WARC-Coll'] == 'local'
assert resp.headers['WARC-Target-URI'] == 'http://example.com'
@ -240,23 +288,30 @@ foo=bar&test=abc"""
assert b'HTTP/1.1 200 OK' in resp.body
assert b'<!doctype html>' in resp.body
assert 'ResErrors' not in resp.headers
def test_error_invalid_index_output(self):
resp = self.testapp.get('/live?mode=index&url=http://httpbin.org/get&output=foobar', status=400)
resp = self.testapp.get('/live/index?url=http://httpbin.org/get&output=foobar', status=400)
assert resp.json == {'message': 'output=foobar not supported'}
assert resp.text == resp.headers['ResErrors']
def test_error_local_not_found(self):
resp = self.testapp.get('/many?url=http://not-found.error/&sources=local', status=404)
resp = self.testapp.get('/many/resource?url=http://not-found.error/&sources=local', status=404)
assert resp.json == {'message': 'No Resource Found'}
assert resp.text == resp.headers['ResErrors']
def test_error_empty(self):
resp = self.testapp.get('/empty?url=http://example.com/', status=404)
resp = self.testapp.get('/empty/resource?url=http://example.com/', status=404)
assert resp.json == {'message': 'No Resource Found'}
assert resp.text == resp.headers['ResErrors']
def test_error_invalid(self):
resp = self.testapp.get('/invalid?url=http://example.com/', status=500)
resp = self.testapp.get('/invalid/resource?url=http://example.com/', status=500)
assert resp.json == {'message': "Internal Error: 'list' object is not callable"}
assert resp.text == resp.headers['ResErrors']
assert resp.json['message'].startswith('Internal Error')

View File

@ -55,7 +55,7 @@ def query_single_source(source, params):
@pytest.mark.parametrize("source", local_sources, ids=["file", "redis"])
def test_local_cdxj_loader(source):
url = 'http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf'
res = query_single_source(source, dict(url=url, limit=3))
res, errs = query_single_source(source, dict(url=url, limit=3))
expected = """\
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 iana.warc.gz
@ -63,6 +63,7 @@ org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200912 iana.warc.gz
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200930 iana.warc.gz"""
assert(key_ts_res(res) == expected)
assert(errs == {})
# Closest -- Local Loaders
@ -70,7 +71,7 @@ org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200930 iana.warc.gz"""
@pytest.mark.parametrize("source", local_sources, ids=["file", "redis"])
def test_local_closest_loader(source):
url = 'http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf'
res = query_single_source(source, dict(url=url,
res, errs = query_single_source(source, dict(url=url,
closest='20140126200930',
limit=3))
@ -80,13 +81,14 @@ org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200912 iana.warc.gz
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 iana.warc.gz"""
assert(key_ts_res(res) == expected)
assert(errs == {})
# Prefix -- Local Loaders
# ============================================================================
@pytest.mark.parametrize("source", local_sources, ids=["file", "redis"])
def test_file_prefix_loader(source):
res = query_single_source(source, dict(url='http://iana.org/domains/root/*'))
res, errs = query_single_source(source, dict(url='http://iana.org/domains/root/*'))
expected = """\
org,iana)/domains/root/db 20140126200927 iana.warc.gz
@ -94,6 +96,7 @@ org,iana)/domains/root/db 20140126200928 iana.warc.gz
org,iana)/domains/root/servers 20140126201227 iana.warc.gz"""
assert(key_ts_res(res) == expected)
assert(errs == {})
# Url Match -- Remote Loaders
@ -101,7 +104,7 @@ org,iana)/domains/root/servers 20140126201227 iana.warc.gz"""
@pytest.mark.parametrize("source", remote_sources, ids=["remote_cdx", "memento"])
def test_remote_loader(source):
url = 'http://instagram.com/amaliaulman'
res = query_single_source(source, dict(url=url))
res, errs = query_single_source(source, dict(url=url))
expected = """\
com,instagram)/amaliaulman 20141014150552 http://webenact.rhizome.org/all/20141014150552id_/http://instagram.com/amaliaulman
@ -110,6 +113,7 @@ com,instagram)/amaliaulman 20141014162333 http://webenact.rhizome.org/all/201410
com,instagram)/amaliaulman 20141014171636 http://webenact.rhizome.org/all/20141014171636id_/http://instagram.com/amaliaulman"""
assert(key_ts_res(res, 'load_url') == expected)
assert(errs == {})
# Url Match -- Remote Loaders
@ -117,12 +121,13 @@ com,instagram)/amaliaulman 20141014171636 http://webenact.rhizome.org/all/201410
@pytest.mark.parametrize("source", remote_sources, ids=["remote_cdx", "memento"])
def test_remote_closest_loader(source):
url = 'http://instagram.com/amaliaulman'
res = query_single_source(source, dict(url=url, closest='20141014162332', limit=1))
res, errs = query_single_source(source, dict(url=url, closest='20141014162332', limit=1))
expected = """\
com,instagram)/amaliaulman 20141014162333 http://webenact.rhizome.org/all/20141014162333id_/http://instagram.com/amaliaulman"""
assert(key_ts_res(res, 'load_url') == expected)
assert(errs == {})
# Url Match -- Memento
@ -130,25 +135,26 @@ com,instagram)/amaliaulman 20141014162333 http://webenact.rhizome.org/all/201410
@pytest.mark.parametrize("source", remote_sources, ids=["remote_cdx", "memento"])
def test_remote_closest_loader(source):
url = 'http://instagram.com/amaliaulman'
res = query_single_source(source, dict(url=url, closest='20141014162332', limit=1))
res, errs = query_single_source(source, dict(url=url, closest='20141014162332', limit=1))
expected = """\
com,instagram)/amaliaulman 20141014162333 http://webenact.rhizome.org/all/20141014162333id_/http://instagram.com/amaliaulman"""
assert(key_ts_res(res, 'load_url') == expected)
assert(errs == {})
# Live Index -- No Load!
# ============================================================================
def test_live():
url = 'http://example.com/'
source = LiveIndexSource()
res = query_single_source(source, dict(url=url))
res, errs = query_single_source(source, dict(url=url))
expected = 'com,example)/ {0} http://example.com/'.format(timestamp_now())
assert(key_ts_res(res, 'load_url') == expected)
assert(errs == {})
# Errors -- Not Found All
@ -156,31 +162,36 @@ def test_live():
@pytest.mark.parametrize("source", local_sources + remote_sources, ids=["file", "redis", "remote_cdx", "memento"])
def test_all_not_found(source):
url = 'http://x-not-found-x.notfound/'
res = query_single_source(source, dict(url=url, limit=3))
res, errs = query_single_source(source, dict(url=url, limit=3))
expected = ''
assert(key_ts_res(res) == expected)
if source == remote_sources[0]:
assert('http://x-not-found-x.notfound/' in errs['source'])
else:
assert(errs == {})
# ============================================================================
def test_another_remote_not_found():
source = MementoIndexSource.from_timegate_url('http://www.webarchive.org.uk/wayback/archive/')
url = 'http://x-not-found-x.notfound/'
res = query_single_source(source, dict(url=url, limit=3))
res, errs = query_single_source(source, dict(url=url, limit=3))
expected = ''
assert(key_ts_res(res) == expected)
assert(errs['source'] == "NotFoundException('http://www.webarchive.org.uk/wayback/archive/timemap/link/http://x-not-found-x.notfound/',)")
# ============================================================================
def test_file_not_found():
source = FileIndexSource('testdata/not-found-x')
url = 'http://x-not-found-x.notfound/'
res = query_single_source(source, dict(url=url, limit=3))
res, errs = query_single_source(source, dict(url=url, limit=3))
expected = ''
assert(key_ts_res(res) == expected)
assert(errs['source'] == "NotFoundException('testdata/not-found-x',)"), errs
# ============================================================================
@ -188,7 +199,7 @@ def test_ait_filters():
ait_source = RemoteIndexSource('http://wayback.archive-it.org/cdx/search/cdx?url={url}&filter=filename:ARCHIVEIT-({colls})-.*',
'http://wayback.archive-it.org/all/{timestamp}id_/{url}')
cdxlist = query_single_source(ait_source, {'url': 'http://iana.org/', 'param.source.colls': '5610|933'})
cdxlist, errs = query_single_source(ait_source, {'url': 'http://iana.org/', 'param.source.colls': '5610|933'})
filenames = [cdx['filename'] for cdx in cdxlist]
prefix = ('ARCHIVEIT-5610-', 'ARCHIVEIT-933-')
@ -196,7 +207,7 @@ def test_ait_filters():
assert(all([x.startswith(prefix) for x in filenames]))
cdxlist = query_single_source(ait_source, {'url': 'http://iana.org/', 'param.source.colls': '1883|366|905'})
cdxlist, errs = query_single_source(ait_source, {'url': 'http://iana.org/', 'param.source.colls': '1883|366|905'})
filenames = [cdx['filename'] for cdx in cdxlist]
prefix = ('ARCHIVEIT-1883-', 'ARCHIVEIT-366-', 'ARCHIVEIT-905-')

View File

@ -1,13 +1,14 @@
from gevent import monkey; monkey.patch_all(thread=False)
from rezag.aggindexsource import SimpleAggregator, GeventTimeoutAggregator
from rezag.aggindexsource import ThreadedTimeoutAggregator
from rezag.aggindexsource import ThreadedTimeoutAggregator, BaseAggregator
from rezag.indexsource import FileIndexSource, RemoteIndexSource, MementoIndexSource
from .testutils import json_list, to_path
import json
import pytest
import time
from rezag.handlers import IndexHandler
@ -27,6 +28,10 @@ aggs = {'simple': SimpleAggregator(sources),
'processes': ThreadedTimeoutAggregator(sources, timeout=5.0, use_processes=True),
}
agg_tm = {'gevent': GeventTimeoutAggregator(sources, timeout=0.0),
'threaded': ThreadedTimeoutAggregator(sources, timeout=0.0),
'processes': ThreadedTimeoutAggregator(sources, timeout=0.0, use_processes=True)}
nf = {'notfound': FileIndexSource(to_path('testdata/not-found-x'))}
agg_nf = {'simple': SimpleAggregator(nf),
'gevent': GeventTimeoutAggregator(nf, timeout=5.0),
@ -41,7 +46,7 @@ agg_nf = {'simple': SimpleAggregator(nf),
@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
def test_mem_agg_index_1(agg):
url = 'http://iana.org/'
res = agg(dict(url=url, closest='20140126000000', limit=5))
res, errs = agg(dict(url=url, closest='20140126000000', limit=5))
exp = [{"timestamp": "20140126093743", "load_url": "http://web.archive.org/web/20140126093743id_/http://iana.org/", "source": "ia"},
@ -52,12 +57,13 @@ def test_mem_agg_index_1(agg):
]
assert(json_list(res) == exp)
assert(errs == {'bl': "NotFoundException('http://www.webarchive.org.uk/wayback/archive/http://iana.org/',)",
'rhiz': "NotFoundException('http://webenact.rhizome.org/vvork/http://iana.org/',)"})
@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
def test_mem_agg_index_2(agg):
url = 'http://example.com/'
res = agg(dict(url=url, closest='20100512', limit=6))
res, errs = agg(dict(url=url, closest='20100512', limit=6))
exp = [{"timestamp": "20100513010014", "load_url": "http://www.webarchive.org.uk/wayback/archive/20100513010014id_/http://example.com/", "source": "bl"},
{"timestamp": "20100512204410", "load_url": "http://www.webarchive.org.uk/wayback/archive/20100512204410id_/http://example.com/", "source": "bl"},
@ -67,12 +73,13 @@ def test_mem_agg_index_2(agg):
{"timestamp": "20100514231857", "load_url": "http://web.archive.org/web/20100514231857id_/http://example.com/", "source": "ia"}]
assert(json_list(res) == exp)
assert(errs == {'rhiz': "NotFoundException('http://webenact.rhizome.org/vvork/http://example.com/',)"})
@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
def test_mem_agg_index_3(agg):
url = 'http://vvork.com/'
res = agg(dict(url=url, closest='20141001', limit=5))
res, errs = agg(dict(url=url, closest='20141001', limit=5))
exp = [{"timestamp": "20141006184357", "load_url": "http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"},
{"timestamp": "20141018133107", "load_url": "http://web.archive.org/web/20141018133107id_/http://vvork.com/", "source": "ia"},
@ -81,32 +88,53 @@ def test_mem_agg_index_3(agg):
{"timestamp": "20131004231540", "load_url": "http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/", "source": "ait"}]
assert(json_list(res) == exp)
assert(errs == {})
@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
def test_mem_agg_index_4(agg):
url = 'http://vvork.com/'
res = agg(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait'))
res, errs = agg(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait'))
exp = [{"timestamp": "20141006184357", "load_url": "http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"},
{"timestamp": "20131004231540", "load_url": "http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/", "source": "ait"}]
assert(json_list(res) == exp)
assert(errs == {})
@pytest.mark.parametrize("agg", list(agg_nf.values()), ids=list(agg_nf.keys()))
def test_mem_agg_not_found(agg):
url = 'http://vvork.com/'
res = agg(dict(url=url, closest='20141001', limit=2))
res, errs = agg(dict(url=url, closest='20141001', limit=2))
assert(json_list(res) == [])
assert(errs == {'notfound': "NotFoundException('testdata/not-found-x',)"})
@pytest.mark.parametrize("agg", list(agg_tm.values()), ids=list(agg_tm.keys()))
def test_mem_agg_timeout(agg):
url = 'http://vvork.com/'
orig_source = BaseAggregator.load_child_source
def load_child_source(self, name, source, params):
time.sleep(0.1)
return orig_source(name, source, params)
BaseAggregator.load_child_source = load_child_source
res, errs = agg(dict(url=url, closest='20141001', limit=2))
BaseAggregator.load_child_source = orig_source
assert(json_list(res) == [])
assert(errs == {'local': 'timeout',
'ait': 'timeout', 'bl': 'timeout', 'ia': 'timeout', 'rhiz': 'timeout'})
def test_handler_output_cdxj():
agg = GeventTimeoutAggregator(sources, timeout=5.0)
handler = IndexHandler(agg)
url = 'http://vvork.com/'
res = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait'))
res, errs = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait'))
exp = """\
com,vvork)/ 20141006184357 {"url": "http://www.vvork.com/", "mem_rel": "memento", "memento_url": "http://webenact.rhizome.org/vvork/20141006184357/http://www.vvork.com/", "load_url": "http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"}
@ -114,13 +142,14 @@ com,vvork)/ 20131004231540 {"url": "http://vvork.com/", "mem_rel": "last memento
"""
assert(''.join(res) == exp)
assert(errs == {})
def test_handler_output_json():
agg = GeventTimeoutAggregator(sources, timeout=5.0)
handler = IndexHandler(agg)
url = 'http://vvork.com/'
res = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='json'))
res, errs = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='json'))
exp = """\
{"urlkey": "com,vvork)/", "timestamp": "20141006184357", "url": "http://www.vvork.com/", "mem_rel": "memento", "memento_url": "http://webenact.rhizome.org/vvork/20141006184357/http://www.vvork.com/", "load_url": "http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"}
@ -128,26 +157,27 @@ def test_handler_output_json():
"""
assert(''.join(res) == exp)
assert(errs == {})
def test_handler_output_link():
agg = GeventTimeoutAggregator(sources, timeout=5.0)
handler = IndexHandler(agg)
url = 'http://vvork.com/'
res = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='link'))
res, errs = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='link'))
exp = """\
<http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/>; rel="memento"; datetime="Mon, 06 Oct 2014 18:43:57 GMT"; src="rhiz",
<http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/>; rel="memento"; datetime="Fri, 04 Oct 2013 23:15:40 GMT"; src="ait"
"""
assert(''.join(res) == exp)
assert(errs == {})
def test_handler_output_link_2():
agg = GeventTimeoutAggregator(sources, timeout=5.0)
handler = IndexHandler(agg)
url = 'http://iana.org/'
res = handler(dict(url=url, closest='20140126000000', limit=5, output='link'))
res, errs = handler(dict(url=url, closest='20140126000000', limit=5, output='link'))
exp = """\
<http://web.archive.org/web/20140126093743id_/http://iana.org/>; rel="memento"; datetime="Sun, 26 Jan 2014 09:37:43 GMT"; src="ia",
@ -158,38 +188,54 @@ def test_handler_output_link_2():
"""
assert(''.join(res) == exp)
exp_errs = {'bl': "NotFoundException('http://www.webarchive.org.uk/wayback/archive/http://iana.org/',)",
'rhiz': "NotFoundException('http://webenact.rhizome.org/vvork/http://iana.org/',)"}
assert(errs == exp_errs)
def test_handler_output_link_3():
agg = GeventTimeoutAggregator(sources, timeout=5.0)
handler = IndexHandler(agg)
url = 'http://foo.bar.non-existent'
res = handler(dict(url=url, closest='20140126000000', limit=5, output='link'))
res, errs = handler(dict(url=url, closest='20140126000000', limit=5, output='link'))
exp = ''
assert(''.join(res) == exp)
exp_errs = {'ait': "NotFoundException('http://wayback.archive-it.org/all/http://foo.bar.non-existent',)",
'bl': "NotFoundException('http://www.webarchive.org.uk/wayback/archive/http://foo.bar.non-existent',)",
'ia': "NotFoundException('http://web.archive.org/web/http://foo.bar.non-existent',)",
'rhiz': "NotFoundException('http://webenact.rhizome.org/vvork/http://foo.bar.non-existent',)"}
assert(errs == exp_errs)
def test_handler_output_text():
agg = GeventTimeoutAggregator(sources, timeout=5.0)
handler = IndexHandler(agg)
url = 'http://vvork.com/'
res = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='text'))
res, errs = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='text'))
exp = """\
com,vvork)/ 20141006184357 http://www.vvork.com/ memento http://webenact.rhizome.org/vvork/20141006184357/http://www.vvork.com/ http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/ rhiz
com,vvork)/ 20131004231540 http://vvork.com/ last memento http://wayback.archive-it.org/all/20131004231540/http://vvork.com/ http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/ ait
"""
assert(''.join(res) == exp)
assert(errs == {})
def test_handler_list_sources():
agg = GeventTimeoutAggregator(sources, timeout=5.0)
handler = IndexHandler(agg)
res = handler(dict(mode='list_sources'))
res, errs = handler(dict(mode='list_sources'))
assert(res == {'sources': {'bl': 'memento',
'ait': 'memento',
'ia': 'memento',
'rhiz': 'memento',
'local': 'file'}})
assert(errs == {})

View File

@ -35,7 +35,7 @@ def setup_module():
def test_timeout_long_all_pass():
agg = TimeoutAggregator(sources, timeout=1.0)
res = agg(dict(url='http://example.com/'))
res, errs = agg(dict(url='http://example.com/'))
exp = [{'source': 'slower', 'timestamp': '20140127171200'},
{'source': 'slower', 'timestamp': '20140127171251'},
@ -43,27 +43,31 @@ def test_timeout_long_all_pass():
assert(json_list(res, fields=['source', 'timestamp']) == exp)
assert(errs == {})
def test_timeout_slower_skipped_1():
agg = GeventTimeoutAggregator(sources, timeout=0.49)
res = agg(dict(url='http://example.com/'))
res, errs = agg(dict(url='http://example.com/'))
exp = [{'source': 'slow', 'timestamp': '20160225042329'}]
assert(json_list(res, fields=['source', 'timestamp']) == exp)
assert(errs == {'slower': 'timeout'})
def test_timeout_slower_skipped_2():
agg = GeventTimeoutAggregator(sources, timeout=0.19)
res = agg(dict(url='http://example.com/'))
res, errs = agg(dict(url='http://example.com/'))
exp = []
assert(json_list(res, fields=['source', 'timestamp']) == exp)
assert(errs == {'slower': 'timeout', 'slow': 'timeout'})
def test_timeout_skipping():
@ -75,31 +79,40 @@ def test_timeout_skipping():
exp = [{'source': 'slow', 'timestamp': '20160225042329'}]
res = agg(dict(url='http://example.com/'))
res, errs = agg(dict(url='http://example.com/'))
assert(json_list(res, fields=['source', 'timestamp']) == exp)
assert(sources['slow'].calls == 4)
assert(sources['slower'].calls == 4)
res = agg(dict(url='http://example.com/'))
assert(errs == {'slower': 'timeout'})
res, errs = agg(dict(url='http://example.com/'))
assert(json_list(res, fields=['source', 'timestamp']) == exp)
assert(sources['slow'].calls == 5)
assert(sources['slower'].calls == 5)
res = agg(dict(url='http://example.com/'))
assert(errs == {'slower': 'timeout'})
res, errs = agg(dict(url='http://example.com/'))
assert(json_list(res, fields=['source', 'timestamp']) == exp)
assert(sources['slow'].calls == 6)
assert(sources['slower'].calls == 5)
res = agg(dict(url='http://example.com/'))
assert(errs == {})
res, errs = agg(dict(url='http://example.com/'))
assert(json_list(res, fields=['source', 'timestamp']) == exp)
assert(sources['slow'].calls == 7)
assert(sources['slower'].calls == 5)
assert(errs == {})
time.sleep(2.01)
res = agg(dict(url='http://example.com/'))
res, errs = agg(dict(url='http://example.com/'))
assert(json_list(res, fields=['source', 'timestamp']) == exp)
assert(sources['slow'].calls == 8)
assert(sources['slower'].calls == 6)
assert(errs == {'slower': 'timeout'})