mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

separate iter_sources from list_sources api

all errors are now returned as a JSON block with the error msg
tests for not-found and invalid errors
Ilya Kreymer 2016-02-29 12:34:06 -08:00
parent 68090d00c1
commit 008e5284b1
12 changed files with 304 additions and 145 deletions
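
For orientation, a minimal sketch of the reworked API as the new tests below exercise it. The '/many' endpoint name and the localhost address are assumptions taken from the test app, not fixed parts of the library:

    # hypothetical client session against a local test deployment
    import requests

    # 'list_sources' is now its own mode, answered by get_source_list()
    r = requests.get('http://localhost:8080/many', params={'mode': 'list_sources'})
    print(r.json())  # e.g. {'sources': {'local': 'file_dir', 'ia': 'memento', ...}}

    # errors now arrive as a JSON block carrying the error msg
    r = requests.get('http://localhost:8080/many')  # missing the required 'url' param
    print(r.status_code, r.json())  # 400 {'message': 'The "url" param is required'}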

View File

@@ -63,7 +63,6 @@ class BaseAggregator(object):
         try:
             _src_params = all_params['_all_src_params'].get(name)
             all_params['_src_params'] = _src_params
-
             cdx_iter = source.load_index(all_params)
         except NotFoundException as nf:
             print('Not found in ' + name)
@@ -89,15 +88,21 @@ class BaseAggregator(object):
         return cdx_iter

-    def _on_source_error(self, name):
+    def _on_source_error(self, name): #pragma: no cover
         pass

     def _load_all(self, params): #pragma: no cover
         raise NotImplemented()

-    def get_sources(self, params): #pragma: no cover
+    def _iter_sources(self, params): #pragma: no cover
         raise NotImplemented()

+    def get_source_list(self, params):
+        srcs = self._iter_sources(params)
+        result = [(name, str(value)) for name, value in srcs]
+        result = {'sources': dict(result)}
+        return result
+

 #=============================================================================
 class BaseSourceListAggregator(BaseAggregator):
@@ -107,7 +112,7 @@ class BaseSourceListAggregator(BaseAggregator):
     def get_all_sources(self, params):
         return self.sources

-    def get_sources(self, params):
+    def _iter_sources(self, params):
         sources = self.get_all_sources(params)
         srcs_list = params.get('sources')
         if not srcs_list:
@@ -125,7 +130,7 @@ class SeqAggMixin(object):

     def _load_all(self, params):
-        sources = list(self.get_sources(params))
+        sources = list(self._iter_sources(params))
         return list([self.load_child_source(name, source, params)
                      for name, source in sources])
@@ -160,8 +165,8 @@ class TimeoutMixin(object):
         return False

-    def get_sources(self, params):
-        sources = super(TimeoutMixin, self).get_sources(params)
+    def _iter_sources(self, params):
+        sources = super(TimeoutMixin, self)._iter_sources(params)
         for name, source in sources:
             if not self.is_timed_out(name):
                 yield name, source
@@ -185,7 +190,7 @@ class GeventMixin(object):
     def _load_all(self, params):
         params['_timeout'] = self.timeout

-        sources = list(self.get_sources(params))
+        sources = list(self._iter_sources(params))

         def do_spawn(name, source):
             return self.pool.spawn(self.load_child_source, name, source, params)
@@ -223,7 +228,7 @@ class ConcurrentMixin(object):
     def _load_all(self, params):
         params['_timeout'] = self.timeout

-        sources = list(self.get_sources(params))
+        sources = list(self._iter_sources(params))

         with self.pool_class(max_workers=self.size) as executor:
             def do_spawn(name, source):
@@ -257,7 +262,8 @@ class BaseDirectoryIndexAggregator(BaseAggregator):
         self.base_prefix = base_prefix
         self.base_dir = base_dir

-    def get_sources(self, params):
+    def _iter_sources(self, params):
+        self._set_src_params(params)
         # see if specific params (when part of another agg)
         src_params = params.get('_src_params')
         if not src_params:
@@ -270,7 +276,6 @@ class BaseDirectoryIndexAggregator(BaseAggregator):
             the_dir = self.base_dir

         the_dir = os.path.join(self.base_prefix, the_dir)
-
         try:
             sources = list(self._load_files(the_dir))
         except Exception:
@@ -290,6 +295,10 @@ class BaseDirectoryIndexAggregator(BaseAggregator):
                 rel_path = ''
             yield rel_path, FileIndexSource(filename)

+    def __str__(self):
+        return 'file_dir'
+

 class DirectoryIndexAggregator(SeqAggMixin, BaseDirectoryIndexAggregator):
     pass
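
Subclasses now implement only the _iter_sources() generator; get_source_list() on the base class turns that iterator into the JSON-friendly {'sources': {...}} mapping. A standalone sketch of the split, with hypothetical stub classes that are not part of the diff:

    class StubSource(object):
        def __str__(self):
            return 'stub'

    class StubAggregator(object):
        def _iter_sources(self, params):
            # subclasses yield (name, source) pairs
            yield 'a', StubSource()
            yield 'b', StubSource()

        def get_source_list(self, params):
            srcs = self._iter_sources(params)
            return {'sources': dict((name, str(value)) for name, value in srcs)}

    print(StubAggregator().get_source_list({}))  # {'sources': {'a': 'stub', 'b': 'stub'}}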

View File

@@ -1,31 +1,50 @@
-from rezag.inputrequest import WSGIInputRequest, POSTInputRequest
-from bottle import route, request, response, default_app
+from rezag.inputrequest import DirectWSGIInputRequest, POSTInputRequest
+from bottle import route, request, response, default_app, abort
+
+from pywb.utils.wbexception import WbException
+
+import traceback
+import json
+
+
+def err_handler(exc):
+    response.status = exc.status_code
+    response.content_type = 'application/json'
+    return json.dumps({'message': exc.body})
+
+
+def wrap_error(func):
+    def do_d(*args, **kwargs):
+        try:
+            return func(*args, **kwargs)
+        except WbException as exc:
+            if application.debug:
+                traceback.print_exc()
+            abort(exc.status(), exc.msg)
+        except Exception as e:
+            if application.debug:
+                traceback.print_exc()
+            abort(500, 'Internal Error: ' + str(e))
+    return do_d
+

 def add_route(path, handler):
-    def debug(func):
-        def do_d():
-            try:
-                return func()
-            except Exception:
-                import traceback
-                traceback.print_exc()
-        return do_d
-
-    def direct_input_request():
+    @wrap_error
+    def direct_input_request(mode=''):
         params = dict(request.query)
-        params['_input_req'] = WSGIInputRequest(request.environ)
+        params['_input_req'] = DirectWSGIInputRequest(request.environ)
         return handler(params)

-    def post_fullrequest():
+    @wrap_error
+    def post_fullrequest(mode=''):
         params = dict(request.query)
         params['_input_req'] = POSTInputRequest(request.environ)
         return handler(params)

-    route(path + '/postreq', method=['POST'], callback=debug(post_fullrequest))
-    route(path, method=['ANY'], callback=debug(direct_input_request))
+    route(path + '/postreq', method=['POST'], callback=post_fullrequest)
+    route(path, method=['ANY'], callback=direct_input_request)


 application = default_app()
+application.default_error_handler = err_handler
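
The effect of the two pieces above: wrap_error converts a WbException into abort() with the exception's own status, and anything else into a 500, while the registered err_handler serializes whatever abort() raised as JSON. A self-contained sketch of that flow in plain Bottle; the route name and message here are illustrative only:

    import json
    from bottle import Bottle, abort, response

    app = Bottle()

    def err_handler(exc):
        # mirror of the handler in this commit: serialize the abort() body
        response.status = exc.status_code
        response.content_type = 'application/json'
        return json.dumps({'message': exc.body})

    # same hook the commit uses on the default app
    app.default_error_handler = err_handler

    @app.route('/thing')
    def thing():
        abort(400, 'The "url" param is required')

    # GET /thing -> HTTP 400 with body {"message": "The \"url\" param is required"}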

View File

@@ -1,12 +1,13 @@
-from rezag.responseloader import WARCPathHandler, LiveWebHandler
+from rezag.responseloader import WARCPathLoader, LiveWebLoader
 from rezag.utils import MementoUtils
-from pywb.warc.recordloader import ArchiveLoadFailed
+from pywb.utils.wbexception import BadRequestException, WbException
 from pywb.utils.wbexception import NotFoundException

 from bottle import response

 #=============================================================================
 def to_cdxj(cdx_iter, fields):
-    response.headers['Content-Type'] = 'text/x-cdxj'
+    response.headers['Content-Type'] = 'application/x-cdxj'
     return [cdx.to_cdxj(fields) for cdx in cdx_iter]

 def to_json(cdx_iter, fields):
@@ -37,26 +38,36 @@ class IndexHandler(object):
         self.index_source = index_source
         self.opts = opts or {}

-    def __call__(self, params):
-        if params.get('mode') == 'sources':
-            srcs = self.index_source.get_sources(params)
-            result = [(name, str(value)) for name, value in srcs]
-            result = {'sources': dict(result)}
-            return result
+    def get_supported_modes(self):
+        return dict(modes=['list_modes', 'list_sources', 'index'])
+
+    def _load_index_source(self, params):
+        url = params.get('url')
+        if not url:
+            raise BadRequestException('The "url" param is required')

         input_req = params.get('_input_req')
         if input_req:
-            params['alt_url'] = input_req.include_post_query(params.get('url'))
+            params['alt_url'] = input_req.include_post_query(url)

-        cdx_iter = self.index_source(params)
+        return self.index_source(params)
+
+    def __call__(self, params):
+        mode = params.get('mode', 'index')
+        if mode == 'list_sources':
+            return self.index_source.get_source_list(params)
+
+        if mode == 'list_modes' or mode != 'index':
+            return self.get_supported_modes()

         output = params.get('output', self.DEF_OUTPUT)
         fields = params.get('fields')

         handler = self.OUTPUTS.get(output)
         if not handler:
-            handler = self.OUTPUTS[self.DEF_OUTPUT]
+            raise BadRequestException('output={0} not supported'.format(output))

+        cdx_iter = self._load_index_source(params)
         res = handler(cdx_iter, fields)
         return res
@@ -67,57 +78,59 @@ class ResourceHandler(IndexHandler):
         super(ResourceHandler, self).__init__(index_source)
         self.resource_loaders = resource_loaders

+    def get_supported_modes(self):
+        res = super(ResourceHandler, self).get_supported_modes()
+        res['modes'].append('resource')
+        return res
+
     def __call__(self, params):
         if params.get('mode', 'resource') != 'resource':
             return super(ResourceHandler, self).__call__(params)

-        input_req = params.get('_input_req')
-        if input_req:
-            params['alt_url'] = input_req.include_post_query(params.get('url'))
-
-        cdx_iter = self.index_source(params)
-
-        any_found = False
+        cdx_iter = self._load_index_source(params)
+        last_exc = None

         for cdx in cdx_iter:
-            any_found = True
             for loader in self.resource_loaders:
                 try:
                     resp = loader(cdx, params)
-                    if resp:
+                    if resp is not None:
                         return resp
-                except ArchiveLoadFailed as e:
-                    print(e)
-                    pass
+                except WbException as e:
+                    last_exc = e

-        if any_found:
-            raise ArchiveLoadFailed('Resource Found, could not be Loaded')
+        if last_exc:
+            raise last_exc
+            #raise ArchiveLoadFailed('Resource Found, could not be Loaded')
         else:
-            raise ArchiveLoadFailed('No Resource Found')
+            raise NotFoundException('No Resource Found')

 #=============================================================================
 class DefaultResourceHandler(ResourceHandler):
     def __init__(self, index_source, warc_paths=''):
-        loaders = [WARCPathHandler(warc_paths, index_source),
-                   LiveWebHandler()
+        loaders = [WARCPathLoader(warc_paths, index_source),
+                   LiveWebLoader()
                   ]
         super(DefaultResourceHandler, self).__init__(index_source, loaders)

 #=============================================================================
 class HandlerSeq(object):
-    def __init__(self, loaders):
-        self.loaders = loaders
+    def __init__(self, handlers):
+        self.handlers = handlers

     def __call__(self, params):
-        for loader in self.loaders:
+        last_exc = None
+        for handler in self.handlers:
             try:
-                res = loader(params)
-                if res:
+                res = handler(params)
+                if res is not None:
                     return res
-            except ArchiveLoadFailed:
-                pass
+            except WbException as e:
+                last_exc = e

-        raise ArchiveLoadFailed('No Resource Found')
+        if last_exc:
+            raise last_exc
+        else:
+            raise NotFoundException('No Resource Found')
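
The reworked ResourceHandler and HandlerSeq share one fallback pattern: try each candidate, remember the last WbException, and re-raise it only if nothing succeeded; otherwise report NotFoundException('No Resource Found'). A tiny standalone illustration, with pywb's exception classes stubbed in:

    class WbException(Exception): pass
    class NotFoundException(WbException): pass

    def call_seq(handlers, params):
        last_exc = None
        for handler in handlers:
            try:
                res = handler(params)
                if res is not None:
                    return res
            except WbException as e:
                last_exc = e
        if last_exc:
            raise last_exc  # most specific failure wins
        # empty seq, or every handler returned None without raising
        raise NotFoundException('No Resource Found')

    def failing(params): raise NotFoundException('not in archive A')
    def succeeding(params): return 'resource from B'

    print(call_seq([failing, succeeding], {}))  # 'resource from B'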

View File

@@ -14,6 +14,9 @@ from rezag.liverec import patched_requests as requests
 from rezag.utils import MementoUtils

+WAYBACK_ORIG_SUFFIX = '{timestamp}id_/{url}'
+
+
 #=============================================================================
 class BaseIndexSource(object):
     def load_index(self, params): #pragma: no cover
@@ -22,10 +25,10 @@ class BaseIndexSource(object):
     @staticmethod
     def res_template(template, params):
         src_params = params.get('_src_params')
-        if src_params:
-            res = template.format(**src_params)
+        if not src_params:
+            res = template.format(url=params['url'])
         else:
-            res = template
+            res = template.format(url=params['url'], **src_params)

         return res
@@ -59,7 +62,7 @@ class RemoteIndexSource(BaseIndexSource):
     def load_index(self, params):
         api_url = self.res_template(self.api_url_template, params)
-        api_url += '?url=' + params['url']
+        print('API URL', api_url)
         r = requests.get(api_url, timeout=params.get('_timeout'))
         if r.status_code >= 400:
             raise NotFoundException(api_url)
@@ -169,7 +172,6 @@ class MementoIndexSource(BaseIndexSource):
     def get_timegate_links(self, params, closest):
         url = self.res_template(self.timegate_url, params)
-        url += params['url']
         accept_dt = timestamp_to_http_date(closest)
         res = requests.head(url, headers={'Accept-Datetime': accept_dt})
         if res.status_code >= 400:
@@ -179,7 +181,6 @@ class MementoIndexSource(BaseIndexSource):
     def get_timemap_links(self, params):
         url = self.res_template(self.timemap_url, params)
-        url += params['url']
         res = requests.get(url, timeout=params.get('_timeout'))
         if res.status_code >= 400:
             raise NotFoundException(url)
@@ -200,9 +201,9 @@ class MementoIndexSource(BaseIndexSource):
     @staticmethod
     def from_timegate_url(timegate_url, path='link'):
-        return MementoIndexSource(timegate_url,
-                                  timegate_url + 'timemap/' + path + '/',
-                                  timegate_url + '{timestamp}id_/{url}')
+        return MementoIndexSource(timegate_url + '{url}',
+                                  timegate_url + 'timemap/' + path + '/{url}',
+                                  timegate_url + WAYBACK_ORIG_SUFFIX)

     def __str__(self):
         return 'memento'
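
Since res_template now substitutes url (and any per-source params) into the template rather than appending it, index API templates carry explicit placeholders. A quick worked example of the same str.format expansion, with values borrowed from the Archive-It test later in this commit:

    template = ('http://wayback.archive-it.org/cdx/search/cdx'
                '?url={url}&filter=filename:ARCHIVEIT-({colls})-.*')
    src_params = {'colls': '5610|933'}

    print(template.format(url='http://iana.org/', **src_params))
    # http://wayback.archive-it.org/cdx/search/cdx?url=http://iana.org/&filter=filename:ARCHIVEIT-(5610|933)-.*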

View File

@@ -1,4 +1,3 @@
-from pywb.utils.loaders import extract_client_cookie
 from pywb.utils.loaders import extract_post_query, append_post_query
 from pywb.utils.loaders import LimitReader
 from pywb.utils.statusandheaders import StatusAndHeadersParser
@@ -9,7 +8,7 @@ from io import BytesIO

 #=============================================================================
-class WSGIInputRequest(object):
+class DirectWSGIInputRequest(object):
     def __init__(self, env):
         self.env = env
@@ -20,26 +19,10 @@ class WSGIInputRequest(object):
         headers = {}

         for name, value in iteritems(self.env):
+            # will be set by requests to match actual host
             if name == 'HTTP_HOST':
-                #name = 'Host'
-                #value = splits.netloc
-                # will be set automatically
                 continue

-            #elif name == 'HTTP_ORIGIN':
-            #    name = 'Origin'
-            #    value = (splits.scheme + '://' + splits.netloc)
-
-            elif name == 'HTTP_X_CSRFTOKEN':
-                name = 'X-CSRFToken'
-                cookie_val = extract_client_cookie(env, 'csrftoken')
-                if cookie_val:
-                    value = cookie_val
-
-            #elif name == 'HTTP_X_FORWARDED_PROTO':
-            #    name = 'X-Forwarded-Proto'
-            #    value = splits.scheme
-
             elif name.startswith('HTTP_'):
                 name = name[5:].title().replace('_', '-')
@@ -55,10 +38,7 @@ class WSGIInputRequest(object):
         return headers

     def get_req_body(self):
-        input_ = self.env.get('wsgi.input')
-        if not input_:
-            return None
+        input_ = self.env['wsgi.input']

         len_ = self._get_content_length()
         enc = self._get_header('Transfer-Encoding')
@@ -70,9 +50,6 @@ class WSGIInputRequest(object):
             data = None

         return data
-        #buf = data.read().decode('utf-8')
-        #print(buf)
-        #return StringIO(buf)

     def _get_content_type(self):
         return self.env.get('CONTENT_TYPE')
@@ -105,7 +82,7 @@ class WSGIInputRequest(object):

 #=============================================================================
-class POSTInputRequest(WSGIInputRequest):
+class POSTInputRequest(DirectWSGIInputRequest):
     def __init__(self, env):
         self.env = env

View File

@@ -2,6 +2,7 @@ from rezag.liverec import BaseRecorder
 from rezag.liverec import request as remote_request

 from pywb.utils.timeutils import timestamp_to_datetime, datetime_to_http_date
+from pywb.utils.wbexception import LiveResourceException

 from pywb.warc.resolvingloader import ResolvingLoader

 from io import BytesIO
@@ -29,7 +30,7 @@ def incr_reader(stream, header=None, size=8192):

 #=============================================================================
-class WARCPathHandler(object):
+class WARCPathLoader(object):
     def __init__(self, paths, cdx_source):
         self.paths = paths
         if isinstance(paths, str):
@@ -108,7 +109,7 @@ class HeaderRecorder(BaseRecorder):

 #=============================================================================
-class LiveWebHandler(object):
+class LiveWebLoader(object):
     SKIP_HEADERS = (b'link',
                     b'memento-datetime',
                     b'content-location',
@@ -140,14 +141,17 @@ class LiveWebHandler(object):
         method = input_req.get_req_method()
         data = input_req.get_req_body()

-        upstream_res = remote_request(url=load_url,
-                                      method=method,
-                                      recorder=recorder,
-                                      stream=True,
-                                      allow_redirects=False,
-                                      headers=req_headers,
-                                      data=data,
-                                      timeout=params.get('_timeout'))
+        try:
+            upstream_res = remote_request(url=load_url,
+                                          method=method,
+                                          recorder=recorder,
+                                          stream=True,
+                                          allow_redirects=False,
+                                          headers=req_headers,
+                                          data=data,
+                                          timeout=params.get('_timeout'))
+        except Exception:
+            raise LiveResourceException(load_url)

         resp_headers = recorder.get_header()
@@ -175,7 +179,7 @@ class LiveWebHandler(object):
         return dt.strftime('%Y-%m-%dT%H:%M:%SZ')

     @staticmethod
-    def _make_warc_id(id_=None):
+    def _make_warc_id(id_=None): #pragma: no cover
         if not id_:
             id_ = uuid.uuid1()
         return '<urn:uuid:{0}>'.format(id_)

View File

@@ -77,6 +77,7 @@ class MementoUtils(object):
             from_date = timestamp_to_http_date(first_cdx['timestamp'])
         except StopIteration:
             first_cdx = None
+            return

         # first memento link
         yield MementoUtils.make_timemap_memento_link(first_cdx, datetime=from_date)
@@ -91,4 +92,4 @@ class MementoUtils(object):

         # last memento link, if any
         if prev_cdx:
-            yield MementoUtils.make_timemap_memento_link(prev_cdx, end='')
+            yield MementoUtils.make_timemap_memento_link(prev_cdx, end='\n')

View File

@@ -32,8 +32,11 @@ setup(
         'rezag',
     ],
     install_requires=[
-        'pywb',
+        'pywb==1.0b',
     ],
+    dependency_links=[
+        'git+https://github.com/ikreymer/pywb.git@py3#egg=pywb-1.0b-py3',
+    ],
     zip_safe=True,
     entry_points="""
         [console_scripts]

View File

@@ -33,6 +33,9 @@ def setup_module():
     shutil.copy(to_path('testdata/iana.cdxj'), coll_B)
     shutil.copy(to_path('testdata/dupes.cdxj'), coll_C)

+    with open(to_path(root_dir) + 'somefile', 'w') as fh:
+        fh.write('foo')
+
     global dir_loader
     dir_loader = DirectoryIndexAggregator(dir_prefix, dir_path)
@@ -121,7 +124,7 @@ def test_agg_dir_and_memento():
                'local': dir_loader}
     agg_source = SimpleAggregator(sources)

-    res = agg_source({'url': 'example.com/', 'param.coll': '*', 'closest': '20100512', 'limit': 6})
+    res = agg_source({'url': 'example.com/', 'param.local.coll': '*', 'closest': '20100512', 'limit': 6})

     exp = [
         {'source': 'ia', 'timestamp': '20100513052358', 'load_url': 'http://web.archive.org/web/20100513052358id_/http://example.com/'},
@@ -144,7 +147,7 @@ def test_agg_no_dir_1():

 def test_agg_no_dir_2():
-    loader = DirectoryIndexAggregator(root_dir, 'no_such')
+    loader = DirectoryIndexAggregator(root_dir, '')
     res = loader({'url': 'example.com/', 'param.coll': 'X'})

     exp = []
@@ -152,4 +155,31 @@ def test_agg_no_dir_2():
     assert(to_json_list(res) == exp)

+
+def test_agg_dir_sources_1():
+    res = dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'})
+    exp = {'sources': {'colls/A/indexes': 'file',
+                       'colls/B/indexes': 'file',
+                       'colls/C/indexes': 'file'}
+          }
+
+    assert(res == exp)
+
+
+def test_agg_dir_sources_2():
+    res = dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '[A,C]'})
+    exp = {'sources': {'colls/A/indexes': 'file',
+                       'colls/C/indexes': 'file'}
+          }
+
+    assert(res == exp)
+
+
+def test_agg_dir_sources_single_dir():
+    loader = DirectoryIndexAggregator('testdata/', '')
+    res = loader.get_source_list({'url': 'example.com/'})
+    exp = {'sources': {}}
+
+    assert(res == exp)

View File

@@ -42,13 +42,17 @@ def setup_module(self):
     source3 = SimpleAggregator({'example': FileIndexSource(to_path('testdata/example.cdxj'))})
     handler3 = DefaultResourceHandler(source3, to_path('testdata/'))

     add_route('/fallback', HandlerSeq([handler3,
                                        handler2,
                                        live_handler]))

+    add_route('/seq', HandlerSeq([handler3,
+                                  handler2]))
+
+    add_route('/empty', HandlerSeq([]))
+    add_route('/invalid', HandlerSeq(['foo']))

-    bottle.debug = True
+    application.debug = True

     global testapp
     testapp = webtest.TestApp(application)
@@ -61,8 +65,23 @@ class TestResAgg(object):
     def setup(self):
         self.testapp = testapp

+    def test_list_handlers(self):
+        resp = self.testapp.get('/many?mode=list_modes')
+        assert resp.json == {'modes': ['list_modes', 'list_sources', 'index', 'resource']}
+
+        resp = self.testapp.get('/many?mode=other')
+        assert resp.json == {'modes': ['list_modes', 'list_sources', 'index', 'resource']}
+
+        # defaults to resource, must specify url
+        resp = self.testapp.get('/many', status=400)
+        assert resp.json == {'message': 'The "url" param is required'}
+
+    def test_list_sources(self):
+        resp = self.testapp.get('/many?mode=list_sources')
+        assert resp.json == {'sources': {'local': 'file_dir', 'ia': 'memento', 'rhiz': 'memento', 'live': 'live'}}
+
     def test_live_index(self):
-        resp = self.testapp.get('/live?url=http://httpbin.org/get&mode=index&output=json')
+        resp = self.testapp.get('/live?mode=index&url=http://httpbin.org/get&output=json')
         resp.charset = 'utf-8'
         res = to_json_list(resp.text)
@@ -71,7 +90,8 @@ class TestResAgg(object):
                         'load_url': 'http://httpbin.org/get', 'source': 'live', 'timestamp': '2016'}])

     def test_live_resource(self):
-        resp = self.testapp.get('/live?url=http://httpbin.org/get?foo=bar&mode=resource')
+        headers = {'foo': 'bar'}
+        resp = self.testapp.get('/live?url=http://httpbin.org/get?foo=bar', headers=headers)

         assert resp.headers['WARC-Coll'] == 'live'
         assert resp.headers['WARC-Target-URI'] == 'http://httpbin.org/get?foo=bar'
@@ -82,7 +102,7 @@ class TestResAgg(object):

     def test_live_post_resource(self):
-        resp = self.testapp.post('/live?url=http://httpbin.org/post&mode=resource',
+        resp = self.testapp.post('/live?url=http://httpbin.org/post',
                                  OrderedDict([('foo', 'bar')]))

         assert resp.headers['WARC-Coll'] == 'live'
@@ -204,6 +224,11 @@ foo=bar&test=abc"""
         assert resp.headers['WARC-Target-URI'] == 'http://example.com/'
         assert b'HTTP/1.1 200 OK' in resp.body

+    def test_error_fallback_live_not_found(self):
+        resp = self.testapp.get('/fallback?url=http://invalid.url-not-found', status=400)
+        assert resp.json == {'message': 'http://invalid.url-not-found'}
+
     def test_agg_local_revisit(self):
         resp = self.testapp.get('/many?url=http://www.example.com/&closest=20140127171251&sources=local')
@@ -214,3 +239,24 @@ foo=bar&test=abc"""
         assert resp.headers['WARC-Refers-To-Date'] == '2014-01-27T17:12:00Z'
         assert b'HTTP/1.1 200 OK' in resp.body
         assert b'<!doctype html>' in resp.body
+
+    def test_error_invalid_index_output(self):
+        resp = self.testapp.get('/live?mode=index&url=http://httpbin.org/get&output=foobar', status=400)
+        assert resp.json == {'message': 'output=foobar not supported'}
+
+    def test_error_local_not_found(self):
+        resp = self.testapp.get('/many?url=http://not-found.error/&sources=local', status=404)
+        assert resp.json == {'message': 'No Resource Found'}
+
+    def test_error_empty(self):
+        resp = self.testapp.get('/empty?url=http://example.com/', status=404)
+        assert resp.json == {'message': 'No Resource Found'}
+
+    def test_error_invalid(self):
+        resp = self.testapp.get('/invalid?url=http://example.com/', status=500)
+        assert resp.json['message'].startswith('Internal Error')

View File

@@ -32,16 +32,20 @@ local_sources = [

 remote_sources = [
-    RemoteIndexSource('http://webenact.rhizome.org/all-cdx',
+    RemoteIndexSource('http://webenact.rhizome.org/all-cdx?url={url}',
                       'http://webenact.rhizome.org/all/{timestamp}id_/{url}'),

-    MementoIndexSource('http://webenact.rhizome.org/all/',
-                       'http://webenact.rhizome.org/all/timemap/*/',
+    MementoIndexSource('http://webenact.rhizome.org/all/{url}',
+                       'http://webenact.rhizome.org/all/timemap/*/{url}',
                        'http://webenact.rhizome.org/all/{timestamp}id_/{url}')
 ]

+ait_source = RemoteIndexSource('http://wayback.archive-it.org/cdx?url={url}',
+                               'http://wayback.archive-it.org/all/{timestamp}id_/{url}')
+

 def query_single_source(source, params):
+    string = str(source)
     return SimpleAggregator({'source': source})(params)
@@ -182,4 +186,22 @@ def test_file_not_found():

+
+def test_ait_filters():
+    ait_source = RemoteIndexSource('http://wayback.archive-it.org/cdx/search/cdx?url={url}&filter=filename:ARCHIVEIT-({colls})-.*',
+                                   'http://wayback.archive-it.org/all/{timestamp}id_/{url}')
+
+    cdxlist = query_single_source(ait_source, {'url': 'http://iana.org/', 'param.source.colls': '5610|933'})
+    filenames = [cdx['filename'] for cdx in cdxlist]
+
+    prefix = ('ARCHIVEIT-5610-', 'ARCHIVEIT-933-')
+
+    assert(all([x.startswith(prefix) for x in filenames]))
+
+    cdxlist = query_single_source(ait_source, {'url': 'http://iana.org/', 'param.source.colls': '1883|366|905'})
+    filenames = [cdx['filename'] for cdx in cdxlist]
+
+    prefix = ('ARCHIVEIT-1883-', 'ARCHIVEIT-366-', 'ARCHIVEIT-905-')
+
+    assert(all([x.startswith(prefix) for x in filenames]))

View File

@@ -27,10 +27,11 @@ aggs = {'simple': SimpleAggregator(sources),
         'processes': ThreadedTimeoutAggregator(sources, timeout=5.0, use_processes=True),
        }

-#@pytest.mark.parametrize("agg", aggs, ids=["simple", "gevent_timeout"])
-def pytest_generate_tests(metafunc):
-    metafunc.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
+#def pytest_generate_tests(metafunc):
+#    metafunc.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))

+@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
 def test_mem_agg_index_1(agg):
     url = 'http://iana.org/'
     res = agg(dict(url=url, closest='20140126000000', limit=5))
@@ -46,6 +47,7 @@ def test_mem_agg_index_1(agg):
     assert(json_list(res) == exp)

+@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
 def test_mem_agg_index_2(agg):
     url = 'http://example.com/'
     res = agg(dict(url=url, closest='20100512', limit=6))
@@ -60,6 +62,7 @@ def test_mem_agg_index_2(agg):
     assert(json_list(res) == exp)

+@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
 def test_mem_agg_index_3(agg):
     url = 'http://vvork.com/'
     res = agg(dict(url=url, closest='20141001', limit=5))
@@ -73,6 +76,7 @@ def test_mem_agg_index_3(agg):
     assert(json_list(res) == exp)

+@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
 def test_mem_agg_index_4(agg):
     url = 'http://vvork.com/'
     res = agg(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait'))
@@ -83,10 +87,11 @@ def test_mem_agg_index_4(agg):
     assert(json_list(res) == exp)

-def test_handler_output_cdxj(agg):
-    loader = IndexHandler(agg)
+def test_handler_output_cdxj():
+    agg = GeventTimeoutAggregator(sources, timeout=5.0)
+    handler = IndexHandler(agg)
     url = 'http://vvork.com/'
-    res = loader(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait'))
+    res = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait'))

     exp = """\
 com,vvork)/ 20141006184357 {"url": "http://www.vvork.com/", "mem_rel": "memento", "memento_url": "http://webenact.rhizome.org/vvork/20141006184357/http://www.vvork.com/", "load_url": "http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"}
@@ -96,10 +101,11 @@ com,vvork)/ 20131004231540 {"url": "http://vvork.com/", "mem_rel": "last memento
     assert(''.join(res) == exp)

-def test_handler_output_json(agg):
-    loader = IndexHandler(agg)
+def test_handler_output_json():
+    agg = GeventTimeoutAggregator(sources, timeout=5.0)
+    handler = IndexHandler(agg)
     url = 'http://vvork.com/'
-    res = loader(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='json'))
+    res = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='json'))

     exp = """\
 {"urlkey": "com,vvork)/", "timestamp": "20141006184357", "url": "http://www.vvork.com/", "mem_rel": "memento", "memento_url": "http://webenact.rhizome.org/vvork/20141006184357/http://www.vvork.com/", "load_url": "http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"}
@@ -109,22 +115,50 @@ def test_handler_output_json(agg):
     assert(''.join(res) == exp)

-def test_handler_output_link(agg):
-    loader = IndexHandler(agg)
+def test_handler_output_link():
+    agg = GeventTimeoutAggregator(sources, timeout=5.0)
+    handler = IndexHandler(agg)
     url = 'http://vvork.com/'
-    res = loader(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='link'))
+    res = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='link'))

     exp = """\
 <http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/>; rel="memento"; datetime="Mon, 06 Oct 2014 18:43:57 GMT"; src="rhiz",
-<http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/>; rel="memento"; datetime="Fri, 04 Oct 2013 23:15:40 GMT"; src="ait"\
+<http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/>; rel="memento"; datetime="Fri, 04 Oct 2013 23:15:40 GMT"; src="ait"
 """

     assert(''.join(res) == exp)

+def test_handler_output_link_2():
+    agg = GeventTimeoutAggregator(sources, timeout=5.0)
+    handler = IndexHandler(agg)
+    url = 'http://iana.org/'
+    res = handler(dict(url=url, closest='20140126000000', limit=5, output='link'))
+
+    exp = """\
+<http://web.archive.org/web/20140126093743id_/http://iana.org/>; rel="memento"; datetime="Sun, 26 Jan 2014 09:37:43 GMT"; src="ia",
+<filename://iana.warc.gz>; rel="memento"; datetime="Sun, 26 Jan 2014 20:06:24 GMT"; src="local",
+<http://web.archive.org/web/20140123034755id_/http://iana.org/>; rel="memento"; datetime="Thu, 23 Jan 2014 03:47:55 GMT"; src="ia",
+<http://web.archive.org/web/20140129175203id_/http://iana.org/>; rel="memento"; datetime="Wed, 29 Jan 2014 17:52:03 GMT"; src="ia",
+<http://wayback.archive-it.org/all/20140107040552id_/http://iana.org/>; rel="memento"; datetime="Tue, 07 Jan 2014 04:05:52 GMT"; src="ait"
+"""
+    assert(''.join(res) == exp)
+
+
+def test_handler_output_link_3():
+    agg = GeventTimeoutAggregator(sources, timeout=5.0)
+    handler = IndexHandler(agg)
+    url = 'http://foo.bar.non-existent'
+    res = handler(dict(url=url, closest='20140126000000', limit=5, output='link'))
+    exp = ''
+    assert(''.join(res) == exp)
+

-def test_handler_output_text(agg):
-    loader = IndexHandler(agg)
+def test_handler_output_text():
+    agg = GeventTimeoutAggregator(sources, timeout=5.0)
+    handler = IndexHandler(agg)
     url = 'http://vvork.com/'
-    res = loader(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='text'))
+    res = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='text'))

     exp = """\
 com,vvork)/ 20141006184357 http://www.vvork.com/ memento http://webenact.rhizome.org/vvork/20141006184357/http://www.vvork.com/ http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/ rhiz
@@ -133,9 +167,10 @@ com,vvork)/ 20131004231540 http://vvork.com/ last memento http://wayback.archive
     assert(''.join(res) == exp)

-def test_handler_list_sources(agg):
-    loader = IndexHandler(agg)
-    res = loader(dict(mode='sources'))
+def test_handler_list_sources():
+    agg = GeventTimeoutAggregator(sources, timeout=5.0)
+    handler = IndexHandler(agg)
+    res = handler(dict(mode='list_sources'))

     assert(res == {'sources': {'bl': 'memento',
                                'ait': 'memento',
@@ -143,4 +178,3 @@ def test_handler_list_sources(agg):
                                'rhiz': 'memento',
                                'local': 'file'}})