diff --git a/rezag/aggindexsource.py b/rezag/aggindexsource.py index 435d0152..292622c9 100644 --- a/rezag/aggindexsource.py +++ b/rezag/aggindexsource.py @@ -63,7 +63,6 @@ class BaseAggregator(object): try: _src_params = all_params['_all_src_params'].get(name) all_params['_src_params'] = _src_params - cdx_iter = source.load_index(all_params) except NotFoundException as nf: print('Not found in ' + name) @@ -89,15 +88,21 @@ class BaseAggregator(object): return cdx_iter - def _on_source_error(self, name): + def _on_source_error(self, name): #pragma: no cover pass def _load_all(self, params): #pragma: no cover raise NotImplemented() - def get_sources(self, params): #pragma: no cover + def _iter_sources(self, params): #pragma: no cover raise NotImplemented() + def get_source_list(self, params): + srcs = self._iter_sources(params) + result = [(name, str(value)) for name, value in srcs] + result = {'sources': dict(result)} + return result + #============================================================================= class BaseSourceListAggregator(BaseAggregator): @@ -107,7 +112,7 @@ class BaseSourceListAggregator(BaseAggregator): def get_all_sources(self, params): return self.sources - def get_sources(self, params): + def _iter_sources(self, params): sources = self.get_all_sources(params) srcs_list = params.get('sources') if not srcs_list: @@ -125,7 +130,7 @@ class SeqAggMixin(object): def _load_all(self, params): - sources = list(self.get_sources(params)) + sources = list(self._iter_sources(params)) return list([self.load_child_source(name, source, params) for name, source in sources]) @@ -160,8 +165,8 @@ class TimeoutMixin(object): return False - def get_sources(self, params): - sources = super(TimeoutMixin, self).get_sources(params) + def _iter_sources(self, params): + sources = super(TimeoutMixin, self)._iter_sources(params) for name, source in sources: if not self.is_timed_out(name): yield name, source @@ -185,7 +190,7 @@ class GeventMixin(object): def _load_all(self, params): params['_timeout'] = self.timeout - sources = list(self.get_sources(params)) + sources = list(self._iter_sources(params)) def do_spawn(name, source): return self.pool.spawn(self.load_child_source, name, source, params) @@ -223,7 +228,7 @@ class ConcurrentMixin(object): def _load_all(self, params): params['_timeout'] = self.timeout - sources = list(self.get_sources(params)) + sources = list(self._iter_sources(params)) with self.pool_class(max_workers=self.size) as executor: def do_spawn(name, source): @@ -257,7 +262,8 @@ class BaseDirectoryIndexAggregator(BaseAggregator): self.base_prefix = base_prefix self.base_dir = base_dir - def get_sources(self, params): + def _iter_sources(self, params): + self._set_src_params(params) # see if specific params (when part of another agg) src_params = params.get('_src_params') if not src_params: @@ -270,7 +276,6 @@ class BaseDirectoryIndexAggregator(BaseAggregator): the_dir = self.base_dir the_dir = os.path.join(self.base_prefix, the_dir) - try: sources = list(self._load_files(the_dir)) except Exception: @@ -290,6 +295,10 @@ class BaseDirectoryIndexAggregator(BaseAggregator): rel_path = '' yield rel_path, FileIndexSource(filename) + def __str__(self): + return 'file_dir' + + class DirectoryIndexAggregator(SeqAggMixin, BaseDirectoryIndexAggregator): pass diff --git a/rezag/app.py b/rezag/app.py index c25b4ac7..90275d21 100644 --- a/rezag/app.py +++ b/rezag/app.py @@ -1,31 +1,50 @@ -from rezag.inputrequest import WSGIInputRequest, POSTInputRequest -from bottle import route, request, response, default_app +from rezag.inputrequest import DirectWSGIInputRequest, POSTInputRequest +from bottle import route, request, response, default_app, abort + +from pywb.utils.wbexception import WbException + +import traceback +import json + +def err_handler(exc): + response.status = exc.status_code + response.content_type = 'application/json' + return json.dumps({'message': exc.body}) + +def wrap_error(func): + def do_d(*args, **kwargs): + try: + return func(*args, **kwargs) + except WbException as exc: + if application.debug: + traceback.print_exc() + abort(exc.status(), exc.msg) + except Exception as e: + if application.debug: + traceback.print_exc() + abort(500, 'Internal Error: ' + str(e)) + + return do_d def add_route(path, handler): - def debug(func): - def do_d(): - try: - return func() - except Exception: - import traceback - traceback.print_exc() - - return do_d - - def direct_input_request(): + @wrap_error + def direct_input_request(mode=''): params = dict(request.query) - params['_input_req'] = WSGIInputRequest(request.environ) + params['_input_req'] = DirectWSGIInputRequest(request.environ) return handler(params) - def post_fullrequest(): + @wrap_error + def post_fullrequest(mode=''): params = dict(request.query) params['_input_req'] = POSTInputRequest(request.environ) return handler(params) - route(path + '/postreq', method=['POST'], callback=debug(post_fullrequest)) - route(path, method=['ANY'], callback=debug(direct_input_request)) + route(path + '/postreq', method=['POST'], callback=post_fullrequest) + route(path, method=['ANY'], callback=direct_input_request) application = default_app() +application.default_error_handler = err_handler + diff --git a/rezag/handlers.py b/rezag/handlers.py index 1a6e3495..ff19c725 100644 --- a/rezag/handlers.py +++ b/rezag/handlers.py @@ -1,12 +1,13 @@ -from rezag.responseloader import WARCPathHandler, LiveWebHandler +from rezag.responseloader import WARCPathLoader, LiveWebLoader from rezag.utils import MementoUtils -from pywb.warc.recordloader import ArchiveLoadFailed +from pywb.utils.wbexception import BadRequestException, WbException +from pywb.utils.wbexception import NotFoundException from bottle import response #============================================================================= def to_cdxj(cdx_iter, fields): - response.headers['Content-Type'] = 'text/x-cdxj' + response.headers['Content-Type'] = 'application/x-cdxj' return [cdx.to_cdxj(fields) for cdx in cdx_iter] def to_json(cdx_iter, fields): @@ -37,26 +38,36 @@ class IndexHandler(object): self.index_source = index_source self.opts = opts or {} - def __call__(self, params): - if params.get('mode') == 'sources': - srcs = self.index_source.get_sources(params) - result = [(name, str(value)) for name, value in srcs] - result = {'sources': dict(result)} - return result + def get_supported_modes(self): + return dict(modes=['list_modes', 'list_sources', 'index']) + + def _load_index_source(self, params): + url = params.get('url') + if not url: + raise BadRequestException('The "url" param is required') input_req = params.get('_input_req') if input_req: - params['alt_url'] = input_req.include_post_query(params.get('url')) + params['alt_url'] = input_req.include_post_query(url) - cdx_iter = self.index_source(params) + return self.index_source(params) + + def __call__(self, params): + mode = params.get('mode', 'index') + if mode == 'list_sources': + return self.index_source.get_source_list(params) + + if mode == 'list_modes' or mode != 'index': + return self.get_supported_modes() output = params.get('output', self.DEF_OUTPUT) fields = params.get('fields') handler = self.OUTPUTS.get(output) if not handler: - handler = self.OUTPUTS[self.DEF_OUTPUT] + raise BadRequestException('output={0} not supported'.format(output)) + cdx_iter = self._load_index_source(params) res = handler(cdx_iter, fields) return res @@ -67,57 +78,59 @@ class ResourceHandler(IndexHandler): super(ResourceHandler, self).__init__(index_source) self.resource_loaders = resource_loaders + def get_supported_modes(self): + res = super(ResourceHandler, self).get_supported_modes() + res['modes'].append('resource') + return res + def __call__(self, params): if params.get('mode', 'resource') != 'resource': return super(ResourceHandler, self).__call__(params) - input_req = params.get('_input_req') - if input_req: - params['alt_url'] = input_req.include_post_query(params.get('url')) - - cdx_iter = self.index_source(params) - - any_found = False + cdx_iter = self._load_index_source(params) + last_exc = None for cdx in cdx_iter: - any_found = True - for loader in self.resource_loaders: try: resp = loader(cdx, params) - if resp: + if resp is not None: return resp - except ArchiveLoadFailed as e: - print(e) - pass + except WbException as e: + last_exc = e - if any_found: - raise ArchiveLoadFailed('Resource Found, could not be Loaded') + if last_exc: + raise last_exc + #raise ArchiveLoadFailed('Resource Found, could not be Loaded') else: - raise ArchiveLoadFailed('No Resource Found') + raise NotFoundException('No Resource Found') #============================================================================= class DefaultResourceHandler(ResourceHandler): def __init__(self, index_source, warc_paths=''): - loaders = [WARCPathHandler(warc_paths, index_source), - LiveWebHandler() + loaders = [WARCPathLoader(warc_paths, index_source), + LiveWebLoader() ] super(DefaultResourceHandler, self).__init__(index_source, loaders) #============================================================================= class HandlerSeq(object): - def __init__(self, loaders): - self.loaders = loaders + def __init__(self, handlers): + self.handlers = handlers def __call__(self, params): - for loader in self.loaders: + last_exc = None + for handler in self.handlers: try: - res = loader(params) - if res: + res = handler(params) + if res is not None: return res - except ArchiveLoadFailed: - pass + except WbException as e: + last_exc = e - raise ArchiveLoadFailed('No Resource Found') + if last_exc: + raise last_exc + else: + raise NotFoundException('No Resource Found') diff --git a/rezag/indexsource.py b/rezag/indexsource.py index a597e0c4..ed4a26a6 100644 --- a/rezag/indexsource.py +++ b/rezag/indexsource.py @@ -14,6 +14,9 @@ from rezag.liverec import patched_requests as requests from rezag.utils import MementoUtils +WAYBACK_ORIG_SUFFIX = '{timestamp}id_/{url}' + + #============================================================================= class BaseIndexSource(object): def load_index(self, params): #pragma: no cover @@ -22,10 +25,10 @@ class BaseIndexSource(object): @staticmethod def res_template(template, params): src_params = params.get('_src_params') - if src_params: - res = template.format(**src_params) + if not src_params: + res = template.format(url=params['url']) else: - res = template + res = template.format(url=params['url'], **src_params) return res @@ -59,7 +62,7 @@ class RemoteIndexSource(BaseIndexSource): def load_index(self, params): api_url = self.res_template(self.api_url_template, params) - api_url += '?url=' + params['url'] + print('API URL', api_url) r = requests.get(api_url, timeout=params.get('_timeout')) if r.status_code >= 400: raise NotFoundException(api_url) @@ -169,7 +172,6 @@ class MementoIndexSource(BaseIndexSource): def get_timegate_links(self, params, closest): url = self.res_template(self.timegate_url, params) - url += params['url'] accept_dt = timestamp_to_http_date(closest) res = requests.head(url, headers={'Accept-Datetime': accept_dt}) if res.status_code >= 400: @@ -179,7 +181,6 @@ class MementoIndexSource(BaseIndexSource): def get_timemap_links(self, params): url = self.res_template(self.timemap_url, params) - url += params['url'] res = requests.get(url, timeout=params.get('_timeout')) if res.status_code >= 400: raise NotFoundException(url) @@ -200,9 +201,9 @@ class MementoIndexSource(BaseIndexSource): @staticmethod def from_timegate_url(timegate_url, path='link'): - return MementoIndexSource(timegate_url, - timegate_url + 'timemap/' + path + '/', - timegate_url + '{timestamp}id_/{url}') + return MementoIndexSource(timegate_url + '{url}', + timegate_url + 'timemap/' + path + '/{url}', + timegate_url + WAYBACK_ORIG_SUFFIX) def __str__(self): return 'memento' diff --git a/rezag/inputrequest.py b/rezag/inputrequest.py index 17b6ef6b..332716a2 100644 --- a/rezag/inputrequest.py +++ b/rezag/inputrequest.py @@ -1,4 +1,3 @@ -from pywb.utils.loaders import extract_client_cookie from pywb.utils.loaders import extract_post_query, append_post_query from pywb.utils.loaders import LimitReader from pywb.utils.statusandheaders import StatusAndHeadersParser @@ -9,7 +8,7 @@ from io import BytesIO #============================================================================= -class WSGIInputRequest(object): +class DirectWSGIInputRequest(object): def __init__(self, env): self.env = env @@ -20,26 +19,10 @@ class WSGIInputRequest(object): headers = {} for name, value in iteritems(self.env): + # will be set by requests to match actual host if name == 'HTTP_HOST': - #name = 'Host' - #value = splits.netloc - # will be set automatically continue - #elif name == 'HTTP_ORIGIN': - # name = 'Origin' - # value = (splits.scheme + '://' + splits.netloc) - - elif name == 'HTTP_X_CSRFTOKEN': - name = 'X-CSRFToken' - cookie_val = extract_client_cookie(env, 'csrftoken') - if cookie_val: - value = cookie_val - - #elif name == 'HTTP_X_FORWARDED_PROTO': - # name = 'X-Forwarded-Proto' - # value = splits.scheme - elif name.startswith('HTTP_'): name = name[5:].title().replace('_', '-') @@ -55,10 +38,7 @@ class WSGIInputRequest(object): return headers def get_req_body(self): - input_ = self.env.get('wsgi.input') - if not input_: - return None - + input_ = self.env['wsgi.input'] len_ = self._get_content_length() enc = self._get_header('Transfer-Encoding') @@ -70,9 +50,6 @@ class WSGIInputRequest(object): data = None return data - #buf = data.read().decode('utf-8') - #print(buf) - #return StringIO(buf) def _get_content_type(self): return self.env.get('CONTENT_TYPE') @@ -105,7 +82,7 @@ class WSGIInputRequest(object): #============================================================================= -class POSTInputRequest(WSGIInputRequest): +class POSTInputRequest(DirectWSGIInputRequest): def __init__(self, env): self.env = env diff --git a/rezag/responseloader.py b/rezag/responseloader.py index 52bf4760..ee835c80 100644 --- a/rezag/responseloader.py +++ b/rezag/responseloader.py @@ -2,6 +2,7 @@ from rezag.liverec import BaseRecorder from rezag.liverec import request as remote_request from pywb.utils.timeutils import timestamp_to_datetime, datetime_to_http_date +from pywb.utils.wbexception import LiveResourceException from pywb.warc.resolvingloader import ResolvingLoader from io import BytesIO @@ -29,7 +30,7 @@ def incr_reader(stream, header=None, size=8192): #============================================================================= -class WARCPathHandler(object): +class WARCPathLoader(object): def __init__(self, paths, cdx_source): self.paths = paths if isinstance(paths, str): @@ -108,7 +109,7 @@ class HeaderRecorder(BaseRecorder): #============================================================================= -class LiveWebHandler(object): +class LiveWebLoader(object): SKIP_HEADERS = (b'link', b'memento-datetime', b'content-location', @@ -140,14 +141,17 @@ class LiveWebHandler(object): method = input_req.get_req_method() data = input_req.get_req_body() - upstream_res = remote_request(url=load_url, - method=method, - recorder=recorder, - stream=True, - allow_redirects=False, - headers=req_headers, - data=data, - timeout=params.get('_timeout')) + try: + upstream_res = remote_request(url=load_url, + method=method, + recorder=recorder, + stream=True, + allow_redirects=False, + headers=req_headers, + data=data, + timeout=params.get('_timeout')) + except Exception: + raise LiveResourceException(load_url) resp_headers = recorder.get_header() @@ -175,7 +179,7 @@ class LiveWebHandler(object): return dt.strftime('%Y-%m-%dT%H:%M:%SZ') @staticmethod - def _make_warc_id(id_=None): + def _make_warc_id(id_=None): #pragma: no cover if not id_: id_ = uuid.uuid1() return ''.format(id_) diff --git a/rezag/utils.py b/rezag/utils.py index 2e5ae1c6..b10eeef8 100644 --- a/rezag/utils.py +++ b/rezag/utils.py @@ -77,6 +77,7 @@ class MementoUtils(object): from_date = timestamp_to_http_date(first_cdx['timestamp']) except StopIteration: first_cdx = None + return # first memento link yield MementoUtils.make_timemap_memento_link(first_cdx, datetime=from_date) @@ -91,4 +92,4 @@ class MementoUtils(object): # last memento link, if any if prev_cdx: - yield MementoUtils.make_timemap_memento_link(prev_cdx, end='') + yield MementoUtils.make_timemap_memento_link(prev_cdx, end='\n') diff --git a/setup.py b/setup.py index e3ce8061..cdb646ce 100755 --- a/setup.py +++ b/setup.py @@ -32,8 +32,11 @@ setup( 'rezag', ], install_requires=[ - 'pywb', + 'pywb==1.0b', ], + dependency_links=[ + 'git+https://github.com/ikreymer/pywb.git@py3#egg=pywb-1.0b-py3', + ], zip_safe=True, entry_points=""" [console_scripts] diff --git a/test/test_dir_agg.py b/test/test_dir_agg.py index 3a9c916f..42f6387f 100644 --- a/test/test_dir_agg.py +++ b/test/test_dir_agg.py @@ -33,6 +33,9 @@ def setup_module(): shutil.copy(to_path('testdata/iana.cdxj'), coll_B) shutil.copy(to_path('testdata/dupes.cdxj'), coll_C) + with open(to_path(root_dir) + 'somefile', 'w') as fh: + fh.write('foo') + global dir_loader dir_loader = DirectoryIndexAggregator(dir_prefix, dir_path) @@ -121,7 +124,7 @@ def test_agg_dir_and_memento(): 'local': dir_loader} agg_source = SimpleAggregator(sources) - res = agg_source({'url': 'example.com/', 'param.coll': '*', 'closest': '20100512', 'limit': 6}) + res = agg_source({'url': 'example.com/', 'param.local.coll': '*', 'closest': '20100512', 'limit': 6}) exp = [ {'source': 'ia', 'timestamp': '20100513052358', 'load_url': 'http://web.archive.org/web/20100513052358id_/http://example.com/'}, @@ -144,7 +147,7 @@ def test_agg_no_dir_1(): def test_agg_no_dir_2(): - loader = DirectoryIndexAggregator(root_dir, 'no_such') + loader = DirectoryIndexAggregator(root_dir, '') res = loader({'url': 'example.com/', 'param.coll': 'X'}) exp = [] @@ -152,4 +155,31 @@ def test_agg_no_dir_2(): assert(to_json_list(res) == exp) +def test_agg_dir_sources_1(): + res = dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'}) + exp = {'sources': {'colls/A/indexes': 'file', + 'colls/B/indexes': 'file', + 'colls/C/indexes': 'file'} + } + + assert(res == exp) + + +def test_agg_dir_sources_2(): + res = dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '[A,C]'}) + exp = {'sources': {'colls/A/indexes': 'file', + 'colls/C/indexes': 'file'} + } + + assert(res == exp) + + +def test_agg_dir_sources_single_dir(): + loader = DirectoryIndexAggregator('testdata/', '') + res = loader.get_source_list({'url': 'example.com/'}) + + exp = {'sources': {}} + + assert(res == exp) + diff --git a/test/test_handlers.py b/test/test_handlers.py index 1e2d2822..f5ac05a2 100644 --- a/test/test_handlers.py +++ b/test/test_handlers.py @@ -42,13 +42,17 @@ def setup_module(self): source3 = SimpleAggregator({'example': FileIndexSource(to_path('testdata/example.cdxj'))}) handler3 = DefaultResourceHandler(source3, to_path('testdata/')) - add_route('/fallback', HandlerSeq([handler3, handler2, live_handler])) + add_route('/seq', HandlerSeq([handler3, + handler2])) - bottle.debug = True + add_route('/empty', HandlerSeq([])) + add_route('/invalid', HandlerSeq(['foo'])) + + application.debug = True global testapp testapp = webtest.TestApp(application) @@ -61,8 +65,23 @@ class TestResAgg(object): def setup(self): self.testapp = testapp + def test_list_handlers(self): + resp = self.testapp.get('/many?mode=list_modes') + assert resp.json == {'modes': ['list_modes', 'list_sources', 'index', 'resource']} + + resp = self.testapp.get('/many?mode=other') + assert resp.json == {'modes': ['list_modes', 'list_sources', 'index', 'resource']} + + # defaults to resource, must specify url + resp = self.testapp.get('/many', status=400) + assert resp.json == {'message': 'The "url" param is required'} + + def test_list_sources(self): + resp = self.testapp.get('/many?mode=list_sources') + assert resp.json == {'sources': {'local': 'file_dir', 'ia': 'memento', 'rhiz': 'memento', 'live': 'live'}} + def test_live_index(self): - resp = self.testapp.get('/live?url=http://httpbin.org/get&mode=index&output=json') + resp = self.testapp.get('/live?mode=index&url=http://httpbin.org/get&output=json') resp.charset = 'utf-8' res = to_json_list(resp.text) @@ -71,7 +90,8 @@ class TestResAgg(object): 'load_url': 'http://httpbin.org/get', 'source': 'live', 'timestamp': '2016'}]) def test_live_resource(self): - resp = self.testapp.get('/live?url=http://httpbin.org/get?foo=bar&mode=resource') + headers = {'foo': 'bar'} + resp = self.testapp.get('/live?url=http://httpbin.org/get?foo=bar', headers=headers) assert resp.headers['WARC-Coll'] == 'live' assert resp.headers['WARC-Target-URI'] == 'http://httpbin.org/get?foo=bar' @@ -82,7 +102,7 @@ class TestResAgg(object): def test_live_post_resource(self): - resp = self.testapp.post('/live?url=http://httpbin.org/post&mode=resource', + resp = self.testapp.post('/live?url=http://httpbin.org/post', OrderedDict([('foo', 'bar')])) assert resp.headers['WARC-Coll'] == 'live' @@ -204,6 +224,11 @@ foo=bar&test=abc""" assert resp.headers['WARC-Target-URI'] == 'http://example.com/' assert b'HTTP/1.1 200 OK' in resp.body + def test_error_fallback_live_not_found(self): + resp = self.testapp.get('/fallback?url=http://invalid.url-not-found', status=400) + + assert resp.json == {'message': 'http://invalid.url-not-found'} + def test_agg_local_revisit(self): resp = self.testapp.get('/many?url=http://www.example.com/&closest=20140127171251&sources=local') @@ -214,3 +239,24 @@ foo=bar&test=abc""" assert resp.headers['WARC-Refers-To-Date'] == '2014-01-27T17:12:00Z' assert b'HTTP/1.1 200 OK' in resp.body assert b'' in resp.body + + def test_error_invalid_index_output(self): + resp = self.testapp.get('/live?mode=index&url=http://httpbin.org/get&output=foobar', status=400) + + assert resp.json == {'message': 'output=foobar not supported'} + + def test_error_local_not_found(self): + resp = self.testapp.get('/many?url=http://not-found.error/&sources=local', status=404) + + assert resp.json == {'message': 'No Resource Found'} + + def test_error_empty(self): + resp = self.testapp.get('/empty?url=http://example.com/', status=404) + + assert resp.json == {'message': 'No Resource Found'} + + def test_error_invalid(self): + resp = self.testapp.get('/invalid?url=http://example.com/', status=500) + + assert resp.json['message'].startswith('Internal Error') + diff --git a/test/test_indexsource.py b/test/test_indexsource.py index 643bd3e0..c935a5fd 100644 --- a/test/test_indexsource.py +++ b/test/test_indexsource.py @@ -32,16 +32,20 @@ local_sources = [ remote_sources = [ - RemoteIndexSource('http://webenact.rhizome.org/all-cdx', + RemoteIndexSource('http://webenact.rhizome.org/all-cdx?url={url}', 'http://webenact.rhizome.org/all/{timestamp}id_/{url}'), - MementoIndexSource('http://webenact.rhizome.org/all/', - 'http://webenact.rhizome.org/all/timemap/*/', + MementoIndexSource('http://webenact.rhizome.org/all/{url}', + 'http://webenact.rhizome.org/all/timemap/*/{url}', 'http://webenact.rhizome.org/all/{timestamp}id_/{url}') ] +ait_source = RemoteIndexSource('http://wayback.archive-it.org/cdx?url={url}', + 'http://wayback.archive-it.org/all/{timestamp}id_/{url}') + def query_single_source(source, params): + string = str(source) return SimpleAggregator({'source': source})(params) @@ -182,4 +186,22 @@ def test_file_not_found(): +def test_ait_filters(): + ait_source = RemoteIndexSource('http://wayback.archive-it.org/cdx/search/cdx?url={url}&filter=filename:ARCHIVEIT-({colls})-.*', + 'http://wayback.archive-it.org/all/{timestamp}id_/{url}') + + cdxlist = query_single_source(ait_source, {'url': 'http://iana.org/', 'param.source.colls': '5610|933'}) + filenames = [cdx['filename'] for cdx in cdxlist] + + prefix = ('ARCHIVEIT-5610-', 'ARCHIVEIT-933-') + + assert(all([x.startswith(prefix) for x in filenames])) + + + cdxlist = query_single_source(ait_source, {'url': 'http://iana.org/', 'param.source.colls': '1883|366|905'}) + filenames = [cdx['filename'] for cdx in cdxlist] + + prefix = ('ARCHIVEIT-1883-', 'ARCHIVEIT-366-', 'ARCHIVEIT-905-') + + assert(all([x.startswith(prefix) for x in filenames])) diff --git a/test/test_memento_agg.py b/test/test_memento_agg.py index be49fe9c..59040670 100644 --- a/test/test_memento_agg.py +++ b/test/test_memento_agg.py @@ -27,10 +27,11 @@ aggs = {'simple': SimpleAggregator(sources), 'processes': ThreadedTimeoutAggregator(sources, timeout=5.0, use_processes=True), } -#@pytest.mark.parametrize("agg", aggs, ids=["simple", "gevent_timeout"]) -def pytest_generate_tests(metafunc): - metafunc.parametrize("agg", list(aggs.values()), ids=list(aggs.keys())) +#def pytest_generate_tests(metafunc): +# metafunc.parametrize("agg", list(aggs.values()), ids=list(aggs.keys())) + +@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys())) def test_mem_agg_index_1(agg): url = 'http://iana.org/' res = agg(dict(url=url, closest='20140126000000', limit=5)) @@ -46,6 +47,7 @@ def test_mem_agg_index_1(agg): assert(json_list(res) == exp) +@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys())) def test_mem_agg_index_2(agg): url = 'http://example.com/' res = agg(dict(url=url, closest='20100512', limit=6)) @@ -60,6 +62,7 @@ def test_mem_agg_index_2(agg): assert(json_list(res) == exp) +@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys())) def test_mem_agg_index_3(agg): url = 'http://vvork.com/' res = agg(dict(url=url, closest='20141001', limit=5)) @@ -73,6 +76,7 @@ def test_mem_agg_index_3(agg): assert(json_list(res) == exp) +@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys())) def test_mem_agg_index_4(agg): url = 'http://vvork.com/' res = agg(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait')) @@ -83,10 +87,11 @@ def test_mem_agg_index_4(agg): assert(json_list(res) == exp) -def test_handler_output_cdxj(agg): - loader = IndexHandler(agg) +def test_handler_output_cdxj(): + agg = GeventTimeoutAggregator(sources, timeout=5.0) + handler = IndexHandler(agg) url = 'http://vvork.com/' - res = loader(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait')) + res = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait')) exp = """\ com,vvork)/ 20141006184357 {"url": "http://www.vvork.com/", "mem_rel": "memento", "memento_url": "http://webenact.rhizome.org/vvork/20141006184357/http://www.vvork.com/", "load_url": "http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"} @@ -96,10 +101,11 @@ com,vvork)/ 20131004231540 {"url": "http://vvork.com/", "mem_rel": "last memento assert(''.join(res) == exp) -def test_handler_output_json(agg): - loader = IndexHandler(agg) +def test_handler_output_json(): + agg = GeventTimeoutAggregator(sources, timeout=5.0) + handler = IndexHandler(agg) url = 'http://vvork.com/' - res = loader(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='json')) + res = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='json')) exp = """\ {"urlkey": "com,vvork)/", "timestamp": "20141006184357", "url": "http://www.vvork.com/", "mem_rel": "memento", "memento_url": "http://webenact.rhizome.org/vvork/20141006184357/http://www.vvork.com/", "load_url": "http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"} @@ -109,22 +115,50 @@ def test_handler_output_json(agg): assert(''.join(res) == exp) -def test_handler_output_link(agg): - loader = IndexHandler(agg) +def test_handler_output_link(): + agg = GeventTimeoutAggregator(sources, timeout=5.0) + handler = IndexHandler(agg) url = 'http://vvork.com/' - res = loader(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='link')) + res = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='link')) exp = """\ ; rel="memento"; datetime="Mon, 06 Oct 2014 18:43:57 GMT"; src="rhiz", -; rel="memento"; datetime="Fri, 04 Oct 2013 23:15:40 GMT"; src="ait"\ +; rel="memento"; datetime="Fri, 04 Oct 2013 23:15:40 GMT"; src="ait" """ assert(''.join(res) == exp) -def test_handler_output_text(agg): - loader = IndexHandler(agg) +def test_handler_output_link_2(): + agg = GeventTimeoutAggregator(sources, timeout=5.0) + handler = IndexHandler(agg) + url = 'http://iana.org/' + res = handler(dict(url=url, closest='20140126000000', limit=5, output='link')) + + exp = """\ +; rel="memento"; datetime="Sun, 26 Jan 2014 09:37:43 GMT"; src="ia", +; rel="memento"; datetime="Sun, 26 Jan 2014 20:06:24 GMT"; src="local", +; rel="memento"; datetime="Thu, 23 Jan 2014 03:47:55 GMT"; src="ia", +; rel="memento"; datetime="Wed, 29 Jan 2014 17:52:03 GMT"; src="ia", +; rel="memento"; datetime="Tue, 07 Jan 2014 04:05:52 GMT"; src="ait" +""" + assert(''.join(res) == exp) + + +def test_handler_output_link_3(): + agg = GeventTimeoutAggregator(sources, timeout=5.0) + handler = IndexHandler(agg) + url = 'http://foo.bar.non-existent' + res = handler(dict(url=url, closest='20140126000000', limit=5, output='link')) + + exp = '' + + assert(''.join(res) == exp) + +def test_handler_output_text(): + agg = GeventTimeoutAggregator(sources, timeout=5.0) + handler = IndexHandler(agg) url = 'http://vvork.com/' - res = loader(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='text')) + res = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='text')) exp = """\ com,vvork)/ 20141006184357 http://www.vvork.com/ memento http://webenact.rhizome.org/vvork/20141006184357/http://www.vvork.com/ http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/ rhiz @@ -133,9 +167,10 @@ com,vvork)/ 20131004231540 http://vvork.com/ last memento http://wayback.archive assert(''.join(res) == exp) -def test_handler_list_sources(agg): - loader = IndexHandler(agg) - res = loader(dict(mode='sources')) +def test_handler_list_sources(): + agg = GeventTimeoutAggregator(sources, timeout=5.0) + handler = IndexHandler(agg) + res = handler(dict(mode='list_sources')) assert(res == {'sources': {'bl': 'memento', 'ait': 'memento', @@ -143,4 +178,3 @@ def test_handler_list_sources(agg): 'rhiz': 'memento', 'local': 'file'}}) -