separate iter_sources from list_sources api

all errors returned as json block with error msg; tests for not found, invalid errors
2025-03-15 08:04:49 +01:00 · 2016-02-29 12:34:06 -08:00 · 2016-02-29 12:34:06 -08:00 · 008e5284b1
commit 008e5284b1
parent 68090d00c1
12 changed files with 304 additions and 145 deletions
--- a/rezag/aggindexsource.py
+++ b/rezag/aggindexsource.py
@ -63,7 +63,6 @@ class BaseAggregator(object):
        try:
            _src_params = all_params['_all_src_params'].get(name)
            all_params['_src_params'] = _src_params
-
            cdx_iter = source.load_index(all_params)
        except NotFoundException as nf:
            print('Not found in ' + name)
@ -89,15 +88,21 @@ class BaseAggregator(object):

        return cdx_iter

-    def _on_source_error(self, name):
+    def _on_source_error(self, name):  #pragma: no cover
        pass

    def _load_all(self, params):  #pragma: no cover
        raise NotImplemented()

-    def get_sources(self, params):  #pragma: no cover
+    def _iter_sources(self, params):  #pragma: no cover
        raise NotImplemented()

+    def get_source_list(self, params):
+        srcs = self._iter_sources(params)
+        result = [(name, str(value)) for name, value in srcs]
+        result = {'sources': dict(result)}
+        return result
+

 #=============================================================================
 class BaseSourceListAggregator(BaseAggregator):
@ -107,7 +112,7 @@ class BaseSourceListAggregator(BaseAggregator):
    def get_all_sources(self, params):
        return self.sources

-    def get_sources(self, params):
+    def _iter_sources(self, params):
        sources = self.get_all_sources(params)
        srcs_list = params.get('sources')
        if not srcs_list:
@ -125,7 +130,7 @@ class SeqAggMixin(object):


    def _load_all(self, params):
-        sources = list(self.get_sources(params))
+        sources = list(self._iter_sources(params))
        return list([self.load_child_source(name, source, params)
                     for name, source in sources])

@ -160,8 +165,8 @@ class TimeoutMixin(object):

        return False

-    def get_sources(self, params):
-        sources = super(TimeoutMixin, self).get_sources(params)
+    def _iter_sources(self, params):
+        sources = super(TimeoutMixin, self)._iter_sources(params)
        for name, source in sources:
            if not self.is_timed_out(name):
                yield name, source
@ -185,7 +190,7 @@ class GeventMixin(object):
    def _load_all(self, params):
        params['_timeout'] = self.timeout

-        sources = list(self.get_sources(params))
+        sources = list(self._iter_sources(params))

        def do_spawn(name, source):
            return self.pool.spawn(self.load_child_source, name, source, params)
@ -223,7 +228,7 @@ class ConcurrentMixin(object):
    def _load_all(self, params):
        params['_timeout'] = self.timeout

-        sources = list(self.get_sources(params))
+        sources = list(self._iter_sources(params))

        with self.pool_class(max_workers=self.size) as executor:
            def do_spawn(name, source):
@ -257,7 +262,8 @@ class BaseDirectoryIndexAggregator(BaseAggregator):
        self.base_prefix = base_prefix
        self.base_dir = base_dir

-    def get_sources(self, params):
+    def _iter_sources(self, params):
+        self._set_src_params(params)
        # see if specific params (when part of another agg)
        src_params = params.get('_src_params')
        if not src_params:
@ -270,7 +276,6 @@ class BaseDirectoryIndexAggregator(BaseAggregator):
            the_dir = self.base_dir

        the_dir = os.path.join(self.base_prefix, the_dir)
-
        try:
            sources = list(self._load_files(the_dir))
        except Exception:
@ -290,6 +295,10 @@ class BaseDirectoryIndexAggregator(BaseAggregator):
                        rel_path = ''
                    yield rel_path, FileIndexSource(filename)

+    def __str__(self):
+        return 'file_dir'
+
+
 class DirectoryIndexAggregator(SeqAggMixin, BaseDirectoryIndexAggregator):
    pass

--- a/rezag/app.py
+++ b/rezag/app.py
@ -1,31 +1,50 @@
-from rezag.inputrequest import WSGIInputRequest, POSTInputRequest
-from bottle import route, request, response, default_app
+from rezag.inputrequest import DirectWSGIInputRequest, POSTInputRequest
+from bottle import route, request, response, default_app, abort
+
+from pywb.utils.wbexception import WbException
+
+import traceback
+import json
+
+def err_handler(exc):
+    response.status = exc.status_code
+    response.content_type = 'application/json'
+    return json.dumps({'message': exc.body})
+
+def wrap_error(func):
+    def do_d(*args, **kwargs):
+        try:
+            return func(*args, **kwargs)
+        except WbException as exc:
+            if application.debug:
+                traceback.print_exc()
+            abort(exc.status(), exc.msg)
+        except Exception as e:
+            if application.debug:
+                traceback.print_exc()
+            abort(500, 'Internal Error: ' + str(e))
+
+    return do_d


 def add_route(path, handler):
-    def debug(func):
-        def do_d():
-            try:
-                return func()
-            except Exception:
-                import traceback
-                traceback.print_exc()
-
-        return do_d
-
-    def direct_input_request():
+    @wrap_error
+    def direct_input_request(mode=''):
        params = dict(request.query)
-        params['_input_req'] = WSGIInputRequest(request.environ)
+        params['_input_req'] = DirectWSGIInputRequest(request.environ)
        return handler(params)

-    def post_fullrequest():
+    @wrap_error
+    def post_fullrequest(mode=''):
        params = dict(request.query)
        params['_input_req'] = POSTInputRequest(request.environ)
        return handler(params)

-    route(path + '/postreq', method=['POST'], callback=debug(post_fullrequest))
-    route(path, method=['ANY'], callback=debug(direct_input_request))
+    route(path + '/postreq', method=['POST'], callback=post_fullrequest)
+    route(path, method=['ANY'], callback=direct_input_request)


 application = default_app()
+application.default_error_handler = err_handler
+

--- a/rezag/handlers.py
+++ b/rezag/handlers.py
@ -1,12 +1,13 @@
-from rezag.responseloader import  WARCPathHandler, LiveWebHandler
+from rezag.responseloader import  WARCPathLoader, LiveWebLoader
 from rezag.utils import MementoUtils
-from pywb.warc.recordloader import ArchiveLoadFailed
+from pywb.utils.wbexception import BadRequestException, WbException
+from pywb.utils.wbexception import NotFoundException
 from bottle import response


 #=============================================================================
 def to_cdxj(cdx_iter, fields):
-    response.headers['Content-Type'] = 'text/x-cdxj'
+    response.headers['Content-Type'] = 'application/x-cdxj'
    return [cdx.to_cdxj(fields) for cdx in cdx_iter]

 def to_json(cdx_iter, fields):
@ -37,26 +38,36 @@ class IndexHandler(object):
        self.index_source = index_source
        self.opts = opts or {}

-    def __call__(self, params):
-        if params.get('mode') == 'sources':
-            srcs = self.index_source.get_sources(params)
-            result = [(name, str(value)) for name, value in srcs]
-            result = {'sources': dict(result)}
-            return result
+    def get_supported_modes(self):
+        return dict(modes=['list_modes', 'list_sources', 'index'])
+
+    def _load_index_source(self, params):
+        url = params.get('url')
+        if not url:
+            raise BadRequestException('The "url" param is required')

        input_req = params.get('_input_req')
        if input_req:
-            params['alt_url'] = input_req.include_post_query(params.get('url'))
+            params['alt_url'] = input_req.include_post_query(url)

-        cdx_iter = self.index_source(params)
+        return self.index_source(params)
+
+    def __call__(self, params):
+        mode = params.get('mode', 'index')
+        if mode == 'list_sources':
+            return self.index_source.get_source_list(params)
+
+        if mode == 'list_modes' or mode != 'index':
+            return self.get_supported_modes()

        output = params.get('output', self.DEF_OUTPUT)
        fields = params.get('fields')

        handler = self.OUTPUTS.get(output)
        if not handler:
-            handler = self.OUTPUTS[self.DEF_OUTPUT]
+            raise BadRequestException('output={0} not supported'.format(output))

+        cdx_iter = self._load_index_source(params)
        res = handler(cdx_iter, fields)
        return res

@ -67,57 +78,59 @@ class ResourceHandler(IndexHandler):
        super(ResourceHandler, self).__init__(index_source)
        self.resource_loaders = resource_loaders

+    def get_supported_modes(self):
+        res = super(ResourceHandler, self).get_supported_modes()
+        res['modes'].append('resource')
+        return res
+
    def __call__(self, params):
        if params.get('mode', 'resource') != 'resource':
            return super(ResourceHandler, self).__call__(params)

-        input_req = params.get('_input_req')
-        if input_req:
-            params['alt_url'] = input_req.include_post_query(params.get('url'))
-
-        cdx_iter = self.index_source(params)
-
-        any_found = False
+        cdx_iter = self._load_index_source(params)
+        last_exc = None

        for cdx in cdx_iter:
-            any_found = True
-
            for loader in self.resource_loaders:
                try:
                    resp = loader(cdx, params)
-                    if resp:
+                    if resp is not None:
                        return resp
-                except ArchiveLoadFailed as e:
-                    print(e)
-                    pass
+                except WbException as e:
+                    last_exc = e

-        if any_found:
-            raise ArchiveLoadFailed('Resource Found, could not be Loaded')
+        if last_exc:
+            raise last_exc
+            #raise ArchiveLoadFailed('Resource Found, could not be Loaded')
        else:
-            raise ArchiveLoadFailed('No Resource Found')
+            raise NotFoundException('No Resource Found')


 #=============================================================================
 class DefaultResourceHandler(ResourceHandler):
    def __init__(self, index_source, warc_paths=''):
-        loaders = [WARCPathHandler(warc_paths, index_source),
-                   LiveWebHandler()
+        loaders = [WARCPathLoader(warc_paths, index_source),
+                   LiveWebLoader()
                  ]
        super(DefaultResourceHandler, self).__init__(index_source, loaders)


 #=============================================================================
 class HandlerSeq(object):
-    def __init__(self, loaders):
-        self.loaders = loaders
+    def __init__(self, handlers):
+        self.handlers = handlers

    def __call__(self, params):
-        for loader in self.loaders:
+        last_exc = None
+        for handler in self.handlers:
            try:
-                res = loader(params)
-                if res:
+                res = handler(params)
+                if res is not None:
                    return res
-            except ArchiveLoadFailed:
-                pass
+            except WbException as e:
+                last_exc = e

-        raise ArchiveLoadFailed('No Resource Found')
+        if last_exc:
+            raise last_exc
+        else:
+            raise NotFoundException('No Resource Found')
--- a/rezag/indexsource.py
+++ b/rezag/indexsource.py
@ -14,6 +14,9 @@ from rezag.liverec import patched_requests as requests
 from rezag.utils import MementoUtils


+WAYBACK_ORIG_SUFFIX = '{timestamp}id_/{url}'
+
+
 #=============================================================================
 class BaseIndexSource(object):
    def load_index(self, params):  #pragma: no cover
@ -22,10 +25,10 @@ class BaseIndexSource(object):
    @staticmethod
    def res_template(template, params):
        src_params = params.get('_src_params')
-        if src_params:
-            res = template.format(**src_params)
+        if not src_params:
+            res = template.format(url=params['url'])
        else:
-            res = template
+            res = template.format(url=params['url'], **src_params)
        return res


@ -59,7 +62,7 @@ class RemoteIndexSource(BaseIndexSource):

    def load_index(self, params):
        api_url = self.res_template(self.api_url_template, params)
-        api_url += '?url=' + params['url']
+        print('API URL', api_url)
        r = requests.get(api_url, timeout=params.get('_timeout'))
        if r.status_code >= 400:
            raise NotFoundException(api_url)
@ -169,7 +172,6 @@ class MementoIndexSource(BaseIndexSource):

    def get_timegate_links(self, params, closest):
        url = self.res_template(self.timegate_url, params)
-        url += params['url']
        accept_dt = timestamp_to_http_date(closest)
        res = requests.head(url, headers={'Accept-Datetime': accept_dt})
        if res.status_code >= 400:
@ -179,7 +181,6 @@ class MementoIndexSource(BaseIndexSource):

    def get_timemap_links(self, params):
        url = self.res_template(self.timemap_url, params)
-        url += params['url']
        res = requests.get(url, timeout=params.get('_timeout'))
        if res.status_code >= 400:
            raise NotFoundException(url)
@ -200,9 +201,9 @@ class MementoIndexSource(BaseIndexSource):

    @staticmethod
    def from_timegate_url(timegate_url, path='link'):
-        return MementoIndexSource(timegate_url,
-                                  timegate_url + 'timemap/' + path + '/',
-                                  timegate_url + '{timestamp}id_/{url}')
+        return MementoIndexSource(timegate_url + '{url}',
+                                  timegate_url + 'timemap/' + path + '/{url}',
+                                  timegate_url + WAYBACK_ORIG_SUFFIX)

    def __str__(self):
        return 'memento'
--- a/rezag/inputrequest.py
+++ b/rezag/inputrequest.py
@ -1,4 +1,3 @@
-from pywb.utils.loaders import extract_client_cookie
 from pywb.utils.loaders import extract_post_query, append_post_query
 from pywb.utils.loaders import LimitReader
 from pywb.utils.statusandheaders import StatusAndHeadersParser
@ -9,7 +8,7 @@ from io import BytesIO


 #=============================================================================
-class WSGIInputRequest(object):
+class DirectWSGIInputRequest(object):
    def __init__(self, env):
        self.env = env

@ -20,26 +19,10 @@ class WSGIInputRequest(object):
        headers = {}

        for name, value in iteritems(self.env):
+            # will be set by requests to match actual host
            if name == 'HTTP_HOST':
-                #name = 'Host'
-                #value = splits.netloc
-                # will be set automatically
                continue

-            #elif name == 'HTTP_ORIGIN':
-            #    name = 'Origin'
-            #    value = (splits.scheme + '://' + splits.netloc)
-
-            elif name == 'HTTP_X_CSRFTOKEN':
-                name = 'X-CSRFToken'
-                cookie_val = extract_client_cookie(env, 'csrftoken')
-                if cookie_val:
-                    value = cookie_val
-
-            #elif name == 'HTTP_X_FORWARDED_PROTO':
-            #    name = 'X-Forwarded-Proto'
-            #    value = splits.scheme
-
            elif name.startswith('HTTP_'):
                name = name[5:].title().replace('_', '-')

@ -55,10 +38,7 @@ class WSGIInputRequest(object):
        return headers

    def get_req_body(self):
-        input_ = self.env.get('wsgi.input')
-        if not input_:
-            return None
-
+        input_ = self.env['wsgi.input']
        len_ = self._get_content_length()
        enc = self._get_header('Transfer-Encoding')

@ -70,9 +50,6 @@ class WSGIInputRequest(object):
            data = None

        return data
-        #buf = data.read().decode('utf-8')
-        #print(buf)
-        #return StringIO(buf)

    def _get_content_type(self):
        return self.env.get('CONTENT_TYPE')
@ -105,7 +82,7 @@ class WSGIInputRequest(object):


 #=============================================================================
-class POSTInputRequest(WSGIInputRequest):
+class POSTInputRequest(DirectWSGIInputRequest):
    def __init__(self, env):
        self.env = env

--- a/rezag/responseloader.py
+++ b/rezag/responseloader.py
@ -2,6 +2,7 @@ from rezag.liverec import BaseRecorder
 from rezag.liverec import request as remote_request

 from pywb.utils.timeutils import timestamp_to_datetime, datetime_to_http_date
+from pywb.utils.wbexception import LiveResourceException
 from pywb.warc.resolvingloader import ResolvingLoader

 from io import BytesIO
@ -29,7 +30,7 @@ def incr_reader(stream, header=None, size=8192):


 #=============================================================================
-class WARCPathHandler(object):
+class WARCPathLoader(object):
    def __init__(self, paths, cdx_source):
        self.paths = paths
        if isinstance(paths, str):
@ -108,7 +109,7 @@ class HeaderRecorder(BaseRecorder):


 #=============================================================================
-class LiveWebHandler(object):
+class LiveWebLoader(object):
    SKIP_HEADERS = (b'link',
                    b'memento-datetime',
                    b'content-location',
@ -140,14 +141,17 @@ class LiveWebHandler(object):
        method = input_req.get_req_method()
        data = input_req.get_req_body()

-        upstream_res = remote_request(url=load_url,
-                                      method=method,
-                                      recorder=recorder,
-                                      stream=True,
-                                      allow_redirects=False,
-                                      headers=req_headers,
-                                      data=data,
-                                      timeout=params.get('_timeout'))
+        try:
+            upstream_res = remote_request(url=load_url,
+                                          method=method,
+                                          recorder=recorder,
+                                          stream=True,
+                                          allow_redirects=False,
+                                          headers=req_headers,
+                                          data=data,
+                                          timeout=params.get('_timeout'))
+        except Exception:
+            raise LiveResourceException(load_url)

        resp_headers = recorder.get_header()

@ -175,7 +179,7 @@ class LiveWebHandler(object):
        return dt.strftime('%Y-%m-%dT%H:%M:%SZ')

    @staticmethod
-    def _make_warc_id(id_=None):
+    def _make_warc_id(id_=None):  #pragma: no cover
        if not id_:
            id_ = uuid.uuid1()
        return '<urn:uuid:{0}>'.format(id_)
--- a/rezag/utils.py
+++ b/rezag/utils.py
@ -77,6 +77,7 @@ class MementoUtils(object):
            from_date = timestamp_to_http_date(first_cdx['timestamp'])
        except StopIteration:
            first_cdx = None
+            return

        # first memento link
        yield MementoUtils.make_timemap_memento_link(first_cdx, datetime=from_date)
@ -91,4 +92,4 @@ class MementoUtils(object):

        # last memento link, if any
        if prev_cdx:
-            yield MementoUtils.make_timemap_memento_link(prev_cdx, end='')
+            yield MementoUtils.make_timemap_memento_link(prev_cdx, end='\n')
--- a/setup.py
+++ b/setup.py
@ -32,8 +32,11 @@ setup(
        'rezag',
        ],
    install_requires=[
-        'pywb',
+        'pywb==1.0b',
        ],
+    dependency_links=[
+        'git+https://github.com/ikreymer/pywb.git@py3#egg=pywb-1.0b-py3',
+    ],
    zip_safe=True,
    entry_points="""
        [console_scripts]
--- a/test/test_dir_agg.py
+++ b/test/test_dir_agg.py
@ -33,6 +33,9 @@ def setup_module():
    shutil.copy(to_path('testdata/iana.cdxj'), coll_B)
    shutil.copy(to_path('testdata/dupes.cdxj'), coll_C)

+    with open(to_path(root_dir) + 'somefile', 'w') as fh:
+        fh.write('foo')
+
    global dir_loader
    dir_loader = DirectoryIndexAggregator(dir_prefix, dir_path)

@ -121,7 +124,7 @@ def test_agg_dir_and_memento():
               'local': dir_loader}
    agg_source = SimpleAggregator(sources)

-    res = agg_source({'url': 'example.com/', 'param.coll': '*', 'closest': '20100512', 'limit': 6})
+    res = agg_source({'url': 'example.com/', 'param.local.coll': '*', 'closest': '20100512', 'limit': 6})

    exp = [
        {'source': 'ia', 'timestamp': '20100513052358', 'load_url': 'http://web.archive.org/web/20100513052358id_/http://example.com/'},
@ -144,7 +147,7 @@ def test_agg_no_dir_1():


 def test_agg_no_dir_2():
-    loader = DirectoryIndexAggregator(root_dir, 'no_such')
+    loader = DirectoryIndexAggregator(root_dir, '')
    res = loader({'url': 'example.com/', 'param.coll': 'X'})

    exp = []
@ -152,4 +155,31 @@ def test_agg_no_dir_2():
    assert(to_json_list(res) == exp)


+def test_agg_dir_sources_1():
+    res = dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'})
+    exp = {'sources': {'colls/A/indexes': 'file',
+                       'colls/B/indexes': 'file',
+                       'colls/C/indexes': 'file'}
+          }
+
+    assert(res == exp)
+
+
+def test_agg_dir_sources_2():
+    res = dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '[A,C]'})
+    exp = {'sources': {'colls/A/indexes': 'file',
+                       'colls/C/indexes': 'file'}
+          }
+
+    assert(res == exp)
+
+
+def test_agg_dir_sources_single_dir():
+    loader = DirectoryIndexAggregator('testdata/', '')
+    res = loader.get_source_list({'url': 'example.com/'})
+
+    exp = {'sources': {}}
+
+    assert(res == exp)
+

--- a/test/test_handlers.py
+++ b/test/test_handlers.py
@ -42,13 +42,17 @@ def setup_module(self):
    source3 = SimpleAggregator({'example': FileIndexSource(to_path('testdata/example.cdxj'))})
    handler3 = DefaultResourceHandler(source3, to_path('testdata/'))

-
    add_route('/fallback', HandlerSeq([handler3,
                                       handler2,
                                       live_handler]))

+    add_route('/seq', HandlerSeq([handler3,
+                                  handler2]))

-    bottle.debug = True
+    add_route('/empty', HandlerSeq([]))
+    add_route('/invalid', HandlerSeq(['foo']))
+
+    application.debug = True
    global testapp
    testapp = webtest.TestApp(application)

@ -61,8 +65,23 @@ class TestResAgg(object):
    def setup(self):
        self.testapp = testapp

+    def test_list_handlers(self):
+        resp = self.testapp.get('/many?mode=list_modes')
+        assert resp.json == {'modes': ['list_modes', 'list_sources', 'index', 'resource']}
+
+        resp = self.testapp.get('/many?mode=other')
+        assert resp.json == {'modes': ['list_modes', 'list_sources', 'index', 'resource']}
+
+        # defaults to resource, must specify url
+        resp = self.testapp.get('/many', status=400)
+        assert resp.json == {'message': 'The "url" param is required'}
+
+    def test_list_sources(self):
+        resp = self.testapp.get('/many?mode=list_sources')
+        assert resp.json == {'sources': {'local': 'file_dir', 'ia': 'memento', 'rhiz': 'memento', 'live': 'live'}}
+
    def test_live_index(self):
-        resp = self.testapp.get('/live?url=http://httpbin.org/get&mode=index&output=json')
+        resp = self.testapp.get('/live?mode=index&url=http://httpbin.org/get&output=json')
        resp.charset = 'utf-8'

        res = to_json_list(resp.text)
@ -71,7 +90,8 @@ class TestResAgg(object):
                        'load_url': 'http://httpbin.org/get', 'source': 'live', 'timestamp': '2016'}])

    def test_live_resource(self):
-        resp = self.testapp.get('/live?url=http://httpbin.org/get?foo=bar&mode=resource')
+        headers = {'foo': 'bar'}
+        resp = self.testapp.get('/live?url=http://httpbin.org/get?foo=bar', headers=headers)

        assert resp.headers['WARC-Coll'] == 'live'
        assert resp.headers['WARC-Target-URI'] == 'http://httpbin.org/get?foo=bar'
@ -82,7 +102,7 @@ class TestResAgg(object):


    def test_live_post_resource(self):
-        resp = self.testapp.post('/live?url=http://httpbin.org/post&mode=resource',
+        resp = self.testapp.post('/live?url=http://httpbin.org/post',
                                 OrderedDict([('foo', 'bar')]))

        assert resp.headers['WARC-Coll'] == 'live'
@ -204,6 +224,11 @@ foo=bar&test=abc"""
        assert resp.headers['WARC-Target-URI'] == 'http://example.com/'
        assert b'HTTP/1.1 200 OK' in resp.body

+    def test_error_fallback_live_not_found(self):
+        resp = self.testapp.get('/fallback?url=http://invalid.url-not-found', status=400)
+
+        assert resp.json == {'message': 'http://invalid.url-not-found'}
+
    def test_agg_local_revisit(self):
        resp = self.testapp.get('/many?url=http://www.example.com/&closest=20140127171251&sources=local')

@ -214,3 +239,24 @@ foo=bar&test=abc"""
        assert resp.headers['WARC-Refers-To-Date'] == '2014-01-27T17:12:00Z'
        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'<!doctype html>' in resp.body
+
+    def test_error_invalid_index_output(self):
+        resp = self.testapp.get('/live?mode=index&url=http://httpbin.org/get&output=foobar', status=400)
+
+        assert resp.json == {'message': 'output=foobar not supported'}
+
+    def test_error_local_not_found(self):
+        resp = self.testapp.get('/many?url=http://not-found.error/&sources=local', status=404)
+
+        assert resp.json == {'message': 'No Resource Found'}
+
+    def test_error_empty(self):
+        resp = self.testapp.get('/empty?url=http://example.com/', status=404)
+
+        assert resp.json == {'message': 'No Resource Found'}
+
+    def test_error_invalid(self):
+        resp = self.testapp.get('/invalid?url=http://example.com/', status=500)
+
+        assert resp.json['message'].startswith('Internal Error')
+
--- a/test/test_indexsource.py
+++ b/test/test_indexsource.py
@ -32,16 +32,20 @@ local_sources = [


 remote_sources = [
-    RemoteIndexSource('http://webenact.rhizome.org/all-cdx',
+    RemoteIndexSource('http://webenact.rhizome.org/all-cdx?url={url}',
                      'http://webenact.rhizome.org/all/{timestamp}id_/{url}'),

-    MementoIndexSource('http://webenact.rhizome.org/all/',
-                       'http://webenact.rhizome.org/all/timemap/*/',
+    MementoIndexSource('http://webenact.rhizome.org/all/{url}',
+                       'http://webenact.rhizome.org/all/timemap/*/{url}',
                       'http://webenact.rhizome.org/all/{timestamp}id_/{url}')
 ]

+ait_source = RemoteIndexSource('http://wayback.archive-it.org/cdx?url={url}',
+                               'http://wayback.archive-it.org/all/{timestamp}id_/{url}')
+

 def query_single_source(source, params):
+    string = str(source)
    return SimpleAggregator({'source': source})(params)


@ -182,4 +186,22 @@ def test_file_not_found():



+def test_ait_filters():
+    ait_source = RemoteIndexSource('http://wayback.archive-it.org/cdx/search/cdx?url={url}&filter=filename:ARCHIVEIT-({colls})-.*',
+                                   'http://wayback.archive-it.org/all/{timestamp}id_/{url}')
+
+    cdxlist = query_single_source(ait_source, {'url': 'http://iana.org/', 'param.source.colls': '5610|933'})
+    filenames = [cdx['filename'] for cdx in cdxlist]
+
+    prefix = ('ARCHIVEIT-5610-', 'ARCHIVEIT-933-')
+
+    assert(all([x.startswith(prefix) for x in filenames]))
+
+
+    cdxlist = query_single_source(ait_source, {'url': 'http://iana.org/', 'param.source.colls': '1883|366|905'})
+    filenames = [cdx['filename'] for cdx in cdxlist]
+
+    prefix = ('ARCHIVEIT-1883-', 'ARCHIVEIT-366-', 'ARCHIVEIT-905-')
+
+    assert(all([x.startswith(prefix) for x in filenames]))

--- a/test/test_memento_agg.py
+++ b/test/test_memento_agg.py
@ -27,10 +27,11 @@ aggs = {'simple': SimpleAggregator(sources),
        'processes': ThreadedTimeoutAggregator(sources, timeout=5.0, use_processes=True),
       }

-#@pytest.mark.parametrize("agg", aggs, ids=["simple", "gevent_timeout"])
-def pytest_generate_tests(metafunc):
-    metafunc.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
+#def pytest_generate_tests(metafunc):
+#    metafunc.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))

+
+@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
 def test_mem_agg_index_1(agg):
    url = 'http://iana.org/'
    res = agg(dict(url=url, closest='20140126000000', limit=5))
@ -46,6 +47,7 @@ def test_mem_agg_index_1(agg):
    assert(json_list(res) == exp)


+@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
 def test_mem_agg_index_2(agg):
    url = 'http://example.com/'
    res = agg(dict(url=url, closest='20100512', limit=6))
@ -60,6 +62,7 @@ def test_mem_agg_index_2(agg):
    assert(json_list(res) == exp)


+@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
 def test_mem_agg_index_3(agg):
    url = 'http://vvork.com/'
    res = agg(dict(url=url, closest='20141001', limit=5))
@ -73,6 +76,7 @@ def test_mem_agg_index_3(agg):
    assert(json_list(res) == exp)


+@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
 def test_mem_agg_index_4(agg):
    url = 'http://vvork.com/'
    res = agg(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait'))
@ -83,10 +87,11 @@ def test_mem_agg_index_4(agg):
    assert(json_list(res) == exp)


-def test_handler_output_cdxj(agg):
-    loader = IndexHandler(agg)
+def test_handler_output_cdxj():
+    agg = GeventTimeoutAggregator(sources, timeout=5.0)
+    handler = IndexHandler(agg)
    url = 'http://vvork.com/'
-    res = loader(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait'))
+    res = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait'))

    exp = """\
 com,vvork)/ 20141006184357 {"url": "http://www.vvork.com/", "mem_rel": "memento", "memento_url": "http://webenact.rhizome.org/vvork/20141006184357/http://www.vvork.com/", "load_url": "http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"}
@ -96,10 +101,11 @@ com,vvork)/ 20131004231540 {"url": "http://vvork.com/", "mem_rel": "last memento
    assert(''.join(res) == exp)


-def test_handler_output_json(agg):
-    loader = IndexHandler(agg)
+def test_handler_output_json():
+    agg = GeventTimeoutAggregator(sources, timeout=5.0)
+    handler = IndexHandler(agg)
    url = 'http://vvork.com/'
-    res = loader(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='json'))
+    res = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='json'))

    exp = """\
 {"urlkey": "com,vvork)/", "timestamp": "20141006184357", "url": "http://www.vvork.com/", "mem_rel": "memento", "memento_url": "http://webenact.rhizome.org/vvork/20141006184357/http://www.vvork.com/", "load_url": "http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"}
@ -109,22 +115,50 @@ def test_handler_output_json(agg):
    assert(''.join(res) == exp)


-def test_handler_output_link(agg):
-    loader = IndexHandler(agg)
+def test_handler_output_link():
+    agg = GeventTimeoutAggregator(sources, timeout=5.0)
+    handler = IndexHandler(agg)
    url = 'http://vvork.com/'
-    res = loader(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='link'))
+    res = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='link'))

    exp = """\
 <http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/>; rel="memento"; datetime="Mon, 06 Oct 2014 18:43:57 GMT"; src="rhiz",
-<http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/>; rel="memento"; datetime="Fri, 04 Oct 2013 23:15:40 GMT"; src="ait"\
+<http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/>; rel="memento"; datetime="Fri, 04 Oct 2013 23:15:40 GMT"; src="ait"
 """
    assert(''.join(res) == exp)


-def test_handler_output_text(agg):
-    loader = IndexHandler(agg)
+def test_handler_output_link_2():
+    agg = GeventTimeoutAggregator(sources, timeout=5.0)
+    handler = IndexHandler(agg)
+    url = 'http://iana.org/'
+    res = handler(dict(url=url, closest='20140126000000', limit=5, output='link'))
+
+    exp = """\
+<http://web.archive.org/web/20140126093743id_/http://iana.org/>; rel="memento"; datetime="Sun, 26 Jan 2014 09:37:43 GMT"; src="ia",
+<filename://iana.warc.gz>; rel="memento"; datetime="Sun, 26 Jan 2014 20:06:24 GMT"; src="local",
+<http://web.archive.org/web/20140123034755id_/http://iana.org/>; rel="memento"; datetime="Thu, 23 Jan 2014 03:47:55 GMT"; src="ia",
+<http://web.archive.org/web/20140129175203id_/http://iana.org/>; rel="memento"; datetime="Wed, 29 Jan 2014 17:52:03 GMT"; src="ia",
+<http://wayback.archive-it.org/all/20140107040552id_/http://iana.org/>; rel="memento"; datetime="Tue, 07 Jan 2014 04:05:52 GMT"; src="ait"
+"""
+    assert(''.join(res) == exp)
+
+
+def test_handler_output_link_3():
+    agg = GeventTimeoutAggregator(sources, timeout=5.0)
+    handler = IndexHandler(agg)
+    url = 'http://foo.bar.non-existent'
+    res = handler(dict(url=url, closest='20140126000000', limit=5, output='link'))
+
+    exp = ''
+
+    assert(''.join(res) == exp)
+
+def test_handler_output_text():
+    agg = GeventTimeoutAggregator(sources, timeout=5.0)
+    handler = IndexHandler(agg)
    url = 'http://vvork.com/'
-    res = loader(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='text'))
+    res = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='text'))

    exp = """\
 com,vvork)/ 20141006184357 http://www.vvork.com/ memento http://webenact.rhizome.org/vvork/20141006184357/http://www.vvork.com/ http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/ rhiz
@ -133,9 +167,10 @@ com,vvork)/ 20131004231540 http://vvork.com/ last memento http://wayback.archive
    assert(''.join(res) == exp)


-def test_handler_list_sources(agg):
-    loader = IndexHandler(agg)
-    res = loader(dict(mode='sources'))
+def test_handler_list_sources():
+    agg = GeventTimeoutAggregator(sources, timeout=5.0)
+    handler = IndexHandler(agg)
+    res = handler(dict(mode='list_sources'))

    assert(res == {'sources': {'bl': 'memento',
                               'ait': 'memento',
@ -143,4 +178,3 @@ def test_handler_list_sources(agg):
                               'rhiz': 'memento',
                               'local': 'file'}})

-