mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 08:04:49 +01:00

separate iter_sources from list_sources api

all errors returned as json block with error msg
tests for not found, invalid errors
This commit is contained in:
Ilya Kreymer 2016-02-29 12:34:06 -08:00
parent 68090d00c1
commit 008e5284b1
12 changed files with 304 additions and 145 deletions
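
The first change splits source enumeration in two: an internal _iter_sources iterator that mixins can filter lazily, and a public get_source_list call that returns a JSON-friendly dict. A minimal sketch of the resulting shape, assuming a toy source class (the diff below applies this to BaseAggregator and its mixins):

class ToySource(object):
    def __str__(self):
        return 'file'

class ToyAggregator(object):
    def __init__(self, sources):
        self.sources = sources

    def _iter_sources(self, params):
        # internal API: lazily yields (name, source) pairs; mixins may filter
        for name, source in self.sources.items():
            yield name, source

    def get_source_list(self, params):
        # public API: materializes the iterator into a JSON-serializable dict
        srcs = self._iter_sources(params)
        return {'sources': {name: str(value) for name, value in srcs}}

agg = ToyAggregator({'local': ToySource()})
assert agg.get_source_list({}) == {'sources': {'local': 'file'}}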

View File

@ -63,7 +63,6 @@ class BaseAggregator(object):
try:
_src_params = all_params['_all_src_params'].get(name)
all_params['_src_params'] = _src_params
cdx_iter = source.load_index(all_params)
except NotFoundException as nf:
print('Not found in ' + name)
@ -89,15 +88,21 @@ class BaseAggregator(object):
return cdx_iter
def _on_source_error(self, name):
def _on_source_error(self, name): #pragma: no cover
pass
def _load_all(self, params): #pragma: no cover
raise NotImplemented()
def get_sources(self, params): #pragma: no cover
def _iter_sources(self, params): #pragma: no cover
raise NotImplemented()
def get_source_list(self, params):
srcs = self._iter_sources(params)
result = [(name, str(value)) for name, value in srcs]
result = {'sources': dict(result)}
return result
#=============================================================================
class BaseSourceListAggregator(BaseAggregator):
@ -107,7 +112,7 @@ class BaseSourceListAggregator(BaseAggregator):
def get_all_sources(self, params):
return self.sources
def get_sources(self, params):
def _iter_sources(self, params):
sources = self.get_all_sources(params)
srcs_list = params.get('sources')
if not srcs_list:
@ -125,7 +130,7 @@ class SeqAggMixin(object):
def _load_all(self, params):
sources = list(self.get_sources(params))
sources = list(self._iter_sources(params))
return list([self.load_child_source(name, source, params)
for name, source in sources])
@ -160,8 +165,8 @@ class TimeoutMixin(object):
return False
def get_sources(self, params):
sources = super(TimeoutMixin, self).get_sources(params)
def _iter_sources(self, params):
sources = super(TimeoutMixin, self)._iter_sources(params)
for name, source in sources:
if not self.is_timed_out(name):
yield name, source
@ -185,7 +190,7 @@ class GeventMixin(object):
def _load_all(self, params):
params['_timeout'] = self.timeout
sources = list(self.get_sources(params))
sources = list(self._iter_sources(params))
def do_spawn(name, source):
return self.pool.spawn(self.load_child_source, name, source, params)
@ -223,7 +228,7 @@ class ConcurrentMixin(object):
def _load_all(self, params):
params['_timeout'] = self.timeout
sources = list(self.get_sources(params))
sources = list(self._iter_sources(params))
with self.pool_class(max_workers=self.size) as executor:
def do_spawn(name, source):
@ -257,7 +262,8 @@ class BaseDirectoryIndexAggregator(BaseAggregator):
self.base_prefix = base_prefix
self.base_dir = base_dir
def get_sources(self, params):
def _iter_sources(self, params):
self._set_src_params(params)
# see if specific params (when part of another agg)
src_params = params.get('_src_params')
if not src_params:
@ -270,7 +276,6 @@ class BaseDirectoryIndexAggregator(BaseAggregator):
the_dir = self.base_dir
the_dir = os.path.join(self.base_prefix, the_dir)
try:
sources = list(self._load_files(the_dir))
except Exception:
@ -290,6 +295,10 @@ class BaseDirectoryIndexAggregator(BaseAggregator):
rel_path = ''
yield rel_path, FileIndexSource(filename)
def __str__(self):
return 'file_dir'
class DirectoryIndexAggregator(SeqAggMixin, BaseDirectoryIndexAggregator):
pass

View File

@ -1,31 +1,50 @@
from rezag.inputrequest import WSGIInputRequest, POSTInputRequest
from bottle import route, request, response, default_app
from rezag.inputrequest import DirectWSGIInputRequest, POSTInputRequest
from bottle import route, request, response, default_app, abort
from pywb.utils.wbexception import WbException
import traceback
import json
def err_handler(exc):
response.status = exc.status_code
response.content_type = 'application/json'
return json.dumps({'message': exc.body})
def wrap_error(func):
def do_d(*args, **kwargs):
try:
return func(*args, **kwargs)
except WbException as exc:
if application.debug:
traceback.print_exc()
abort(exc.status(), exc.msg)
except Exception as e:
if application.debug:
traceback.print_exc()
abort(500, 'Internal Error: ' + str(e))
return do_d
def add_route(path, handler):
def debug(func):
def do_d():
try:
return func()
except Exception:
import traceback
traceback.print_exc()
return do_d
def direct_input_request():
@wrap_error
def direct_input_request(mode=''):
params = dict(request.query)
params['_input_req'] = WSGIInputRequest(request.environ)
params['_input_req'] = DirectWSGIInputRequest(request.environ)
return handler(params)
def post_fullrequest():
@wrap_error
def post_fullrequest(mode=''):
params = dict(request.query)
params['_input_req'] = POSTInputRequest(request.environ)
return handler(params)
route(path + '/postreq', method=['POST'], callback=debug(post_fullrequest))
route(path, method=['ANY'], callback=debug(direct_input_request))
route(path + '/postreq', method=['POST'], callback=post_fullrequest)
route(path, method=['ANY'], callback=direct_input_request)
application = default_app()
application.default_error_handler = err_handler
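
A self-contained sketch of the JSON error contract this file sets up, assuming only bottle (the /fail route is hypothetical): wrap_error converts exceptions into abort() calls, and the app-level err_handler renders every resulting HTTPError as a JSON block.

import json
from bottle import Bottle, abort, response

app = Bottle()

def err_handler(exc):
    # exc is a bottle.HTTPError; mirror its status, emit its message as JSON
    response.status = exc.status_code
    response.content_type = 'application/json'
    return json.dumps({'message': exc.body})

app.default_error_handler = err_handler

@app.route('/fail')
def fail():
    abort(400, 'The "url" param is required')

# GET /fail -> status 400, body: {"message": "The \"url\" param is required"}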

View File

@ -1,12 +1,13 @@
from rezag.responseloader import WARCPathHandler, LiveWebHandler
from rezag.responseloader import WARCPathLoader, LiveWebLoader
from rezag.utils import MementoUtils
from pywb.warc.recordloader import ArchiveLoadFailed
from pywb.utils.wbexception import BadRequestException, WbException
from pywb.utils.wbexception import NotFoundException
from bottle import response
#=============================================================================
def to_cdxj(cdx_iter, fields):
response.headers['Content-Type'] = 'text/x-cdxj'
response.headers['Content-Type'] = 'application/x-cdxj'
return [cdx.to_cdxj(fields) for cdx in cdx_iter]
def to_json(cdx_iter, fields):
@ -37,26 +38,36 @@ class IndexHandler(object):
self.index_source = index_source
self.opts = opts or {}
def __call__(self, params):
if params.get('mode') == 'sources':
srcs = self.index_source.get_sources(params)
result = [(name, str(value)) for name, value in srcs]
result = {'sources': dict(result)}
return result
def get_supported_modes(self):
return dict(modes=['list_modes', 'list_sources', 'index'])
def _load_index_source(self, params):
url = params.get('url')
if not url:
raise BadRequestException('The "url" param is required')
input_req = params.get('_input_req')
if input_req:
params['alt_url'] = input_req.include_post_query(params.get('url'))
params['alt_url'] = input_req.include_post_query(url)
cdx_iter = self.index_source(params)
return self.index_source(params)
def __call__(self, params):
mode = params.get('mode', 'index')
if mode == 'list_sources':
return self.index_source.get_source_list(params)
if mode == 'list_modes' or mode != 'index':
return self.get_supported_modes()
output = params.get('output', self.DEF_OUTPUT)
fields = params.get('fields')
handler = self.OUTPUTS.get(output)
if not handler:
handler = self.OUTPUTS[self.DEF_OUTPUT]
raise BadRequestException('output={0} not supported'.format(output))
cdx_iter = self._load_index_source(params)
res = handler(cdx_iter, fields)
return res
@ -67,57 +78,59 @@ class ResourceHandler(IndexHandler):
super(ResourceHandler, self).__init__(index_source)
self.resource_loaders = resource_loaders
def get_supported_modes(self):
res = super(ResourceHandler, self).get_supported_modes()
res['modes'].append('resource')
return res
def __call__(self, params):
if params.get('mode', 'resource') != 'resource':
return super(ResourceHandler, self).__call__(params)
input_req = params.get('_input_req')
if input_req:
params['alt_url'] = input_req.include_post_query(params.get('url'))
cdx_iter = self.index_source(params)
any_found = False
cdx_iter = self._load_index_source(params)
last_exc = None
for cdx in cdx_iter:
any_found = True
for loader in self.resource_loaders:
try:
resp = loader(cdx, params)
if resp:
if resp is not None:
return resp
except ArchiveLoadFailed as e:
print(e)
pass
except WbException as e:
last_exc = e
if any_found:
raise ArchiveLoadFailed('Resource Found, could not be Loaded')
if last_exc:
raise last_exc
#raise ArchiveLoadFailed('Resource Found, could not be Loaded')
else:
raise ArchiveLoadFailed('No Resource Found')
raise NotFoundException('No Resource Found')
#=============================================================================
class DefaultResourceHandler(ResourceHandler):
def __init__(self, index_source, warc_paths=''):
loaders = [WARCPathHandler(warc_paths, index_source),
LiveWebHandler()
loaders = [WARCPathLoader(warc_paths, index_source),
LiveWebLoader()
]
super(DefaultResourceHandler, self).__init__(index_source, loaders)
#=============================================================================
class HandlerSeq(object):
def __init__(self, loaders):
self.loaders = loaders
def __init__(self, handlers):
self.handlers = handlers
def __call__(self, params):
for loader in self.loaders:
last_exc = None
for handler in self.handlers:
try:
res = loader(params)
if res:
res = handler(params)
if res is not None:
return res
except ArchiveLoadFailed:
pass
except WbException as e:
last_exc = e
raise ArchiveLoadFailed('No Resource Found')
if last_exc:
raise last_exc
else:
raise NotFoundException('No Resource Found')
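
The HandlerSeq change is subtle enough to warrant a sketch: a WbException from one handler no longer aborts the sequence outright; it is remembered and re-raised only if no later handler produces a result. A self-contained reading, with stand-ins for pywb's exception classes:

class WbException(Exception):
    pass

class NotFoundException(WbException):
    pass

def run_seq(handlers, params):
    last_exc = None
    for handler in handlers:
        try:
            res = handler(params)
            if res is not None:
                return res
        except WbException as e:
            last_exc = e

    if last_exc:
        raise last_exc
    raise NotFoundException('No Resource Found')

# a failing handler followed by a succeeding one: the sequence still succeeds
def fail(params):
    raise WbException('first source failed')

assert run_seq([fail, lambda params: 'resource'], {}) == 'resource'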

View File

@ -14,6 +14,9 @@ from rezag.liverec import patched_requests as requests
from rezag.utils import MementoUtils
WAYBACK_ORIG_SUFFIX = '{timestamp}id_/{url}'
#=============================================================================
class BaseIndexSource(object):
def load_index(self, params): #pragma: no cover
@ -22,10 +25,10 @@ class BaseIndexSource(object):
@staticmethod
def res_template(template, params):
src_params = params.get('_src_params')
if src_params:
res = template.format(**src_params)
if not src_params:
res = template.format(url=params['url'])
else:
res = template
res = template.format(url=params['url'], **src_params)
return res
@ -59,7 +62,7 @@ class RemoteIndexSource(BaseIndexSource):
def load_index(self, params):
api_url = self.res_template(self.api_url_template, params)
api_url += '?url=' + params['url']
print('API URL', api_url)
r = requests.get(api_url, timeout=params.get('_timeout'))
if r.status_code >= 400:
raise NotFoundException(api_url)
@ -169,7 +172,6 @@ class MementoIndexSource(BaseIndexSource):
def get_timegate_links(self, params, closest):
url = self.res_template(self.timegate_url, params)
url += params['url']
accept_dt = timestamp_to_http_date(closest)
res = requests.head(url, headers={'Accept-Datetime': accept_dt})
if res.status_code >= 400:
@ -179,7 +181,6 @@ class MementoIndexSource(BaseIndexSource):
def get_timemap_links(self, params):
url = self.res_template(self.timemap_url, params)
url += params['url']
res = requests.get(url, timeout=params.get('_timeout'))
if res.status_code >= 400:
raise NotFoundException(url)
@ -200,9 +201,9 @@ class MementoIndexSource(BaseIndexSource):
@staticmethod
def from_timegate_url(timegate_url, path='link'):
return MementoIndexSource(timegate_url,
timegate_url + 'timemap/' + path + '/',
timegate_url + '{timestamp}id_/{url}')
return MementoIndexSource(timegate_url + '{url}',
timegate_url + 'timemap/' + path + '/{url}',
timegate_url + WAYBACK_ORIG_SUFFIX)
def __str__(self):
return 'memento'
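
The res_template change means the {url} placeholder is now substituted for every source, with any per-source params merged in. A minimal sketch against hypothetical endpoints:

def res_template(template, params):
    src_params = params.get('_src_params')
    if not src_params:
        return template.format(url=params['url'])
    return template.format(url=params['url'], **src_params)

# no per-source params: only {url} is filled in
assert res_template('http://archive.example/cdx?url={url}',
                    {'url': 'http://example.com/'}) == \
       'http://archive.example/cdx?url=http://example.com/'

# per-source params (e.g. a collection name) are merged into the template
assert res_template('http://archive.example/{coll}/cdx?url={url}',
                    {'url': 'http://example.com/', '_src_params': {'coll': 'A'}}) == \
       'http://archive.example/A/cdx?url=http://example.com/'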

View File

@ -1,4 +1,3 @@
from pywb.utils.loaders import extract_client_cookie
from pywb.utils.loaders import extract_post_query, append_post_query
from pywb.utils.loaders import LimitReader
from pywb.utils.statusandheaders import StatusAndHeadersParser
@ -9,7 +8,7 @@ from io import BytesIO
#=============================================================================
class WSGIInputRequest(object):
class DirectWSGIInputRequest(object):
def __init__(self, env):
self.env = env
@ -20,26 +19,10 @@ class WSGIInputRequest(object):
headers = {}
for name, value in iteritems(self.env):
# will be set by requests to match actual host
if name == 'HTTP_HOST':
#name = 'Host'
#value = splits.netloc
# will be set automatically
continue
#elif name == 'HTTP_ORIGIN':
# name = 'Origin'
# value = (splits.scheme + '://' + splits.netloc)
elif name == 'HTTP_X_CSRFTOKEN':
name = 'X-CSRFToken'
cookie_val = extract_client_cookie(env, 'csrftoken')
if cookie_val:
value = cookie_val
#elif name == 'HTTP_X_FORWARDED_PROTO':
# name = 'X-Forwarded-Proto'
# value = splits.scheme
elif name.startswith('HTTP_'):
name = name[5:].title().replace('_', '-')
@ -55,10 +38,7 @@ class WSGIInputRequest(object):
return headers
def get_req_body(self):
input_ = self.env.get('wsgi.input')
if not input_:
return None
input_ = self.env['wsgi.input']
len_ = self._get_content_length()
enc = self._get_header('Transfer-Encoding')
@ -70,9 +50,6 @@ class WSGIInputRequest(object):
data = None
return data
#buf = data.read().decode('utf-8')
#print(buf)
#return StringIO(buf)
def _get_content_type(self):
return self.env.get('CONTENT_TYPE')
@ -105,7 +82,7 @@ class WSGIInputRequest(object):
#=============================================================================
class POSTInputRequest(WSGIInputRequest):
class POSTInputRequest(DirectWSGIInputRequest):
def __init__(self, env):
self.env = env

View File

@ -2,6 +2,7 @@ from rezag.liverec import BaseRecorder
from rezag.liverec import request as remote_request
from pywb.utils.timeutils import timestamp_to_datetime, datetime_to_http_date
from pywb.utils.wbexception import LiveResourceException
from pywb.warc.resolvingloader import ResolvingLoader
from io import BytesIO
@ -29,7 +30,7 @@ def incr_reader(stream, header=None, size=8192):
#=============================================================================
class WARCPathHandler(object):
class WARCPathLoader(object):
def __init__(self, paths, cdx_source):
self.paths = paths
if isinstance(paths, str):
@ -108,7 +109,7 @@ class HeaderRecorder(BaseRecorder):
#=============================================================================
class LiveWebHandler(object):
class LiveWebLoader(object):
SKIP_HEADERS = (b'link',
b'memento-datetime',
b'content-location',
@ -140,14 +141,17 @@ class LiveWebHandler(object):
method = input_req.get_req_method()
data = input_req.get_req_body()
upstream_res = remote_request(url=load_url,
method=method,
recorder=recorder,
stream=True,
allow_redirects=False,
headers=req_headers,
data=data,
timeout=params.get('_timeout'))
try:
upstream_res = remote_request(url=load_url,
method=method,
recorder=recorder,
stream=True,
allow_redirects=False,
headers=req_headers,
data=data,
timeout=params.get('_timeout'))
except Exception:
raise LiveResourceException(load_url)
resp_headers = recorder.get_header()
@ -175,7 +179,7 @@ class LiveWebHandler(object):
return dt.strftime('%Y-%m-%dT%H:%M:%SZ')
@staticmethod
def _make_warc_id(id_=None):
def _make_warc_id(id_=None): #pragma: no cover
if not id_:
id_ = uuid.uuid1()
return '<urn:uuid:{0}>'.format(id_)
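
The new try/except around the upstream request collapses any transport failure into a single domain exception carrying the URL, which the JSON error handler then renders (the tests below expect a 400 with the failing URL as the message). The pattern, with fetch standing in for rezag.liverec's patched request:

class LiveResourceException(Exception):
    """Stand-in for pywb.utils.wbexception.LiveResourceException."""

def load_upstream(fetch, load_url, **kwargs):
    try:
        return fetch(url=load_url, **kwargs)
    except Exception:
        # any connection, timeout, or protocol error becomes one typed failure
        raise LiveResourceException(load_url)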

View File

@ -77,6 +77,7 @@ class MementoUtils(object):
from_date = timestamp_to_http_date(first_cdx['timestamp'])
except StopIteration:
first_cdx = None
return
# first memento link
yield MementoUtils.make_timemap_memento_link(first_cdx, datetime=from_date)
@ -91,4 +92,4 @@ class MementoUtils(object):
# last memento link, if any
if prev_cdx:
yield MementoUtils.make_timemap_memento_link(prev_cdx, end='')
yield MementoUtils.make_timemap_memento_link(prev_cdx, end='\n')

View File

@ -32,8 +32,11 @@ setup(
'rezag',
],
install_requires=[
'pywb',
'pywb==1.0b',
],
dependency_links=[
'git+https://github.com/ikreymer/pywb.git@py3#egg=pywb-1.0b-py3',
],
zip_safe=True,
entry_points="""
[console_scripts]

View File

@ -33,6 +33,9 @@ def setup_module():
shutil.copy(to_path('testdata/iana.cdxj'), coll_B)
shutil.copy(to_path('testdata/dupes.cdxj'), coll_C)
with open(to_path(root_dir) + 'somefile', 'w') as fh:
fh.write('foo')
global dir_loader
dir_loader = DirectoryIndexAggregator(dir_prefix, dir_path)
@ -121,7 +124,7 @@ def test_agg_dir_and_memento():
'local': dir_loader}
agg_source = SimpleAggregator(sources)
res = agg_source({'url': 'example.com/', 'param.coll': '*', 'closest': '20100512', 'limit': 6})
res = agg_source({'url': 'example.com/', 'param.local.coll': '*', 'closest': '20100512', 'limit': 6})
exp = [
{'source': 'ia', 'timestamp': '20100513052358', 'load_url': 'http://web.archive.org/web/20100513052358id_/http://example.com/'},
@ -144,7 +147,7 @@ def test_agg_no_dir_1():
def test_agg_no_dir_2():
loader = DirectoryIndexAggregator(root_dir, 'no_such')
loader = DirectoryIndexAggregator(root_dir, '')
res = loader({'url': 'example.com/', 'param.coll': 'X'})
exp = []
@ -152,4 +155,31 @@ def test_agg_no_dir_2():
assert(to_json_list(res) == exp)
def test_agg_dir_sources_1():
res = dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'})
exp = {'sources': {'colls/A/indexes': 'file',
'colls/B/indexes': 'file',
'colls/C/indexes': 'file'}
}
assert(res == exp)
def test_agg_dir_sources_2():
res = dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '[A,C]'})
exp = {'sources': {'colls/A/indexes': 'file',
'colls/C/indexes': 'file'}
}
assert(res == exp)
def test_agg_dir_sources_single_dir():
loader = DirectoryIndexAggregator('testdata/', '')
res = loader.get_source_list({'url': 'example.com/'})
exp = {'sources': {}}
assert(res == exp)

View File

@ -42,13 +42,17 @@ def setup_module(self):
source3 = SimpleAggregator({'example': FileIndexSource(to_path('testdata/example.cdxj'))})
handler3 = DefaultResourceHandler(source3, to_path('testdata/'))
add_route('/fallback', HandlerSeq([handler3,
handler2,
live_handler]))
add_route('/seq', HandlerSeq([handler3,
handler2]))
bottle.debug = True
add_route('/empty', HandlerSeq([]))
add_route('/invalid', HandlerSeq(['foo']))
application.debug = True
global testapp
testapp = webtest.TestApp(application)
@ -61,8 +65,23 @@ class TestResAgg(object):
def setup(self):
self.testapp = testapp
def test_list_handlers(self):
resp = self.testapp.get('/many?mode=list_modes')
assert resp.json == {'modes': ['list_modes', 'list_sources', 'index', 'resource']}
resp = self.testapp.get('/many?mode=other')
assert resp.json == {'modes': ['list_modes', 'list_sources', 'index', 'resource']}
# defaults to resource, must specify url
resp = self.testapp.get('/many', status=400)
assert resp.json == {'message': 'The "url" param is required'}
def test_list_sources(self):
resp = self.testapp.get('/many?mode=list_sources')
assert resp.json == {'sources': {'local': 'file_dir', 'ia': 'memento', 'rhiz': 'memento', 'live': 'live'}}
def test_live_index(self):
resp = self.testapp.get('/live?url=http://httpbin.org/get&mode=index&output=json')
resp = self.testapp.get('/live?mode=index&url=http://httpbin.org/get&output=json')
resp.charset = 'utf-8'
res = to_json_list(resp.text)
@ -71,7 +90,8 @@ class TestResAgg(object):
'load_url': 'http://httpbin.org/get', 'source': 'live', 'timestamp': '2016'}])
def test_live_resource(self):
resp = self.testapp.get('/live?url=http://httpbin.org/get?foo=bar&mode=resource')
headers = {'foo': 'bar'}
resp = self.testapp.get('/live?url=http://httpbin.org/get?foo=bar', headers=headers)
assert resp.headers['WARC-Coll'] == 'live'
assert resp.headers['WARC-Target-URI'] == 'http://httpbin.org/get?foo=bar'
@ -82,7 +102,7 @@ class TestResAgg(object):
def test_live_post_resource(self):
resp = self.testapp.post('/live?url=http://httpbin.org/post&mode=resource',
resp = self.testapp.post('/live?url=http://httpbin.org/post',
OrderedDict([('foo', 'bar')]))
assert resp.headers['WARC-Coll'] == 'live'
@ -204,6 +224,11 @@ foo=bar&test=abc"""
assert resp.headers['WARC-Target-URI'] == 'http://example.com/'
assert b'HTTP/1.1 200 OK' in resp.body
def test_error_fallback_live_not_found(self):
resp = self.testapp.get('/fallback?url=http://invalid.url-not-found', status=400)
assert resp.json == {'message': 'http://invalid.url-not-found'}
def test_agg_local_revisit(self):
resp = self.testapp.get('/many?url=http://www.example.com/&closest=20140127171251&sources=local')
@ -214,3 +239,24 @@ foo=bar&test=abc"""
assert resp.headers['WARC-Refers-To-Date'] == '2014-01-27T17:12:00Z'
assert b'HTTP/1.1 200 OK' in resp.body
assert b'<!doctype html>' in resp.body
def test_error_invalid_index_output(self):
resp = self.testapp.get('/live?mode=index&url=http://httpbin.org/get&output=foobar', status=400)
assert resp.json == {'message': 'output=foobar not supported'}
def test_error_local_not_found(self):
resp = self.testapp.get('/many?url=http://not-found.error/&sources=local', status=404)
assert resp.json == {'message': 'No Resource Found'}
def test_error_empty(self):
resp = self.testapp.get('/empty?url=http://example.com/', status=404)
assert resp.json == {'message': 'No Resource Found'}
def test_error_invalid(self):
resp = self.testapp.get('/invalid?url=http://example.com/', status=500)
assert resp.json['message'].startswith('Internal Error')

View File

@ -32,16 +32,20 @@ local_sources = [
remote_sources = [
RemoteIndexSource('http://webenact.rhizome.org/all-cdx',
RemoteIndexSource('http://webenact.rhizome.org/all-cdx?url={url}',
'http://webenact.rhizome.org/all/{timestamp}id_/{url}'),
MementoIndexSource('http://webenact.rhizome.org/all/',
'http://webenact.rhizome.org/all/timemap/*/',
MementoIndexSource('http://webenact.rhizome.org/all/{url}',
'http://webenact.rhizome.org/all/timemap/*/{url}',
'http://webenact.rhizome.org/all/{timestamp}id_/{url}')
]
ait_source = RemoteIndexSource('http://wayback.archive-it.org/cdx?url={url}',
'http://wayback.archive-it.org/all/{timestamp}id_/{url}')
def query_single_source(source, params):
string = str(source)
return SimpleAggregator({'source': source})(params)
@ -182,4 +186,22 @@ def test_file_not_found():
def test_ait_filters():
ait_source = RemoteIndexSource('http://wayback.archive-it.org/cdx/search/cdx?url={url}&filter=filename:ARCHIVEIT-({colls})-.*',
'http://wayback.archive-it.org/all/{timestamp}id_/{url}')
cdxlist = query_single_source(ait_source, {'url': 'http://iana.org/', 'param.source.colls': '5610|933'})
filenames = [cdx['filename'] for cdx in cdxlist]
prefix = ('ARCHIVEIT-5610-', 'ARCHIVEIT-933-')
assert(all([x.startswith(prefix) for x in filenames]))
cdxlist = query_single_source(ait_source, {'url': 'http://iana.org/', 'param.source.colls': '1883|366|905'})
filenames = [cdx['filename'] for cdx in cdxlist]
prefix = ('ARCHIVEIT-1883-', 'ARCHIVEIT-366-', 'ARCHIVEIT-905-')
assert(all([x.startswith(prefix) for x in filenames]))

View File

@ -27,10 +27,11 @@ aggs = {'simple': SimpleAggregator(sources),
'processes': ThreadedTimeoutAggregator(sources, timeout=5.0, use_processes=True),
}
#@pytest.mark.parametrize("agg", aggs, ids=["simple", "gevent_timeout"])
def pytest_generate_tests(metafunc):
metafunc.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
#def pytest_generate_tests(metafunc):
# metafunc.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
def test_mem_agg_index_1(agg):
url = 'http://iana.org/'
res = agg(dict(url=url, closest='20140126000000', limit=5))
@ -46,6 +47,7 @@ def test_mem_agg_index_1(agg):
assert(json_list(res) == exp)
@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
def test_mem_agg_index_2(agg):
url = 'http://example.com/'
res = agg(dict(url=url, closest='20100512', limit=6))
@ -60,6 +62,7 @@ def test_mem_agg_index_2(agg):
assert(json_list(res) == exp)
@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
def test_mem_agg_index_3(agg):
url = 'http://vvork.com/'
res = agg(dict(url=url, closest='20141001', limit=5))
@ -73,6 +76,7 @@ def test_mem_agg_index_3(agg):
assert(json_list(res) == exp)
@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
def test_mem_agg_index_4(agg):
url = 'http://vvork.com/'
res = agg(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait'))
@ -83,10 +87,11 @@ def test_mem_agg_index_4(agg):
assert(json_list(res) == exp)
def test_handler_output_cdxj(agg):
loader = IndexHandler(agg)
def test_handler_output_cdxj():
agg = GeventTimeoutAggregator(sources, timeout=5.0)
handler = IndexHandler(agg)
url = 'http://vvork.com/'
res = loader(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait'))
res = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait'))
exp = """\
com,vvork)/ 20141006184357 {"url": "http://www.vvork.com/", "mem_rel": "memento", "memento_url": "http://webenact.rhizome.org/vvork/20141006184357/http://www.vvork.com/", "load_url": "http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"}
@ -96,10 +101,11 @@ com,vvork)/ 20131004231540 {"url": "http://vvork.com/", "mem_rel": "last memento
assert(''.join(res) == exp)
def test_handler_output_json(agg):
loader = IndexHandler(agg)
def test_handler_output_json():
agg = GeventTimeoutAggregator(sources, timeout=5.0)
handler = IndexHandler(agg)
url = 'http://vvork.com/'
res = loader(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='json'))
res = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='json'))
exp = """\
{"urlkey": "com,vvork)/", "timestamp": "20141006184357", "url": "http://www.vvork.com/", "mem_rel": "memento", "memento_url": "http://webenact.rhizome.org/vvork/20141006184357/http://www.vvork.com/", "load_url": "http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"}
@ -109,22 +115,50 @@ def test_handler_output_json(agg):
assert(''.join(res) == exp)
def test_handler_output_link(agg):
loader = IndexHandler(agg)
def test_handler_output_link():
agg = GeventTimeoutAggregator(sources, timeout=5.0)
handler = IndexHandler(agg)
url = 'http://vvork.com/'
res = loader(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='link'))
res = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='link'))
exp = """\
<http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/>; rel="memento"; datetime="Mon, 06 Oct 2014 18:43:57 GMT"; src="rhiz",
<http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/>; rel="memento"; datetime="Fri, 04 Oct 2013 23:15:40 GMT"; src="ait"\
<http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/>; rel="memento"; datetime="Fri, 04 Oct 2013 23:15:40 GMT"; src="ait"
"""
assert(''.join(res) == exp)
def test_handler_output_text(agg):
loader = IndexHandler(agg)
def test_handler_output_link_2():
agg = GeventTimeoutAggregator(sources, timeout=5.0)
handler = IndexHandler(agg)
url = 'http://iana.org/'
res = handler(dict(url=url, closest='20140126000000', limit=5, output='link'))
exp = """\
<http://web.archive.org/web/20140126093743id_/http://iana.org/>; rel="memento"; datetime="Sun, 26 Jan 2014 09:37:43 GMT"; src="ia",
<filename://iana.warc.gz>; rel="memento"; datetime="Sun, 26 Jan 2014 20:06:24 GMT"; src="local",
<http://web.archive.org/web/20140123034755id_/http://iana.org/>; rel="memento"; datetime="Thu, 23 Jan 2014 03:47:55 GMT"; src="ia",
<http://web.archive.org/web/20140129175203id_/http://iana.org/>; rel="memento"; datetime="Wed, 29 Jan 2014 17:52:03 GMT"; src="ia",
<http://wayback.archive-it.org/all/20140107040552id_/http://iana.org/>; rel="memento"; datetime="Tue, 07 Jan 2014 04:05:52 GMT"; src="ait"
"""
assert(''.join(res) == exp)
def test_handler_output_link_3():
agg = GeventTimeoutAggregator(sources, timeout=5.0)
handler = IndexHandler(agg)
url = 'http://foo.bar.non-existent'
res = handler(dict(url=url, closest='20140126000000', limit=5, output='link'))
exp = ''
assert(''.join(res) == exp)
def test_handler_output_text():
agg = GeventTimeoutAggregator(sources, timeout=5.0)
handler = IndexHandler(agg)
url = 'http://vvork.com/'
res = loader(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='text'))
res = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='text'))
exp = """\
com,vvork)/ 20141006184357 http://www.vvork.com/ memento http://webenact.rhizome.org/vvork/20141006184357/http://www.vvork.com/ http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/ rhiz
@ -133,9 +167,10 @@ com,vvork)/ 20131004231540 http://vvork.com/ last memento http://wayback.archive
assert(''.join(res) == exp)
def test_handler_list_sources(agg):
loader = IndexHandler(agg)
res = loader(dict(mode='sources'))
def test_handler_list_sources():
agg = GeventTimeoutAggregator(sources, timeout=5.0)
handler = IndexHandler(agg)
res = handler(dict(mode='list_sources'))
assert(res == {'sources': {'bl': 'memento',
'ait': 'memento',
@ -143,4 +178,3 @@ def test_handler_list_sources(agg):
'rhiz': 'memento',
'local': 'file'}})