mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 08:04:49 +01:00
separate iter_sources from list_sources api
all errors returned as json block with error msg; tests for not found, invalid errors
This commit is contained in:
parent
68090d00c1
commit
008e5284b1
@ -63,7 +63,6 @@ class BaseAggregator(object):
|
||||
try:
|
||||
_src_params = all_params['_all_src_params'].get(name)
|
||||
all_params['_src_params'] = _src_params
|
||||
|
||||
cdx_iter = source.load_index(all_params)
|
||||
except NotFoundException as nf:
|
||||
print('Not found in ' + name)
|
||||
@ -89,15 +88,21 @@ class BaseAggregator(object):
|
||||
|
||||
return cdx_iter
|
||||
|
||||
def _on_source_error(self, name):
|
||||
def _on_source_error(self, name): #pragma: no cover
|
||||
pass
|
||||
|
||||
def _load_all(self, params): #pragma: no cover
|
||||
raise NotImplemented()
|
||||
|
||||
def get_sources(self, params): #pragma: no cover
|
||||
def _iter_sources(self, params): #pragma: no cover
|
||||
raise NotImplemented()
|
||||
|
||||
def get_source_list(self, params):
|
||||
srcs = self._iter_sources(params)
|
||||
result = [(name, str(value)) for name, value in srcs]
|
||||
result = {'sources': dict(result)}
|
||||
return result
|
||||
|
||||
|
||||
#=============================================================================
|
||||
class BaseSourceListAggregator(BaseAggregator):
|
||||
@ -107,7 +112,7 @@ class BaseSourceListAggregator(BaseAggregator):
|
||||
def get_all_sources(self, params):
|
||||
return self.sources
|
||||
|
||||
def get_sources(self, params):
|
||||
def _iter_sources(self, params):
|
||||
sources = self.get_all_sources(params)
|
||||
srcs_list = params.get('sources')
|
||||
if not srcs_list:
|
||||
@ -125,7 +130,7 @@ class SeqAggMixin(object):
|
||||
|
||||
|
||||
def _load_all(self, params):
|
||||
sources = list(self.get_sources(params))
|
||||
sources = list(self._iter_sources(params))
|
||||
return list([self.load_child_source(name, source, params)
|
||||
for name, source in sources])
|
||||
|
||||
@ -160,8 +165,8 @@ class TimeoutMixin(object):
|
||||
|
||||
return False
|
||||
|
||||
def get_sources(self, params):
|
||||
sources = super(TimeoutMixin, self).get_sources(params)
|
||||
def _iter_sources(self, params):
|
||||
sources = super(TimeoutMixin, self)._iter_sources(params)
|
||||
for name, source in sources:
|
||||
if not self.is_timed_out(name):
|
||||
yield name, source
|
||||
@ -185,7 +190,7 @@ class GeventMixin(object):
|
||||
def _load_all(self, params):
|
||||
params['_timeout'] = self.timeout
|
||||
|
||||
sources = list(self.get_sources(params))
|
||||
sources = list(self._iter_sources(params))
|
||||
|
||||
def do_spawn(name, source):
|
||||
return self.pool.spawn(self.load_child_source, name, source, params)
|
||||
@ -223,7 +228,7 @@ class ConcurrentMixin(object):
|
||||
def _load_all(self, params):
|
||||
params['_timeout'] = self.timeout
|
||||
|
||||
sources = list(self.get_sources(params))
|
||||
sources = list(self._iter_sources(params))
|
||||
|
||||
with self.pool_class(max_workers=self.size) as executor:
|
||||
def do_spawn(name, source):
|
||||
@ -257,7 +262,8 @@ class BaseDirectoryIndexAggregator(BaseAggregator):
|
||||
self.base_prefix = base_prefix
|
||||
self.base_dir = base_dir
|
||||
|
||||
def get_sources(self, params):
|
||||
def _iter_sources(self, params):
|
||||
self._set_src_params(params)
|
||||
# see if specific params (when part of another agg)
|
||||
src_params = params.get('_src_params')
|
||||
if not src_params:
|
||||
@ -270,7 +276,6 @@ class BaseDirectoryIndexAggregator(BaseAggregator):
|
||||
the_dir = self.base_dir
|
||||
|
||||
the_dir = os.path.join(self.base_prefix, the_dir)
|
||||
|
||||
try:
|
||||
sources = list(self._load_files(the_dir))
|
||||
except Exception:
|
||||
@ -290,6 +295,10 @@ class BaseDirectoryIndexAggregator(BaseAggregator):
|
||||
rel_path = ''
|
||||
yield rel_path, FileIndexSource(filename)
|
||||
|
||||
def __str__(self):
|
||||
return 'file_dir'
|
||||
|
||||
|
||||
class DirectoryIndexAggregator(SeqAggMixin, BaseDirectoryIndexAggregator):
|
||||
pass
|
||||
|
||||
|
53
rezag/app.py
53
rezag/app.py
@ -1,31 +1,50 @@
|
||||
from rezag.inputrequest import WSGIInputRequest, POSTInputRequest
|
||||
from bottle import route, request, response, default_app
|
||||
from rezag.inputrequest import DirectWSGIInputRequest, POSTInputRequest
|
||||
from bottle import route, request, response, default_app, abort
|
||||
|
||||
from pywb.utils.wbexception import WbException
|
||||
|
||||
import traceback
|
||||
import json
|
||||
|
||||
def err_handler(exc):
|
||||
response.status = exc.status_code
|
||||
response.content_type = 'application/json'
|
||||
return json.dumps({'message': exc.body})
|
||||
|
||||
def wrap_error(func):
|
||||
def do_d(*args, **kwargs):
|
||||
try:
|
||||
return func(*args, **kwargs)
|
||||
except WbException as exc:
|
||||
if application.debug:
|
||||
traceback.print_exc()
|
||||
abort(exc.status(), exc.msg)
|
||||
except Exception as e:
|
||||
if application.debug:
|
||||
traceback.print_exc()
|
||||
abort(500, 'Internal Error: ' + str(e))
|
||||
|
||||
return do_d
|
||||
|
||||
|
||||
def add_route(path, handler):
|
||||
def debug(func):
|
||||
def do_d():
|
||||
try:
|
||||
return func()
|
||||
except Exception:
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
|
||||
return do_d
|
||||
|
||||
def direct_input_request():
|
||||
@wrap_error
|
||||
def direct_input_request(mode=''):
|
||||
params = dict(request.query)
|
||||
params['_input_req'] = WSGIInputRequest(request.environ)
|
||||
params['_input_req'] = DirectWSGIInputRequest(request.environ)
|
||||
return handler(params)
|
||||
|
||||
def post_fullrequest():
|
||||
@wrap_error
|
||||
def post_fullrequest(mode=''):
|
||||
params = dict(request.query)
|
||||
params['_input_req'] = POSTInputRequest(request.environ)
|
||||
return handler(params)
|
||||
|
||||
route(path + '/postreq', method=['POST'], callback=debug(post_fullrequest))
|
||||
route(path, method=['ANY'], callback=debug(direct_input_request))
|
||||
route(path + '/postreq', method=['POST'], callback=post_fullrequest)
|
||||
route(path, method=['ANY'], callback=direct_input_request)
|
||||
|
||||
|
||||
application = default_app()
|
||||
application.default_error_handler = err_handler
|
||||
|
||||
|
||||
|
@ -1,12 +1,13 @@
|
||||
from rezag.responseloader import WARCPathHandler, LiveWebHandler
|
||||
from rezag.responseloader import WARCPathLoader, LiveWebLoader
|
||||
from rezag.utils import MementoUtils
|
||||
from pywb.warc.recordloader import ArchiveLoadFailed
|
||||
from pywb.utils.wbexception import BadRequestException, WbException
|
||||
from pywb.utils.wbexception import NotFoundException
|
||||
from bottle import response
|
||||
|
||||
|
||||
#=============================================================================
|
||||
def to_cdxj(cdx_iter, fields):
|
||||
response.headers['Content-Type'] = 'text/x-cdxj'
|
||||
response.headers['Content-Type'] = 'application/x-cdxj'
|
||||
return [cdx.to_cdxj(fields) for cdx in cdx_iter]
|
||||
|
||||
def to_json(cdx_iter, fields):
|
||||
@ -37,26 +38,36 @@ class IndexHandler(object):
|
||||
self.index_source = index_source
|
||||
self.opts = opts or {}
|
||||
|
||||
def __call__(self, params):
|
||||
if params.get('mode') == 'sources':
|
||||
srcs = self.index_source.get_sources(params)
|
||||
result = [(name, str(value)) for name, value in srcs]
|
||||
result = {'sources': dict(result)}
|
||||
return result
|
||||
def get_supported_modes(self):
|
||||
return dict(modes=['list_modes', 'list_sources', 'index'])
|
||||
|
||||
def _load_index_source(self, params):
|
||||
url = params.get('url')
|
||||
if not url:
|
||||
raise BadRequestException('The "url" param is required')
|
||||
|
||||
input_req = params.get('_input_req')
|
||||
if input_req:
|
||||
params['alt_url'] = input_req.include_post_query(params.get('url'))
|
||||
params['alt_url'] = input_req.include_post_query(url)
|
||||
|
||||
cdx_iter = self.index_source(params)
|
||||
return self.index_source(params)
|
||||
|
||||
def __call__(self, params):
|
||||
mode = params.get('mode', 'index')
|
||||
if mode == 'list_sources':
|
||||
return self.index_source.get_source_list(params)
|
||||
|
||||
if mode == 'list_modes' or mode != 'index':
|
||||
return self.get_supported_modes()
|
||||
|
||||
output = params.get('output', self.DEF_OUTPUT)
|
||||
fields = params.get('fields')
|
||||
|
||||
handler = self.OUTPUTS.get(output)
|
||||
if not handler:
|
||||
handler = self.OUTPUTS[self.DEF_OUTPUT]
|
||||
raise BadRequestException('output={0} not supported'.format(output))
|
||||
|
||||
cdx_iter = self._load_index_source(params)
|
||||
res = handler(cdx_iter, fields)
|
||||
return res
|
||||
|
||||
@ -67,57 +78,59 @@ class ResourceHandler(IndexHandler):
|
||||
super(ResourceHandler, self).__init__(index_source)
|
||||
self.resource_loaders = resource_loaders
|
||||
|
||||
def get_supported_modes(self):
|
||||
res = super(ResourceHandler, self).get_supported_modes()
|
||||
res['modes'].append('resource')
|
||||
return res
|
||||
|
||||
def __call__(self, params):
|
||||
if params.get('mode', 'resource') != 'resource':
|
||||
return super(ResourceHandler, self).__call__(params)
|
||||
|
||||
input_req = params.get('_input_req')
|
||||
if input_req:
|
||||
params['alt_url'] = input_req.include_post_query(params.get('url'))
|
||||
|
||||
cdx_iter = self.index_source(params)
|
||||
|
||||
any_found = False
|
||||
cdx_iter = self._load_index_source(params)
|
||||
last_exc = None
|
||||
|
||||
for cdx in cdx_iter:
|
||||
any_found = True
|
||||
|
||||
for loader in self.resource_loaders:
|
||||
try:
|
||||
resp = loader(cdx, params)
|
||||
if resp:
|
||||
if resp is not None:
|
||||
return resp
|
||||
except ArchiveLoadFailed as e:
|
||||
print(e)
|
||||
pass
|
||||
except WbException as e:
|
||||
last_exc = e
|
||||
|
||||
if any_found:
|
||||
raise ArchiveLoadFailed('Resource Found, could not be Loaded')
|
||||
if last_exc:
|
||||
raise last_exc
|
||||
#raise ArchiveLoadFailed('Resource Found, could not be Loaded')
|
||||
else:
|
||||
raise ArchiveLoadFailed('No Resource Found')
|
||||
raise NotFoundException('No Resource Found')
|
||||
|
||||
|
||||
#=============================================================================
|
||||
class DefaultResourceHandler(ResourceHandler):
|
||||
def __init__(self, index_source, warc_paths=''):
|
||||
loaders = [WARCPathHandler(warc_paths, index_source),
|
||||
LiveWebHandler()
|
||||
loaders = [WARCPathLoader(warc_paths, index_source),
|
||||
LiveWebLoader()
|
||||
]
|
||||
super(DefaultResourceHandler, self).__init__(index_source, loaders)
|
||||
|
||||
|
||||
#=============================================================================
|
||||
class HandlerSeq(object):
|
||||
def __init__(self, loaders):
|
||||
self.loaders = loaders
|
||||
def __init__(self, handlers):
|
||||
self.handlers = handlers
|
||||
|
||||
def __call__(self, params):
|
||||
for loader in self.loaders:
|
||||
last_exc = None
|
||||
for handler in self.handlers:
|
||||
try:
|
||||
res = loader(params)
|
||||
if res:
|
||||
res = handler(params)
|
||||
if res is not None:
|
||||
return res
|
||||
except ArchiveLoadFailed:
|
||||
pass
|
||||
except WbException as e:
|
||||
last_exc = e
|
||||
|
||||
raise ArchiveLoadFailed('No Resource Found')
|
||||
if last_exc:
|
||||
raise last_exc
|
||||
else:
|
||||
raise NotFoundException('No Resource Found')
|
||||
|
@ -14,6 +14,9 @@ from rezag.liverec import patched_requests as requests
|
||||
from rezag.utils import MementoUtils
|
||||
|
||||
|
||||
WAYBACK_ORIG_SUFFIX = '{timestamp}id_/{url}'
|
||||
|
||||
|
||||
#=============================================================================
|
||||
class BaseIndexSource(object):
|
||||
def load_index(self, params): #pragma: no cover
|
||||
@ -22,10 +25,10 @@ class BaseIndexSource(object):
|
||||
@staticmethod
|
||||
def res_template(template, params):
|
||||
src_params = params.get('_src_params')
|
||||
if src_params:
|
||||
res = template.format(**src_params)
|
||||
if not src_params:
|
||||
res = template.format(url=params['url'])
|
||||
else:
|
||||
res = template
|
||||
res = template.format(url=params['url'], **src_params)
|
||||
return res
|
||||
|
||||
|
||||
@ -59,7 +62,7 @@ class RemoteIndexSource(BaseIndexSource):
|
||||
|
||||
def load_index(self, params):
|
||||
api_url = self.res_template(self.api_url_template, params)
|
||||
api_url += '?url=' + params['url']
|
||||
print('API URL', api_url)
|
||||
r = requests.get(api_url, timeout=params.get('_timeout'))
|
||||
if r.status_code >= 400:
|
||||
raise NotFoundException(api_url)
|
||||
@ -169,7 +172,6 @@ class MementoIndexSource(BaseIndexSource):
|
||||
|
||||
def get_timegate_links(self, params, closest):
|
||||
url = self.res_template(self.timegate_url, params)
|
||||
url += params['url']
|
||||
accept_dt = timestamp_to_http_date(closest)
|
||||
res = requests.head(url, headers={'Accept-Datetime': accept_dt})
|
||||
if res.status_code >= 400:
|
||||
@ -179,7 +181,6 @@ class MementoIndexSource(BaseIndexSource):
|
||||
|
||||
def get_timemap_links(self, params):
|
||||
url = self.res_template(self.timemap_url, params)
|
||||
url += params['url']
|
||||
res = requests.get(url, timeout=params.get('_timeout'))
|
||||
if res.status_code >= 400:
|
||||
raise NotFoundException(url)
|
||||
@ -200,9 +201,9 @@ class MementoIndexSource(BaseIndexSource):
|
||||
|
||||
@staticmethod
|
||||
def from_timegate_url(timegate_url, path='link'):
|
||||
return MementoIndexSource(timegate_url,
|
||||
timegate_url + 'timemap/' + path + '/',
|
||||
timegate_url + '{timestamp}id_/{url}')
|
||||
return MementoIndexSource(timegate_url + '{url}',
|
||||
timegate_url + 'timemap/' + path + '/{url}',
|
||||
timegate_url + WAYBACK_ORIG_SUFFIX)
|
||||
|
||||
def __str__(self):
|
||||
return 'memento'
|
||||
|
@ -1,4 +1,3 @@
|
||||
from pywb.utils.loaders import extract_client_cookie
|
||||
from pywb.utils.loaders import extract_post_query, append_post_query
|
||||
from pywb.utils.loaders import LimitReader
|
||||
from pywb.utils.statusandheaders import StatusAndHeadersParser
|
||||
@ -9,7 +8,7 @@ from io import BytesIO
|
||||
|
||||
|
||||
#=============================================================================
|
||||
class WSGIInputRequest(object):
|
||||
class DirectWSGIInputRequest(object):
|
||||
def __init__(self, env):
|
||||
self.env = env
|
||||
|
||||
@ -20,26 +19,10 @@ class WSGIInputRequest(object):
|
||||
headers = {}
|
||||
|
||||
for name, value in iteritems(self.env):
|
||||
# will be set by requests to match actual host
|
||||
if name == 'HTTP_HOST':
|
||||
#name = 'Host'
|
||||
#value = splits.netloc
|
||||
# will be set automatically
|
||||
continue
|
||||
|
||||
#elif name == 'HTTP_ORIGIN':
|
||||
# name = 'Origin'
|
||||
# value = (splits.scheme + '://' + splits.netloc)
|
||||
|
||||
elif name == 'HTTP_X_CSRFTOKEN':
|
||||
name = 'X-CSRFToken'
|
||||
cookie_val = extract_client_cookie(env, 'csrftoken')
|
||||
if cookie_val:
|
||||
value = cookie_val
|
||||
|
||||
#elif name == 'HTTP_X_FORWARDED_PROTO':
|
||||
# name = 'X-Forwarded-Proto'
|
||||
# value = splits.scheme
|
||||
|
||||
elif name.startswith('HTTP_'):
|
||||
name = name[5:].title().replace('_', '-')
|
||||
|
||||
@ -55,10 +38,7 @@ class WSGIInputRequest(object):
|
||||
return headers
|
||||
|
||||
def get_req_body(self):
|
||||
input_ = self.env.get('wsgi.input')
|
||||
if not input_:
|
||||
return None
|
||||
|
||||
input_ = self.env['wsgi.input']
|
||||
len_ = self._get_content_length()
|
||||
enc = self._get_header('Transfer-Encoding')
|
||||
|
||||
@ -70,9 +50,6 @@ class WSGIInputRequest(object):
|
||||
data = None
|
||||
|
||||
return data
|
||||
#buf = data.read().decode('utf-8')
|
||||
#print(buf)
|
||||
#return StringIO(buf)
|
||||
|
||||
def _get_content_type(self):
|
||||
return self.env.get('CONTENT_TYPE')
|
||||
@ -105,7 +82,7 @@ class WSGIInputRequest(object):
|
||||
|
||||
|
||||
#=============================================================================
|
||||
class POSTInputRequest(WSGIInputRequest):
|
||||
class POSTInputRequest(DirectWSGIInputRequest):
|
||||
def __init__(self, env):
|
||||
self.env = env
|
||||
|
||||
|
@ -2,6 +2,7 @@ from rezag.liverec import BaseRecorder
|
||||
from rezag.liverec import request as remote_request
|
||||
|
||||
from pywb.utils.timeutils import timestamp_to_datetime, datetime_to_http_date
|
||||
from pywb.utils.wbexception import LiveResourceException
|
||||
from pywb.warc.resolvingloader import ResolvingLoader
|
||||
|
||||
from io import BytesIO
|
||||
@ -29,7 +30,7 @@ def incr_reader(stream, header=None, size=8192):
|
||||
|
||||
|
||||
#=============================================================================
|
||||
class WARCPathHandler(object):
|
||||
class WARCPathLoader(object):
|
||||
def __init__(self, paths, cdx_source):
|
||||
self.paths = paths
|
||||
if isinstance(paths, str):
|
||||
@ -108,7 +109,7 @@ class HeaderRecorder(BaseRecorder):
|
||||
|
||||
|
||||
#=============================================================================
|
||||
class LiveWebHandler(object):
|
||||
class LiveWebLoader(object):
|
||||
SKIP_HEADERS = (b'link',
|
||||
b'memento-datetime',
|
||||
b'content-location',
|
||||
@ -140,14 +141,17 @@ class LiveWebHandler(object):
|
||||
method = input_req.get_req_method()
|
||||
data = input_req.get_req_body()
|
||||
|
||||
upstream_res = remote_request(url=load_url,
|
||||
method=method,
|
||||
recorder=recorder,
|
||||
stream=True,
|
||||
allow_redirects=False,
|
||||
headers=req_headers,
|
||||
data=data,
|
||||
timeout=params.get('_timeout'))
|
||||
try:
|
||||
upstream_res = remote_request(url=load_url,
|
||||
method=method,
|
||||
recorder=recorder,
|
||||
stream=True,
|
||||
allow_redirects=False,
|
||||
headers=req_headers,
|
||||
data=data,
|
||||
timeout=params.get('_timeout'))
|
||||
except Exception:
|
||||
raise LiveResourceException(load_url)
|
||||
|
||||
resp_headers = recorder.get_header()
|
||||
|
||||
@ -175,7 +179,7 @@ class LiveWebHandler(object):
|
||||
return dt.strftime('%Y-%m-%dT%H:%M:%SZ')
|
||||
|
||||
@staticmethod
|
||||
def _make_warc_id(id_=None):
|
||||
def _make_warc_id(id_=None): #pragma: no cover
|
||||
if not id_:
|
||||
id_ = uuid.uuid1()
|
||||
return '<urn:uuid:{0}>'.format(id_)
|
||||
|
@ -77,6 +77,7 @@ class MementoUtils(object):
|
||||
from_date = timestamp_to_http_date(first_cdx['timestamp'])
|
||||
except StopIteration:
|
||||
first_cdx = None
|
||||
return
|
||||
|
||||
# first memento link
|
||||
yield MementoUtils.make_timemap_memento_link(first_cdx, datetime=from_date)
|
||||
@ -91,4 +92,4 @@ class MementoUtils(object):
|
||||
|
||||
# last memento link, if any
|
||||
if prev_cdx:
|
||||
yield MementoUtils.make_timemap_memento_link(prev_cdx, end='')
|
||||
yield MementoUtils.make_timemap_memento_link(prev_cdx, end='\n')
|
||||
|
5
setup.py
5
setup.py
@ -32,8 +32,11 @@ setup(
|
||||
'rezag',
|
||||
],
|
||||
install_requires=[
|
||||
'pywb',
|
||||
'pywb==1.0b',
|
||||
],
|
||||
dependency_links=[
|
||||
'git+https://github.com/ikreymer/pywb.git@py3#egg=pywb-1.0b-py3',
|
||||
],
|
||||
zip_safe=True,
|
||||
entry_points="""
|
||||
[console_scripts]
|
||||
|
@ -33,6 +33,9 @@ def setup_module():
|
||||
shutil.copy(to_path('testdata/iana.cdxj'), coll_B)
|
||||
shutil.copy(to_path('testdata/dupes.cdxj'), coll_C)
|
||||
|
||||
with open(to_path(root_dir) + 'somefile', 'w') as fh:
|
||||
fh.write('foo')
|
||||
|
||||
global dir_loader
|
||||
dir_loader = DirectoryIndexAggregator(dir_prefix, dir_path)
|
||||
|
||||
@ -121,7 +124,7 @@ def test_agg_dir_and_memento():
|
||||
'local': dir_loader}
|
||||
agg_source = SimpleAggregator(sources)
|
||||
|
||||
res = agg_source({'url': 'example.com/', 'param.coll': '*', 'closest': '20100512', 'limit': 6})
|
||||
res = agg_source({'url': 'example.com/', 'param.local.coll': '*', 'closest': '20100512', 'limit': 6})
|
||||
|
||||
exp = [
|
||||
{'source': 'ia', 'timestamp': '20100513052358', 'load_url': 'http://web.archive.org/web/20100513052358id_/http://example.com/'},
|
||||
@ -144,7 +147,7 @@ def test_agg_no_dir_1():
|
||||
|
||||
|
||||
def test_agg_no_dir_2():
|
||||
loader = DirectoryIndexAggregator(root_dir, 'no_such')
|
||||
loader = DirectoryIndexAggregator(root_dir, '')
|
||||
res = loader({'url': 'example.com/', 'param.coll': 'X'})
|
||||
|
||||
exp = []
|
||||
@ -152,4 +155,31 @@ def test_agg_no_dir_2():
|
||||
assert(to_json_list(res) == exp)
|
||||
|
||||
|
||||
def test_agg_dir_sources_1():
|
||||
res = dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'})
|
||||
exp = {'sources': {'colls/A/indexes': 'file',
|
||||
'colls/B/indexes': 'file',
|
||||
'colls/C/indexes': 'file'}
|
||||
}
|
||||
|
||||
assert(res == exp)
|
||||
|
||||
|
||||
def test_agg_dir_sources_2():
|
||||
res = dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '[A,C]'})
|
||||
exp = {'sources': {'colls/A/indexes': 'file',
|
||||
'colls/C/indexes': 'file'}
|
||||
}
|
||||
|
||||
assert(res == exp)
|
||||
|
||||
|
||||
def test_agg_dir_sources_single_dir():
|
||||
loader = DirectoryIndexAggregator('testdata/', '')
|
||||
res = loader.get_source_list({'url': 'example.com/'})
|
||||
|
||||
exp = {'sources': {}}
|
||||
|
||||
assert(res == exp)
|
||||
|
||||
|
||||
|
@ -42,13 +42,17 @@ def setup_module(self):
|
||||
source3 = SimpleAggregator({'example': FileIndexSource(to_path('testdata/example.cdxj'))})
|
||||
handler3 = DefaultResourceHandler(source3, to_path('testdata/'))
|
||||
|
||||
|
||||
add_route('/fallback', HandlerSeq([handler3,
|
||||
handler2,
|
||||
live_handler]))
|
||||
|
||||
add_route('/seq', HandlerSeq([handler3,
|
||||
handler2]))
|
||||
|
||||
bottle.debug = True
|
||||
add_route('/empty', HandlerSeq([]))
|
||||
add_route('/invalid', HandlerSeq(['foo']))
|
||||
|
||||
application.debug = True
|
||||
global testapp
|
||||
testapp = webtest.TestApp(application)
|
||||
|
||||
@ -61,8 +65,23 @@ class TestResAgg(object):
|
||||
def setup(self):
|
||||
self.testapp = testapp
|
||||
|
||||
def test_list_handlers(self):
|
||||
resp = self.testapp.get('/many?mode=list_modes')
|
||||
assert resp.json == {'modes': ['list_modes', 'list_sources', 'index', 'resource']}
|
||||
|
||||
resp = self.testapp.get('/many?mode=other')
|
||||
assert resp.json == {'modes': ['list_modes', 'list_sources', 'index', 'resource']}
|
||||
|
||||
# defaults to resource, must specify url
|
||||
resp = self.testapp.get('/many', status=400)
|
||||
assert resp.json == {'message': 'The "url" param is required'}
|
||||
|
||||
def test_list_sources(self):
|
||||
resp = self.testapp.get('/many?mode=list_sources')
|
||||
assert resp.json == {'sources': {'local': 'file_dir', 'ia': 'memento', 'rhiz': 'memento', 'live': 'live'}}
|
||||
|
||||
def test_live_index(self):
|
||||
resp = self.testapp.get('/live?url=http://httpbin.org/get&mode=index&output=json')
|
||||
resp = self.testapp.get('/live?mode=index&url=http://httpbin.org/get&output=json')
|
||||
resp.charset = 'utf-8'
|
||||
|
||||
res = to_json_list(resp.text)
|
||||
@ -71,7 +90,8 @@ class TestResAgg(object):
|
||||
'load_url': 'http://httpbin.org/get', 'source': 'live', 'timestamp': '2016'}])
|
||||
|
||||
def test_live_resource(self):
|
||||
resp = self.testapp.get('/live?url=http://httpbin.org/get?foo=bar&mode=resource')
|
||||
headers = {'foo': 'bar'}
|
||||
resp = self.testapp.get('/live?url=http://httpbin.org/get?foo=bar', headers=headers)
|
||||
|
||||
assert resp.headers['WARC-Coll'] == 'live'
|
||||
assert resp.headers['WARC-Target-URI'] == 'http://httpbin.org/get?foo=bar'
|
||||
@ -82,7 +102,7 @@ class TestResAgg(object):
|
||||
|
||||
|
||||
def test_live_post_resource(self):
|
||||
resp = self.testapp.post('/live?url=http://httpbin.org/post&mode=resource',
|
||||
resp = self.testapp.post('/live?url=http://httpbin.org/post',
|
||||
OrderedDict([('foo', 'bar')]))
|
||||
|
||||
assert resp.headers['WARC-Coll'] == 'live'
|
||||
@ -204,6 +224,11 @@ foo=bar&test=abc"""
|
||||
assert resp.headers['WARC-Target-URI'] == 'http://example.com/'
|
||||
assert b'HTTP/1.1 200 OK' in resp.body
|
||||
|
||||
def test_error_fallback_live_not_found(self):
|
||||
resp = self.testapp.get('/fallback?url=http://invalid.url-not-found', status=400)
|
||||
|
||||
assert resp.json == {'message': 'http://invalid.url-not-found'}
|
||||
|
||||
def test_agg_local_revisit(self):
|
||||
resp = self.testapp.get('/many?url=http://www.example.com/&closest=20140127171251&sources=local')
|
||||
|
||||
@ -214,3 +239,24 @@ foo=bar&test=abc"""
|
||||
assert resp.headers['WARC-Refers-To-Date'] == '2014-01-27T17:12:00Z'
|
||||
assert b'HTTP/1.1 200 OK' in resp.body
|
||||
assert b'<!doctype html>' in resp.body
|
||||
|
||||
def test_error_invalid_index_output(self):
|
||||
resp = self.testapp.get('/live?mode=index&url=http://httpbin.org/get&output=foobar', status=400)
|
||||
|
||||
assert resp.json == {'message': 'output=foobar not supported'}
|
||||
|
||||
def test_error_local_not_found(self):
|
||||
resp = self.testapp.get('/many?url=http://not-found.error/&sources=local', status=404)
|
||||
|
||||
assert resp.json == {'message': 'No Resource Found'}
|
||||
|
||||
def test_error_empty(self):
|
||||
resp = self.testapp.get('/empty?url=http://example.com/', status=404)
|
||||
|
||||
assert resp.json == {'message': 'No Resource Found'}
|
||||
|
||||
def test_error_invalid(self):
|
||||
resp = self.testapp.get('/invalid?url=http://example.com/', status=500)
|
||||
|
||||
assert resp.json['message'].startswith('Internal Error')
|
||||
|
||||
|
@ -32,16 +32,20 @@ local_sources = [
|
||||
|
||||
|
||||
remote_sources = [
|
||||
RemoteIndexSource('http://webenact.rhizome.org/all-cdx',
|
||||
RemoteIndexSource('http://webenact.rhizome.org/all-cdx?url={url}',
|
||||
'http://webenact.rhizome.org/all/{timestamp}id_/{url}'),
|
||||
|
||||
MementoIndexSource('http://webenact.rhizome.org/all/',
|
||||
'http://webenact.rhizome.org/all/timemap/*/',
|
||||
MementoIndexSource('http://webenact.rhizome.org/all/{url}',
|
||||
'http://webenact.rhizome.org/all/timemap/*/{url}',
|
||||
'http://webenact.rhizome.org/all/{timestamp}id_/{url}')
|
||||
]
|
||||
|
||||
ait_source = RemoteIndexSource('http://wayback.archive-it.org/cdx?url={url}',
|
||||
'http://wayback.archive-it.org/all/{timestamp}id_/{url}')
|
||||
|
||||
|
||||
def query_single_source(source, params):
|
||||
string = str(source)
|
||||
return SimpleAggregator({'source': source})(params)
|
||||
|
||||
|
||||
@ -182,4 +186,22 @@ def test_file_not_found():
|
||||
|
||||
|
||||
|
||||
def test_ait_filters():
|
||||
ait_source = RemoteIndexSource('http://wayback.archive-it.org/cdx/search/cdx?url={url}&filter=filename:ARCHIVEIT-({colls})-.*',
|
||||
'http://wayback.archive-it.org/all/{timestamp}id_/{url}')
|
||||
|
||||
cdxlist = query_single_source(ait_source, {'url': 'http://iana.org/', 'param.source.colls': '5610|933'})
|
||||
filenames = [cdx['filename'] for cdx in cdxlist]
|
||||
|
||||
prefix = ('ARCHIVEIT-5610-', 'ARCHIVEIT-933-')
|
||||
|
||||
assert(all([x.startswith(prefix) for x in filenames]))
|
||||
|
||||
|
||||
cdxlist = query_single_source(ait_source, {'url': 'http://iana.org/', 'param.source.colls': '1883|366|905'})
|
||||
filenames = [cdx['filename'] for cdx in cdxlist]
|
||||
|
||||
prefix = ('ARCHIVEIT-1883-', 'ARCHIVEIT-366-', 'ARCHIVEIT-905-')
|
||||
|
||||
assert(all([x.startswith(prefix) for x in filenames]))
|
||||
|
||||
|
@ -27,10 +27,11 @@ aggs = {'simple': SimpleAggregator(sources),
|
||||
'processes': ThreadedTimeoutAggregator(sources, timeout=5.0, use_processes=True),
|
||||
}
|
||||
|
||||
#@pytest.mark.parametrize("agg", aggs, ids=["simple", "gevent_timeout"])
|
||||
def pytest_generate_tests(metafunc):
|
||||
metafunc.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
|
||||
#def pytest_generate_tests(metafunc):
|
||||
# metafunc.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
|
||||
def test_mem_agg_index_1(agg):
|
||||
url = 'http://iana.org/'
|
||||
res = agg(dict(url=url, closest='20140126000000', limit=5))
|
||||
@ -46,6 +47,7 @@ def test_mem_agg_index_1(agg):
|
||||
assert(json_list(res) == exp)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
|
||||
def test_mem_agg_index_2(agg):
|
||||
url = 'http://example.com/'
|
||||
res = agg(dict(url=url, closest='20100512', limit=6))
|
||||
@ -60,6 +62,7 @@ def test_mem_agg_index_2(agg):
|
||||
assert(json_list(res) == exp)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
|
||||
def test_mem_agg_index_3(agg):
|
||||
url = 'http://vvork.com/'
|
||||
res = agg(dict(url=url, closest='20141001', limit=5))
|
||||
@ -73,6 +76,7 @@ def test_mem_agg_index_3(agg):
|
||||
assert(json_list(res) == exp)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
|
||||
def test_mem_agg_index_4(agg):
|
||||
url = 'http://vvork.com/'
|
||||
res = agg(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait'))
|
||||
@ -83,10 +87,11 @@ def test_mem_agg_index_4(agg):
|
||||
assert(json_list(res) == exp)
|
||||
|
||||
|
||||
def test_handler_output_cdxj(agg):
|
||||
loader = IndexHandler(agg)
|
||||
def test_handler_output_cdxj():
|
||||
agg = GeventTimeoutAggregator(sources, timeout=5.0)
|
||||
handler = IndexHandler(agg)
|
||||
url = 'http://vvork.com/'
|
||||
res = loader(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait'))
|
||||
res = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait'))
|
||||
|
||||
exp = """\
|
||||
com,vvork)/ 20141006184357 {"url": "http://www.vvork.com/", "mem_rel": "memento", "memento_url": "http://webenact.rhizome.org/vvork/20141006184357/http://www.vvork.com/", "load_url": "http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"}
|
||||
@ -96,10 +101,11 @@ com,vvork)/ 20131004231540 {"url": "http://vvork.com/", "mem_rel": "last memento
|
||||
assert(''.join(res) == exp)
|
||||
|
||||
|
||||
def test_handler_output_json(agg):
|
||||
loader = IndexHandler(agg)
|
||||
def test_handler_output_json():
|
||||
agg = GeventTimeoutAggregator(sources, timeout=5.0)
|
||||
handler = IndexHandler(agg)
|
||||
url = 'http://vvork.com/'
|
||||
res = loader(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='json'))
|
||||
res = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='json'))
|
||||
|
||||
exp = """\
|
||||
{"urlkey": "com,vvork)/", "timestamp": "20141006184357", "url": "http://www.vvork.com/", "mem_rel": "memento", "memento_url": "http://webenact.rhizome.org/vvork/20141006184357/http://www.vvork.com/", "load_url": "http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"}
|
||||
@ -109,22 +115,50 @@ def test_handler_output_json(agg):
|
||||
assert(''.join(res) == exp)
|
||||
|
||||
|
||||
def test_handler_output_link(agg):
|
||||
loader = IndexHandler(agg)
|
||||
def test_handler_output_link():
|
||||
agg = GeventTimeoutAggregator(sources, timeout=5.0)
|
||||
handler = IndexHandler(agg)
|
||||
url = 'http://vvork.com/'
|
||||
res = loader(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='link'))
|
||||
res = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='link'))
|
||||
|
||||
exp = """\
|
||||
<http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/>; rel="memento"; datetime="Mon, 06 Oct 2014 18:43:57 GMT"; src="rhiz",
|
||||
<http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/>; rel="memento"; datetime="Fri, 04 Oct 2013 23:15:40 GMT"; src="ait"\
|
||||
<http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/>; rel="memento"; datetime="Fri, 04 Oct 2013 23:15:40 GMT"; src="ait"
|
||||
"""
|
||||
assert(''.join(res) == exp)
|
||||
|
||||
|
||||
def test_handler_output_text(agg):
|
||||
loader = IndexHandler(agg)
|
||||
def test_handler_output_link_2():
|
||||
agg = GeventTimeoutAggregator(sources, timeout=5.0)
|
||||
handler = IndexHandler(agg)
|
||||
url = 'http://iana.org/'
|
||||
res = handler(dict(url=url, closest='20140126000000', limit=5, output='link'))
|
||||
|
||||
exp = """\
|
||||
<http://web.archive.org/web/20140126093743id_/http://iana.org/>; rel="memento"; datetime="Sun, 26 Jan 2014 09:37:43 GMT"; src="ia",
|
||||
<filename://iana.warc.gz>; rel="memento"; datetime="Sun, 26 Jan 2014 20:06:24 GMT"; src="local",
|
||||
<http://web.archive.org/web/20140123034755id_/http://iana.org/>; rel="memento"; datetime="Thu, 23 Jan 2014 03:47:55 GMT"; src="ia",
|
||||
<http://web.archive.org/web/20140129175203id_/http://iana.org/>; rel="memento"; datetime="Wed, 29 Jan 2014 17:52:03 GMT"; src="ia",
|
||||
<http://wayback.archive-it.org/all/20140107040552id_/http://iana.org/>; rel="memento"; datetime="Tue, 07 Jan 2014 04:05:52 GMT"; src="ait"
|
||||
"""
|
||||
assert(''.join(res) == exp)
|
||||
|
||||
|
||||
def test_handler_output_link_3():
|
||||
agg = GeventTimeoutAggregator(sources, timeout=5.0)
|
||||
handler = IndexHandler(agg)
|
||||
url = 'http://foo.bar.non-existent'
|
||||
res = handler(dict(url=url, closest='20140126000000', limit=5, output='link'))
|
||||
|
||||
exp = ''
|
||||
|
||||
assert(''.join(res) == exp)
|
||||
|
||||
def test_handler_output_text():
|
||||
agg = GeventTimeoutAggregator(sources, timeout=5.0)
|
||||
handler = IndexHandler(agg)
|
||||
url = 'http://vvork.com/'
|
||||
res = loader(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='text'))
|
||||
res = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='text'))
|
||||
|
||||
exp = """\
|
||||
com,vvork)/ 20141006184357 http://www.vvork.com/ memento http://webenact.rhizome.org/vvork/20141006184357/http://www.vvork.com/ http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/ rhiz
|
||||
@ -133,9 +167,10 @@ com,vvork)/ 20131004231540 http://vvork.com/ last memento http://wayback.archive
|
||||
assert(''.join(res) == exp)
|
||||
|
||||
|
||||
def test_handler_list_sources(agg):
|
||||
loader = IndexHandler(agg)
|
||||
res = loader(dict(mode='sources'))
|
||||
def test_handler_list_sources():
|
||||
agg = GeventTimeoutAggregator(sources, timeout=5.0)
|
||||
handler = IndexHandler(agg)
|
||||
res = handler(dict(mode='list_sources'))
|
||||
|
||||
assert(res == {'sources': {'bl': 'memento',
|
||||
'ait': 'memento',
|
||||
@ -143,4 +178,3 @@ def test_handler_list_sources(agg):
|
||||
'rhiz': 'memento',
|
||||
'local': 'file'}})
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user