mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
separate iter_sources from list_sources api
all errors returned as json block with error msg; tests for not found, invalid errors
This commit is contained in:
parent
68090d00c1
commit
008e5284b1
@ -63,7 +63,6 @@ class BaseAggregator(object):
|
|||||||
try:
|
try:
|
||||||
_src_params = all_params['_all_src_params'].get(name)
|
_src_params = all_params['_all_src_params'].get(name)
|
||||||
all_params['_src_params'] = _src_params
|
all_params['_src_params'] = _src_params
|
||||||
|
|
||||||
cdx_iter = source.load_index(all_params)
|
cdx_iter = source.load_index(all_params)
|
||||||
except NotFoundException as nf:
|
except NotFoundException as nf:
|
||||||
print('Not found in ' + name)
|
print('Not found in ' + name)
|
||||||
@ -89,15 +88,21 @@ class BaseAggregator(object):
|
|||||||
|
|
||||||
return cdx_iter
|
return cdx_iter
|
||||||
|
|
||||||
def _on_source_error(self, name):
|
def _on_source_error(self, name): #pragma: no cover
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def _load_all(self, params): #pragma: no cover
|
def _load_all(self, params): #pragma: no cover
|
||||||
raise NotImplemented()
|
raise NotImplemented()
|
||||||
|
|
||||||
def get_sources(self, params): #pragma: no cover
|
def _iter_sources(self, params): #pragma: no cover
|
||||||
raise NotImplemented()
|
raise NotImplemented()
|
||||||
|
|
||||||
|
def get_source_list(self, params):
|
||||||
|
srcs = self._iter_sources(params)
|
||||||
|
result = [(name, str(value)) for name, value in srcs]
|
||||||
|
result = {'sources': dict(result)}
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
#=============================================================================
|
#=============================================================================
|
||||||
class BaseSourceListAggregator(BaseAggregator):
|
class BaseSourceListAggregator(BaseAggregator):
|
||||||
@ -107,7 +112,7 @@ class BaseSourceListAggregator(BaseAggregator):
|
|||||||
def get_all_sources(self, params):
|
def get_all_sources(self, params):
|
||||||
return self.sources
|
return self.sources
|
||||||
|
|
||||||
def get_sources(self, params):
|
def _iter_sources(self, params):
|
||||||
sources = self.get_all_sources(params)
|
sources = self.get_all_sources(params)
|
||||||
srcs_list = params.get('sources')
|
srcs_list = params.get('sources')
|
||||||
if not srcs_list:
|
if not srcs_list:
|
||||||
@ -125,7 +130,7 @@ class SeqAggMixin(object):
|
|||||||
|
|
||||||
|
|
||||||
def _load_all(self, params):
|
def _load_all(self, params):
|
||||||
sources = list(self.get_sources(params))
|
sources = list(self._iter_sources(params))
|
||||||
return list([self.load_child_source(name, source, params)
|
return list([self.load_child_source(name, source, params)
|
||||||
for name, source in sources])
|
for name, source in sources])
|
||||||
|
|
||||||
@ -160,8 +165,8 @@ class TimeoutMixin(object):
|
|||||||
|
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def get_sources(self, params):
|
def _iter_sources(self, params):
|
||||||
sources = super(TimeoutMixin, self).get_sources(params)
|
sources = super(TimeoutMixin, self)._iter_sources(params)
|
||||||
for name, source in sources:
|
for name, source in sources:
|
||||||
if not self.is_timed_out(name):
|
if not self.is_timed_out(name):
|
||||||
yield name, source
|
yield name, source
|
||||||
@ -185,7 +190,7 @@ class GeventMixin(object):
|
|||||||
def _load_all(self, params):
|
def _load_all(self, params):
|
||||||
params['_timeout'] = self.timeout
|
params['_timeout'] = self.timeout
|
||||||
|
|
||||||
sources = list(self.get_sources(params))
|
sources = list(self._iter_sources(params))
|
||||||
|
|
||||||
def do_spawn(name, source):
|
def do_spawn(name, source):
|
||||||
return self.pool.spawn(self.load_child_source, name, source, params)
|
return self.pool.spawn(self.load_child_source, name, source, params)
|
||||||
@ -223,7 +228,7 @@ class ConcurrentMixin(object):
|
|||||||
def _load_all(self, params):
|
def _load_all(self, params):
|
||||||
params['_timeout'] = self.timeout
|
params['_timeout'] = self.timeout
|
||||||
|
|
||||||
sources = list(self.get_sources(params))
|
sources = list(self._iter_sources(params))
|
||||||
|
|
||||||
with self.pool_class(max_workers=self.size) as executor:
|
with self.pool_class(max_workers=self.size) as executor:
|
||||||
def do_spawn(name, source):
|
def do_spawn(name, source):
|
||||||
@ -257,7 +262,8 @@ class BaseDirectoryIndexAggregator(BaseAggregator):
|
|||||||
self.base_prefix = base_prefix
|
self.base_prefix = base_prefix
|
||||||
self.base_dir = base_dir
|
self.base_dir = base_dir
|
||||||
|
|
||||||
def get_sources(self, params):
|
def _iter_sources(self, params):
|
||||||
|
self._set_src_params(params)
|
||||||
# see if specific params (when part of another agg)
|
# see if specific params (when part of another agg)
|
||||||
src_params = params.get('_src_params')
|
src_params = params.get('_src_params')
|
||||||
if not src_params:
|
if not src_params:
|
||||||
@ -270,7 +276,6 @@ class BaseDirectoryIndexAggregator(BaseAggregator):
|
|||||||
the_dir = self.base_dir
|
the_dir = self.base_dir
|
||||||
|
|
||||||
the_dir = os.path.join(self.base_prefix, the_dir)
|
the_dir = os.path.join(self.base_prefix, the_dir)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
sources = list(self._load_files(the_dir))
|
sources = list(self._load_files(the_dir))
|
||||||
except Exception:
|
except Exception:
|
||||||
@ -290,6 +295,10 @@ class BaseDirectoryIndexAggregator(BaseAggregator):
|
|||||||
rel_path = ''
|
rel_path = ''
|
||||||
yield rel_path, FileIndexSource(filename)
|
yield rel_path, FileIndexSource(filename)
|
||||||
|
|
||||||
|
def __str__(self):
|
||||||
|
return 'file_dir'
|
||||||
|
|
||||||
|
|
||||||
class DirectoryIndexAggregator(SeqAggMixin, BaseDirectoryIndexAggregator):
|
class DirectoryIndexAggregator(SeqAggMixin, BaseDirectoryIndexAggregator):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
53
rezag/app.py
53
rezag/app.py
@ -1,31 +1,50 @@
|
|||||||
from rezag.inputrequest import WSGIInputRequest, POSTInputRequest
|
from rezag.inputrequest import DirectWSGIInputRequest, POSTInputRequest
|
||||||
from bottle import route, request, response, default_app
|
from bottle import route, request, response, default_app, abort
|
||||||
|
|
||||||
|
from pywb.utils.wbexception import WbException
|
||||||
|
|
||||||
|
import traceback
|
||||||
|
import json
|
||||||
|
|
||||||
|
def err_handler(exc):
|
||||||
|
response.status = exc.status_code
|
||||||
|
response.content_type = 'application/json'
|
||||||
|
return json.dumps({'message': exc.body})
|
||||||
|
|
||||||
|
def wrap_error(func):
|
||||||
|
def do_d(*args, **kwargs):
|
||||||
|
try:
|
||||||
|
return func(*args, **kwargs)
|
||||||
|
except WbException as exc:
|
||||||
|
if application.debug:
|
||||||
|
traceback.print_exc()
|
||||||
|
abort(exc.status(), exc.msg)
|
||||||
|
except Exception as e:
|
||||||
|
if application.debug:
|
||||||
|
traceback.print_exc()
|
||||||
|
abort(500, 'Internal Error: ' + str(e))
|
||||||
|
|
||||||
|
return do_d
|
||||||
|
|
||||||
|
|
||||||
def add_route(path, handler):
|
def add_route(path, handler):
|
||||||
def debug(func):
|
@wrap_error
|
||||||
def do_d():
|
def direct_input_request(mode=''):
|
||||||
try:
|
|
||||||
return func()
|
|
||||||
except Exception:
|
|
||||||
import traceback
|
|
||||||
traceback.print_exc()
|
|
||||||
|
|
||||||
return do_d
|
|
||||||
|
|
||||||
def direct_input_request():
|
|
||||||
params = dict(request.query)
|
params = dict(request.query)
|
||||||
params['_input_req'] = WSGIInputRequest(request.environ)
|
params['_input_req'] = DirectWSGIInputRequest(request.environ)
|
||||||
return handler(params)
|
return handler(params)
|
||||||
|
|
||||||
def post_fullrequest():
|
@wrap_error
|
||||||
|
def post_fullrequest(mode=''):
|
||||||
params = dict(request.query)
|
params = dict(request.query)
|
||||||
params['_input_req'] = POSTInputRequest(request.environ)
|
params['_input_req'] = POSTInputRequest(request.environ)
|
||||||
return handler(params)
|
return handler(params)
|
||||||
|
|
||||||
route(path + '/postreq', method=['POST'], callback=debug(post_fullrequest))
|
route(path + '/postreq', method=['POST'], callback=post_fullrequest)
|
||||||
route(path, method=['ANY'], callback=debug(direct_input_request))
|
route(path, method=['ANY'], callback=direct_input_request)
|
||||||
|
|
||||||
|
|
||||||
application = default_app()
|
application = default_app()
|
||||||
|
application.default_error_handler = err_handler
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,12 +1,13 @@
|
|||||||
from rezag.responseloader import WARCPathHandler, LiveWebHandler
|
from rezag.responseloader import WARCPathLoader, LiveWebLoader
|
||||||
from rezag.utils import MementoUtils
|
from rezag.utils import MementoUtils
|
||||||
from pywb.warc.recordloader import ArchiveLoadFailed
|
from pywb.utils.wbexception import BadRequestException, WbException
|
||||||
|
from pywb.utils.wbexception import NotFoundException
|
||||||
from bottle import response
|
from bottle import response
|
||||||
|
|
||||||
|
|
||||||
#=============================================================================
|
#=============================================================================
|
||||||
def to_cdxj(cdx_iter, fields):
|
def to_cdxj(cdx_iter, fields):
|
||||||
response.headers['Content-Type'] = 'text/x-cdxj'
|
response.headers['Content-Type'] = 'application/x-cdxj'
|
||||||
return [cdx.to_cdxj(fields) for cdx in cdx_iter]
|
return [cdx.to_cdxj(fields) for cdx in cdx_iter]
|
||||||
|
|
||||||
def to_json(cdx_iter, fields):
|
def to_json(cdx_iter, fields):
|
||||||
@ -37,26 +38,36 @@ class IndexHandler(object):
|
|||||||
self.index_source = index_source
|
self.index_source = index_source
|
||||||
self.opts = opts or {}
|
self.opts = opts or {}
|
||||||
|
|
||||||
def __call__(self, params):
|
def get_supported_modes(self):
|
||||||
if params.get('mode') == 'sources':
|
return dict(modes=['list_modes', 'list_sources', 'index'])
|
||||||
srcs = self.index_source.get_sources(params)
|
|
||||||
result = [(name, str(value)) for name, value in srcs]
|
def _load_index_source(self, params):
|
||||||
result = {'sources': dict(result)}
|
url = params.get('url')
|
||||||
return result
|
if not url:
|
||||||
|
raise BadRequestException('The "url" param is required')
|
||||||
|
|
||||||
input_req = params.get('_input_req')
|
input_req = params.get('_input_req')
|
||||||
if input_req:
|
if input_req:
|
||||||
params['alt_url'] = input_req.include_post_query(params.get('url'))
|
params['alt_url'] = input_req.include_post_query(url)
|
||||||
|
|
||||||
cdx_iter = self.index_source(params)
|
return self.index_source(params)
|
||||||
|
|
||||||
|
def __call__(self, params):
|
||||||
|
mode = params.get('mode', 'index')
|
||||||
|
if mode == 'list_sources':
|
||||||
|
return self.index_source.get_source_list(params)
|
||||||
|
|
||||||
|
if mode == 'list_modes' or mode != 'index':
|
||||||
|
return self.get_supported_modes()
|
||||||
|
|
||||||
output = params.get('output', self.DEF_OUTPUT)
|
output = params.get('output', self.DEF_OUTPUT)
|
||||||
fields = params.get('fields')
|
fields = params.get('fields')
|
||||||
|
|
||||||
handler = self.OUTPUTS.get(output)
|
handler = self.OUTPUTS.get(output)
|
||||||
if not handler:
|
if not handler:
|
||||||
handler = self.OUTPUTS[self.DEF_OUTPUT]
|
raise BadRequestException('output={0} not supported'.format(output))
|
||||||
|
|
||||||
|
cdx_iter = self._load_index_source(params)
|
||||||
res = handler(cdx_iter, fields)
|
res = handler(cdx_iter, fields)
|
||||||
return res
|
return res
|
||||||
|
|
||||||
@ -67,57 +78,59 @@ class ResourceHandler(IndexHandler):
|
|||||||
super(ResourceHandler, self).__init__(index_source)
|
super(ResourceHandler, self).__init__(index_source)
|
||||||
self.resource_loaders = resource_loaders
|
self.resource_loaders = resource_loaders
|
||||||
|
|
||||||
|
def get_supported_modes(self):
|
||||||
|
res = super(ResourceHandler, self).get_supported_modes()
|
||||||
|
res['modes'].append('resource')
|
||||||
|
return res
|
||||||
|
|
||||||
def __call__(self, params):
|
def __call__(self, params):
|
||||||
if params.get('mode', 'resource') != 'resource':
|
if params.get('mode', 'resource') != 'resource':
|
||||||
return super(ResourceHandler, self).__call__(params)
|
return super(ResourceHandler, self).__call__(params)
|
||||||
|
|
||||||
input_req = params.get('_input_req')
|
cdx_iter = self._load_index_source(params)
|
||||||
if input_req:
|
last_exc = None
|
||||||
params['alt_url'] = input_req.include_post_query(params.get('url'))
|
|
||||||
|
|
||||||
cdx_iter = self.index_source(params)
|
|
||||||
|
|
||||||
any_found = False
|
|
||||||
|
|
||||||
for cdx in cdx_iter:
|
for cdx in cdx_iter:
|
||||||
any_found = True
|
|
||||||
|
|
||||||
for loader in self.resource_loaders:
|
for loader in self.resource_loaders:
|
||||||
try:
|
try:
|
||||||
resp = loader(cdx, params)
|
resp = loader(cdx, params)
|
||||||
if resp:
|
if resp is not None:
|
||||||
return resp
|
return resp
|
||||||
except ArchiveLoadFailed as e:
|
except WbException as e:
|
||||||
print(e)
|
last_exc = e
|
||||||
pass
|
|
||||||
|
|
||||||
if any_found:
|
if last_exc:
|
||||||
raise ArchiveLoadFailed('Resource Found, could not be Loaded')
|
raise last_exc
|
||||||
|
#raise ArchiveLoadFailed('Resource Found, could not be Loaded')
|
||||||
else:
|
else:
|
||||||
raise ArchiveLoadFailed('No Resource Found')
|
raise NotFoundException('No Resource Found')
|
||||||
|
|
||||||
|
|
||||||
#=============================================================================
|
#=============================================================================
|
||||||
class DefaultResourceHandler(ResourceHandler):
|
class DefaultResourceHandler(ResourceHandler):
|
||||||
def __init__(self, index_source, warc_paths=''):
|
def __init__(self, index_source, warc_paths=''):
|
||||||
loaders = [WARCPathHandler(warc_paths, index_source),
|
loaders = [WARCPathLoader(warc_paths, index_source),
|
||||||
LiveWebHandler()
|
LiveWebLoader()
|
||||||
]
|
]
|
||||||
super(DefaultResourceHandler, self).__init__(index_source, loaders)
|
super(DefaultResourceHandler, self).__init__(index_source, loaders)
|
||||||
|
|
||||||
|
|
||||||
#=============================================================================
|
#=============================================================================
|
||||||
class HandlerSeq(object):
|
class HandlerSeq(object):
|
||||||
def __init__(self, loaders):
|
def __init__(self, handlers):
|
||||||
self.loaders = loaders
|
self.handlers = handlers
|
||||||
|
|
||||||
def __call__(self, params):
|
def __call__(self, params):
|
||||||
for loader in self.loaders:
|
last_exc = None
|
||||||
|
for handler in self.handlers:
|
||||||
try:
|
try:
|
||||||
res = loader(params)
|
res = handler(params)
|
||||||
if res:
|
if res is not None:
|
||||||
return res
|
return res
|
||||||
except ArchiveLoadFailed:
|
except WbException as e:
|
||||||
pass
|
last_exc = e
|
||||||
|
|
||||||
raise ArchiveLoadFailed('No Resource Found')
|
if last_exc:
|
||||||
|
raise last_exc
|
||||||
|
else:
|
||||||
|
raise NotFoundException('No Resource Found')
|
||||||
|
@ -14,6 +14,9 @@ from rezag.liverec import patched_requests as requests
|
|||||||
from rezag.utils import MementoUtils
|
from rezag.utils import MementoUtils
|
||||||
|
|
||||||
|
|
||||||
|
WAYBACK_ORIG_SUFFIX = '{timestamp}id_/{url}'
|
||||||
|
|
||||||
|
|
||||||
#=============================================================================
|
#=============================================================================
|
||||||
class BaseIndexSource(object):
|
class BaseIndexSource(object):
|
||||||
def load_index(self, params): #pragma: no cover
|
def load_index(self, params): #pragma: no cover
|
||||||
@ -22,10 +25,10 @@ class BaseIndexSource(object):
|
|||||||
@staticmethod
|
@staticmethod
|
||||||
def res_template(template, params):
|
def res_template(template, params):
|
||||||
src_params = params.get('_src_params')
|
src_params = params.get('_src_params')
|
||||||
if src_params:
|
if not src_params:
|
||||||
res = template.format(**src_params)
|
res = template.format(url=params['url'])
|
||||||
else:
|
else:
|
||||||
res = template
|
res = template.format(url=params['url'], **src_params)
|
||||||
return res
|
return res
|
||||||
|
|
||||||
|
|
||||||
@ -59,7 +62,7 @@ class RemoteIndexSource(BaseIndexSource):
|
|||||||
|
|
||||||
def load_index(self, params):
|
def load_index(self, params):
|
||||||
api_url = self.res_template(self.api_url_template, params)
|
api_url = self.res_template(self.api_url_template, params)
|
||||||
api_url += '?url=' + params['url']
|
print('API URL', api_url)
|
||||||
r = requests.get(api_url, timeout=params.get('_timeout'))
|
r = requests.get(api_url, timeout=params.get('_timeout'))
|
||||||
if r.status_code >= 400:
|
if r.status_code >= 400:
|
||||||
raise NotFoundException(api_url)
|
raise NotFoundException(api_url)
|
||||||
@ -169,7 +172,6 @@ class MementoIndexSource(BaseIndexSource):
|
|||||||
|
|
||||||
def get_timegate_links(self, params, closest):
|
def get_timegate_links(self, params, closest):
|
||||||
url = self.res_template(self.timegate_url, params)
|
url = self.res_template(self.timegate_url, params)
|
||||||
url += params['url']
|
|
||||||
accept_dt = timestamp_to_http_date(closest)
|
accept_dt = timestamp_to_http_date(closest)
|
||||||
res = requests.head(url, headers={'Accept-Datetime': accept_dt})
|
res = requests.head(url, headers={'Accept-Datetime': accept_dt})
|
||||||
if res.status_code >= 400:
|
if res.status_code >= 400:
|
||||||
@ -179,7 +181,6 @@ class MementoIndexSource(BaseIndexSource):
|
|||||||
|
|
||||||
def get_timemap_links(self, params):
|
def get_timemap_links(self, params):
|
||||||
url = self.res_template(self.timemap_url, params)
|
url = self.res_template(self.timemap_url, params)
|
||||||
url += params['url']
|
|
||||||
res = requests.get(url, timeout=params.get('_timeout'))
|
res = requests.get(url, timeout=params.get('_timeout'))
|
||||||
if res.status_code >= 400:
|
if res.status_code >= 400:
|
||||||
raise NotFoundException(url)
|
raise NotFoundException(url)
|
||||||
@ -200,9 +201,9 @@ class MementoIndexSource(BaseIndexSource):
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def from_timegate_url(timegate_url, path='link'):
|
def from_timegate_url(timegate_url, path='link'):
|
||||||
return MementoIndexSource(timegate_url,
|
return MementoIndexSource(timegate_url + '{url}',
|
||||||
timegate_url + 'timemap/' + path + '/',
|
timegate_url + 'timemap/' + path + '/{url}',
|
||||||
timegate_url + '{timestamp}id_/{url}')
|
timegate_url + WAYBACK_ORIG_SUFFIX)
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return 'memento'
|
return 'memento'
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
from pywb.utils.loaders import extract_client_cookie
|
|
||||||
from pywb.utils.loaders import extract_post_query, append_post_query
|
from pywb.utils.loaders import extract_post_query, append_post_query
|
||||||
from pywb.utils.loaders import LimitReader
|
from pywb.utils.loaders import LimitReader
|
||||||
from pywb.utils.statusandheaders import StatusAndHeadersParser
|
from pywb.utils.statusandheaders import StatusAndHeadersParser
|
||||||
@ -9,7 +8,7 @@ from io import BytesIO
|
|||||||
|
|
||||||
|
|
||||||
#=============================================================================
|
#=============================================================================
|
||||||
class WSGIInputRequest(object):
|
class DirectWSGIInputRequest(object):
|
||||||
def __init__(self, env):
|
def __init__(self, env):
|
||||||
self.env = env
|
self.env = env
|
||||||
|
|
||||||
@ -20,26 +19,10 @@ class WSGIInputRequest(object):
|
|||||||
headers = {}
|
headers = {}
|
||||||
|
|
||||||
for name, value in iteritems(self.env):
|
for name, value in iteritems(self.env):
|
||||||
|
# will be set by requests to match actual host
|
||||||
if name == 'HTTP_HOST':
|
if name == 'HTTP_HOST':
|
||||||
#name = 'Host'
|
|
||||||
#value = splits.netloc
|
|
||||||
# will be set automatically
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
#elif name == 'HTTP_ORIGIN':
|
|
||||||
# name = 'Origin'
|
|
||||||
# value = (splits.scheme + '://' + splits.netloc)
|
|
||||||
|
|
||||||
elif name == 'HTTP_X_CSRFTOKEN':
|
|
||||||
name = 'X-CSRFToken'
|
|
||||||
cookie_val = extract_client_cookie(env, 'csrftoken')
|
|
||||||
if cookie_val:
|
|
||||||
value = cookie_val
|
|
||||||
|
|
||||||
#elif name == 'HTTP_X_FORWARDED_PROTO':
|
|
||||||
# name = 'X-Forwarded-Proto'
|
|
||||||
# value = splits.scheme
|
|
||||||
|
|
||||||
elif name.startswith('HTTP_'):
|
elif name.startswith('HTTP_'):
|
||||||
name = name[5:].title().replace('_', '-')
|
name = name[5:].title().replace('_', '-')
|
||||||
|
|
||||||
@ -55,10 +38,7 @@ class WSGIInputRequest(object):
|
|||||||
return headers
|
return headers
|
||||||
|
|
||||||
def get_req_body(self):
|
def get_req_body(self):
|
||||||
input_ = self.env.get('wsgi.input')
|
input_ = self.env['wsgi.input']
|
||||||
if not input_:
|
|
||||||
return None
|
|
||||||
|
|
||||||
len_ = self._get_content_length()
|
len_ = self._get_content_length()
|
||||||
enc = self._get_header('Transfer-Encoding')
|
enc = self._get_header('Transfer-Encoding')
|
||||||
|
|
||||||
@ -70,9 +50,6 @@ class WSGIInputRequest(object):
|
|||||||
data = None
|
data = None
|
||||||
|
|
||||||
return data
|
return data
|
||||||
#buf = data.read().decode('utf-8')
|
|
||||||
#print(buf)
|
|
||||||
#return StringIO(buf)
|
|
||||||
|
|
||||||
def _get_content_type(self):
|
def _get_content_type(self):
|
||||||
return self.env.get('CONTENT_TYPE')
|
return self.env.get('CONTENT_TYPE')
|
||||||
@ -105,7 +82,7 @@ class WSGIInputRequest(object):
|
|||||||
|
|
||||||
|
|
||||||
#=============================================================================
|
#=============================================================================
|
||||||
class POSTInputRequest(WSGIInputRequest):
|
class POSTInputRequest(DirectWSGIInputRequest):
|
||||||
def __init__(self, env):
|
def __init__(self, env):
|
||||||
self.env = env
|
self.env = env
|
||||||
|
|
||||||
|
@ -2,6 +2,7 @@ from rezag.liverec import BaseRecorder
|
|||||||
from rezag.liverec import request as remote_request
|
from rezag.liverec import request as remote_request
|
||||||
|
|
||||||
from pywb.utils.timeutils import timestamp_to_datetime, datetime_to_http_date
|
from pywb.utils.timeutils import timestamp_to_datetime, datetime_to_http_date
|
||||||
|
from pywb.utils.wbexception import LiveResourceException
|
||||||
from pywb.warc.resolvingloader import ResolvingLoader
|
from pywb.warc.resolvingloader import ResolvingLoader
|
||||||
|
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
@ -29,7 +30,7 @@ def incr_reader(stream, header=None, size=8192):
|
|||||||
|
|
||||||
|
|
||||||
#=============================================================================
|
#=============================================================================
|
||||||
class WARCPathHandler(object):
|
class WARCPathLoader(object):
|
||||||
def __init__(self, paths, cdx_source):
|
def __init__(self, paths, cdx_source):
|
||||||
self.paths = paths
|
self.paths = paths
|
||||||
if isinstance(paths, str):
|
if isinstance(paths, str):
|
||||||
@ -108,7 +109,7 @@ class HeaderRecorder(BaseRecorder):
|
|||||||
|
|
||||||
|
|
||||||
#=============================================================================
|
#=============================================================================
|
||||||
class LiveWebHandler(object):
|
class LiveWebLoader(object):
|
||||||
SKIP_HEADERS = (b'link',
|
SKIP_HEADERS = (b'link',
|
||||||
b'memento-datetime',
|
b'memento-datetime',
|
||||||
b'content-location',
|
b'content-location',
|
||||||
@ -140,14 +141,17 @@ class LiveWebHandler(object):
|
|||||||
method = input_req.get_req_method()
|
method = input_req.get_req_method()
|
||||||
data = input_req.get_req_body()
|
data = input_req.get_req_body()
|
||||||
|
|
||||||
upstream_res = remote_request(url=load_url,
|
try:
|
||||||
method=method,
|
upstream_res = remote_request(url=load_url,
|
||||||
recorder=recorder,
|
method=method,
|
||||||
stream=True,
|
recorder=recorder,
|
||||||
allow_redirects=False,
|
stream=True,
|
||||||
headers=req_headers,
|
allow_redirects=False,
|
||||||
data=data,
|
headers=req_headers,
|
||||||
timeout=params.get('_timeout'))
|
data=data,
|
||||||
|
timeout=params.get('_timeout'))
|
||||||
|
except Exception:
|
||||||
|
raise LiveResourceException(load_url)
|
||||||
|
|
||||||
resp_headers = recorder.get_header()
|
resp_headers = recorder.get_header()
|
||||||
|
|
||||||
@ -175,7 +179,7 @@ class LiveWebHandler(object):
|
|||||||
return dt.strftime('%Y-%m-%dT%H:%M:%SZ')
|
return dt.strftime('%Y-%m-%dT%H:%M:%SZ')
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _make_warc_id(id_=None):
|
def _make_warc_id(id_=None): #pragma: no cover
|
||||||
if not id_:
|
if not id_:
|
||||||
id_ = uuid.uuid1()
|
id_ = uuid.uuid1()
|
||||||
return '<urn:uuid:{0}>'.format(id_)
|
return '<urn:uuid:{0}>'.format(id_)
|
||||||
|
@ -77,6 +77,7 @@ class MementoUtils(object):
|
|||||||
from_date = timestamp_to_http_date(first_cdx['timestamp'])
|
from_date = timestamp_to_http_date(first_cdx['timestamp'])
|
||||||
except StopIteration:
|
except StopIteration:
|
||||||
first_cdx = None
|
first_cdx = None
|
||||||
|
return
|
||||||
|
|
||||||
# first memento link
|
# first memento link
|
||||||
yield MementoUtils.make_timemap_memento_link(first_cdx, datetime=from_date)
|
yield MementoUtils.make_timemap_memento_link(first_cdx, datetime=from_date)
|
||||||
@ -91,4 +92,4 @@ class MementoUtils(object):
|
|||||||
|
|
||||||
# last memento link, if any
|
# last memento link, if any
|
||||||
if prev_cdx:
|
if prev_cdx:
|
||||||
yield MementoUtils.make_timemap_memento_link(prev_cdx, end='')
|
yield MementoUtils.make_timemap_memento_link(prev_cdx, end='\n')
|
||||||
|
5
setup.py
5
setup.py
@ -32,8 +32,11 @@ setup(
|
|||||||
'rezag',
|
'rezag',
|
||||||
],
|
],
|
||||||
install_requires=[
|
install_requires=[
|
||||||
'pywb',
|
'pywb==1.0b',
|
||||||
],
|
],
|
||||||
|
dependency_links=[
|
||||||
|
'git+https://github.com/ikreymer/pywb.git@py3#egg=pywb-1.0b-py3',
|
||||||
|
],
|
||||||
zip_safe=True,
|
zip_safe=True,
|
||||||
entry_points="""
|
entry_points="""
|
||||||
[console_scripts]
|
[console_scripts]
|
||||||
|
@ -33,6 +33,9 @@ def setup_module():
|
|||||||
shutil.copy(to_path('testdata/iana.cdxj'), coll_B)
|
shutil.copy(to_path('testdata/iana.cdxj'), coll_B)
|
||||||
shutil.copy(to_path('testdata/dupes.cdxj'), coll_C)
|
shutil.copy(to_path('testdata/dupes.cdxj'), coll_C)
|
||||||
|
|
||||||
|
with open(to_path(root_dir) + 'somefile', 'w') as fh:
|
||||||
|
fh.write('foo')
|
||||||
|
|
||||||
global dir_loader
|
global dir_loader
|
||||||
dir_loader = DirectoryIndexAggregator(dir_prefix, dir_path)
|
dir_loader = DirectoryIndexAggregator(dir_prefix, dir_path)
|
||||||
|
|
||||||
@ -121,7 +124,7 @@ def test_agg_dir_and_memento():
|
|||||||
'local': dir_loader}
|
'local': dir_loader}
|
||||||
agg_source = SimpleAggregator(sources)
|
agg_source = SimpleAggregator(sources)
|
||||||
|
|
||||||
res = agg_source({'url': 'example.com/', 'param.coll': '*', 'closest': '20100512', 'limit': 6})
|
res = agg_source({'url': 'example.com/', 'param.local.coll': '*', 'closest': '20100512', 'limit': 6})
|
||||||
|
|
||||||
exp = [
|
exp = [
|
||||||
{'source': 'ia', 'timestamp': '20100513052358', 'load_url': 'http://web.archive.org/web/20100513052358id_/http://example.com/'},
|
{'source': 'ia', 'timestamp': '20100513052358', 'load_url': 'http://web.archive.org/web/20100513052358id_/http://example.com/'},
|
||||||
@ -144,7 +147,7 @@ def test_agg_no_dir_1():
|
|||||||
|
|
||||||
|
|
||||||
def test_agg_no_dir_2():
|
def test_agg_no_dir_2():
|
||||||
loader = DirectoryIndexAggregator(root_dir, 'no_such')
|
loader = DirectoryIndexAggregator(root_dir, '')
|
||||||
res = loader({'url': 'example.com/', 'param.coll': 'X'})
|
res = loader({'url': 'example.com/', 'param.coll': 'X'})
|
||||||
|
|
||||||
exp = []
|
exp = []
|
||||||
@ -152,4 +155,31 @@ def test_agg_no_dir_2():
|
|||||||
assert(to_json_list(res) == exp)
|
assert(to_json_list(res) == exp)
|
||||||
|
|
||||||
|
|
||||||
|
def test_agg_dir_sources_1():
|
||||||
|
res = dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'})
|
||||||
|
exp = {'sources': {'colls/A/indexes': 'file',
|
||||||
|
'colls/B/indexes': 'file',
|
||||||
|
'colls/C/indexes': 'file'}
|
||||||
|
}
|
||||||
|
|
||||||
|
assert(res == exp)
|
||||||
|
|
||||||
|
|
||||||
|
def test_agg_dir_sources_2():
|
||||||
|
res = dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '[A,C]'})
|
||||||
|
exp = {'sources': {'colls/A/indexes': 'file',
|
||||||
|
'colls/C/indexes': 'file'}
|
||||||
|
}
|
||||||
|
|
||||||
|
assert(res == exp)
|
||||||
|
|
||||||
|
|
||||||
|
def test_agg_dir_sources_single_dir():
|
||||||
|
loader = DirectoryIndexAggregator('testdata/', '')
|
||||||
|
res = loader.get_source_list({'url': 'example.com/'})
|
||||||
|
|
||||||
|
exp = {'sources': {}}
|
||||||
|
|
||||||
|
assert(res == exp)
|
||||||
|
|
||||||
|
|
||||||
|
@ -42,13 +42,17 @@ def setup_module(self):
|
|||||||
source3 = SimpleAggregator({'example': FileIndexSource(to_path('testdata/example.cdxj'))})
|
source3 = SimpleAggregator({'example': FileIndexSource(to_path('testdata/example.cdxj'))})
|
||||||
handler3 = DefaultResourceHandler(source3, to_path('testdata/'))
|
handler3 = DefaultResourceHandler(source3, to_path('testdata/'))
|
||||||
|
|
||||||
|
|
||||||
add_route('/fallback', HandlerSeq([handler3,
|
add_route('/fallback', HandlerSeq([handler3,
|
||||||
handler2,
|
handler2,
|
||||||
live_handler]))
|
live_handler]))
|
||||||
|
|
||||||
|
add_route('/seq', HandlerSeq([handler3,
|
||||||
|
handler2]))
|
||||||
|
|
||||||
bottle.debug = True
|
add_route('/empty', HandlerSeq([]))
|
||||||
|
add_route('/invalid', HandlerSeq(['foo']))
|
||||||
|
|
||||||
|
application.debug = True
|
||||||
global testapp
|
global testapp
|
||||||
testapp = webtest.TestApp(application)
|
testapp = webtest.TestApp(application)
|
||||||
|
|
||||||
@ -61,8 +65,23 @@ class TestResAgg(object):
|
|||||||
def setup(self):
|
def setup(self):
|
||||||
self.testapp = testapp
|
self.testapp = testapp
|
||||||
|
|
||||||
|
def test_list_handlers(self):
|
||||||
|
resp = self.testapp.get('/many?mode=list_modes')
|
||||||
|
assert resp.json == {'modes': ['list_modes', 'list_sources', 'index', 'resource']}
|
||||||
|
|
||||||
|
resp = self.testapp.get('/many?mode=other')
|
||||||
|
assert resp.json == {'modes': ['list_modes', 'list_sources', 'index', 'resource']}
|
||||||
|
|
||||||
|
# defaults to resource, must specify url
|
||||||
|
resp = self.testapp.get('/many', status=400)
|
||||||
|
assert resp.json == {'message': 'The "url" param is required'}
|
||||||
|
|
||||||
|
def test_list_sources(self):
|
||||||
|
resp = self.testapp.get('/many?mode=list_sources')
|
||||||
|
assert resp.json == {'sources': {'local': 'file_dir', 'ia': 'memento', 'rhiz': 'memento', 'live': 'live'}}
|
||||||
|
|
||||||
def test_live_index(self):
|
def test_live_index(self):
|
||||||
resp = self.testapp.get('/live?url=http://httpbin.org/get&mode=index&output=json')
|
resp = self.testapp.get('/live?mode=index&url=http://httpbin.org/get&output=json')
|
||||||
resp.charset = 'utf-8'
|
resp.charset = 'utf-8'
|
||||||
|
|
||||||
res = to_json_list(resp.text)
|
res = to_json_list(resp.text)
|
||||||
@ -71,7 +90,8 @@ class TestResAgg(object):
|
|||||||
'load_url': 'http://httpbin.org/get', 'source': 'live', 'timestamp': '2016'}])
|
'load_url': 'http://httpbin.org/get', 'source': 'live', 'timestamp': '2016'}])
|
||||||
|
|
||||||
def test_live_resource(self):
|
def test_live_resource(self):
|
||||||
resp = self.testapp.get('/live?url=http://httpbin.org/get?foo=bar&mode=resource')
|
headers = {'foo': 'bar'}
|
||||||
|
resp = self.testapp.get('/live?url=http://httpbin.org/get?foo=bar', headers=headers)
|
||||||
|
|
||||||
assert resp.headers['WARC-Coll'] == 'live'
|
assert resp.headers['WARC-Coll'] == 'live'
|
||||||
assert resp.headers['WARC-Target-URI'] == 'http://httpbin.org/get?foo=bar'
|
assert resp.headers['WARC-Target-URI'] == 'http://httpbin.org/get?foo=bar'
|
||||||
@ -82,7 +102,7 @@ class TestResAgg(object):
|
|||||||
|
|
||||||
|
|
||||||
def test_live_post_resource(self):
|
def test_live_post_resource(self):
|
||||||
resp = self.testapp.post('/live?url=http://httpbin.org/post&mode=resource',
|
resp = self.testapp.post('/live?url=http://httpbin.org/post',
|
||||||
OrderedDict([('foo', 'bar')]))
|
OrderedDict([('foo', 'bar')]))
|
||||||
|
|
||||||
assert resp.headers['WARC-Coll'] == 'live'
|
assert resp.headers['WARC-Coll'] == 'live'
|
||||||
@ -204,6 +224,11 @@ foo=bar&test=abc"""
|
|||||||
assert resp.headers['WARC-Target-URI'] == 'http://example.com/'
|
assert resp.headers['WARC-Target-URI'] == 'http://example.com/'
|
||||||
assert b'HTTP/1.1 200 OK' in resp.body
|
assert b'HTTP/1.1 200 OK' in resp.body
|
||||||
|
|
||||||
|
def test_error_fallback_live_not_found(self):
|
||||||
|
resp = self.testapp.get('/fallback?url=http://invalid.url-not-found', status=400)
|
||||||
|
|
||||||
|
assert resp.json == {'message': 'http://invalid.url-not-found'}
|
||||||
|
|
||||||
def test_agg_local_revisit(self):
|
def test_agg_local_revisit(self):
|
||||||
resp = self.testapp.get('/many?url=http://www.example.com/&closest=20140127171251&sources=local')
|
resp = self.testapp.get('/many?url=http://www.example.com/&closest=20140127171251&sources=local')
|
||||||
|
|
||||||
@ -214,3 +239,24 @@ foo=bar&test=abc"""
|
|||||||
assert resp.headers['WARC-Refers-To-Date'] == '2014-01-27T17:12:00Z'
|
assert resp.headers['WARC-Refers-To-Date'] == '2014-01-27T17:12:00Z'
|
||||||
assert b'HTTP/1.1 200 OK' in resp.body
|
assert b'HTTP/1.1 200 OK' in resp.body
|
||||||
assert b'<!doctype html>' in resp.body
|
assert b'<!doctype html>' in resp.body
|
||||||
|
|
||||||
|
def test_error_invalid_index_output(self):
|
||||||
|
resp = self.testapp.get('/live?mode=index&url=http://httpbin.org/get&output=foobar', status=400)
|
||||||
|
|
||||||
|
assert resp.json == {'message': 'output=foobar not supported'}
|
||||||
|
|
||||||
|
def test_error_local_not_found(self):
|
||||||
|
resp = self.testapp.get('/many?url=http://not-found.error/&sources=local', status=404)
|
||||||
|
|
||||||
|
assert resp.json == {'message': 'No Resource Found'}
|
||||||
|
|
||||||
|
def test_error_empty(self):
|
||||||
|
resp = self.testapp.get('/empty?url=http://example.com/', status=404)
|
||||||
|
|
||||||
|
assert resp.json == {'message': 'No Resource Found'}
|
||||||
|
|
||||||
|
def test_error_invalid(self):
|
||||||
|
resp = self.testapp.get('/invalid?url=http://example.com/', status=500)
|
||||||
|
|
||||||
|
assert resp.json['message'].startswith('Internal Error')
|
||||||
|
|
||||||
|
@ -32,16 +32,20 @@ local_sources = [
|
|||||||
|
|
||||||
|
|
||||||
remote_sources = [
|
remote_sources = [
|
||||||
RemoteIndexSource('http://webenact.rhizome.org/all-cdx',
|
RemoteIndexSource('http://webenact.rhizome.org/all-cdx?url={url}',
|
||||||
'http://webenact.rhizome.org/all/{timestamp}id_/{url}'),
|
'http://webenact.rhizome.org/all/{timestamp}id_/{url}'),
|
||||||
|
|
||||||
MementoIndexSource('http://webenact.rhizome.org/all/',
|
MementoIndexSource('http://webenact.rhizome.org/all/{url}',
|
||||||
'http://webenact.rhizome.org/all/timemap/*/',
|
'http://webenact.rhizome.org/all/timemap/*/{url}',
|
||||||
'http://webenact.rhizome.org/all/{timestamp}id_/{url}')
|
'http://webenact.rhizome.org/all/{timestamp}id_/{url}')
|
||||||
]
|
]
|
||||||
|
|
||||||
|
ait_source = RemoteIndexSource('http://wayback.archive-it.org/cdx?url={url}',
|
||||||
|
'http://wayback.archive-it.org/all/{timestamp}id_/{url}')
|
||||||
|
|
||||||
|
|
||||||
def query_single_source(source, params):
|
def query_single_source(source, params):
|
||||||
|
string = str(source)
|
||||||
return SimpleAggregator({'source': source})(params)
|
return SimpleAggregator({'source': source})(params)
|
||||||
|
|
||||||
|
|
||||||
@ -182,4 +186,22 @@ def test_file_not_found():
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def test_ait_filters():
|
||||||
|
ait_source = RemoteIndexSource('http://wayback.archive-it.org/cdx/search/cdx?url={url}&filter=filename:ARCHIVEIT-({colls})-.*',
|
||||||
|
'http://wayback.archive-it.org/all/{timestamp}id_/{url}')
|
||||||
|
|
||||||
|
cdxlist = query_single_source(ait_source, {'url': 'http://iana.org/', 'param.source.colls': '5610|933'})
|
||||||
|
filenames = [cdx['filename'] for cdx in cdxlist]
|
||||||
|
|
||||||
|
prefix = ('ARCHIVEIT-5610-', 'ARCHIVEIT-933-')
|
||||||
|
|
||||||
|
assert(all([x.startswith(prefix) for x in filenames]))
|
||||||
|
|
||||||
|
|
||||||
|
cdxlist = query_single_source(ait_source, {'url': 'http://iana.org/', 'param.source.colls': '1883|366|905'})
|
||||||
|
filenames = [cdx['filename'] for cdx in cdxlist]
|
||||||
|
|
||||||
|
prefix = ('ARCHIVEIT-1883-', 'ARCHIVEIT-366-', 'ARCHIVEIT-905-')
|
||||||
|
|
||||||
|
assert(all([x.startswith(prefix) for x in filenames]))
|
||||||
|
|
||||||
|
@ -27,10 +27,11 @@ aggs = {'simple': SimpleAggregator(sources),
|
|||||||
'processes': ThreadedTimeoutAggregator(sources, timeout=5.0, use_processes=True),
|
'processes': ThreadedTimeoutAggregator(sources, timeout=5.0, use_processes=True),
|
||||||
}
|
}
|
||||||
|
|
||||||
#@pytest.mark.parametrize("agg", aggs, ids=["simple", "gevent_timeout"])
|
#def pytest_generate_tests(metafunc):
|
||||||
def pytest_generate_tests(metafunc):
|
# metafunc.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
|
||||||
metafunc.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
|
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
|
||||||
def test_mem_agg_index_1(agg):
|
def test_mem_agg_index_1(agg):
|
||||||
url = 'http://iana.org/'
|
url = 'http://iana.org/'
|
||||||
res = agg(dict(url=url, closest='20140126000000', limit=5))
|
res = agg(dict(url=url, closest='20140126000000', limit=5))
|
||||||
@ -46,6 +47,7 @@ def test_mem_agg_index_1(agg):
|
|||||||
assert(json_list(res) == exp)
|
assert(json_list(res) == exp)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
|
||||||
def test_mem_agg_index_2(agg):
|
def test_mem_agg_index_2(agg):
|
||||||
url = 'http://example.com/'
|
url = 'http://example.com/'
|
||||||
res = agg(dict(url=url, closest='20100512', limit=6))
|
res = agg(dict(url=url, closest='20100512', limit=6))
|
||||||
@ -60,6 +62,7 @@ def test_mem_agg_index_2(agg):
|
|||||||
assert(json_list(res) == exp)
|
assert(json_list(res) == exp)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
|
||||||
def test_mem_agg_index_3(agg):
|
def test_mem_agg_index_3(agg):
|
||||||
url = 'http://vvork.com/'
|
url = 'http://vvork.com/'
|
||||||
res = agg(dict(url=url, closest='20141001', limit=5))
|
res = agg(dict(url=url, closest='20141001', limit=5))
|
||||||
@ -73,6 +76,7 @@ def test_mem_agg_index_3(agg):
|
|||||||
assert(json_list(res) == exp)
|
assert(json_list(res) == exp)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
|
||||||
def test_mem_agg_index_4(agg):
|
def test_mem_agg_index_4(agg):
|
||||||
url = 'http://vvork.com/'
|
url = 'http://vvork.com/'
|
||||||
res = agg(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait'))
|
res = agg(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait'))
|
||||||
@ -83,10 +87,11 @@ def test_mem_agg_index_4(agg):
|
|||||||
assert(json_list(res) == exp)
|
assert(json_list(res) == exp)
|
||||||
|
|
||||||
|
|
||||||
def test_handler_output_cdxj(agg):
|
def test_handler_output_cdxj():
|
||||||
loader = IndexHandler(agg)
|
agg = GeventTimeoutAggregator(sources, timeout=5.0)
|
||||||
|
handler = IndexHandler(agg)
|
||||||
url = 'http://vvork.com/'
|
url = 'http://vvork.com/'
|
||||||
res = loader(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait'))
|
res = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait'))
|
||||||
|
|
||||||
exp = """\
|
exp = """\
|
||||||
com,vvork)/ 20141006184357 {"url": "http://www.vvork.com/", "mem_rel": "memento", "memento_url": "http://webenact.rhizome.org/vvork/20141006184357/http://www.vvork.com/", "load_url": "http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"}
|
com,vvork)/ 20141006184357 {"url": "http://www.vvork.com/", "mem_rel": "memento", "memento_url": "http://webenact.rhizome.org/vvork/20141006184357/http://www.vvork.com/", "load_url": "http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"}
|
||||||
@ -96,10 +101,11 @@ com,vvork)/ 20131004231540 {"url": "http://vvork.com/", "mem_rel": "last memento
|
|||||||
assert(''.join(res) == exp)
|
assert(''.join(res) == exp)
|
||||||
|
|
||||||
|
|
||||||
def test_handler_output_json(agg):
|
def test_handler_output_json():
|
||||||
loader = IndexHandler(agg)
|
agg = GeventTimeoutAggregator(sources, timeout=5.0)
|
||||||
|
handler = IndexHandler(agg)
|
||||||
url = 'http://vvork.com/'
|
url = 'http://vvork.com/'
|
||||||
res = loader(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='json'))
|
res = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='json'))
|
||||||
|
|
||||||
exp = """\
|
exp = """\
|
||||||
{"urlkey": "com,vvork)/", "timestamp": "20141006184357", "url": "http://www.vvork.com/", "mem_rel": "memento", "memento_url": "http://webenact.rhizome.org/vvork/20141006184357/http://www.vvork.com/", "load_url": "http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"}
|
{"urlkey": "com,vvork)/", "timestamp": "20141006184357", "url": "http://www.vvork.com/", "mem_rel": "memento", "memento_url": "http://webenact.rhizome.org/vvork/20141006184357/http://www.vvork.com/", "load_url": "http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"}
|
||||||
@ -109,22 +115,50 @@ def test_handler_output_json(agg):
|
|||||||
assert(''.join(res) == exp)
|
assert(''.join(res) == exp)
|
||||||
|
|
||||||
|
|
||||||
def test_handler_output_link(agg):
|
def test_handler_output_link():
|
||||||
loader = IndexHandler(agg)
|
agg = GeventTimeoutAggregator(sources, timeout=5.0)
|
||||||
|
handler = IndexHandler(agg)
|
||||||
url = 'http://vvork.com/'
|
url = 'http://vvork.com/'
|
||||||
res = loader(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='link'))
|
res = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='link'))
|
||||||
|
|
||||||
exp = """\
|
exp = """\
|
||||||
<http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/>; rel="memento"; datetime="Mon, 06 Oct 2014 18:43:57 GMT"; src="rhiz",
|
<http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/>; rel="memento"; datetime="Mon, 06 Oct 2014 18:43:57 GMT"; src="rhiz",
|
||||||
<http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/>; rel="memento"; datetime="Fri, 04 Oct 2013 23:15:40 GMT"; src="ait"\
|
<http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/>; rel="memento"; datetime="Fri, 04 Oct 2013 23:15:40 GMT"; src="ait"
|
||||||
"""
|
"""
|
||||||
assert(''.join(res) == exp)
|
assert(''.join(res) == exp)
|
||||||
|
|
||||||
|
|
||||||
def test_handler_output_text(agg):
|
def test_handler_output_link_2():
|
||||||
loader = IndexHandler(agg)
|
agg = GeventTimeoutAggregator(sources, timeout=5.0)
|
||||||
|
handler = IndexHandler(agg)
|
||||||
|
url = 'http://iana.org/'
|
||||||
|
res = handler(dict(url=url, closest='20140126000000', limit=5, output='link'))
|
||||||
|
|
||||||
|
exp = """\
|
||||||
|
<http://web.archive.org/web/20140126093743id_/http://iana.org/>; rel="memento"; datetime="Sun, 26 Jan 2014 09:37:43 GMT"; src="ia",
|
||||||
|
<filename://iana.warc.gz>; rel="memento"; datetime="Sun, 26 Jan 2014 20:06:24 GMT"; src="local",
|
||||||
|
<http://web.archive.org/web/20140123034755id_/http://iana.org/>; rel="memento"; datetime="Thu, 23 Jan 2014 03:47:55 GMT"; src="ia",
|
||||||
|
<http://web.archive.org/web/20140129175203id_/http://iana.org/>; rel="memento"; datetime="Wed, 29 Jan 2014 17:52:03 GMT"; src="ia",
|
||||||
|
<http://wayback.archive-it.org/all/20140107040552id_/http://iana.org/>; rel="memento"; datetime="Tue, 07 Jan 2014 04:05:52 GMT"; src="ait"
|
||||||
|
"""
|
||||||
|
assert(''.join(res) == exp)
|
||||||
|
|
||||||
|
|
||||||
|
def test_handler_output_link_3():
|
||||||
|
agg = GeventTimeoutAggregator(sources, timeout=5.0)
|
||||||
|
handler = IndexHandler(agg)
|
||||||
|
url = 'http://foo.bar.non-existent'
|
||||||
|
res = handler(dict(url=url, closest='20140126000000', limit=5, output='link'))
|
||||||
|
|
||||||
|
exp = ''
|
||||||
|
|
||||||
|
assert(''.join(res) == exp)
|
||||||
|
|
||||||
|
def test_handler_output_text():
|
||||||
|
agg = GeventTimeoutAggregator(sources, timeout=5.0)
|
||||||
|
handler = IndexHandler(agg)
|
||||||
url = 'http://vvork.com/'
|
url = 'http://vvork.com/'
|
||||||
res = loader(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='text'))
|
res = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='text'))
|
||||||
|
|
||||||
exp = """\
|
exp = """\
|
||||||
com,vvork)/ 20141006184357 http://www.vvork.com/ memento http://webenact.rhizome.org/vvork/20141006184357/http://www.vvork.com/ http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/ rhiz
|
com,vvork)/ 20141006184357 http://www.vvork.com/ memento http://webenact.rhizome.org/vvork/20141006184357/http://www.vvork.com/ http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/ rhiz
|
||||||
@ -133,9 +167,10 @@ com,vvork)/ 20131004231540 http://vvork.com/ last memento http://wayback.archive
|
|||||||
assert(''.join(res) == exp)
|
assert(''.join(res) == exp)
|
||||||
|
|
||||||
|
|
||||||
def test_handler_list_sources(agg):
|
def test_handler_list_sources():
|
||||||
loader = IndexHandler(agg)
|
agg = GeventTimeoutAggregator(sources, timeout=5.0)
|
||||||
res = loader(dict(mode='sources'))
|
handler = IndexHandler(agg)
|
||||||
|
res = handler(dict(mode='list_sources'))
|
||||||
|
|
||||||
assert(res == {'sources': {'bl': 'memento',
|
assert(res == {'sources': {'bl': 'memento',
|
||||||
'ait': 'memento',
|
'ait': 'memento',
|
||||||
@ -143,4 +178,3 @@ def test_handler_list_sources(agg):
|
|||||||
'rhiz': 'memento',
|
'rhiz': 'memento',
|
||||||
'local': 'file'}})
|
'local': 'file'}})
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user