1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

misc fixes: add route listing, more not found tests, timemap use file:// with ranges

This commit is contained in:
Ilya Kreymer 2016-03-01 14:46:05 -08:00
parent 008e5284b1
commit 1f3763d02c
10 changed files with 108 additions and 52 deletions

View File

@ -59,11 +59,14 @@ class BaseAggregator(object):
params['_all_src_params'] = src_params params['_all_src_params'] = src_params
def load_child_source(self, name, source, all_params): def load_child_source_list(self, name, source, params):
return list(self.load_child_source(name, source, params))
def load_child_source(self, name, source, params):
try: try:
_src_params = all_params['_all_src_params'].get(name) _src_params = params['_all_src_params'].get(name)
all_params['_src_params'] = _src_params params['_src_params'] = _src_params
cdx_iter = source.load_index(all_params) cdx_iter = source.load_index(params)
except NotFoundException as nf: except NotFoundException as nf:
print('Not found in ' + name) print('Not found in ' + name)
cdx_iter = iter([]) cdx_iter = iter([])
@ -75,10 +78,10 @@ class BaseAggregator(object):
cdx['source'] = name cdx['source'] = name
return cdx return cdx
return [add_name(cdx) for cdx in cdx_iter] return (add_name(cdx) for cdx in cdx_iter)
def load_index(self, params): def load_index(self, params):
iter_list = list(self._load_all(params)) iter_list = self._load_all(params)
#optimization: if only a single entry (or empty) just load directly #optimization: if only a single entry (or empty) just load directly
if len(iter_list) <= 1: if len(iter_list) <= 1:
@ -130,9 +133,9 @@ class SeqAggMixin(object):
def _load_all(self, params): def _load_all(self, params):
sources = list(self._iter_sources(params)) sources = self._iter_sources(params)
return list([self.load_child_source(name, source, params) return [self.load_child_source(name, source, params)
for name, source in sources]) for name, source in sources]
#============================================================================= #=============================================================================
@ -232,7 +235,7 @@ class ConcurrentMixin(object):
with self.pool_class(max_workers=self.size) as executor: with self.pool_class(max_workers=self.size) as executor:
def do_spawn(name, source): def do_spawn(name, source):
return executor.submit(self.load_child_source, return executor.submit(self.load_child_source_list,
name, source, params), name name, source, params), name
jobs = dict([do_spawn(name, source) for name, source in sources]) jobs = dict([do_spawn(name, source) for name, source in sources])
@ -255,10 +258,10 @@ class ThreadedTimeoutAggregator(TimeoutMixin, ConcurrentMixin, BaseSourceListAgg
#============================================================================= #=============================================================================
class BaseDirectoryIndexAggregator(BaseAggregator): class BaseDirectoryIndexSource(BaseAggregator):
CDX_EXT = ('.cdx', '.cdxj') CDX_EXT = ('.cdx', '.cdxj')
def __init__(self, base_prefix, base_dir): def __init__(self, base_prefix, base_dir=''):
self.base_prefix = base_prefix self.base_prefix = base_prefix
self.base_dir = base_dir self.base_dir = base_dir
@ -299,7 +302,7 @@ class BaseDirectoryIndexAggregator(BaseAggregator):
return 'file_dir' return 'file_dir'
class DirectoryIndexAggregator(SeqAggMixin, BaseDirectoryIndexAggregator): class DirectoryIndexSource(SeqAggMixin, BaseDirectoryIndexSource):
pass pass

View File

@ -1,5 +1,6 @@
from rezag.inputrequest import DirectWSGIInputRequest, POSTInputRequest from rezag.inputrequest import DirectWSGIInputRequest, POSTInputRequest
from bottle import route, request, response, default_app, abort from bottle import route, request, response, default_app, abort
import bottle
from pywb.utils.wbexception import WbException from pywb.utils.wbexception import WbException
@ -11,37 +12,53 @@ def err_handler(exc):
response.content_type = 'application/json' response.content_type = 'application/json'
return json.dumps({'message': exc.body}) return json.dumps({'message': exc.body})
def wrap_error(func): def wrap_error(func):
def do_d(*args, **kwargs): def wrap_func(*args, **kwargs):
try: try:
return func(*args, **kwargs) return func(*args, **kwargs)
except WbException as exc: except WbException as exc:
if application.debug: if bottle.debug:
traceback.print_exc() traceback.print_exc()
abort(exc.status(), exc.msg) abort(exc.status(), exc.msg)
except Exception as e: except Exception as e:
if application.debug: if bottle.debug:
traceback.print_exc() traceback.print_exc()
abort(500, 'Internal Error: ' + str(e)) abort(500, 'Internal Error: ' + str(e))
return do_d return wrap_func
route_dict = {}
def add_route(path, handler): def add_route(path, handler):
@route(path, 'ANY')
@wrap_error @wrap_error
def direct_input_request(mode=''): def direct_input_request():
params = dict(request.query) params = dict(request.query)
params['_input_req'] = DirectWSGIInputRequest(request.environ) params['_input_req'] = DirectWSGIInputRequest(request.environ)
return handler(params) return handler(params)
@route(path + '/postreq', 'POST')
@wrap_error @wrap_error
def post_fullrequest(mode=''): def post_fullrequest():
params = dict(request.query) params = dict(request.query)
params['_input_req'] = POSTInputRequest(request.environ) params['_input_req'] = POSTInputRequest(request.environ)
return handler(params) return handler(params)
route(path + '/postreq', method=['POST'], callback=post_fullrequest) global route_dict
route(path, method=['ANY'], callback=direct_input_request) handler_dict = {'handler': handler.get_supported_modes()}
route_dict[path] = handler_dict
route_dict[path + '/postreq'] = handler_dict
@route('/')
def list_routes():
return route_dict
application = default_app() application = default_app()

View File

@ -7,7 +7,7 @@ from bottle import response
#============================================================================= #=============================================================================
def to_cdxj(cdx_iter, fields): def to_cdxj(cdx_iter, fields):
response.headers['Content-Type'] = 'application/x-cdxj' response.headers['Content-Type'] = 'text/x-cdxj'
return [cdx.to_cdxj(fields) for cdx in cdx_iter] return [cdx.to_cdxj(fields) for cdx in cdx_iter]
def to_json(cdx_iter, fields): def to_json(cdx_iter, fields):
@ -120,6 +120,10 @@ class HandlerSeq(object):
def __init__(self, handlers): def __init__(self, handlers):
self.handlers = handlers self.handlers = handlers
def get_supported_modes(self):
return []
# return zip([self.handlers.get_supported_modes()]
def __call__(self, params): def __call__(self, params):
last_exc = None last_exc = None
for handler in self.handlers: for handler in self.handlers:

View File

@ -45,10 +45,13 @@ class FileIndexSource(BaseIndexSource):
except IOError: except IOError:
raise NotFoundException(filename) raise NotFoundException(filename)
with fh: def do_load(fh):
gen = iter_range(fh, params['key'], params['end_key']) with fh:
for line in gen: gen = iter_range(fh, params['key'], params['end_key'])
yield CDXObject(line) for line in gen:
yield CDXObject(line)
return do_load(fh)
def __str__(self): def __str__(self):
return 'file' return 'file'
@ -62,7 +65,6 @@ class RemoteIndexSource(BaseIndexSource):
def load_index(self, params): def load_index(self, params):
api_url = self.res_template(self.api_url_template, params) api_url = self.res_template(self.api_url_template, params)
print('API URL', api_url)
r = requests.get(api_url, timeout=params.get('_timeout')) r = requests.get(api_url, timeout=params.get('_timeout'))
if r.status_code >= 400: if r.status_code >= 400:
raise NotFoundException(api_url) raise NotFoundException(api_url)

View File

@ -12,21 +12,37 @@ import uuid
#============================================================================= #=============================================================================
def incr_reader(stream, header=None, size=8192): class StreamIter(object):
if header: def __init__(self, stream, header=None, size=8192):
yield header self.stream = stream
self.header = header
self.size = size
while True: def __iter__(self):
data = stream.read(size) return self
def __next__(self):
if self.header:
header = self.header
self.header = None
return header
data = self.stream.read(self.size)
if data: if data:
yield data return data
else:
break
try: self.close()
stream.close() raise StopIteration
except:
pass def close(self):
if not self.stream:
return
try:
self.stream.close()
self.stream = None
except Exception:
pass
#============================================================================= #=============================================================================
@ -83,7 +99,8 @@ class WARCPathLoader(object):
response.headers['WARC-Refers-To-Date'] = payload.rec_headers.get_header('WARC-Date') response.headers['WARC-Refers-To-Date'] = payload.rec_headers.get_header('WARC-Date')
headers.stream.close() headers.stream.close()
return incr_reader(record.stream) res = StreamIter(record.stream)
return res
#============================================================================= #=============================================================================
@ -172,7 +189,7 @@ class LiveWebLoader(object):
except: except:
raise raise
return incr_reader(upstream_res.raw, header=resp_headers) return StreamIter(upstream_res.raw, header=resp_headers)
@staticmethod @staticmethod
def _make_date(dt): def _make_date(dt):

View File

@ -59,7 +59,7 @@ class MementoUtils(object):
def make_timemap_memento_link(cdx, datetime=None, rel='memento', end=',\n'): def make_timemap_memento_link(cdx, datetime=None, rel='memento', end=',\n'):
url = cdx.get('load_url') url = cdx.get('load_url')
if not url: if not url:
url = 'filename://' + cdx.get('filename') url = 'file://{0}:{1}:{2}'.format(cdx.get('filename'), cdx.get('offset'), cdx.get('length'))
memento = '<{0}>; rel="{1}"; datetime="{2}"; src="{3}"' + end memento = '<{0}>; rel="{1}"; datetime="{2}"; src="{3}"' + end

View File

@ -5,7 +5,7 @@ import json
from .testutils import to_path from .testutils import to_path
from rezag.aggindexsource import DirectoryIndexAggregator, SimpleAggregator from rezag.aggindexsource import DirectoryIndexSource, SimpleAggregator
from rezag.indexsource import MementoIndexSource from rezag.indexsource import MementoIndexSource
@ -37,7 +37,7 @@ def setup_module():
fh.write('foo') fh.write('foo')
global dir_loader global dir_loader
dir_loader = DirectoryIndexAggregator(dir_prefix, dir_path) dir_loader = DirectoryIndexSource(dir_prefix, dir_path)
global orig_cwd global orig_cwd
orig_cwd = os.getcwd() orig_cwd = os.getcwd()
@ -147,7 +147,7 @@ def test_agg_no_dir_1():
def test_agg_no_dir_2(): def test_agg_no_dir_2():
loader = DirectoryIndexAggregator(root_dir, '') loader = DirectoryIndexSource(root_dir, '')
res = loader({'url': 'example.com/', 'param.coll': 'X'}) res = loader({'url': 'example.com/', 'param.coll': 'X'})
exp = [] exp = []
@ -175,7 +175,7 @@ def test_agg_dir_sources_2():
def test_agg_dir_sources_single_dir(): def test_agg_dir_sources_single_dir():
loader = DirectoryIndexAggregator('testdata/', '') loader = DirectoryIndexSource('testdata/', '')
res = loader.get_source_list({'url': 'example.com/'}) res = loader.get_source_list({'url': 'example.com/'})
exp = {'sources': {}} exp = {'sources': {}}

View File

@ -6,7 +6,7 @@ from rezag.handlers import DefaultResourceHandler, HandlerSeq
from rezag.indexsource import MementoIndexSource, FileIndexSource, LiveIndexSource from rezag.indexsource import MementoIndexSource, FileIndexSource, LiveIndexSource
from rezag.aggindexsource import GeventTimeoutAggregator, SimpleAggregator from rezag.aggindexsource import GeventTimeoutAggregator, SimpleAggregator
from rezag.aggindexsource import DirectoryIndexAggregator from rezag.aggindexsource import DirectoryIndexSource
from rezag.app import add_route, application from rezag.app import add_route, application
@ -18,7 +18,7 @@ from .testutils import to_path
import json import json
sources = { sources = {
'local': DirectoryIndexAggregator(to_path('testdata/'), ''), 'local': DirectoryIndexSource(to_path('testdata/'), ''),
'ia': MementoIndexSource.from_timegate_url('http://web.archive.org/web/'), 'ia': MementoIndexSource.from_timegate_url('http://web.archive.org/web/'),
'rhiz': MementoIndexSource.from_timegate_url('http://webenact.rhizome.org/vvork/', path='*'), 'rhiz': MementoIndexSource.from_timegate_url('http://webenact.rhizome.org/vvork/', path='*'),
'live': LiveIndexSource(), 'live': LiveIndexSource(),

View File

@ -162,7 +162,6 @@ def test_all_not_found(source):
assert(key_ts_res(res) == expected) assert(key_ts_res(res) == expected)
# ============================================================================ # ============================================================================
def test_another_remote_not_found(): def test_another_remote_not_found():
source = MementoIndexSource.from_timegate_url('http://www.webarchive.org.uk/wayback/archive/') source = MementoIndexSource.from_timegate_url('http://www.webarchive.org.uk/wayback/archive/')
@ -180,12 +179,11 @@ def test_file_not_found():
url = 'http://x-not-found-x.notfound/' url = 'http://x-not-found-x.notfound/'
res = query_single_source(source, dict(url=url, limit=3)) res = query_single_source(source, dict(url=url, limit=3))
expected = '' expected = ''
assert(key_ts_res(res) == expected) assert(key_ts_res(res) == expected)
# ============================================================================
def test_ait_filters(): def test_ait_filters():
ait_source = RemoteIndexSource('http://wayback.archive-it.org/cdx/search/cdx?url={url}&filter=filename:ARCHIVEIT-({colls})-.*', ait_source = RemoteIndexSource('http://wayback.archive-it.org/cdx/search/cdx?url={url}&filter=filename:ARCHIVEIT-({colls})-.*',
'http://wayback.archive-it.org/all/{timestamp}id_/{url}') 'http://wayback.archive-it.org/all/{timestamp}id_/{url}')

View File

@ -27,6 +27,13 @@ aggs = {'simple': SimpleAggregator(sources),
'processes': ThreadedTimeoutAggregator(sources, timeout=5.0, use_processes=True), 'processes': ThreadedTimeoutAggregator(sources, timeout=5.0, use_processes=True),
} }
nf = {'notfound': FileIndexSource(to_path('testdata/not-found-x'))}
agg_nf = {'simple': SimpleAggregator(nf),
'gevent': GeventTimeoutAggregator(nf, timeout=5.0),
'threaded': ThreadedTimeoutAggregator(nf, timeout=5.0),
'processes': ThreadedTimeoutAggregator(nf, timeout=5.0, use_processes=True),
}
#def pytest_generate_tests(metafunc): #def pytest_generate_tests(metafunc):
# metafunc.parametrize("agg", list(aggs.values()), ids=list(aggs.keys())) # metafunc.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
@ -87,6 +94,14 @@ def test_mem_agg_index_4(agg):
assert(json_list(res) == exp) assert(json_list(res) == exp)
@pytest.mark.parametrize("agg", list(agg_nf.values()), ids=list(agg_nf.keys()))
def test_mem_agg_not_found(agg):
url = 'http://vvork.com/'
res = agg(dict(url=url, closest='20141001', limit=2))
assert(json_list(res) == [])
def test_handler_output_cdxj(): def test_handler_output_cdxj():
agg = GeventTimeoutAggregator(sources, timeout=5.0) agg = GeventTimeoutAggregator(sources, timeout=5.0)
handler = IndexHandler(agg) handler = IndexHandler(agg)
@ -136,7 +151,7 @@ def test_handler_output_link_2():
exp = """\ exp = """\
<http://web.archive.org/web/20140126093743id_/http://iana.org/>; rel="memento"; datetime="Sun, 26 Jan 2014 09:37:43 GMT"; src="ia", <http://web.archive.org/web/20140126093743id_/http://iana.org/>; rel="memento"; datetime="Sun, 26 Jan 2014 09:37:43 GMT"; src="ia",
<filename://iana.warc.gz>; rel="memento"; datetime="Sun, 26 Jan 2014 20:06:24 GMT"; src="local", <file://iana.warc.gz:334:2258>; rel="memento"; datetime="Sun, 26 Jan 2014 20:06:24 GMT"; src="local",
<http://web.archive.org/web/20140123034755id_/http://iana.org/>; rel="memento"; datetime="Thu, 23 Jan 2014 03:47:55 GMT"; src="ia", <http://web.archive.org/web/20140123034755id_/http://iana.org/>; rel="memento"; datetime="Thu, 23 Jan 2014 03:47:55 GMT"; src="ia",
<http://web.archive.org/web/20140129175203id_/http://iana.org/>; rel="memento"; datetime="Wed, 29 Jan 2014 17:52:03 GMT"; src="ia", <http://web.archive.org/web/20140129175203id_/http://iana.org/>; rel="memento"; datetime="Wed, 29 Jan 2014 17:52:03 GMT"; src="ia",
<http://wayback.archive-it.org/all/20140107040552id_/http://iana.org/>; rel="memento"; datetime="Tue, 07 Jan 2014 04:05:52 GMT"; src="ait" <http://wayback.archive-it.org/all/20140107040552id_/http://iana.org/>; rel="memento"; datetime="Tue, 07 Jan 2014 04:05:52 GMT"; src="ait"