2016-03-03 11:55:43 -08:00
|
|
|
from webagg.responseloader import WARCPathLoader, LiveWebLoader
|
|
|
|
from webagg.utils import MementoUtils
|
2016-02-29 12:34:06 -08:00
|
|
|
from pywb.utils.wbexception import BadRequestException, WbException
|
|
|
|
from pywb.utils.wbexception import NotFoundException
|
2016-02-26 18:25:10 -08:00
|
|
|
from bottle import response
|
|
|
|
|
|
|
|
|
|
|
|
#=============================================================================
|
|
|
|
def to_cdxj(cdx_iter, fields):
    """Serialize each record of ``cdx_iter`` as CDXJ.

    Sets the ``Content-Type`` header of the bottle ``response`` to
    ``text/x-cdxj`` and returns a list holding the result of
    ``cdx.to_cdxj(fields)`` for every record, in iteration order.
    """
    response.headers['Content-Type'] = 'text/x-cdxj'

    lines = []
    for cdx in cdx_iter:
        lines.append(cdx.to_cdxj(fields))
    return lines
|
|
|
|
|
|
|
|
def to_json(cdx_iter, fields):
    """Serialize each record of ``cdx_iter`` as a JSON line.

    Sets the ``Content-Type`` header of the bottle ``response`` to
    ``application/x-ndjson`` and returns a list holding the result of
    ``cdx.to_json(fields)`` for every record, in iteration order.
    """
    response.headers['Content-Type'] = 'application/x-ndjson'

    lines = []
    for cdx in cdx_iter:
        lines.append(cdx.to_json(fields))
    return lines
|
|
|
|
|
|
|
|
def to_text(cdx_iter, fields):
    """Serialize each record of ``cdx_iter`` as plain text.

    Sets the ``Content-Type`` header of the bottle ``response`` to
    ``text/plain`` and returns a list holding the result of
    ``cdx.to_text(fields)`` for every record, in iteration order.
    """
    response.headers['Content-Type'] = 'text/plain'

    lines = []
    for cdx in cdx_iter:
        lines.append(cdx.to_text(fields))
    return lines
|
|
|
|
|
|
|
|
def to_link(cdx_iter, fields):
    """Render ``cdx_iter`` as a Memento TimeMap.

    Sets the ``Content-Type`` header of the bottle ``response`` and returns
    whatever ``MementoUtils.make_timemap(cdx_iter)`` produces.

    ``fields`` is accepted only so the signature matches the other output
    serializers (``to_cdxj``, ``to_json``, ``to_text``); it is not used.
    """
    # Memento TimeMaps (RFC 7089) are serialized in the link-format defined
    # by RFC 6690, whose registered media type is 'application/link-format'.
    # The previous value, 'application/link', is not a registered type.
    response.headers['Content-Type'] = 'application/link-format'
    return MementoUtils.make_timemap(cdx_iter)
|
|
|
|
|
|
|
|
|
|
|
|
#=============================================================================
|
|
|
|
class IndexHandler(object):
    """Answer index queries by serializing the records of an index source.

    Instances are callables taking a ``params`` mapping and returning a
    ``(result, errs)`` pair.  ``index_source`` is called with ``params`` and
    unpacked as ``(cdx_iter, errs)``; it must also provide
    ``get_source_list(params)`` for the ``list_sources`` mode.
    """

    # Maps the value of the ``output`` param to its serializer function.
    OUTPUTS = {
        'cdxj': to_cdxj,
        'json': to_json,
        'text': to_text,
        'link': to_link,
    }

    # Serializer key used when ``params`` has no ``output`` entry.
    DEF_OUTPUT = 'cdxj'

    def __init__(self, index_source, opts=None):
        """Store the index source and the options (``{}`` when falsy)."""
        self.index_source = index_source
        self.opts = opts if opts else {}

    def get_supported_modes(self):
        """Return a fresh dict listing the modes this handler understands."""
        return {'modes': ['list_sources', 'index']}

    def _load_index_source(self, params):
        """Query the index source for ``params['url']``.

        Returns the index source's own return value when a url is given;
        otherwise ``(None, errs)`` where ``errs['last_exc']`` is a
        ``BadRequestException``.  When ``params`` carries a truthy
        ``_input_req``, ``params['alt_url']`` is set (in place) from
        ``input_req.include_post_query(url)`` before the query.
        """
        url = params.get('url')
        if url:
            input_req = params.get('_input_req')
            if input_req:
                params['alt_url'] = input_req.include_post_query(url)

            return self.index_source(params)

        missing_url = BadRequestException('The "url" param is required')
        return None, {'last_exc': missing_url}

    def __call__(self, params):
        """Dispatch on ``params['mode']`` (default ``'index'``).

        * ``list_sources`` -> ``(index_source.get_source_list(params), {})``
        * any mode other than ``index`` -> ``(get_supported_modes(), {})``
        * ``index`` -> ``(serialized records, errs)``, or ``(None, errs)``
          when the requested output is unsupported or no records were loaded.
        """
        mode = params.get('mode', 'index')

        if mode == 'list_sources':
            return self.index_source.get_source_list(params), {}
        elif mode != 'index':
            return self.get_supported_modes(), {}

        output = params.get('output', self.DEF_OUTPUT)
        fields = params.get('fields')

        serializer = self.OUTPUTS.get(output)
        if not serializer:
            message = 'output={0} not supported'.format(output)
            return None, {'last_exc': BadRequestException(message)}

        cdx_iter, errs = self._load_index_source(params)
        if cdx_iter:
            return serializer(cdx_iter, fields), errs

        return None, errs
|
2016-02-26 18:25:10 -08:00
|
|
|
|
|
|
|
|
|
|
|
#=============================================================================
|
|
|
|
class ResourceHandler(IndexHandler):
    """Index handler that can also load a resource for an index record.

    ``resource_loaders`` is an iterable of callables invoked as
    ``loader(cdx, params)``; a non-``None`` return value is treated as the
    loaded response.
    """

    def __init__(self, index_source, resource_loaders):
        """Initialise the base handler and remember the loaders."""
        super(ResourceHandler, self).__init__(index_source)
        self.resource_loaders = resource_loaders

    def get_supported_modes(self):
        """Return the base handler's modes with ``'resource'`` appended."""
        supported = super(ResourceHandler, self).get_supported_modes()
        supported['modes'].append('resource')
        return supported

    def __call__(self, params):
        """Handle ``params``; the mode defaults to ``'resource'`` here.

        Any other mode is delegated to ``IndexHandler.__call__``.  In
        resource mode every record is offered to every loader, in order, and
        the first non-``None`` response is returned as ``(resp, errs)``.
        A ``WbException`` raised by a loader is recorded in ``errs`` under
        ``str(loader)``; the most recent one is also stored as
        ``errs['last_exc']`` when nothing could be loaded, in which case the
        result is ``(None, errs)``.
        """
        mode = params.get('mode', 'resource')
        if mode != 'resource':
            return super(ResourceHandler, self).__call__(params)

        cdx_iter, errs = self._load_index_source(params)
        if not cdx_iter:
            return None, errs

        last_exc = None

        for cdx in cdx_iter:
            for loader in self.resource_loaders:
                try:
                    resp = loader(cdx, params)
                except WbException as exc:
                    # Remember the failure and move on to the next loader.
                    last_exc = exc
                    errs[str(loader)] = repr(exc)
                else:
                    if resp is not None:
                        return resp, errs

        if last_exc:
            errs['last_exc'] = last_exc

        return None, errs
|
2016-02-26 18:25:10 -08:00
|
|
|
|
|
|
|
|
|
|
|
#=============================================================================
|
|
|
|
class DefaultResourceHandler(ResourceHandler):
    """Resource handler preconfigured with a WARC loader and a live loader."""

    def __init__(self, index_source, warc_paths=''):
        """Build the two default loaders and hand them to the base class.

        The loaders are tried in this order: ``WARCPathLoader(warc_paths,
        index_source)`` first, then ``LiveWebLoader()``.
        """
        warc_loader = WARCPathLoader(warc_paths, index_source)
        live_loader = LiveWebLoader()

        super(DefaultResourceHandler, self).__init__(
            index_source, [warc_loader, live_loader])
|
|
|
|
|
|
|
|
|
|
|
|
#=============================================================================
|
|
|
|
class HandlerSeq(object):
    """Try a sequence of handlers in order until one yields a result."""

    def __init__(self, handlers):
        """Store the ordered sequence of handler callables."""
        self.handlers = handlers

    def get_supported_modes(self):
        """Return the first handler's supported modes, or ``{}`` if empty."""
        if not self.handlers:
            return {}

        return self.handlers[0].get_supported_modes()

    def __call__(self, params):
        """Call each handler with ``params`` until a result is not ``None``.

        Returns ``(res, all_errs)`` where ``all_errs`` merges the error dicts
        of every handler that was called (later entries overwrite earlier
        ones with the same key).  ``res`` is ``None`` when no handler
        produced a result.
        """
        all_errs = {}
        res = None

        for handler in self.handlers:
            res, errs = handler(params)
            all_errs.update(errs)
            if res is not None:
                # First successful handler wins; skip the remaining ones.
                break

        return res, all_errs
|
|
|
|
|
2016-02-26 18:25:10 -08:00
|
|
|
|