mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-21 19:12:10 +01:00
121 lines
3.7 KiB
Python
121 lines
3.7 KiB
Python
|
from rezag.utils import MementoUtils
|
||
|
from pywb.warc.recordloader import ArchiveLoadFailed
|
||
|
from rezag.responseloader import WARCPathHandler, LiveWebHandler
|
||
|
from bottle import response
|
||
|
|
||
|
|
||
|
#=============================================================================
|
||
|
def to_cdxj(cdx_iter, fields):
|
||
|
response.headers['Content-Type'] = 'text/x-cdxj'
|
||
|
return [cdx.to_cdxj(fields) for cdx in cdx_iter]
|
||
|
|
||
|
def to_json(cdx_iter, fields):
|
||
|
response.headers['Content-Type'] = 'application/x-ndjson'
|
||
|
return [cdx.to_json(fields) for cdx in cdx_iter]
|
||
|
|
||
|
def to_text(cdx_iter, fields):
|
||
|
response.headers['Content-Type'] = 'text/plain'
|
||
|
return [cdx.to_text(fields) for cdx in cdx_iter]
|
||
|
|
||
|
def to_link(cdx_iter, fields):
|
||
|
response.headers['Content-Type'] = 'application/link'
|
||
|
return MementoUtils.make_timemap(cdx_iter)
|
||
|
|
||
|
|
||
|
#=============================================================================
|
||
|
class IndexHandler(object):
|
||
|
OUTPUTS = {
|
||
|
'cdxj': to_cdxj,
|
||
|
'json': to_json,
|
||
|
'text': to_text,
|
||
|
'link': to_link,
|
||
|
}
|
||
|
|
||
|
DEF_OUTPUT = 'cdxj'
|
||
|
|
||
|
def __init__(self, index_source, opts=None):
|
||
|
self.index_source = index_source
|
||
|
self.opts = opts or {}
|
||
|
|
||
|
def __call__(self, params):
|
||
|
if params.get('mode') == 'sources':
|
||
|
srcs = self.index_source.get_sources(params)
|
||
|
result = [(name, str(value)) for name, value in srcs]
|
||
|
result = {'sources': dict(result)}
|
||
|
return result
|
||
|
|
||
|
input_req = params.get('_input_req')
|
||
|
if input_req:
|
||
|
params['url'] = input_req.include_post_query()
|
||
|
|
||
|
cdx_iter = self.index_source(params)
|
||
|
|
||
|
output = params.get('output', self.DEF_OUTPUT)
|
||
|
fields = params.get('fields')
|
||
|
|
||
|
handler = self.OUTPUTS.get(output)
|
||
|
if not handler:
|
||
|
handler = self.OUTPUTS[self.DEF_OUTPUT]
|
||
|
|
||
|
res = handler(cdx_iter, fields)
|
||
|
return res
|
||
|
|
||
|
|
||
|
#=============================================================================
|
||
|
class ResourceHandler(IndexHandler):
|
||
|
def __init__(self, index_source, resource_loaders):
|
||
|
super(ResourceHandler, self).__init__(index_source)
|
||
|
self.resource_loaders = resource_loaders
|
||
|
|
||
|
def __call__(self, params):
|
||
|
if params.get('mode', 'resource') != 'resource':
|
||
|
return super(ResourceHandler, self).__call__(params)
|
||
|
|
||
|
cdx_iter = self.index_source(params)
|
||
|
|
||
|
any_found = False
|
||
|
|
||
|
for cdx in cdx_iter:
|
||
|
any_found = True
|
||
|
cdx['coll'] = params.get('coll', '')
|
||
|
|
||
|
for loader in self.resource_loaders:
|
||
|
try:
|
||
|
resp = loader(cdx, params)
|
||
|
if resp:
|
||
|
return resp
|
||
|
except ArchiveLoadFailed as e:
|
||
|
print(e)
|
||
|
pass
|
||
|
|
||
|
if any_found:
|
||
|
raise ArchiveLoadFailed('Resource Found, could not be Loaded')
|
||
|
else:
|
||
|
raise ArchiveLoadFailed('No Resource Found')
|
||
|
|
||
|
|
||
|
#=============================================================================
|
||
|
class DefaultResourceHandler(ResourceHandler):
|
||
|
def __init__(self, index_source, warc_paths=''):
|
||
|
loaders = [WARCPathHandler(warc_paths, index_source),
|
||
|
LiveWebHandler()
|
||
|
]
|
||
|
super(DefaultResourceHandler, self).__init__(index_source, loaders)
|
||
|
|
||
|
|
||
|
#=============================================================================
|
||
|
class HandlerSeq(object):
|
||
|
def __init__(self, loaders):
|
||
|
self.loaders = loaders
|
||
|
|
||
|
def __call__(self, params):
|
||
|
for loader in self.loaders:
|
||
|
try:
|
||
|
res = loader(params)
|
||
|
if res:
|
||
|
return res
|
||
|
except ArchiveLoadFailed:
|
||
|
pass
|
||
|
|
||
|
raise ArchiveLoadFailed('No Resource Found')
|