mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
record_loader can optionally parse 'request' records; archiveindexer has -a flag to write all records ('request' included), -p flag to append post query; post-test.warc.gz and cdx; POST redirects using 307
153 lines
4.7 KiB
Python
153 lines
4.7 KiB
Python
import urllib
|
|
import urllib2
|
|
|
|
from pywb.utils.dsrules import DEFAULT_RULES_FILE
|
|
|
|
from pywb.perms.perms_filter import make_perms_cdx_filter
|
|
from pywb.framework.wbrequestresponse import WbResponse
|
|
from pywb.cdx.cdxserver import create_cdx_server
|
|
from views import MementoTimemapView
|
|
|
|
|
|
#=================================================================
|
|
class QueryHandler(object):
    """
    Main interface for querying the index (currently only CDX) from a
    source server (currently a cdx server).

    Builds an appropriate query based on the wbrequest type info and
    returns a view for the cdx: either a raw cdx iter, an html view,
    etc...
    """

    def __init__(self, cdx_server, html_query_view=None, perms_policy=None):
        """
        :param cdx_server: object exposing ``load_cdx(**params)``
        :param html_query_view: optional view registered for 'html' output
        :param perms_policy: optional policy handed to
            ``make_perms_cdx_filter`` on every cdx load
        """
        self.cdx_server = cdx_server
        self.perms_policy = perms_policy

        # output name -> view object used by make_cdx_response()
        self.views = {}
        if html_query_view:
            self.views['html'] = html_query_view

        self.views['timemap'] = MementoTimemapView()

    @staticmethod
    def init_from_config(config,
                         ds_rules_file=DEFAULT_RULES_FILE,
                         html_view=None,
                         server_cls=None):
        """
        Create a QueryHandler whose cdx server is built from ``config``.

        If ``config`` has a ``get`` method (dict-like), 'perms_policy'
        and 'server_cls' are read from it; a 'server_cls' entry overrides
        the ``server_cls`` argument.
        """
        perms_policy = None

        if hasattr(config, 'get'):
            perms_policy = config.get('perms_policy')
            server_cls = config.get('server_cls', server_cls)

        cdx_server = create_cdx_server(config, ds_rules_file, server_cls)

        return QueryHandler(cdx_server, html_view, perms_policy)

    def load_for_request(self, wbrequest):
        """
        Run the cdx query implied by ``wbrequest.wb_url``.

        For a replay url with non-text output, returns a tuple of
        ``(cdx_iter, callback)`` where the callback performs further
        cdx loads for the same request. Otherwise returns the response
        produced by make_cdx_response().
        """
        wbrequest.normalize_post_query()

        wb_url = wbrequest.wb_url

        # cdx server only supports text and cdxobject for now
        if wb_url.mod == 'cdx_':
            output = 'text'
        elif wb_url.mod == 'timemap':
            output = 'timemap'
        elif wb_url.is_query():
            output = 'html'
        else:
            output = 'cdxobject'

        # init standard params
        params = self.get_query_params(wb_url)

        params['allowFuzzy'] = True
        params['url'] = wb_url.url
        params['output'] = output

        cdx_iter = self.load_cdx(wbrequest, params)

        if output != 'text' and wb_url.is_replay():
            return (cdx_iter, self.cdx_load_callback(wbrequest))

        return self.make_cdx_response(wbrequest, cdx_iter, params['output'])

    def load_cdx(self, wbrequest, params):
        """
        Apply request-specific and permission settings to ``params``
        (modified in place) and return ``cdx_server.load_cdx(**params)``.

        ``wbrequest`` may be falsy, in which case no request-level
        filters or custom params are applied.
        """
        if wbrequest:
            # add any custom filter from the request
            if wbrequest.query_filter:
                # Not every param set carries a 'filter' entry (the
                # URL_QUERY set and caller-supplied callback params may
                # not), so merge defensively instead of assuming a list
                # is already present.
                current = params.get('filter')
                if not current:
                    merged = []
                elif isinstance(current, (list, tuple)):
                    merged = list(current)
                else:
                    merged = [current]

                merged.extend(wbrequest.query_filter)
                params['filter'] = merged

            if wbrequest.custom_params:
                params.update(wbrequest.custom_params)

        if self.perms_policy:
            perms_op = make_perms_cdx_filter(self.perms_policy, wbrequest)
            if perms_op:
                params['custom_ops'] = [perms_op]

        cdx_iter = self.cdx_server.load_cdx(**params)
        return cdx_iter

    def make_cdx_response(self, wbrequest, cdx_iter, output):
        """
        Render ``cdx_iter`` with the view registered for ``output``.

        Falls back to ``WbResponse.text_stream`` when output is empty,
        is 'text', or has no registered view.
        """
        # if not text, the iterator is assumed to be CDXObjects
        if output and output != 'text':
            view = self.views.get(output)
            if view:
                return view.render_response(wbrequest, cdx_iter)

        return WbResponse.text_stream(cdx_iter)

    def cdx_load_callback(self, wbrequest):
        """
        Return a function taking a params dict that forces
        'cdxobject' output and loads the cdx for ``wbrequest``.
        """
        def load_cdx(params):
            params['output'] = 'cdxobject'
            return self.load_cdx(wbrequest, params)

        return load_cdx

    def get_query_params(self,
                         wburl, limit=150000,
                         collapse_time=None,
                         replay_closest=100):
        """
        Return a new dict of cdx query params selected by ``wburl.type``.

        :param wburl: url object providing the QUERY, URL_QUERY, REPLAY
            and LATEST_REPLAY constants plus ``type`` and ``timestamp``
        :param limit: 'limit' used for QUERY and URL_QUERY
        :param collapse_time: 'collapseTime' used for QUERY
        :param replay_closest: 'limit' used for REPLAY
        :raises KeyError: if ``wburl.type`` is not one of the four types
        """

        #if wburl.type == wburl.URL_QUERY:
        #    raise NotImplementedError('Url Query Not Yet Supported')

        return {
            wburl.QUERY:
                {'collapseTime': collapse_time,
                 'filter': ['!statuscode:(500|502|504)'],
                 'limit': limit,
                },

            wburl.URL_QUERY:
                {'collapse': 'urlkey',
                 'matchType': 'prefix',
                 'showGroupCount': True,
                 'showUniqCount': True,
                 'lastSkipTimestamp': True,
                 'limit': limit,
                 'fl': ('urlkey,original,timestamp,' +
                        'endtimestamp,groupcount,uniqcount'),
                },

            wburl.REPLAY:
                {'sort': 'closest',
                 'filter': ['!statuscode:(500|502|504)'],
                 'limit': replay_closest,
                 'closest': wburl.timestamp,
                 'resolveRevisits': True,
                },

            wburl.LATEST_REPLAY:
                {'sort': 'reverse',
                 'filter': ['statuscode:[23]..'],
                 'limit': '1',
                 'resolveRevisits': True,
                }

        }[wburl.type]
|