2013-12-18 18:52:52 -08:00
|
|
|
import urllib
|
|
|
|
import urllib2
|
2014-02-17 10:23:37 -08:00
|
|
|
|
2014-04-04 10:09:26 -07:00
|
|
|
from pywb.utils.dsrules import DEFAULT_RULES_FILE
|
|
|
|
|
2014-03-06 18:06:05 -08:00
|
|
|
from pywb.perms.perms_filter import make_perms_cdx_filter
|
2014-03-23 12:44:28 -07:00
|
|
|
from pywb.framework.wbrequestresponse import WbResponse
|
|
|
|
from pywb.cdx.cdxserver import create_cdx_server
|
2014-03-24 14:00:06 -07:00
|
|
|
from views import MementoTimemapView
|
2014-03-23 12:44:28 -07:00
|
|
|
|
2014-01-27 21:46:38 -08:00
|
|
|
|
|
|
|
#=================================================================
|
2014-03-23 12:44:28 -07:00
|
|
|
class QueryHandler(object):
    """
    Main interface for querying the index (currently only CDX) from a
    source server (currently a cdx server)

    Creates an appropriate query based on wbrequest type info and
    returns a view for the cdx: either a raw cdx iter, an html view,
    etc...
    """

    def __init__(self, cdx_server, html_query_view=None, perms_policy=None):
        """
        :param cdx_server: object whose ``load_cdx(**params)`` executes
            the actual index query
        :param html_query_view: optional view registered for the 'html'
            output; when omitted no 'html' view is registered
        :param perms_policy: optional policy handed to
            ``make_perms_cdx_filter`` on every query
        """
        self.cdx_server = cdx_server
        self.perms_policy = perms_policy

        # output name -> view object exposing render_response()
        self.views = {}
        if html_query_view:
            self.views['html'] = html_query_view

        self.views['timemap'] = MementoTimemapView()

    @staticmethod
    def init_from_config(config,
                         ds_rules_file=DEFAULT_RULES_FILE,
                         html_view=None,
                         server_cls=None):
        """
        Build a QueryHandler from ``config``.

        If ``config`` exposes a ``get`` method, 'perms_policy' and
        'server_cls' are read from it; a 'server_cls' entry takes
        precedence over the ``server_cls`` argument. The cdx server
        itself is created by ``create_cdx_server``.
        """
        perms_policy = None

        if hasattr(config, 'get'):
            perms_policy = config.get('perms_policy')
            server_cls = config.get('server_cls', server_cls)

        cdx_server = create_cdx_server(config, ds_rules_file, server_cls)

        return QueryHandler(cdx_server, html_view, perms_policy)

    def load_for_request(self, wbrequest):
        """
        Run the index query matching ``wbrequest.wb_url``.

        Returns a ``(cdx_iter, callback)`` tuple for replay urls whose
        output is not 'text'; ``callback`` re-queries the index with
        caller-supplied params. Otherwise returns the response produced
        by ``make_cdx_response``.
        """
        wb_url = wbrequest.wb_url

        # cdx server only supports text and cdxobject for now
        if wb_url.mod == 'cdx_':
            output = 'text'
        elif wb_url.mod == 'timemap':
            output = 'timemap'
        elif wb_url.is_query():
            output = 'html'
        else:
            output = 'cdxobject'

        # init standard params
        params = self.get_query_params(wb_url)

        params['allowFuzzy'] = True
        params['url'] = wb_url.url
        params['output'] = output

        cdx_iter = self.load_cdx(wbrequest, params)

        if output != 'text' and wb_url.is_replay():
            return (cdx_iter, self.cdx_load_callback(wbrequest))

        return self.make_cdx_response(wbrequest, params, cdx_iter)

    def load_cdx(self, wbrequest, params):
        """
        Query the cdx server with ``params`` and return its result.

        ``params`` is modified in place: request-level filters and custom
        params are merged in (when ``wbrequest`` is truthy), and a
        permissions op is attached as 'custom_ops' when a perms policy is
        configured and yields one.
        """
        if wbrequest:
            # add any custom filter from the request
            if wbrequest.query_filter:
                # Not every query type defines a 'filter' list (url
                # queries do not, nor must params passed to the callback
                # from cdx_load_callback), so create it on demand instead
                # of failing with a KeyError.
                params.setdefault('filter', []).extend(
                    wbrequest.query_filter)

            if wbrequest.custom_params:
                params.update(wbrequest.custom_params)

        if self.perms_policy:
            perms_op = make_perms_cdx_filter(self.perms_policy, wbrequest)
            if perms_op:
                params['custom_ops'] = [perms_op]

        return self.cdx_server.load_cdx(**params)

    def make_cdx_response(self, wbrequest, params, cdx_iter):
        """
        Wrap ``cdx_iter`` in a response.

        Uses the view registered for ``params['output']`` when the output
        is not 'text' and such a view exists; otherwise streams the
        iterator as text.
        """
        output = params['output']

        # if not text, the iterator is assumed to be CDXObjects
        if output and output != 'text':
            view = self.views.get(output)
            if view:
                return view.render_response(wbrequest, cdx_iter)

        return WbResponse.text_stream(cdx_iter)

    def cdx_load_callback(self, wbrequest):
        """
        Return a function ``load_cdx(params)`` bound to ``wbrequest`` that
        forces 'cdxobject' output and runs the query via ``self.load_cdx``.
        """
        def load_cdx(params):
            params['output'] = 'cdxobject'
            return self.load_cdx(wbrequest, params)

        return load_cdx

    def get_query_params(self,
                         wburl, limit=150000,
                         collapse_time=None,
                         replay_closest=100):
        """
        Return the base cdx query params for ``wburl.type``.

        :param wburl: url object providing ``type``, ``timestamp`` and the
            QUERY / URL_QUERY / REPLAY / LATEST_REPLAY type constants
        :param limit: 'limit' used for QUERY and URL_QUERY
        :param collapse_time: 'collapseTime' used for QUERY
        :param replay_closest: 'limit' used for REPLAY
        :raises KeyError: if ``wburl.type`` is not one of the four types

        A fresh dict (with fresh 'filter' lists) is built on every call,
        so callers may mutate the result freely.
        """
        return {
            wburl.QUERY:
                {'collapseTime': collapse_time,
                 'filter': ['!statuscode:(500|502|504)'],
                 'limit': limit,
                },

            wburl.URL_QUERY:
                {'collapse': 'urlkey',
                 'matchType': 'prefix',
                 'showGroupCount': True,
                 'showUniqCount': True,
                 'lastSkipTimestamp': True,
                 'limit': limit,
                 'fl': ('urlkey,original,timestamp,' +
                        'endtimestamp,groupcount,uniqcount'),
                },

            wburl.REPLAY:
                {'sort': 'closest',
                 'filter': ['!statuscode:(500|502|504)'],
                 'limit': replay_closest,
                 'closest': wburl.timestamp,
                 'resolveRevisits': True,
                },

            wburl.LATEST_REPLAY:
                {'sort': 'reverse',
                 'filter': ['statuscode:[23]..'],
                 'limit': '1',
                 'resolveRevisits': True,
                }

        }[wburl.type]
|