mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-28 00:25:21 +01:00
244 lines
7.6 KiB
Python
244 lines
7.6 KiB
Python
from canonicalize import UrlCanonicalizer, calc_search_range
|
|
|
|
from cdxops import cdx_load
|
|
from cdxsource import CDXSource, CDXFile, RemoteCDXSource, RedisCDXSource
|
|
from zipnum import ZipNumCluster
|
|
from cdxobject import CDXObject, CaptureNotFoundException, CDXException
|
|
from cdxdomainspecific import load_domain_specific_cdx_rules
|
|
|
|
from pywb.utils.loaders import is_http
|
|
|
|
from itertools import chain
|
|
import logging
|
|
import os
|
|
import urlparse
|
|
|
|
|
|
#=================================================================
|
|
class BaseCDXServer(object):
    """
    Common base for cdx servers.

    Stores the url canonicalizer, the optional fuzzy-query handler and the
    optional permissions checker, and implements the shared handling of
    empty query results. Subclasses must implement load_cdx().
    """

    def __init__(self, **kwargs):
        """ Configure canonicalization, fuzzy matching and perms checking

        Recognized keyword arguments:
          ds_rules      -- when truthy, the canonicalizer and fuzzy query
                           are loaded via load_domain_specific_cdx_rules
          surt_ordered  -- passed to the rules loader and to the default
                           canonicalizer (default: True)
          url_canon     -- custom canonicalizer (read only without ds_rules)
          fuzzy_query   -- custom fuzzy query (read only without ds_rules)
          perms_checker -- optional permissions checker, stored as-is
        """
        ds_rules = kwargs.get('ds_rules')
        surt_ordered = kwargs.get('surt_ordered', True)

        # load from domain-specific rules
        if ds_rules:
            self.url_canon, self.fuzzy_query = (
                load_domain_specific_cdx_rules(ds_rules, surt_ordered))
        # or custom passed in canonicalizer
        else:
            self.url_canon = kwargs.get('url_canon')
            self.fuzzy_query = kwargs.get('fuzzy_query')

        # set default canonicalizer if none set thus far
        if not self.url_canon:
            self.url_canon = UrlCanonicalizer(surt_ordered)

        # set perms checker, if any
        self.perms_checker = kwargs.get('perms_checker')

    def _check_cdx_iter(self, cdx_iter, params):
        """ Check cdx iter semantics
        If iter is empty (no matches), check if fuzzy matching
        is allowed, and try it -- otherwise,
        throw CaptureNotFoundException

        Returns an iterator over the same items as cdx_iter when it is
        non-empty, or the result of a second load_cdx() call made with
        the params produced by the fuzzy query.
        """
        cdx_iter = self.peek_iter(cdx_iter)

        if cdx_iter is not None:
            return cdx_iter

        # a query made by 'key' only carries no 'url'; that must end in
        # CaptureNotFoundException below, not in a KeyError here
        url = params.get('url')

        if url is not None and self.fuzzy_query and params.get('allowFuzzy'):
            if 'key' not in params:
                params['key'] = self.url_canon(url)

            fuzzy_params = self.fuzzy_query(params)
            if fuzzy_params:
                # the retry must not trigger another fuzzy attempt
                fuzzy_params['allowFuzzy'] = False
                return self.load_cdx(**fuzzy_params)

        target = url if url is not None else (params.get('key') or '')
        msg = 'No Captures found for: ' + target
        raise CaptureNotFoundException(msg)

    def load_cdx(self, **params):
        """ Run a cdx query for the given params -- must be overridden """
        raise NotImplementedError('Implement in subclass')

    @staticmethod
    def peek_iter(iterable):
        """ Return None if iterable yields nothing, otherwise an
        iterator that yields all of its items (including the first).

        Accepts any iterable, not only iterators.
        """
        # iter() is a no-op for iterators and makes lists/tuples work too
        iterator = iter(iterable)

        try:
            first = next(iterator)
        except StopIteration:
            return None

        # put the consumed item back in front of the remainder
        return chain([first], iterator)
|
|
|
|
|
|
#=================================================================
|
|
class CDXServer(BaseCDXServer):
    """
    Top-level cdx server object which maintains a list of cdx sources,
    responds to queries and dispatches to the cdx ops for processing
    """

    def __init__(self, paths, **kwargs):
        """ Set up the base server from kwargs, then build the list of
        cdx sources from paths (using the optional 'config' kwarg)
        """
        super(CDXServer, self).__init__(**kwargs)
        self.sources = create_cdx_sources(paths, kwargs.get('config'))

    def load_cdx(self, **params):
        """ Query the configured sources with the given params

        When no 'key' is supplied, 'url' is required: the search range
        for it is computed and stored as 'key' / 'end_key' in params.
        Raises CDXException if neither 'key' nor 'url' is given.
        """
        # no key set: 'url' must be present and is turned into a key range
        if not params.get('key'):
            if 'url' not in params:
                msg = 'A url= param must be specified to query the cdx server'
                raise CDXException(msg)

            search_range = calc_search_range(
                url=params['url'],
                match_type=params.get('matchType', 'exact'),
                url_canon=self.url_canon)

            params['key'], params['end_key'] = search_range

        results = cdx_load(self.sources, params, self.perms_checker)

        # empty results go through the fuzzy / not-found handling
        return self._check_cdx_iter(results, params)

    def __str__(self):
        return 'CDX server serving from ' + str(self.sources)
|
|
|
|
|
|
#=================================================================
|
|
class RemoteCDXServer(BaseCDXServer):
    """
    A special cdx server that uses a single RemoteCDXSource
    It simply proxies the query params to the remote source
    and performs no local processing/filtering
    """

    def __init__(self, source, **kwargs):
        """ source is either a RemoteCDXSource instance or an
        http:// / https:// url string, which is wrapped in a
        RemoteCDXSource. Anything else raises Exception.
        """
        super(RemoteCDXServer, self).__init__(**kwargs)

        if isinstance(source, RemoteCDXSource):
            self.source = source
        elif (isinstance(source, str) and
              source.startswith(('http://', 'https://'))):
            self.source = RemoteCDXSource(source)
        else:
            raise Exception('Invalid remote cdx source: ' + str(source))

    def load_cdx(self, **params):
        """ Forward params to the remote source and return its results,
        applying the shared empty-result handling of the base class
        """
        remote_iter = self.source.load_cdx(params)

        # if need raw, convert to raw format here
        if params.get('output') == 'raw':
            remote_iter = (CDXObject(cdx) for cdx in remote_iter)

        return self._check_cdx_iter(remote_iter, params)

    def __str__(self):
        # this class keeps a single 'source' attribute; there is no
        # 'sources' list to index into
        return 'Remote CDX server serving from ' + str(self.source)
|
|
|
|
|
|
#=================================================================
|
|
def create_cdx_server(config, ds_rules_file=None):
    """ Build a cdx server from config

    config is either an object with a get() method (read for
    'index_paths', 'surt_ordered' and 'perms_checker', and passed on
    as the server's config) or the index path(s) themselves.

    A single string path for which is_http() is true yields a
    RemoteCDXServer; everything else yields a CDXServer.
    """
    if not hasattr(config, 'get'):
        # config is just the path(s): use defaults for everything else
        paths, pass_config = config, None
        surt_ordered, perms_checker = True, None
    else:
        paths = config.get('index_paths')
        surt_ordered = config.get('surt_ordered', True)
        perms_checker = config.get('perms_checker')
        pass_config = config

    logging.debug('CDX Surt-Ordered? ' + str(surt_ordered))

    use_remote = isinstance(paths, str) and is_http(paths)
    server_cls = RemoteCDXServer if use_remote else CDXServer

    options = dict(config=pass_config,
                   surt_ordered=surt_ordered,
                   ds_rules=ds_rules_file,
                   perms_checker=perms_checker)

    return server_cls(paths, **options)
|
|
|
|
|
|
#=================================================================
|
|
def create_cdx_sources(paths, config=None):
    """ Build the list of cdx sources for the given path(s)

    paths  -- a single entry or a list of entries; each entry is either
              a CDXSource instance or a string. A string naming a
              directory contributes one candidate per directory entry;
              any other string is a single candidate.
    config -- passed through to add_cdx_source for each candidate

    Returns the (possibly empty) list of sources; an error is logged
    when nothing could be created.
    """
    sources = []

    if not isinstance(paths, list):
        paths = [paths]

    for path in paths:
        if isinstance(path, CDXSource):
            add_cdx_source(sources, path, config)
        elif isinstance(path, str):
            if os.path.isdir(path):
                for name in os.listdir(path):
                    # join() inserts the separator when the directory
                    # was given without a trailing slash
                    add_cdx_source(sources,
                                   os.path.join(path, name),
                                   config)
            else:
                add_cdx_source(sources, path, config)

    if not sources:
        # not inside an except block, so there is no traceback to log;
        # report the requested paths (sources is known to be empty)
        logging.error('No CDX Sources Found from: ' + str(paths))

    return sources
|
|
|
|
|
|
#=================================================================
|
|
def add_cdx_source(sources, source, config):
    """ Append source to the sources list

    A CDXSource instance is appended as-is. Anything else is first
    converted with create_cdx_source(); if that yields nothing, the
    list is left unchanged.
    """
    if isinstance(source, CDXSource):
        cdx_source = source
    else:
        cdx_source = create_cdx_source(source, config)
        # unrecognized source: nothing to add
        if not cdx_source:
            return

    logging.debug('Adding CDX Source: ' + str(cdx_source))
    sources.append(cdx_source)
|
|
|
|
|
|
#=================================================================
|
|
def create_cdx_source(filename, config):
    """ Create the cdx source matching filename, or return None

    Checked in order:
      is_http(filename)   -> RemoteCDXSource(filename)
      'redis://' prefix   -> RedisCDXSource(filename, config)
      '.cdx' suffix       -> CDXFile(filename)
      '.summary' suffix   -> ZipNumCluster(filename, config)
    """
    if is_http(filename):
        cdx_source = RemoteCDXSource(filename)
    elif filename.startswith('redis://'):
        cdx_source = RedisCDXSource(filename, config)
    elif filename.endswith('.cdx'):
        cdx_source = CDXFile(filename)
    elif filename.endswith('.summary'):
        cdx_source = ZipNumCluster(filename, config)
    else:
        # not a recognized source type
        cdx_source = None

    return cdx_source
|
|
|
|
|
|
#=================================================================
|
|
def extract_params_from_wsgi_env(env):
    """ utility function to extract params from the query
    string of a WSGI environment dictionary

    Returns a dict in which every param maps to its first value,
    except 'filter', which keeps the full list of values.
    'output' defaults to 'text' when not present in the query.
    A missing QUERY_STRING is treated as an empty query.
    """
    # use url= param to get actual url
    params = urlparse.parse_qs(env.get('QUERY_STRING', ''))

    # parse_qs produces arrays for single values
    # cdx processing expects singleton params for all params,
    # except filters, so convert here
    # use first value of the list
    # (iterate over a snapshot, since params is updated in the loop)
    for name, val in list(params.items()):
        if name != 'filter':
            params[name] = val[0]

    # apply the default only after flattening: doing it before would
    # reduce the plain string 'text' to its first character
    if 'output' not in params:
        params['output'] = 'text'

    return params
|