mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
cleanup cdx server config, refactored such that
a cdx server need implement a single interface: load_cdx(self, **params) CDXServer and RemoteCDXServer distinct classes in cdxserver.py utility function cdxserver.create_cdx_server() to create appropriate server based on input
This commit is contained in:
parent
94f1dc3be5
commit
abea504b04
@ -7,6 +7,7 @@ import os
|
||||
import urlparse
|
||||
|
||||
from cdxsource import CDXSource, CDXFile, RemoteCDXSource
|
||||
from cdxobject import CDXObject
|
||||
|
||||
|
||||
#=================================================================
|
||||
@ -22,70 +23,22 @@ class AccessException(CDXException):
|
||||
|
||||
|
||||
#=================================================================
|
||||
class CDXServer:
|
||||
class CDXServer(object):
|
||||
"""
|
||||
Top-level cdx server object which maintains a list of cdx sources,
|
||||
responds to queries and dispatches to the cdx ops for processing
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def create_from_config(config):
|
||||
paths = config.get('index_paths')
|
||||
surt_ordered = config.get('surt_ordered', True)
|
||||
return CDXServer(paths, surt_ordered)
|
||||
|
||||
def __init__(self, sources, surt_ordered=True):
|
||||
self.sources = []
|
||||
def __init__(self, paths, surt_ordered=True):
|
||||
self.sources = create_cdx_sources(paths)
|
||||
self.surt_ordered = surt_ordered
|
||||
|
||||
logging.debug('CDX Surt-Ordered? ' + str(surt_ordered))
|
||||
|
||||
if not isinstance(sources, list):
|
||||
sources = [sources]
|
||||
|
||||
for src in sources:
|
||||
if isinstance(src, CDXSource):
|
||||
self.add_cdx_source(src)
|
||||
elif isinstance(src, str):
|
||||
if os.path.isdir(src):
|
||||
for file in os.listdir(src):
|
||||
self.add_cdx_source(src + file)
|
||||
else:
|
||||
self.add_cdx_source(src)
|
||||
|
||||
if len(self.sources) == 0:
|
||||
logging.exception('No CDX Sources Found from: ' + str(sources))
|
||||
|
||||
def add_cdx_source(self, source):
|
||||
if not isinstance(source, CDXSource):
|
||||
source = self.create_cdx_source(source)
|
||||
if not source:
|
||||
return
|
||||
|
||||
logging.debug('Adding CDX Source: ' + str(source))
|
||||
self.sources.append(source)
|
||||
|
||||
@staticmethod
|
||||
def create_cdx_source(filename):
|
||||
if filename.startswith('http://') or filename.startswith('https://'):
|
||||
return RemoteCDXSource(filename)
|
||||
|
||||
if filename.endswith('.cdx'):
|
||||
return CDXFile(filename)
|
||||
|
||||
return None
|
||||
#TODO: support zipnum
|
||||
#elif filename.endswith('.summary')
|
||||
# return ZipNumCDXSource(filename)
|
||||
#elif filename.startswith('redis://')
|
||||
# return RedisCDXSource(filename)
|
||||
|
||||
def load_cdx(self, **params):
|
||||
# if key not set, assume 'url' is set and needs canonicalization
|
||||
if not params.get('key'):
|
||||
params['key'] = self._canonicalize(params)
|
||||
|
||||
self._convert_old_style(params)
|
||||
convert_old_style_params(params)
|
||||
|
||||
return cdx_load(self.sources, params)
|
||||
|
||||
@ -112,43 +65,145 @@ class CDXServer:
|
||||
|
||||
return key
|
||||
|
||||
def _convert_old_style(self, params):
|
||||
"""
|
||||
Convert old-style CDX Server param semantics
|
||||
"""
|
||||
collapse_time = params.get('collapseTime')
|
||||
if collapse_time:
|
||||
params['collapse_time'] = collapse_time
|
||||
def __str__(self):
|
||||
return 'CDX server serving from ' + str(self.sources)
|
||||
|
||||
resolve_revisits = params.get('resolveRevisits')
|
||||
if resolve_revisits:
|
||||
params['resolve_revisits'] = resolve_revisits
|
||||
|
||||
if params.get('sort') == 'reverse':
|
||||
params['reverse'] = True
|
||||
#=================================================================
|
||||
class RemoteCDXServer(object):
|
||||
"""
|
||||
A special cdx server that uses a single RemoteCDXSource
|
||||
It simply proxies the query params to the remote source
|
||||
and performs no local processing/filtering
|
||||
"""
|
||||
def __init__(self, source):
|
||||
if isinstance(source, RemoteCDXSource):
|
||||
self.source = source
|
||||
elif (isinstance(source, str) and
|
||||
any(source.startswith(x) for x in ['http://', 'https://'])):
|
||||
self.source = RemoteCDXSource(source)
|
||||
else:
|
||||
raise Exception('Invalid remote cdx source: ' + str(source))
|
||||
|
||||
def load_cdx_from_request(self, env):
|
||||
#url = wbrequest.wb_url.url
|
||||
|
||||
# use url= param to get actual url
|
||||
params = urlparse.parse_qs(env['QUERY_STRING'])
|
||||
|
||||
if not 'output' in params:
|
||||
params['output'] = 'text'
|
||||
|
||||
# parse_qs produces arrays for single values
|
||||
# cdx processing expects singleton params for all params,
|
||||
# except filters, so convert here
|
||||
# use first value of the list
|
||||
for name, val in params.iteritems():
|
||||
if name != 'filter':
|
||||
params[name] = val[0]
|
||||
|
||||
cdx_lines = self.load_cdx(**params)
|
||||
return cdx_lines
|
||||
def load_cdx(self, **params):
|
||||
remote_iter = remote.load_cdx(**params)
|
||||
# if need raw, convert to raw format here
|
||||
if params.get('output') == 'raw':
|
||||
return (CDXObject(cdx) for cdx in remote_iter)
|
||||
else:
|
||||
return remote_iter
|
||||
|
||||
def __str__(self):
|
||||
return 'load cdx indexes from ' + str(self.sources)
|
||||
return 'Remote CDX server serving from ' + str(self.sources[0])
|
||||
|
||||
|
||||
#=================================================================
|
||||
def create_cdx_server(config):
|
||||
if hasattr(config, 'get'):
|
||||
paths = config.get('index_paths')
|
||||
surt_ordered = config.get('surt_ordered', True)
|
||||
else:
|
||||
paths = config
|
||||
surt_ordered = True
|
||||
|
||||
logging.debug('CDX Surt-Ordered? ' + str(surt_ordered))
|
||||
|
||||
if (isinstance(paths, str) and
|
||||
any(paths.startswith(x) for x in ['http://', 'https://'])):
|
||||
return RemoteCDXServer(paths)
|
||||
else:
|
||||
return CDXServer(paths)
|
||||
|
||||
|
||||
#=================================================================
|
||||
def create_cdx_sources(paths):
|
||||
sources = []
|
||||
|
||||
if not isinstance(paths, list):
|
||||
paths = [paths]
|
||||
|
||||
for path in paths:
|
||||
if isinstance(path, CDXSource):
|
||||
add_cdx_source(sources, path)
|
||||
elif isinstance(path, str):
|
||||
if os.path.isdir(path):
|
||||
for file in os.listdir(path):
|
||||
add_cdx_source(sources, path + file)
|
||||
else:
|
||||
add_cdx_source(sources, path)
|
||||
|
||||
if len(sources) == 0:
|
||||
logging.exception('No CDX Sources Found from: ' + str(sources))
|
||||
|
||||
return sources
|
||||
|
||||
|
||||
#=================================================================
|
||||
def add_cdx_source(sources, source):
|
||||
if not isinstance(source, CDXSource):
|
||||
source = create_cdx_source(source)
|
||||
if not source:
|
||||
return
|
||||
|
||||
logging.debug('Adding CDX Source: ' + str(source))
|
||||
sources.append(source)
|
||||
|
||||
|
||||
#=================================================================
|
||||
def create_cdx_source(filename):
|
||||
if filename.startswith('http://') or filename.startswith('https://'):
|
||||
return RemoteCDXSource(filename)
|
||||
|
||||
if filename.endswith('.cdx'):
|
||||
return CDXFile(filename)
|
||||
|
||||
return None
|
||||
#TODO: support zipnum
|
||||
#elif filename.endswith('.summary')
|
||||
# return ZipNumCDXSource(filename)
|
||||
#elif filename.startswith('redis://')
|
||||
# return RedisCDXSource(filename)
|
||||
|
||||
|
||||
#=================================================================
|
||||
def convert_old_style_params(params):
|
||||
"""
|
||||
Convert old-style CDX Server param semantics
|
||||
"""
|
||||
collapse_time = params.get('collapseTime')
|
||||
if collapse_time:
|
||||
params['collapse_time'] = collapse_time
|
||||
|
||||
resolve_revisits = params.get('resolveRevisits')
|
||||
if resolve_revisits:
|
||||
params['resolve_revisits'] = resolve_revisits
|
||||
|
||||
if params.get('sort') == 'reverse':
|
||||
params['reverse'] = True
|
||||
|
||||
return params
|
||||
|
||||
|
||||
#=================================================================
|
||||
def extract_params_from_wsgi_env(env):
|
||||
""" utility function to extract params from the query
|
||||
string of a WSGI environment dictionary
|
||||
"""
|
||||
# use url= param to get actual url
|
||||
params = urlparse.parse_qs(env['QUERY_STRING'])
|
||||
|
||||
if not 'output' in params:
|
||||
params['output'] = 'text'
|
||||
|
||||
# parse_qs produces arrays for single values
|
||||
# cdx processing expects singleton params for all params,
|
||||
# except filters, so convert here
|
||||
# use first value of the list
|
||||
for name, val in params.iteritems():
|
||||
if name != 'filter':
|
||||
params[name] = val[0]
|
||||
|
||||
return params
|
||||
|
||||
|
||||
#=================================================================
|
||||
|
@ -1,3 +1,3 @@
|
||||
#CDX Server WSGI App Config
|
||||
index_paths: ./sample_data/
|
||||
index_paths: ./sample_archive/cdx/
|
||||
port: 8090
|
||||
|
@ -1,38 +1,42 @@
|
||||
from cdxserver import CDXServer
|
||||
from cdxserver import create_cdx_server, extract_params_from_wsgi_env
|
||||
from pywb import get_test_dir
|
||||
|
||||
import logging
|
||||
import os
|
||||
import yaml
|
||||
import pkgutil
|
||||
|
||||
#=================================================================
|
||||
TEST_CDX_DIR = os.path.dirname(os.path.realpath(__file__)) + '/../sample_data/'
|
||||
|
||||
CONFIG_FILE = 'config.yaml'
|
||||
|
||||
DEFAULT_PORT = 8080
|
||||
|
||||
config = None
|
||||
if __package__:
|
||||
config = pkgutil.get_data(__package__, CONFIG_FILE)
|
||||
config = yaml.load(config)
|
||||
else:
|
||||
config = None
|
||||
try:
|
||||
config = pkgutil.get_data(__package__, CONFIG_FILE)
|
||||
config = yaml.load(config)
|
||||
except:
|
||||
pass
|
||||
|
||||
|
||||
#=================================================================
|
||||
def main():
|
||||
def main(paths=None):
|
||||
logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s',
|
||||
level=logging.DEBUG)
|
||||
|
||||
cdx_config = config.get('index_paths') if config else None
|
||||
if not paths:
|
||||
if config:
|
||||
paths = config
|
||||
else:
|
||||
paths = get_test_dir() + 'cdx/'
|
||||
|
||||
if not cdx_config:
|
||||
cdx_config = [TEST_CDX_DIR]
|
||||
|
||||
cdxserver = CDXServer(cdx_config)
|
||||
cdxserver = create_cdx_server(paths)
|
||||
|
||||
def application(env, start_response):
|
||||
try:
|
||||
response = cdxserver.load_cdx_from_request(env)
|
||||
params = extract_params_from_wsgi_env(env)
|
||||
response = cdxserver.load_cdx(**params)
|
||||
start_response('200 OK', [('Content-Type', 'text/plain')])
|
||||
|
||||
response = list(response)
|
||||
|
@ -4,6 +4,7 @@ import mimetypes
|
||||
import time
|
||||
|
||||
from pywb.rewrite.wburl import WbUrl
|
||||
from pywb.cdx.cdxserver import extract_params_from_wsgi_env
|
||||
from wbrequestresponse import WbResponse
|
||||
from wbexceptions import WbException, NotFoundException
|
||||
from views import TextCapturesView
|
||||
@ -69,7 +70,8 @@ class CDXHandler(BaseHandler):
|
||||
self.view = view if view else TextCapturesView()
|
||||
|
||||
def __call__(self, wbrequest):
|
||||
cdx_lines = self.index_reader.cdx_server.load_cdx_from_request(wbrequest.env)
|
||||
params = extract_params_from_wsgi_env(wbrequest.env)
|
||||
cdx_lines = self.index_reader.load_cdx(**params)
|
||||
|
||||
return self.view.render_response(wbrequest, cdx_lines)
|
||||
|
||||
|
@ -6,16 +6,13 @@ from wbexceptions import NotFoundException
|
||||
from itertools import chain
|
||||
from pprint import pprint
|
||||
|
||||
from pywb.cdx.cdxserver import CDXServer, CDXException
|
||||
from pywb.cdx.cdxserver import create_cdx_server, CDXException
|
||||
from pywb.cdx.cdxobject import CDXObject
|
||||
|
||||
#=================================================================
|
||||
class IndexReader(object):
|
||||
def __init__(self, config):
|
||||
if isinstance(config, str):
|
||||
self.cdx_server = CDXServer(config)
|
||||
else:
|
||||
self.cdx_server = CDXServer.create_from_config(config)
|
||||
self.cdx_server = create_cdx_server(config)
|
||||
|
||||
def load_for_request(self, wbrequest):
|
||||
wburl = wbrequest.wb_url
|
||||
@ -76,12 +73,3 @@ class IndexReader(object):
|
||||
return None
|
||||
|
||||
return chain([first], iterable)
|
||||
|
||||
#=================================================================
|
||||
class RemoteCDXServer(IndexReader):
|
||||
def __init__(self, remote_url, cookie=None):
|
||||
self.remote = RemoteCDXSource(remote_url=remote_url, cookie=cookie, proxy_all=True)
|
||||
self.cdx_server = CDXServer(self.remote)
|
||||
|
||||
#def load_cdx(self, **params):
|
||||
#return remote.load_cdx(**params)
|
||||
|
2
setup.py
2
setup.py
@ -13,7 +13,7 @@ setuptools.setup(name='pywb',
|
||||
license='GPL',
|
||||
packages=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'],
|
||||
provides=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'],
|
||||
package_data={'pywb': ['ui/*', 'static/*']},
|
||||
package_data={'pywb': ['ui/*', 'static/*'], 'pywb.cdx': ['*.yaml']},
|
||||
data_files = [('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')),
|
||||
('sample_archive/warcs/', glob.glob('sample_archive/warcs/*'))],
|
||||
install_requires=['uwsgi', 'rfc3987', 'chardet', 'redis', 'jinja2', 'surt', 'pyyaml', 'WebTest','pytest'],
|
||||
|
Loading…
x
Reference in New Issue
Block a user