1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

Clean up cdx server config; refactored so that

a cdx server need only implement a single interface:
load_cdx(self, **params)

CDXServer and RemoteCDXServer are now distinct classes in cdxserver.py.
A utility function, cdxserver.create_cdx_server(), creates
the appropriate server based on its input.
This commit is contained in:
Ilya Kreymer 2014-02-17 13:58:02 -08:00
parent 94f1dc3be5
commit abea504b04
6 changed files with 164 additions and 115 deletions

View File

@ -7,6 +7,7 @@ import os
import urlparse import urlparse
from cdxsource import CDXSource, CDXFile, RemoteCDXSource from cdxsource import CDXSource, CDXFile, RemoteCDXSource
from cdxobject import CDXObject
#================================================================= #=================================================================
@ -22,70 +23,22 @@ class AccessException(CDXException):
#================================================================= #=================================================================
class CDXServer: class CDXServer(object):
""" """
Top-level cdx server object which maintains a list of cdx sources, Top-level cdx server object which maintains a list of cdx sources,
responds to queries and dispatches to the cdx ops for processing responds to queries and dispatches to the cdx ops for processing
""" """
@staticmethod def __init__(self, paths, surt_ordered=True):
def create_from_config(config): self.sources = create_cdx_sources(paths)
paths = config.get('index_paths')
surt_ordered = config.get('surt_ordered', True)
return CDXServer(paths, surt_ordered)
def __init__(self, sources, surt_ordered=True):
self.sources = []
self.surt_ordered = surt_ordered self.surt_ordered = surt_ordered
logging.debug('CDX Surt-Ordered? ' + str(surt_ordered))
if not isinstance(sources, list):
sources = [sources]
for src in sources:
if isinstance(src, CDXSource):
self.add_cdx_source(src)
elif isinstance(src, str):
if os.path.isdir(src):
for file in os.listdir(src):
self.add_cdx_source(src + file)
else:
self.add_cdx_source(src)
if len(self.sources) == 0:
logging.exception('No CDX Sources Found from: ' + str(sources))
def add_cdx_source(self, source):
if not isinstance(source, CDXSource):
source = self.create_cdx_source(source)
if not source:
return
logging.debug('Adding CDX Source: ' + str(source))
self.sources.append(source)
@staticmethod
def create_cdx_source(filename):
if filename.startswith('http://') or filename.startswith('https://'):
return RemoteCDXSource(filename)
if filename.endswith('.cdx'):
return CDXFile(filename)
return None
#TODO: support zipnum
#elif filename.endswith('.summary')
# return ZipNumCDXSource(filename)
#elif filename.startswith('redis://')
# return RedisCDXSource(filename)
def load_cdx(self, **params): def load_cdx(self, **params):
# if key not set, assume 'url' is set and needs canonicalization # if key not set, assume 'url' is set and needs canonicalization
if not params.get('key'): if not params.get('key'):
params['key'] = self._canonicalize(params) params['key'] = self._canonicalize(params)
self._convert_old_style(params) convert_old_style_params(params)
return cdx_load(self.sources, params) return cdx_load(self.sources, params)
@ -112,43 +65,145 @@ class CDXServer:
return key return key
def _convert_old_style(self, params): def __str__(self):
""" return 'CDX server serving from ' + str(self.sources)
Convert old-style CDX Server param semantics
"""
collapse_time = params.get('collapseTime')
if collapse_time:
params['collapse_time'] = collapse_time
resolve_revisits = params.get('resolveRevisits')
if resolve_revisits:
params['resolve_revisits'] = resolve_revisits
if params.get('sort') == 'reverse': #=================================================================
params['reverse'] = True class RemoteCDXServer(object):
"""
A special cdx server that uses a single RemoteCDXSource
It simply proxies the query params to the remote source
and performs no local processing/filtering
"""
def __init__(self, source):
if isinstance(source, RemoteCDXSource):
self.source = source
elif (isinstance(source, str) and
any(source.startswith(x) for x in ['http://', 'https://'])):
self.source = RemoteCDXSource(source)
else:
raise Exception('Invalid remote cdx source: ' + str(source))
def load_cdx_from_request(self, env): def load_cdx(self, **params):
#url = wbrequest.wb_url.url remote_iter = remote.load_cdx(**params)
# if need raw, convert to raw format here
# use url= param to get actual url if params.get('output') == 'raw':
params = urlparse.parse_qs(env['QUERY_STRING']) return (CDXObject(cdx) for cdx in remote_iter)
else:
if not 'output' in params: return remote_iter
params['output'] = 'text'
# parse_qs produces arrays for single values
# cdx processing expects singleton params for all params,
# except filters, so convert here
# use first value of the list
for name, val in params.iteritems():
if name != 'filter':
params[name] = val[0]
cdx_lines = self.load_cdx(**params)
return cdx_lines
def __str__(self): def __str__(self):
return 'load cdx indexes from ' + str(self.sources) return 'Remote CDX server serving from ' + str(self.sources[0])
#=================================================================
def create_cdx_server(config):
    """
    Create the cdx server appropriate for the given configuration.

    config is either a dict-like object (anything exposing ``get``),
    from which ``index_paths`` and the optional ``surt_ordered``
    (default True) are read, or the index path(s) themselves, in which
    case surt ordering defaults to True.

    If the paths value is a single string starting with http:// or
    https://, a RemoteCDXServer is returned; in every other case a
    CDXServer is created from the paths.
    """
    if hasattr(config, 'get'):
        paths = config.get('index_paths')
        surt_ordered = config.get('surt_ordered', True)
    else:
        paths = config
        surt_ordered = True

    logging.debug('CDX Surt-Ordered? ' + str(surt_ordered))

    if (isinstance(paths, str) and
            any(paths.startswith(x) for x in ['http://', 'https://'])):
        return RemoteCDXServer(paths)

    # Forward the configured ordering: it was previously computed and
    # logged but never handed to the server, so a 'surt_ordered: False'
    # setting had no effect.
    return CDXServer(paths, surt_ordered)
#=================================================================
def create_cdx_sources(paths):
    """
    Build the list of cdx sources described by paths.

    paths may be a single entry or a list of entries. An entry that is
    already a CDXSource is added as-is. A string entry naming a
    directory contributes every file listed in that directory; any
    other string entry is passed on as a single path. Entries of other
    types are ignored.

    Returns the (possibly empty) list of sources; an error is logged
    when nothing could be created.
    """
    sources = []

    if not isinstance(paths, list):
        paths = [paths]

    for path in paths:
        if isinstance(path, CDXSource):
            add_cdx_source(sources, path)
        elif isinstance(path, str):
            if os.path.isdir(path):
                for filename in os.listdir(path):
                    # join rather than concatenate, so a directory given
                    # without a trailing slash still yields valid paths
                    add_cdx_source(sources, os.path.join(path, filename))
            else:
                add_cdx_source(sources, path)

    if not sources:
        # No exception is active here, so use error() instead of
        # exception(); report the input paths, since the sources list
        # is by definition empty at this point.
        logging.error('No CDX Sources Found from: ' + str(paths))

    return sources
#=================================================================
def add_cdx_source(sources, source):
    """
    Append a cdx source to the sources list.

    source is either a CDXSource, which is appended directly, or a
    value handed to create_cdx_source() to build one. If no source can
    be built from it, the list is left unchanged.
    """
    if isinstance(source, CDXSource):
        resolved = source
    else:
        resolved = create_cdx_source(source)
        # unsupported path type: nothing to add
        if not resolved:
            return

    logging.debug('Adding CDX Source: ' + str(resolved))
    sources.append(resolved)
#=================================================================
def create_cdx_source(filename):
    """
    Create a cdx source for the given filename or url.

    An http:// or https:// url yields a RemoteCDXSource, a name ending
    in '.cdx' yields a CDXFile, and anything else yields None.
    """
    if filename.startswith(('http://', 'https://')):
        return RemoteCDXSource(filename)

    is_cdx_file = filename.endswith('.cdx')
    return CDXFile(filename) if is_cdx_file else None

    #TODO: support zipnum
    #elif filename.endswith('.summary')
    #    return ZipNumCDXSource(filename)
    #elif filename.startswith('redis://')
    #    return RedisCDXSource(filename)
#=================================================================
def convert_old_style_params(params):
    """
    Convert old-style CDX Server param semantics

    Copies truthy 'collapseTime' and 'resolveRevisits' values to their
    'collapse_time' and 'resolve_revisits' equivalents, and sets
    'reverse' to True when 'sort' equals 'reverse'. The params dict is
    updated in place and also returned.
    """
    renamed = (('collapseTime', 'collapse_time'),
               ('resolveRevisits', 'resolve_revisits'))

    for old_name, new_name in renamed:
        value = params.get(old_name)
        if value:
            params[new_name] = value

    if params.get('sort') == 'reverse':
        params['reverse'] = True

    return params
#=================================================================
def extract_params_from_wsgi_env(env):
    """ utility function to extract params from the query
    string of a WSGI environment dictionary

    Every param is reduced to its first value, except 'filter', which
    keeps the full list of values. 'output' defaults to 'text' when it
    is not present in the query. A missing QUERY_STRING is treated as
    an empty query.
    """
    # use url= param to get actual url
    parsed = urlparse.parse_qs(env.get('QUERY_STRING', ''))

    # parse_qs produces arrays for single values
    # cdx processing expects singleton params for all params,
    # except filters, so convert here
    # use first value of the list
    params = {}
    for name, values in parsed.items():
        params[name] = values if name == 'filter' else values[0]

    # Apply the default only after flattening: setting it beforehand
    # caused the loop above to reduce 'text' to its first char, 't'.
    params.setdefault('output', 'text')

    return params
#================================================================= #=================================================================

View File

@ -1,3 +1,3 @@
#CDX Server WSGI App Config #CDX Server WSGI App Config
index_paths: ./sample_data/ index_paths: ./sample_archive/cdx/
port: 8090 port: 8090

View File

@ -1,38 +1,42 @@
from cdxserver import CDXServer from cdxserver import create_cdx_server, extract_params_from_wsgi_env
from pywb import get_test_dir
import logging import logging
import os import os
import yaml import yaml
import pkgutil import pkgutil
#================================================================= #=================================================================
TEST_CDX_DIR = os.path.dirname(os.path.realpath(__file__)) + '/../sample_data/'
CONFIG_FILE = 'config.yaml' CONFIG_FILE = 'config.yaml'
DEFAULT_PORT = 8080 DEFAULT_PORT = 8080
config = None
if __package__: if __package__:
config = pkgutil.get_data(__package__, CONFIG_FILE) try:
config = yaml.load(config) config = pkgutil.get_data(__package__, CONFIG_FILE)
else: config = yaml.load(config)
config = None except:
pass
#================================================================= #=================================================================
def main(): def main(paths=None):
logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s', logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s',
level=logging.DEBUG) level=logging.DEBUG)
cdx_config = config.get('index_paths') if config else None if not paths:
if config:
paths = config
else:
paths = get_test_dir() + 'cdx/'
if not cdx_config: cdxserver = create_cdx_server(paths)
cdx_config = [TEST_CDX_DIR]
cdxserver = CDXServer(cdx_config)
def application(env, start_response): def application(env, start_response):
try: try:
response = cdxserver.load_cdx_from_request(env) params = extract_params_from_wsgi_env(env)
response = cdxserver.load_cdx(**params)
start_response('200 OK', [('Content-Type', 'text/plain')]) start_response('200 OK', [('Content-Type', 'text/plain')])
response = list(response) response = list(response)

View File

@ -4,6 +4,7 @@ import mimetypes
import time import time
from pywb.rewrite.wburl import WbUrl from pywb.rewrite.wburl import WbUrl
from pywb.cdx.cdxserver import extract_params_from_wsgi_env
from wbrequestresponse import WbResponse from wbrequestresponse import WbResponse
from wbexceptions import WbException, NotFoundException from wbexceptions import WbException, NotFoundException
from views import TextCapturesView from views import TextCapturesView
@ -69,7 +70,8 @@ class CDXHandler(BaseHandler):
self.view = view if view else TextCapturesView() self.view = view if view else TextCapturesView()
def __call__(self, wbrequest): def __call__(self, wbrequest):
cdx_lines = self.index_reader.cdx_server.load_cdx_from_request(wbrequest.env) params = extract_params_from_wsgi_env(wbrequest.env)
cdx_lines = self.index_reader.load_cdx(**params)
return self.view.render_response(wbrequest, cdx_lines) return self.view.render_response(wbrequest, cdx_lines)

View File

@ -6,16 +6,13 @@ from wbexceptions import NotFoundException
from itertools import chain from itertools import chain
from pprint import pprint from pprint import pprint
from pywb.cdx.cdxserver import CDXServer, CDXException from pywb.cdx.cdxserver import create_cdx_server, CDXException
from pywb.cdx.cdxobject import CDXObject from pywb.cdx.cdxobject import CDXObject
#================================================================= #=================================================================
class IndexReader(object): class IndexReader(object):
def __init__(self, config): def __init__(self, config):
if isinstance(config, str): self.cdx_server = create_cdx_server(config)
self.cdx_server = CDXServer(config)
else:
self.cdx_server = CDXServer.create_from_config(config)
def load_for_request(self, wbrequest): def load_for_request(self, wbrequest):
wburl = wbrequest.wb_url wburl = wbrequest.wb_url
@ -76,12 +73,3 @@ class IndexReader(object):
return None return None
return chain([first], iterable) return chain([first], iterable)
#=================================================================
class RemoteCDXServer(IndexReader):
def __init__(self, remote_url, cookie=None):
self.remote = RemoteCDXSource(remote_url=remote_url, cookie=cookie, proxy_all=True)
self.cdx_server = CDXServer(self.remote)
#def load_cdx(self, **params):
#return remote.load_cdx(**params)

View File

@ -13,7 +13,7 @@ setuptools.setup(name='pywb',
license='GPL', license='GPL',
packages=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'], packages=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'],
provides=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'], provides=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'],
package_data={'pywb': ['ui/*', 'static/*']}, package_data={'pywb': ['ui/*', 'static/*'], 'pywb.cdx': ['*.yaml']},
data_files = [('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')), data_files = [('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')),
('sample_archive/warcs/', glob.glob('sample_archive/warcs/*'))], ('sample_archive/warcs/', glob.glob('sample_archive/warcs/*'))],
install_requires=['uwsgi', 'rfc3987', 'chardet', 'redis', 'jinja2', 'surt', 'pyyaml', 'WebTest','pytest'], install_requires=['uwsgi', 'rfc3987', 'chardet', 'redis', 'jinja2', 'surt', 'pyyaml', 'WebTest','pytest'],