mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
cleanup cdx server config, refactored such that a cdx server needs to implement only a single interface: load_cdx(self, **params).
CDXServer and RemoteCDXServer are now distinct classes in cdxserver.py, and the utility function cdxserver.create_cdx_server() creates the appropriate server based on its input.
This commit is contained in:
parent
94f1dc3be5
commit
abea504b04
@ -7,6 +7,7 @@ import os
|
|||||||
import urlparse
|
import urlparse
|
||||||
|
|
||||||
from cdxsource import CDXSource, CDXFile, RemoteCDXSource
|
from cdxsource import CDXSource, CDXFile, RemoteCDXSource
|
||||||
|
from cdxobject import CDXObject
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
@ -22,70 +23,22 @@ class AccessException(CDXException):
|
|||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class CDXServer:
|
class CDXServer(object):
|
||||||
"""
|
"""
|
||||||
Top-level cdx server object which maintains a list of cdx sources,
|
Top-level cdx server object which maintains a list of cdx sources,
|
||||||
responds to queries and dispatches to the cdx ops for processing
|
responds to queries and dispatches to the cdx ops for processing
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@staticmethod
|
def __init__(self, paths, surt_ordered=True):
|
||||||
def create_from_config(config):
|
self.sources = create_cdx_sources(paths)
|
||||||
paths = config.get('index_paths')
|
|
||||||
surt_ordered = config.get('surt_ordered', True)
|
|
||||||
return CDXServer(paths, surt_ordered)
|
|
||||||
|
|
||||||
def __init__(self, sources, surt_ordered=True):
|
|
||||||
self.sources = []
|
|
||||||
self.surt_ordered = surt_ordered
|
self.surt_ordered = surt_ordered
|
||||||
|
|
||||||
logging.debug('CDX Surt-Ordered? ' + str(surt_ordered))
|
|
||||||
|
|
||||||
if not isinstance(sources, list):
|
|
||||||
sources = [sources]
|
|
||||||
|
|
||||||
for src in sources:
|
|
||||||
if isinstance(src, CDXSource):
|
|
||||||
self.add_cdx_source(src)
|
|
||||||
elif isinstance(src, str):
|
|
||||||
if os.path.isdir(src):
|
|
||||||
for file in os.listdir(src):
|
|
||||||
self.add_cdx_source(src + file)
|
|
||||||
else:
|
|
||||||
self.add_cdx_source(src)
|
|
||||||
|
|
||||||
if len(self.sources) == 0:
|
|
||||||
logging.exception('No CDX Sources Found from: ' + str(sources))
|
|
||||||
|
|
||||||
def add_cdx_source(self, source):
|
|
||||||
if not isinstance(source, CDXSource):
|
|
||||||
source = self.create_cdx_source(source)
|
|
||||||
if not source:
|
|
||||||
return
|
|
||||||
|
|
||||||
logging.debug('Adding CDX Source: ' + str(source))
|
|
||||||
self.sources.append(source)
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def create_cdx_source(filename):
|
|
||||||
if filename.startswith('http://') or filename.startswith('https://'):
|
|
||||||
return RemoteCDXSource(filename)
|
|
||||||
|
|
||||||
if filename.endswith('.cdx'):
|
|
||||||
return CDXFile(filename)
|
|
||||||
|
|
||||||
return None
|
|
||||||
#TODO: support zipnum
|
|
||||||
#elif filename.endswith('.summary')
|
|
||||||
# return ZipNumCDXSource(filename)
|
|
||||||
#elif filename.startswith('redis://')
|
|
||||||
# return RedisCDXSource(filename)
|
|
||||||
|
|
||||||
def load_cdx(self, **params):
|
def load_cdx(self, **params):
|
||||||
# if key not set, assume 'url' is set and needs canonicalization
|
# if key not set, assume 'url' is set and needs canonicalization
|
||||||
if not params.get('key'):
|
if not params.get('key'):
|
||||||
params['key'] = self._canonicalize(params)
|
params['key'] = self._canonicalize(params)
|
||||||
|
|
||||||
self._convert_old_style(params)
|
convert_old_style_params(params)
|
||||||
|
|
||||||
return cdx_load(self.sources, params)
|
return cdx_load(self.sources, params)
|
||||||
|
|
||||||
@ -112,43 +65,145 @@ class CDXServer:
|
|||||||
|
|
||||||
return key
|
return key
|
||||||
|
|
||||||
def _convert_old_style(self, params):
|
def __str__(self):
|
||||||
"""
|
return 'CDX server serving from ' + str(self.sources)
|
||||||
Convert old-style CDX Server param semantics
|
|
||||||
"""
|
|
||||||
collapse_time = params.get('collapseTime')
|
|
||||||
if collapse_time:
|
|
||||||
params['collapse_time'] = collapse_time
|
|
||||||
|
|
||||||
resolve_revisits = params.get('resolveRevisits')
|
|
||||||
if resolve_revisits:
|
|
||||||
params['resolve_revisits'] = resolve_revisits
|
|
||||||
|
|
||||||
if params.get('sort') == 'reverse':
|
#=================================================================
|
||||||
params['reverse'] = True
|
class RemoteCDXServer(object):
    """
    A special cdx server that uses a single RemoteCDXSource.

    It simply proxies the query params to the remote source
    and performs no local processing/filtering.
    """

    def __init__(self, source):
        """
        :param source: a RemoteCDXSource instance, or an http:// or
                       https:// url string from which a RemoteCDXSource
                       is created
        :raises Exception: if source is neither of the above
        """
        if isinstance(source, RemoteCDXSource):
            self.source = source
        elif (isinstance(source, str) and
              source.startswith(('http://', 'https://'))):
            self.source = RemoteCDXSource(source)
        else:
            raise Exception('Invalid remote cdx source: ' + str(source))

    def load_cdx(self, **params):
        """
        Forward the query params to the remote source.

        :return: the iterator produced by the remote source, with each
                 item wrapped in a CDXObject when output == 'raw'
        """
        # Query the source stored by __init__ (this previously referenced
        # an undefined name, 'remote', and raised NameError on every call).
        # NOTE(review): the keyword-argument call style is kept as written;
        # confirm it matches RemoteCDXSource.load_cdx's signature.
        remote_iter = self.source.load_cdx(**params)

        # if need raw, convert to raw format here
        if params.get('output') == 'raw':
            return (CDXObject(cdx) for cdx in remote_iter)

        return remote_iter

    def __str__(self):
        # This class keeps exactly one source in self.source; there is no
        # self.sources list to index into.
        return 'Remote CDX server serving from ' + str(self.source)
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
def create_cdx_server(config):
    """
    Create the cdx server appropriate for the given config.

    :param config: either a dict-like object (anything with a ``get``
                   method) providing 'index_paths' and, optionally,
                   'surt_ordered', or the index path(s) themselves
    :return: a RemoteCDXServer when the paths value is a single
             http:// or https:// url string, otherwise a CDXServer
    """
    if hasattr(config, 'get'):
        paths = config.get('index_paths')
        surt_ordered = config.get('surt_ordered', True)
    else:
        paths = config
        surt_ordered = True

    logging.debug('CDX Surt-Ordered? ' + str(surt_ordered))

    if (isinstance(paths, str) and
            paths.startswith(('http://', 'https://'))):
        return RemoteCDXServer(paths)

    # Forward the configured ordering; it was previously read and logged
    # but never handed to the server, so 'surt_ordered: false' was ignored.
    return CDXServer(paths, surt_ordered)
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
def create_cdx_sources(paths):
    """
    Build the list of cdx sources for the given path(s).

    :param paths: a single entry or a list of entries; each entry is
                  either a CDXSource instance or a string. A string naming
                  a directory contributes every file listed in it; any
                  other string is passed to add_cdx_source as-is. Entries
                  of any other type are ignored.
    :return: list of the sources that were added (possibly empty)
    """
    sources = []

    if not isinstance(paths, list):
        paths = [paths]

    for path in paths:
        if isinstance(path, CDXSource):
            add_cdx_source(sources, path)
        elif isinstance(path, str):
            if os.path.isdir(path):
                for filename in os.listdir(path):
                    # os.path.join also works when the directory is given
                    # without a trailing separator
                    add_cdx_source(sources, os.path.join(path, filename))
            else:
                add_cdx_source(sources, path)

    if not sources:
        # No exception is being handled here, so log a plain error, and
        # report the requested paths rather than the (empty) result list.
        logging.error('No CDX Sources Found from: ' + str(paths))

    return sources
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
def add_cdx_source(sources, source):
    """
    Append a cdx source to the ``sources`` list.

    :param sources: list that receives the source
    :param source: a CDXSource instance, or a value that
                   create_cdx_source() can turn into one; if it cannot,
                   nothing is added
    """
    if isinstance(source, CDXSource):
        resolved = source
    else:
        resolved = create_cdx_source(source)
        if not resolved:
            # not a recognized kind of source: skip it
            return

    logging.debug('Adding CDX Source: ' + str(resolved))
    sources.append(resolved)
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
def create_cdx_source(filename):
    """
    Create the cdx source matching the given filename or url.

    :param filename: an http:// or https:// url, or a path ending in '.cdx'
    :return: a RemoteCDXSource for urls, a CDXFile for '.cdx' paths,
             or None if the name is not recognized
    """
    if filename.startswith(('http://', 'https://')):
        return RemoteCDXSource(filename)

    if filename.endswith('.cdx'):
        return CDXFile(filename)

    # TODO: support zipnum ('.summary' -> ZipNumCDXSource)
    # TODO: support redis ('redis://' -> RedisCDXSource)
    return None
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
def convert_old_style_params(params):
    """
    Convert old-style CDX Server param semantics

    Copies truthy 'collapseTime' / 'resolveRevisits' values to
    'collapse_time' / 'resolve_revisits', and sets 'reverse' to True
    when 'sort' is 'reverse'. The dict is updated in place and returned.
    """
    # old camelCase name -> current name
    renamed = (('collapseTime', 'collapse_time'),
               ('resolveRevisits', 'resolve_revisits'))

    for old_name, new_name in renamed:
        value = params.get(old_name)
        if value:
            params[new_name] = value

    if params.get('sort') == 'reverse':
        params['reverse'] = True

    return params
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
def extract_params_from_wsgi_env(env):
    """ utility function to extract params from the query
    string of a WSGI environment dictionary

    :param env: WSGI environ dict; a missing 'QUERY_STRING' is treated
                as an empty query
    :return: dict of query params. Every param is reduced to its first
             value, except 'filter', which keeps its full list of values.
             'output' defaults to 'text' when not given.
    """
    # use url= param to get actual url
    parsed = urlparse.parse_qs(env.get('QUERY_STRING', ''))

    # parse_qs produces arrays for single values
    # cdx processing expects singleton params for all params,
    # except filters, so convert here
    # use first value of the list
    params = {}
    for name, values in parsed.items():
        params[name] = values if name == 'filter' else values[0]

    # Apply the default only after flattening: setting it beforehand let
    # the first-value step truncate the string 'text' to 't'.
    params.setdefault('output', 'text')

    return params
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
|
@ -1,3 +1,3 @@
|
|||||||
#CDX Server WSGI App Config
|
#CDX Server WSGI App Config
|
||||||
index_paths: ./sample_data/
|
index_paths: ./sample_archive/cdx/
|
||||||
port: 8090
|
port: 8090
|
||||||
|
@ -1,38 +1,42 @@
|
|||||||
from cdxserver import CDXServer
|
from cdxserver import create_cdx_server, extract_params_from_wsgi_env
|
||||||
|
from pywb import get_test_dir
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import yaml
|
import yaml
|
||||||
import pkgutil
|
import pkgutil
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
TEST_CDX_DIR = os.path.dirname(os.path.realpath(__file__)) + '/../sample_data/'
|
|
||||||
|
|
||||||
CONFIG_FILE = 'config.yaml'
|
CONFIG_FILE = 'config.yaml'
|
||||||
|
|
||||||
DEFAULT_PORT = 8080
|
DEFAULT_PORT = 8080
|
||||||
|
|
||||||
|
config = None
|
||||||
if __package__:
|
if __package__:
|
||||||
config = pkgutil.get_data(__package__, CONFIG_FILE)
|
try:
|
||||||
config = yaml.load(config)
|
config = pkgutil.get_data(__package__, CONFIG_FILE)
|
||||||
else:
|
config = yaml.load(config)
|
||||||
config = None
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
def main():
|
def main(paths=None):
|
||||||
logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s',
|
logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s',
|
||||||
level=logging.DEBUG)
|
level=logging.DEBUG)
|
||||||
|
|
||||||
cdx_config = config.get('index_paths') if config else None
|
if not paths:
|
||||||
|
if config:
|
||||||
|
paths = config
|
||||||
|
else:
|
||||||
|
paths = get_test_dir() + 'cdx/'
|
||||||
|
|
||||||
if not cdx_config:
|
cdxserver = create_cdx_server(paths)
|
||||||
cdx_config = [TEST_CDX_DIR]
|
|
||||||
|
|
||||||
cdxserver = CDXServer(cdx_config)
|
|
||||||
|
|
||||||
def application(env, start_response):
|
def application(env, start_response):
|
||||||
try:
|
try:
|
||||||
response = cdxserver.load_cdx_from_request(env)
|
params = extract_params_from_wsgi_env(env)
|
||||||
|
response = cdxserver.load_cdx(**params)
|
||||||
start_response('200 OK', [('Content-Type', 'text/plain')])
|
start_response('200 OK', [('Content-Type', 'text/plain')])
|
||||||
|
|
||||||
response = list(response)
|
response = list(response)
|
||||||
|
@ -4,6 +4,7 @@ import mimetypes
|
|||||||
import time
|
import time
|
||||||
|
|
||||||
from pywb.rewrite.wburl import WbUrl
|
from pywb.rewrite.wburl import WbUrl
|
||||||
|
from pywb.cdx.cdxserver import extract_params_from_wsgi_env
|
||||||
from wbrequestresponse import WbResponse
|
from wbrequestresponse import WbResponse
|
||||||
from wbexceptions import WbException, NotFoundException
|
from wbexceptions import WbException, NotFoundException
|
||||||
from views import TextCapturesView
|
from views import TextCapturesView
|
||||||
@ -69,7 +70,8 @@ class CDXHandler(BaseHandler):
|
|||||||
self.view = view if view else TextCapturesView()
|
self.view = view if view else TextCapturesView()
|
||||||
|
|
||||||
def __call__(self, wbrequest):
|
def __call__(self, wbrequest):
|
||||||
cdx_lines = self.index_reader.cdx_server.load_cdx_from_request(wbrequest.env)
|
params = extract_params_from_wsgi_env(wbrequest.env)
|
||||||
|
cdx_lines = self.index_reader.load_cdx(**params)
|
||||||
|
|
||||||
return self.view.render_response(wbrequest, cdx_lines)
|
return self.view.render_response(wbrequest, cdx_lines)
|
||||||
|
|
||||||
|
@ -6,16 +6,13 @@ from wbexceptions import NotFoundException
|
|||||||
from itertools import chain
|
from itertools import chain
|
||||||
from pprint import pprint
|
from pprint import pprint
|
||||||
|
|
||||||
from pywb.cdx.cdxserver import CDXServer, CDXException
|
from pywb.cdx.cdxserver import create_cdx_server, CDXException
|
||||||
from pywb.cdx.cdxobject import CDXObject
|
from pywb.cdx.cdxobject import CDXObject
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class IndexReader(object):
|
class IndexReader(object):
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
if isinstance(config, str):
|
self.cdx_server = create_cdx_server(config)
|
||||||
self.cdx_server = CDXServer(config)
|
|
||||||
else:
|
|
||||||
self.cdx_server = CDXServer.create_from_config(config)
|
|
||||||
|
|
||||||
def load_for_request(self, wbrequest):
|
def load_for_request(self, wbrequest):
|
||||||
wburl = wbrequest.wb_url
|
wburl = wbrequest.wb_url
|
||||||
@ -76,12 +73,3 @@ class IndexReader(object):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
return chain([first], iterable)
|
return chain([first], iterable)
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
class RemoteCDXServer(IndexReader):
|
|
||||||
def __init__(self, remote_url, cookie=None):
|
|
||||||
self.remote = RemoteCDXSource(remote_url=remote_url, cookie=cookie, proxy_all=True)
|
|
||||||
self.cdx_server = CDXServer(self.remote)
|
|
||||||
|
|
||||||
#def load_cdx(self, **params):
|
|
||||||
#return remote.load_cdx(**params)
|
|
||||||
|
2
setup.py
2
setup.py
@ -13,7 +13,7 @@ setuptools.setup(name='pywb',
|
|||||||
license='GPL',
|
license='GPL',
|
||||||
packages=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'],
|
packages=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'],
|
||||||
provides=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'],
|
provides=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'],
|
||||||
package_data={'pywb': ['ui/*', 'static/*']},
|
package_data={'pywb': ['ui/*', 'static/*'], 'pywb.cdx': ['*.yaml']},
|
||||||
data_files = [('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')),
|
data_files = [('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')),
|
||||||
('sample_archive/warcs/', glob.glob('sample_archive/warcs/*'))],
|
('sample_archive/warcs/', glob.glob('sample_archive/warcs/*'))],
|
||||||
install_requires=['uwsgi', 'rfc3987', 'chardet', 'redis', 'jinja2', 'surt', 'pyyaml', 'WebTest','pytest'],
|
install_requires=['uwsgi', 'rfc3987', 'chardet', 'redis', 'jinja2', 'surt', 'pyyaml', 'WebTest','pytest'],
|
||||||
|
Loading…
x
Reference in New Issue
Block a user