diff --git a/pywb/cdx/cdxserver.py b/pywb/cdx/cdxserver.py index 22a14ee0..4fad5ff0 100644 --- a/pywb/cdx/cdxserver.py +++ b/pywb/cdx/cdxserver.py @@ -7,6 +7,7 @@ import os import urlparse from cdxsource import CDXSource, CDXFile, RemoteCDXSource +from cdxobject import CDXObject #================================================================= @@ -22,70 +23,22 @@ class AccessException(CDXException): #================================================================= -class CDXServer: +class CDXServer(object): """ Top-level cdx server object which maintains a list of cdx sources, responds to queries and dispatches to the cdx ops for processing """ - @staticmethod - def create_from_config(config): - paths = config.get('index_paths') - surt_ordered = config.get('surt_ordered', True) - return CDXServer(paths, surt_ordered) - - def __init__(self, sources, surt_ordered=True): - self.sources = [] + def __init__(self, paths, surt_ordered=True): + self.sources = create_cdx_sources(paths) self.surt_ordered = surt_ordered - logging.debug('CDX Surt-Ordered? ' + str(surt_ordered)) - - if not isinstance(sources, list): - sources = [sources] - - for src in sources: - if isinstance(src, CDXSource): - self.add_cdx_source(src) - elif isinstance(src, str): - if os.path.isdir(src): - for file in os.listdir(src): - self.add_cdx_source(src + file) - else: - self.add_cdx_source(src) - - if len(self.sources) == 0: - logging.exception('No CDX Sources Found from: ' + str(sources)) - - def add_cdx_source(self, source): - if not isinstance(source, CDXSource): - source = self.create_cdx_source(source) - if not source: - return - - logging.debug('Adding CDX Source: ' + str(source)) - self.sources.append(source) - - @staticmethod - def create_cdx_source(filename): - if filename.startswith('http://') or filename.startswith('https://'): - return RemoteCDXSource(filename) - - if filename.endswith('.cdx'): - return CDXFile(filename) - - return None - #TODO: support zipnum - #elif filename.endswith('.summary') - # return ZipNumCDXSource(filename) - #elif filename.startswith('redis://') - # return RedisCDXSource(filename) - def load_cdx(self, **params): # if key not set, assume 'url' is set and needs canonicalization if not params.get('key'): params['key'] = self._canonicalize(params) - self._convert_old_style(params) + convert_old_style_params(params) return cdx_load(self.sources, params) @@ -112,43 +65,145 @@ class CDXServer: return key - def _convert_old_style(self, params): - """ - Convert old-style CDX Server param semantics - """ - collapse_time = params.get('collapseTime') - if collapse_time: - params['collapse_time'] = collapse_time + def __str__(self): + return 'CDX server serving from ' + str(self.sources) - resolve_revisits = params.get('resolveRevisits') - if resolve_revisits: - params['resolve_revisits'] = resolve_revisits - if params.get('sort') == 'reverse': - params['reverse'] = True +#================================================================= +class RemoteCDXServer(object): + """ + A special cdx server that uses a single RemoteCDXSource + It simply proxies the query params to the remote source + and performs no local processing/filtering + """ + def __init__(self, source): + if isinstance(source, RemoteCDXSource): + self.source = source + elif (isinstance(source, str) and + any(source.startswith(x) for x in ['http://', 'https://'])): + self.source = RemoteCDXSource(source) + else: + raise Exception('Invalid remote cdx source: ' + str(source)) - def load_cdx_from_request(self, env): - #url = wbrequest.wb_url.url - - # use url= param to get actual url - params = urlparse.parse_qs(env['QUERY_STRING']) - - if not 'output' in params: - params['output'] = 'text' - - # parse_qs produces arrays for single values - # cdx processing expects singleton params for all params, - # except filters, so convert here - # use first value of the list - for name, val in params.iteritems(): - if name != 'filter': - params[name] = val[0] - - cdx_lines = self.load_cdx(**params) - return cdx_lines + def load_cdx(self, **params): + remote_iter = remote.load_cdx(**params) + # if need raw, convert to raw format here + if params.get('output') == 'raw': + return (CDXObject(cdx) for cdx in remote_iter) + else: + return remote_iter def __str__(self): - return 'load cdx indexes from ' + str(self.sources) + return 'Remote CDX server serving from ' + str(self.sources[0]) + + +#================================================================= +def create_cdx_server(config): + if hasattr(config, 'get'): + paths = config.get('index_paths') + surt_ordered = config.get('surt_ordered', True) + else: + paths = config + surt_ordered = True + + logging.debug('CDX Surt-Ordered? ' + str(surt_ordered)) + + if (isinstance(paths, str) and + any(paths.startswith(x) for x in ['http://', 'https://'])): + return RemoteCDXServer(paths) + else: + return CDXServer(paths) + + +#================================================================= +def create_cdx_sources(paths): + sources = [] + + if not isinstance(paths, list): + paths = [paths] + + for path in paths: + if isinstance(path, CDXSource): + add_cdx_source(sources, path) + elif isinstance(path, str): + if os.path.isdir(path): + for file in os.listdir(path): + add_cdx_source(sources, path + file) + else: + add_cdx_source(sources, path) + + if len(sources) == 0: + logging.exception('No CDX Sources Found from: ' + str(sources)) + + return sources + + +#================================================================= +def add_cdx_source(sources, source): + if not isinstance(source, CDXSource): + source = create_cdx_source(source) + if not source: + return + + logging.debug('Adding CDX Source: ' + str(source)) + sources.append(source) + + +#================================================================= +def create_cdx_source(filename): + if filename.startswith('http://') or filename.startswith('https://'): + return RemoteCDXSource(filename) + + if filename.endswith('.cdx'): + return CDXFile(filename) + + return None + #TODO: support zipnum + #elif filename.endswith('.summary') + # return ZipNumCDXSource(filename) + #elif filename.startswith('redis://') + # return RedisCDXSource(filename) + + +#================================================================= +def convert_old_style_params(params): + """ + Convert old-style CDX Server param semantics + """ + collapse_time = params.get('collapseTime') + if collapse_time: + params['collapse_time'] = collapse_time + + resolve_revisits = params.get('resolveRevisits') + if resolve_revisits: + params['resolve_revisits'] = resolve_revisits + + if params.get('sort') == 'reverse': + params['reverse'] = True + + return params + + +#================================================================= +def extract_params_from_wsgi_env(env): + """ utility function to extract params from the query + string of a WSGI environment dictionary + """ + # use url= param to get actual url + params = urlparse.parse_qs(env['QUERY_STRING']) + + if not 'output' in params: + params['output'] = 'text' + + # parse_qs produces arrays for single values + # cdx processing expects singleton params for all params, + # except filters, so convert here + # use first value of the list + for name, val in params.iteritems(): + if name != 'filter': + params[name] = val[0] + + return params #================================================================= diff --git a/pywb/cdx/config.yaml b/pywb/cdx/config.yaml index 2aa4838f..293e71a1 100644 --- a/pywb/cdx/config.yaml +++ b/pywb/cdx/config.yaml @@ -1,3 +1,3 @@ #CDX Server WSGI App Config -index_paths: ./sample_data/ +index_paths: ./sample_archive/cdx/ port: 8090 diff --git a/pywb/cdx/wsgi_cdxserver.py b/pywb/cdx/wsgi_cdxserver.py index ecf64b8b..e7840cc9 100644 --- a/pywb/cdx/wsgi_cdxserver.py +++ b/pywb/cdx/wsgi_cdxserver.py @@ -1,38 +1,42 @@ -from cdxserver import CDXServer +from cdxserver import create_cdx_server, extract_params_from_wsgi_env +from pywb import get_test_dir + import logging import os import yaml import pkgutil #================================================================= -TEST_CDX_DIR = os.path.dirname(os.path.realpath(__file__)) + '/../sample_data/' - CONFIG_FILE = 'config.yaml' DEFAULT_PORT = 8080 +config = None if __package__: - config = pkgutil.get_data(__package__, CONFIG_FILE) - config = yaml.load(config) -else: - config = None + try: + config = pkgutil.get_data(__package__, CONFIG_FILE) + config = yaml.load(config) + except: + pass #================================================================= -def main(): +def main(paths=None): logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s', level=logging.DEBUG) - cdx_config = config.get('index_paths') if config else None + if not paths: + if config: + paths = config + else: + paths = get_test_dir() + 'cdx/' - if not cdx_config: - cdx_config = [TEST_CDX_DIR] - - cdxserver = CDXServer(cdx_config) + cdxserver = create_cdx_server(paths) def application(env, start_response): try: - response = cdxserver.load_cdx_from_request(env) + params = extract_params_from_wsgi_env(env) + response = cdxserver.load_cdx(**params) start_response('200 OK', [('Content-Type', 'text/plain')]) response = list(response) diff --git a/pywb/handlers.py b/pywb/handlers.py index 0061264d..4be855e3 100644 --- a/pywb/handlers.py +++ b/pywb/handlers.py @@ -4,6 +4,7 @@ import mimetypes import time from pywb.rewrite.wburl import WbUrl +from pywb.cdx.cdxserver import extract_params_from_wsgi_env from wbrequestresponse import WbResponse from wbexceptions import WbException, NotFoundException from views import TextCapturesView @@ -69,7 +70,8 @@ class CDXHandler(BaseHandler): self.view = view if view else TextCapturesView() def __call__(self, wbrequest): - cdx_lines = self.index_reader.cdx_server.load_cdx_from_request(wbrequest.env) + params = extract_params_from_wsgi_env(wbrequest.env) + cdx_lines = self.index_reader.load_cdx(**params) return self.view.render_response(wbrequest, cdx_lines) diff --git a/pywb/indexreader.py b/pywb/indexreader.py index 236b83f0..7472e762 100644 --- a/pywb/indexreader.py +++ b/pywb/indexreader.py @@ -6,16 +6,13 @@ from wbexceptions import NotFoundException from itertools import chain from pprint import pprint -from pywb.cdx.cdxserver import CDXServer, CDXException +from pywb.cdx.cdxserver import create_cdx_server, CDXException from pywb.cdx.cdxobject import CDXObject #================================================================= class IndexReader(object): def __init__(self, config): - if isinstance(config, str): - self.cdx_server = CDXServer(config) - else: - self.cdx_server = CDXServer.create_from_config(config) + self.cdx_server = create_cdx_server(config) def load_for_request(self, wbrequest): wburl = wbrequest.wb_url @@ -76,12 +73,3 @@ class IndexReader(object): return None return chain([first], iterable) - -#================================================================= -class RemoteCDXServer(IndexReader): - def __init__(self, remote_url, cookie=None): - self.remote = RemoteCDXSource(remote_url=remote_url, cookie=cookie, proxy_all=True) - self.cdx_server = CDXServer(self.remote) - - #def load_cdx(self, **params): - #return remote.load_cdx(**params) diff --git a/setup.py b/setup.py index c3bb977d..20ac8518 100755 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ setuptools.setup(name='pywb', license='GPL', packages=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'], provides=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'], - package_data={'pywb': ['ui/*', 'static/*']}, + package_data={'pywb': ['ui/*', 'static/*'], 'pywb.cdx': ['*.yaml']}, data_files = [('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')), ('sample_archive/warcs/', glob.glob('sample_archive/warcs/*'))], install_requires=['uwsgi', 'rfc3987', 'chardet', 'redis', 'jinja2', 'surt', 'pyyaml', 'WebTest','pytest'],