From d0229b6b2d33e932aef73807a7511d09dcffc6e7 Mon Sep 17 00:00:00 2001 From: Kenji Nagahashi Date: Wed, 19 Feb 2014 23:37:44 +0000 Subject: [PATCH 1/8] cleanup setup.py indent for ease of add/remove things. also use find_packages(). --- setup.py | 51 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 35 insertions(+), 16 deletions(-) diff --git a/setup.py b/setup.py index 20ac8518..e8d92e16 100755 --- a/setup.py +++ b/setup.py @@ -1,22 +1,41 @@ #!/usr/bin/env python # vim: set sw=4 et: -import setuptools +from setuptools import setup, find_packages import glob -setuptools.setup(name='pywb', - version='0.2', - url='https://github.com/ikreymer/pywb', - author='Ilya Kreymer', - author_email='ilya@archive.org', - long_description=open('README.md').read(), - license='GPL', - packages=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'], - provides=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'], - package_data={'pywb': ['ui/*', 'static/*'], 'pywb.cdx': ['*.yaml']}, - data_files = [('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')), - ('sample_archive/warcs/', glob.glob('sample_archive/warcs/*'))], - install_requires=['uwsgi', 'rfc3987', 'chardet', 'redis', 'jinja2', 'surt', 'pyyaml', 'WebTest','pytest'], -# tests_require=['WebTest', 'pytest'], - zip_safe=False) +setup( + name='pywb', + version='0.2', + url='https://github.com/ikreymer/pywb', + author='Ilya Kreymer', + author_email='ilya@archive.org', + long_description=open('README.md').read(), + license='GPL', + packages=find_packages(), + provides=[ + 'pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite' + ], + package_data={ + 'pywb': ['ui/*', 'static/*'], + 'pywb.cdx': ['*.yaml'] + }, + data_files = [ + ('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')), + ('sample_archive/warcs/', glob.glob('sample_archive/warcs/*')) + ], + install_requires=[ + 'uwsgi', + 'rfc3987', + 'chardet', + 'redis', + 'jinja2', + 'surt', + 'pyyaml', + 'WebTest', + 'pytest', + ], + # 
tests_require=['WebTest', 'pytest'], + zip_safe=False + ) From 79eb3be44f17bc5bfccee171c6aac843e3b0d736 Mon Sep 17 00:00:00 2001 From: Kenji Nagahashi Date: Thu, 20 Feb 2014 09:58:08 +0000 Subject: [PATCH 2/8] rewrite wsgi_cdxserver with werkzeug use pkg_resources instead of pkgutil because pkgutil breaks with auto-reload. add --port command line option. --- pywb/cdx/cdxdomainspecific.py | 4 +- pywb/cdx/wsgi_cdxserver.py | 116 ++++++++++++++++++++-------------- setup.py | 2 + 3 files changed, 72 insertions(+), 50 deletions(-) diff --git a/pywb/cdx/cdxdomainspecific.py b/pywb/cdx/cdxdomainspecific.py index 2c733c8d..4d8d9b87 100644 --- a/pywb/cdx/cdxdomainspecific.py +++ b/pywb/cdx/cdxdomainspecific.py @@ -1,14 +1,14 @@ import yaml import re import logging -import pkgutil +import pkg_resources from canonicalize import unsurt, UrlCanonicalizer #================================================================= def load_domain_specific_cdx_rules(filename, surt_ordered): - fh = pkgutil.get_data(__package__, filename) + fh = pkg_resources.resource_string(__name__, filename) config = yaml.load(fh) # Load Canonicalizer Rules diff --git a/pywb/cdx/wsgi_cdxserver.py b/pywb/cdx/wsgi_cdxserver.py index b6ccc61c..609928a0 100644 --- a/pywb/cdx/wsgi_cdxserver.py +++ b/pywb/cdx/wsgi_cdxserver.py @@ -1,10 +1,11 @@ -from cdxserver import create_cdx_server, extract_params_from_wsgi_env +from werkzeug.wrappers import BaseRequest, BaseResponse +from cdxserver import create_cdx_server from pywb import get_test_dir import logging import os import yaml -import pkgutil +import pkg_resources #================================================================= CONFIG_FILE = 'config.yaml' @@ -13,66 +14,85 @@ RULES_FILE = 'rules.yaml' DEFAULT_PORT = 8080 -config = None -if __package__: - try: - config = pkgutil.get_data(__package__, CONFIG_FILE) - config = yaml.load(config) - except: - pass - - #================================================================= -def main(paths=None): +class 
CDXQueryRequest(BaseRequest): + def __init__(self, environ): + super(CDXQueryRequest, self).__init__(environ) + + @property + def output(self): + return self.args.get('output', 'text') + @property + def filter(self): + return self.args.getlist('filter', []) + @property + def params(self): + return dict(t if t[0] == 'filter' else (t[0], t[1][0]) + for t in self.args.iterlists()) + +class WSGICDXServer(object): + def __init__(self, paths, rules_file): + self.cdxserver = create_cdx_server(paths, rules_file) + + def __call__(self, environ, start_response): + request = CDXQueryRequest(environ) + try: + logging.debug('request.args=%s', request.params) + result = self.cdxserver.load_cdx(**request.params) + + # TODO: select response type by "output" parameter + response = PlainTextResponse(result) + return response(environ, start_response) + except Exception as exc: + logging.error('load_cdx failed', exc_info=1) + # TODO: error response should be different for each response + # type + start_response('400 Error', [('Content-Type', 'text/plain')]) + return [str(exc)] + +class PlainTextResponse(BaseResponse): + def __init__(self, cdxitr, status=200, content_type='text/plain'): + super(PlainTextResponse, self).__init__( + response=cdxitr, + status=status, content_type=content_type) + +# class JsonResponse(Response): +# pass +# class MementoResponse(Response): +# pass + +def create_app(paths=None): logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s', level=logging.DEBUG) if not paths: - if config: - paths = config - else: - paths = get_test_dir() + 'cdx/' - - cdxserver = create_cdx_server(paths, RULES_FILE) - - def application(env, start_response): - try: - params = extract_params_from_wsgi_env(env) - response = cdxserver.load_cdx(**params) - start_response('200 OK', [('Content-Type', 'text/plain')]) - - response = list(response) - - except Exception as exc: - import traceback - err_details = traceback.format_exc(exc) - start_response('400 Error', 
[('Content-Type', 'text/plain')]) - response = [str(exc)] - print err_details - - return response - - return application + paths = config or get_test_dir() + 'cdx/' + return WSGICDXServer(paths, RULES_FILE) if __name__ == "__main__": - from wsgiref.simple_server import make_server + from optparse import OptionParser + from werkzeug.serving import run_simple - app = main() + opt = OptionParser('%prog [OPTIONS]') + opt.add_option('-p', '--port', type='int', default=None) - port = DEFAULT_PORT - if config: - port = config.get('port', DEFAULT_PORT) + options, args = opt.parse_args() - httpd = make_server('', port, app) + configdata = pkg_resources.resource_string(__name__, CONFIG_FILE) + config = yaml.load(configdata) - logging.debug('Starting CDX Server on port ' + str(port)) + port = options.port + if port is None: + port = (config and config.get('port')) or DEFAULT_PORT + app = create_app() + + logging.debug('Starting CDX Server on port %s', port) try: - httpd.serve_forever() - except KeyboardInterrupt: + run_simple('0.0.0.0', port, app, use_reloader=True, use_debugger=True) + except KeyboardInterrupt as ex: pass - logging.debug('Stopping CDX Server') else: - application = main() + application = create_app() diff --git a/setup.py b/setup.py index e8d92e16..70dba6d6 100755 --- a/setup.py +++ b/setup.py @@ -34,6 +34,8 @@ setup( 'pyyaml', 'WebTest', 'pytest', + 'werkzeug>=0.9.4', + 'setuptools==0.9.7', ], # tests_require=['WebTest', 'pytest'], zip_safe=False From 2c40c9b11286f54443848ad1e6fa62e0ab55292a Mon Sep 17 00:00:00 2001 From: Kenji Nagahashi Date: Thu, 27 Feb 2014 01:58:07 +0000 Subject: [PATCH 3/8] refactor cdxserver, add tests focused on wsgi_cdxserver, add docstrings. align cdxops function interfaces - all cdx_iter. move module functions / common ops to class methods support both 0/1 and true/false for boolean parameters move CDXObject to text conversion to wsgi_cdxserver (may have broken embedded cdxserver mode). 
pass config object as function arg rather than as global var. --- pywb/cdx/cdxobject.py | 7 +- pywb/cdx/cdxops.py | 63 +++++------ pywb/cdx/cdxserver.py | 119 +++++++++++---------- pywb/cdx/wsgi_cdxserver.py | 56 ++++++++-- tests/test_wsgi_cdxserver.py | 197 +++++++++++++++++++++++++++++++++++ 5 files changed, 334 insertions(+), 108 deletions(-) create mode 100644 tests/test_wsgi_cdxserver.py diff --git a/pywb/cdx/cdxobject.py b/pywb/cdx/cdxobject.py index 4eba8025..1b38722a 100644 --- a/pywb/cdx/cdxobject.py +++ b/pywb/cdx/cdxobject.py @@ -71,12 +71,15 @@ class CDXObject(OrderedDict): # force regen on next __str__ call self.cdxline = None + def is_revisit(self): + return (self['mimetype'] == 'warc/revisit' or + self['filename'] == '-') + def __str__(self): if self.cdxline: return self.cdxline - li = itertools.imap(lambda (n, val): val, self.items()) - return ' '.join(li) + return ' '.join(val for n, val in self.iteritems()) #================================================================= diff --git a/pywb/cdx/cdxops.py b/pywb/cdx/cdxops.py index 247f3d18..b24df6bf 100644 --- a/pywb/cdx/cdxops.py +++ b/pywb/cdx/cdxops.py @@ -10,32 +10,38 @@ from collections import deque #================================================================= -def cdx_load(sources, params, perms_checker=None): +def cdx_load(sources, params, filter=True, perms_checker=None): + """ + merge text CDX lines from sources, return an iterator for + filtered and access-checked sequence of CDX objects. + :param sources: iterable for text CDX sources. + :param perms_checker: access check filter object implementing + allow_url_lookup(key, url), allow_capture(cdxobj) and + filter_fields(cdxobj) methods. 
+ """ + cdx_iter = load_cdx_streams(sources, params) + cdx_iter = make_obj_iter(cdx_iter, params) + cdx_iter = filter_cdx(cdx_iter, params) if perms_checker: - cdx_iter = cdx_load_with_perms(sources, params, perms_checker) - else: - cdx_iter = cdx_load_and_filter(sources, params) - - # output raw cdx objects - if params.get('output') == 'raw': - return cdx_iter - - def write_cdx(fields): - for cdx in cdx_iter: - yield cdx_text_out(cdx, fields) + '\n' - - return write_cdx(params.get('fields')) - + cdx_iter = restrict_cdx(cdx_iter, params, perms_checker) + return cdx_iter #================================================================= -def cdx_load_with_perms(sources, params, perms_checker): +def restrict_cdx(cdx_iter, params, perms_checker): + """ + filter out those cdx records that user doesn't have access to, + by consulting :param perms_checker:. + :param cdx_iter: cdx record source iterable + :param params: request parameters (dict) + :param perms_checker: object implementing permission checker + """ if not perms_checker.allow_url_lookup(params['key'], params['url']): if params.get('matchType', 'exact') == 'exact': raise AccessException('Excluded') - cdx_iter = cdx_load_and_filter(sources, params) - for cdx in cdx_iter: + # TODO: we could let filter_fields handle this case by accepting + # None as a return value. 
if not perms_checker.allow_capture(cdx): continue @@ -43,21 +49,8 @@ def cdx_load_with_perms(sources, params, perms_checker): yield cdx - #================================================================= -def cdx_text_out(cdx, fields): - if not fields: - return str(cdx) - else: - return ' '.join(map(lambda x: cdx[x], fields.split(','))) - - -#================================================================= -def cdx_load_and_filter(sources, params): - cdx_iter = load_cdx_streams(sources, params) - - cdx_iter = make_obj_iter(cdx_iter, params) - +def filter_cdx(cdx_iter, params): if params.get('proxyAll'): return cdx_iter @@ -110,7 +103,7 @@ def make_obj_iter(text_iter, params): else: cls = CDXObject - return itertools.imap(lambda line: cls(line), text_iter) + return (cls(line) for line in text_iter) #================================================================= @@ -242,8 +235,8 @@ def cdx_resolve_revisits(cdx_iter): originals = {} for cdx in cdx_iter: - is_revisit = ((cdx['mimetype'] == 'warc/revisit') or - (cdx['filename'] == '-')) + + is_revisit = cdx.is_revisit() digest = cdx['digest'] diff --git a/pywb/cdx/cdxserver.py b/pywb/cdx/cdxserver.py index 1a68f7e4..b102aff1 100644 --- a/pywb/cdx/cdxserver.py +++ b/pywb/cdx/cdxserver.py @@ -84,7 +84,10 @@ class CDXServer(BaseCDXServer): def __init__(self, paths, **kwargs): super(CDXServer, self).__init__(**kwargs) - self.sources = create_cdx_sources(paths, kwargs.get('config')) + # TODO: we could save config in member, so that other + # methods can use it. it's bad for add_cdx_source to take + # config argument. 
+ self._create_cdx_sources(paths, kwargs.get('config')) def load_cdx(self, **params): # if key not set, assume 'url' is set and needs canonicalization @@ -105,9 +108,62 @@ class CDXServer(BaseCDXServer): params['end_key'] = end_key cdx_iter = cdx_load(self.sources, params, self.perms_checker) - return self._check_cdx_iter(cdx_iter, params) + def _create_cdx_sources(self, paths, config): + """ + build CDXSource instances for each of path in :param paths:. + :param paths: list of sources or single source. + each source may be either string or CDXSource instance. value + of any other types will be silently ignored. + :param config: config object passed to :method:`add_cdx_source`. + """ + self.sources = [] + + if paths is not None: + if not isinstance(paths, (list, tuple)): + paths = [paths] + + for path in paths: + self.add_cdx_source(path, config) + + if len(self.sources) == 0: + logging.warn('No CDX Sources configured from paths=%s', paths) + + def _add_cdx_source(self, source): + if source is None: return + logging.debug('Adding CDX Source: %s', source) + self.sources.append(source) + + def add_cdx_source(self, source, config): + if source is None: return + if isinstance(source, CDXSource): + self._add_cdx_source(source) + elif isinstance(source, str): + if os.path.isdir(source): + for fn in os.listdir(source): + self._add_cdx_source(self._create_cdx_source( + os.path.join(source, fn), config)) + else: + self._add_cdx_source(self._create_cdx_source( + source, config)) + + def _create_cdx_source(self, filename, config): + if is_http(filename): + return RemoteCDXSource(filename) + + if filename.startswith('redis://'): + return RedisCDXSource(filename, config) + + if filename.endswith('.cdx'): + return CDXFile(filename) + + if filename.endswith('.summary'): + return ZipNumCluster(filename, config) + + logging.warn('skipping unrecognized URI:%s', filename) + return None + def __str__(self): return 'CDX server serving from ' + str(self.sources) @@ -131,12 +187,7 @@ 
class RemoteCDXServer(BaseCDXServer): raise Exception('Invalid remote cdx source: ' + str(source)) def load_cdx(self, **params): - remote_iter = self.source.load_cdx(params) - - # if need raw, convert to raw format here - if params.get('output') == 'raw': - remote_iter = (CDXObject(cdx) for cdx in remote_iter) - + remote_iter = cdx_load((self.sources,), params, filter=False) return self._check_cdx_iter(remote_iter, params) def __str__(self): @@ -169,58 +220,6 @@ def create_cdx_server(config, ds_rules_file=None): ds_rules=ds_rules_file, perms_checker=perms_checker) - -#================================================================= -def create_cdx_sources(paths, config=None): - sources = [] - - if not isinstance(paths, list): - paths = [paths] - - for path in paths: - if isinstance(path, CDXSource): - add_cdx_source(sources, path, config) - elif isinstance(path, str): - if os.path.isdir(path): - for file in os.listdir(path): - add_cdx_source(sources, path + file, config) - else: - add_cdx_source(sources, path, config) - - if len(sources) == 0: - logging.exception('No CDX Sources Found from: ' + str(sources)) - - return sources - - -#================================================================= -def add_cdx_source(sources, source, config): - if not isinstance(source, CDXSource): - source = create_cdx_source(source, config) - if not source: - return - - logging.debug('Adding CDX Source: ' + str(source)) - sources.append(source) - - -#================================================================= -def create_cdx_source(filename, config): - if is_http(filename): - return RemoteCDXSource(filename) - - if filename.startswith('redis://'): - return RedisCDXSource(filename, config) - - if filename.endswith('.cdx'): - return CDXFile(filename) - - if filename.endswith('.summary'): - return ZipNumCluster(filename, config) - - return None - - #================================================================= def extract_params_from_wsgi_env(env): """ utility function to 
extract params from the query diff --git a/pywb/cdx/wsgi_cdxserver.py b/pywb/cdx/wsgi_cdxserver.py index 609928a0..57945904 100644 --- a/pywb/cdx/wsgi_cdxserver.py +++ b/pywb/cdx/wsgi_cdxserver.py @@ -7,6 +7,8 @@ import os import yaml import pkg_resources +import cdxops + #================================================================= CONFIG_FILE = 'config.yaml' @@ -15,10 +17,19 @@ RULES_FILE = 'rules.yaml' DEFAULT_PORT = 8080 #================================================================= + class CDXQueryRequest(BaseRequest): def __init__(self, environ): super(CDXQueryRequest, self).__init__(environ) + def _get_bool(self, name): + v = self.args.get(name) + if v: + try: + v = int(s) + except ValueError as ex: + v = (s.lower() == 'true') + return bool(v) @property def output(self): return self.args.get('output', 'text') @@ -26,13 +37,22 @@ class CDXQueryRequest(BaseRequest): def filter(self): return self.args.getlist('filter', []) @property + def fields(self): + v = self.args.get('fields') + return v.split(',') if v else None + @property + def reverse(self): + # sort=reverse overrides reverse=0 + return (self._get_bool('reverse') or + self.args.get('sort') == 'reverse') + @property def params(self): return dict(t if t[0] == 'filter' else (t[0], t[1][0]) for t in self.args.iterlists()) class WSGICDXServer(object): - def __init__(self, paths, rules_file): - self.cdxserver = create_cdx_server(paths, rules_file) + def __init__(self, config, rules_file): + self.cdxserver = create_cdx_server(config, rules_file) def __call__(self, environ, start_response): request = CDXQueryRequest(environ) @@ -41,7 +61,7 @@ class WSGICDXServer(object): result = self.cdxserver.load_cdx(**request.params) # TODO: select response type by "output" parameter - response = PlainTextResponse(result) + response = PlainTextResponse(result, request.fields) return response(environ, start_response) except Exception as exc: logging.error('load_cdx failed', exc_info=1) @@ -50,25 +70,38 @@ class 
WSGICDXServer(object): start_response('400 Error', [('Content-Type', 'text/plain')]) return [str(exc)] +def cdx_text_out(cdx, fields): + if not fields: + return str(cdx) + '\n' + else: + logging.info('cdx fields=%s', cdx.keys()) + # TODO: this will results in an exception if fields contain + # non-existent field name. + return ' '.join(cdx[x] for x in fields) + '\n' + class PlainTextResponse(BaseResponse): - def __init__(self, cdxitr, status=200, content_type='text/plain'): + def __init__(self, cdxitr, fields, status=200, content_type='text/plain'): super(PlainTextResponse, self).__init__( - response=cdxitr, + response=( + cdx_text_out(cdx, fields) + for cdx in cdxitr + ), status=status, content_type=content_type) - + # class JsonResponse(Response): # pass # class MementoResponse(Response): # pass -def create_app(paths=None): +def create_app(config=None): logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s', level=logging.DEBUG) - if not paths: - paths = config or get_test_dir() + 'cdx/' + if not config: + index_paths = get_test_dir() + 'cdx/' + config = dict(index_paths=index_paths) - return WSGICDXServer(paths, RULES_FILE) + return WSGICDXServer(config, RULES_FILE) if __name__ == "__main__": from optparse import OptionParser @@ -86,7 +119,7 @@ if __name__ == "__main__": if port is None: port = (config and config.get('port')) or DEFAULT_PORT - app = create_app() + app = create_app(config) logging.debug('Starting CDX Server on port %s', port) try: @@ -95,4 +128,5 @@ if __name__ == "__main__": pass logging.debug('Stopping CDX Server') else: + # XXX pass production config application = create_app() diff --git a/tests/test_wsgi_cdxserver.py b/tests/test_wsgi_cdxserver.py new file mode 100644 index 00000000..333b8a8b --- /dev/null +++ b/tests/test_wsgi_cdxserver.py @@ -0,0 +1,197 @@ +import os +import re + +import pytest +from urllib import urlencode + +from werkzeug.test import Client +from werkzeug.wrappers import BaseResponse, Response + +import 
yaml + +from pywb.cdx.cdxobject import CDXObject +from pywb.cdx.wsgi_cdxserver import create_app + +@pytest.fixture +def testconfig(): + config = yaml.load(open('test_config.yaml')) + assert config + if 'index_paths' not in config: + config['index_paths'] = os.path.join( + os.path.dirname(os.path.realpath(__file__)), + '../sample_archive/cdx') + return config + +@pytest.fixture +def client(testconfig): + app = create_app(testconfig) + return Client(app, Response) + +# ================================================================ + +def query(client, url, **params): + params['url'] = url + return client.get('/cdx?' + urlencode(params, doseq=1)) + +# ================================================================ + +def test_exact_url(client): + """ + basic exact match, no filters, etc. + """ + resp = query(client, 'http://www.iana.org/') + + assert resp.status_code == 200 + print resp.data + +def test_prefix_match(client): + """ + prefix match test + """ + resp = query(client, 'http://www.iana.org/', matchType='prefix') + + print resp.data.splitlines() + assert resp.status_code == 200 + + suburls = 0 + for l in resp.data.splitlines(): + fields = l.split(' ') + if len(fields[0]) > len('org,iana)/'): + suburls += 1 + assert suburls > 0 + +def test_filters(client): + """ + filter cdxes by mimetype and filename field, exact match. 
+ """ + resp = query(client, 'http://www.iana.org/_css/2013.1/screen.css', + filter=('mimetype:warc/revisit', 'filename:dupes.warc.gz')) + + assert resp.status_code == 200 + assert resp.mimetype == 'text/plain' + + for l in resp.data.splitlines(): + fields = l.split(' ') + assert fields[0] == 'org,iana)/_css/2013.1/screen.css' + assert fields[3] == 'warc/revisit' + assert fields[10] == 'dupes.warc.gz' + +def test_limit(client): + resp = query(client, 'http://www.iana.org/_css/2013.1/screen.css', + limit='1') + + assert resp.status_code == 200 + assert resp.mimetype == 'text/plain' + + cdxes = resp.data.splitlines() + assert len(cdxes) == 1 + fields = cdxes[0].split(' ') + assert fields[0] == 'org,iana)/_css/2013.1/screen.css' + assert fields[1] == '20140126200625' + assert fields[3] == 'text/css' + + resp = query(client, 'http://www.iana.org/_css/2013.1/screen.css', + limit='1', reverse='1') + + assert resp.status_code == 200 + assert resp.mimetype == 'text/plain' + + cdxes = resp.data.splitlines() + assert len(cdxes) == 1 + fields = cdxes[0].split(' ') + assert fields[0] == 'org,iana)/_css/2013.1/screen.css' + assert fields[1] == '20140127171239' + assert fields[3] == 'warc/revisit' + +def test_fields(client): + """ + retrieve subset of fields with ``fields`` parameter. + """ + resp = query(client, 'http://www.iana.org/_css/2013.1/print.css', + fields='urlkey,timestamp,statuscode') + + assert resp.status_code == 200 + + cdxes = resp.data.splitlines() + + for cdx in cdxes: + fields = cdx.split(' ') + assert len(fields) == 3 + assert fields[0] == 'org,iana)/_css/2013.1/print.css' + assert re.match(r'\d{14}$', fields[1]) + assert re.match(r'\d{3}|-', fields[2]) + +def test_fields_undefined(client): + """ + server shall respond with Bad Request (TODO: with proper explanation), + when ``fields`` parameter contains undefined name(s). 
+ """ + resp = query(client, 'http://www.iana.org/_css/2013.1/print.css', + fields='urlkey,nosuchfield') + + resp.status_code == 400 + +def test_resolveRevisits(client): + """ + with ``resolveRevisits=true``, server adds three fields pointing to + the *original* capture. + """ + resp = query(client, 'http://www.iana.org/_css/2013.1/print.css', + resolveRevisits='true' + ) + assert resp.status_code == 200 + assert resp.mimetype == 'text/plain' + + cdxes = resp.data.splitlines() + originals = {} + for cdx in cdxes: + fields = cdx.split(' ') + assert len(fields) == 14 + (key, ts, url, mt, st, sha, _, _, size, offset, fn, + orig_size, orig_offset, orig_fn) = fields + # orig_* fields are either all '-' or (int, int, filename) + # check if orig_* fields are equals to corresponding fields + # for the original capture. + if orig_size == '-': + assert orig_offset == '-' and orig_fn == '-' + originals[sha] = (int(size), int(offset), fn) + else: + orig = originals.get(sha) + assert orig == (int(orig_size), int(orig_offset), orig_fn) + +def test_resolveRevisits_orig_fields(client): + """ + when resolveRevisits=true, extra three fields are named + ``orig.length``, ``orig.offset`` and ``orig.filename``, respectively. + it is possible to filter fields by these names. 
+ """ + resp = query(client, 'http://www.iana.org/_css/2013.1/print.css', + resolveRevisits='1', + fields='urlkey,orig.length,orig.offset,orig.filename' + ) + assert resp.status_code == 200 + assert resp.mimetype == 'text/plain' + + cdxes = resp.data.splitlines() + for cdx in cdxes: + fields = cdx.split(' ') + assert len(fields) == 4 + key, orig_len, orig_offset, orig_fn = fields + assert (orig_len == '-' and orig_offset == '-' and orig_fn == '-' or + (int(orig_len), int(orig_offset), orig_fn)) + +def test_collapseTime_resolveRevisits_reverse(client): + resp = query(client, 'http://www.iana.org/_css/2013.1/print.css', + collapseTime='11', + resolveRevisits='true', + reverse='true' + ) + + cdxes = [CDXObject(l) for l in resp.data.splitlines()] + + assert len(cdxes) == 3 + + # timestamp is in descending order + for i in range(len(cdxes) - 1): + assert cdxes[i]['timestamp'] >= cdxes[i + 1]['timestamp'] + From 9eda5ad97e5ea29567828a3ca890048201978a6c Mon Sep 17 00:00:00 2001 From: Kenji Nagahashi Date: Fri, 28 Feb 2014 01:39:04 +0000 Subject: [PATCH 4/8] address test cases broken by previous commit. move py.test fixture and fixture classes (TestExclusionPerms, PrintReporter) to tests.fixture module. update test_config.yaml accordingly. 
--- pywb/cdx/cdxobject.py | 17 +++++++++ pywb/cdx/cdxops.py | 3 +- pywb/cdx/cdxserver.py | 3 +- pywb/cdx/test/cdxserver_test.py | 28 ++++++++++++-- pywb/cdx/test/wsgi_cdxserver_test.py | 4 +- pywb/cdx/wsgi_cdxserver.py | 3 +- pywb/indexreader.py | 2 +- test_config.yaml | 4 +- tests/fixture.py | 56 ++++++++++++++++++++++++++++ tests/test_integration.py | 44 ++-------------------- tests/test_wsgi_cdxserver.py | 10 +---- 11 files changed, 111 insertions(+), 63 deletions(-) create mode 100644 tests/fixture.py diff --git a/pywb/cdx/cdxobject.py b/pywb/cdx/cdxobject.py index 1b38722a..8fed07af 100644 --- a/pywb/cdx/cdxobject.py +++ b/pywb/cdx/cdxobject.py @@ -75,6 +75,16 @@ class CDXObject(OrderedDict): return (self['mimetype'] == 'warc/revisit' or self['filename'] == '-') + def to_text(self, fields=None): + """ + return plaintext CDX record (includes newline). + :param fields: list of field names to output. + """ + if fields is None: + return str(self) + '\n' + else: + return ' '.join(self[x] for x in fields) + '\n' + def __str__(self): if self.cdxline: return self.cdxline @@ -109,5 +119,12 @@ class IDXObject(OrderedDict): self.idxline = idxline + def to_text(self, fields=None): + """ + return plaintext IDX record (including newline). + :param fields: list of field names to output (currently ignored) + """ + return str(self) + '\n' + def __str__(self): return self.idxline diff --git a/pywb/cdx/cdxops.py b/pywb/cdx/cdxops.py index b24df6bf..2c2c30af 100644 --- a/pywb/cdx/cdxops.py +++ b/pywb/cdx/cdxops.py @@ -10,10 +10,11 @@ from collections import deque #================================================================= -def cdx_load(sources, params, filter=True, perms_checker=None): +def cdx_load(sources, params, perms_checker=None, filter=True): """ merge text CDX lines from sources, return an iterator for filtered and access-checked sequence of CDX objects. + :param sources: iterable for text CDX sources. 
:param perms_checker: access check filter object implementing allow_url_lookup(key, url), allow_capture(cdxobj) and diff --git a/pywb/cdx/cdxserver.py b/pywb/cdx/cdxserver.py index b102aff1..83627009 100644 --- a/pywb/cdx/cdxserver.py +++ b/pywb/cdx/cdxserver.py @@ -107,7 +107,8 @@ class CDXServer(BaseCDXServer): params['key'] = key params['end_key'] = end_key - cdx_iter = cdx_load(self.sources, params, self.perms_checker) + cdx_iter = cdx_load(self.sources, params, + perms_checker=self.perms_checker) return self._check_cdx_iter(cdx_iter, params) def _create_cdx_sources(self, paths, config): diff --git a/pywb/cdx/test/cdxserver_test.py b/pywb/cdx/test/cdxserver_test.py index 0e799ce9..ad9286bf 100644 --- a/pywb/cdx/test/cdxserver_test.py +++ b/pywb/cdx/test/cdxserver_test.py @@ -153,21 +153,41 @@ import pprint from pywb import get_test_dir #test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample_data/' test_cdx_dir = get_test_dir() + 'cdx/' +from pywb.cdx.cdxobject import AccessException + +from tests.fixture import testconfig, TestExclusionPerms + +import pytest def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams): kwparams['url'] = url - kwparams['output'] = 'text' + fields = kwparams.get('fields') + if fields: + fields = fields.split(',') server = CDXServer(sources) results = server.load_cdx(**kwparams) for x in results: - sys.stdout.write(x) + sys.stdout.write(x.to_text(fields)) +#================================================================ + +def test_excluded(testconfig): + testconfig['perms_checker'] = TestExclusionPerms() + sources = testconfig.get('index_paths') + print sources + server = CDXServer(sources, perms_checker=testconfig['perms_checker']) + assert isinstance(server, CDXServer) + assert server.perms_checker + + url = 'http://www.iana.org/_img/bookmark_icon.ico' + key = 'org,iana)/_img/bookmark_icon.ico' + with pytest.raises(AccessException): + cdxobjs = list(server.load_cdx(url=url)) + print cdxobjs if 
__name__ == "__main__": import doctest doctest.testmod() - - diff --git a/pywb/cdx/test/wsgi_cdxserver_test.py b/pywb/cdx/test/wsgi_cdxserver_test.py index 70c4fe71..a7d1ecdb 100644 --- a/pywb/cdx/test/wsgi_cdxserver_test.py +++ b/pywb/cdx/test/wsgi_cdxserver_test.py @@ -1,10 +1,10 @@ import webtest -from pywb.cdx.wsgi_cdxserver import main +from pywb.cdx.wsgi_cdxserver import create_app from pywb import get_test_dir class TestCdx: def setup(self): - self.app = main(get_test_dir() + 'cdx/') + self.app = create_app(get_test_dir() + 'cdx/') self.testapp = webtest.TestApp(self.app) def test_cdx(self): diff --git a/pywb/cdx/wsgi_cdxserver.py b/pywb/cdx/wsgi_cdxserver.py index 57945904..c138e5c6 100644 --- a/pywb/cdx/wsgi_cdxserver.py +++ b/pywb/cdx/wsgi_cdxserver.py @@ -83,8 +83,7 @@ class PlainTextResponse(BaseResponse): def __init__(self, cdxitr, fields, status=200, content_type='text/plain'): super(PlainTextResponse, self).__init__( response=( - cdx_text_out(cdx, fields) - for cdx in cdxitr + cdx.to_text(fields) for cdx in cdxitr ), status=status, content_type=content_type) diff --git a/pywb/indexreader.py b/pywb/indexreader.py index cea27a8f..a422d0b4 100644 --- a/pywb/indexreader.py +++ b/pywb/indexreader.py @@ -30,7 +30,7 @@ class IndexReader(object): params['allowFuzzy'] = True - cdxlines = self.load_cdx(url=wburl.url, output='raw', **params) + cdxlines = self.load_cdx(url=wburl.url, **params) return cdxlines diff --git a/test_config.yaml b/test_config.yaml index 8421aead..20e52933 100644 --- a/test_config.yaml +++ b/test_config.yaml @@ -92,10 +92,10 @@ enable_cdx_api: true # optional reporter callback func # if set, called with request and cdx object -reporter: !!python/object/new:tests.test_integration.PrintReporter [] +reporter: !!python/object/new:tests.fixture.PrintReporter [] # custom rules for domain specific matching #domain_specific_rules: rules.yaml #perms_checker: !!python/object/new:pywb.cdx.perms.AllowAllPerms [] -perms_checker: 
!!python/object/new:tests.test_integration.TestExclusionPerms [] +perms_checker: !!python/object/new:tests.fixture.TestExclusionPerms [] diff --git a/tests/fixture.py b/tests/fixture.py new file mode 100644 index 00000000..ea495bb7 --- /dev/null +++ b/tests/fixture.py @@ -0,0 +1,56 @@ +import os +import pytest + +import yaml + +@pytest.fixture +def testconfig(): + config = yaml.load(open('test_config.yaml')) + assert config + if 'index_paths' not in config: + # !!! assumes this module is in a sub-directory of project root. + config['index_paths'] = os.path.join( + os.path.dirname(os.path.realpath(__file__)), + '../sample_archive/cdx') + return config + +#================================================================ +# Reporter callback for replay view +class PrintReporter: + """Reporter callback for replay view. + """ + def __call__(self, wbrequest, cdx, response): + print wbrequest + print cdx + pass + +#================================================================ +class TestExclusionPerms: + """ + Perm Checker fixture which can block one URL. + """ + # sample_archive has captures for this URLKEY + URLKEY_EXCLUDED = 'org,iana)/_img/bookmark_icon.ico' + + def allow_url_lookup(self, urlkey, url): + """ + Return true/false if url or urlkey (canonicalized url) + should be allowed + """ + print "allow_url_lookup:urlkey={}".format(urlkey) + if urlkey == self.URLKEY_EXCLUDED: + return False + + return True + + def allow_capture(self, cdx): + """ + Return True if specified capture (cdx) is allowed. 
+ """ + return True + + def filter_fields(self, cdx): + """ + Filter out any forbidden cdx fields from cdx object + """ + return cdx diff --git a/tests/test_integration.py b/tests/test_integration.py index 1a7a943c..805759da 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -8,7 +8,9 @@ class TestWb: def setup(self): #self.app = pywb.wbapp.create_wb_app(pywb.pywb_init.pywb_config()) - self.app = create_wb_app(pywb_config(self.TEST_CONFIG)) + # save it in self - useful for debugging + self.router = pywb_config(self.TEST_CONFIG) + self.app = create_wb_app(self.router) self.testapp = webtest.TestApp(self.app) def _assert_basic_html(self, resp): @@ -193,43 +195,3 @@ class TestWb: resp = self.testapp.get('/pywb/?abc', status = 400) assert resp.status_int == 400 assert 'Invalid Url: http://?abc' in resp.body - -#================================================================= -# Reporter callback for replay view -class PrintReporter: - def __call__(self, wbrequest, cdx, response): - print wbrequest - print cdx - pass - -#================================================================= -class TestExclusionPerms: - """ - Sample Perm Checker which allows all - """ - def allow_url_lookup(self, urlkey, url): - """ - Return true/false if url or urlkey (canonicalized url) - should be allowed - """ - print urlkey - if urlkey == 'org,iana)/_img/bookmark_icon.ico': - return False - - return True - - def allow_capture(self, cdx): - """ - Return true/false is specified capture (cdx) should be - allowed - """ - return True - - def filter_fields(self, cdx): - """ - Filter out any forbidden cdx fields from cdx dictionary - """ - return cdx - - - diff --git a/tests/test_wsgi_cdxserver.py b/tests/test_wsgi_cdxserver.py index 333b8a8b..8eee2484 100644 --- a/tests/test_wsgi_cdxserver.py +++ b/tests/test_wsgi_cdxserver.py @@ -12,15 +12,7 @@ import yaml from pywb.cdx.cdxobject import CDXObject from pywb.cdx.wsgi_cdxserver import create_app -@pytest.fixture -def 
testconfig(): - config = yaml.load(open('test_config.yaml')) - assert config - if 'index_paths' not in config: - config['index_paths'] = os.path.join( - os.path.dirname(os.path.realpath(__file__)), - '../sample_archive/cdx') - return config +from tests.fixture import testconfig @pytest.fixture def client(testconfig): From 355fa326008240d56b5be85b9804891c7739ae04 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 1 Mar 2014 08:41:24 -0800 Subject: [PATCH 5/8] cdx: refactor to create seperate CDXQuery object for wrapping params passed to load_cdx() --- pywb/cdx/cdxdomainspecific.py | 13 ++-- pywb/cdx/cdxobject.py | 123 ++++++++++++++++++++++++++++++++++ pywb/cdx/cdxops.py | 58 ++++++++-------- pywb/cdx/cdxserver.py | 75 +++++++-------------- pywb/cdx/cdxsource.py | 26 +++---- pywb/cdx/wsgi_cdxserver.py | 42 +++--------- pywb/cdx/zipnum.py | 28 ++++---- pywb/handlers.py | 6 +- pywb/indexreader.py | 3 + setup.py | 2 - 10 files changed, 223 insertions(+), 153 deletions(-) diff --git a/pywb/cdx/cdxdomainspecific.py b/pywb/cdx/cdxdomainspecific.py index 7295bf0c..9890605e 100644 --- a/pywb/cdx/cdxdomainspecific.py +++ b/pywb/cdx/cdxdomainspecific.py @@ -7,6 +7,7 @@ from pywb.utils.dsrules import BaseRule, RuleSet from pywb.utils.canonicalize import unsurt, UrlCanonicalizer +from cdxobject import CDXQuery #================================================================= def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered): @@ -70,13 +71,13 @@ class FuzzyQuery: def __init__(self, rules): self.rules = rules - def __call__(self, params): + def __call__(self, query): matched_rule = None - urlkey = params['key'] - url = params['url'] - filter_ = params.get('filter', []) - output = params.get('output') + urlkey = query.key + url = query.url + filter_ = query.filters + output = query.output for rule in self.rules.iter_matching(urlkey): m = rule.regex.search(urlkey) @@ -102,7 +103,7 @@ class FuzzyQuery: 'filter': filter_, 'output': output} - return params + return 
CDXQuery(**params) #================================================================= diff --git a/pywb/cdx/cdxobject.py b/pywb/cdx/cdxobject.py index 8fed07af..682e8c74 100644 --- a/pywb/cdx/cdxobject.py +++ b/pywb/cdx/cdxobject.py @@ -1,6 +1,9 @@ from collections import OrderedDict import itertools +from urllib import urlencode +from urlparse import parse_qs + #================================================================= class CDXException(Exception): @@ -20,6 +23,126 @@ class AccessException(CDXException): return '403 Access Denied' +#================================================================= +class CDXQuery(object): + def __init__(self, **kwargs): + self.params = kwargs + + @property + def key(self): + return self.params['key'] + + @property + def end_key(self): + return self.params['end_key'] + + def set_key(self, key, end_key): + self.params['key'] = key + self.params['end_key'] = end_key + + @property + def url(self): + try: + return self.params['url'] + except KeyError: + msg = 'A url= param must be specified to query the cdx server' + raise CDXException(msg) + + @property + def match_type(self): + return self.params.get('matchType', 'exact') + + @property + def is_exact(self): + return self.match_type == 'exact' + + @property + def allow_fuzzy(self): + return self._get_bool('allowFuzzy') + + @property + def output(self): + return self.params.get('output', 'text') + + @property + def limit(self): + return int(self.params.get('limit', 100000)) + + @property + def collapse_time(self): + return self.params.get('collapseTime') + + @property + def resolve_revisits(self): + return self._get_bool('resolveRevisits') + + @property + def filters(self): + return self.params.get('filter', []) + + @property + def fields(self): + v = self.params.get('fields') + return v.split(',') if v else None + + @property + def closest(self): + # sort=closest is not required + return self.params.get('closest') + + @property + def reverse(self): + # sort=reverse overrides 
reverse=0 + return (self._get_bool('reverse') or + self.params.get('sort') == 'reverse') + + @property + def secondary_index_only(self): + return self._get_bool('showPagedIndex') + + @property + def process(self): + return self._get_bool('processOps', True) + + def set_process(self, process): + self.params['processOps'] = process + + def _get_bool(self, name, def_val=False): + v = self.params.get(name) + if v: + try: + v = int(v) + except ValueError as ex: + v = (v.lower() == 'true') + else: + v = def_val + + return bool(v) + + def urlencode(self): + return urlencode(self.params, True) + + @staticmethod + def from_wsgi_env(env): + """ utility function to extract params and create a CDXQuery + from a WSGI environment dictionary + """ + params = parse_qs(env['QUERY_STRING']) + + if not 'output' in params: + params['output'] = 'text' + + # parse_qs produces arrays for single values + # cdx processing expects singleton params for all params, + # except filters, so convert here + # use first value of the list + for name, val in params.iteritems(): + if name != 'filter': + params[name] = val[0] + + return CDXQuery(**params) + + #================================================================= class CDXObject(OrderedDict): CDX_FORMATS = [ diff --git a/pywb/cdx/cdxops.py b/pywb/cdx/cdxops.py index 4bdb0a55..72d69417 100644 --- a/pywb/cdx/cdxops.py +++ b/pywb/cdx/cdxops.py @@ -1,4 +1,4 @@ -from cdxobject import CDXObject, IDXObject, AccessException +from cdxobject import CDXObject, IDXObject, AccessException, CDXQuery from pywb.utils.timeutils import timestamp_to_sec import bisect @@ -10,7 +10,7 @@ from collections import deque #================================================================= -def cdx_load(sources, params, perms_checker=None, filter=True): +def cdx_load(sources, query, perms_checker=None, process=True): """ merge text CDX lines from sources, return an iterator for filtered and access-checked sequence of CDX objects. 
@@ -19,25 +19,30 @@ def cdx_load(sources, params, perms_checker=None, filter=True): :param perms_checker: access check filter object implementing allow_url_lookup(key, url), allow_capture(cdxobj) and filter_fields(cdxobj) methods. + :param process: bool, perform processing sorting/filtering/grouping ops """ - cdx_iter = load_cdx_streams(sources, params) - cdx_iter = make_obj_iter(cdx_iter, params) - cdx_iter = filter_cdx(cdx_iter, params) + cdx_iter = load_cdx_streams(sources, query) + cdx_iter = make_obj_iter(cdx_iter, query) + + if process and query.process: + cdx_iter = process_cdx(cdx_iter, query) + if perms_checker: - cdx_iter = restrict_cdx(cdx_iter, params, perms_checker) + cdx_iter = restrict_cdx(cdx_iter, query, perms_checker) + return cdx_iter #================================================================= -def restrict_cdx(cdx_iter, params, perms_checker): +def restrict_cdx(cdx_iter, query, perms_checker): """ filter out those cdx records that user doesn't have access to, by consulting :param perms_checker:. 
:param cdx_iter: cdx record source iterable - :param params: request parameters (dict) + :param query: request parameters (CDXQuery) :param perms_checker: object implementing permission checker """ - if not perms_checker.allow_url_lookup(params['key'], params['url']): - if params.get('matchType', 'exact') == 'exact': + if not perms_checker.allow_url_lookup(query.key, query.url): + if query.is_exact: raise AccessException('Excluded') for cdx in cdx_iter: @@ -51,31 +56,26 @@ def restrict_cdx(cdx_iter, params, perms_checker): yield cdx #================================================================= -def filter_cdx(cdx_iter, params): - if params.get('proxyAll'): - return cdx_iter - - resolve_revisits = params.get('resolveRevisits', False) - if resolve_revisits: +def process_cdx(cdx_iter, query): + if query.resolve_revisits: cdx_iter = cdx_resolve_revisits(cdx_iter) - filters = params.get('filter', None) + filters = query.filters if filters: cdx_iter = cdx_filter(cdx_iter, filters) - collapse_time = params.get('collapseTime', None) + collapse_time = query.collapse_time if collapse_time: cdx_iter = cdx_collapse_time_status(cdx_iter, collapse_time) - limit = int(params.get('limit', 1000000)) + limit = query.limit - reverse = params.get('reverse', False) or params.get('sort') == 'reverse' - if reverse: + if query.reverse: cdx_iter = cdx_reverse(cdx_iter, limit) - closest_to = params.get('closest', None) - if closest_to: - cdx_iter = cdx_sort_closest(closest_to, cdx_iter, limit) + closest = query.closest + if closest: + cdx_iter = cdx_sort_closest(closest, cdx_iter, limit) if limit: cdx_iter = cdx_limit(cdx_iter, limit) @@ -85,21 +85,21 @@ def filter_cdx(cdx_iter, params): #================================================================= # load and source merge cdx streams -def load_cdx_streams(sources, params): +def load_cdx_streams(sources, query): # Optimize: no need to merge if just one input if len(sources) == 1: - return sources[0].load_cdx(params) + return 
sources[0].load_cdx(query) - source_iters = map(lambda src: src.load_cdx(params), sources) + source_iters = map(lambda src: src.load_cdx(query), sources) merged_stream = merge(*(source_iters)) return merged_stream #================================================================= # convert text cdx stream to CDXObject/IDXObject -def make_obj_iter(text_iter, params): +def make_obj_iter(text_iter, query): # already converted - if params.get('showPagedIndex'): + if query.secondary_index_only: cls = IDXObject else: cls = CDXObject diff --git a/pywb/cdx/cdxserver.py b/pywb/cdx/cdxserver.py index 0de183ae..c3874a93 100644 --- a/pywb/cdx/cdxserver.py +++ b/pywb/cdx/cdxserver.py @@ -3,7 +3,7 @@ from pywb.utils.canonicalize import UrlCanonicalizer, calc_search_range from cdxops import cdx_load from cdxsource import CDXSource, CDXFile, RemoteCDXSource, RedisCDXSource from zipnum import ZipNumCluster -from cdxobject import CDXObject, CaptureNotFoundException, CDXException +from cdxobject import CDXObject, CaptureNotFoundException, CDXException, CDXQuery from cdxdomainspecific import load_domain_specific_cdx_rules from pywb.utils.loaders import is_http @@ -36,7 +36,7 @@ class BaseCDXServer(object): # set perms checker, if any self.perms_checker = kwargs.get('perms_checker') - def _check_cdx_iter(self, cdx_iter, params): + def _check_cdx_iter(self, cdx_iter, query): """ Check cdx iter semantics If iter is empty (no matches), check if fuzzy matching is allowed, and try it -- otherwise, @@ -48,21 +48,23 @@ class BaseCDXServer(object): if cdx_iter: return cdx_iter - url = params['url'] - # check if fuzzy is allowed and ensure that its an # exact match - if (self.fuzzy_query and params.get('allowFuzzy') and - params.get('matchType', 'exact') == 'exact'): + if (self.fuzzy_query and + query.allow_fuzzy and + query.is_exact): - fuzzy_params = self.fuzzy_query(params) - if fuzzy_params: - return self.load_cdx(**fuzzy_params) + fuzzy_query_params = self.fuzzy_query(query) + if 
fuzzy_query_params: + return self.load_cdx_query(fuzzy_query_params) - msg = 'No Captures found for: ' + url + msg = 'No Captures found for: ' + query.url raise CaptureNotFoundException(msg) def load_cdx(self, **params): + return self.load_cdx_query(CDXQuery(**params)) + + def load_cdx_query(self, query): raise NotImplementedError('Implement in subclass') @staticmethod @@ -89,26 +91,18 @@ class CDXServer(BaseCDXServer): # config argument. self._create_cdx_sources(paths, kwargs.get('config')) - def load_cdx(self, **params): - # if key not set, assume 'url' is set and needs canonicalization - if not params.get('key'): - try: - url = params['url'] - except KeyError: - msg = 'A url= param must be specified to query the cdx server' - raise CDXException(msg) + def load_cdx_query(self, query): + url = query.url + key, end_key = calc_search_range(url=url, + match_type=query.match_type, + url_canon=self.url_canon) + query.set_key(key, end_key) - match_type = params.get('matchType', 'exact') - - key, end_key = calc_search_range(url=url, - match_type=match_type, - url_canon=self.url_canon) - params['key'] = key - params['end_key'] = end_key - - cdx_iter = cdx_load(self.sources, params, + cdx_iter = cdx_load(self.sources, + query, perms_checker=self.perms_checker) - return self._check_cdx_iter(cdx_iter, params) + + return self._check_cdx_iter(cdx_iter, query) def _create_cdx_sources(self, paths, config): """ @@ -186,9 +180,9 @@ class RemoteCDXServer(BaseCDXServer): else: raise Exception('Invalid remote cdx source: ' + str(source)) - def load_cdx(self, **params): - remote_iter = cdx_load((self.sources,), params, filter=False) - return self._check_cdx_iter(remote_iter, params) + def load_cdx_query(self, query): + remote_iter = cdx_load(self.sources, query, process=False) + return self._check_cdx_iter(remote_iter, query) def __str__(self): return 'Remote CDX server serving from ' + str(self.sources[0]) @@ -220,23 +214,4 @@ def create_cdx_server(config, ds_rules_file=None): 
ds_rules_file=ds_rules_file, perms_checker=perms_checker) -#================================================================= -def extract_params_from_wsgi_env(env): - """ utility function to extract params from the query - string of a WSGI environment dictionary - """ - # use url= param to get actual url - params = urlparse.parse_qs(env['QUERY_STRING']) - if not 'output' in params: - params['output'] = 'text' - - # parse_qs produces arrays for single values - # cdx processing expects singleton params for all params, - # except filters, so convert here - # use first value of the list - for name, val in params.iteritems(): - if name != 'filter': - params[name] = val[0] - - return params diff --git a/pywb/cdx/cdxsource.py b/pywb/cdx/cdxsource.py index ba5f8b3b..c17312c0 100644 --- a/pywb/cdx/cdxsource.py +++ b/pywb/cdx/cdxsource.py @@ -12,7 +12,7 @@ class CDXSource(object): """ Represents any cdx index source """ - def load_cdx(self, params): + def load_cdx(self, query): raise NotImplementedError('Implement in subclass') @@ -24,9 +24,9 @@ class CDXFile(CDXSource): def __init__(self, filename): self.filename = filename - def load_cdx(self, params): + def load_cdx(self, query): source = SeekableTextFileReader(self.filename) - return iter_range(source, params.get('key'), params.get('end_key')) + return iter_range(source, query.key, query.end_key) def __str__(self): return 'CDX File - ' + self.filename @@ -45,20 +45,16 @@ class RemoteCDXSource(CDXSource): self.cookie = cookie self.proxy_all = proxy_all - def load_cdx(self, proxy_params): + def load_cdx(self, query): if self.proxy_all: - params = proxy_params - params['proxyAll'] = True + query.set_process(False) + remote_query = query else: # Only send url and matchType params to remote - params = {} - params['url'] = proxy_params['url'] - match_type = proxy_params.get('matchType') + remote_query = CDXQuery(url=query.url, + match_type=query.matchType) - if match_type: - proxy_params['matchType'] = match_type - - 
urlparams = urllib.urlencode(params, True) + urlparams = remote_query.urlencode() try: request = urllib2.Request(self.remote_url, urlparams) @@ -97,14 +93,14 @@ class RedisCDXSource(CDXSource): self.key_prefix = config.get('redis_key_prefix', self.key_prefix) - def load_cdx(self, params): + def load_cdx(self, query): """ Load cdx from redis cache, from an ordered list Currently, there is no support for range queries Only 'exact' matchType is supported """ - key = params['key'] + key = query.key # ensure only url/surt is part of key key = key.split(' ')[0] diff --git a/pywb/cdx/wsgi_cdxserver.py b/pywb/cdx/wsgi_cdxserver.py index c138e5c6..e6ab6067 100644 --- a/pywb/cdx/wsgi_cdxserver.py +++ b/pywb/cdx/wsgi_cdxserver.py @@ -1,6 +1,7 @@ -from werkzeug.wrappers import BaseRequest, BaseResponse +from werkzeug.wrappers import BaseResponse from cdxserver import create_cdx_server from pywb import get_test_dir +from cdxobject import CDXQuery import logging import os @@ -18,37 +19,10 @@ DEFAULT_PORT = 8080 #================================================================= -class CDXQueryRequest(BaseRequest): +class CDXQueryRequest(object): def __init__(self, environ): - super(CDXQueryRequest, self).__init__(environ) + self.query = CDXQuery.from_wsgi_env(environ) - def _get_bool(self, name): - v = self.args.get(name) - if v: - try: - v = int(s) - except ValueError as ex: - v = (s.lower() == 'true') - return bool(v) - @property - def output(self): - return self.args.get('output', 'text') - @property - def filter(self): - return self.args.getlist('filter', []) - @property - def fields(self): - v = self.args.get('fields') - return v.split(',') if v else None - @property - def reverse(self): - # sort=reverse overrides reverse=0 - return (self._get_bool('reverse') or - self.args.get('sort') == 'reverse') - @property - def params(self): - return dict(t if t[0] == 'filter' else (t[0], t[1][0]) - for t in self.args.iterlists()) class WSGICDXServer(object): def __init__(self, config, 
rules_file): @@ -57,11 +31,11 @@ class WSGICDXServer(object): def __call__(self, environ, start_response): request = CDXQueryRequest(environ) try: - logging.debug('request.args=%s', request.params) - result = self.cdxserver.load_cdx(**request.params) + logging.debug('request.args=%s', request.query) + result = self.cdxserver.load_cdx_query(request.query) # TODO: select response type by "output" parameter - response = PlainTextResponse(result, request.fields) + response = PlainTextResponse(result, request.query.fields) return response(environ, start_response) except Exception as exc: logging.error('load_cdx failed', exc_info=1) @@ -74,7 +48,7 @@ def cdx_text_out(cdx, fields): if not fields: return str(cdx) + '\n' else: - logging.info('cdx fields=%s', cdx.keys()) + logging.info('cdx fields=%s', cdx.keys) # TODO: this will results in an exception if fields contain # non-existent field name. return ' '.join(cdx[x] for x in fields) + '\n' diff --git a/pywb/cdx/zipnum.py b/pywb/cdx/zipnum.py index 847c660f..fbb1503f 100644 --- a/pywb/cdx/zipnum.py +++ b/pywb/cdx/zipnum.py @@ -110,21 +110,21 @@ class ZipNumCluster(CDXSource): def lookup_loc(self, part): return self.loc_map[part] - def load_cdx(self, params): + def load_cdx(self, query): self.reload_loc() reader = SeekableTextFileReader(self.summary) idx_iter = iter_range(reader, - params['key'], - params['end_key'], + query.key, + query.end_key, prev_size=1) - if params.get('showPagedIndex'): - params['proxyAll'] = True + if query.secondary_index_only: + query.set_process(False) return idx_iter else: - blocks = self.idx_to_cdx(idx_iter, params) + blocks = self.idx_to_cdx(idx_iter, query) def gen_cdx(): for blk in blocks: @@ -133,7 +133,7 @@ class ZipNumCluster(CDXSource): return gen_cdx() - def idx_to_cdx(self, idx_iter, params): + def idx_to_cdx(self, idx_iter, query): blocks = None ranges = [] @@ -150,7 +150,7 @@ class ZipNumCluster(CDXSource): else: if blocks: - yield self.block_to_cdx_iter(blocks, ranges, params) + 
yield self.block_to_cdx_iter(blocks, ranges, query) blocks = ZipBlocks(idx['part'], idx['offset'], @@ -160,15 +160,15 @@ class ZipNumCluster(CDXSource): ranges = [blocks.length] if blocks: - yield self.block_to_cdx_iter(blocks, ranges, params) + yield self.block_to_cdx_iter(blocks, ranges, query) - def block_to_cdx_iter(self, blocks, ranges, params): + def block_to_cdx_iter(self, blocks, ranges, query): last_exc = None last_traceback = None for location in self.lookup_loc(blocks.part): try: - return self.load_blocks(location, blocks, ranges, params) + return self.load_blocks(location, blocks, ranges, query) except Exception as exc: last_exc = exc import sys @@ -179,7 +179,7 @@ class ZipNumCluster(CDXSource): else: raise Exception('No Locations Found for: ' + block.part) - def load_blocks(self, location, blocks, ranges, params): + def load_blocks(self, location, blocks, ranges, query): if (logging.getLogger().getEffectiveLevel() <= logging.DEBUG): msg = 'Loading {b.count} blocks from {loc}:{b.offset}+{b.length}' @@ -195,9 +195,9 @@ class ZipNumCluster(CDXSource): iter_ = itertools.chain(*itertools.imap(decompress_block, ranges)) # start bound - iter_ = linearsearch(iter_, params['key']) + iter_ = linearsearch(iter_, query.key) # end bound - end = params['end_key'] + end = query.end_key iter_ = itertools.takewhile(lambda line: line < end, iter_) return iter_ diff --git a/pywb/handlers.py b/pywb/handlers.py index c82db7fe..10456380 100644 --- a/pywb/handlers.py +++ b/pywb/handlers.py @@ -4,7 +4,7 @@ import mimetypes import time from pywb.rewrite.wburl import WbUrl -from pywb.cdx.cdxserver import extract_params_from_wsgi_env +from pywb.cdx.cdxobject import CDXQuery from wbrequestresponse import WbResponse from wbexceptions import WbException, NotFoundException from views import TextCapturesView @@ -79,8 +79,8 @@ class CDXHandler(BaseHandler): self.view = view if view else TextCapturesView() def __call__(self, wbrequest): - params = 
extract_params_from_wsgi_env(wbrequest.env) - cdx_lines = self.index_reader.load_cdx(**params) + query = CDXQuery.from_wsgi_env(wbrequest.env) + cdx_lines = self.index_reader.load_cdx_query(query) return self.view.render_response(wbrequest, cdx_lines) diff --git a/pywb/indexreader.py b/pywb/indexreader.py index a422d0b4..ff17dfde 100644 --- a/pywb/indexreader.py +++ b/pywb/indexreader.py @@ -34,6 +34,9 @@ class IndexReader(object): return cdxlines + def load_cdx_query(self, query): + return self.cdx_server.load_cdx_query(query) + def load_cdx(self, **params): return self.cdx_server.load_cdx(**params) diff --git a/setup.py b/setup.py index 7f843161..c9ff86bd 100755 --- a/setup.py +++ b/setup.py @@ -26,7 +26,6 @@ setup( ('sample_archive/text_content/', glob.glob('sample_archive/text_content/*')), ], install_requires=[ - 'uwsgi', 'rfc3987', 'chardet', 'redis', @@ -36,7 +35,6 @@ setup( 'WebTest', 'pytest', 'werkzeug>=0.9.4', - 'setuptools', ], # tests_require=['WebTest', 'pytest'], zip_safe=False From 739d0a6f93f592a61e7cded77e514027dedaa4ee Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 1 Mar 2014 08:57:15 -0800 Subject: [PATCH 6/8] move CDXQuery to seperate file --- pywb/cdx/cdxdomainspecific.py | 2 +- pywb/cdx/cdxobject.py | 120 --------------------------------- pywb/cdx/cdxops.py | 3 +- pywb/cdx/cdxserver.py | 3 +- pywb/cdx/cdxsource.py | 1 + pywb/cdx/query.py | 122 ++++++++++++++++++++++++++++++++++ pywb/cdx/wsgi_cdxserver.py | 4 +- pywb/handlers.py | 2 +- 8 files changed, 130 insertions(+), 127 deletions(-) create mode 100644 pywb/cdx/query.py diff --git a/pywb/cdx/cdxdomainspecific.py b/pywb/cdx/cdxdomainspecific.py index 9890605e..2e8a3855 100644 --- a/pywb/cdx/cdxdomainspecific.py +++ b/pywb/cdx/cdxdomainspecific.py @@ -7,7 +7,7 @@ from pywb.utils.dsrules import BaseRule, RuleSet from pywb.utils.canonicalize import unsurt, UrlCanonicalizer -from cdxobject import CDXQuery +from query import CDXQuery 
#================================================================= def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered): diff --git a/pywb/cdx/cdxobject.py b/pywb/cdx/cdxobject.py index 682e8c74..3915f169 100644 --- a/pywb/cdx/cdxobject.py +++ b/pywb/cdx/cdxobject.py @@ -23,126 +23,6 @@ class AccessException(CDXException): return '403 Access Denied' -#================================================================= -class CDXQuery(object): - def __init__(self, **kwargs): - self.params = kwargs - - @property - def key(self): - return self.params['key'] - - @property - def end_key(self): - return self.params['end_key'] - - def set_key(self, key, end_key): - self.params['key'] = key - self.params['end_key'] = end_key - - @property - def url(self): - try: - return self.params['url'] - except KeyError: - msg = 'A url= param must be specified to query the cdx server' - raise CDXException(msg) - - @property - def match_type(self): - return self.params.get('matchType', 'exact') - - @property - def is_exact(self): - return self.match_type == 'exact' - - @property - def allow_fuzzy(self): - return self._get_bool('allowFuzzy') - - @property - def output(self): - return self.params.get('output', 'text') - - @property - def limit(self): - return int(self.params.get('limit', 100000)) - - @property - def collapse_time(self): - return self.params.get('collapseTime') - - @property - def resolve_revisits(self): - return self._get_bool('resolveRevisits') - - @property - def filters(self): - return self.params.get('filter', []) - - @property - def fields(self): - v = self.params.get('fields') - return v.split(',') if v else None - - @property - def closest(self): - # sort=closest is not required - return self.params.get('closest') - - @property - def reverse(self): - # sort=reverse overrides reverse=0 - return (self._get_bool('reverse') or - self.params.get('sort') == 'reverse') - - @property - def secondary_index_only(self): - return self._get_bool('showPagedIndex') - - 
@property - def process(self): - return self._get_bool('processOps', True) - - def set_process(self, process): - self.params['processOps'] = process - - def _get_bool(self, name, def_val=False): - v = self.params.get(name) - if v: - try: - v = int(v) - except ValueError as ex: - v = (v.lower() == 'true') - else: - v = def_val - - return bool(v) - - def urlencode(self): - return urlencode(self.params, True) - - @staticmethod - def from_wsgi_env(env): - """ utility function to extract params and create a CDXQuery - from a WSGI environment dictionary - """ - params = parse_qs(env['QUERY_STRING']) - - if not 'output' in params: - params['output'] = 'text' - - # parse_qs produces arrays for single values - # cdx processing expects singleton params for all params, - # except filters, so convert here - # use first value of the list - for name, val in params.iteritems(): - if name != 'filter': - params[name] = val[0] - - return CDXQuery(**params) - - #================================================================= class CDXObject(OrderedDict): CDX_FORMATS = [ diff --git a/pywb/cdx/cdxops.py b/pywb/cdx/cdxops.py index 72d69417..92809b07 100644 --- a/pywb/cdx/cdxops.py +++ b/pywb/cdx/cdxops.py @@ -1,4 +1,5 @@ -from cdxobject import CDXObject, IDXObject, AccessException, CDXQuery +from cdxobject import CDXObject, IDXObject, AccessException +from query import CDXQuery from pywb.utils.timeutils import timestamp_to_sec import bisect diff --git a/pywb/cdx/cdxserver.py b/pywb/cdx/cdxserver.py index c3874a93..8753545d 100644 --- a/pywb/cdx/cdxserver.py +++ b/pywb/cdx/cdxserver.py @@ -3,7 +3,8 @@ from pywb.utils.canonicalize import UrlCanonicalizer, calc_search_range from cdxops import cdx_load from cdxsource import CDXSource, CDXFile, RemoteCDXSource, RedisCDXSource from zipnum import ZipNumCluster -from cdxobject import CDXObject, CaptureNotFoundException, CDXException, CDXQuery +from cdxobject import CDXObject, CaptureNotFoundException, CDXException +from query import CDXQuery 
from cdxdomainspecific import load_domain_specific_cdx_rules from pywb.utils.loaders import is_http diff --git a/pywb/cdx/cdxsource.py b/pywb/cdx/cdxsource.py index c17312c0..119f2006 100644 --- a/pywb/cdx/cdxsource.py +++ b/pywb/cdx/cdxsource.py @@ -2,6 +2,7 @@ from pywb.utils.binsearch import iter_range from pywb.utils.loaders import SeekableTextFileReader from cdxobject import AccessException +from query import CDXQuery import urllib import urllib2 diff --git a/pywb/cdx/query.py b/pywb/cdx/query.py new file mode 100644 index 00000000..3ce2fc3d --- /dev/null +++ b/pywb/cdx/query.py @@ -0,0 +1,122 @@ +from urllib import urlencode +from urlparse import parse_qs + + +#================================================================= +class CDXQuery(object): + def __init__(self, **kwargs): + self.params = kwargs + + @property + def key(self): + return self.params['key'] + + @property + def end_key(self): + return self.params['end_key'] + + def set_key(self, key, end_key): + self.params['key'] = key + self.params['end_key'] = end_key + + @property + def url(self): + try: + return self.params['url'] + except KeyError: + msg = 'A url= param must be specified to query the cdx server' + raise CDXException(msg) + + @property + def match_type(self): + return self.params.get('matchType', 'exact') + + @property + def is_exact(self): + return self.match_type == 'exact' + + @property + def allow_fuzzy(self): + return self._get_bool('allowFuzzy') + + @property + def output(self): + return self.params.get('output', 'text') + + @property + def limit(self): + return int(self.params.get('limit', 100000)) + + @property + def collapse_time(self): + return self.params.get('collapseTime') + + @property + def resolve_revisits(self): + return self._get_bool('resolveRevisits') + + @property + def filters(self): + return self.params.get('filter', []) + + @property + def fields(self): + v = self.params.get('fields') + return v.split(',') if v else None + + @property + def closest(self): + # 
sort=closest is not required + return self.params.get('closest') + + @property + def reverse(self): + # sort=reverse overrides reverse=0 + return (self._get_bool('reverse') or + self.params.get('sort') == 'reverse') + + @property + def secondary_index_only(self): + return self._get_bool('showPagedIndex') + + @property + def process(self): + return self._get_bool('processOps', True) + + def set_process(self, process): + self.params['processOps'] = process + + def _get_bool(self, name, def_val=False): + v = self.params.get(name) + if v: + try: + v = int(v) + except ValueError as ex: + v = (v.lower() == 'true') + else: + v = def_val + + return bool(v) + + def urlencode(self): + return urlencode(self.params, True) + + @staticmethod + def from_wsgi_env(env): + """ utility function to extract params and create a CDXQuery + from a WSGI environment dictionary + """ + params = parse_qs(env['QUERY_STRING']) + + if not 'output' in params: + params['output'] = 'text' + + # parse_qs produces arrays for single values + # cdx processing expects singleton params for all params, + # except filters, so convert here + # use first value of the list + for name, val in params.iteritems(): + if name != 'filter': + params[name] = val[0] + + return CDXQuery(**params) diff --git a/pywb/cdx/wsgi_cdxserver.py b/pywb/cdx/wsgi_cdxserver.py index e6ab6067..c9fe11d7 100644 --- a/pywb/cdx/wsgi_cdxserver.py +++ b/pywb/cdx/wsgi_cdxserver.py @@ -1,15 +1,13 @@ from werkzeug.wrappers import BaseResponse from cdxserver import create_cdx_server from pywb import get_test_dir -from cdxobject import CDXQuery +from query import CDXQuery import logging import os import yaml import pkg_resources -import cdxops - #================================================================= CONFIG_FILE = 'config.yaml' diff --git a/pywb/handlers.py b/pywb/handlers.py index 10456380..43cb94f9 100644 --- a/pywb/handlers.py +++ b/pywb/handlers.py @@ -4,7 +4,7 @@ import mimetypes import time from pywb.rewrite.wburl import WbUrl 
-from pywb.cdx.cdxobject import CDXQuery +from pywb.cdx.query import CDXQuery from wbrequestresponse import WbResponse from wbexceptions import WbException, NotFoundException from views import TextCapturesView From 15d2cdd1b3c8e2e8e8ac7b83c94f02ee99f31b07 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 1 Mar 2014 16:35:27 -0800 Subject: [PATCH 7/8] cdx: cleanup regarding and more consistency for RemoteCDXServer RemoteCDXServer delegates filter/processing and simply proxies response from remote RemoteCDXSource (and default usage with CDXServer) only fetches the unfiltered/unprocessed stream and performs cdx ops locally --- pywb/cdx/cdxops.py | 2 +- pywb/cdx/cdxserver.py | 7 +++---- pywb/cdx/cdxsource.py | 9 ++++----- pywb/cdx/query.py | 13 +++++-------- pywb/cdx/test/cdxserver_test.py | 16 ++++++++++++++-- pywb/cdx/zipnum.py | 1 - pywb/handlers.py | 4 ++-- pywb/indexreader.py | 3 --- 8 files changed, 29 insertions(+), 26 deletions(-) diff --git a/pywb/cdx/cdxops.py b/pywb/cdx/cdxops.py index 92809b07..10be9bb1 100644 --- a/pywb/cdx/cdxops.py +++ b/pywb/cdx/cdxops.py @@ -25,7 +25,7 @@ def cdx_load(sources, query, perms_checker=None, process=True): cdx_iter = load_cdx_streams(sources, query) cdx_iter = make_obj_iter(cdx_iter, query) - if process and query.process: + if process and not query.secondary_index_only: cdx_iter = process_cdx(cdx_iter, query) if perms_checker: diff --git a/pywb/cdx/cdxserver.py b/pywb/cdx/cdxserver.py index 8753545d..54d46f4b 100644 --- a/pywb/cdx/cdxserver.py +++ b/pywb/cdx/cdxserver.py @@ -175,14 +175,13 @@ class RemoteCDXServer(BaseCDXServer): if isinstance(source, RemoteCDXSource): self.source = source - elif (isinstance(source, str) and - any(source.startswith(x) for x in ['http://', 'https://'])): - self.source = RemoteCDXSource(source) + elif (isinstance(source, str) and is_http(source)): + self.source = RemoteCDXSource(source, remote_processing=True) else: raise Exception('Invalid remote cdx source: ' + str(source)) def 
load_cdx_query(self, query): - remote_iter = cdx_load(self.sources, query, process=False) + remote_iter = cdx_load([self.source], query, process=False) return self._check_cdx_iter(remote_iter, query) def __str__(self): diff --git a/pywb/cdx/cdxsource.py b/pywb/cdx/cdxsource.py index 119f2006..0923fba9 100644 --- a/pywb/cdx/cdxsource.py +++ b/pywb/cdx/cdxsource.py @@ -41,19 +41,18 @@ class RemoteCDXSource(CDXSource): Only url and match type params are proxied at this time, the stream is passed through all other filters locally. """ - def __init__(self, filename, cookie=None, proxy_all=True): + def __init__(self, filename, cookie=None, remote_processing=False): self.remote_url = filename self.cookie = cookie - self.proxy_all = proxy_all + self.remote_processing = remote_processing def load_cdx(self, query): - if self.proxy_all: - query.set_process(False) + if self.remote_processing: remote_query = query else: # Only send url and matchType params to remote remote_query = CDXQuery(url=query.url, - match_type=query.matchType) + match_type=query.match_type) urlparams = remote_query.urlencode() diff --git a/pywb/cdx/query.py b/pywb/cdx/query.py index 3ce2fc3d..dc480836 100644 --- a/pywb/cdx/query.py +++ b/pywb/cdx/query.py @@ -79,13 +79,6 @@ class CDXQuery(object): def secondary_index_only(self): return self._get_bool('showPagedIndex') - @property - def process(self): - return self._get_bool('processOps', True) - - def set_process(self, process): - self.params['processOps'] = process - def _get_bool(self, name, def_val=False): v = self.params.get(name) if v: @@ -103,6 +96,10 @@ class CDXQuery(object): @staticmethod def from_wsgi_env(env): + return CDXQuery(**CDXQuery.extract_params_from_wsgi_env(env)) + + @staticmethod + def extract_params_from_wsgi_env(env): """ utility function to extract params and create a CDXQuery from a WSGI environment dictionary """ @@ -119,4 +116,4 @@ class CDXQuery(object): if name != 'filter': params[name] = val[0] - return CDXQuery(**params) + 
return params diff --git a/pywb/cdx/test/cdxserver_test.py b/pywb/cdx/test/cdxserver_test.py index f09af0fc..e261ead4 100644 --- a/pywb/cdx/test/cdxserver_test.py +++ b/pywb/cdx/test/cdxserver_test.py @@ -142,6 +142,8 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex ('filename', 'dupes.warc.gz')] # NOTE: external dependency -- need self-contained test TODO + +# Load remote query but filter locally >>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2') >>> pprint.pprint(x.next().items()) [('urlkey', 'com,example)/'), @@ -152,14 +154,24 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex ('digest', 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA'), ('length', '1792')] +# No local filtering/processing of cdx, simply return result from remote server +>>> x = RemoteCDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2') +>>> pprint.pprint(x.next().items()) +[('urlkey', 'com,example)/'), + ('timestamp', '20020120142510'), + ('original', 'http://example.com:80/'), + ('mimetype', 'text/html'), + ('statuscode', '200'), + ('digest', 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA'), + ('length', '1792')] ->>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'facebook.com', output = 'raw', limit = '2') +>>> x = RemoteCDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'facebook.com', output = 'raw', limit = '2') Traceback (most recent call last): AccessException: Blocked By Robots """ #================================================================= -from pywb.cdx.cdxserver import CDXServer +from pywb.cdx.cdxserver import CDXServer, RemoteCDXServer import os import sys import pprint diff --git a/pywb/cdx/zipnum.py b/pywb/cdx/zipnum.py index fbb1503f..1d0cb24f 100644 --- a/pywb/cdx/zipnum.py +++ b/pywb/cdx/zipnum.py @@ -121,7 +121,6 @@ class ZipNumCluster(CDXSource): prev_size=1) if 
query.secondary_index_only: - query.set_process(False) return idx_iter else: blocks = self.idx_to_cdx(idx_iter, query) diff --git a/pywb/handlers.py b/pywb/handlers.py index 43cb94f9..0d9500f4 100644 --- a/pywb/handlers.py +++ b/pywb/handlers.py @@ -79,8 +79,8 @@ class CDXHandler(BaseHandler): self.view = view if view else TextCapturesView() def __call__(self, wbrequest): - query = CDXQuery.from_wsgi_env(wbrequest.env) - cdx_lines = self.index_reader.load_cdx_query(query) + params = CDXQuery.extract_params_from_wsgi_env(wbrequest.env) + cdx_lines = self.index_reader.load_cdx(**params) return self.view.render_response(wbrequest, cdx_lines) diff --git a/pywb/indexreader.py b/pywb/indexreader.py index ff17dfde..a422d0b4 100644 --- a/pywb/indexreader.py +++ b/pywb/indexreader.py @@ -34,9 +34,6 @@ class IndexReader(object): return cdxlines - def load_cdx_query(self, query): - return self.cdx_server.load_cdx_query(query) - def load_cdx(self, **params): return self.cdx_server.load_cdx(**params) From 06a22c845bad2b4ebdfc5542d901c9544ae28f37 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 1 Mar 2014 18:40:16 -0800 Subject: [PATCH 8/8] ensure cdx loading happens lazily add perms test to ensure 'short-circuiting' in case of permission exception --- pywb/cdx/cdxops.py | 11 +++++++---- pywb/cdx/test/test_perms.py | 28 ++++++++++++++++++++++++++++ tests/test_integration.py | 4 +++- 3 files changed, 38 insertions(+), 5 deletions(-) create mode 100644 pywb/cdx/test/test_perms.py diff --git a/pywb/cdx/cdxops.py b/pywb/cdx/cdxops.py index 10be9bb1..c4f865c2 100644 --- a/pywb/cdx/cdxops.py +++ b/pywb/cdx/cdxops.py @@ -89,11 +89,13 @@ def process_cdx(cdx_iter, query): def load_cdx_streams(sources, query): # Optimize: no need to merge if just one input if len(sources) == 1: - return sources[0].load_cdx(query) + cdx_iter = sources[0].load_cdx(query) + else: + source_iters = map(lambda src: src.load_cdx(query), sources) + cdx_iter = merge(*(source_iters)) - source_iters = 
map(lambda src: src.load_cdx(query), sources) - merged_stream = merge(*(source_iters)) - return merged_stream + for cdx in cdx_iter: + yield cdx #================================================================= @@ -156,6 +158,7 @@ def cdx_filter(cdx_iter, filter_strings): if string.startswith('='): string = string[1:] self.compare_func = self.exact + # contains match elif string.startswith('~'): string = string[1:] self.compare_func = self.contains diff --git a/pywb/cdx/test/test_perms.py b/pywb/cdx/test/test_perms.py new file mode 100644 index 00000000..eb5a30ac --- /dev/null +++ b/pywb/cdx/test/test_perms.py @@ -0,0 +1,28 @@ +from pywb.cdx.cdxops import cdx_load +from pywb.cdx.perms import AllowAllPerms +from pywb.cdx.query import CDXQuery +from pywb.cdx.cdxobject import AccessException + +from pytest import raises + +class BlockAllPerms(AllowAllPerms): + def allow_url_lookup(self, urlkey, url): + return False + + +def test_exclusion_short_circuit(): + """ + # Verify that exclusion check 'short-circuits' further evaluation.. eg, a bad cdx source is not even loaded + # if exclusion check does not pass + """ + cdx_iter = cdx_load(['bogus ignored'], CDXQuery(url='example.com', key='com,example)/'), + perms_checker=BlockAllPerms(), process=True) + + # exception happens on first access attempt + with raises(AccessException): + cdx_iter.next() + + + + + diff --git a/tests/test_integration.py b/tests/test_integration.py index 4c815677..6e24ec6a 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -2,7 +2,8 @@ import webtest from pywb.pywb_init import pywb_config from pywb.wbapp import create_wb_app from pywb.cdx.cdxobject import CDXObject -from pywb.cdx.perms import AllowAllPerms + +from fixture import TestExclusionPerms class TestWb: TEST_CONFIG = 'test_config.yaml' @@ -208,3 +209,4 @@ class TestWb: resp = self.testapp.get('/pywb/?abc', status = 400) assert resp.status_int == 400 assert 'Invalid Url: http://?abc' in resp.body +