update pkg-reorg with changes from master, including

CDXQuery configuration
2025-03-24 06:59:52 +01:00 · 2014-03-02 00:26:29 -08:00 · 2014-03-02 00:26:29 -08:00 · 19f86305bf
commit 19f86305bf
parent c084b45298 06a22c845b
18 changed files with 746 additions and 309 deletions
--- a/pywb/cdx/cdxdomainspecific.py
+++ b/pywb/cdx/cdxdomainspecific.py
@ -1,12 +1,13 @@
 import yaml
 import re
 import logging
-import pkgutil
+import pkg_resources
 from pywb.utils.dsrules import BaseRule, RuleSet
 from pywb.utils.canonicalize import unsurt, UrlCanonicalizer
 from query import CDXQuery
 #=================================================================
 def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered):
@ -70,13 +71,13 @@ class FuzzyQuery:
    def __init__(self, rules):
        self.rules = rules
-    def __call__(self, params):
+    def __call__(self, query):
        matched_rule = None
-        urlkey = params['key']
+        urlkey = query.key
-        url = params['url']
+        url = query.url
-        filter_ = params.get('filter', [])
+        filter_ = query.filters
-        output = params.get('output')
+        output = query.output
        for rule in self.rules.iter_matching(urlkey):
            m = rule.regex.search(urlkey)
@ -102,7 +103,7 @@ class FuzzyQuery:
                  'filter': filter_,
                  'output': output}
-        return params
+        return CDXQuery(**params)
 #=================================================================
--- a/pywb/cdx/cdxobject.py
+++ b/pywb/cdx/cdxobject.py
@ -1,6 +1,9 @@
 from collections import OrderedDict
 import itertools
 from urllib import urlencode
 from urlparse import parse_qs
 #=================================================================
 class CDXException(Exception):
@ -71,12 +74,25 @@ class CDXObject(OrderedDict):
        # force regen on next __str__ call
        self.cdxline = None
    def is_revisit(self):
        return (self['mimetype'] == 'warc/revisit' or
                self['filename'] == '-')
    def to_text(self, fields=None):
        """
        return plaintext CDX record (includes newline).
        :param fields: list of field names to output.
        """
        if fields is None:
            return str(self) + '\n'
        else:
            return ' '.join(self[x] for x in fields) + '\n'
    def __str__(self):
        if self.cdxline:
            return self.cdxline
-        li = itertools.imap(lambda (n, val): val, self.items())
+        return ' '.join(val for n, val in self.iteritems())
        return ' '.join(li)
 #=================================================================
@ -106,5 +122,12 @@ class IDXObject(OrderedDict):
        self.idxline = idxline
    def to_text(self, fields=None):
        """
        return plaintext IDX record (including newline).
        :param fields: list of field names to output (currently ignored)
        """
        return str(self) + '\n'
    def __str__(self):
        return self.idxline
--- a/pywb/cdx/cdxops.py
+++ b/pywb/cdx/cdxops.py
@ -1,4 +1,5 @@
 from cdxobject import CDXObject, IDXObject, AccessException
 from query import CDXQuery
 from pywb.utils.timeutils import timestamp_to_sec
 import bisect
@ -10,32 +11,44 @@ from collections import deque
 #=================================================================
-def cdx_load(sources, params, perms_checker=None):
+def cdx_load(sources, query, perms_checker=None, process=True):
    """
    merge text CDX lines from sources, return an iterator for
    filtered and access-checked sequence of CDX objects.
    :param sources: iterable for text CDX sources.
    :param perms_checker: access check filter object implementing
      allow_url_lookup(key, url), allow_capture(cdxobj) and
      filter_fields(cdxobj) methods.
    :param process: bool, perform processing sorting/filtering/grouping ops
    """
    cdx_iter = load_cdx_streams(sources, query)
    cdx_iter = make_obj_iter(cdx_iter, query)
    if process and not query.secondary_index_only:
        cdx_iter = process_cdx(cdx_iter, query)
    if perms_checker:
-        cdx_iter = cdx_load_with_perms(sources, params, perms_checker)
+        cdx_iter = restrict_cdx(cdx_iter, query, perms_checker)
    else:
        cdx_iter = cdx_load_and_filter(sources, params)
    # output raw cdx objects
    if params.get('output') == 'raw':
        return cdx_iter
    def write_cdx(fields):
        for cdx in cdx_iter:
            yield cdx_text_out(cdx, fields) + '\n'
    return write_cdx(params.get('fields'))
    return cdx_iter
 #=================================================================
-def cdx_load_with_perms(sources, params, perms_checker):
+def restrict_cdx(cdx_iter, query, perms_checker):
-    if not perms_checker.allow_url_lookup(params['key'], params['url']):
+    """
-        if params.get('matchType', 'exact') == 'exact':
+    filter out those cdx records that user doesn't have access to,
    by consulting :param perms_checker:.
    :param cdx_iter: cdx record source iterable
    :param query: request parameters (CDXQuery)
    :param perms_checker: object implementing permission checker
    """
    if not perms_checker.allow_url_lookup(query.key, query.url):
        if query.is_exact:
            raise AccessException('Excluded')
    cdx_iter = cdx_load_and_filter(sources, params)
    for cdx in cdx_iter:
        # TODO: we could let filter_fields handle this case by accepting
        # None as a return value.
        if not perms_checker.allow_capture(cdx):
            continue
@ -43,45 +56,27 @@ def cdx_load_with_perms(sources, params, perms_checker):
        yield cdx
 #=================================================================
-def cdx_text_out(cdx, fields):
+def process_cdx(cdx_iter, query):
-    if not fields:
+    if query.resolve_revisits:
        return str(cdx)
    else:
        return ' '.join(map(lambda x: cdx[x], fields.split(',')))
 #=================================================================
 def cdx_load_and_filter(sources, params):
    cdx_iter = load_cdx_streams(sources, params)
    cdx_iter = make_obj_iter(cdx_iter, params)
    if params.get('proxyAll'):
        return cdx_iter
    resolve_revisits = params.get('resolveRevisits', False)
    if resolve_revisits:
        cdx_iter = cdx_resolve_revisits(cdx_iter)
-    filters = params.get('filter', None)
+    filters = query.filters
    if filters:
        cdx_iter = cdx_filter(cdx_iter, filters)
-    collapse_time = params.get('collapseTime', None)
+    collapse_time = query.collapse_time
    if collapse_time:
        cdx_iter = cdx_collapse_time_status(cdx_iter, collapse_time)
-    limit = int(params.get('limit', 1000000))
+    limit = query.limit
-    reverse = params.get('reverse', False) or params.get('sort') == 'reverse'
+    if query.reverse:
    if reverse:
        cdx_iter = cdx_reverse(cdx_iter, limit)
-    closest_to = params.get('closest', None)
+    closest = query.closest
-    if closest_to:
+    if closest:
-        cdx_iter = cdx_sort_closest(closest_to, cdx_iter, limit)
+        cdx_iter = cdx_sort_closest(closest, cdx_iter, limit)
    if limit:
        cdx_iter = cdx_limit(cdx_iter, limit)
@ -91,26 +86,28 @@ def cdx_load_and_filter(sources, params):
 #=================================================================
 # load and source merge cdx streams
-def load_cdx_streams(sources, params):
+def load_cdx_streams(sources, query):
    # Optimize: no need to merge if just one input
    if len(sources) == 1:
-        return sources[0].load_cdx(params)
+        cdx_iter = sources[0].load_cdx(query)
    else:
        source_iters = map(lambda src: src.load_cdx(query), sources)
        cdx_iter = merge(*(source_iters))
-    source_iters = map(lambda src: src.load_cdx(params), sources)
+    for cdx in cdx_iter:
-    merged_stream = merge(*(source_iters))
+        yield cdx
    return merged_stream
 #=================================================================
 # convert text cdx stream to CDXObject/IDXObject
-def make_obj_iter(text_iter, params):
+def make_obj_iter(text_iter, query):
    # already converted
-    if params.get('showPagedIndex'):
+    if query.secondary_index_only:
        cls = IDXObject
    else:
        cls = CDXObject
-    return itertools.imap(lambda line: cls(line), text_iter)
+    return (cls(line) for line in text_iter)
 #=================================================================
@ -161,6 +158,7 @@ def cdx_filter(cdx_iter, filter_strings):
            if string.startswith('='):
                string = string[1:]
                self.compare_func = self.exact
            # contains match
            elif string.startswith('~'):
                string = string[1:]
                self.compare_func = self.contains
@ -257,8 +255,8 @@ def cdx_resolve_revisits(cdx_iter):
    originals = {}
    for cdx in cdx_iter:
-        is_revisit = ((cdx['mimetype'] == 'warc/revisit') or
+        
-                      (cdx['filename'] == '-'))
+        is_revisit = cdx.is_revisit()
        digest = cdx['digest']
--- a/pywb/cdx/cdxserver.py
+++ b/pywb/cdx/cdxserver.py
@ -4,6 +4,7 @@ from cdxops import cdx_load
 from cdxsource import CDXSource, CDXFile, RemoteCDXSource, RedisCDXSource
 from zipnum import ZipNumCluster
 from cdxobject import CDXObject, CaptureNotFoundException, CDXException
 from query import CDXQuery
 from cdxdomainspecific import load_domain_specific_cdx_rules
 from pywb.utils.loaders import is_http
@ -36,7 +37,7 @@ class BaseCDXServer(object):
        # set perms checker, if any
        self.perms_checker = kwargs.get('perms_checker')
-    def _check_cdx_iter(self, cdx_iter, params):
+    def _check_cdx_iter(self, cdx_iter, query):
        """ Check cdx iter semantics
        If iter is empty (no matches), check if fuzzy matching
        is allowed, and try it -- otherwise,
@ -48,21 +49,23 @@ class BaseCDXServer(object):
        if cdx_iter:
            return cdx_iter
        url = params['url']
        # check if fuzzy is allowed and ensure that its an
        # exact match
-        if (self.fuzzy_query and params.get('allowFuzzy') and
+        if (self.fuzzy_query and
-            params.get('matchType', 'exact') == 'exact'):
+            query.allow_fuzzy and
            query.is_exact):
-            fuzzy_params = self.fuzzy_query(params)
+            fuzzy_query_params = self.fuzzy_query(query)
-            if fuzzy_params:
+            if fuzzy_query_params:
-                return self.load_cdx(**fuzzy_params)
+                return self.load_cdx_query(fuzzy_query_params)
-        msg = 'No Captures found for: ' + url
+        msg = 'No Captures found for: ' + query.url
        raise CaptureNotFoundException(msg)
    def load_cdx(self, **params):
        return self.load_cdx_query(CDXQuery(**params))
    def load_cdx_query(self, query):
        raise NotImplementedError('Implement in subclass')
    @staticmethod
@ -84,28 +87,77 @@ class CDXServer(BaseCDXServer):
    def __init__(self, paths, **kwargs):
        super(CDXServer, self).__init__(**kwargs)
-        self.sources = create_cdx_sources(paths, kwargs.get('config'))
+        # TODO: we could save config in member, so that other
        # methods can use it. it's bad for add_cdx_source to take
        # config argument.
        self._create_cdx_sources(paths, kwargs.get('config'))
-    def load_cdx(self, **params):
+    def load_cdx_query(self, query):
-        # if key not set, assume 'url' is set and needs canonicalization
+        url = query.url
-        if not params.get('key'):
+        key, end_key = calc_search_range(url=url,
-            try:
+                                         match_type=query.match_type,
-                url = params['url']
+                                         url_canon=self.url_canon)
-            except KeyError:
+        query.set_key(key, end_key)
                msg = 'A url= param must be specified to query the cdx server'
                raise CDXException(msg)
-            match_type = params.get('matchType', 'exact')
+        cdx_iter = cdx_load(self.sources,
                            query,
                            perms_checker=self.perms_checker)
-            key, end_key = calc_search_range(url=url,
+        return self._check_cdx_iter(cdx_iter, query)
                                             match_type=match_type,
                                             url_canon=self.url_canon)
            params['key'] = key
            params['end_key'] = end_key
-        cdx_iter = cdx_load(self.sources, params, self.perms_checker)
+    def _create_cdx_sources(self, paths, config):
        """
        build CDXSource instances for each of path in :param paths:.
        :param paths: list of sources or single source.
        each source may be either string or CDXSource instance. value
        of any other types will be silently ignored.
        :param config: config object passed to :method:`add_cdx_source`.
        """
        self.sources = []
-        return self._check_cdx_iter(cdx_iter, params)
+        if paths is not None:
            if not isinstance(paths, (list, tuple)):
                paths = [paths]
            for path in paths:
                self.add_cdx_source(path, config)
        if len(self.sources) == 0:
            logging.warn('No CDX Sources configured from paths=%s', paths)
    def _add_cdx_source(self, source):
        if source is None: return
        logging.debug('Adding CDX Source: %s', source)
        self.sources.append(source)
    def add_cdx_source(self, source, config):
        if source is None: return
        if isinstance(source, CDXSource):
            self._add_cdx_source(source)
        elif isinstance(source, str):
            if os.path.isdir(source):
                for fn in os.listdir(source):
                    self._add_cdx_source(self._create_cdx_source(
                            os.path.join(source, fn), config))
            else:
                self._add_cdx_source(self._create_cdx_source(
                        source, config))
    def _create_cdx_source(self, filename, config):
        if is_http(filename):
            return RemoteCDXSource(filename)
        if filename.startswith('redis://'):
            return RedisCDXSource(filename, config)
        if filename.endswith('.cdx'):
            return CDXFile(filename)
        if filename.endswith(('.summary', '.idx')):
            return ZipNumCluster(filename, config)
        logging.warn('skipping unrecognized URI:%s', filename)
        return None
    def __str__(self):
        return 'CDX server serving from ' + str(self.sources)
@ -123,20 +175,14 @@ class RemoteCDXServer(BaseCDXServer):
        if isinstance(source, RemoteCDXSource):
            self.source = source
-        elif (isinstance(source, str) and
+        elif (isinstance(source, str) and is_http(source)):
-              any(source.startswith(x) for x in ['http://', 'https://'])):
+            self.source = RemoteCDXSource(source, remote_processing=True)
            self.source = RemoteCDXSource(source)
        else:
            raise Exception('Invalid remote cdx source: ' + str(source))
-    def load_cdx(self, **params):
+    def load_cdx_query(self, query):
-        remote_iter = self.source.load_cdx(params)
+        remote_iter = cdx_load([self.source], query, process=False)
-
+        return self._check_cdx_iter(remote_iter, query)
        # if need raw, convert to raw format here
        if params.get('output') == 'raw':
            remote_iter = (CDXObject(cdx) for cdx in remote_iter)
        return self._check_cdx_iter(remote_iter, params)
    def __str__(self):
        return 'Remote CDX server serving from ' + str(self.sources[0])
@ -169,74 +215,3 @@ def create_cdx_server(config, ds_rules_file=None):
                      perms_checker=perms_checker)
 #=================================================================
 def create_cdx_sources(paths, config=None):
    sources = []
    if not isinstance(paths, list):
        paths = [paths]
    for path in paths:
        if isinstance(path, CDXSource):
            add_cdx_source(sources, path, config)
        elif isinstance(path, str):
            if os.path.isdir(path):
                for file in os.listdir(path):
                    add_cdx_source(sources, path + file, config)
            else:
                add_cdx_source(sources, path, config)
    if len(sources) == 0:
        logging.exception('No CDX Sources Found from: ' + str(sources))
    return sources
 #=================================================================
 def add_cdx_source(sources, source, config):
    if not isinstance(source, CDXSource):
        source = create_cdx_source(source, config)
        if not source:
            return
    logging.debug('Adding CDX Source: ' + str(source))
    sources.append(source)
 #=================================================================
 def create_cdx_source(filename, config):
    if is_http(filename):
        return RemoteCDXSource(filename)
    if filename.startswith('redis://'):
        return RedisCDXSource(filename, config)
    if filename.endswith('.cdx'):
        return CDXFile(filename)
    if filename.endswith(('.summary', '.idx')):
        return ZipNumCluster(filename, config)
    return None
 #=================================================================
 def extract_params_from_wsgi_env(env):
    """ utility function to extract params from the query
    string of a WSGI environment dictionary
    """
    # use url= param to get actual url
    params = urlparse.parse_qs(env['QUERY_STRING'])
    if not 'output' in params:
        params['output'] = 'text'
    # parse_qs produces arrays for single values
    # cdx processing expects singleton params for all params,
    # except filters, so convert here
    # use first value of the list
    for name, val in params.iteritems():
        if name != 'filter':
            params[name] = val[0]
    return params
--- a/pywb/cdx/cdxsource.py
+++ b/pywb/cdx/cdxsource.py
@ -2,6 +2,7 @@ from pywb.utils.binsearch import iter_range
 from pywb.utils.loaders import SeekableTextFileReader
 from cdxobject import AccessException
 from query import CDXQuery
 import urllib
 import urllib2
@ -12,7 +13,7 @@ class CDXSource(object):
    """
    Represents any cdx index source
    """
-    def load_cdx(self, params):
+    def load_cdx(self, query):
        raise NotImplementedError('Implement in subclass')
@ -24,9 +25,9 @@ class CDXFile(CDXSource):
    def __init__(self, filename):
        self.filename = filename
-    def load_cdx(self, params):
+    def load_cdx(self, query):
        source = SeekableTextFileReader(self.filename)
-        return iter_range(source, params.get('key'), params.get('end_key'))
+        return iter_range(source, query.key, query.end_key)
    def __str__(self):
        return 'CDX File - ' + self.filename
@ -40,25 +41,20 @@ class RemoteCDXSource(CDXSource):
    Only url and match type params are proxied at this time,
    the stream is passed through all other filters locally.
    """
-    def __init__(self, filename, cookie=None, proxy_all=True):
+    def __init__(self, filename, cookie=None, remote_processing=False):
        self.remote_url = filename
        self.cookie = cookie
-        self.proxy_all = proxy_all
+        self.remote_processing = remote_processing
-    def load_cdx(self, proxy_params):
+    def load_cdx(self, query):
-        if self.proxy_all:
+        if self.remote_processing:
-            params = proxy_params
+            remote_query = query
            params['proxyAll'] = True
        else:
            # Only send url and matchType params to remote
-            params = {}
+            remote_query = CDXQuery(url=query.url,
-            params['url'] = proxy_params['url']
+                                    match_type=query.match_type)
            match_type = proxy_params.get('matchType')
-            if match_type:
+        urlparams = remote_query.urlencode()
                proxy_params['matchType'] = match_type
        urlparams = urllib.urlencode(params, True)
        try:
            request = urllib2.Request(self.remote_url, urlparams)
@ -97,14 +93,14 @@ class RedisCDXSource(CDXSource):
            self.key_prefix = config.get('redis_key_prefix', self.key_prefix)
-    def load_cdx(self, params):
+    def load_cdx(self, query):
        """
        Load cdx from redis cache, from an ordered list
        Currently, there is no support for range queries
        Only 'exact' matchType is supported
        """
-        key = params['key']
+        key = query.key
        # ensure only url/surt is part of key
        key = key.split(' ')[0]
--- a/pywb/cdx/query.py
+++ b/pywb/cdx/query.py
@ -0,0 +1,119 @@
 from urllib import urlencode
 from urlparse import parse_qs
 #=================================================================
 class CDXQuery(object):
    """
    Wrapper around the raw cdx request parameters.
    The keyword arguments are stored unchanged in ``self.params``; the
    properties below read individual entries from that dict and apply
    defaults / type conversion.
    """
    def __init__(self, **kwargs):
        self.params = kwargs
    @property
    def key(self):
        """Value of the 'key' param (KeyError if it has not been set)."""
        return self.params['key']
    @property
    def end_key(self):
        """Value of the 'end_key' param (KeyError if it has not been set)."""
        return self.params['end_key']
    def set_key(self, key, end_key):
        """Store the 'key' and 'end_key' params."""
        self.params['key'] = key
        self.params['end_key'] = end_key
    @property
    def url(self):
        """
        Value of the 'url' param.
        :raises CDXException: if no 'url' param was supplied.
        """
        try:
            return self.params['url']
        except KeyError:
            # CDXException is not imported at the top of this module, so
            # referencing it here used to fail with a NameError instead of
            # raising the intended exception. Import it where it is needed.
            from cdxobject import CDXException
            msg = 'A url= param must be specified to query the cdx server'
            raise CDXException(msg)
    @property
    def match_type(self):
        """Value of the 'matchType' param, 'exact' when absent."""
        return self.params.get('matchType', 'exact')
    @property
    def is_exact(self):
        """True when the match type is 'exact'."""
        return self.match_type == 'exact'
    @property
    def allow_fuzzy(self):
        """Boolean value of the 'allowFuzzy' param."""
        return self._get_bool('allowFuzzy')
    @property
    def output(self):
        """Value of the 'output' param, 'text' when absent."""
        return self.params.get('output', 'text')
    @property
    def limit(self):
        """Value of the 'limit' param as an int, 100000 when absent."""
        return int(self.params.get('limit', 100000))
    @property
    def collapse_time(self):
        """Value of the 'collapseTime' param, None when absent."""
        return self.params.get('collapseTime')
    @property
    def resolve_revisits(self):
        """Boolean value of the 'resolveRevisits' param."""
        return self._get_bool('resolveRevisits')
    @property
    def filters(self):
        """Value of the 'filter' param, an empty list when absent."""
        return self.params.get('filter', [])
    @property
    def fields(self):
        """
        The comma separated 'fields' param split into a list,
        or None when the param is absent or empty.
        """
        v = self.params.get('fields')
        return v.split(',') if v else None
    @property
    def closest(self):
        """Value of the 'closest' param, None when absent."""
        # sort=closest is not required
        return self.params.get('closest')
    @property
    def reverse(self):
        """True if 'reverse' is set to a true value or sort == 'reverse'."""
        # sort=reverse overrides reverse=0
        return (self._get_bool('reverse') or
                self.params.get('sort') == 'reverse')
    @property
    def secondary_index_only(self):
        """Boolean value of the 'showPagedIndex' param."""
        return self._get_bool('showPagedIndex')
    def _get_bool(self, name, def_val=False):
        """
        Interpret param :param name: as a boolean.
        A missing or empty value yields :param def_val:. Otherwise the
        value is converted with int(); if that fails with ValueError,
        only the string 'true' (case-insensitive) counts as True.
        """
        v = self.params.get(name)
        if v:
            try:
                v = int(v)
            except ValueError:
                v = (v.lower() == 'true')
        else:
            v = def_val
        return bool(v)
    def urlencode(self):
        """Return all params encoded as a url query string."""
        return urlencode(self.params, True)
    @staticmethod
    def from_wsgi_env(env):
        """Create a CDXQuery from the query string of a WSGI environ."""
        return CDXQuery(**CDXQuery.extract_params_from_wsgi_env(env))
    @staticmethod
    def extract_params_from_wsgi_env(env):
        """ utility function to extract params from the query string
        of a WSGI environment dictionary
        :param env: WSGI environ dict, env['QUERY_STRING'] is parsed.
        :return: dict of params; 'filter' stays a list, every other
        param is reduced to its first value. 'output' defaults to 'text'.
        """
        parsed = parse_qs(env['QUERY_STRING'])
        # parse_qs produces arrays for single values
        # cdx processing expects singleton params for all params,
        # except filters, so convert here
        # use first value of the list
        # (a new dict is built instead of reassigning entries of the dict
        # being iterated over)
        params = {}
        for name, val in parsed.items():
            params[name] = val if name == 'filter' else val[0]
        # apply the default only after flattening: when it was inserted
        # first, the plain string 'text' was itself reduced to 't'
        params.setdefault('output', 'text')
        return params
--- a/pywb/cdx/test/cdxserver_test.py
+++ b/pywb/cdx/test/cdxserver_test.py
@ -142,6 +142,8 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex
 ('filename', 'dupes.warc.gz')]
 # NOTE: external dependency -- need self-contained test TODO
 # Load remote query but filter locally
 >>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2')
 >>> pprint.pprint(x.next().items())
 [('urlkey', 'com,example)/'),
@ -152,14 +154,24 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex
 ('digest', 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA'),
 ('length', '1792')]
 # No local filtering/processing of cdx, simply return result from remote server
 >>> x = RemoteCDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2')
 >>> pprint.pprint(x.next().items())
 [('urlkey', 'com,example)/'),
 ('timestamp', '20020120142510'),
 ('original', 'http://example.com:80/'),
 ('mimetype', 'text/html'),
 ('statuscode', '200'),
 ('digest', 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA'),
 ('length', '1792')]
->>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'facebook.com', output = 'raw', limit = '2')
+>>> x = RemoteCDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'facebook.com', output = 'raw', limit = '2')
 Traceback (most recent call last):
 AccessException: Blocked By Robots
 """
 #=================================================================
-from pywb.cdx.cdxserver import CDXServer
+from pywb.cdx.cdxserver import CDXServer, RemoteCDXServer
 import os
 import sys
 import pprint
@ -167,22 +179,42 @@ import pprint
 from pywb import get_test_dir
 #test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample_data/'
 test_cdx_dir = get_test_dir() + 'cdx/'
 from pywb.cdx.cdxobject import AccessException
 from tests.fixture import testconfig, TestExclusionPerms
 import pytest
 def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams):
    kwparams['url'] = url
-    kwparams['output'] = 'text'
+    fields = kwparams.get('fields')
    if fields:
        fields = fields.split(',')
    server = CDXServer(sources)
    results = server.load_cdx(**kwparams)
    for x in results:
-        x = x.replace('\t', '    ')
+        l = x.to_text(fields).replace('\t', '    ')
-        sys.stdout.write(x)
+        sys.stdout.write(l)
 #================================================================
 def test_excluded(testconfig):
    testconfig['perms_checker'] = TestExclusionPerms()
    sources = testconfig.get('index_paths')
    print sources
    server = CDXServer(sources, perms_checker=testconfig['perms_checker'])
    assert isinstance(server, CDXServer)
    assert server.perms_checker
    url = 'http://www.iana.org/_img/bookmark_icon.ico'
    key = 'org,iana)/_img/bookmark_icon.ico'
    with pytest.raises(AccessException):
        cdxobjs = list(server.load_cdx(url=url))
        print cdxobjs
 if __name__ == "__main__":
    import doctest
    doctest.testmod()
--- a/pywb/cdx/test/test_perms.py
+++ b/pywb/cdx/test/test_perms.py
@ -0,0 +1,28 @@
 from pywb.cdx.cdxops import cdx_load
 from pywb.cdx.perms import AllowAllPerms
 from pywb.cdx.query import CDXQuery
 from pywb.cdx.cdxobject import AccessException
 from pytest import raises
 class BlockAllPerms(AllowAllPerms):
    """Test permission checker whose url lookup check always fails."""
    def allow_url_lookup(self, urlkey, url):
        """Deny the lookup regardless of :param urlkey: and :param url:."""
        return False
 def test_exclusion_short_circuit():
    """
    Verify that the exclusion check 'short-circuits' further evaluation,
    eg. a bad cdx source is not even loaded if the exclusion check
    does not pass.
    """
    blocked_query = CDXQuery(url='example.com', key='com,example)/')
    bogus_sources = ['bogus ignored']
    results = cdx_load(bogus_sources, blocked_query,
                       perms_checker=BlockAllPerms(), process=True)
    # the exception only surfaces on the first access attempt
    with raises(AccessException):
        results.next()
--- a/pywb/cdx/test/wsgi_cdxserver_test.py
+++ b/pywb/cdx/test/wsgi_cdxserver_test.py
@ -1,10 +1,10 @@
 import webtest
-from pywb.cdx.wsgi_cdxserver import main
+from pywb.cdx.wsgi_cdxserver import create_app
 from pywb import get_test_dir
 class TestCdx:
    def setup(self):
-        self.app = main(get_test_dir() + 'cdx/')
+        self.app = create_app(get_test_dir() + 'cdx/')
        self.testapp = webtest.TestApp(self.app)
    def test_cdx(self):
--- a/pywb/cdx/wsgi_cdxserver.py
+++ b/pywb/cdx/wsgi_cdxserver.py
@ -1,10 +1,12 @@
-from cdxserver import create_cdx_server, extract_params_from_wsgi_env
+from werkzeug.wrappers import BaseResponse
 from cdxserver import create_cdx_server
 from pywb import get_test_dir
 from query import CDXQuery
 import logging
 import os
 import yaml
-import pkgutil
+import pkg_resources
 #=================================================================
 CONFIG_FILE = 'config.yaml'
@ -13,65 +15,89 @@ RULES_FILE = 'rules.yaml'
 DEFAULT_PORT = 8080
 config = None
 if __package__:
    try:
        config = pkgutil.get_data(__package__, CONFIG_FILE)
        config = yaml.load(config)
    except:
        pass
 #=================================================================
-def main(paths=None):
+
 class CDXQueryRequest(object):
    def __init__(self, environ):
        self.query = CDXQuery.from_wsgi_env(environ)
 class WSGICDXServer(object):
    def __init__(self, config, rules_file):
        self.cdxserver = create_cdx_server(config, rules_file)
    def __call__(self, environ, start_response):
        request = CDXQueryRequest(environ)
        try:
            logging.debug('request.args=%s', request.query)
            result = self.cdxserver.load_cdx_query(request.query)
            # TODO: select response type by "output" parameter
            response = PlainTextResponse(result, request.query.fields)
            return response(environ, start_response)
        except Exception as exc:
            logging.error('load_cdx failed', exc_info=1)
            # TODO: error response should be different for each response
            # type
            start_response('400 Error', [('Content-Type', 'text/plain')])
            return [str(exc)]
 def cdx_text_out(cdx, fields):
    if not fields:
        return str(cdx) + '\n'
    else:
        logging.info('cdx fields=%s', cdx.keys)
        # TODO: this will results in an exception if fields contain
        # non-existent field name.
        return ' '.join(cdx[x] for x in fields) + '\n'
 class PlainTextResponse(BaseResponse):
    def __init__(self, cdxitr, fields, status=200, content_type='text/plain'):
        super(PlainTextResponse, self).__init__(
            response=(
                cdx.to_text(fields) for cdx in cdxitr
                ),
            status=status, content_type=content_type)
 # class JsonResponse(Response):
 #     pass
 # class MementoResponse(Response):
 #     pass
 def create_app(config=None):
    logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s',
                        level=logging.DEBUG)
-    if not paths:
+    if not config:
-        if config:
+        index_paths = get_test_dir() + 'cdx/'
-            paths = config
+        config = dict(index_paths=index_paths)
        else:
            paths = get_test_dir() + 'cdx/'
    cdxserver = create_cdx_server(paths, RULES_FILE)
    def application(env, start_response):
        try:
            params = extract_params_from_wsgi_env(env)
            response = cdxserver.load_cdx(**params)
            start_response('200 OK', [('Content-Type', 'text/plain')])
        except Exception as exc:
            import traceback
            err_details = traceback.format_exc(exc)
            start_response('400 Error', [('Content-Type', 'text/plain')])
            response = [str(exc)]
            print err_details
        return response
    return application
    return WSGICDXServer(config, RULES_FILE)
 if __name__ == "__main__":
-    from wsgiref.simple_server import make_server
+    from optparse import OptionParser
    from werkzeug.serving import run_simple
-    app = main()
+    opt = OptionParser('%prog [OPTIONS]')
    opt.add_option('-p', '--port', type='int', default=None)
-    port = DEFAULT_PORT
+    options, args = opt.parse_args()
    if config:
        port = config.get('port', DEFAULT_PORT)
-    httpd = make_server('', port, app)
+    configdata = pkg_resources.resource_string(__name__, CONFIG_FILE)
    config = yaml.load(configdata)
-    logging.debug('Starting CDX Server on port ' + str(port))
+    port = options.port
    if port is None:
        port = (config and config.get('port')) or DEFAULT_PORT
    app = create_app(config)
    logging.debug('Starting CDX Server on port %s', port)
    try:
-        httpd.serve_forever()
+        run_simple('0.0.0.0', port, app, use_reloader=True, use_debugger=True)
-    except KeyboardInterrupt:
+    except KeyboardInterrupt as ex:
        pass
    logging.debug('Stopping CDX Server')
 else:
-    application = main()
+    # XXX pass production config
    application = create_app()
--- a/pywb/cdx/zipnum.py
+++ b/pywb/cdx/zipnum.py
@ -110,21 +110,20 @@ class ZipNumCluster(CDXSource):
    def lookup_loc(self, part):
        return self.loc_map[part]
-    def load_cdx(self, params):
+    def load_cdx(self, query):
        self.reload_loc()
        reader = SeekableTextFileReader(self.summary)
        idx_iter = iter_range(reader,
-                              params['key'],
+                              query.key,
-                              params['end_key'],
+                              query.end_key,
                              prev_size=1)
-        if params.get('showPagedIndex'):
+        if query.secondary_index_only:
            params['proxyAll'] = True
            return idx_iter
        else:
-            blocks = self.idx_to_cdx(idx_iter, params)
+            blocks = self.idx_to_cdx(idx_iter, query)
            def gen_cdx():
                for blk in blocks:
@ -133,7 +132,7 @@ class ZipNumCluster(CDXSource):
            return gen_cdx()
-    def idx_to_cdx(self, idx_iter, params):
+    def idx_to_cdx(self, idx_iter, query):
        blocks = None
        ranges = []
@ -150,7 +149,7 @@ class ZipNumCluster(CDXSource):
            else:
                if blocks:
-                    yield self.block_to_cdx_iter(blocks, ranges, params)
+                    yield self.block_to_cdx_iter(blocks, ranges, query)
                blocks = ZipBlocks(idx['part'],
                                   idx['offset'],
@ -160,15 +159,15 @@ class ZipNumCluster(CDXSource):
                ranges = [blocks.length]
        if blocks:
-            yield self.block_to_cdx_iter(blocks, ranges, params)
+            yield self.block_to_cdx_iter(blocks, ranges, query)
-    def block_to_cdx_iter(self, blocks, ranges, params):
+    def block_to_cdx_iter(self, blocks, ranges, query):
        last_exc = None
        last_traceback = None
        for location in self.lookup_loc(blocks.part):
            try:
-                return self.load_blocks(location, blocks, ranges, params)
+                return self.load_blocks(location, blocks, ranges, query)
            except Exception as exc:
                last_exc = exc
                import sys
@ -179,7 +178,7 @@ class ZipNumCluster(CDXSource):
        else:
            raise Exception('No Locations Found for: ' + block.part)
-    def load_blocks(self, location, blocks, ranges, params):
+    def load_blocks(self, location, blocks, ranges, query):
        if (logging.getLogger().getEffectiveLevel() <= logging.DEBUG):
            msg = 'Loading {b.count} blocks from {loc}:{b.offset}+{b.length}'
@ -195,9 +194,9 @@ class ZipNumCluster(CDXSource):
        iter_ = itertools.chain(*itertools.imap(decompress_block, ranges))
        # start bound
-        iter_ = linearsearch(iter_, params['key'])
+        iter_ = linearsearch(iter_, query.key)
        # end bound
-        end = params['end_key']
+        end = query.end_key
        iter_ = itertools.takewhile(lambda line: line < end, iter_)
        return iter_
--- a/pywb/core/handlers.py
+++ b/pywb/core/handlers.py
@ -4,7 +4,7 @@ import mimetypes
 import time
 from pywb.rewrite.wburl import WbUrl
-from pywb.cdx.cdxserver import extract_params_from_wsgi_env
+from pywb.cdx.query import CDXQuery
 from wbrequestresponse import WbResponse
 from wbexceptions import WbException, NotFoundException
 from views import TextCapturesView
@ -82,7 +82,7 @@ class CDXHandler(BaseHandler):
        self.view = view if view else TextCapturesView()
    def __call__(self, wbrequest):
-        params = extract_params_from_wsgi_env(wbrequest.env)
+        params = CDXQuery.extract_params_from_wsgi_env(wbrequest.env)
        cdx_lines = self.index_reader.load_cdx(**params)
        return self.view.render_response(wbrequest, cdx_lines)
--- a/pywb/core/indexreader.py
+++ b/pywb/core/indexreader.py
@ -30,7 +30,7 @@ class IndexReader(object):
        params['allowFuzzy'] = True
-        cdxlines = self.load_cdx(url=wburl.url, output='raw', **params)
+        cdxlines = self.load_cdx(url=wburl.url, **params)
        return cdxlines
--- a/setup.py
+++ b/setup.py
@ -1,24 +1,48 @@
 #!/usr/bin/env python
 # vim: set sw=4 et:
-import setuptools
+from setuptools import setup, find_packages
 import glob
-setuptools.setup(name='pywb',
+setup(
-        version='0.2',
+    name='pywb',
-        url='https://github.com/ikreymer/pywb',
+    version='0.2',
-        author='Ilya Kreymer',
+    url='https://github.com/ikreymer/pywb',
-        author_email='ilya@archive.org',
+    author='Ilya Kreymer',
-        long_description=open('README.md').read(),
+    author_email='ilya@archive.org',
-        license='GPL',
+    long_description=open('README.md').read(),
-        packages=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite','pywb.core','pywb.dispatch','pywb.bootstrap'],
+    license='GPL',
-        provides=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite','pywb.core','pywb.dispatch','pywb.bootstrap'],
+    packages=find_packages(),
-        package_data={'pywb': ['ui/*', 'static/*', '*.yaml']},
+    provides=[
-        data_files = [('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')),
+        'pywb',
-                      ('sample_archive/zipcdx/', glob.glob('sample_archive/zipcdx/*')),
+        'pywb.utils',
-                      ('sample_archive/warcs/', glob.glob('sample_archive/warcs/*')),
+        'pywb.cdx',
-                      ('sample_archive/text_content/', glob.glob('sample_archive/text_content/*'))],
+        'pywb.warc',
-        install_requires=['uwsgi', 'rfc3987', 'chardet', 'redis', 'jinja2', 'surt', 'pyyaml', 'WebTest','pytest'],
+        'pywb.rewrite',
-#        tests_require=['WebTest', 'pytest'],
+        'pywb.core',
-        zip_safe=False)
+        'pywb.dispatch',
-
+        'pywb.bootstrap'
        ],
    package_data={
        'pywb': ['ui/*', 'static/*', '*.yaml'],
        },
    data_files = [
        ('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')),
        ('sample_archive/zipcdx/', glob.glob('sample_archive/zipcdx/*')),
        ('sample_archive/warcs/', glob.glob('sample_archive/warcs/*')),
        ('sample_archive/text_content/', glob.glob('sample_archive/text_content/*')),
        ],
    install_requires=[
        'rfc3987',
        'chardet',
        'redis',
        'jinja2',
        'surt',
        'pyyaml',
        'WebTest',
        'pytest',
        'werkzeug>=0.9.4',
        ],
    # tests_require=['WebTest', 'pytest'],
    zip_safe=False
    )
--- a/test_config.yaml
+++ b/test_config.yaml
@ -92,10 +92,10 @@ enable_cdx_api: true
 # optional reporter callback func
 # if set, called with request and cdx object
-reporter: !!python/object/new:tests.test_integration.PrintReporter []
+reporter: !!python/object/new:tests.fixture.PrintReporter []
 # custom rules for domain specific matching
 #domain_specific_rules: rules.yaml
 #perms_checker: !!python/object/new:pywb.cdx.perms.AllowAllPerms []
-perms_checker:  !!python/object/new:tests.test_integration.TestExclusionPerms []
+perms_checker:  !!python/object/new:tests.fixture.TestExclusionPerms []
--- a/tests/fixture.py
+++ b/tests/fixture.py
@ -0,0 +1,45 @@
 import os
 import pytest
 import yaml
 from pywb.cdx.perms import AllowAllPerms
@pytest.fixture
def testconfig():
    """Load ``test_config.yaml`` from the current directory as a config dict.

    When the file does not define ``index_paths``, it is set to the
    ``../sample_archive/cdx`` path relative to this module's directory.
    """
    # NOTE: yaml.load (rather than safe_load) is needed because the test
    # config builds Python objects through ``!!python/object/new`` tags.
    # Only ever use it on this trusted, checked-in file.
    with open('test_config.yaml') as stream:
        config = yaml.load(stream)
    assert config
    if 'index_paths' not in config:
        # !!! assumes this module is in a sub-directory of project root.
        config['index_paths'] = os.path.join(
            os.path.dirname(os.path.realpath(__file__)),
            '../sample_archive/cdx')
    return config
 #================================================================
 # Reporter callback for replay view
 class PrintReporter:
     """Reporter callback for replay view.

     Calling an instance prints the request and the cdx it was given.
     """
     def __call__(self, wbrequest, cdx, response):
         # ``response`` is part of the callback signature but is not printed.
         print(wbrequest)
         print(cdx)
 #================================================================
 class TestExclusionPerms(AllowAllPerms):
     """
     Perms checker fixture that denies lookups for exactly one urlkey and
     defers to ``AllowAllPerms`` for every other one.
     """
     # sample_archive has captures for this URLKEY
     URLKEY_EXCLUDED = 'org,iana)/_img/bookmark_icon.ico'

     def allow_url_lookup(self, urlkey, url):
         """
         Return False when ``urlkey`` (canonicalized url) is the excluded
         one; otherwise return the parent class's answer for (urlkey, url).
         """
         is_excluded = (urlkey == self.URLKEY_EXCLUDED)
         if not is_excluded:
             parent = super(TestExclusionPerms, self)
             return parent.allow_url_lookup(urlkey, url)
         return False
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@ -2,14 +2,17 @@ import webtest
 from pywb.bootstrap.pywb_init import pywb_config
 from pywb.bootstrap.wbapp import create_wb_app
 from pywb.cdx.cdxobject import CDXObject
-from pywb.cdx.perms import AllowAllPerms
+
 from fixture import TestExclusionPerms
 class TestWb:
    TEST_CONFIG = 'test_config.yaml'
    def setup(self):
        #self.app = pywb.wbapp.create_wb_app(pywb.pywb_init.pywb_config())
-        self.app = create_wb_app(pywb_config(self.TEST_CONFIG))
+        # save it in self - useful for debugging
        self.router = pywb_config(self.TEST_CONFIG)
        self.app = create_wb_app(self.router)
        self.testapp = webtest.TestApp(self.app)
    def _assert_basic_html(self, resp):
@ -207,24 +210,3 @@ class TestWb:
        assert resp.status_int == 400
        assert 'Invalid Url: http://?abc' in resp.body
 #=================================================================
 # Reporter callback for replay view
 class PrintReporter:
    # Callable reporter: prints the request and the cdx it is called with.
    def __call__(self, wbrequest, cdx, response):
        # ``response`` is accepted but not used here.
        print wbrequest
        print cdx
 #=================================================================
 class TestExclusionPerms(AllowAllPerms):
    """
    Sample perms checker with one hard-coded exclusion: lookups for the
    urlkey ``org,iana)/_img/bookmark_icon.ico`` are denied, everything
    else is delegated to ``AllowAllPerms``.
    """
    def allow_url_lookup(self, urlkey, url):
        """
        Return False for the excluded urlkey (canonicalized url);
        otherwise return whatever the parent class's
        ``allow_url_lookup`` returns for (urlkey, url).
        """
        if urlkey == 'org,iana)/_img/bookmark_icon.ico':
            return False
        return super(TestExclusionPerms, self).allow_url_lookup(urlkey, url)
--- a/tests/test_wsgi_cdxserver.py
+++ b/tests/test_wsgi_cdxserver.py
@ -0,0 +1,189 @@
 import os
 import re
 import pytest
 from urllib import urlencode
 from werkzeug.test import Client
 from werkzeug.wrappers import BaseResponse, Response
 import yaml
 from pywb.cdx.cdxobject import CDXObject
 from pywb.cdx.wsgi_cdxserver import create_app
 from tests.fixture import testconfig
@pytest.fixture
def client(testconfig):
    """Return a werkzeug test ``Client`` around the app built from ``testconfig``."""
    wsgi_app = create_app(testconfig)
    test_client = Client(wsgi_app, Response)
    return test_client
 # ================================================================
 def query(client, url, **params):
     """GET ``/cdx`` through ``client`` with ``url`` plus any extra params.

     Sequence-valued params are expanded into repeated keys (``doseq``).
     Returns whatever ``client.get`` returns.
     """
     params.update(url=url)
     query_string = urlencode(params, doseq=1)
     return client.get('/cdx?' + query_string)
 # ================================================================
 def test_exact_url(client):
     """
     exact url lookup without filters or other options answers 200.
     """
     response = query(client, 'http://www.iana.org/')
     assert response.status_code == 200
     print(response.data)
 def test_prefix_match(client):
     """
     ``matchType=prefix`` must return at least one line whose first field
     is longer than the root urlkey ``org,iana)/``.
     """
     response = query(client, 'http://www.iana.org/', matchType='prefix')
     print(response.data.splitlines())
     assert response.status_code == 200
     root_key_len = len('org,iana)/')
     deeper = [line for line in response.data.splitlines()
               if len(line.split(' ')[0]) > root_key_len]
     assert len(deeper) > 0
 def test_filters(client):
     """
     exact-match filters on the mimetype and filename fields: every
     returned line must carry the filtered values.
     """
     wanted_filters = ('mimetype:warc/revisit', 'filename:dupes.warc.gz')
     response = query(client,
                      'http://www.iana.org/_css/2013.1/screen.css',
                      filter=wanted_filters)

     assert response.status_code == 200
     assert response.mimetype == 'text/plain'

     for line in response.data.splitlines():
         columns = line.split(' ')
         assert columns[0] == 'org,iana)/_css/2013.1/screen.css'
         assert columns[3] == 'warc/revisit'
         assert columns[10] == 'dupes.warc.gz'
 def test_limit(client):
     """
     ``limit=1`` yields exactly one cdx line; check which capture comes
     back with and without ``reverse=1``.
     """
     target = 'http://www.iana.org/_css/2013.1/screen.css'

     def single_capture(**extra):
         # run the query with limit=1 and return the fields of the only line
         response = query(client, target, limit='1', **extra)
         assert response.status_code == 200
         assert response.mimetype == 'text/plain'
         lines = response.data.splitlines()
         assert len(lines) == 1
         return lines[0].split(' ')

     forward = single_capture()
     assert forward[0] == 'org,iana)/_css/2013.1/screen.css'
     assert forward[1] == '20140126200625'
     assert forward[3] == 'text/css'

     backward = single_capture(reverse='1')
     assert backward[0] == 'org,iana)/_css/2013.1/screen.css'
     assert backward[1] == '20140127171239'
     assert backward[3] == 'warc/revisit'
 def test_fields(client):
     """
     retrieve subset of fields with ``fields`` parameter.

     Each line must hold exactly the three requested fields: the urlkey,
     a 14-digit timestamp, and a status code that is three digits or ``-``.
     """
     resp = query(client, 'http://www.iana.org/_css/2013.1/print.css',
                  fields='urlkey,timestamp,statuscode')

     assert resp.status_code == 200

     cdxes = resp.data.splitlines()

     for cdx in cdxes:
         fields = cdx.split(' ')
         assert len(fields) == 3
         assert fields[0] == 'org,iana)/_css/2013.1/print.css'
         assert re.match(r'\d{14}$', fields[1])
         # group the alternation and anchor it, so that values such as
         # '2000' or '-x' no longer slip through as a prefix match
         assert re.match(r'(?:\d{3}|-)$', fields[2])
 def test_fields_undefined(client):
     """
     server shall respond with Bad Request (TODO: with proper explanation),
     when ``fields`` parameter contains undefined name(s).
     """
     resp = query(client, 'http://www.iana.org/_css/2013.1/print.css',
                  fields='urlkey,nosuchfield')

     # the status comparison has to be asserted; as a bare expression it
     # would be evaluated and discarded, and the test could never fail
     assert resp.status_code == 400
 def test_resolveRevisits(client):
     """
     ``resolveRevisits=true`` appends three fields to every cdx line; for
     lines where they are set, they must equal the size, offset and
     filename recorded for an earlier line with the same digest.
     """
     response = query(client, 'http://www.iana.org/_css/2013.1/print.css',
                      resolveRevisits='true')
     assert response.status_code == 200
     assert response.mimetype == 'text/plain'

     # digest -> (size, offset, filename) of lines with unset orig_* fields
     seen = {}
     for line in response.data.splitlines():
         columns = line.split(' ')
         assert len(columns) == 14

         digest = columns[5]
         size, offset, filename = columns[8:11]
         orig_size, orig_offset, orig_fn = columns[11:14]

         # orig_* fields are either all '-' or (int, int, filename)
         if orig_size != '-':
             expected = seen.get(digest)
             assert expected == (int(orig_size), int(orig_offset), orig_fn)
         else:
             assert orig_offset == '-' and orig_fn == '-'
             seen[digest] = (int(size), int(offset), filename)
 def test_resolveRevisits_orig_fields(client):
     """
     when resolveRevisits=true, extra three fields are named
     ``orig.length``, ``orig.offset`` and ``orig.filename``, respectively.
     it is possible to filter fields by these names.
     """
     resp = query(client, 'http://www.iana.org/_css/2013.1/print.css',
                  resolveRevisits='1',
                  fields='urlkey,orig.length,orig.offset,orig.filename'
                  )

     assert resp.status_code == 200
     assert resp.mimetype == 'text/plain'

     cdxes = resp.data.splitlines()
     for cdx in cdxes:
         fields = cdx.split(' ')
         assert len(fields) == 4
         key, orig_len, orig_offset, orig_fn = fields
         if orig_len == '-':
             # unset: all three orig.* fields must be '-'
             assert orig_offset == '-' and orig_fn == '-'
         else:
             # set: length and offset must parse as integers
             # (int() raises ValueError otherwise, failing the test)
             int(orig_len)
             int(orig_offset)
 def test_collapseTime_resolveRevisits_reverse(client):
     """
     combining collapseTime, resolveRevisits and reverse must give three
     cdx records with non-increasing timestamps.
     """
     response = query(client, 'http://www.iana.org/_css/2013.1/print.css',
                      collapseTime='11',
                      resolveRevisits='true',
                      reverse='true')

     records = [CDXObject(line) for line in response.data.splitlines()]
     assert len(records) == 3

     # timestamp is in descending order
     for current, following in zip(records, records[1:]):
         assert current['timestamp'] >= following['timestamp']