diff --git a/pywb/cdx/cdxdomainspecific.py b/pywb/cdx/cdxdomainspecific.py index 7295bf0c..9890605e 100644 --- a/pywb/cdx/cdxdomainspecific.py +++ b/pywb/cdx/cdxdomainspecific.py @@ -7,6 +7,7 @@ from pywb.utils.dsrules import BaseRule, RuleSet from pywb.utils.canonicalize import unsurt, UrlCanonicalizer +from cdxobject import CDXQuery #================================================================= def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered): @@ -70,13 +71,13 @@ class FuzzyQuery: def __init__(self, rules): self.rules = rules - def __call__(self, params): + def __call__(self, query): matched_rule = None - urlkey = params['key'] - url = params['url'] - filter_ = params.get('filter', []) - output = params.get('output') + urlkey = query.key + url = query.url + filter_ = query.filters + output = query.output for rule in self.rules.iter_matching(urlkey): m = rule.regex.search(urlkey) @@ -102,7 +103,7 @@ class FuzzyQuery: 'filter': filter_, 'output': output} - return params + return CDXQuery(**params) #================================================================= diff --git a/pywb/cdx/cdxobject.py b/pywb/cdx/cdxobject.py index 8fed07af..682e8c74 100644 --- a/pywb/cdx/cdxobject.py +++ b/pywb/cdx/cdxobject.py @@ -1,6 +1,9 @@ from collections import OrderedDict import itertools +from urllib import urlencode +from urlparse import parse_qs + #================================================================= class CDXException(Exception): @@ -20,6 +23,126 @@ class AccessException(CDXException): return '403 Access Denied' +#================================================================= +class CDXQuery(object): + def __init__(self, **kwargs): + self.params = kwargs + + @property + def key(self): + return self.params['key'] + + @property + def end_key(self): + return self.params['end_key'] + + def set_key(self, key, end_key): + self.params['key'] = key + self.params['end_key'] = end_key + + @property + def url(self): + try: + return self.params['url'] + except KeyError: + msg = 'A url= param must be specified to query the cdx server' + raise CDXException(msg) + + @property + def match_type(self): + return self.params.get('matchType', 'exact') + + @property + def is_exact(self): + return self.match_type == 'exact' + + @property + def allow_fuzzy(self): + return self._get_bool('allowFuzzy') + + @property + def output(self): + return self.params.get('output', 'text') + + @property + def limit(self): + return int(self.params.get('limit', 100000)) + + @property + def collapse_time(self): + return self.params.get('collapseTime') + + @property + def resolve_revisits(self): + return self._get_bool('resolveRevisits') + + @property + def filters(self): + return self.params.get('filter', []) + + @property + def fields(self): + v = self.params.get('fields') + return v.split(',') if v else None + + @property + def closest(self): + # sort=closest is not required + return self.params.get('closest') + + @property + def reverse(self): + # sort=reverse overrides reverse=0 + return (self._get_bool('reverse') or + self.params.get('sort') == 'reverse') + + @property + def secondary_index_only(self): + return self._get_bool('showPagedIndex') + + @property + def process(self): + return self._get_bool('processOps', True) + + def set_process(self, process): + self.params['processOps'] = process + + def _get_bool(self, name, def_val=False): + v = self.params.get(name) + if v: + try: + v = int(v) + except ValueError as ex: + v = (v.lower() == 'true') + else: + v = def_val + + return bool(v) + + def urlencode(self): + return urlencode(self.params, True) + + @staticmethod + def from_wsgi_env(env): + """ utility function to extract params and create a CDXQuery + from a WSGI environment dictionary + """ + params = parse_qs(env['QUERY_STRING']) + + if not 'output' in params: + params['output'] = 'text' + + # parse_qs produces arrays for single values + # cdx processing expects singleton params for all params, + # except filters, so convert here + # use first value of the list + for name, val in params.iteritems(): + if name != 'filter': + params[name] = val[0] + + return CDXQuery(**params) + + #================================================================= class CDXObject(OrderedDict): CDX_FORMATS = [ diff --git a/pywb/cdx/cdxops.py b/pywb/cdx/cdxops.py index 4bdb0a55..72d69417 100644 --- a/pywb/cdx/cdxops.py +++ b/pywb/cdx/cdxops.py @@ -1,4 +1,4 @@ -from cdxobject import CDXObject, IDXObject, AccessException +from cdxobject import CDXObject, IDXObject, AccessException, CDXQuery from pywb.utils.timeutils import timestamp_to_sec import bisect @@ -10,7 +10,7 @@ from collections import deque #================================================================= -def cdx_load(sources, params, perms_checker=None, filter=True): +def cdx_load(sources, query, perms_checker=None, process=True): """ merge text CDX lines from sources, return an iterator for filtered and access-checked sequence of CDX objects. @@ -19,25 +19,30 @@ def cdx_load(sources, params, perms_checker=None, filter=True): :param perms_checker: access check filter object implementing allow_url_lookup(key, url), allow_capture(cdxobj) and filter_fields(cdxobj) methods. + :param process: bool, perform processing sorting/filtering/grouping ops """ - cdx_iter = load_cdx_streams(sources, params) - cdx_iter = make_obj_iter(cdx_iter, params) - cdx_iter = filter_cdx(cdx_iter, params) + cdx_iter = load_cdx_streams(sources, query) + cdx_iter = make_obj_iter(cdx_iter, query) + + if process and query.process: + cdx_iter = process_cdx(cdx_iter, query) + if perms_checker: - cdx_iter = restrict_cdx(cdx_iter, params, perms_checker) + cdx_iter = restrict_cdx(cdx_iter, query, perms_checker) + return cdx_iter #================================================================= -def restrict_cdx(cdx_iter, params, perms_checker): +def restrict_cdx(cdx_iter, query, perms_checker): """ filter out those cdx records that user doesn't have access to, by consulting :param perms_checker:. :param cdx_iter: cdx record source iterable - :param params: request parameters (dict) + :param query: request parameters (CDXQuery) :param perms_checker: object implementing permission checker """ - if not perms_checker.allow_url_lookup(params['key'], params['url']): - if params.get('matchType', 'exact') == 'exact': + if not perms_checker.allow_url_lookup(query.key, query.url): + if query.is_exact: raise AccessException('Excluded') for cdx in cdx_iter: @@ -51,31 +56,26 @@ def restrict_cdx(cdx_iter, params, perms_checker): yield cdx #================================================================= -def filter_cdx(cdx_iter, params): - if params.get('proxyAll'): - return cdx_iter - - resolve_revisits = params.get('resolveRevisits', False) - if resolve_revisits: +def process_cdx(cdx_iter, query): + if query.resolve_revisits: cdx_iter = cdx_resolve_revisits(cdx_iter) - filters = params.get('filter', None) + filters = query.filters if filters: cdx_iter = cdx_filter(cdx_iter, filters) - collapse_time = params.get('collapseTime', None) + collapse_time = query.collapse_time if collapse_time: cdx_iter = cdx_collapse_time_status(cdx_iter, collapse_time) - limit = int(params.get('limit', 1000000)) + limit = query.limit - reverse = params.get('reverse', False) or params.get('sort') == 'reverse' - if reverse: + if query.reverse: cdx_iter = cdx_reverse(cdx_iter, limit) - closest_to = params.get('closest', None) - if closest_to: - cdx_iter = cdx_sort_closest(closest_to, cdx_iter, limit) + closest = query.closest + if closest: + cdx_iter = cdx_sort_closest(closest, cdx_iter, limit) if limit: cdx_iter = cdx_limit(cdx_iter, limit) @@ -85,21 +85,21 @@ def filter_cdx(cdx_iter, params): #================================================================= # load and source merge cdx streams -def load_cdx_streams(sources, params): +def load_cdx_streams(sources, query): # Optimize: no need to merge if just one input if len(sources) == 1: - return sources[0].load_cdx(params) + return sources[0].load_cdx(query) - source_iters = map(lambda src: src.load_cdx(params), sources) + source_iters = map(lambda src: src.load_cdx(query), sources) merged_stream = merge(*(source_iters)) return merged_stream #================================================================= # convert text cdx stream to CDXObject/IDXObject -def make_obj_iter(text_iter, params): +def make_obj_iter(text_iter, query): # already converted - if params.get('showPagedIndex'): + if query.secondary_index_only: cls = IDXObject else: cls = CDXObject diff --git a/pywb/cdx/cdxserver.py b/pywb/cdx/cdxserver.py index 0de183ae..c3874a93 100644 --- a/pywb/cdx/cdxserver.py +++ b/pywb/cdx/cdxserver.py @@ -3,7 +3,7 @@ from pywb.utils.canonicalize import UrlCanonicalizer, calc_search_range from cdxops import cdx_load from cdxsource import CDXSource, CDXFile, RemoteCDXSource, RedisCDXSource from zipnum import ZipNumCluster -from cdxobject import CDXObject, CaptureNotFoundException, CDXException +from cdxobject import CDXObject, CaptureNotFoundException, CDXException, CDXQuery from cdxdomainspecific import load_domain_specific_cdx_rules from pywb.utils.loaders import is_http @@ -36,7 +36,7 @@ class BaseCDXServer(object): # set perms checker, if any self.perms_checker = kwargs.get('perms_checker') - def _check_cdx_iter(self, cdx_iter, params): + def _check_cdx_iter(self, cdx_iter, query): """ Check cdx iter semantics If iter is empty (no matches), check if fuzzy matching is allowed, and try it -- otherwise, @@ -48,21 +48,23 @@ class BaseCDXServer(object): if cdx_iter: return cdx_iter - url = params['url'] - # check if fuzzy is allowed and ensure that its an # exact match - if (self.fuzzy_query and params.get('allowFuzzy') and - params.get('matchType', 'exact') == 'exact'): + if (self.fuzzy_query and + query.allow_fuzzy and + query.is_exact): - fuzzy_params = self.fuzzy_query(params) - if fuzzy_params: - return self.load_cdx(**fuzzy_params) + fuzzy_query_params = self.fuzzy_query(query) + if fuzzy_query_params: + return self.load_cdx_query(fuzzy_query_params) - msg = 'No Captures found for: ' + url + msg = 'No Captures found for: ' + query.url raise CaptureNotFoundException(msg) def load_cdx(self, **params): + return self.load_cdx_query(CDXQuery(**params)) + + def load_cdx_query(self, query): raise NotImplementedError('Implement in subclass') @staticmethod @@ -89,26 +91,18 @@ class CDXServer(BaseCDXServer): # config argument. self._create_cdx_sources(paths, kwargs.get('config')) - def load_cdx(self, **params): - # if key not set, assume 'url' is set and needs canonicalization - if not params.get('key'): - try: - url = params['url'] - except KeyError: - msg = 'A url= param must be specified to query the cdx server' - raise CDXException(msg) + def load_cdx_query(self, query): + url = query.url + key, end_key = calc_search_range(url=url, + match_type=query.match_type, + url_canon=self.url_canon) + query.set_key(key, end_key) - match_type = params.get('matchType', 'exact') - - key, end_key = calc_search_range(url=url, - match_type=match_type, - url_canon=self.url_canon) - params['key'] = key - params['end_key'] = end_key - - cdx_iter = cdx_load(self.sources, params, + cdx_iter = cdx_load(self.sources, + query, perms_checker=self.perms_checker) - return self._check_cdx_iter(cdx_iter, params) + + return self._check_cdx_iter(cdx_iter, query) def _create_cdx_sources(self, paths, config): """ @@ -186,9 +180,9 @@ class RemoteCDXServer(BaseCDXServer): else: raise Exception('Invalid remote cdx source: ' + str(source)) - def load_cdx(self, **params): - remote_iter = cdx_load((self.sources,), params, filter=False) - return self._check_cdx_iter(remote_iter, params) + def load_cdx_query(self, query): + remote_iter = cdx_load(self.sources, query, process=False) + return self._check_cdx_iter(remote_iter, query) def __str__(self): return 'Remote CDX server serving from ' + str(self.sources[0]) @@ -220,23 +214,4 @@ def create_cdx_server(config, ds_rules_file=None): ds_rules_file=ds_rules_file, perms_checker=perms_checker) -#================================================================= -def extract_params_from_wsgi_env(env): - """ utility function to extract params from the query - string of a WSGI environment dictionary - """ - # use url= param to get actual url - params = urlparse.parse_qs(env['QUERY_STRING']) - if not 'output' in params: - params['output'] = 'text' - - # parse_qs produces arrays for single values - # cdx processing expects singleton params for all params, - # except filters, so convert here - # use first value of the list - for name, val in params.iteritems(): - if name != 'filter': - params[name] = val[0] - - return params diff --git a/pywb/cdx/cdxsource.py b/pywb/cdx/cdxsource.py index ba5f8b3b..c17312c0 100644 --- a/pywb/cdx/cdxsource.py +++ b/pywb/cdx/cdxsource.py @@ -12,7 +12,7 @@ class CDXSource(object): """ Represents any cdx index source """ - def load_cdx(self, params): + def load_cdx(self, query): raise NotImplementedError('Implement in subclass') @@ -24,9 +24,9 @@ class CDXFile(CDXSource): def __init__(self, filename): self.filename = filename - def load_cdx(self, params): + def load_cdx(self, query): source = SeekableTextFileReader(self.filename) - return iter_range(source, params.get('key'), params.get('end_key')) + return iter_range(source, query.key, query.end_key) def __str__(self): return 'CDX File - ' + self.filename @@ -45,20 +45,16 @@ class RemoteCDXSource(CDXSource): self.cookie = cookie self.proxy_all = proxy_all - def load_cdx(self, proxy_params): + def load_cdx(self, query): if self.proxy_all: - params = proxy_params - params['proxyAll'] = True + query.set_process(False) + remote_query = query else: # Only send url and matchType params to remote - params = {} - params['url'] = proxy_params['url'] - match_type = proxy_params.get('matchType') + remote_query = CDXQuery(url=query.url, + match_type=query.matchType) - if match_type: - proxy_params['matchType'] = match_type - - urlparams = urllib.urlencode(params, True) + urlparams = remote_query.urlencode() try: request = urllib2.Request(self.remote_url, urlparams) @@ -97,14 +93,14 @@ class RedisCDXSource(CDXSource): self.key_prefix = config.get('redis_key_prefix', self.key_prefix) - def load_cdx(self, params): + def load_cdx(self, query): """ Load cdx from redis cache, from an ordered list Currently, there is no support for range queries Only 'exact' matchType is supported """ - key = params['key'] + key = query.key # ensure only url/surt is part of key key = key.split(' ')[0] diff --git a/pywb/cdx/wsgi_cdxserver.py b/pywb/cdx/wsgi_cdxserver.py index c138e5c6..e6ab6067 100644 --- a/pywb/cdx/wsgi_cdxserver.py +++ b/pywb/cdx/wsgi_cdxserver.py @@ -1,6 +1,7 @@ -from werkzeug.wrappers import BaseRequest, BaseResponse +from werkzeug.wrappers import BaseResponse from cdxserver import create_cdx_server from pywb import get_test_dir +from cdxobject import CDXQuery import logging import os @@ -18,37 +19,10 @@ DEFAULT_PORT = 8080 #================================================================= -class CDXQueryRequest(BaseRequest): +class CDXQueryRequest(object): def __init__(self, environ): - super(CDXQueryRequest, self).__init__(environ) + self.query = CDXQuery.from_wsgi_env(environ) - def _get_bool(self, name): - v = self.args.get(name) - if v: - try: - v = int(s) - except ValueError as ex: - v = (s.lower() == 'true') - return bool(v) - @property - def output(self): - return self.args.get('output', 'text') - @property - def filter(self): - return self.args.getlist('filter', []) - @property - def fields(self): - v = self.args.get('fields') - return v.split(',') if v else None - @property - def reverse(self): - # sort=reverse overrides reverse=0 - return (self._get_bool('reverse') or - self.args.get('sort') == 'reverse') - @property - def params(self): - return dict(t if t[0] == 'filter' else (t[0], t[1][0]) - for t in self.args.iterlists()) class WSGICDXServer(object): def __init__(self, config, rules_file): @@ -57,11 +31,11 @@ class WSGICDXServer(object): def __call__(self, environ, start_response): request = CDXQueryRequest(environ) try: - logging.debug('request.args=%s', request.params) - result = self.cdxserver.load_cdx(**request.params) + logging.debug('request.args=%s', request.query) + result = self.cdxserver.load_cdx_query(request.query) # TODO: select response type by "output" parameter - response = PlainTextResponse(result, request.fields) + response = PlainTextResponse(result, request.query.fields) return response(environ, start_response) except Exception as exc: logging.error('load_cdx failed', exc_info=1) @@ -74,7 +48,7 @@ def cdx_text_out(cdx, fields): if not fields: return str(cdx) + '\n' else: - logging.info('cdx fields=%s', cdx.keys()) + logging.info('cdx fields=%s', cdx.keys) # TODO: this will results in an exception if fields contain # non-existent field name. return ' '.join(cdx[x] for x in fields) + '\n' diff --git a/pywb/cdx/zipnum.py b/pywb/cdx/zipnum.py index 847c660f..fbb1503f 100644 --- a/pywb/cdx/zipnum.py +++ b/pywb/cdx/zipnum.py @@ -110,21 +110,21 @@ class ZipNumCluster(CDXSource): def lookup_loc(self, part): return self.loc_map[part] - def load_cdx(self, params): + def load_cdx(self, query): self.reload_loc() reader = SeekableTextFileReader(self.summary) idx_iter = iter_range(reader, - params['key'], - params['end_key'], + query.key, + query.end_key, prev_size=1) - if params.get('showPagedIndex'): - params['proxyAll'] = True + if query.secondary_index_only: + query.set_process(False) return idx_iter else: - blocks = self.idx_to_cdx(idx_iter, params) + blocks = self.idx_to_cdx(idx_iter, query) def gen_cdx(): for blk in blocks: @@ -133,7 +133,7 @@ class ZipNumCluster(CDXSource): return gen_cdx() - def idx_to_cdx(self, idx_iter, params): + def idx_to_cdx(self, idx_iter, query): blocks = None ranges = [] @@ -150,7 +150,7 @@ class ZipNumCluster(CDXSource): else: if blocks: - yield self.block_to_cdx_iter(blocks, ranges, params) + yield self.block_to_cdx_iter(blocks, ranges, query) blocks = ZipBlocks(idx['part'], idx['offset'], @@ -160,15 +160,15 @@ class ZipNumCluster(CDXSource): ranges = [blocks.length] if blocks: - yield self.block_to_cdx_iter(blocks, ranges, params) + yield self.block_to_cdx_iter(blocks, ranges, query) - def block_to_cdx_iter(self, blocks, ranges, params): + def block_to_cdx_iter(self, blocks, ranges, query): last_exc = None last_traceback = None for location in self.lookup_loc(blocks.part): try: - return self.load_blocks(location, blocks, ranges, params) + return self.load_blocks(location, blocks, ranges, query) except Exception as exc: last_exc = exc import sys @@ -179,7 +179,7 @@ class ZipNumCluster(CDXSource): else: raise Exception('No Locations Found for: ' + block.part) - def load_blocks(self, location, blocks, ranges, params): + def load_blocks(self, location, blocks, ranges, query): if (logging.getLogger().getEffectiveLevel() <= logging.DEBUG): msg = 'Loading {b.count} blocks from {loc}:{b.offset}+{b.length}' @@ -195,9 +195,9 @@ class ZipNumCluster(CDXSource): iter_ = itertools.chain(*itertools.imap(decompress_block, ranges)) # start bound - iter_ = linearsearch(iter_, params['key']) + iter_ = linearsearch(iter_, query.key) # end bound - end = params['end_key'] + end = query.end_key iter_ = itertools.takewhile(lambda line: line < end, iter_) return iter_ diff --git a/pywb/handlers.py b/pywb/handlers.py index c82db7fe..10456380 100644 --- a/pywb/handlers.py +++ b/pywb/handlers.py @@ -4,7 +4,7 @@ import mimetypes import time from pywb.rewrite.wburl import WbUrl -from pywb.cdx.cdxserver import extract_params_from_wsgi_env +from pywb.cdx.cdxobject import CDXQuery from wbrequestresponse import WbResponse from wbexceptions import WbException, NotFoundException from views import TextCapturesView @@ -79,8 +79,8 @@ class CDXHandler(BaseHandler): self.view = view if view else TextCapturesView() def __call__(self, wbrequest): - params = extract_params_from_wsgi_env(wbrequest.env) - cdx_lines = self.index_reader.load_cdx(**params) + query = CDXQuery.from_wsgi_env(wbrequest.env) + cdx_lines = self.index_reader.load_cdx_query(query) return self.view.render_response(wbrequest, cdx_lines) diff --git a/pywb/indexreader.py b/pywb/indexreader.py index a422d0b4..ff17dfde 100644 --- a/pywb/indexreader.py +++ b/pywb/indexreader.py @@ -34,6 +34,9 @@ class IndexReader(object): return cdxlines + def load_cdx_query(self, query): + return self.cdx_server.load_cdx_query(query) + def load_cdx(self, **params): return self.cdx_server.load_cdx(**params) diff --git a/setup.py b/setup.py index 7f843161..c9ff86bd 100755 --- a/setup.py +++ b/setup.py @@ -26,7 +26,6 @@ setup( ('sample_archive/text_content/', glob.glob('sample_archive/text_content/*')), ], install_requires=[ - 'uwsgi', 'rfc3987', 'chardet', 'redis', @@ -36,7 +35,6 @@ setup( 'WebTest', 'pytest', 'werkzeug>=0.9.4', - 'setuptools', ], # tests_require=['WebTest', 'pytest'], zip_safe=False