diff --git a/pywb/cdx/cdxdomainspecific.py b/pywb/cdx/cdxdomainspecific.py index 556534a7..2e8a3855 100644 --- a/pywb/cdx/cdxdomainspecific.py +++ b/pywb/cdx/cdxdomainspecific.py @@ -1,12 +1,13 @@ import yaml import re import logging -import pkgutil +import pkg_resources from pywb.utils.dsrules import BaseRule, RuleSet from pywb.utils.canonicalize import unsurt, UrlCanonicalizer +from query import CDXQuery #================================================================= def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered): @@ -70,13 +71,13 @@ class FuzzyQuery: def __init__(self, rules): self.rules = rules - def __call__(self, params): + def __call__(self, query): matched_rule = None - urlkey = params['key'] - url = params['url'] - filter_ = params.get('filter', []) - output = params.get('output') + urlkey = query.key + url = query.url + filter_ = query.filters + output = query.output for rule in self.rules.iter_matching(urlkey): m = rule.regex.search(urlkey) @@ -102,7 +103,7 @@ class FuzzyQuery: 'filter': filter_, 'output': output} - return params + return CDXQuery(**params) #================================================================= diff --git a/pywb/cdx/cdxobject.py b/pywb/cdx/cdxobject.py index 4eba8025..3915f169 100644 --- a/pywb/cdx/cdxobject.py +++ b/pywb/cdx/cdxobject.py @@ -1,6 +1,9 @@ from collections import OrderedDict import itertools +from urllib import urlencode +from urlparse import parse_qs + #================================================================= class CDXException(Exception): @@ -71,12 +74,25 @@ class CDXObject(OrderedDict): # force regen on next __str__ call self.cdxline = None + def is_revisit(self): + return (self['mimetype'] == 'warc/revisit' or + self['filename'] == '-') + + def to_text(self, fields=None): + """ + return plaintext CDX record (includes newline). + :param fields: list of field names to output. + """ + if fields is None: + return str(self) + '\n' + else: + return ' '.join(self[x] for x in fields) + '\n' + def __str__(self): if self.cdxline: return self.cdxline - li = itertools.imap(lambda (n, val): val, self.items()) - return ' '.join(li) + return ' '.join(val for n, val in self.iteritems()) #================================================================= @@ -106,5 +122,12 @@ class IDXObject(OrderedDict): self.idxline = idxline + def to_text(self, fields=None): + """ + return plaintext IDX record (including newline). + :param fields: list of field names to output (currently ignored) + """ + return str(self) + '\n' + def __str__(self): return self.idxline diff --git a/pywb/cdx/cdxops.py b/pywb/cdx/cdxops.py index 1a90d7ca..c4f865c2 100644 --- a/pywb/cdx/cdxops.py +++ b/pywb/cdx/cdxops.py @@ -1,4 +1,5 @@ from cdxobject import CDXObject, IDXObject, AccessException +from query import CDXQuery from pywb.utils.timeutils import timestamp_to_sec import bisect @@ -10,32 +11,44 @@ from collections import deque #================================================================= -def cdx_load(sources, params, perms_checker=None): +def cdx_load(sources, query, perms_checker=None, process=True): + """ + merge text CDX lines from sources, return an iterator for + filtered and access-checked sequence of CDX objects. + + :param sources: iterable for text CDX sources. + :param perms_checker: access check filter object implementing + allow_url_lookup(key, url), allow_capture(cdxobj) and + filter_fields(cdxobj) methods. + :param process: bool, perform processing sorting/filtering/grouping ops + """ + cdx_iter = load_cdx_streams(sources, query) + cdx_iter = make_obj_iter(cdx_iter, query) + + if process and not query.secondary_index_only: + cdx_iter = process_cdx(cdx_iter, query) + if perms_checker: - cdx_iter = cdx_load_with_perms(sources, params, perms_checker) - else: - cdx_iter = cdx_load_and_filter(sources, params) - - # output raw cdx objects - if params.get('output') == 'raw': - return cdx_iter - - def write_cdx(fields): - for cdx in cdx_iter: - yield cdx_text_out(cdx, fields) + '\n' - - return write_cdx(params.get('fields')) + cdx_iter = restrict_cdx(cdx_iter, query, perms_checker) + return cdx_iter #================================================================= -def cdx_load_with_perms(sources, params, perms_checker): - if not perms_checker.allow_url_lookup(params['key'], params['url']): - if params.get('matchType', 'exact') == 'exact': +def restrict_cdx(cdx_iter, query, perms_checker): + """ + filter out those cdx records that user doesn't have access to, + by consulting :param perms_checker:. + :param cdx_iter: cdx record source iterable + :param query: request parameters (CDXQuery) + :param perms_checker: object implementing permission checker + """ + if not perms_checker.allow_url_lookup(query.key, query.url): + if query.is_exact: raise AccessException('Excluded') - cdx_iter = cdx_load_and_filter(sources, params) - for cdx in cdx_iter: + # TODO: we could let filter_fields handle this case by accepting + # None as a return value. if not perms_checker.allow_capture(cdx): continue @@ -43,45 +56,27 @@ def cdx_load_with_perms(sources, params, perms_checker): yield cdx - #================================================================= -def cdx_text_out(cdx, fields): - if not fields: - return str(cdx) - else: - return ' '.join(map(lambda x: cdx[x], fields.split(','))) - - -#================================================================= -def cdx_load_and_filter(sources, params): - cdx_iter = load_cdx_streams(sources, params) - - cdx_iter = make_obj_iter(cdx_iter, params) - - if params.get('proxyAll'): - return cdx_iter - - resolve_revisits = params.get('resolveRevisits', False) - if resolve_revisits: +def process_cdx(cdx_iter, query): + if query.resolve_revisits: cdx_iter = cdx_resolve_revisits(cdx_iter) - filters = params.get('filter', None) + filters = query.filters if filters: cdx_iter = cdx_filter(cdx_iter, filters) - collapse_time = params.get('collapseTime', None) + collapse_time = query.collapse_time if collapse_time: cdx_iter = cdx_collapse_time_status(cdx_iter, collapse_time) - limit = int(params.get('limit', 1000000)) + limit = query.limit - reverse = params.get('reverse', False) or params.get('sort') == 'reverse' - if reverse: + if query.reverse: cdx_iter = cdx_reverse(cdx_iter, limit) - closest_to = params.get('closest', None) - if closest_to: - cdx_iter = cdx_sort_closest(closest_to, cdx_iter, limit) + closest = query.closest + if closest: + cdx_iter = cdx_sort_closest(closest, cdx_iter, limit) if limit: cdx_iter = cdx_limit(cdx_iter, limit) @@ -91,26 +86,28 @@ def cdx_load_and_filter(sources, params): #================================================================= # load and source merge cdx streams -def load_cdx_streams(sources, params): +def load_cdx_streams(sources, query): # Optimize: no need to merge if just one input if len(sources) == 1: - return sources[0].load_cdx(params) + cdx_iter = sources[0].load_cdx(query) + else: + source_iters = map(lambda src: src.load_cdx(query), sources) + cdx_iter = merge(*(source_iters)) - source_iters = map(lambda src: src.load_cdx(params), sources) - merged_stream = merge(*(source_iters)) - return merged_stream + for cdx in cdx_iter: + yield cdx #================================================================= # convert text cdx stream to CDXObject/IDXObject -def make_obj_iter(text_iter, params): +def make_obj_iter(text_iter, query): # already converted - if params.get('showPagedIndex'): + if query.secondary_index_only: cls = IDXObject else: cls = CDXObject - return itertools.imap(lambda line: cls(line), text_iter) + return (cls(line) for line in text_iter) #================================================================= @@ -161,6 +158,7 @@ def cdx_filter(cdx_iter, filter_strings): if string.startswith('='): string = string[1:] self.compare_func = self.exact + # contains match elif string.startswith('~'): string = string[1:] self.compare_func = self.contains @@ -257,8 +255,8 @@ def cdx_resolve_revisits(cdx_iter): originals = {} for cdx in cdx_iter: - is_revisit = ((cdx['mimetype'] == 'warc/revisit') or - (cdx['filename'] == '-')) + + is_revisit = cdx.is_revisit() digest = cdx['digest'] diff --git a/pywb/cdx/cdxserver.py b/pywb/cdx/cdxserver.py index fd0c14e9..54d46f4b 100644 --- a/pywb/cdx/cdxserver.py +++ b/pywb/cdx/cdxserver.py @@ -4,6 +4,7 @@ from cdxops import cdx_load from cdxsource import CDXSource, CDXFile, RemoteCDXSource, RedisCDXSource from zipnum import ZipNumCluster from cdxobject import CDXObject, CaptureNotFoundException, CDXException +from query import CDXQuery from cdxdomainspecific import load_domain_specific_cdx_rules from pywb.utils.loaders import is_http @@ -36,7 +37,7 @@ class BaseCDXServer(object): # set perms checker, if any self.perms_checker = kwargs.get('perms_checker') - def _check_cdx_iter(self, cdx_iter, params): + def _check_cdx_iter(self, cdx_iter, query): """ Check cdx iter semantics If iter is empty (no matches), check if fuzzy matching is allowed, and try it -- otherwise, @@ -48,21 +49,23 @@ class BaseCDXServer(object): if cdx_iter: return cdx_iter - url = params['url'] - # check if fuzzy is allowed and ensure that its an # exact match - if (self.fuzzy_query and params.get('allowFuzzy') and - params.get('matchType', 'exact') == 'exact'): + if (self.fuzzy_query and + query.allow_fuzzy and + query.is_exact): - fuzzy_params = self.fuzzy_query(params) - if fuzzy_params: - return self.load_cdx(**fuzzy_params) + fuzzy_query_params = self.fuzzy_query(query) + if fuzzy_query_params: + return self.load_cdx_query(fuzzy_query_params) - msg = 'No Captures found for: ' + url + msg = 'No Captures found for: ' + query.url raise CaptureNotFoundException(msg) def load_cdx(self, **params): + return self.load_cdx_query(CDXQuery(**params)) + + def load_cdx_query(self, query): raise NotImplementedError('Implement in subclass') @staticmethod @@ -84,28 +87,77 @@ class CDXServer(BaseCDXServer): def __init__(self, paths, **kwargs): super(CDXServer, self).__init__(**kwargs) - self.sources = create_cdx_sources(paths, kwargs.get('config')) + # TODO: we could save config in member, so that other + # methods can use it. it's bad for add_cdx_source to take + # config argument. + self._create_cdx_sources(paths, kwargs.get('config')) - def load_cdx(self, **params): - # if key not set, assume 'url' is set and needs canonicalization - if not params.get('key'): - try: - url = params['url'] - except KeyError: - msg = 'A url= param must be specified to query the cdx server' - raise CDXException(msg) + def load_cdx_query(self, query): + url = query.url + key, end_key = calc_search_range(url=url, + match_type=query.match_type, + url_canon=self.url_canon) + query.set_key(key, end_key) - match_type = params.get('matchType', 'exact') + cdx_iter = cdx_load(self.sources, + query, + perms_checker=self.perms_checker) - key, end_key = calc_search_range(url=url, - match_type=match_type, - url_canon=self.url_canon) - params['key'] = key - params['end_key'] = end_key + return self._check_cdx_iter(cdx_iter, query) - cdx_iter = cdx_load(self.sources, params, self.perms_checker) + def _create_cdx_sources(self, paths, config): + """ + build CDXSource instances for each of path in :param paths:. + :param paths: list of sources or single source. + each source may be either string or CDXSource instance. value + of any other types will be silently ignored. + :param config: config object passed to :method:`add_cdx_source`. + """ + self.sources = [] - return self._check_cdx_iter(cdx_iter, params) + if paths is not None: + if not isinstance(paths, (list, tuple)): + paths = [paths] + + for path in paths: + self.add_cdx_source(path, config) + + if len(self.sources) == 0: + logging.warn('No CDX Sources configured from paths=%s', paths) + + def _add_cdx_source(self, source): + if source is None: return + logging.debug('Adding CDX Source: %s', source) + self.sources.append(source) + + def add_cdx_source(self, source, config): + if source is None: return + if isinstance(source, CDXSource): + self._add_cdx_source(source) + elif isinstance(source, str): + if os.path.isdir(source): + for fn in os.listdir(source): + self._add_cdx_source(self._create_cdx_source( + os.path.join(source, fn), config)) + else: + self._add_cdx_source(self._create_cdx_source( + source, config)) + + def _create_cdx_source(self, filename, config): + if is_http(filename): + return RemoteCDXSource(filename) + + if filename.startswith('redis://'): + return RedisCDXSource(filename, config) + + if filename.endswith('.cdx'): + return CDXFile(filename) + + if filename.endswith(('.summary', '.idx')): + return ZipNumCluster(filename, config) + + logging.warn('skipping unrecognized URI:%s', filename) + return None def __str__(self): return 'CDX server serving from ' + str(self.sources) @@ -123,20 +175,14 @@ class RemoteCDXServer(BaseCDXServer): if isinstance(source, RemoteCDXSource): self.source = source - elif (isinstance(source, str) and - any(source.startswith(x) for x in ['http://', 'https://'])): - self.source = RemoteCDXSource(source) + elif (isinstance(source, str) and is_http(source)): + self.source = RemoteCDXSource(source, remote_processing=True) else: raise Exception('Invalid remote cdx source: ' + str(source)) - def load_cdx(self, **params): - remote_iter = self.source.load_cdx(params) - - # if need raw, convert to raw format here - if params.get('output') == 'raw': - remote_iter = (CDXObject(cdx) for cdx in remote_iter) - - return self._check_cdx_iter(remote_iter, params) + def load_cdx_query(self, query): + remote_iter = cdx_load([self.source], query, process=False) + return self._check_cdx_iter(remote_iter, query) def __str__(self): return 'Remote CDX server serving from ' + str(self.sources[0]) @@ -169,74 +215,3 @@ def create_cdx_server(config, ds_rules_file=None): perms_checker=perms_checker) -#================================================================= -def create_cdx_sources(paths, config=None): - sources = [] - - if not isinstance(paths, list): - paths = [paths] - - for path in paths: - if isinstance(path, CDXSource): - add_cdx_source(sources, path, config) - elif isinstance(path, str): - if os.path.isdir(path): - for file in os.listdir(path): - add_cdx_source(sources, path + file, config) - else: - add_cdx_source(sources, path, config) - - if len(sources) == 0: - logging.exception('No CDX Sources Found from: ' + str(sources)) - - return sources - - -#================================================================= -def add_cdx_source(sources, source, config): - if not isinstance(source, CDXSource): - source = create_cdx_source(source, config) - if not source: - return - - logging.debug('Adding CDX Source: ' + str(source)) - sources.append(source) - - -#================================================================= -def create_cdx_source(filename, config): - if is_http(filename): - return RemoteCDXSource(filename) - - if filename.startswith('redis://'): - return RedisCDXSource(filename, config) - - if filename.endswith('.cdx'): - return CDXFile(filename) - - if filename.endswith(('.summary', '.idx')): - return ZipNumCluster(filename, config) - - return None - - -#================================================================= -def extract_params_from_wsgi_env(env): - """ utility function to extract params from the query - string of a WSGI environment dictionary - """ - # use url= param to get actual url - params = urlparse.parse_qs(env['QUERY_STRING']) - - if not 'output' in params: - params['output'] = 'text' - - # parse_qs produces arrays for single values - # cdx processing expects singleton params for all params, - # except filters, so convert here - # use first value of the list - for name, val in params.iteritems(): - if name != 'filter': - params[name] = val[0] - - return params diff --git a/pywb/cdx/cdxsource.py b/pywb/cdx/cdxsource.py index ba5f8b3b..0923fba9 100644 --- a/pywb/cdx/cdxsource.py +++ b/pywb/cdx/cdxsource.py @@ -2,6 +2,7 @@ from pywb.utils.binsearch import iter_range from pywb.utils.loaders import SeekableTextFileReader from cdxobject import AccessException +from query import CDXQuery import urllib import urllib2 @@ -12,7 +13,7 @@ class CDXSource(object): """ Represents any cdx index source """ - def load_cdx(self, params): + def load_cdx(self, query): raise NotImplementedError('Implement in subclass') @@ -24,9 +25,9 @@ class CDXFile(CDXSource): def __init__(self, filename): self.filename = filename - def load_cdx(self, params): + def load_cdx(self, query): source = SeekableTextFileReader(self.filename) - return iter_range(source, params.get('key'), params.get('end_key')) + return iter_range(source, query.key, query.end_key) def __str__(self): return 'CDX File - ' + self.filename @@ -40,25 +41,20 @@ class RemoteCDXSource(CDXSource): Only url and match type params are proxied at this time, the stream is passed through all other filters locally. """ - def __init__(self, filename, cookie=None, proxy_all=True): + def __init__(self, filename, cookie=None, remote_processing=False): self.remote_url = filename self.cookie = cookie - self.proxy_all = proxy_all + self.remote_processing = remote_processing - def load_cdx(self, proxy_params): - if self.proxy_all: - params = proxy_params - params['proxyAll'] = True + def load_cdx(self, query): + if self.remote_processing: + remote_query = query else: # Only send url and matchType params to remote - params = {} - params['url'] = proxy_params['url'] - match_type = proxy_params.get('matchType') + remote_query = CDXQuery(url=query.url, + match_type=query.match_type) - if match_type: - proxy_params['matchType'] = match_type - - urlparams = urllib.urlencode(params, True) + urlparams = remote_query.urlencode() try: request = urllib2.Request(self.remote_url, urlparams) @@ -97,14 +93,14 @@ class RedisCDXSource(CDXSource): self.key_prefix = config.get('redis_key_prefix', self.key_prefix) - def load_cdx(self, params): + def load_cdx(self, query): """ Load cdx from redis cache, from an ordered list Currently, there is no support for range queries Only 'exact' matchType is supported """ - key = params['key'] + key = query.key # ensure only url/surt is part of key key = key.split(' ')[0] diff --git a/pywb/cdx/query.py b/pywb/cdx/query.py new file mode 100644 index 00000000..dc480836 --- /dev/null +++ b/pywb/cdx/query.py @@ -0,0 +1,119 @@ +from urllib import urlencode +from urlparse import parse_qs + + +#================================================================= +class CDXQuery(object): + def __init__(self, **kwargs): + self.params = kwargs + + @property + def key(self): + return self.params['key'] + + @property + def end_key(self): + return self.params['end_key'] + + def set_key(self, key, end_key): + self.params['key'] = key + self.params['end_key'] = end_key + + @property + def url(self): + try: + return self.params['url'] + except KeyError: + msg = 'A url= param must be specified to query the cdx server' + raise CDXException(msg) + + @property + def match_type(self): + return self.params.get('matchType', 'exact') + + @property + def is_exact(self): + return self.match_type == 'exact' + + @property + def allow_fuzzy(self): + return self._get_bool('allowFuzzy') + + @property + def output(self): + return self.params.get('output', 'text') + + @property + def limit(self): + return int(self.params.get('limit', 100000)) + + @property + def collapse_time(self): + return self.params.get('collapseTime') + + @property + def resolve_revisits(self): + return self._get_bool('resolveRevisits') + + @property + def filters(self): + return self.params.get('filter', []) + + @property + def fields(self): + v = self.params.get('fields') + return v.split(',') if v else None + + @property + def closest(self): + # sort=closest is not required + return self.params.get('closest') + + @property + def reverse(self): + # sort=reverse overrides reverse=0 + return (self._get_bool('reverse') or + self.params.get('sort') == 'reverse') + + @property + def secondary_index_only(self): + return self._get_bool('showPagedIndex') + + def _get_bool(self, name, def_val=False): + v = self.params.get(name) + if v: + try: + v = int(v) + except ValueError as ex: + v = (v.lower() == 'true') + else: + v = def_val + + return bool(v) + + def urlencode(self): + return urlencode(self.params, True) + + @staticmethod + def from_wsgi_env(env): + return CDXQuery(**CDXQuery.extract_params_from_wsgi_env(env)) + + @staticmethod + def extract_params_from_wsgi_env(env): + """ utility function to extract params and create a CDXQuery + from a WSGI environment dictionary + """ + params = parse_qs(env['QUERY_STRING']) + + if not 'output' in params: + params['output'] = 'text' + + # parse_qs produces arrays for single values + # cdx processing expects singleton params for all params, + # except filters, so convert here + # use first value of the list + for name, val in params.iteritems(): + if name != 'filter': + params[name] = val[0] + + return params diff --git a/pywb/cdx/test/cdxserver_test.py b/pywb/cdx/test/cdxserver_test.py index e5fac6b3..e261ead4 100644 --- a/pywb/cdx/test/cdxserver_test.py +++ b/pywb/cdx/test/cdxserver_test.py @@ -142,6 +142,8 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex ('filename', 'dupes.warc.gz')] # NOTE: external dependency -- need self-contained test TODO + +# Load remote query but filter locally >>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2') >>> pprint.pprint(x.next().items()) [('urlkey', 'com,example)/'), @@ -152,14 +154,24 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex ('digest', 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA'), ('length', '1792')] +# No local filtering/processing of cdx, simply return result from remote server +>>> x = RemoteCDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2') +>>> pprint.pprint(x.next().items()) +[('urlkey', 'com,example)/'), + ('timestamp', '20020120142510'), + ('original', 'http://example.com:80/'), + ('mimetype', 'text/html'), + ('statuscode', '200'), + ('digest', 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA'), + ('length', '1792')] ->>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'facebook.com', output = 'raw', limit = '2') +>>> x = RemoteCDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'facebook.com', output = 'raw', limit = '2') Traceback (most recent call last): AccessException: Blocked By Robots """ #================================================================= -from pywb.cdx.cdxserver import CDXServer +from pywb.cdx.cdxserver import CDXServer, RemoteCDXServer import os import sys import pprint @@ -167,22 +179,42 @@ import pprint from pywb import get_test_dir #test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample_data/' test_cdx_dir = get_test_dir() + 'cdx/' +from pywb.cdx.cdxobject import AccessException + +from tests.fixture import testconfig, TestExclusionPerms + +import pytest def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams): kwparams['url'] = url - kwparams['output'] = 'text' + fields = kwparams.get('fields') + if fields: + fields = fields.split(',') server = CDXServer(sources) results = server.load_cdx(**kwparams) for x in results: - x = x.replace('\t', ' ') - sys.stdout.write(x) + l = x.to_text(fields).replace('\t', ' ') + sys.stdout.write(l) +#================================================================ + +def test_excluded(testconfig): + testconfig['perms_checker'] = TestExclusionPerms() + sources = testconfig.get('index_paths') + print sources + server = CDXServer(sources, perms_checker=testconfig['perms_checker']) + assert isinstance(server, CDXServer) + assert server.perms_checker + + url = 'http://www.iana.org/_img/bookmark_icon.ico' + key = 'org,iana)/_img/bookmark_icon.ico' + with pytest.raises(AccessException): + cdxobjs = list(server.load_cdx(url=url)) + print cdxobjs if __name__ == "__main__": import doctest doctest.testmod() - - diff --git a/pywb/cdx/test/test_perms.py b/pywb/cdx/test/test_perms.py new file mode 100644 index 00000000..eb5a30ac --- /dev/null +++ b/pywb/cdx/test/test_perms.py @@ -0,0 +1,28 @@ +from pywb.cdx.cdxops import cdx_load +from pywb.cdx.perms import AllowAllPerms +from pywb.cdx.query import CDXQuery +from pywb.cdx.cdxobject import AccessException + +from pytest import raises + +class BlockAllPerms(AllowAllPerms): + def allow_url_lookup(self, urlkey, url): + return False + + +def test_exclusion_short_circuit(): + """ + # Verify that exclusion check 'short-circuits' further evaluation.. eg, a bad cdx source is not even loaded + # if exclusion check does not pass + """ + cdx_iter = cdx_load(['bogus ignored'], CDXQuery(url='example.com', key='com,example)/'), + perms_checker=BlockAllPerms(), process=True) + + # exception happens on first access attempt + with raises(AccessException): + cdx_iter.next() + + + + + diff --git a/pywb/cdx/test/wsgi_cdxserver_test.py b/pywb/cdx/test/wsgi_cdxserver_test.py index 70c4fe71..a7d1ecdb 100644 --- a/pywb/cdx/test/wsgi_cdxserver_test.py +++ b/pywb/cdx/test/wsgi_cdxserver_test.py @@ -1,10 +1,10 @@ import webtest -from pywb.cdx.wsgi_cdxserver import main +from pywb.cdx.wsgi_cdxserver import create_app from pywb import get_test_dir class TestCdx: def setup(self): - self.app = main(get_test_dir() + 'cdx/') + self.app = create_app(get_test_dir() + 'cdx/') self.testapp = webtest.TestApp(self.app) def test_cdx(self): diff --git a/pywb/cdx/wsgi_cdxserver.py b/pywb/cdx/wsgi_cdxserver.py index c4e0649c..c9fe11d7 100644 --- a/pywb/cdx/wsgi_cdxserver.py +++ b/pywb/cdx/wsgi_cdxserver.py @@ -1,10 +1,12 @@ -from cdxserver import create_cdx_server, extract_params_from_wsgi_env +from werkzeug.wrappers import BaseResponse +from cdxserver import create_cdx_server from pywb import get_test_dir +from query import CDXQuery import logging import os import yaml -import pkgutil +import pkg_resources #================================================================= CONFIG_FILE = 'config.yaml' @@ -13,65 +15,89 @@ RULES_FILE = 'rules.yaml' DEFAULT_PORT = 8080 -config = None -if __package__: - try: - config = pkgutil.get_data(__package__, CONFIG_FILE) - config = yaml.load(config) - except: - pass - - #================================================================= -def main(paths=None): + +class CDXQueryRequest(object): + def __init__(self, environ): + self.query = CDXQuery.from_wsgi_env(environ) + + +class WSGICDXServer(object): + def __init__(self, config, rules_file): + self.cdxserver = create_cdx_server(config, rules_file) + + def __call__(self, environ, start_response): + request = CDXQueryRequest(environ) + try: + logging.debug('request.args=%s', request.query) + result = self.cdxserver.load_cdx_query(request.query) + + # TODO: select response type by "output" parameter + response = PlainTextResponse(result, request.query.fields) + return response(environ, start_response) + except Exception as exc: + logging.error('load_cdx failed', exc_info=1) + # TODO: error response should be different for each response + # type + start_response('400 Error', [('Content-Type', 'text/plain')]) + return [str(exc)] + +def cdx_text_out(cdx, fields): + if not fields: + return str(cdx) + '\n' + else: + logging.info('cdx fields=%s', cdx.keys) + # TODO: this will results in an exception if fields contain + # non-existent field name. + return ' '.join(cdx[x] for x in fields) + '\n' + +class PlainTextResponse(BaseResponse): + def __init__(self, cdxitr, fields, status=200, content_type='text/plain'): + super(PlainTextResponse, self).__init__( + response=( + cdx.to_text(fields) for cdx in cdxitr + ), + status=status, content_type=content_type) + +# class JsonResponse(Response): +# pass +# class MementoResponse(Response): +# pass + +def create_app(config=None): logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s', level=logging.DEBUG) - if not paths: - if config: - paths = config - else: - paths = get_test_dir() + 'cdx/' - - cdxserver = create_cdx_server(paths, RULES_FILE) - - def application(env, start_response): - try: - params = extract_params_from_wsgi_env(env) - response = cdxserver.load_cdx(**params) - - start_response('200 OK', [('Content-Type', 'text/plain')]) - - except Exception as exc: - import traceback - err_details = traceback.format_exc(exc) - start_response('400 Error', [('Content-Type', 'text/plain')]) - response = [str(exc)] - print err_details - - return response - - return application + if not config: + index_paths = get_test_dir() + 'cdx/' + config = dict(index_paths=index_paths) + return WSGICDXServer(config, RULES_FILE) if __name__ == "__main__": - from wsgiref.simple_server import make_server + from optparse import OptionParser + from werkzeug.serving import run_simple - app = main() + opt = OptionParser('%prog [OPTIONS]') + opt.add_option('-p', '--port', type='int', default=None) - port = DEFAULT_PORT - if config: - port = config.get('port', DEFAULT_PORT) + options, args = opt.parse_args() - httpd = make_server('', port, app) + configdata = pkg_resources.resource_string(__name__, CONFIG_FILE) + config = yaml.load(configdata) - logging.debug('Starting CDX Server on port ' + str(port)) + port = options.port + if port is None: + port = (config and config.get('port')) or DEFAULT_PORT + app = create_app(config) + + logging.debug('Starting CDX Server on port %s', port) try: - httpd.serve_forever() - except KeyboardInterrupt: + run_simple('0.0.0.0', port, app, use_reloader=True, use_debugger=True) + except KeyboardInterrupt as ex: pass - logging.debug('Stopping CDX Server') else: - application = main() + # XXX pass production config + application = create_app() diff --git a/pywb/cdx/zipnum.py b/pywb/cdx/zipnum.py index 847c660f..1d0cb24f 100644 --- a/pywb/cdx/zipnum.py +++ b/pywb/cdx/zipnum.py @@ -110,21 +110,20 @@ class ZipNumCluster(CDXSource): def lookup_loc(self, part): return self.loc_map[part] - def load_cdx(self, params): + def load_cdx(self, query): self.reload_loc() reader = SeekableTextFileReader(self.summary) idx_iter = iter_range(reader, - params['key'], - params['end_key'], + query.key, + query.end_key, prev_size=1) - if params.get('showPagedIndex'): - params['proxyAll'] = True + if query.secondary_index_only: return idx_iter else: - blocks = self.idx_to_cdx(idx_iter, params) + blocks = self.idx_to_cdx(idx_iter, query) def gen_cdx(): for blk in blocks: @@ -133,7 +132,7 @@ class ZipNumCluster(CDXSource): return gen_cdx() - def idx_to_cdx(self, idx_iter, params): + def idx_to_cdx(self, idx_iter, query): blocks = None ranges = [] @@ -150,7 +149,7 @@ class ZipNumCluster(CDXSource): else: if blocks: - yield self.block_to_cdx_iter(blocks, ranges, params) + yield self.block_to_cdx_iter(blocks, ranges, query) blocks = ZipBlocks(idx['part'], idx['offset'], @@ -160,15 +159,15 @@ class ZipNumCluster(CDXSource): ranges = [blocks.length] if blocks: - yield self.block_to_cdx_iter(blocks, ranges, params) + yield self.block_to_cdx_iter(blocks, ranges, query) - def block_to_cdx_iter(self, blocks, ranges, params): + def block_to_cdx_iter(self, blocks, ranges, query): last_exc = None last_traceback = None for location in self.lookup_loc(blocks.part): try: - return self.load_blocks(location, blocks, ranges, params) + return self.load_blocks(location, blocks, ranges, query) except Exception as exc: last_exc = exc import sys @@ -179,7 +178,7 @@ class ZipNumCluster(CDXSource): else: raise Exception('No Locations Found for: ' + block.part) - def load_blocks(self, location, blocks, ranges, params): + def load_blocks(self, location, blocks, ranges, query): if (logging.getLogger().getEffectiveLevel() <= logging.DEBUG): msg = 'Loading {b.count} blocks from {loc}:{b.offset}+{b.length}' @@ -195,9 +194,9 @@ class ZipNumCluster(CDXSource): iter_ = itertools.chain(*itertools.imap(decompress_block, ranges)) # start bound - iter_ = linearsearch(iter_, params['key']) + iter_ = linearsearch(iter_, query.key) # end bound - end = params['end_key'] + end = query.end_key iter_ = itertools.takewhile(lambda line: line < end, iter_) return iter_ diff --git a/pywb/core/handlers.py b/pywb/core/handlers.py index d166e640..cbf2d71f 100644 --- a/pywb/core/handlers.py +++ b/pywb/core/handlers.py @@ -4,7 +4,7 @@ import mimetypes import time from pywb.rewrite.wburl import WbUrl -from pywb.cdx.cdxserver import extract_params_from_wsgi_env +from pywb.cdx.query import CDXQuery from wbrequestresponse import WbResponse from wbexceptions import WbException, NotFoundException from views import TextCapturesView @@ -82,7 +82,7 @@ class CDXHandler(BaseHandler): self.view = view if view else TextCapturesView() def __call__(self, wbrequest): - params = extract_params_from_wsgi_env(wbrequest.env) + params = CDXQuery.extract_params_from_wsgi_env(wbrequest.env) cdx_lines = self.index_reader.load_cdx(**params) return self.view.render_response(wbrequest, cdx_lines) diff --git a/pywb/core/indexreader.py b/pywb/core/indexreader.py index cea27a8f..a422d0b4 100644 --- a/pywb/core/indexreader.py +++ b/pywb/core/indexreader.py @@ -30,7 +30,7 @@ class IndexReader(object): params['allowFuzzy'] = True - cdxlines = self.load_cdx(url=wburl.url, output='raw', **params) + cdxlines = self.load_cdx(url=wburl.url, **params) return cdxlines diff --git a/setup.py b/setup.py index 1fe72fa7..4c2cad20 100755 --- a/setup.py +++ b/setup.py @@ -1,24 +1,48 @@ #!/usr/bin/env python # vim: set sw=4 et: -import setuptools +from setuptools import setup, find_packages import glob -setuptools.setup(name='pywb', - version='0.2', - url='https://github.com/ikreymer/pywb', - author='Ilya Kreymer', - author_email='ilya@archive.org', - long_description=open('README.md').read(), - license='GPL', - packages=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite','pywb.core','pywb.dispatch','pywb.bootstrap'], - provides=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite','pywb.core','pywb.dispatch','pywb.bootstrap'], - package_data={'pywb': ['ui/*', 'static/*', '*.yaml']}, - data_files = [('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')), - ('sample_archive/zipcdx/', glob.glob('sample_archive/zipcdx/*')), - ('sample_archive/warcs/', glob.glob('sample_archive/warcs/*')), - ('sample_archive/text_content/', glob.glob('sample_archive/text_content/*'))], - install_requires=['uwsgi', 'rfc3987', 'chardet', 'redis', 'jinja2', 'surt', 'pyyaml', 'WebTest','pytest'], -# tests_require=['WebTest', 'pytest'], - zip_safe=False) - +setup( + name='pywb', + version='0.2', + url='https://github.com/ikreymer/pywb', + author='Ilya Kreymer', + author_email='ilya@archive.org', + long_description=open('README.md').read(), + license='GPL', + packages=find_packages(), + provides=[ + 'pywb', + 'pywb.utils', + 'pywb.cdx', + 'pywb.warc', + 'pywb.rewrite', + 'pywb.core', + 'pywb.dispatch', + 'pywb.bootstrap' + ], + package_data={ + 'pywb': ['ui/*', 'static/*', '*.yaml'], + }, + data_files = [ + ('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')), + ('sample_archive/zipcdx/', glob.glob('sample_archive/zipcdx/*')), + ('sample_archive/warcs/', glob.glob('sample_archive/warcs/*')), + ('sample_archive/text_content/', glob.glob('sample_archive/text_content/*')), + ], + install_requires=[ + 'rfc3987', + 'chardet', + 'redis', + 'jinja2', + 'surt', + 'pyyaml', + 'WebTest', + 'pytest', + 'werkzeug>=0.9.4', + ], + # tests_require=['WebTest', 'pytest'], + zip_safe=False + ) diff --git a/test_config.yaml b/test_config.yaml index 8421aead..20e52933 100644 --- a/test_config.yaml +++ b/test_config.yaml @@ -92,10 +92,10 @@ enable_cdx_api: true # optional reporter callback func # if set, called with request and cdx object -reporter: !!python/object/new:tests.test_integration.PrintReporter [] +reporter: !!python/object/new:tests.fixture.PrintReporter [] # custom rules for domain specific matching #domain_specific_rules: rules.yaml #perms_checker: !!python/object/new:pywb.cdx.perms.AllowAllPerms [] -perms_checker: !!python/object/new:tests.test_integration.TestExclusionPerms [] +perms_checker: !!python/object/new:tests.fixture.TestExclusionPerms [] diff --git a/tests/fixture.py b/tests/fixture.py new file mode 100644 index 00000000..ff7c4307 --- /dev/null +++ b/tests/fixture.py @@ -0,0 +1,45 @@ +import os +import pytest + +import yaml + +from pywb.cdx.perms import AllowAllPerms + +@pytest.fixture +def testconfig(): + config = yaml.load(open('test_config.yaml')) + assert config + if 'index_paths' not in config: + # !!! assumes this module is in a sub-directory of project root. + config['index_paths'] = os.path.join( + os.path.dirname(os.path.realpath(__file__)), + '../sample_archive/cdx') + return config + +#================================================================ +# Reporter callback for replay view +class PrintReporter: + """Reporter callback for replay view. + """ + def __call__(self, wbrequest, cdx, response): + print wbrequest + print cdx + pass + +#================================================================ +class TestExclusionPerms(AllowAllPerms): + """ + Perm Checker fixture which can block one URL. + """ + # sample_archive has captures for this URLKEY + URLKEY_EXCLUDED = 'org,iana)/_img/bookmark_icon.ico' + + def allow_url_lookup(self, urlkey, url): + """ + Return true/false if url or urlkey (canonicalized url) + should be allowed + """ + if urlkey == self.URLKEY_EXCLUDED: + return False + + return super(TestExclusionPerms, self).allow_url_lookup(urlkey, url) diff --git a/tests/test_integration.py b/tests/test_integration.py index 5af34e34..b9b20e06 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -2,14 +2,17 @@ import webtest from pywb.bootstrap.pywb_init import pywb_config from pywb.bootstrap.wbapp import create_wb_app from pywb.cdx.cdxobject import CDXObject -from pywb.cdx.perms import AllowAllPerms + +from fixture import TestExclusionPerms class TestWb: TEST_CONFIG = 'test_config.yaml' def setup(self): #self.app = pywb.wbapp.create_wb_app(pywb.pywb_init.pywb_config()) - self.app = create_wb_app(pywb_config(self.TEST_CONFIG)) + # save it in self - useful for debugging + self.router = pywb_config(self.TEST_CONFIG) + self.app = create_wb_app(self.router) self.testapp = webtest.TestApp(self.app) def _assert_basic_html(self, resp): @@ -207,24 +210,3 @@ class TestWb: assert resp.status_int == 400 assert 'Invalid Url: http://?abc' in resp.body -#================================================================= -# Reporter callback for replay view -class PrintReporter: - def __call__(self, wbrequest, cdx, response): - print wbrequest - print cdx - -#================================================================= -class TestExclusionPerms(AllowAllPerms): - """ - Sample Perm Checker with hard-coded exclusion - """ - def allow_url_lookup(self, urlkey, url): - """ - Return true/false if url or urlkey (canonicalized url) - should be allowed - """ - if urlkey == 'org,iana)/_img/bookmark_icon.ico': - return False - - return super(TestExclusionPerms, self).allow_url_lookup(urlkey, url) diff --git a/tests/test_wsgi_cdxserver.py b/tests/test_wsgi_cdxserver.py new file mode 100644 index 00000000..8eee2484 --- /dev/null +++ b/tests/test_wsgi_cdxserver.py @@ -0,0 +1,189 @@ +import os +import re + +import pytest +from urllib import urlencode + +from werkzeug.test import Client +from werkzeug.wrappers import BaseResponse, Response + +import yaml + +from pywb.cdx.cdxobject import CDXObject +from pywb.cdx.wsgi_cdxserver import create_app + +from tests.fixture import testconfig + +@pytest.fixture +def client(testconfig): + app = create_app(testconfig) + return Client(app, Response) + +# ================================================================ + +def query(client, url, **params): + params['url'] = url + return client.get('/cdx?' + urlencode(params, doseq=1)) + +# ================================================================ + +def test_exact_url(client): + """ + basic exact match, no filters, etc. + """ + resp = query(client, 'http://www.iana.org/') + + assert resp.status_code == 200 + print resp.data + +def test_prefix_match(client): + """ + prefix match test + """ + resp = query(client, 'http://www.iana.org/', matchType='prefix') + + print resp.data.splitlines() + assert resp.status_code == 200 + + suburls = 0 + for l in resp.data.splitlines(): + fields = l.split(' ') + if len(fields[0]) > len('org,iana)/'): + suburls += 1 + assert suburls > 0 + +def test_filters(client): + """ + filter cdxes by mimetype and filename field, exact match. + """ + resp = query(client, 'http://www.iana.org/_css/2013.1/screen.css', + filter=('mimetype:warc/revisit', 'filename:dupes.warc.gz')) + + assert resp.status_code == 200 + assert resp.mimetype == 'text/plain' + + for l in resp.data.splitlines(): + fields = l.split(' ') + assert fields[0] == 'org,iana)/_css/2013.1/screen.css' + assert fields[3] == 'warc/revisit' + assert fields[10] == 'dupes.warc.gz' + +def test_limit(client): + resp = query(client, 'http://www.iana.org/_css/2013.1/screen.css', + limit='1') + + assert resp.status_code == 200 + assert resp.mimetype == 'text/plain' + + cdxes = resp.data.splitlines() + assert len(cdxes) == 1 + fields = cdxes[0].split(' ') + assert fields[0] == 'org,iana)/_css/2013.1/screen.css' + assert fields[1] == '20140126200625' + assert fields[3] == 'text/css' + + resp = query(client, 'http://www.iana.org/_css/2013.1/screen.css', + limit='1', reverse='1') + + assert resp.status_code == 200 + assert resp.mimetype == 'text/plain' + + cdxes = resp.data.splitlines() + assert len(cdxes) == 1 + fields = cdxes[0].split(' ') + assert fields[0] == 'org,iana)/_css/2013.1/screen.css' + assert fields[1] == '20140127171239' + assert fields[3] == 'warc/revisit' + +def test_fields(client): + """ + retrieve subset of fields with ``fields`` parameter. + """ + resp = query(client, 'http://www.iana.org/_css/2013.1/print.css', + fields='urlkey,timestamp,statuscode') + + assert resp.status_code == 200 + + cdxes = resp.data.splitlines() + + for cdx in cdxes: + fields = cdx.split(' ') + assert len(fields) == 3 + assert fields[0] == 'org,iana)/_css/2013.1/print.css' + assert re.match(r'\d{14}$', fields[1]) + assert re.match(r'\d{3}|-', fields[2]) + +def test_fields_undefined(client): + """ + server shall respond with Bad Request (TODO: with proper explanation), + when ``fields`` parameter contains undefined name(s). + """ + resp = query(client, 'http://www.iana.org/_css/2013.1/print.css', + fields='urlkey,nosuchfield') + + resp.status_code == 400 + +def test_resolveRevisits(client): + """ + with ``resolveRevisits=true``, server adds three fields pointing to + the *original* capture. + """ + resp = query(client, 'http://www.iana.org/_css/2013.1/print.css', + resolveRevisits='true' + ) + assert resp.status_code == 200 + assert resp.mimetype == 'text/plain' + + cdxes = resp.data.splitlines() + originals = {} + for cdx in cdxes: + fields = cdx.split(' ') + assert len(fields) == 14 + (key, ts, url, mt, st, sha, _, _, size, offset, fn, + orig_size, orig_offset, orig_fn) = fields + # orig_* fields are either all '-' or (int, int, filename) + # check if orig_* fields are equals to corresponding fields + # for the original capture. + if orig_size == '-': + assert orig_offset == '-' and orig_fn == '-' + originals[sha] = (int(size), int(offset), fn) + else: + orig = originals.get(sha) + assert orig == (int(orig_size), int(orig_offset), orig_fn) + +def test_resolveRevisits_orig_fields(client): + """ + when resolveRevisits=true, extra three fields are named + ``orig.length``, ``orig.offset`` and ``orig.filename``, respectively. + it is possible to filter fields by these names. + """ + resp = query(client, 'http://www.iana.org/_css/2013.1/print.css', + resolveRevisits='1', + fields='urlkey,orig.length,orig.offset,orig.filename' + ) + assert resp.status_code == 200 + assert resp.mimetype == 'text/plain' + + cdxes = resp.data.splitlines() + for cdx in cdxes: + fields = cdx.split(' ') + assert len(fields) == 4 + key, orig_len, orig_offset, orig_fn = fields + assert (orig_len == '-' and orig_offset == '-' and orig_fn == '-' or + (int(orig_len), int(orig_offset), orig_fn)) + +def test_collapseTime_resolveRevisits_reverse(client): + resp = query(client, 'http://www.iana.org/_css/2013.1/print.css', + collapseTime='11', + resolveRevisits='true', + reverse='true' + ) + + cdxes = [CDXObject(l) for l in resp.data.splitlines()] + + assert len(cdxes) == 3 + + # timestamp is in descending order + for i in range(len(cdxes) - 1): + assert cdxes[i]['timestamp'] >= cdxes[i + 1]['timestamp'] +