mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
update pkg-reorg with changes from master, including
CDXQuery configuration
This commit is contained in:
commit
19f86305bf
@ -1,12 +1,13 @@
|
||||
import yaml
|
||||
import re
|
||||
import logging
|
||||
import pkgutil
|
||||
import pkg_resources
|
||||
|
||||
from pywb.utils.dsrules import BaseRule, RuleSet
|
||||
|
||||
from pywb.utils.canonicalize import unsurt, UrlCanonicalizer
|
||||
|
||||
from query import CDXQuery
|
||||
|
||||
#=================================================================
|
||||
def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered):
|
||||
@ -70,13 +71,13 @@ class FuzzyQuery:
|
||||
def __init__(self, rules):
|
||||
self.rules = rules
|
||||
|
||||
def __call__(self, params):
|
||||
def __call__(self, query):
|
||||
matched_rule = None
|
||||
|
||||
urlkey = params['key']
|
||||
url = params['url']
|
||||
filter_ = params.get('filter', [])
|
||||
output = params.get('output')
|
||||
urlkey = query.key
|
||||
url = query.url
|
||||
filter_ = query.filters
|
||||
output = query.output
|
||||
|
||||
for rule in self.rules.iter_matching(urlkey):
|
||||
m = rule.regex.search(urlkey)
|
||||
@ -102,7 +103,7 @@ class FuzzyQuery:
|
||||
'filter': filter_,
|
||||
'output': output}
|
||||
|
||||
return params
|
||||
return CDXQuery(**params)
|
||||
|
||||
|
||||
#=================================================================
|
||||
|
@ -1,6 +1,9 @@
|
||||
from collections import OrderedDict
|
||||
import itertools
|
||||
|
||||
from urllib import urlencode
|
||||
from urlparse import parse_qs
|
||||
|
||||
|
||||
#=================================================================
|
||||
class CDXException(Exception):
|
||||
@ -71,12 +74,25 @@ class CDXObject(OrderedDict):
|
||||
# force regen on next __str__ call
|
||||
self.cdxline = None
|
||||
|
||||
def is_revisit(self):
    """
    Tell whether this record is a revisit record.

    A record counts as a revisit when its ``mimetype`` field is
    ``warc/revisit`` or when its ``filename`` field is ``-``.
    """
    if self['mimetype'] == 'warc/revisit':
        return True

    return self['filename'] == '-'
|
||||
|
||||
def to_text(self, fields=None):
    """
    return plaintext CDX record (includes newline).

    :param fields: list of field names to output, in that order.
        When None, the whole record is rendered via ``str(self)``.
    """
    if fields is not None:
        selected = [self[name] for name in fields]
        return ' '.join(selected) + '\n'

    return str(self) + '\n'
|
||||
|
||||
def __str__(self):
|
||||
if self.cdxline:
|
||||
return self.cdxline
|
||||
|
||||
li = itertools.imap(lambda (n, val): val, self.items())
|
||||
return ' '.join(li)
|
||||
return ' '.join(val for n, val in self.iteritems())
|
||||
|
||||
|
||||
#=================================================================
|
||||
@ -106,5 +122,12 @@ class IDXObject(OrderedDict):
|
||||
|
||||
self.idxline = idxline
|
||||
|
||||
def to_text(self, fields=None):
    """
    return plaintext IDX record (including newline).

    :param fields: list of field names to output (currently ignored)
    """
    line = str(self)
    return line + '\n'
|
||||
|
||||
def __str__(self):
|
||||
return self.idxline
|
||||
|
@ -1,4 +1,5 @@
|
||||
from cdxobject import CDXObject, IDXObject, AccessException
|
||||
from query import CDXQuery
|
||||
from pywb.utils.timeutils import timestamp_to_sec
|
||||
|
||||
import bisect
|
||||
@ -10,32 +11,44 @@ from collections import deque
|
||||
|
||||
|
||||
#=================================================================
|
||||
def cdx_load(sources, params, perms_checker=None):
|
||||
def cdx_load(sources, query, perms_checker=None, process=True):
|
||||
"""
|
||||
merge text CDX lines from sources, return an iterator for
|
||||
filtered and access-checked sequence of CDX objects.
|
||||
|
||||
:param sources: iterable for text CDX sources.
|
||||
:param perms_checker: access check filter object implementing
|
||||
allow_url_lookup(key, url), allow_capture(cdxobj) and
|
||||
filter_fields(cdxobj) methods.
|
||||
:param process: bool, perform processing sorting/filtering/grouping ops
|
||||
"""
|
||||
cdx_iter = load_cdx_streams(sources, query)
|
||||
cdx_iter = make_obj_iter(cdx_iter, query)
|
||||
|
||||
if process and not query.secondary_index_only:
|
||||
cdx_iter = process_cdx(cdx_iter, query)
|
||||
|
||||
if perms_checker:
|
||||
cdx_iter = cdx_load_with_perms(sources, params, perms_checker)
|
||||
else:
|
||||
cdx_iter = cdx_load_and_filter(sources, params)
|
||||
|
||||
# output raw cdx objects
|
||||
if params.get('output') == 'raw':
|
||||
return cdx_iter
|
||||
|
||||
def write_cdx(fields):
|
||||
for cdx in cdx_iter:
|
||||
yield cdx_text_out(cdx, fields) + '\n'
|
||||
|
||||
return write_cdx(params.get('fields'))
|
||||
cdx_iter = restrict_cdx(cdx_iter, query, perms_checker)
|
||||
|
||||
return cdx_iter
|
||||
|
||||
#=================================================================
|
||||
def cdx_load_with_perms(sources, params, perms_checker):
|
||||
if not perms_checker.allow_url_lookup(params['key'], params['url']):
|
||||
if params.get('matchType', 'exact') == 'exact':
|
||||
def restrict_cdx(cdx_iter, query, perms_checker):
|
||||
"""
|
||||
filter out those cdx records that user doesn't have access to,
|
||||
by consulting :param perms_checker:.
|
||||
:param cdx_iter: cdx record source iterable
|
||||
:param query: request parameters (CDXQuery)
|
||||
:param perms_checker: object implementing permission checker
|
||||
"""
|
||||
if not perms_checker.allow_url_lookup(query.key, query.url):
|
||||
if query.is_exact:
|
||||
raise AccessException('Excluded')
|
||||
|
||||
cdx_iter = cdx_load_and_filter(sources, params)
|
||||
|
||||
for cdx in cdx_iter:
|
||||
# TODO: we could let filter_fields handle this case by accepting
|
||||
# None as a return value.
|
||||
if not perms_checker.allow_capture(cdx):
|
||||
continue
|
||||
|
||||
@ -43,45 +56,27 @@ def cdx_load_with_perms(sources, params, perms_checker):
|
||||
|
||||
yield cdx
|
||||
|
||||
|
||||
#=================================================================
|
||||
def cdx_text_out(cdx, fields):
|
||||
if not fields:
|
||||
return str(cdx)
|
||||
else:
|
||||
return ' '.join(map(lambda x: cdx[x], fields.split(',')))
|
||||
|
||||
|
||||
#=================================================================
|
||||
def cdx_load_and_filter(sources, params):
|
||||
cdx_iter = load_cdx_streams(sources, params)
|
||||
|
||||
cdx_iter = make_obj_iter(cdx_iter, params)
|
||||
|
||||
if params.get('proxyAll'):
|
||||
return cdx_iter
|
||||
|
||||
resolve_revisits = params.get('resolveRevisits', False)
|
||||
if resolve_revisits:
|
||||
def process_cdx(cdx_iter, query):
|
||||
if query.resolve_revisits:
|
||||
cdx_iter = cdx_resolve_revisits(cdx_iter)
|
||||
|
||||
filters = params.get('filter', None)
|
||||
filters = query.filters
|
||||
if filters:
|
||||
cdx_iter = cdx_filter(cdx_iter, filters)
|
||||
|
||||
collapse_time = params.get('collapseTime', None)
|
||||
collapse_time = query.collapse_time
|
||||
if collapse_time:
|
||||
cdx_iter = cdx_collapse_time_status(cdx_iter, collapse_time)
|
||||
|
||||
limit = int(params.get('limit', 1000000))
|
||||
limit = query.limit
|
||||
|
||||
reverse = params.get('reverse', False) or params.get('sort') == 'reverse'
|
||||
if reverse:
|
||||
if query.reverse:
|
||||
cdx_iter = cdx_reverse(cdx_iter, limit)
|
||||
|
||||
closest_to = params.get('closest', None)
|
||||
if closest_to:
|
||||
cdx_iter = cdx_sort_closest(closest_to, cdx_iter, limit)
|
||||
closest = query.closest
|
||||
if closest:
|
||||
cdx_iter = cdx_sort_closest(closest, cdx_iter, limit)
|
||||
|
||||
if limit:
|
||||
cdx_iter = cdx_limit(cdx_iter, limit)
|
||||
@ -91,26 +86,28 @@ def cdx_load_and_filter(sources, params):
|
||||
|
||||
#=================================================================
|
||||
# load and source merge cdx streams
|
||||
def load_cdx_streams(sources, params):
|
||||
def load_cdx_streams(sources, query):
|
||||
# Optimize: no need to merge if just one input
|
||||
if len(sources) == 1:
|
||||
return sources[0].load_cdx(params)
|
||||
cdx_iter = sources[0].load_cdx(query)
|
||||
else:
|
||||
source_iters = map(lambda src: src.load_cdx(query), sources)
|
||||
cdx_iter = merge(*(source_iters))
|
||||
|
||||
source_iters = map(lambda src: src.load_cdx(params), sources)
|
||||
merged_stream = merge(*(source_iters))
|
||||
return merged_stream
|
||||
for cdx in cdx_iter:
|
||||
yield cdx
|
||||
|
||||
|
||||
#=================================================================
|
||||
# convert text cdx stream to CDXObject/IDXObject
|
||||
def make_obj_iter(text_iter, params):
|
||||
def make_obj_iter(text_iter, query):
|
||||
# already converted
|
||||
if params.get('showPagedIndex'):
|
||||
if query.secondary_index_only:
|
||||
cls = IDXObject
|
||||
else:
|
||||
cls = CDXObject
|
||||
|
||||
return itertools.imap(lambda line: cls(line), text_iter)
|
||||
return (cls(line) for line in text_iter)
|
||||
|
||||
|
||||
#=================================================================
|
||||
@ -161,6 +158,7 @@ def cdx_filter(cdx_iter, filter_strings):
|
||||
if string.startswith('='):
|
||||
string = string[1:]
|
||||
self.compare_func = self.exact
|
||||
# contains match
|
||||
elif string.startswith('~'):
|
||||
string = string[1:]
|
||||
self.compare_func = self.contains
|
||||
@ -257,8 +255,8 @@ def cdx_resolve_revisits(cdx_iter):
|
||||
originals = {}
|
||||
|
||||
for cdx in cdx_iter:
|
||||
is_revisit = ((cdx['mimetype'] == 'warc/revisit') or
|
||||
(cdx['filename'] == '-'))
|
||||
|
||||
is_revisit = cdx.is_revisit()
|
||||
|
||||
digest = cdx['digest']
|
||||
|
||||
|
@ -4,6 +4,7 @@ from cdxops import cdx_load
|
||||
from cdxsource import CDXSource, CDXFile, RemoteCDXSource, RedisCDXSource
|
||||
from zipnum import ZipNumCluster
|
||||
from cdxobject import CDXObject, CaptureNotFoundException, CDXException
|
||||
from query import CDXQuery
|
||||
from cdxdomainspecific import load_domain_specific_cdx_rules
|
||||
|
||||
from pywb.utils.loaders import is_http
|
||||
@ -36,7 +37,7 @@ class BaseCDXServer(object):
|
||||
# set perms checker, if any
|
||||
self.perms_checker = kwargs.get('perms_checker')
|
||||
|
||||
def _check_cdx_iter(self, cdx_iter, params):
|
||||
def _check_cdx_iter(self, cdx_iter, query):
|
||||
""" Check cdx iter semantics
|
||||
If iter is empty (no matches), check if fuzzy matching
|
||||
is allowed, and try it -- otherwise,
|
||||
@ -48,21 +49,23 @@ class BaseCDXServer(object):
|
||||
if cdx_iter:
|
||||
return cdx_iter
|
||||
|
||||
url = params['url']
|
||||
|
||||
# check if fuzzy is allowed and ensure that its an
|
||||
# exact match
|
||||
if (self.fuzzy_query and params.get('allowFuzzy') and
|
||||
params.get('matchType', 'exact') == 'exact'):
|
||||
if (self.fuzzy_query and
|
||||
query.allow_fuzzy and
|
||||
query.is_exact):
|
||||
|
||||
fuzzy_params = self.fuzzy_query(params)
|
||||
if fuzzy_params:
|
||||
return self.load_cdx(**fuzzy_params)
|
||||
fuzzy_query_params = self.fuzzy_query(query)
|
||||
if fuzzy_query_params:
|
||||
return self.load_cdx_query(fuzzy_query_params)
|
||||
|
||||
msg = 'No Captures found for: ' + url
|
||||
msg = 'No Captures found for: ' + query.url
|
||||
raise CaptureNotFoundException(msg)
|
||||
|
||||
def load_cdx(self, **params):
|
||||
return self.load_cdx_query(CDXQuery(**params))
|
||||
|
||||
def load_cdx_query(self, query):
|
||||
raise NotImplementedError('Implement in subclass')
|
||||
|
||||
@staticmethod
|
||||
@ -84,28 +87,77 @@ class CDXServer(BaseCDXServer):
|
||||
|
||||
def __init__(self, paths, **kwargs):
|
||||
super(CDXServer, self).__init__(**kwargs)
|
||||
self.sources = create_cdx_sources(paths, kwargs.get('config'))
|
||||
# TODO: we could save config in member, so that other
|
||||
# methods can use it. it's bad for add_cdx_source to take
|
||||
# config argument.
|
||||
self._create_cdx_sources(paths, kwargs.get('config'))
|
||||
|
||||
def load_cdx(self, **params):
|
||||
# if key not set, assume 'url' is set and needs canonicalization
|
||||
if not params.get('key'):
|
||||
try:
|
||||
url = params['url']
|
||||
except KeyError:
|
||||
msg = 'A url= param must be specified to query the cdx server'
|
||||
raise CDXException(msg)
|
||||
def load_cdx_query(self, query):
|
||||
url = query.url
|
||||
key, end_key = calc_search_range(url=url,
|
||||
match_type=query.match_type,
|
||||
url_canon=self.url_canon)
|
||||
query.set_key(key, end_key)
|
||||
|
||||
match_type = params.get('matchType', 'exact')
|
||||
cdx_iter = cdx_load(self.sources,
|
||||
query,
|
||||
perms_checker=self.perms_checker)
|
||||
|
||||
key, end_key = calc_search_range(url=url,
|
||||
match_type=match_type,
|
||||
url_canon=self.url_canon)
|
||||
params['key'] = key
|
||||
params['end_key'] = end_key
|
||||
return self._check_cdx_iter(cdx_iter, query)
|
||||
|
||||
cdx_iter = cdx_load(self.sources, params, self.perms_checker)
|
||||
def _create_cdx_sources(self, paths, config):
|
||||
"""
|
||||
build CDXSource instances for each of path in :param paths:.
|
||||
:param paths: list of sources or single source.
|
||||
each source may be either string or CDXSource instance. value
|
||||
of any other types will be silently ignored.
|
||||
:param config: config object passed to :method:`add_cdx_source`.
|
||||
"""
|
||||
self.sources = []
|
||||
|
||||
return self._check_cdx_iter(cdx_iter, params)
|
||||
if paths is not None:
|
||||
if not isinstance(paths, (list, tuple)):
|
||||
paths = [paths]
|
||||
|
||||
for path in paths:
|
||||
self.add_cdx_source(path, config)
|
||||
|
||||
if len(self.sources) == 0:
|
||||
logging.warn('No CDX Sources configured from paths=%s', paths)
|
||||
|
||||
def _add_cdx_source(self, source):
    """Append ``source`` to ``self.sources``; a None source is ignored."""
    if source is not None:
        logging.debug('Adding CDX Source: %s', source)
        self.sources.append(source)
|
||||
|
||||
def add_cdx_source(self, source, config):
    """
    Register ``source`` with this server.

    A CDXSource instance is added as-is.  A str naming a directory has
    each of its entries passed to ``_create_cdx_source``; any other str
    is passed to ``_create_cdx_source`` itself.  None and values of any
    other type are ignored.

    :param source: CDXSource instance or str.
    :param config: passed through to ``_create_cdx_source``.
    """
    if source is None:
        return

    if isinstance(source, CDXSource):
        self._add_cdx_source(source)
        return

    if not isinstance(source, str):
        return

    if os.path.isdir(source):
        candidates = [os.path.join(source, fn) for fn in os.listdir(source)]
    else:
        candidates = [source]

    for candidate in candidates:
        self._add_cdx_source(self._create_cdx_source(candidate, config))
|
||||
|
||||
def _create_cdx_source(self, filename, config):
    """
    Build the cdx source object matching ``filename``.

    The checks run in this order: http(s) location, ``redis://`` prefix,
    ``.cdx`` suffix, ``.summary`` / ``.idx`` suffix.

    :param config: handed to the redis and zipnum sources only.
    :return: the new source, or None (after logging a warning) when
        ``filename`` matches none of the above.
    """
    if is_http(filename):
        source = RemoteCDXSource(filename)
    elif filename.startswith('redis://'):
        source = RedisCDXSource(filename, config)
    elif filename.endswith('.cdx'):
        source = CDXFile(filename)
    elif filename.endswith(('.summary', '.idx')):
        source = ZipNumCluster(filename, config)
    else:
        logging.warn('skipping unrecognized URI:%s', filename)
        source = None

    return source
|
||||
|
||||
def __str__(self):
|
||||
return 'CDX server serving from ' + str(self.sources)
|
||||
@ -123,20 +175,14 @@ class RemoteCDXServer(BaseCDXServer):
|
||||
|
||||
if isinstance(source, RemoteCDXSource):
|
||||
self.source = source
|
||||
elif (isinstance(source, str) and
|
||||
any(source.startswith(x) for x in ['http://', 'https://'])):
|
||||
self.source = RemoteCDXSource(source)
|
||||
elif (isinstance(source, str) and is_http(source)):
|
||||
self.source = RemoteCDXSource(source, remote_processing=True)
|
||||
else:
|
||||
raise Exception('Invalid remote cdx source: ' + str(source))
|
||||
|
||||
def load_cdx(self, **params):
|
||||
remote_iter = self.source.load_cdx(params)
|
||||
|
||||
# if need raw, convert to raw format here
|
||||
if params.get('output') == 'raw':
|
||||
remote_iter = (CDXObject(cdx) for cdx in remote_iter)
|
||||
|
||||
return self._check_cdx_iter(remote_iter, params)
|
||||
def load_cdx_query(self, query):
|
||||
remote_iter = cdx_load([self.source], query, process=False)
|
||||
return self._check_cdx_iter(remote_iter, query)
|
||||
|
||||
def __str__(self):
    """Describe this server by its single remote source."""
    # the remote source is stored in ``self.source`` (singular);
    # reading ``self.sources[0]`` here would raise AttributeError
    return 'Remote CDX server serving from ' + str(self.source)
|
||||
@ -169,74 +215,3 @@ def create_cdx_server(config, ds_rules_file=None):
|
||||
perms_checker=perms_checker)
|
||||
|
||||
|
||||
#=================================================================
|
||||
def create_cdx_sources(paths, config=None):
|
||||
sources = []
|
||||
|
||||
if not isinstance(paths, list):
|
||||
paths = [paths]
|
||||
|
||||
for path in paths:
|
||||
if isinstance(path, CDXSource):
|
||||
add_cdx_source(sources, path, config)
|
||||
elif isinstance(path, str):
|
||||
if os.path.isdir(path):
|
||||
for file in os.listdir(path):
|
||||
add_cdx_source(sources, path + file, config)
|
||||
else:
|
||||
add_cdx_source(sources, path, config)
|
||||
|
||||
if len(sources) == 0:
|
||||
logging.exception('No CDX Sources Found from: ' + str(sources))
|
||||
|
||||
return sources
|
||||
|
||||
|
||||
#=================================================================
|
||||
def add_cdx_source(sources, source, config):
|
||||
if not isinstance(source, CDXSource):
|
||||
source = create_cdx_source(source, config)
|
||||
if not source:
|
||||
return
|
||||
|
||||
logging.debug('Adding CDX Source: ' + str(source))
|
||||
sources.append(source)
|
||||
|
||||
|
||||
#=================================================================
|
||||
def create_cdx_source(filename, config):
|
||||
if is_http(filename):
|
||||
return RemoteCDXSource(filename)
|
||||
|
||||
if filename.startswith('redis://'):
|
||||
return RedisCDXSource(filename, config)
|
||||
|
||||
if filename.endswith('.cdx'):
|
||||
return CDXFile(filename)
|
||||
|
||||
if filename.endswith(('.summary', '.idx')):
|
||||
return ZipNumCluster(filename, config)
|
||||
|
||||
return None
|
||||
|
||||
|
||||
#=================================================================
|
||||
def extract_params_from_wsgi_env(env):
|
||||
""" utility function to extract params from the query
|
||||
string of a WSGI environment dictionary
|
||||
"""
|
||||
# use url= param to get actual url
|
||||
params = urlparse.parse_qs(env['QUERY_STRING'])
|
||||
|
||||
if not 'output' in params:
|
||||
params['output'] = 'text'
|
||||
|
||||
# parse_qs produces arrays for single values
|
||||
# cdx processing expects singleton params for all params,
|
||||
# except filters, so convert here
|
||||
# use first value of the list
|
||||
for name, val in params.iteritems():
|
||||
if name != 'filter':
|
||||
params[name] = val[0]
|
||||
|
||||
return params
|
||||
|
@ -2,6 +2,7 @@ from pywb.utils.binsearch import iter_range
|
||||
from pywb.utils.loaders import SeekableTextFileReader
|
||||
|
||||
from cdxobject import AccessException
|
||||
from query import CDXQuery
|
||||
|
||||
import urllib
|
||||
import urllib2
|
||||
@ -12,7 +13,7 @@ class CDXSource(object):
|
||||
"""
|
||||
Represents any cdx index source
|
||||
"""
|
||||
def load_cdx(self, params):
|
||||
def load_cdx(self, query):
|
||||
raise NotImplementedError('Implement in subclass')
|
||||
|
||||
|
||||
@ -24,9 +25,9 @@ class CDXFile(CDXSource):
|
||||
def __init__(self, filename):
|
||||
self.filename = filename
|
||||
|
||||
def load_cdx(self, params):
|
||||
def load_cdx(self, query):
|
||||
source = SeekableTextFileReader(self.filename)
|
||||
return iter_range(source, params.get('key'), params.get('end_key'))
|
||||
return iter_range(source, query.key, query.end_key)
|
||||
|
||||
def __str__(self):
|
||||
return 'CDX File - ' + self.filename
|
||||
@ -40,25 +41,20 @@ class RemoteCDXSource(CDXSource):
|
||||
Only url and match type params are proxied at this time,
|
||||
the stream is passed through all other filters locally.
|
||||
"""
|
||||
def __init__(self, filename, cookie=None, proxy_all=True):
|
||||
def __init__(self, filename, cookie=None, remote_processing=False):
|
||||
self.remote_url = filename
|
||||
self.cookie = cookie
|
||||
self.proxy_all = proxy_all
|
||||
self.remote_processing = remote_processing
|
||||
|
||||
def load_cdx(self, proxy_params):
|
||||
if self.proxy_all:
|
||||
params = proxy_params
|
||||
params['proxyAll'] = True
|
||||
def load_cdx(self, query):
|
||||
if self.remote_processing:
|
||||
remote_query = query
|
||||
else:
|
||||
# Only send url and matchType params to remote
|
||||
params = {}
|
||||
params['url'] = proxy_params['url']
|
||||
match_type = proxy_params.get('matchType')
|
||||
remote_query = CDXQuery(url=query.url,
|
||||
match_type=query.match_type)
|
||||
|
||||
if match_type:
|
||||
proxy_params['matchType'] = match_type
|
||||
|
||||
urlparams = urllib.urlencode(params, True)
|
||||
urlparams = remote_query.urlencode()
|
||||
|
||||
try:
|
||||
request = urllib2.Request(self.remote_url, urlparams)
|
||||
@ -97,14 +93,14 @@ class RedisCDXSource(CDXSource):
|
||||
self.key_prefix = config.get('redis_key_prefix', self.key_prefix)
|
||||
|
||||
|
||||
def load_cdx(self, params):
|
||||
def load_cdx(self, query):
|
||||
"""
|
||||
Load cdx from redis cache, from an ordered list
|
||||
|
||||
Currently, there is no support for range queries
|
||||
Only 'exact' matchType is supported
|
||||
"""
|
||||
key = params['key']
|
||||
key = query.key
|
||||
|
||||
# ensure only url/surt is part of key
|
||||
key = key.split(' ')[0]
|
||||
|
119
pywb/cdx/query.py
Normal file
119
pywb/cdx/query.py
Normal file
@ -0,0 +1,119 @@
|
||||
from urllib import urlencode
|
||||
from urlparse import parse_qs
|
||||
|
||||
|
||||
#=================================================================
|
||||
class CDXQuery(object):
    """
    Wrapper around the raw cdx query parameters.

    The parameters are stored unchanged in ``self.params``; the
    properties below expose them under python-style names and apply
    defaults and type conversions.
    """
    def __init__(self, **kwargs):
        self.params = kwargs

    @property
    def key(self):
        return self.params['key']

    @property
    def end_key(self):
        return self.params['end_key']

    def set_key(self, key, end_key):
        self.params['key'] = key
        self.params['end_key'] = end_key

    @property
    def url(self):
        """
        The ``url`` param.

        :raises CDXException: if no ``url`` param was given.
        """
        try:
            return self.params['url']
        except KeyError:
            # imported locally: this module has no module-level import
            # of CDXException, so the raise below would otherwise fail
            # with a NameError
            from cdxobject import CDXException
            msg = 'A url= param must be specified to query the cdx server'
            raise CDXException(msg)

    @property
    def match_type(self):
        return self.params.get('matchType', 'exact')

    @property
    def is_exact(self):
        return self.match_type == 'exact'

    @property
    def allow_fuzzy(self):
        return self._get_bool('allowFuzzy')

    @property
    def output(self):
        return self.params.get('output', 'text')

    @property
    def limit(self):
        return int(self.params.get('limit', 100000))

    @property
    def collapse_time(self):
        return self.params.get('collapseTime')

    @property
    def resolve_revisits(self):
        return self._get_bool('resolveRevisits')

    @property
    def filters(self):
        return self.params.get('filter', [])

    @property
    def fields(self):
        """List of field names from the comma-separated ``fields``
        param, or None when the param is missing or empty."""
        v = self.params.get('fields')
        return v.split(',') if v else None

    @property
    def closest(self):
        # sort=closest is not required
        return self.params.get('closest')

    @property
    def reverse(self):
        # sort=reverse overrides reverse=0
        return (self._get_bool('reverse') or
                self.params.get('sort') == 'reverse')

    @property
    def secondary_index_only(self):
        return self._get_bool('showPagedIndex')

    def _get_bool(self, name, def_val=False):
        """
        Interpret param ``name`` as a bool.

        A missing or empty value yields ``def_val``.  A value that
        converts with ``int()`` is true when non-zero; any other value
        is true only if it equals 'true' ignoring case.
        """
        v = self.params.get(name)
        if not v:
            return bool(def_val)

        try:
            return bool(int(v))
        except ValueError:
            return v.lower() == 'true'

    def urlencode(self):
        """Return the params as a url-encoded query string."""
        return urlencode(self.params, True)

    @staticmethod
    def from_wsgi_env(env):
        """Create a CDXQuery from a WSGI environment dictionary."""
        return CDXQuery(**CDXQuery.extract_params_from_wsgi_env(env))

    @staticmethod
    def extract_params_from_wsgi_env(env):
        """ utility function to extract params from the query
        string of a WSGI environment dictionary

        :return: dict of params; every value is a single string except
            'filter', which stays a list. 'output' defaults to 'text'.
        """
        # QUERY_STRING may be absent from a WSGI environ; treat that
        # like an empty query string
        params = parse_qs(env.get('QUERY_STRING', ''))

        # parse_qs produces arrays for single values
        # cdx processing expects singleton params for all params,
        # except filters, so convert here
        # use first value of the list
        for name, val in list(params.items()):
            if name != 'filter':
                params[name] = val[0]

        # apply the default only after the conversion above, so that the
        # plain string 'text' is not itself reduced to its first character
        params.setdefault('output', 'text')

        return params
|
@ -142,6 +142,8 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex
|
||||
('filename', 'dupes.warc.gz')]
|
||||
|
||||
# NOTE: external dependency -- need self-contained test TODO
|
||||
|
||||
# Load remote query but filter locally
|
||||
>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2')
|
||||
>>> pprint.pprint(x.next().items())
|
||||
[('urlkey', 'com,example)/'),
|
||||
@ -152,14 +154,24 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex
|
||||
('digest', 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA'),
|
||||
('length', '1792')]
|
||||
|
||||
# No local filtering/processing of cdx, simply return result from remote server
|
||||
>>> x = RemoteCDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2')
|
||||
>>> pprint.pprint(x.next().items())
|
||||
[('urlkey', 'com,example)/'),
|
||||
('timestamp', '20020120142510'),
|
||||
('original', 'http://example.com:80/'),
|
||||
('mimetype', 'text/html'),
|
||||
('statuscode', '200'),
|
||||
('digest', 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA'),
|
||||
('length', '1792')]
|
||||
|
||||
>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'facebook.com', output = 'raw', limit = '2')
|
||||
>>> x = RemoteCDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'facebook.com', output = 'raw', limit = '2')
|
||||
Traceback (most recent call last):
|
||||
AccessException: Blocked By Robots
|
||||
"""
|
||||
|
||||
#=================================================================
|
||||
from pywb.cdx.cdxserver import CDXServer
|
||||
from pywb.cdx.cdxserver import CDXServer, RemoteCDXServer
|
||||
import os
|
||||
import sys
|
||||
import pprint
|
||||
@ -167,22 +179,42 @@ import pprint
|
||||
from pywb import get_test_dir
|
||||
#test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample_data/'
|
||||
test_cdx_dir = get_test_dir() + 'cdx/'
|
||||
from pywb.cdx.cdxobject import AccessException
|
||||
|
||||
from tests.fixture import testconfig, TestExclusionPerms
|
||||
|
||||
import pytest
|
||||
|
||||
def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams):
|
||||
kwparams['url'] = url
|
||||
kwparams['output'] = 'text'
|
||||
fields = kwparams.get('fields')
|
||||
if fields:
|
||||
fields = fields.split(',')
|
||||
|
||||
server = CDXServer(sources)
|
||||
results = server.load_cdx(**kwparams)
|
||||
|
||||
for x in results:
|
||||
x = x.replace('\t', ' ')
|
||||
sys.stdout.write(x)
|
||||
l = x.to_text(fields).replace('\t', ' ')
|
||||
sys.stdout.write(l)
|
||||
|
||||
#================================================================
|
||||
|
||||
def test_excluded(testconfig):
|
||||
testconfig['perms_checker'] = TestExclusionPerms()
|
||||
sources = testconfig.get('index_paths')
|
||||
print sources
|
||||
server = CDXServer(sources, perms_checker=testconfig['perms_checker'])
|
||||
assert isinstance(server, CDXServer)
|
||||
assert server.perms_checker
|
||||
|
||||
url = 'http://www.iana.org/_img/bookmark_icon.ico'
|
||||
key = 'org,iana)/_img/bookmark_icon.ico'
|
||||
with pytest.raises(AccessException):
|
||||
cdxobjs = list(server.load_cdx(url=url))
|
||||
print cdxobjs
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import doctest
|
||||
doctest.testmod()
|
||||
|
||||
|
||||
|
28
pywb/cdx/test/test_perms.py
Normal file
28
pywb/cdx/test/test_perms.py
Normal file
@ -0,0 +1,28 @@
|
||||
from pywb.cdx.cdxops import cdx_load
|
||||
from pywb.cdx.perms import AllowAllPerms
|
||||
from pywb.cdx.query import CDXQuery
|
||||
from pywb.cdx.cdxobject import AccessException
|
||||
|
||||
from pytest import raises
|
||||
|
||||
class BlockAllPerms(AllowAllPerms):
    """Permission checker for tests whose url lookup check always fails."""
    def allow_url_lookup(self, urlkey, url):
        # deny the lookup regardless of which key/url is asked for
        return False
|
||||
|
||||
|
||||
def test_exclusion_short_circuit():
    """
    Verify that the exclusion check 'short-circuits' further evaluation:
    a bad cdx source is not even loaded if the exclusion check does
    not pass.
    """
    blocked_query = CDXQuery(url='example.com', key='com,example)/')
    bogus_sources = ['bogus ignored']

    cdx_iter = cdx_load(bogus_sources, blocked_query,
                        perms_checker=BlockAllPerms(), process=True)

    # exception happens on first access attempt
    with raises(AccessException):
        next(cdx_iter)
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -1,10 +1,10 @@
|
||||
import webtest
|
||||
from pywb.cdx.wsgi_cdxserver import main
|
||||
from pywb.cdx.wsgi_cdxserver import create_app
|
||||
from pywb import get_test_dir
|
||||
|
||||
class TestCdx:
|
||||
def setup(self):
|
||||
self.app = main(get_test_dir() + 'cdx/')
|
||||
self.app = create_app(get_test_dir() + 'cdx/')
|
||||
self.testapp = webtest.TestApp(self.app)
|
||||
|
||||
def test_cdx(self):
|
||||
|
@ -1,10 +1,12 @@
|
||||
from cdxserver import create_cdx_server, extract_params_from_wsgi_env
|
||||
from werkzeug.wrappers import BaseResponse
|
||||
from cdxserver import create_cdx_server
|
||||
from pywb import get_test_dir
|
||||
from query import CDXQuery
|
||||
|
||||
import logging
|
||||
import os
|
||||
import yaml
|
||||
import pkgutil
|
||||
import pkg_resources
|
||||
|
||||
#=================================================================
|
||||
CONFIG_FILE = 'config.yaml'
|
||||
@ -13,65 +15,89 @@ RULES_FILE = 'rules.yaml'
|
||||
|
||||
DEFAULT_PORT = 8080
|
||||
|
||||
config = None
|
||||
if __package__:
|
||||
try:
|
||||
config = pkgutil.get_data(__package__, CONFIG_FILE)
|
||||
config = yaml.load(config)
|
||||
except:
|
||||
pass
|
||||
|
||||
|
||||
#=================================================================
|
||||
def main(paths=None):
|
||||
|
||||
class CDXQueryRequest(object):
    """Request wrapper holding the CDXQuery built from a WSGI environ."""
    def __init__(self, environ):
        # the query is available to callers as ``self.query``
        self.query = CDXQuery.from_wsgi_env(environ)
|
||||
|
||||
|
||||
class WSGICDXServer(object):
    """
    WSGI application that runs cdx queries against a cdx server and
    answers with a plain text response.
    """
    def __init__(self, config, rules_file):
        self.cdxserver = create_cdx_server(config, rules_file)

    def __call__(self, environ, start_response):
        """
        Handle one WSGI request.

        Any exception raised while parsing the request or building the
        response is logged and reported as a '400 Error' plain text
        response carrying the exception message.
        """
        try:
            # parse the request inside the try block so that a request
            # that cannot be parsed is answered with the same error
            # response as a failed query instead of escaping to the
            # WSGI server
            request = CDXQueryRequest(environ)
            logging.debug('request.args=%s', request.query)
            result = self.cdxserver.load_cdx_query(request.query)

            # TODO: select response type by "output" parameter
            response = PlainTextResponse(result, request.query.fields)
            return response(environ, start_response)
        except Exception as exc:
            logging.error('load_cdx failed', exc_info=1)
            # TODO: error response should be different for each response
            # type
            start_response('400 Error', [('Content-Type', 'text/plain')])
            return [str(exc)]
|
||||
|
||||
def cdx_text_out(cdx, fields):
    """
    Render one cdx record as a line of text (newline included).

    :param cdx: the record; indexed by field name when ``fields`` is
        given, otherwise rendered with ``str()``.
    :param fields: list of field names to output; when empty or None
        the whole record is output.
    """
    if not fields:
        return str(cdx) + '\n'
    else:
        # log the field names themselves, not the bound ``keys`` method
        logging.info('cdx fields=%s', list(cdx.keys()))
        # TODO: this will result in an exception if fields contain
        # non-existent field name.
        return ' '.join(cdx[x] for x in fields) + '\n'
|
||||
|
||||
class PlainTextResponse(BaseResponse):
    """Response whose body is the ``to_text`` form of each cdx record."""
    def __init__(self, cdxitr, fields, status=200, content_type='text/plain'):
        # lazy: records are rendered only as the response body is consumed
        lines = (record.to_text(fields) for record in cdxitr)
        super(PlainTextResponse, self).__init__(
            response=lines, status=status, content_type=content_type)
|
||||
|
||||
# class JsonResponse(Response):
|
||||
# pass
|
||||
# class MementoResponse(Response):
|
||||
# pass
|
||||
|
||||
def create_app(config=None):
|
||||
logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s',
|
||||
level=logging.DEBUG)
|
||||
|
||||
if not paths:
|
||||
if config:
|
||||
paths = config
|
||||
else:
|
||||
paths = get_test_dir() + 'cdx/'
|
||||
|
||||
cdxserver = create_cdx_server(paths, RULES_FILE)
|
||||
|
||||
def application(env, start_response):
|
||||
try:
|
||||
params = extract_params_from_wsgi_env(env)
|
||||
response = cdxserver.load_cdx(**params)
|
||||
|
||||
start_response('200 OK', [('Content-Type', 'text/plain')])
|
||||
|
||||
except Exception as exc:
|
||||
import traceback
|
||||
err_details = traceback.format_exc(exc)
|
||||
start_response('400 Error', [('Content-Type', 'text/plain')])
|
||||
response = [str(exc)]
|
||||
print err_details
|
||||
|
||||
return response
|
||||
|
||||
return application
|
||||
if not config:
|
||||
index_paths = get_test_dir() + 'cdx/'
|
||||
config = dict(index_paths=index_paths)
|
||||
|
||||
return WSGICDXServer(config, RULES_FILE)
|
||||
|
||||
if __name__ == "__main__":
|
||||
from wsgiref.simple_server import make_server
|
||||
from optparse import OptionParser
|
||||
from werkzeug.serving import run_simple
|
||||
|
||||
app = main()
|
||||
opt = OptionParser('%prog [OPTIONS]')
|
||||
opt.add_option('-p', '--port', type='int', default=None)
|
||||
|
||||
port = DEFAULT_PORT
|
||||
if config:
|
||||
port = config.get('port', DEFAULT_PORT)
|
||||
options, args = opt.parse_args()
|
||||
|
||||
httpd = make_server('', port, app)
|
||||
configdata = pkg_resources.resource_string(__name__, CONFIG_FILE)
|
||||
config = yaml.load(configdata)
|
||||
|
||||
logging.debug('Starting CDX Server on port ' + str(port))
|
||||
port = options.port
|
||||
if port is None:
|
||||
port = (config and config.get('port')) or DEFAULT_PORT
|
||||
|
||||
app = create_app(config)
|
||||
|
||||
logging.debug('Starting CDX Server on port %s', port)
|
||||
try:
|
||||
httpd.serve_forever()
|
||||
except KeyboardInterrupt:
|
||||
run_simple('0.0.0.0', port, app, use_reloader=True, use_debugger=True)
|
||||
except KeyboardInterrupt as ex:
|
||||
pass
|
||||
|
||||
logging.debug('Stopping CDX Server')
|
||||
else:
|
||||
application = main()
|
||||
# XXX pass production config
|
||||
application = create_app()
|
||||
|
@ -110,21 +110,20 @@ class ZipNumCluster(CDXSource):
|
||||
def lookup_loc(self, part):
|
||||
return self.loc_map[part]
|
||||
|
||||
def load_cdx(self, params):
|
||||
def load_cdx(self, query):
|
||||
self.reload_loc()
|
||||
|
||||
reader = SeekableTextFileReader(self.summary)
|
||||
|
||||
idx_iter = iter_range(reader,
|
||||
params['key'],
|
||||
params['end_key'],
|
||||
query.key,
|
||||
query.end_key,
|
||||
prev_size=1)
|
||||
|
||||
if params.get('showPagedIndex'):
|
||||
params['proxyAll'] = True
|
||||
if query.secondary_index_only:
|
||||
return idx_iter
|
||||
else:
|
||||
blocks = self.idx_to_cdx(idx_iter, params)
|
||||
blocks = self.idx_to_cdx(idx_iter, query)
|
||||
|
||||
def gen_cdx():
|
||||
for blk in blocks:
|
||||
@ -133,7 +132,7 @@ class ZipNumCluster(CDXSource):
|
||||
|
||||
return gen_cdx()
|
||||
|
||||
def idx_to_cdx(self, idx_iter, params):
|
||||
def idx_to_cdx(self, idx_iter, query):
|
||||
blocks = None
|
||||
ranges = []
|
||||
|
||||
@ -150,7 +149,7 @@ class ZipNumCluster(CDXSource):
|
||||
|
||||
else:
|
||||
if blocks:
|
||||
yield self.block_to_cdx_iter(blocks, ranges, params)
|
||||
yield self.block_to_cdx_iter(blocks, ranges, query)
|
||||
|
||||
blocks = ZipBlocks(idx['part'],
|
||||
idx['offset'],
|
||||
@ -160,15 +159,15 @@ class ZipNumCluster(CDXSource):
|
||||
ranges = [blocks.length]
|
||||
|
||||
if blocks:
|
||||
yield self.block_to_cdx_iter(blocks, ranges, params)
|
||||
yield self.block_to_cdx_iter(blocks, ranges, query)
|
||||
|
||||
def block_to_cdx_iter(self, blocks, ranges, params):
|
||||
def block_to_cdx_iter(self, blocks, ranges, query):
|
||||
last_exc = None
|
||||
last_traceback = None
|
||||
|
||||
for location in self.lookup_loc(blocks.part):
|
||||
try:
|
||||
return self.load_blocks(location, blocks, ranges, params)
|
||||
return self.load_blocks(location, blocks, ranges, query)
|
||||
except Exception as exc:
|
||||
last_exc = exc
|
||||
import sys
|
||||
@ -179,7 +178,7 @@ class ZipNumCluster(CDXSource):
|
||||
else:
|
||||
raise Exception('No Locations Found for: ' + block.part)
|
||||
|
||||
def load_blocks(self, location, blocks, ranges, params):
|
||||
def load_blocks(self, location, blocks, ranges, query):
|
||||
|
||||
if (logging.getLogger().getEffectiveLevel() <= logging.DEBUG):
|
||||
msg = 'Loading {b.count} blocks from {loc}:{b.offset}+{b.length}'
|
||||
@ -195,9 +194,9 @@ class ZipNumCluster(CDXSource):
|
||||
iter_ = itertools.chain(*itertools.imap(decompress_block, ranges))
|
||||
|
||||
# start bound
|
||||
iter_ = linearsearch(iter_, params['key'])
|
||||
iter_ = linearsearch(iter_, query.key)
|
||||
|
||||
# end bound
|
||||
end = params['end_key']
|
||||
end = query.end_key
|
||||
iter_ = itertools.takewhile(lambda line: line < end, iter_)
|
||||
return iter_
|
||||
|
@ -4,7 +4,7 @@ import mimetypes
|
||||
import time
|
||||
|
||||
from pywb.rewrite.wburl import WbUrl
|
||||
from pywb.cdx.cdxserver import extract_params_from_wsgi_env
|
||||
from pywb.cdx.query import CDXQuery
|
||||
from wbrequestresponse import WbResponse
|
||||
from wbexceptions import WbException, NotFoundException
|
||||
from views import TextCapturesView
|
||||
@ -82,7 +82,7 @@ class CDXHandler(BaseHandler):
|
||||
self.view = view if view else TextCapturesView()
|
||||
|
||||
def __call__(self, wbrequest):
|
||||
params = extract_params_from_wsgi_env(wbrequest.env)
|
||||
params = CDXQuery.extract_params_from_wsgi_env(wbrequest.env)
|
||||
cdx_lines = self.index_reader.load_cdx(**params)
|
||||
|
||||
return self.view.render_response(wbrequest, cdx_lines)
|
||||
|
@ -30,7 +30,7 @@ class IndexReader(object):
|
||||
|
||||
params['allowFuzzy'] = True
|
||||
|
||||
cdxlines = self.load_cdx(url=wburl.url, output='raw', **params)
|
||||
cdxlines = self.load_cdx(url=wburl.url, **params)
|
||||
|
||||
return cdxlines
|
||||
|
||||
|
62
setup.py
62
setup.py
@ -1,24 +1,48 @@
|
||||
#!/usr/bin/env python
|
||||
# vim: set sw=4 et:
|
||||
|
||||
import setuptools
|
||||
from setuptools import setup, find_packages
|
||||
import glob
|
||||
|
||||
setuptools.setup(name='pywb',
|
||||
version='0.2',
|
||||
url='https://github.com/ikreymer/pywb',
|
||||
author='Ilya Kreymer',
|
||||
author_email='ilya@archive.org',
|
||||
long_description=open('README.md').read(),
|
||||
license='GPL',
|
||||
packages=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite','pywb.core','pywb.dispatch','pywb.bootstrap'],
|
||||
provides=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite','pywb.core','pywb.dispatch','pywb.bootstrap'],
|
||||
package_data={'pywb': ['ui/*', 'static/*', '*.yaml']},
|
||||
data_files = [('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')),
|
||||
('sample_archive/zipcdx/', glob.glob('sample_archive/zipcdx/*')),
|
||||
('sample_archive/warcs/', glob.glob('sample_archive/warcs/*')),
|
||||
('sample_archive/text_content/', glob.glob('sample_archive/text_content/*'))],
|
||||
install_requires=['uwsgi', 'rfc3987', 'chardet', 'redis', 'jinja2', 'surt', 'pyyaml', 'WebTest','pytest'],
|
||||
# tests_require=['WebTest', 'pytest'],
|
||||
zip_safe=False)
|
||||
|
||||
setup(
|
||||
name='pywb',
|
||||
version='0.2',
|
||||
url='https://github.com/ikreymer/pywb',
|
||||
author='Ilya Kreymer',
|
||||
author_email='ilya@archive.org',
|
||||
long_description=open('README.md').read(),
|
||||
license='GPL',
|
||||
packages=find_packages(),
|
||||
provides=[
|
||||
'pywb',
|
||||
'pywb.utils',
|
||||
'pywb.cdx',
|
||||
'pywb.warc',
|
||||
'pywb.rewrite',
|
||||
'pywb.core',
|
||||
'pywb.dispatch',
|
||||
'pywb.bootstrap'
|
||||
],
|
||||
package_data={
|
||||
'pywb': ['ui/*', 'static/*', '*.yaml'],
|
||||
},
|
||||
data_files = [
|
||||
('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')),
|
||||
('sample_archive/zipcdx/', glob.glob('sample_archive/zipcdx/*')),
|
||||
('sample_archive/warcs/', glob.glob('sample_archive/warcs/*')),
|
||||
('sample_archive/text_content/', glob.glob('sample_archive/text_content/*')),
|
||||
],
|
||||
install_requires=[
|
||||
'rfc3987',
|
||||
'chardet',
|
||||
'redis',
|
||||
'jinja2',
|
||||
'surt',
|
||||
'pyyaml',
|
||||
'WebTest',
|
||||
'pytest',
|
||||
'werkzeug>=0.9.4',
|
||||
],
|
||||
# tests_require=['WebTest', 'pytest'],
|
||||
zip_safe=False
|
||||
)
|
||||
|
@ -92,10 +92,10 @@ enable_cdx_api: true
|
||||
|
||||
# optional reporter callback func
|
||||
# if set, called with request and cdx object
|
||||
reporter: !!python/object/new:tests.test_integration.PrintReporter []
|
||||
reporter: !!python/object/new:tests.fixture.PrintReporter []
|
||||
|
||||
# custom rules for domain specific matching
|
||||
#domain_specific_rules: rules.yaml
|
||||
|
||||
#perms_checker: !!python/object/new:pywb.cdx.perms.AllowAllPerms []
|
||||
perms_checker: !!python/object/new:tests.test_integration.TestExclusionPerms []
|
||||
perms_checker: !!python/object/new:tests.fixture.TestExclusionPerms []
|
||||
|
45
tests/fixture.py
Normal file
45
tests/fixture.py
Normal file
@ -0,0 +1,45 @@
|
||||
import os
|
||||
import pytest
|
||||
|
||||
import yaml
|
||||
|
||||
from pywb.cdx.perms import AllowAllPerms
|
||||
|
||||
@pytest.fixture
def testconfig():
    """Load ``test_config.yaml`` and return it as the test configuration.

    The file is opened relative to the current working directory. If the
    loaded config has no ``index_paths`` entry, it is pointed at the
    ``sample_archive/cdx`` directory next to this module's parent.
    """
    # NOTE(review): yaml.load can construct arbitrary Python objects.
    # It is kept because the test configuration appears to rely on
    # ``!!python/object/new`` tags; only use it on trusted files.
    # The ``with`` block closes the file handle, which was leaked before.
    with open('test_config.yaml') as config_file:
        config = yaml.load(config_file)
    assert config

    if 'index_paths' not in config:
        # !!! assumes this module is in a sub-directory of project root.
        config['index_paths'] = os.path.join(
            os.path.dirname(os.path.realpath(__file__)),
            '../sample_archive/cdx')
    return config
|
||||
|
||||
#================================================================
|
||||
# Reporter callback for replay view
|
||||
class PrintReporter:
    """Replay-view reporter callback that prints what it is given.

    Prints the request and the cdx object to stdout; the ``response``
    argument is accepted but not used.
    """

    def __call__(self, wbrequest, cdx, response):
        # Call form of print behaves the same for a single argument.
        print(wbrequest)
        print(cdx)
|
||||
|
||||
#================================================================
|
||||
class TestExclusionPerms(AllowAllPerms):
    """
    Permission-checker fixture that blocks exactly one URL key and
    defers every other decision to ``AllowAllPerms``.
    """

    # sample_archive has captures for this URLKEY
    URLKEY_EXCLUDED = 'org,iana)/_img/bookmark_icon.ico'

    def allow_url_lookup(self, urlkey, url):
        """
        Return False for the excluded urlkey (canonicalized url);
        otherwise return whatever the parent class decides.
        """
        if urlkey != self.URLKEY_EXCLUDED:
            parent = super(TestExclusionPerms, self)
            return parent.allow_url_lookup(urlkey, url)

        return False
|
@ -2,14 +2,17 @@ import webtest
|
||||
from pywb.bootstrap.pywb_init import pywb_config
|
||||
from pywb.bootstrap.wbapp import create_wb_app
|
||||
from pywb.cdx.cdxobject import CDXObject
|
||||
from pywb.cdx.perms import AllowAllPerms
|
||||
|
||||
from fixture import TestExclusionPerms
|
||||
|
||||
class TestWb:
|
||||
TEST_CONFIG = 'test_config.yaml'
|
||||
|
||||
def setup(self):
|
||||
#self.app = pywb.wbapp.create_wb_app(pywb.pywb_init.pywb_config())
|
||||
self.app = create_wb_app(pywb_config(self.TEST_CONFIG))
|
||||
# save it in self - useful for debugging
|
||||
self.router = pywb_config(self.TEST_CONFIG)
|
||||
self.app = create_wb_app(self.router)
|
||||
self.testapp = webtest.TestApp(self.app)
|
||||
|
||||
def _assert_basic_html(self, resp):
|
||||
@ -207,24 +210,3 @@ class TestWb:
|
||||
assert resp.status_int == 400
|
||||
assert 'Invalid Url: http://?abc' in resp.body
|
||||
|
||||
#=================================================================
|
||||
# Reporter callback for replay view
|
||||
class PrintReporter:
|
||||
def __call__(self, wbrequest, cdx, response):
|
||||
print wbrequest
|
||||
print cdx
|
||||
|
||||
#=================================================================
|
||||
class TestExclusionPerms(AllowAllPerms):
|
||||
"""
|
||||
Sample Perm Checker with hard-coded exclusion
|
||||
"""
|
||||
def allow_url_lookup(self, urlkey, url):
|
||||
"""
|
||||
Return true/false if url or urlkey (canonicalized url)
|
||||
should be allowed
|
||||
"""
|
||||
if urlkey == 'org,iana)/_img/bookmark_icon.ico':
|
||||
return False
|
||||
|
||||
return super(TestExclusionPerms, self).allow_url_lookup(urlkey, url)
|
||||
|
189
tests/test_wsgi_cdxserver.py
Normal file
189
tests/test_wsgi_cdxserver.py
Normal file
@ -0,0 +1,189 @@
|
||||
import os
|
||||
import re
|
||||
|
||||
import pytest
|
||||
from urllib import urlencode
|
||||
|
||||
from werkzeug.test import Client
|
||||
from werkzeug.wrappers import BaseResponse, Response
|
||||
|
||||
import yaml
|
||||
|
||||
from pywb.cdx.cdxobject import CDXObject
|
||||
from pywb.cdx.wsgi_cdxserver import create_app
|
||||
|
||||
from tests.fixture import testconfig
|
||||
|
||||
@pytest.fixture
def client(testconfig):
    """Return a werkzeug test client wrapping the CDX server WSGI app."""
    wsgi_app = create_app(testconfig)
    test_client = Client(wsgi_app, Response)
    return test_client
|
||||
|
||||
# ================================================================
|
||||
|
||||
def query(client, url, **params):
    """Issue a GET to ``/cdx`` with ``url`` plus any extra query params.

    Sequence-valued params (e.g. several ``filter`` values) are expanded
    into repeated query arguments.
    """
    query_string = urlencode(dict(params, url=url), doseq=1)
    return client.get('/cdx?' + query_string)
|
||||
|
||||
# ================================================================
|
||||
|
||||
def test_exact_url(client):
    """
    Exact-match lookup with no filters or other options: the server
    answers 200.
    """
    target = 'http://www.iana.org/'
    resp = query(client, target)

    assert resp.status_code == 200
    # Body is printed for inspection only.
    print(resp.data)
|
||||
|
||||
def test_prefix_match(client):
    """
    With ``matchType=prefix`` the result includes captures whose urlkey
    is longer than the queried prefix.
    """
    resp = query(client, 'http://www.iana.org/', matchType='prefix')

    print(resp.data.splitlines())
    assert resp.status_code == 200

    # Count records whose urlkey extends beyond the bare domain key.
    prefix_len = len('org,iana)/')
    suburls = sum(
        1 for line in resp.data.splitlines()
        if len(line.split(' ')[0]) > prefix_len
    )
    assert suburls > 0
|
||||
|
||||
def test_filters(client):
    """
    Exact-match filtering on the mimetype and filename fields: every
    returned record must carry the filtered values.
    """
    resp = query(client, 'http://www.iana.org/_css/2013.1/screen.css',
                 filter=('mimetype:warc/revisit', 'filename:dupes.warc.gz'))

    assert resp.status_code == 200
    assert resp.mimetype == 'text/plain'

    # (column index, expected value), checked in this order per record.
    expected_columns = (
        (0, 'org,iana)/_css/2013.1/screen.css'),
        (3, 'warc/revisit'),
        (10, 'dupes.warc.gz'),
    )
    for line in resp.data.splitlines():
        columns = line.split(' ')
        for index, value in expected_columns:
            assert columns[index] == value
|
||||
|
||||
def test_limit(client):
    """
    ``limit=1`` returns a single record: the first capture normally,
    and a different (revisit) capture when combined with ``reverse=1``.
    """
    target = 'http://www.iana.org/_css/2013.1/screen.css'

    # (extra query params, expected timestamp, expected mimetype)
    cases = (
        ({'limit': '1'}, '20140126200625', 'text/css'),
        ({'limit': '1', 'reverse': '1'}, '20140127171239', 'warc/revisit'),
    )

    for extra, timestamp, mimetype in cases:
        resp = query(client, target, **extra)

        assert resp.status_code == 200
        assert resp.mimetype == 'text/plain'

        cdxes = resp.data.splitlines()
        assert len(cdxes) == 1

        fields = cdxes[0].split(' ')
        assert fields[0] == 'org,iana)/_css/2013.1/screen.css'
        assert fields[1] == timestamp
        assert fields[3] == mimetype
|
||||
|
||||
def test_fields(client):
    """
    retrieve subset of fields with ``fields`` parameter.

    Each record must contain exactly the three requested fields:
    urlkey, a 14-digit timestamp, and a status code that is either
    three digits or ``-``.
    """
    resp = query(client, 'http://www.iana.org/_css/2013.1/print.css',
                 fields='urlkey,timestamp,statuscode')

    assert resp.status_code == 200

    cdxes = resp.data.splitlines()

    for cdx in cdxes:
        fields = cdx.split(' ')
        assert len(fields) == 3
        assert fields[0] == 'org,iana)/_css/2013.1/print.css'
        assert re.match(r'\d{14}$', fields[1])
        # Anchor the whole alternation: the unanchored ``\d{3}|-``
        # accepted any value merely starting with three digits or '-'.
        assert re.match(r'(\d{3}|-)$', fields[2])
|
||||
|
||||
def test_fields_undefined(client):
    """
    server shall respond with Bad Request (TODO: with proper explanation),
    when ``fields`` parameter contains undefined name(s).
    """
    resp = query(client, 'http://www.iana.org/_css/2013.1/print.css',
                 fields='urlkey,nosuchfield')

    # The comparison was previously a bare expression, so the test could
    # never fail on the status code; assert it so the documented
    # behaviour is actually checked.
    assert resp.status_code == 400
|
||||
|
||||
def test_resolveRevisits(client):
    """
    With ``resolveRevisits=true`` every record gains three trailing
    fields (length, offset, filename) that point to the *original*
    capture with the same digest, or are all ``-``.
    """
    resp = query(client, 'http://www.iana.org/_css/2013.1/print.css',
                 resolveRevisits='true')
    assert resp.status_code == 200
    assert resp.mimetype == 'text/plain'

    # digest -> (size, offset, filename) of records seen with no orig_*
    originals = {}
    for line in resp.data.splitlines():
        fields = line.split(' ')
        assert len(fields) == 14

        sha = fields[5]
        size, offset, fn = fields[8:11]
        orig_size, orig_offset, orig_fn = fields[11:14]

        # orig_* fields are either all '-' or (int, int, filename).
        if orig_size != '-':
            # Must equal the fields recorded for the original capture.
            expected = (int(orig_size), int(orig_offset), orig_fn)
            assert originals.get(sha) == expected
        else:
            assert orig_offset == '-' and orig_fn == '-'
            originals[sha] = (int(size), int(offset), fn)
|
||||
|
||||
def test_resolveRevisits_orig_fields(client):
    """
    With resolveRevisits enabled, the three extra fields are named
    ``orig.length``, ``orig.offset`` and ``orig.filename`` and can be
    selected through the ``fields`` parameter.
    """
    resp = query(client, 'http://www.iana.org/_css/2013.1/print.css',
                 resolveRevisits='1',
                 fields='urlkey,orig.length,orig.offset,orig.filename')
    assert resp.status_code == 200
    assert resp.mimetype == 'text/plain'

    for line in resp.data.splitlines():
        columns = line.split(' ')
        assert len(columns) == 4
        key, orig_len, orig_offset, orig_fn = columns

        all_unset = (orig_len == '-' and orig_offset == '-' and
                     orig_fn == '-')
        if not all_unset:
            # Length and offset must parse as integers; int() raises
            # ValueError otherwise, which fails the test.
            int(orig_len)
            int(orig_offset)
|
||||
|
||||
def test_collapseTime_resolveRevisits_reverse(client):
    """
    Combining ``collapseTime``, ``resolveRevisits`` and ``reverse``
    yields three records, ordered by non-increasing timestamp.
    """
    resp = query(client, 'http://www.iana.org/_css/2013.1/print.css',
                 collapseTime='11',
                 resolveRevisits='true',
                 reverse='true')

    cdxes = [CDXObject(line) for line in resp.data.splitlines()]

    assert len(cdxes) == 3

    # timestamp is in descending order: compare each adjacent pair
    for current, following in zip(cdxes, cdxes[1:]):
        assert current['timestamp'] >= following['timestamp']
|
||||
|
Loading…
x
Reference in New Issue
Block a user