1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

update pkg-reorg with changes from master, including

CDXQuery configuration
This commit is contained in:
Ilya Kreymer 2014-03-02 00:26:29 -08:00
commit 19f86305bf
18 changed files with 746 additions and 309 deletions

View File

@ -1,12 +1,13 @@
import yaml import yaml
import re import re
import logging import logging
import pkgutil import pkg_resources
from pywb.utils.dsrules import BaseRule, RuleSet from pywb.utils.dsrules import BaseRule, RuleSet
from pywb.utils.canonicalize import unsurt, UrlCanonicalizer from pywb.utils.canonicalize import unsurt, UrlCanonicalizer
from query import CDXQuery
#================================================================= #=================================================================
def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered): def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered):
@ -70,13 +71,13 @@ class FuzzyQuery:
def __init__(self, rules): def __init__(self, rules):
self.rules = rules self.rules = rules
def __call__(self, params): def __call__(self, query):
matched_rule = None matched_rule = None
urlkey = params['key'] urlkey = query.key
url = params['url'] url = query.url
filter_ = params.get('filter', []) filter_ = query.filters
output = params.get('output') output = query.output
for rule in self.rules.iter_matching(urlkey): for rule in self.rules.iter_matching(urlkey):
m = rule.regex.search(urlkey) m = rule.regex.search(urlkey)
@ -102,7 +103,7 @@ class FuzzyQuery:
'filter': filter_, 'filter': filter_,
'output': output} 'output': output}
return params return CDXQuery(**params)
#================================================================= #=================================================================

View File

@ -1,6 +1,9 @@
from collections import OrderedDict from collections import OrderedDict
import itertools import itertools
from urllib import urlencode
from urlparse import parse_qs
#================================================================= #=================================================================
class CDXException(Exception): class CDXException(Exception):
@ -71,12 +74,25 @@ class CDXObject(OrderedDict):
# force regen on next __str__ call # force regen on next __str__ call
self.cdxline = None self.cdxline = None
def is_revisit(self):
    """
    return True if this record describes a revisit capture, i.e. its
    mimetype is 'warc/revisit' or its filename field is '-'.
    """
    if self['mimetype'] == 'warc/revisit':
        return True

    return self['filename'] == '-'
def to_text(self, fields=None):
    """
    return plaintext CDX record (includes newline).

    :param fields: list of field names to output. when None, the
        whole record is rendered through str(self).
    """
    if fields is not None:
        # only the requested fields, space separated, in the given order
        selected = [self[name] for name in fields]
        return ' '.join(selected) + '\n'

    return str(self) + '\n'
def __str__(self): def __str__(self):
if self.cdxline: if self.cdxline:
return self.cdxline return self.cdxline
li = itertools.imap(lambda (n, val): val, self.items()) return ' '.join(val for n, val in self.iteritems())
return ' '.join(li)
#================================================================= #=================================================================
@ -106,5 +122,12 @@ class IDXObject(OrderedDict):
self.idxline = idxline self.idxline = idxline
def to_text(self, fields=None):
    """
    return plaintext IDX record (including newline).

    :param fields: list of field names to output (currently ignored)
    """
    text = str(self)
    return text + '\n'
def __str__(self): def __str__(self):
return self.idxline return self.idxline

View File

@ -1,4 +1,5 @@
from cdxobject import CDXObject, IDXObject, AccessException from cdxobject import CDXObject, IDXObject, AccessException
from query import CDXQuery
from pywb.utils.timeutils import timestamp_to_sec from pywb.utils.timeutils import timestamp_to_sec
import bisect import bisect
@ -10,32 +11,44 @@ from collections import deque
#================================================================= #=================================================================
def cdx_load(sources, params, perms_checker=None): def cdx_load(sources, query, perms_checker=None, process=True):
"""
merge text CDX lines from sources, return an iterator for
filtered and access-checked sequence of CDX objects.
:param sources: iterable for text CDX sources.
:param perms_checker: access check filter object implementing
allow_url_lookup(key, url), allow_capture(cdxobj) and
filter_fields(cdxobj) methods.
:param process: bool, perform processing sorting/filtering/grouping ops
"""
cdx_iter = load_cdx_streams(sources, query)
cdx_iter = make_obj_iter(cdx_iter, query)
if process and not query.secondary_index_only:
cdx_iter = process_cdx(cdx_iter, query)
if perms_checker: if perms_checker:
cdx_iter = cdx_load_with_perms(sources, params, perms_checker) cdx_iter = restrict_cdx(cdx_iter, query, perms_checker)
else:
cdx_iter = cdx_load_and_filter(sources, params)
# output raw cdx objects
if params.get('output') == 'raw':
return cdx_iter
def write_cdx(fields):
for cdx in cdx_iter:
yield cdx_text_out(cdx, fields) + '\n'
return write_cdx(params.get('fields'))
return cdx_iter
#================================================================= #=================================================================
def cdx_load_with_perms(sources, params, perms_checker): def restrict_cdx(cdx_iter, query, perms_checker):
if not perms_checker.allow_url_lookup(params['key'], params['url']): """
if params.get('matchType', 'exact') == 'exact': filter out those cdx records that user doesn't have access to,
by consulting :param perms_checker:.
:param cdx_iter: cdx record source iterable
:param query: request parameters (CDXQuery)
:param perms_checker: object implementing permission checker
"""
if not perms_checker.allow_url_lookup(query.key, query.url):
if query.is_exact:
raise AccessException('Excluded') raise AccessException('Excluded')
cdx_iter = cdx_load_and_filter(sources, params)
for cdx in cdx_iter: for cdx in cdx_iter:
# TODO: we could let filter_fields handle this case by accepting
# None as a return value.
if not perms_checker.allow_capture(cdx): if not perms_checker.allow_capture(cdx):
continue continue
@ -43,45 +56,27 @@ def cdx_load_with_perms(sources, params, perms_checker):
yield cdx yield cdx
#================================================================= #=================================================================
def cdx_text_out(cdx, fields): def process_cdx(cdx_iter, query):
if not fields: if query.resolve_revisits:
return str(cdx)
else:
return ' '.join(map(lambda x: cdx[x], fields.split(',')))
#=================================================================
def cdx_load_and_filter(sources, params):
cdx_iter = load_cdx_streams(sources, params)
cdx_iter = make_obj_iter(cdx_iter, params)
if params.get('proxyAll'):
return cdx_iter
resolve_revisits = params.get('resolveRevisits', False)
if resolve_revisits:
cdx_iter = cdx_resolve_revisits(cdx_iter) cdx_iter = cdx_resolve_revisits(cdx_iter)
filters = params.get('filter', None) filters = query.filters
if filters: if filters:
cdx_iter = cdx_filter(cdx_iter, filters) cdx_iter = cdx_filter(cdx_iter, filters)
collapse_time = params.get('collapseTime', None) collapse_time = query.collapse_time
if collapse_time: if collapse_time:
cdx_iter = cdx_collapse_time_status(cdx_iter, collapse_time) cdx_iter = cdx_collapse_time_status(cdx_iter, collapse_time)
limit = int(params.get('limit', 1000000)) limit = query.limit
reverse = params.get('reverse', False) or params.get('sort') == 'reverse' if query.reverse:
if reverse:
cdx_iter = cdx_reverse(cdx_iter, limit) cdx_iter = cdx_reverse(cdx_iter, limit)
closest_to = params.get('closest', None) closest = query.closest
if closest_to: if closest:
cdx_iter = cdx_sort_closest(closest_to, cdx_iter, limit) cdx_iter = cdx_sort_closest(closest, cdx_iter, limit)
if limit: if limit:
cdx_iter = cdx_limit(cdx_iter, limit) cdx_iter = cdx_limit(cdx_iter, limit)
@ -91,26 +86,28 @@ def cdx_load_and_filter(sources, params):
#================================================================= #=================================================================
# load and source merge cdx streams # load and source merge cdx streams
def load_cdx_streams(sources, params): def load_cdx_streams(sources, query):
# Optimize: no need to merge if just one input # Optimize: no need to merge if just one input
if len(sources) == 1: if len(sources) == 1:
return sources[0].load_cdx(params) cdx_iter = sources[0].load_cdx(query)
else:
source_iters = map(lambda src: src.load_cdx(query), sources)
cdx_iter = merge(*(source_iters))
source_iters = map(lambda src: src.load_cdx(params), sources) for cdx in cdx_iter:
merged_stream = merge(*(source_iters)) yield cdx
return merged_stream
#================================================================= #=================================================================
# convert text cdx stream to CDXObject/IDXObject # convert text cdx stream to CDXObject/IDXObject
def make_obj_iter(text_iter, params): def make_obj_iter(text_iter, query):
# already converted # already converted
if params.get('showPagedIndex'): if query.secondary_index_only:
cls = IDXObject cls = IDXObject
else: else:
cls = CDXObject cls = CDXObject
return itertools.imap(lambda line: cls(line), text_iter) return (cls(line) for line in text_iter)
#================================================================= #=================================================================
@ -161,6 +158,7 @@ def cdx_filter(cdx_iter, filter_strings):
if string.startswith('='): if string.startswith('='):
string = string[1:] string = string[1:]
self.compare_func = self.exact self.compare_func = self.exact
# contains match
elif string.startswith('~'): elif string.startswith('~'):
string = string[1:] string = string[1:]
self.compare_func = self.contains self.compare_func = self.contains
@ -257,8 +255,8 @@ def cdx_resolve_revisits(cdx_iter):
originals = {} originals = {}
for cdx in cdx_iter: for cdx in cdx_iter:
is_revisit = ((cdx['mimetype'] == 'warc/revisit') or
(cdx['filename'] == '-')) is_revisit = cdx.is_revisit()
digest = cdx['digest'] digest = cdx['digest']

View File

@ -4,6 +4,7 @@ from cdxops import cdx_load
from cdxsource import CDXSource, CDXFile, RemoteCDXSource, RedisCDXSource from cdxsource import CDXSource, CDXFile, RemoteCDXSource, RedisCDXSource
from zipnum import ZipNumCluster from zipnum import ZipNumCluster
from cdxobject import CDXObject, CaptureNotFoundException, CDXException from cdxobject import CDXObject, CaptureNotFoundException, CDXException
from query import CDXQuery
from cdxdomainspecific import load_domain_specific_cdx_rules from cdxdomainspecific import load_domain_specific_cdx_rules
from pywb.utils.loaders import is_http from pywb.utils.loaders import is_http
@ -36,7 +37,7 @@ class BaseCDXServer(object):
# set perms checker, if any # set perms checker, if any
self.perms_checker = kwargs.get('perms_checker') self.perms_checker = kwargs.get('perms_checker')
def _check_cdx_iter(self, cdx_iter, params): def _check_cdx_iter(self, cdx_iter, query):
""" Check cdx iter semantics """ Check cdx iter semantics
If iter is empty (no matches), check if fuzzy matching If iter is empty (no matches), check if fuzzy matching
is allowed, and try it -- otherwise, is allowed, and try it -- otherwise,
@ -48,21 +49,23 @@ class BaseCDXServer(object):
if cdx_iter: if cdx_iter:
return cdx_iter return cdx_iter
url = params['url']
# check if fuzzy is allowed and ensure that its an # check if fuzzy is allowed and ensure that its an
# exact match # exact match
if (self.fuzzy_query and params.get('allowFuzzy') and if (self.fuzzy_query and
params.get('matchType', 'exact') == 'exact'): query.allow_fuzzy and
query.is_exact):
fuzzy_params = self.fuzzy_query(params) fuzzy_query_params = self.fuzzy_query(query)
if fuzzy_params: if fuzzy_query_params:
return self.load_cdx(**fuzzy_params) return self.load_cdx_query(fuzzy_query_params)
msg = 'No Captures found for: ' + url msg = 'No Captures found for: ' + query.url
raise CaptureNotFoundException(msg) raise CaptureNotFoundException(msg)
def load_cdx(self, **params): def load_cdx(self, **params):
return self.load_cdx_query(CDXQuery(**params))
def load_cdx_query(self, query):
raise NotImplementedError('Implement in subclass') raise NotImplementedError('Implement in subclass')
@staticmethod @staticmethod
@ -84,28 +87,77 @@ class CDXServer(BaseCDXServer):
def __init__(self, paths, **kwargs): def __init__(self, paths, **kwargs):
super(CDXServer, self).__init__(**kwargs) super(CDXServer, self).__init__(**kwargs)
self.sources = create_cdx_sources(paths, kwargs.get('config')) # TODO: we could save config in member, so that other
# methods can use it. it's bad for add_cdx_source to take
# config argument.
self._create_cdx_sources(paths, kwargs.get('config'))
def load_cdx(self, **params): def load_cdx_query(self, query):
# if key not set, assume 'url' is set and needs canonicalization url = query.url
if not params.get('key'): key, end_key = calc_search_range(url=url,
try: match_type=query.match_type,
url = params['url'] url_canon=self.url_canon)
except KeyError: query.set_key(key, end_key)
msg = 'A url= param must be specified to query the cdx server'
raise CDXException(msg)
match_type = params.get('matchType', 'exact') cdx_iter = cdx_load(self.sources,
query,
perms_checker=self.perms_checker)
key, end_key = calc_search_range(url=url, return self._check_cdx_iter(cdx_iter, query)
match_type=match_type,
url_canon=self.url_canon)
params['key'] = key
params['end_key'] = end_key
cdx_iter = cdx_load(self.sources, params, self.perms_checker) def _create_cdx_sources(self, paths, config):
"""
build CDXSource instances for each of path in :param paths:.
:param paths: list of sources or single source.
each source may be either string or CDXSource instance. value
of any other types will be silently ignored.
:param config: config object passed to :method:`add_cdx_source`.
"""
self.sources = []
return self._check_cdx_iter(cdx_iter, params) if paths is not None:
if not isinstance(paths, (list, tuple)):
paths = [paths]
for path in paths:
self.add_cdx_source(path, config)
if len(self.sources) == 0:
logging.warn('No CDX Sources configured from paths=%s', paths)
def _add_cdx_source(self, source):
if source is None: return
logging.debug('Adding CDX Source: %s', source)
self.sources.append(source)
def add_cdx_source(self, source, config):
    """
    register a cdx source.

    :param source: CDXSource instance (added as is) or a string.
        a string naming a directory adds one source per directory
        entry; any other string is turned into a single source.
        None and values of other types are ignored.
    :param config: config object handed to _create_cdx_source.
    """
    if source is None:
        return

    if isinstance(source, CDXSource):
        self._add_cdx_source(source)
        return

    if not isinstance(source, str):
        return

    if os.path.isdir(source):
        locations = [os.path.join(source, entry)
                     for entry in os.listdir(source)]
    else:
        locations = [source]

    for location in locations:
        self._add_cdx_source(self._create_cdx_source(location, config))
def _create_cdx_source(self, filename, config):
    """
    build the CDXSource matching :param filename:, checked in this
    order: http url, redis:// url, .cdx file, .summary/.idx file.

    :return: the new source, or None (after logging a warning) when
        filename matches none of the above.
    """
    if is_http(filename):
        cdx_source = RemoteCDXSource(filename)
    elif filename.startswith('redis://'):
        cdx_source = RedisCDXSource(filename, config)
    elif filename.endswith('.cdx'):
        cdx_source = CDXFile(filename)
    elif filename.endswith(('.summary', '.idx')):
        cdx_source = ZipNumCluster(filename, config)
    else:
        logging.warn('skipping unrecognized URI:%s', filename)
        cdx_source = None

    return cdx_source
def __str__(self): def __str__(self):
return 'CDX server serving from ' + str(self.sources) return 'CDX server serving from ' + str(self.sources)
@ -123,20 +175,14 @@ class RemoteCDXServer(BaseCDXServer):
if isinstance(source, RemoteCDXSource): if isinstance(source, RemoteCDXSource):
self.source = source self.source = source
elif (isinstance(source, str) and elif (isinstance(source, str) and is_http(source)):
any(source.startswith(x) for x in ['http://', 'https://'])): self.source = RemoteCDXSource(source, remote_processing=True)
self.source = RemoteCDXSource(source)
else: else:
raise Exception('Invalid remote cdx source: ' + str(source)) raise Exception('Invalid remote cdx source: ' + str(source))
def load_cdx(self, **params): def load_cdx_query(self, query):
remote_iter = self.source.load_cdx(params) remote_iter = cdx_load([self.source], query, process=False)
return self._check_cdx_iter(remote_iter, query)
# if need raw, convert to raw format here
if params.get('output') == 'raw':
remote_iter = (CDXObject(cdx) for cdx in remote_iter)
return self._check_cdx_iter(remote_iter, params)
def __str__(self): def __str__(self):
return 'Remote CDX server serving from ' + str(self.sources[0]) return 'Remote CDX server serving from ' + str(self.sources[0])
@ -169,74 +215,3 @@ def create_cdx_server(config, ds_rules_file=None):
perms_checker=perms_checker) perms_checker=perms_checker)
#=================================================================
def create_cdx_sources(paths, config=None):
sources = []
if not isinstance(paths, list):
paths = [paths]
for path in paths:
if isinstance(path, CDXSource):
add_cdx_source(sources, path, config)
elif isinstance(path, str):
if os.path.isdir(path):
for file in os.listdir(path):
add_cdx_source(sources, path + file, config)
else:
add_cdx_source(sources, path, config)
if len(sources) == 0:
logging.exception('No CDX Sources Found from: ' + str(sources))
return sources
#=================================================================
def add_cdx_source(sources, source, config):
if not isinstance(source, CDXSource):
source = create_cdx_source(source, config)
if not source:
return
logging.debug('Adding CDX Source: ' + str(source))
sources.append(source)
#=================================================================
def create_cdx_source(filename, config):
if is_http(filename):
return RemoteCDXSource(filename)
if filename.startswith('redis://'):
return RedisCDXSource(filename, config)
if filename.endswith('.cdx'):
return CDXFile(filename)
if filename.endswith(('.summary', '.idx')):
return ZipNumCluster(filename, config)
return None
#=================================================================
def extract_params_from_wsgi_env(env):
""" utility function to extract params from the query
string of a WSGI environment dictionary
"""
# use url= param to get actual url
params = urlparse.parse_qs(env['QUERY_STRING'])
if not 'output' in params:
params['output'] = 'text'
# parse_qs produces arrays for single values
# cdx processing expects singleton params for all params,
# except filters, so convert here
# use first value of the list
for name, val in params.iteritems():
if name != 'filter':
params[name] = val[0]
return params

View File

@ -2,6 +2,7 @@ from pywb.utils.binsearch import iter_range
from pywb.utils.loaders import SeekableTextFileReader from pywb.utils.loaders import SeekableTextFileReader
from cdxobject import AccessException from cdxobject import AccessException
from query import CDXQuery
import urllib import urllib
import urllib2 import urllib2
@ -12,7 +13,7 @@ class CDXSource(object):
""" """
Represents any cdx index source Represents any cdx index source
""" """
def load_cdx(self, params): def load_cdx(self, query):
raise NotImplementedError('Implement in subclass') raise NotImplementedError('Implement in subclass')
@ -24,9 +25,9 @@ class CDXFile(CDXSource):
def __init__(self, filename): def __init__(self, filename):
self.filename = filename self.filename = filename
def load_cdx(self, params): def load_cdx(self, query):
source = SeekableTextFileReader(self.filename) source = SeekableTextFileReader(self.filename)
return iter_range(source, params.get('key'), params.get('end_key')) return iter_range(source, query.key, query.end_key)
def __str__(self): def __str__(self):
return 'CDX File - ' + self.filename return 'CDX File - ' + self.filename
@ -40,25 +41,20 @@ class RemoteCDXSource(CDXSource):
Only url and match type params are proxied at this time, Only url and match type params are proxied at this time,
the stream is passed through all other filters locally. the stream is passed through all other filters locally.
""" """
def __init__(self, filename, cookie=None, proxy_all=True): def __init__(self, filename, cookie=None, remote_processing=False):
self.remote_url = filename self.remote_url = filename
self.cookie = cookie self.cookie = cookie
self.proxy_all = proxy_all self.remote_processing = remote_processing
def load_cdx(self, proxy_params): def load_cdx(self, query):
if self.proxy_all: if self.remote_processing:
params = proxy_params remote_query = query
params['proxyAll'] = True
else: else:
# Only send url and matchType params to remote # Only send url and matchType params to remote
params = {} remote_query = CDXQuery(url=query.url,
params['url'] = proxy_params['url'] match_type=query.match_type)
match_type = proxy_params.get('matchType')
if match_type: urlparams = remote_query.urlencode()
proxy_params['matchType'] = match_type
urlparams = urllib.urlencode(params, True)
try: try:
request = urllib2.Request(self.remote_url, urlparams) request = urllib2.Request(self.remote_url, urlparams)
@ -97,14 +93,14 @@ class RedisCDXSource(CDXSource):
self.key_prefix = config.get('redis_key_prefix', self.key_prefix) self.key_prefix = config.get('redis_key_prefix', self.key_prefix)
def load_cdx(self, params): def load_cdx(self, query):
""" """
Load cdx from redis cache, from an ordered list Load cdx from redis cache, from an ordered list
Currently, there is no support for range queries Currently, there is no support for range queries
Only 'exact' matchType is supported Only 'exact' matchType is supported
""" """
key = params['key'] key = query.key
# ensure only url/surt is part of key # ensure only url/surt is part of key
key = key.split(' ')[0] key = key.split(' ')[0]

119
pywb/cdx/query.py Normal file
View File

@ -0,0 +1,119 @@
from urllib import urlencode
from urlparse import parse_qs
#=================================================================
class CDXQuery(object):
    """
    Wrapper around the raw parameters of a cdx server query.

    The parameters are kept as given in self.params (a dict of the
    keyword arguments); the properties below apply defaults and type
    conversion on each access.
    """
    def __init__(self, **kwargs):
        self.params = kwargs

    @property
    def key(self):
        # start of the search range; raises KeyError until a 'key'
        # param was passed in or set_key() was called
        return self.params['key']

    @property
    def end_key(self):
        # end of the search range; raises KeyError if not set
        return self.params['end_key']

    def set_key(self, key, end_key):
        """ store the search range for this query in self.params
        """
        self.params['key'] = key
        self.params['end_key'] = end_key

    @property
    def url(self):
        """ the 'url' param.

        :raises CDXException: if no url param was specified
        """
        try:
            return self.params['url']
        except KeyError:
            # this module has no top-level import of CDXException,
            # so bring it in where it is needed
            from cdxobject import CDXException
            msg = 'A url= param must be specified to query the cdx server'
            raise CDXException(msg)

    @property
    def match_type(self):
        return self.params.get('matchType', 'exact')

    @property
    def is_exact(self):
        return self.match_type == 'exact'

    @property
    def allow_fuzzy(self):
        return self._get_bool('allowFuzzy')

    @property
    def output(self):
        return self.params.get('output', 'text')

    @property
    def limit(self):
        # raises ValueError if the limit param is not an integer string
        return int(self.params.get('limit', 100000))

    @property
    def collapse_time(self):
        return self.params.get('collapseTime')

    @property
    def resolve_revisits(self):
        return self._get_bool('resolveRevisits')

    @property
    def filters(self):
        return self.params.get('filter', [])

    @property
    def fields(self):
        # comma separated 'fields' param as a list, None if absent/empty
        v = self.params.get('fields')
        return v.split(',') if v else None

    @property
    def closest(self):
        # sort=closest is not required
        return self.params.get('closest')

    @property
    def reverse(self):
        # sort=reverse overrides reverse=0
        return (self._get_bool('reverse') or
                self.params.get('sort') == 'reverse')

    @property
    def secondary_index_only(self):
        return self._get_bool('showPagedIndex')

    def _get_bool(self, name, def_val=False):
        """ interpret param :param name: as a boolean.

        a missing or empty value gives def_val, an integer string is
        true when non-zero, any other string is true only if it
        equals 'true' (case-insensitive).
        """
        v = self.params.get(name)
        if v:
            try:
                v = int(v)
            except ValueError:
                v = (v.lower() == 'true')
        else:
            v = def_val

        return bool(v)

    def urlencode(self):
        """ return self.params encoded as a url query string
        (sequence values are expanded into repeated params)
        """
        return urlencode(self.params, True)

    @staticmethod
    def from_wsgi_env(env):
        """ create a CDXQuery from the query string of a
        WSGI environment dictionary
        """
        return CDXQuery(**CDXQuery.extract_params_from_wsgi_env(env))

    @staticmethod
    def extract_params_from_wsgi_env(env):
        """ utility function to extract the query params, as a dict,
        from the QUERY_STRING of a WSGI environment dictionary.
        'output' defaults to 'text' when not present.
        """
        params = parse_qs(env['QUERY_STRING'])

        # parse_qs produces arrays for single values
        # cdx processing expects singleton params for all params,
        # except filters, so convert here
        # use first value of the list
        for name, val in list(params.items()):
            if name != 'filter':
                params[name] = val[0]

        # the default is applied only after the conversion above:
        # adding the plain string 'text' before it would have been
        # cut down to its first character, 't'
        params.setdefault('output', 'text')

        return params

View File

@ -142,6 +142,8 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex
('filename', 'dupes.warc.gz')] ('filename', 'dupes.warc.gz')]
# NOTE: external dependency -- need self-contained test TODO # NOTE: external dependency -- need self-contained test TODO
# Load remote query but filter locally
>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2') >>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2')
>>> pprint.pprint(x.next().items()) >>> pprint.pprint(x.next().items())
[('urlkey', 'com,example)/'), [('urlkey', 'com,example)/'),
@ -152,14 +154,24 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex
('digest', 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA'), ('digest', 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA'),
('length', '1792')] ('length', '1792')]
# No local filtering/processing of cdx, simply return result from remote server
>>> x = RemoteCDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'example.com', output = 'raw', limit = '2')
>>> pprint.pprint(x.next().items())
[('urlkey', 'com,example)/'),
('timestamp', '20020120142510'),
('original', 'http://example.com:80/'),
('mimetype', 'text/html'),
('statuscode', '200'),
('digest', 'HT2DYGA5UKZCPBSFVCV3JOBXGW2G5UUA'),
('length', '1792')]
>>> x = CDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'facebook.com', output = 'raw', limit = '2') >>> x = RemoteCDXServer('http://web.archive.org/cdx/search/cdx').load_cdx(url = 'facebook.com', output = 'raw', limit = '2')
Traceback (most recent call last): Traceback (most recent call last):
AccessException: Blocked By Robots AccessException: Blocked By Robots
""" """
#================================================================= #=================================================================
from pywb.cdx.cdxserver import CDXServer from pywb.cdx.cdxserver import CDXServer, RemoteCDXServer
import os import os
import sys import sys
import pprint import pprint
@ -167,22 +179,42 @@ import pprint
from pywb import get_test_dir from pywb import get_test_dir
#test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample_data/' #test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample_data/'
test_cdx_dir = get_test_dir() + 'cdx/' test_cdx_dir = get_test_dir() + 'cdx/'
from pywb.cdx.cdxobject import AccessException
from tests.fixture import testconfig, TestExclusionPerms
import pytest
def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams): def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams):
kwparams['url'] = url kwparams['url'] = url
kwparams['output'] = 'text' fields = kwparams.get('fields')
if fields:
fields = fields.split(',')
server = CDXServer(sources) server = CDXServer(sources)
results = server.load_cdx(**kwparams) results = server.load_cdx(**kwparams)
for x in results: for x in results:
x = x.replace('\t', ' ') l = x.to_text(fields).replace('\t', ' ')
sys.stdout.write(x) sys.stdout.write(l)
#================================================================
def test_excluded(testconfig):
testconfig['perms_checker'] = TestExclusionPerms()
sources = testconfig.get('index_paths')
print sources
server = CDXServer(sources, perms_checker=testconfig['perms_checker'])
assert isinstance(server, CDXServer)
assert server.perms_checker
url = 'http://www.iana.org/_img/bookmark_icon.ico'
key = 'org,iana)/_img/bookmark_icon.ico'
with pytest.raises(AccessException):
cdxobjs = list(server.load_cdx(url=url))
print cdxobjs
if __name__ == "__main__": if __name__ == "__main__":
import doctest import doctest
doctest.testmod() doctest.testmod()

View File

@ -0,0 +1,28 @@
from pywb.cdx.cdxops import cdx_load
from pywb.cdx.perms import AllowAllPerms
from pywb.cdx.query import CDXQuery
from pywb.cdx.cdxobject import AccessException
from pytest import raises
class BlockAllPerms(AllowAllPerms):
    """
    perms checker for tests: refuses every url lookup,
    everything else is inherited from AllowAllPerms.
    """
    def allow_url_lookup(self, urlkey, url):
        return False
def test_exclusion_short_circuit():
    """
    Verify that the exclusion check 'short-circuits' further
    evaluation: a bad cdx source is never loaded when the exclusion
    check does not pass.
    """
    blocked_query = CDXQuery(url='example.com', key='com,example)/')

    cdx_iter = cdx_load(['bogus ignored'], blocked_query,
                        perms_checker=BlockAllPerms(), process=True)

    # exception happens on first access attempt
    with raises(AccessException):
        next(cdx_iter)

View File

@ -1,10 +1,10 @@
import webtest import webtest
from pywb.cdx.wsgi_cdxserver import main from pywb.cdx.wsgi_cdxserver import create_app
from pywb import get_test_dir from pywb import get_test_dir
class TestCdx: class TestCdx:
def setup(self): def setup(self):
self.app = main(get_test_dir() + 'cdx/') self.app = create_app(get_test_dir() + 'cdx/')
self.testapp = webtest.TestApp(self.app) self.testapp = webtest.TestApp(self.app)
def test_cdx(self): def test_cdx(self):

View File

@ -1,10 +1,12 @@
from cdxserver import create_cdx_server, extract_params_from_wsgi_env from werkzeug.wrappers import BaseResponse
from cdxserver import create_cdx_server
from pywb import get_test_dir from pywb import get_test_dir
from query import CDXQuery
import logging import logging
import os import os
import yaml import yaml
import pkgutil import pkg_resources
#================================================================= #=================================================================
CONFIG_FILE = 'config.yaml' CONFIG_FILE = 'config.yaml'
@ -13,65 +15,89 @@ RULES_FILE = 'rules.yaml'
DEFAULT_PORT = 8080 DEFAULT_PORT = 8080
config = None
if __package__:
try:
config = pkgutil.get_data(__package__, CONFIG_FILE)
config = yaml.load(config)
except:
pass
#================================================================= #=================================================================
def main(paths=None):
class CDXQueryRequest(object):
    """
    Request wrapper: exposes the cdx query built from a WSGI
    environ as the .query attribute.
    """
    def __init__(self, environ):
        cdx_query = CDXQuery.from_wsgi_env(environ)
        self.query = cdx_query
class WSGICDXServer(object):
    """
    WSGI application answering cdx queries as plain text.

    The cdx server is built once, at construction, through
    create_cdx_server(config, rules_file).
    """
    def __init__(self, config, rules_file):
        self.cdxserver = create_cdx_server(config, rules_file)

    def __call__(self, environ, start_response):
        """
        WSGI entry point: run the query found in environ and stream
        the matching records. Any exception raised while doing so is
        logged and reported to the client as a '400 Error' plain text
        response carrying the exception message.
        """
        request = CDXQueryRequest(environ)

        try:
            cdx_query = request.query
            logging.debug('request.args=%s', cdx_query)

            cdx_iter = self.cdxserver.load_cdx_query(cdx_query)

            # TODO: select response type by "output" parameter
            text_response = PlainTextResponse(cdx_iter, cdx_query.fields)
            return text_response(environ, start_response)
        except Exception as exc:
            logging.error('load_cdx failed', exc_info=1)
            # TODO: error response should be different for each response
            # type
            error_headers = [('Content-Type', 'text/plain')]
            start_response('400 Error', error_headers)
            return [str(exc)]
def cdx_text_out(cdx, fields):
    """
    return one cdx record as a plaintext line (includes newline).

    :param cdx: cdx record; rendered with str() when no fields are
        requested, otherwise indexed by field name.
    :param fields: list of field names to output, in that order.
        None or an empty list outputs the whole record.
    :raises KeyError: if fields contains a name the record does not have.
    """
    if not fields:
        return str(cdx) + '\n'

    # log the field names the record actually has (keys must be
    # called: logging the bound method itself is of no use)
    logging.info('cdx fields=%s', list(cdx.keys()))

    # TODO: this will result in an exception if fields contain
    # non-existent field name.
    return ' '.join(cdx[x] for x in fields) + '\n'
class PlainTextResponse(BaseResponse):
    """
    Response whose body is the to_text(fields) rendering of each
    record of a cdx iterator, produced lazily.
    """
    def __init__(self, cdxitr, fields, status=200, content_type='text/plain'):
        text_lines = (record.to_text(fields) for record in cdxitr)

        super(PlainTextResponse, self).__init__(response=text_lines,
                                                status=status,
                                                content_type=content_type)
# class JsonResponse(Response):
# pass
# class MementoResponse(Response):
# pass
def create_app(config=None):
logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s', logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s',
level=logging.DEBUG) level=logging.DEBUG)
if not paths: if not config:
if config: index_paths = get_test_dir() + 'cdx/'
paths = config config = dict(index_paths=index_paths)
else:
paths = get_test_dir() + 'cdx/'
cdxserver = create_cdx_server(paths, RULES_FILE)
def application(env, start_response):
try:
params = extract_params_from_wsgi_env(env)
response = cdxserver.load_cdx(**params)
start_response('200 OK', [('Content-Type', 'text/plain')])
except Exception as exc:
import traceback
err_details = traceback.format_exc(exc)
start_response('400 Error', [('Content-Type', 'text/plain')])
response = [str(exc)]
print err_details
return response
return application
return WSGICDXServer(config, RULES_FILE)
if __name__ == "__main__": if __name__ == "__main__":
from wsgiref.simple_server import make_server from optparse import OptionParser
from werkzeug.serving import run_simple
app = main() opt = OptionParser('%prog [OPTIONS]')
opt.add_option('-p', '--port', type='int', default=None)
port = DEFAULT_PORT options, args = opt.parse_args()
if config:
port = config.get('port', DEFAULT_PORT)
httpd = make_server('', port, app) configdata = pkg_resources.resource_string(__name__, CONFIG_FILE)
config = yaml.load(configdata)
logging.debug('Starting CDX Server on port ' + str(port)) port = options.port
if port is None:
port = (config and config.get('port')) or DEFAULT_PORT
app = create_app(config)
logging.debug('Starting CDX Server on port %s', port)
try: try:
httpd.serve_forever() run_simple('0.0.0.0', port, app, use_reloader=True, use_debugger=True)
except KeyboardInterrupt: except KeyboardInterrupt as ex:
pass pass
logging.debug('Stopping CDX Server') logging.debug('Stopping CDX Server')
else: else:
application = main() # XXX pass production config
application = create_app()

View File

@ -110,21 +110,20 @@ class ZipNumCluster(CDXSource):
def lookup_loc(self, part): def lookup_loc(self, part):
return self.loc_map[part] return self.loc_map[part]
def load_cdx(self, params): def load_cdx(self, query):
self.reload_loc() self.reload_loc()
reader = SeekableTextFileReader(self.summary) reader = SeekableTextFileReader(self.summary)
idx_iter = iter_range(reader, idx_iter = iter_range(reader,
params['key'], query.key,
params['end_key'], query.end_key,
prev_size=1) prev_size=1)
if params.get('showPagedIndex'): if query.secondary_index_only:
params['proxyAll'] = True
return idx_iter return idx_iter
else: else:
blocks = self.idx_to_cdx(idx_iter, params) blocks = self.idx_to_cdx(idx_iter, query)
def gen_cdx(): def gen_cdx():
for blk in blocks: for blk in blocks:
@ -133,7 +132,7 @@ class ZipNumCluster(CDXSource):
return gen_cdx() return gen_cdx()
def idx_to_cdx(self, idx_iter, params): def idx_to_cdx(self, idx_iter, query):
blocks = None blocks = None
ranges = [] ranges = []
@ -150,7 +149,7 @@ class ZipNumCluster(CDXSource):
else: else:
if blocks: if blocks:
yield self.block_to_cdx_iter(blocks, ranges, params) yield self.block_to_cdx_iter(blocks, ranges, query)
blocks = ZipBlocks(idx['part'], blocks = ZipBlocks(idx['part'],
idx['offset'], idx['offset'],
@ -160,15 +159,15 @@ class ZipNumCluster(CDXSource):
ranges = [blocks.length] ranges = [blocks.length]
if blocks: if blocks:
yield self.block_to_cdx_iter(blocks, ranges, params) yield self.block_to_cdx_iter(blocks, ranges, query)
def block_to_cdx_iter(self, blocks, ranges, params): def block_to_cdx_iter(self, blocks, ranges, query):
last_exc = None last_exc = None
last_traceback = None last_traceback = None
for location in self.lookup_loc(blocks.part): for location in self.lookup_loc(blocks.part):
try: try:
return self.load_blocks(location, blocks, ranges, params) return self.load_blocks(location, blocks, ranges, query)
except Exception as exc: except Exception as exc:
last_exc = exc last_exc = exc
import sys import sys
@ -179,7 +178,7 @@ class ZipNumCluster(CDXSource):
else: else:
raise Exception('No Locations Found for: ' + block.part) raise Exception('No Locations Found for: ' + block.part)
def load_blocks(self, location, blocks, ranges, params): def load_blocks(self, location, blocks, ranges, query):
if (logging.getLogger().getEffectiveLevel() <= logging.DEBUG): if (logging.getLogger().getEffectiveLevel() <= logging.DEBUG):
msg = 'Loading {b.count} blocks from {loc}:{b.offset}+{b.length}' msg = 'Loading {b.count} blocks from {loc}:{b.offset}+{b.length}'
@ -195,9 +194,9 @@ class ZipNumCluster(CDXSource):
iter_ = itertools.chain(*itertools.imap(decompress_block, ranges)) iter_ = itertools.chain(*itertools.imap(decompress_block, ranges))
# start bound # start bound
iter_ = linearsearch(iter_, params['key']) iter_ = linearsearch(iter_, query.key)
# end bound # end bound
end = params['end_key'] end = query.end_key
iter_ = itertools.takewhile(lambda line: line < end, iter_) iter_ = itertools.takewhile(lambda line: line < end, iter_)
return iter_ return iter_

View File

@ -4,7 +4,7 @@ import mimetypes
import time import time
from pywb.rewrite.wburl import WbUrl from pywb.rewrite.wburl import WbUrl
from pywb.cdx.cdxserver import extract_params_from_wsgi_env from pywb.cdx.query import CDXQuery
from wbrequestresponse import WbResponse from wbrequestresponse import WbResponse
from wbexceptions import WbException, NotFoundException from wbexceptions import WbException, NotFoundException
from views import TextCapturesView from views import TextCapturesView
@ -82,7 +82,7 @@ class CDXHandler(BaseHandler):
self.view = view if view else TextCapturesView() self.view = view if view else TextCapturesView()
def __call__(self, wbrequest): def __call__(self, wbrequest):
params = extract_params_from_wsgi_env(wbrequest.env) params = CDXQuery.extract_params_from_wsgi_env(wbrequest.env)
cdx_lines = self.index_reader.load_cdx(**params) cdx_lines = self.index_reader.load_cdx(**params)
return self.view.render_response(wbrequest, cdx_lines) return self.view.render_response(wbrequest, cdx_lines)

View File

@ -30,7 +30,7 @@ class IndexReader(object):
params['allowFuzzy'] = True params['allowFuzzy'] = True
cdxlines = self.load_cdx(url=wburl.url, output='raw', **params) cdxlines = self.load_cdx(url=wburl.url, **params)
return cdxlines return cdxlines

View File

@ -1,24 +1,48 @@
#!/usr/bin/env python #!/usr/bin/env python
# vim: set sw=4 et: # vim: set sw=4 et:
import setuptools from setuptools import setup, find_packages
import glob import glob
setuptools.setup(name='pywb', setup(
version='0.2', name='pywb',
url='https://github.com/ikreymer/pywb', version='0.2',
author='Ilya Kreymer', url='https://github.com/ikreymer/pywb',
author_email='ilya@archive.org', author='Ilya Kreymer',
long_description=open('README.md').read(), author_email='ilya@archive.org',
license='GPL', long_description=open('README.md').read(),
packages=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite','pywb.core','pywb.dispatch','pywb.bootstrap'], license='GPL',
provides=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite','pywb.core','pywb.dispatch','pywb.bootstrap'], packages=find_packages(),
package_data={'pywb': ['ui/*', 'static/*', '*.yaml']}, provides=[
data_files = [('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')), 'pywb',
('sample_archive/zipcdx/', glob.glob('sample_archive/zipcdx/*')), 'pywb.utils',
('sample_archive/warcs/', glob.glob('sample_archive/warcs/*')), 'pywb.cdx',
('sample_archive/text_content/', glob.glob('sample_archive/text_content/*'))], 'pywb.warc',
install_requires=['uwsgi', 'rfc3987', 'chardet', 'redis', 'jinja2', 'surt', 'pyyaml', 'WebTest','pytest'], 'pywb.rewrite',
# tests_require=['WebTest', 'pytest'], 'pywb.core',
zip_safe=False) 'pywb.dispatch',
'pywb.bootstrap'
],
package_data={
'pywb': ['ui/*', 'static/*', '*.yaml'],
},
data_files = [
('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')),
('sample_archive/zipcdx/', glob.glob('sample_archive/zipcdx/*')),
('sample_archive/warcs/', glob.glob('sample_archive/warcs/*')),
('sample_archive/text_content/', glob.glob('sample_archive/text_content/*')),
],
install_requires=[
'rfc3987',
'chardet',
'redis',
'jinja2',
'surt',
'pyyaml',
'WebTest',
'pytest',
'werkzeug>=0.9.4',
],
# tests_require=['WebTest', 'pytest'],
zip_safe=False
)

View File

@ -92,10 +92,10 @@ enable_cdx_api: true
# optional reporter callback func # optional reporter callback func
# if set, called with request and cdx object # if set, called with request and cdx object
reporter: !!python/object/new:tests.test_integration.PrintReporter [] reporter: !!python/object/new:tests.fixture.PrintReporter []
# custom rules for domain specific matching # custom rules for domain specific matching
#domain_specific_rules: rules.yaml #domain_specific_rules: rules.yaml
#perms_checker: !!python/object/new:pywb.cdx.perms.AllowAllPerms [] #perms_checker: !!python/object/new:pywb.cdx.perms.AllowAllPerms []
perms_checker: !!python/object/new:tests.test_integration.TestExclusionPerms [] perms_checker: !!python/object/new:tests.fixture.TestExclusionPerms []

45
tests/fixture.py Normal file
View File

@ -0,0 +1,45 @@
import os
import pytest
import yaml
from pywb.cdx.perms import AllowAllPerms
@pytest.fixture
def testconfig():
    """Load ``test_config.yaml`` and make sure it has an ``index_paths`` entry.

    The file name is relative, so it is resolved against the current
    working directory. When the loaded config has no ``index_paths`` key,
    it is pointed at ``../sample_archive/cdx`` relative to this module.

    :returns: the loaded configuration mapping.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    # NOTE(review): yaml.load can construct arbitrary Python objects; it is
    # kept as-is here on the assumption that the test config is trusted.
    with open('test_config.yaml') as fh:
        config = yaml.load(fh)
    assert config
    if 'index_paths' not in config:
        # !!! assumes this module is in a sub-directory of project root.
        config['index_paths'] = os.path.join(
            os.path.dirname(os.path.realpath(__file__)),
            '../sample_archive/cdx')
    return config
#================================================================
# Reporter callback for replay view
class PrintReporter:
    """Reporter callback for replay view.

    Calling an instance prints the request and the cdx object, each on its
    own line; the ``response`` argument is accepted but not used.
    """
    def __call__(self, wbrequest, cdx, response):
        # Parenthesized single-argument print produces the same output
        # under the Python 2 print statement and is valid Python 3 syntax.
        print(wbrequest)
        print(cdx)
#================================================================
class TestExclusionPerms(AllowAllPerms):
    """
    Perms-checker fixture that refuses lookups for exactly one urlkey.
    """
    # sample_archive has captures for this URLKEY
    URLKEY_EXCLUDED = 'org,iana)/_img/bookmark_icon.ico'

    def allow_url_lookup(self, urlkey, url):
        """
        Return False for the excluded urlkey; for any other urlkey return
        whatever the parent class decides.
        """
        if urlkey != self.URLKEY_EXCLUDED:
            return super(TestExclusionPerms, self).allow_url_lookup(urlkey, url)

        return False

View File

@ -2,14 +2,17 @@ import webtest
from pywb.bootstrap.pywb_init import pywb_config from pywb.bootstrap.pywb_init import pywb_config
from pywb.bootstrap.wbapp import create_wb_app from pywb.bootstrap.wbapp import create_wb_app
from pywb.cdx.cdxobject import CDXObject from pywb.cdx.cdxobject import CDXObject
from pywb.cdx.perms import AllowAllPerms
from fixture import TestExclusionPerms
class TestWb: class TestWb:
TEST_CONFIG = 'test_config.yaml' TEST_CONFIG = 'test_config.yaml'
def setup(self): def setup(self):
#self.app = pywb.wbapp.create_wb_app(pywb.pywb_init.pywb_config()) #self.app = pywb.wbapp.create_wb_app(pywb.pywb_init.pywb_config())
self.app = create_wb_app(pywb_config(self.TEST_CONFIG)) # save it in self - useful for debugging
self.router = pywb_config(self.TEST_CONFIG)
self.app = create_wb_app(self.router)
self.testapp = webtest.TestApp(self.app) self.testapp = webtest.TestApp(self.app)
def _assert_basic_html(self, resp): def _assert_basic_html(self, resp):
@ -207,24 +210,3 @@ class TestWb:
assert resp.status_int == 400 assert resp.status_int == 400
assert 'Invalid Url: http://?abc' in resp.body assert 'Invalid Url: http://?abc' in resp.body
#=================================================================
# Reporter callback for replay view
class PrintReporter:
def __call__(self, wbrequest, cdx, response):
print wbrequest
print cdx
#=================================================================
class TestExclusionPerms(AllowAllPerms):
"""
Sample Perm Checker with hard-coded exclusion
"""
def allow_url_lookup(self, urlkey, url):
"""
Return true/false if url or urlkey (canonicalized url)
should be allowed
"""
if urlkey == 'org,iana)/_img/bookmark_icon.ico':
return False
return super(TestExclusionPerms, self).allow_url_lookup(urlkey, url)

View File

@ -0,0 +1,189 @@
import os
import re
import pytest
from urllib import urlencode
from werkzeug.test import Client
from werkzeug.wrappers import BaseResponse, Response
import yaml
from pywb.cdx.cdxobject import CDXObject
from pywb.cdx.wsgi_cdxserver import create_app
from tests.fixture import testconfig
@pytest.fixture
def client(testconfig):
    """Test client wrapping an app built by ``create_app`` from ``testconfig``."""
    return Client(create_app(testconfig), Response)
# ================================================================
def query(client, url, **params):
    """GET ``/cdx`` with ``url`` plus any extra query parameters.

    Sequence-valued parameters are expanded into repeated keys
    (``doseq``), e.g. several ``filter=`` entries.

    :returns: whatever ``client.get`` returns for the built path.
    """
    params['url'] = url
    querystring = urlencode(params, doseq=1)
    return client.get('/cdx?' + querystring)
# ================================================================
def test_exact_url(client):
    """
    Plain exact-URL lookup, with no filters or other options, succeeds.
    """
    response = query(client, 'http://www.iana.org/')
    assert response.status_code == 200
    # Body is printed for inspection only; it is not asserted on.
    print(response.data)
def test_prefix_match(client):
    """
    A prefix query returns at least one capture whose urlkey is longer
    than the queried root key.
    """
    resp = query(client, 'http://www.iana.org/', matchType='prefix')
    lines = resp.data.splitlines()
    print(lines)
    assert resp.status_code == 200

    root_len = len('org,iana)/')
    # The first space-separated column of each line is the urlkey.
    suburls = sum(1 for line in lines
                  if len(line.split(' ')[0]) > root_len)
    assert suburls > 0
def test_filters(client):
    """
    Filtering on the mimetype and filename fields (exact match) returns
    only captures that satisfy both filters.
    """
    wanted_filters = ('mimetype:warc/revisit', 'filename:dupes.warc.gz')
    resp = query(client, 'http://www.iana.org/_css/2013.1/screen.css',
                 filter=wanted_filters)

    assert resp.status_code == 200
    assert resp.mimetype == 'text/plain'

    for line in resp.data.splitlines():
        columns = line.split(' ')
        assert columns[0] == 'org,iana)/_css/2013.1/screen.css'
        assert columns[3] == 'warc/revisit'
        assert columns[10] == 'dupes.warc.gz'
def test_limit(client):
    """
    ``limit=1`` yields exactly one capture; adding ``reverse=1`` yields a
    single, different capture (the expected timestamps differ).
    """
    target = 'http://www.iana.org/_css/2013.1/screen.css'

    def only_capture(**params):
        # Run the query, check the response envelope and return the
        # space-separated fields of the single expected line.
        resp = query(client, target, **params)
        assert resp.status_code == 200
        assert resp.mimetype == 'text/plain'
        cdxes = resp.data.splitlines()
        assert len(cdxes) == 1
        return cdxes[0].split(' ')

    fields = only_capture(limit='1')
    assert fields[0] == 'org,iana)/_css/2013.1/screen.css'
    assert fields[1] == '20140126200625'
    assert fields[3] == 'text/css'

    fields = only_capture(limit='1', reverse='1')
    assert fields[0] == 'org,iana)/_css/2013.1/screen.css'
    assert fields[1] == '20140127171239'
    assert fields[3] == 'warc/revisit'
def test_fields(client):
    """
    retrieve subset of fields with ``fields`` parameter.

    Each output line must contain exactly the three requested fields:
    the urlkey, a 14-digit timestamp, and a status code that is either
    three digits or ``-``.
    """
    resp = query(client, 'http://www.iana.org/_css/2013.1/print.css',
                 fields='urlkey,timestamp,statuscode')

    assert resp.status_code == 200

    cdxes = resp.data.splitlines()

    for cdx in cdxes:
        fields = cdx.split(' ')
        assert len(fields) == 3
        assert fields[0] == 'org,iana)/_css/2013.1/print.css'
        assert re.match(r'\d{14}$', fields[1])
        # Group and anchor the alternation: re.match only anchors at the
        # start, so the bare pattern would also accept values such as
        # '2000' or '-x'.
        assert re.match(r'(?:\d{3}|-)$', fields[2])
def test_fields_undefined(client):
    """
    server shall respond with Bad Request (TODO: with proper explanation),
    when ``fields`` parameter contains undefined name(s).
    """
    resp = query(client, 'http://www.iana.org/_css/2013.1/print.css',
                 fields='urlkey,nosuchfield')

    # This comparison was previously a bare expression, so the status was
    # never actually checked.
    # NOTE(review): if the server renders the body lazily, the error may
    # surface after the status line is sent -- confirm this passes.
    assert resp.status_code == 400
def test_resolveRevisits(client):
    """
    with ``resolveRevisits=true``, server adds three fields pointing to
    the *original* capture.
    """
    resp = query(client, 'http://www.iana.org/_css/2013.1/print.css',
                 resolveRevisits='true')

    assert resp.status_code == 200
    assert resp.mimetype == 'text/plain'

    # digest -> (size, offset, filename) of captures seen so far whose
    # orig.* columns are all '-'
    originals = {}
    for line in resp.data.splitlines():
        fields = line.split(' ')
        assert len(fields) == 14
        sha = fields[5]
        size, offset, fn, orig_size, orig_offset, orig_fn = fields[8:]

        # orig_* fields are either all '-' or (int, int, filename); in the
        # latter case they must equal the corresponding fields of the
        # earlier capture with the same digest.
        if orig_size != '-':
            expected = (int(orig_size), int(orig_offset), orig_fn)
            assert originals.get(sha) == expected
        else:
            assert orig_offset == '-' and orig_fn == '-'
            originals[sha] = (int(size), int(offset), fn)
def test_resolveRevisits_orig_fields(client):
    """
    when resolveRevisits=true, extra three fields are named
    ``orig.length``, ``orig.offset`` and ``orig.filename``, respectively.
    it is possible to filter fields by these names.
    """
    resp = query(client, 'http://www.iana.org/_css/2013.1/print.css',
                 resolveRevisits='1',
                 fields='urlkey,orig.length,orig.offset,orig.filename')

    assert resp.status_code == 200
    assert resp.mimetype == 'text/plain'

    for line in resp.data.splitlines():
        fields = line.split(' ')
        assert len(fields) == 4
        key, orig_len, orig_offset, orig_fn = fields

        if orig_len == '-' and orig_offset == '-' and orig_fn == '-':
            # All-dash placeholder: nothing more to check for this line.
            continue

        # Otherwise length and offset must parse as integers; a
        # ValueError here fails the test.
        int(orig_len)
        int(orig_offset)
def test_collapseTime_resolveRevisits_reverse(client):
    """
    Combining ``collapseTime``, ``resolveRevisits`` and ``reverse`` yields
    three captures whose timestamps never increase from one to the next.
    """
    resp = query(client, 'http://www.iana.org/_css/2013.1/print.css',
                 collapseTime='11',
                 resolveRevisits='true',
                 reverse='true')

    cdxes = [CDXObject(line) for line in resp.data.splitlines()]

    assert len(cdxes) == 3

    # timestamp is in descending order: compare each capture with the
    # one that follows it
    for current, following in zip(cdxes, cdxes[1:]):
        assert current['timestamp'] >= following['timestamp']