mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
Merge branch 'cdx-server' of git://github.com/kngenie/pywb into kngenie-cdx-server
Kengie's cdx server refactoring and wsgi improvements
This commit is contained in:
commit
af9cabdc72
@ -1,7 +1,7 @@
|
|||||||
import yaml
|
import yaml
|
||||||
import re
|
import re
|
||||||
import logging
|
import logging
|
||||||
import pkgutil
|
import pkg_resources
|
||||||
|
|
||||||
from pywb.utils.dsrules import BaseRule, RuleSet
|
from pywb.utils.dsrules import BaseRule, RuleSet
|
||||||
|
|
||||||
|
@ -71,12 +71,25 @@ class CDXObject(OrderedDict):
|
|||||||
# force regen on next __str__ call
|
# force regen on next __str__ call
|
||||||
self.cdxline = None
|
self.cdxline = None
|
||||||
|
|
||||||
|
def is_revisit(self):
|
||||||
|
return (self['mimetype'] == 'warc/revisit' or
|
||||||
|
self['filename'] == '-')
|
||||||
|
|
||||||
|
def to_text(self, fields=None):
|
||||||
|
"""
|
||||||
|
return plaintext CDX record (includes newline).
|
||||||
|
:param fields: list of field names to output.
|
||||||
|
"""
|
||||||
|
if fields is None:
|
||||||
|
return str(self) + '\n'
|
||||||
|
else:
|
||||||
|
return ' '.join(self[x] for x in fields) + '\n'
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
if self.cdxline:
|
if self.cdxline:
|
||||||
return self.cdxline
|
return self.cdxline
|
||||||
|
|
||||||
li = itertools.imap(lambda (n, val): val, self.items())
|
return ' '.join(val for n, val in self.iteritems())
|
||||||
return ' '.join(li)
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
@ -106,5 +119,12 @@ class IDXObject(OrderedDict):
|
|||||||
|
|
||||||
self.idxline = idxline
|
self.idxline = idxline
|
||||||
|
|
||||||
|
def to_text(self, fields=None):
|
||||||
|
"""
|
||||||
|
return plaintext IDX record (including newline).
|
||||||
|
:param fields: list of field names to output (currently ignored)
|
||||||
|
"""
|
||||||
|
return str(self) + '\n'
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return self.idxline
|
return self.idxline
|
||||||
|
@ -10,32 +10,39 @@ from collections import deque
|
|||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
def cdx_load(sources, params, perms_checker=None):
|
def cdx_load(sources, params, perms_checker=None, filter=True):
|
||||||
|
"""
|
||||||
|
merge text CDX lines from sources, return an iterator for
|
||||||
|
filtered and access-checked sequence of CDX objects.
|
||||||
|
|
||||||
|
:param sources: iterable for text CDX sources.
|
||||||
|
:param perms_checker: access check filter object implementing
|
||||||
|
allow_url_lookup(key, url), allow_capture(cdxobj) and
|
||||||
|
filter_fields(cdxobj) methods.
|
||||||
|
"""
|
||||||
|
cdx_iter = load_cdx_streams(sources, params)
|
||||||
|
cdx_iter = make_obj_iter(cdx_iter, params)
|
||||||
|
cdx_iter = filter_cdx(cdx_iter, params)
|
||||||
if perms_checker:
|
if perms_checker:
|
||||||
cdx_iter = cdx_load_with_perms(sources, params, perms_checker)
|
cdx_iter = restrict_cdx(cdx_iter, params, perms_checker)
|
||||||
else:
|
return cdx_iter
|
||||||
cdx_iter = cdx_load_and_filter(sources, params)
|
|
||||||
|
|
||||||
# output raw cdx objects
|
|
||||||
if params.get('output') == 'raw':
|
|
||||||
return cdx_iter
|
|
||||||
|
|
||||||
def write_cdx(fields):
|
|
||||||
for cdx in cdx_iter:
|
|
||||||
yield cdx_text_out(cdx, fields) + '\n'
|
|
||||||
|
|
||||||
return write_cdx(params.get('fields'))
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
def cdx_load_with_perms(sources, params, perms_checker):
|
def restrict_cdx(cdx_iter, params, perms_checker):
|
||||||
|
"""
|
||||||
|
filter out those cdx records that user doesn't have access to,
|
||||||
|
by consulting :param perms_checker:.
|
||||||
|
:param cdx_iter: cdx record source iterable
|
||||||
|
:param params: request parameters (dict)
|
||||||
|
:param perms_checker: object implementing permission checker
|
||||||
|
"""
|
||||||
if not perms_checker.allow_url_lookup(params['key'], params['url']):
|
if not perms_checker.allow_url_lookup(params['key'], params['url']):
|
||||||
if params.get('matchType', 'exact') == 'exact':
|
if params.get('matchType', 'exact') == 'exact':
|
||||||
raise AccessException('Excluded')
|
raise AccessException('Excluded')
|
||||||
|
|
||||||
cdx_iter = cdx_load_and_filter(sources, params)
|
|
||||||
|
|
||||||
for cdx in cdx_iter:
|
for cdx in cdx_iter:
|
||||||
|
# TODO: we could let filter_fields handle this case by accepting
|
||||||
|
# None as a return value.
|
||||||
if not perms_checker.allow_capture(cdx):
|
if not perms_checker.allow_capture(cdx):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@ -43,21 +50,8 @@ def cdx_load_with_perms(sources, params, perms_checker):
|
|||||||
|
|
||||||
yield cdx
|
yield cdx
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
def cdx_text_out(cdx, fields):
|
def filter_cdx(cdx_iter, params):
|
||||||
if not fields:
|
|
||||||
return str(cdx)
|
|
||||||
else:
|
|
||||||
return ' '.join(map(lambda x: cdx[x], fields.split(',')))
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
def cdx_load_and_filter(sources, params):
|
|
||||||
cdx_iter = load_cdx_streams(sources, params)
|
|
||||||
|
|
||||||
cdx_iter = make_obj_iter(cdx_iter, params)
|
|
||||||
|
|
||||||
if params.get('proxyAll'):
|
if params.get('proxyAll'):
|
||||||
return cdx_iter
|
return cdx_iter
|
||||||
|
|
||||||
@ -110,7 +104,7 @@ def make_obj_iter(text_iter, params):
|
|||||||
else:
|
else:
|
||||||
cls = CDXObject
|
cls = CDXObject
|
||||||
|
|
||||||
return itertools.imap(lambda line: cls(line), text_iter)
|
return (cls(line) for line in text_iter)
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
@ -257,8 +251,8 @@ def cdx_resolve_revisits(cdx_iter):
|
|||||||
originals = {}
|
originals = {}
|
||||||
|
|
||||||
for cdx in cdx_iter:
|
for cdx in cdx_iter:
|
||||||
is_revisit = ((cdx['mimetype'] == 'warc/revisit') or
|
|
||||||
(cdx['filename'] == '-'))
|
is_revisit = cdx.is_revisit()
|
||||||
|
|
||||||
digest = cdx['digest']
|
digest = cdx['digest']
|
||||||
|
|
||||||
|
@ -84,7 +84,10 @@ class CDXServer(BaseCDXServer):
|
|||||||
|
|
||||||
def __init__(self, paths, **kwargs):
|
def __init__(self, paths, **kwargs):
|
||||||
super(CDXServer, self).__init__(**kwargs)
|
super(CDXServer, self).__init__(**kwargs)
|
||||||
self.sources = create_cdx_sources(paths, kwargs.get('config'))
|
# TODO: we could save config in member, so that other
|
||||||
|
# methods can use it. it's bad for add_cdx_source to take
|
||||||
|
# config argument.
|
||||||
|
self._create_cdx_sources(paths, kwargs.get('config'))
|
||||||
|
|
||||||
def load_cdx(self, **params):
|
def load_cdx(self, **params):
|
||||||
# if key not set, assume 'url' is set and needs canonicalization
|
# if key not set, assume 'url' is set and needs canonicalization
|
||||||
@ -103,10 +106,64 @@ class CDXServer(BaseCDXServer):
|
|||||||
params['key'] = key
|
params['key'] = key
|
||||||
params['end_key'] = end_key
|
params['end_key'] = end_key
|
||||||
|
|
||||||
cdx_iter = cdx_load(self.sources, params, self.perms_checker)
|
cdx_iter = cdx_load(self.sources, params,
|
||||||
|
perms_checker=self.perms_checker)
|
||||||
return self._check_cdx_iter(cdx_iter, params)
|
return self._check_cdx_iter(cdx_iter, params)
|
||||||
|
|
||||||
|
def _create_cdx_sources(self, paths, config):
|
||||||
|
"""
|
||||||
|
build CDXSource instances for each of path in :param paths:.
|
||||||
|
:param paths: list of sources or single source.
|
||||||
|
each source may be either string or CDXSource instance. value
|
||||||
|
of any other types will be silently ignored.
|
||||||
|
:param config: config object passed to :method:`add_cdx_source`.
|
||||||
|
"""
|
||||||
|
self.sources = []
|
||||||
|
|
||||||
|
if paths is not None:
|
||||||
|
if not isinstance(paths, (list, tuple)):
|
||||||
|
paths = [paths]
|
||||||
|
|
||||||
|
for path in paths:
|
||||||
|
self.add_cdx_source(path, config)
|
||||||
|
|
||||||
|
if len(self.sources) == 0:
|
||||||
|
logging.warn('No CDX Sources configured from paths=%s', paths)
|
||||||
|
|
||||||
|
def _add_cdx_source(self, source):
|
||||||
|
if source is None: return
|
||||||
|
logging.debug('Adding CDX Source: %s', source)
|
||||||
|
self.sources.append(source)
|
||||||
|
|
||||||
|
def add_cdx_source(self, source, config):
|
||||||
|
if source is None: return
|
||||||
|
if isinstance(source, CDXSource):
|
||||||
|
self._add_cdx_source(source)
|
||||||
|
elif isinstance(source, str):
|
||||||
|
if os.path.isdir(source):
|
||||||
|
for fn in os.listdir(source):
|
||||||
|
self._add_cdx_source(self._create_cdx_source(
|
||||||
|
os.path.join(source, fn), config))
|
||||||
|
else:
|
||||||
|
self._add_cdx_source(self._create_cdx_source(
|
||||||
|
source, config))
|
||||||
|
|
||||||
|
def _create_cdx_source(self, filename, config):
|
||||||
|
if is_http(filename):
|
||||||
|
return RemoteCDXSource(filename)
|
||||||
|
|
||||||
|
if filename.startswith('redis://'):
|
||||||
|
return RedisCDXSource(filename, config)
|
||||||
|
|
||||||
|
if filename.endswith('.cdx'):
|
||||||
|
return CDXFile(filename)
|
||||||
|
|
||||||
|
if filename.endswith(('.summary', '.idx')):
|
||||||
|
return ZipNumCluster(filename, config)
|
||||||
|
|
||||||
|
logging.warn('skipping unrecognized URI:%s', filename)
|
||||||
|
return None
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return 'CDX server serving from ' + str(self.sources)
|
return 'CDX server serving from ' + str(self.sources)
|
||||||
|
|
||||||
@ -130,12 +187,7 @@ class RemoteCDXServer(BaseCDXServer):
|
|||||||
raise Exception('Invalid remote cdx source: ' + str(source))
|
raise Exception('Invalid remote cdx source: ' + str(source))
|
||||||
|
|
||||||
def load_cdx(self, **params):
|
def load_cdx(self, **params):
|
||||||
remote_iter = self.source.load_cdx(params)
|
remote_iter = cdx_load((self.sources,), params, filter=False)
|
||||||
|
|
||||||
# if need raw, convert to raw format here
|
|
||||||
if params.get('output') == 'raw':
|
|
||||||
remote_iter = (CDXObject(cdx) for cdx in remote_iter)
|
|
||||||
|
|
||||||
return self._check_cdx_iter(remote_iter, params)
|
return self._check_cdx_iter(remote_iter, params)
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
@ -168,58 +220,6 @@ def create_cdx_server(config, ds_rules_file=None):
|
|||||||
ds_rules_file=ds_rules_file,
|
ds_rules_file=ds_rules_file,
|
||||||
perms_checker=perms_checker)
|
perms_checker=perms_checker)
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
def create_cdx_sources(paths, config=None):
|
|
||||||
sources = []
|
|
||||||
|
|
||||||
if not isinstance(paths, list):
|
|
||||||
paths = [paths]
|
|
||||||
|
|
||||||
for path in paths:
|
|
||||||
if isinstance(path, CDXSource):
|
|
||||||
add_cdx_source(sources, path, config)
|
|
||||||
elif isinstance(path, str):
|
|
||||||
if os.path.isdir(path):
|
|
||||||
for file in os.listdir(path):
|
|
||||||
add_cdx_source(sources, path + file, config)
|
|
||||||
else:
|
|
||||||
add_cdx_source(sources, path, config)
|
|
||||||
|
|
||||||
if len(sources) == 0:
|
|
||||||
logging.exception('No CDX Sources Found from: ' + str(sources))
|
|
||||||
|
|
||||||
return sources
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
def add_cdx_source(sources, source, config):
|
|
||||||
if not isinstance(source, CDXSource):
|
|
||||||
source = create_cdx_source(source, config)
|
|
||||||
if not source:
|
|
||||||
return
|
|
||||||
|
|
||||||
logging.debug('Adding CDX Source: ' + str(source))
|
|
||||||
sources.append(source)
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
def create_cdx_source(filename, config):
|
|
||||||
if is_http(filename):
|
|
||||||
return RemoteCDXSource(filename)
|
|
||||||
|
|
||||||
if filename.startswith('redis://'):
|
|
||||||
return RedisCDXSource(filename, config)
|
|
||||||
|
|
||||||
if filename.endswith('.cdx'):
|
|
||||||
return CDXFile(filename)
|
|
||||||
|
|
||||||
if filename.endswith(('.summary', '.idx')):
|
|
||||||
return ZipNumCluster(filename, config)
|
|
||||||
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
def extract_params_from_wsgi_env(env):
|
def extract_params_from_wsgi_env(env):
|
||||||
""" utility function to extract params from the query
|
""" utility function to extract params from the query
|
||||||
|
@ -167,22 +167,42 @@ import pprint
|
|||||||
from pywb import get_test_dir
|
from pywb import get_test_dir
|
||||||
#test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample_data/'
|
#test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample_data/'
|
||||||
test_cdx_dir = get_test_dir() + 'cdx/'
|
test_cdx_dir = get_test_dir() + 'cdx/'
|
||||||
|
from pywb.cdx.cdxobject import AccessException
|
||||||
|
|
||||||
|
from tests.fixture import testconfig, TestExclusionPerms
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams):
|
def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams):
|
||||||
kwparams['url'] = url
|
kwparams['url'] = url
|
||||||
kwparams['output'] = 'text'
|
fields = kwparams.get('fields')
|
||||||
|
if fields:
|
||||||
|
fields = fields.split(',')
|
||||||
|
|
||||||
server = CDXServer(sources)
|
server = CDXServer(sources)
|
||||||
results = server.load_cdx(**kwparams)
|
results = server.load_cdx(**kwparams)
|
||||||
|
|
||||||
for x in results:
|
for x in results:
|
||||||
x = x.replace('\t', ' ')
|
l = x.to_text(fields).replace('\t', ' ')
|
||||||
sys.stdout.write(x)
|
sys.stdout.write(l)
|
||||||
|
|
||||||
|
#================================================================
|
||||||
|
|
||||||
|
def test_excluded(testconfig):
|
||||||
|
testconfig['perms_checker'] = TestExclusionPerms()
|
||||||
|
sources = testconfig.get('index_paths')
|
||||||
|
print sources
|
||||||
|
server = CDXServer(sources, perms_checker=testconfig['perms_checker'])
|
||||||
|
assert isinstance(server, CDXServer)
|
||||||
|
assert server.perms_checker
|
||||||
|
|
||||||
|
url = 'http://www.iana.org/_img/bookmark_icon.ico'
|
||||||
|
key = 'org,iana)/_img/bookmark_icon.ico'
|
||||||
|
with pytest.raises(AccessException):
|
||||||
|
cdxobjs = list(server.load_cdx(url=url))
|
||||||
|
print cdxobjs
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
import doctest
|
import doctest
|
||||||
doctest.testmod()
|
doctest.testmod()
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,10 +1,10 @@
|
|||||||
import webtest
|
import webtest
|
||||||
from pywb.cdx.wsgi_cdxserver import main
|
from pywb.cdx.wsgi_cdxserver import create_app
|
||||||
from pywb import get_test_dir
|
from pywb import get_test_dir
|
||||||
|
|
||||||
class TestCdx:
|
class TestCdx:
|
||||||
def setup(self):
|
def setup(self):
|
||||||
self.app = main(get_test_dir() + 'cdx/')
|
self.app = create_app(get_test_dir() + 'cdx/')
|
||||||
self.testapp = webtest.TestApp(self.app)
|
self.testapp = webtest.TestApp(self.app)
|
||||||
|
|
||||||
def test_cdx(self):
|
def test_cdx(self):
|
||||||
|
@ -1,10 +1,13 @@
|
|||||||
from cdxserver import create_cdx_server, extract_params_from_wsgi_env
|
from werkzeug.wrappers import BaseRequest, BaseResponse
|
||||||
|
from cdxserver import create_cdx_server
|
||||||
from pywb import get_test_dir
|
from pywb import get_test_dir
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import yaml
|
import yaml
|
||||||
import pkgutil
|
import pkg_resources
|
||||||
|
|
||||||
|
import cdxops
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
CONFIG_FILE = 'config.yaml'
|
CONFIG_FILE = 'config.yaml'
|
||||||
@ -13,65 +16,116 @@ RULES_FILE = 'rules.yaml'
|
|||||||
|
|
||||||
DEFAULT_PORT = 8080
|
DEFAULT_PORT = 8080
|
||||||
|
|
||||||
config = None
|
|
||||||
if __package__:
|
|
||||||
try:
|
|
||||||
config = pkgutil.get_data(__package__, CONFIG_FILE)
|
|
||||||
config = yaml.load(config)
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
def main(paths=None):
|
|
||||||
|
class CDXQueryRequest(BaseRequest):
|
||||||
|
def __init__(self, environ):
|
||||||
|
super(CDXQueryRequest, self).__init__(environ)
|
||||||
|
|
||||||
|
def _get_bool(self, name):
|
||||||
|
v = self.args.get(name)
|
||||||
|
if v:
|
||||||
|
try:
|
||||||
|
v = int(s)
|
||||||
|
except ValueError as ex:
|
||||||
|
v = (s.lower() == 'true')
|
||||||
|
return bool(v)
|
||||||
|
@property
|
||||||
|
def output(self):
|
||||||
|
return self.args.get('output', 'text')
|
||||||
|
@property
|
||||||
|
def filter(self):
|
||||||
|
return self.args.getlist('filter', [])
|
||||||
|
@property
|
||||||
|
def fields(self):
|
||||||
|
v = self.args.get('fields')
|
||||||
|
return v.split(',') if v else None
|
||||||
|
@property
|
||||||
|
def reverse(self):
|
||||||
|
# sort=reverse overrides reverse=0
|
||||||
|
return (self._get_bool('reverse') or
|
||||||
|
self.args.get('sort') == 'reverse')
|
||||||
|
@property
|
||||||
|
def params(self):
|
||||||
|
return dict(t if t[0] == 'filter' else (t[0], t[1][0])
|
||||||
|
for t in self.args.iterlists())
|
||||||
|
|
||||||
|
class WSGICDXServer(object):
|
||||||
|
def __init__(self, config, rules_file):
|
||||||
|
self.cdxserver = create_cdx_server(config, rules_file)
|
||||||
|
|
||||||
|
def __call__(self, environ, start_response):
|
||||||
|
request = CDXQueryRequest(environ)
|
||||||
|
try:
|
||||||
|
logging.debug('request.args=%s', request.params)
|
||||||
|
result = self.cdxserver.load_cdx(**request.params)
|
||||||
|
|
||||||
|
# TODO: select response type by "output" parameter
|
||||||
|
response = PlainTextResponse(result, request.fields)
|
||||||
|
return response(environ, start_response)
|
||||||
|
except Exception as exc:
|
||||||
|
logging.error('load_cdx failed', exc_info=1)
|
||||||
|
# TODO: error response should be different for each response
|
||||||
|
# type
|
||||||
|
start_response('400 Error', [('Content-Type', 'text/plain')])
|
||||||
|
return [str(exc)]
|
||||||
|
|
||||||
|
def cdx_text_out(cdx, fields):
|
||||||
|
if not fields:
|
||||||
|
return str(cdx) + '\n'
|
||||||
|
else:
|
||||||
|
logging.info('cdx fields=%s', cdx.keys())
|
||||||
|
# TODO: this will results in an exception if fields contain
|
||||||
|
# non-existent field name.
|
||||||
|
return ' '.join(cdx[x] for x in fields) + '\n'
|
||||||
|
|
||||||
|
class PlainTextResponse(BaseResponse):
|
||||||
|
def __init__(self, cdxitr, fields, status=200, content_type='text/plain'):
|
||||||
|
super(PlainTextResponse, self).__init__(
|
||||||
|
response=(
|
||||||
|
cdx.to_text(fields) for cdx in cdxitr
|
||||||
|
),
|
||||||
|
status=status, content_type=content_type)
|
||||||
|
|
||||||
|
# class JsonResponse(Response):
|
||||||
|
# pass
|
||||||
|
# class MementoResponse(Response):
|
||||||
|
# pass
|
||||||
|
|
||||||
|
def create_app(config=None):
|
||||||
logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s',
|
logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s',
|
||||||
level=logging.DEBUG)
|
level=logging.DEBUG)
|
||||||
|
|
||||||
if not paths:
|
if not config:
|
||||||
if config:
|
index_paths = get_test_dir() + 'cdx/'
|
||||||
paths = config
|
config = dict(index_paths=index_paths)
|
||||||
else:
|
|
||||||
paths = get_test_dir() + 'cdx/'
|
|
||||||
|
|
||||||
cdxserver = create_cdx_server(paths, RULES_FILE)
|
|
||||||
|
|
||||||
def application(env, start_response):
|
|
||||||
try:
|
|
||||||
params = extract_params_from_wsgi_env(env)
|
|
||||||
response = cdxserver.load_cdx(**params)
|
|
||||||
|
|
||||||
start_response('200 OK', [('Content-Type', 'text/plain')])
|
|
||||||
|
|
||||||
except Exception as exc:
|
|
||||||
import traceback
|
|
||||||
err_details = traceback.format_exc(exc)
|
|
||||||
start_response('400 Error', [('Content-Type', 'text/plain')])
|
|
||||||
response = [str(exc)]
|
|
||||||
print err_details
|
|
||||||
|
|
||||||
return response
|
|
||||||
|
|
||||||
return application
|
|
||||||
|
|
||||||
|
return WSGICDXServer(config, RULES_FILE)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
from wsgiref.simple_server import make_server
|
from optparse import OptionParser
|
||||||
|
from werkzeug.serving import run_simple
|
||||||
|
|
||||||
app = main()
|
opt = OptionParser('%prog [OPTIONS]')
|
||||||
|
opt.add_option('-p', '--port', type='int', default=None)
|
||||||
|
|
||||||
port = DEFAULT_PORT
|
options, args = opt.parse_args()
|
||||||
if config:
|
|
||||||
port = config.get('port', DEFAULT_PORT)
|
|
||||||
|
|
||||||
httpd = make_server('', port, app)
|
configdata = pkg_resources.resource_string(__name__, CONFIG_FILE)
|
||||||
|
config = yaml.load(configdata)
|
||||||
|
|
||||||
logging.debug('Starting CDX Server on port ' + str(port))
|
port = options.port
|
||||||
|
if port is None:
|
||||||
|
port = (config and config.get('port')) or DEFAULT_PORT
|
||||||
|
|
||||||
|
app = create_app(config)
|
||||||
|
|
||||||
|
logging.debug('Starting CDX Server on port %s', port)
|
||||||
try:
|
try:
|
||||||
httpd.serve_forever()
|
run_simple('0.0.0.0', port, app, use_reloader=True, use_debugger=True)
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt as ex:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
logging.debug('Stopping CDX Server')
|
logging.debug('Stopping CDX Server')
|
||||||
else:
|
else:
|
||||||
application = main()
|
# XXX pass production config
|
||||||
|
application = create_app()
|
||||||
|
@ -30,7 +30,7 @@ class IndexReader(object):
|
|||||||
|
|
||||||
params['allowFuzzy'] = True
|
params['allowFuzzy'] = True
|
||||||
|
|
||||||
cdxlines = self.load_cdx(url=wburl.url, output='raw', **params)
|
cdxlines = self.load_cdx(url=wburl.url, **params)
|
||||||
|
|
||||||
return cdxlines
|
return cdxlines
|
||||||
|
|
||||||
|
2
setup.py
2
setup.py
@ -35,6 +35,8 @@ setup(
|
|||||||
'pyyaml',
|
'pyyaml',
|
||||||
'WebTest',
|
'WebTest',
|
||||||
'pytest',
|
'pytest',
|
||||||
|
'werkzeug>=0.9.4',
|
||||||
|
'setuptools',
|
||||||
],
|
],
|
||||||
# tests_require=['WebTest', 'pytest'],
|
# tests_require=['WebTest', 'pytest'],
|
||||||
zip_safe=False
|
zip_safe=False
|
||||||
|
@ -92,10 +92,10 @@ enable_cdx_api: true
|
|||||||
|
|
||||||
# optional reporter callback func
|
# optional reporter callback func
|
||||||
# if set, called with request and cdx object
|
# if set, called with request and cdx object
|
||||||
reporter: !!python/object/new:tests.test_integration.PrintReporter []
|
reporter: !!python/object/new:tests.fixture.PrintReporter []
|
||||||
|
|
||||||
# custom rules for domain specific matching
|
# custom rules for domain specific matching
|
||||||
#domain_specific_rules: rules.yaml
|
#domain_specific_rules: rules.yaml
|
||||||
|
|
||||||
#perms_checker: !!python/object/new:pywb.cdx.perms.AllowAllPerms []
|
#perms_checker: !!python/object/new:pywb.cdx.perms.AllowAllPerms []
|
||||||
perms_checker: !!python/object/new:tests.test_integration.TestExclusionPerms []
|
perms_checker: !!python/object/new:tests.fixture.TestExclusionPerms []
|
||||||
|
45
tests/fixture.py
Normal file
45
tests/fixture.py
Normal file
@ -0,0 +1,45 @@
|
|||||||
|
import os
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
from pywb.cdx.perms import AllowAllPerms
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def testconfig():
|
||||||
|
config = yaml.load(open('test_config.yaml'))
|
||||||
|
assert config
|
||||||
|
if 'index_paths' not in config:
|
||||||
|
# !!! assumes this module is in a sub-directory of project root.
|
||||||
|
config['index_paths'] = os.path.join(
|
||||||
|
os.path.dirname(os.path.realpath(__file__)),
|
||||||
|
'../sample_archive/cdx')
|
||||||
|
return config
|
||||||
|
|
||||||
|
#================================================================
|
||||||
|
# Reporter callback for replay view
|
||||||
|
class PrintReporter:
|
||||||
|
"""Reporter callback for replay view.
|
||||||
|
"""
|
||||||
|
def __call__(self, wbrequest, cdx, response):
|
||||||
|
print wbrequest
|
||||||
|
print cdx
|
||||||
|
pass
|
||||||
|
|
||||||
|
#================================================================
|
||||||
|
class TestExclusionPerms(AllowAllPerms):
|
||||||
|
"""
|
||||||
|
Perm Checker fixture which can block one URL.
|
||||||
|
"""
|
||||||
|
# sample_archive has captures for this URLKEY
|
||||||
|
URLKEY_EXCLUDED = 'org,iana)/_img/bookmark_icon.ico'
|
||||||
|
|
||||||
|
def allow_url_lookup(self, urlkey, url):
|
||||||
|
"""
|
||||||
|
Return true/false if url or urlkey (canonicalized url)
|
||||||
|
should be allowed
|
||||||
|
"""
|
||||||
|
if urlkey == self.URLKEY_EXCLUDED:
|
||||||
|
return False
|
||||||
|
|
||||||
|
return super(TestExclusionPerms, self).allow_url_lookup(urlkey, url)
|
@ -9,7 +9,9 @@ class TestWb:
|
|||||||
|
|
||||||
def setup(self):
|
def setup(self):
|
||||||
#self.app = pywb.wbapp.create_wb_app(pywb.pywb_init.pywb_config())
|
#self.app = pywb.wbapp.create_wb_app(pywb.pywb_init.pywb_config())
|
||||||
self.app = create_wb_app(pywb_config(self.TEST_CONFIG))
|
# save it in self - useful for debugging
|
||||||
|
self.router = pywb_config(self.TEST_CONFIG)
|
||||||
|
self.app = create_wb_app(self.router)
|
||||||
self.testapp = webtest.TestApp(self.app)
|
self.testapp = webtest.TestApp(self.app)
|
||||||
|
|
||||||
def _assert_basic_html(self, resp):
|
def _assert_basic_html(self, resp):
|
||||||
@ -206,25 +208,3 @@ class TestWb:
|
|||||||
resp = self.testapp.get('/pywb/?abc', status = 400)
|
resp = self.testapp.get('/pywb/?abc', status = 400)
|
||||||
assert resp.status_int == 400
|
assert resp.status_int == 400
|
||||||
assert 'Invalid Url: http://?abc' in resp.body
|
assert 'Invalid Url: http://?abc' in resp.body
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
# Reporter callback for replay view
|
|
||||||
class PrintReporter:
|
|
||||||
def __call__(self, wbrequest, cdx, response):
|
|
||||||
print wbrequest
|
|
||||||
print cdx
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
class TestExclusionPerms(AllowAllPerms):
|
|
||||||
"""
|
|
||||||
Sample Perm Checker with hard-coded exclusion
|
|
||||||
"""
|
|
||||||
def allow_url_lookup(self, urlkey, url):
|
|
||||||
"""
|
|
||||||
Return true/false if url or urlkey (canonicalized url)
|
|
||||||
should be allowed
|
|
||||||
"""
|
|
||||||
if urlkey == 'org,iana)/_img/bookmark_icon.ico':
|
|
||||||
return False
|
|
||||||
|
|
||||||
return super(TestExclusionPerms, self).allow_url_lookup(urlkey, url)
|
|
||||||
|
189
tests/test_wsgi_cdxserver.py
Normal file
189
tests/test_wsgi_cdxserver.py
Normal file
@ -0,0 +1,189 @@
|
|||||||
|
import os
|
||||||
|
import re
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from urllib import urlencode
|
||||||
|
|
||||||
|
from werkzeug.test import Client
|
||||||
|
from werkzeug.wrappers import BaseResponse, Response
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
from pywb.cdx.cdxobject import CDXObject
|
||||||
|
from pywb.cdx.wsgi_cdxserver import create_app
|
||||||
|
|
||||||
|
from tests.fixture import testconfig
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def client(testconfig):
|
||||||
|
app = create_app(testconfig)
|
||||||
|
return Client(app, Response)
|
||||||
|
|
||||||
|
# ================================================================
|
||||||
|
|
||||||
|
def query(client, url, **params):
|
||||||
|
params['url'] = url
|
||||||
|
return client.get('/cdx?' + urlencode(params, doseq=1))
|
||||||
|
|
||||||
|
# ================================================================
|
||||||
|
|
||||||
|
def test_exact_url(client):
|
||||||
|
"""
|
||||||
|
basic exact match, no filters, etc.
|
||||||
|
"""
|
||||||
|
resp = query(client, 'http://www.iana.org/')
|
||||||
|
|
||||||
|
assert resp.status_code == 200
|
||||||
|
print resp.data
|
||||||
|
|
||||||
|
def test_prefix_match(client):
|
||||||
|
"""
|
||||||
|
prefix match test
|
||||||
|
"""
|
||||||
|
resp = query(client, 'http://www.iana.org/', matchType='prefix')
|
||||||
|
|
||||||
|
print resp.data.splitlines()
|
||||||
|
assert resp.status_code == 200
|
||||||
|
|
||||||
|
suburls = 0
|
||||||
|
for l in resp.data.splitlines():
|
||||||
|
fields = l.split(' ')
|
||||||
|
if len(fields[0]) > len('org,iana)/'):
|
||||||
|
suburls += 1
|
||||||
|
assert suburls > 0
|
||||||
|
|
||||||
|
def test_filters(client):
|
||||||
|
"""
|
||||||
|
filter cdxes by mimetype and filename field, exact match.
|
||||||
|
"""
|
||||||
|
resp = query(client, 'http://www.iana.org/_css/2013.1/screen.css',
|
||||||
|
filter=('mimetype:warc/revisit', 'filename:dupes.warc.gz'))
|
||||||
|
|
||||||
|
assert resp.status_code == 200
|
||||||
|
assert resp.mimetype == 'text/plain'
|
||||||
|
|
||||||
|
for l in resp.data.splitlines():
|
||||||
|
fields = l.split(' ')
|
||||||
|
assert fields[0] == 'org,iana)/_css/2013.1/screen.css'
|
||||||
|
assert fields[3] == 'warc/revisit'
|
||||||
|
assert fields[10] == 'dupes.warc.gz'
|
||||||
|
|
||||||
|
def test_limit(client):
|
||||||
|
resp = query(client, 'http://www.iana.org/_css/2013.1/screen.css',
|
||||||
|
limit='1')
|
||||||
|
|
||||||
|
assert resp.status_code == 200
|
||||||
|
assert resp.mimetype == 'text/plain'
|
||||||
|
|
||||||
|
cdxes = resp.data.splitlines()
|
||||||
|
assert len(cdxes) == 1
|
||||||
|
fields = cdxes[0].split(' ')
|
||||||
|
assert fields[0] == 'org,iana)/_css/2013.1/screen.css'
|
||||||
|
assert fields[1] == '20140126200625'
|
||||||
|
assert fields[3] == 'text/css'
|
||||||
|
|
||||||
|
resp = query(client, 'http://www.iana.org/_css/2013.1/screen.css',
|
||||||
|
limit='1', reverse='1')
|
||||||
|
|
||||||
|
assert resp.status_code == 200
|
||||||
|
assert resp.mimetype == 'text/plain'
|
||||||
|
|
||||||
|
cdxes = resp.data.splitlines()
|
||||||
|
assert len(cdxes) == 1
|
||||||
|
fields = cdxes[0].split(' ')
|
||||||
|
assert fields[0] == 'org,iana)/_css/2013.1/screen.css'
|
||||||
|
assert fields[1] == '20140127171239'
|
||||||
|
assert fields[3] == 'warc/revisit'
|
||||||
|
|
||||||
|
def test_fields(client):
|
||||||
|
"""
|
||||||
|
retrieve subset of fields with ``fields`` parameter.
|
||||||
|
"""
|
||||||
|
resp = query(client, 'http://www.iana.org/_css/2013.1/print.css',
|
||||||
|
fields='urlkey,timestamp,statuscode')
|
||||||
|
|
||||||
|
assert resp.status_code == 200
|
||||||
|
|
||||||
|
cdxes = resp.data.splitlines()
|
||||||
|
|
||||||
|
for cdx in cdxes:
|
||||||
|
fields = cdx.split(' ')
|
||||||
|
assert len(fields) == 3
|
||||||
|
assert fields[0] == 'org,iana)/_css/2013.1/print.css'
|
||||||
|
assert re.match(r'\d{14}$', fields[1])
|
||||||
|
assert re.match(r'\d{3}|-', fields[2])
|
||||||
|
|
||||||
|
def test_fields_undefined(client):
|
||||||
|
"""
|
||||||
|
server shall respond with Bad Request (TODO: with proper explanation),
|
||||||
|
when ``fields`` parameter contains undefined name(s).
|
||||||
|
"""
|
||||||
|
resp = query(client, 'http://www.iana.org/_css/2013.1/print.css',
|
||||||
|
fields='urlkey,nosuchfield')
|
||||||
|
|
||||||
|
resp.status_code == 400
|
||||||
|
|
||||||
|
def test_resolveRevisits(client):
|
||||||
|
"""
|
||||||
|
with ``resolveRevisits=true``, server adds three fields pointing to
|
||||||
|
the *original* capture.
|
||||||
|
"""
|
||||||
|
resp = query(client, 'http://www.iana.org/_css/2013.1/print.css',
|
||||||
|
resolveRevisits='true'
|
||||||
|
)
|
||||||
|
assert resp.status_code == 200
|
||||||
|
assert resp.mimetype == 'text/plain'
|
||||||
|
|
||||||
|
cdxes = resp.data.splitlines()
|
||||||
|
originals = {}
|
||||||
|
for cdx in cdxes:
|
||||||
|
fields = cdx.split(' ')
|
||||||
|
assert len(fields) == 14
|
||||||
|
(key, ts, url, mt, st, sha, _, _, size, offset, fn,
|
||||||
|
orig_size, orig_offset, orig_fn) = fields
|
||||||
|
# orig_* fields are either all '-' or (int, int, filename)
|
||||||
|
# check if orig_* fields are equals to corresponding fields
|
||||||
|
# for the original capture.
|
||||||
|
if orig_size == '-':
|
||||||
|
assert orig_offset == '-' and orig_fn == '-'
|
||||||
|
originals[sha] = (int(size), int(offset), fn)
|
||||||
|
else:
|
||||||
|
orig = originals.get(sha)
|
||||||
|
assert orig == (int(orig_size), int(orig_offset), orig_fn)
|
||||||
|
|
||||||
|
def test_resolveRevisits_orig_fields(client):
|
||||||
|
"""
|
||||||
|
when resolveRevisits=true, extra three fields are named
|
||||||
|
``orig.length``, ``orig.offset`` and ``orig.filename``, respectively.
|
||||||
|
it is possible to filter fields by these names.
|
||||||
|
"""
|
||||||
|
resp = query(client, 'http://www.iana.org/_css/2013.1/print.css',
|
||||||
|
resolveRevisits='1',
|
||||||
|
fields='urlkey,orig.length,orig.offset,orig.filename'
|
||||||
|
)
|
||||||
|
assert resp.status_code == 200
|
||||||
|
assert resp.mimetype == 'text/plain'
|
||||||
|
|
||||||
|
cdxes = resp.data.splitlines()
|
||||||
|
for cdx in cdxes:
|
||||||
|
fields = cdx.split(' ')
|
||||||
|
assert len(fields) == 4
|
||||||
|
key, orig_len, orig_offset, orig_fn = fields
|
||||||
|
assert (orig_len == '-' and orig_offset == '-' and orig_fn == '-' or
|
||||||
|
(int(orig_len), int(orig_offset), orig_fn))
|
||||||
|
|
||||||
|
def test_collapseTime_resolveRevisits_reverse(client):
|
||||||
|
resp = query(client, 'http://www.iana.org/_css/2013.1/print.css',
|
||||||
|
collapseTime='11',
|
||||||
|
resolveRevisits='true',
|
||||||
|
reverse='true'
|
||||||
|
)
|
||||||
|
|
||||||
|
cdxes = [CDXObject(l) for l in resp.data.splitlines()]
|
||||||
|
|
||||||
|
assert len(cdxes) == 3
|
||||||
|
|
||||||
|
# timestamp is in descending order
|
||||||
|
for i in range(len(cdxes) - 1):
|
||||||
|
assert cdxes[i]['timestamp'] >= cdxes[i + 1]['timestamp']
|
||||||
|
|
Loading…
x
Reference in New Issue
Block a user