1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

add cdx_server app!

port wsgi cdx server tests to test new app!
move base handlers to basehandlers in framework pkg
(remove werkzeug dependency)
This commit is contained in:
Ilya Kreymer 2014-03-02 23:41:44 -08:00
parent f0a0976038
commit 0bf651c2e3
17 changed files with 147 additions and 190 deletions

30
pywb/apps/cdx_server.py Normal file
View File

@ -0,0 +1,30 @@
from pywb.cdx.cdxserver import create_cdx_server
from pywb.framework.wsgi_wrappers import init_app, start_wsgi_server
from pywb.framework.archivalrouter import ArchivalRouter, Route
from pywb.core.handlers import CDXHandler
DEFAULT_RULES = 'pywb/rules.yaml'
# cdx-server only config
DEFAULT_CONFIG = 'pywb/cdx/config.yaml'
#=================================================================
# create simple cdx server under '/cdx' using config file
# TODO: support multiple collections like full wayback?
def create_cdx_server_app(config):
    """
    Assemble the standalone cdx server application.

    Creates a cdx server from ``config`` together with the rules file
    named by DEFAULT_RULES, wraps it in a CDXHandler and returns an
    ArchivalRouter whose only route, 'cdx', points at that handler.
    """
    handler = CDXHandler(create_cdx_server(config, DEFAULT_RULES))
    return ArchivalRouter([Route('cdx', handler)])
#=================================================================
# init pywb app
#=================================================================
# Module-level WSGI callable, created at import time by init_app from
# the app factory above and the cdx-server-only config path.
application = init_app(
    create_cdx_server_app,
    load_yaml=True,
    config_file=DEFAULT_CONFIG
)


if __name__ == "__main__":
    # executed directly as a script: serve the application
    start_wsgi_server(application)

View File

@ -63,7 +63,7 @@ class CDXObject(OrderedDict):
cdxformat = i
if not cdxformat:
raise Exception('unknown {0}-field cdx format'.format(len(fields)))
raise CDXException('unknown {0}-field cdx format'.format(len(fields)))
for header, field in itertools.izip(cdxformat, fields):
self[header] = field
@ -87,8 +87,15 @@ class CDXObject(OrderedDict):
"""
if fields is None:
return str(self) + '\n'
else:
return ' '.join(self[x] for x in fields) + '\n'
try:
result = ' '.join(self[x] for x in fields) + '\n'
except KeyError as ke:
msg = 'Invalid field "{0}" found in fields= argument'
msg = msg.format(ke.message)
raise CDXException(msg)
return result
def __str__(self):
if self.cdxline:
@ -111,7 +118,7 @@ class IDXObject(OrderedDict):
if len(fields) < self.NUM_REQ_FIELDS:
msg = 'invalid idx format: {0} fields found, {1} required'
raise Exception(msg.format(len(fields), self.NUM_REQ_FIELDS))
raise CDXException(msg.format(len(fields), self.NUM_REQ_FIELDS))
for header, field in itertools.izip(self.FORMAT, fields):
self[header] = field

View File

@ -31,9 +31,18 @@ def cdx_load(sources, query, perms_checker=None, process=True):
if perms_checker:
cdx_iter = restrict_cdx(cdx_iter, query, perms_checker)
if query.output == 'text':
cdx_iter = cdx_to_text(cdx_iter, query.fields)
return cdx_iter
#=================================================================
def cdx_to_text(cdx_iter, fields):
    """
    Lazily turn each cdx object from ``cdx_iter`` into its text form.

    Yields, in order, the result of ``to_text(fields)`` for every item.
    """
    for record in cdx_iter:
        line = record.to_text(fields)
        yield line
#=================================================================
def restrict_cdx(cdx_iter, query, perms_checker):
"""

View File

@ -1,5 +1,6 @@
from urllib import urlencode
from urlparse import parse_qs
from cdxobject import CDXException
#=================================================================
@ -62,6 +63,9 @@ class CDXQuery(object):
@property
def fields(self):
v = self.params.get('fields')
# check old param name
if not v:
v = self.params.get('fl')
return v.split(',') if v else None
@property
@ -105,9 +109,6 @@ class CDXQuery(object):
"""
params = parse_qs(env['QUERY_STRING'])
if not 'output' in params:
params['output'] = 'text'
# parse_qs produces arrays for single values
# cdx processing expects singleton params for all params,
# except filters, so convert here
@ -116,4 +117,8 @@ class CDXQuery(object):
if name != 'filter':
params[name] = val[0]
if not 'output' in params:
params['output'] = 'text'
return params

View File

@ -187,6 +187,7 @@ import pytest
def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams):
kwparams['url'] = url
kwparams['output'] = 'cdxobject'
fields = kwparams.get('fields')
if fields:
fields = fields.split(',')

View File

@ -1,15 +0,0 @@
import webtest
from pywb.cdx.wsgi_cdxserver import create_app
from pywb import get_test_dir
class TestCdx:
    """
    Smoke test for the wsgi cdx server: a query for a known url
    returns a non-empty plain-text response.
    """
    def setup(self):
        cdx_dir = get_test_dir() + 'cdx/'
        self.app = create_app(cdx_dir)
        self.testapp = webtest.TestApp(self.app)

    def test_cdx(self):
        path = '/cdx?url=http://www.iana.org/_css/2013.1/screen.css'
        resp = self.testapp.get(path)
        assert resp.content_type == 'text/plain'
        assert resp.content_length > 0

View File

@ -1,103 +0,0 @@
from werkzeug.wrappers import BaseResponse
from cdxserver import create_cdx_server
from pywb import get_test_dir
from query import CDXQuery
import logging
import os
import yaml
import pkg_resources
#=================================================================
CONFIG_FILE = 'config.yaml'
RULES_FILE = 'rules.yaml'
DEFAULT_PORT = 8080
#=================================================================
class CDXQueryRequest(object):
    """
    Thin request wrapper exposing the CDXQuery built from a WSGI
    environ as the ``query`` attribute.
    """
    def __init__(self, environ):
        parsed_query = CDXQuery.from_wsgi_env(environ)
        self.query = parsed_query
class WSGICDXServer(object):
    """
    WSGI application answering cdx queries as plain text.

    Any exception raised while loading or building the response is
    logged and reported to the client as '400 Error' with the
    exception text as the body.
    """
    def __init__(self, config, rules_file):
        self.cdxserver = create_cdx_server(config, rules_file)

    def __call__(self, environ, start_response):
        cdx_request = CDXQueryRequest(environ)
        query = cdx_request.query

        try:
            logging.debug('request.args=%s', query)
            cdx_iter = self.cdxserver.load_cdx_query(query)

            # TODO: select response type by "output" parameter
            text_response = PlainTextResponse(cdx_iter, query.fields)
            return text_response(environ, start_response)
        except Exception as err:
            logging.error('load_cdx failed', exc_info=True)
            # TODO: error response should be different for each response
            # type
            error_headers = [('Content-Type', 'text/plain')]
            start_response('400 Error', error_headers)
            return [str(err)]
def cdx_text_out(cdx, fields):
    """
    Render a single cdx record as one newline-terminated line of text.

    :param cdx: the cdx record; ``str(cdx)`` is used when no fields are
        requested, otherwise it is indexed by field name
    :param fields: sequence of field names to output, or a false value
        (None / empty) to output the full record
    :return: the text line, ending in a newline
    :raises KeyError: if ``fields`` names a field missing from ``cdx``
    """
    if not fields:
        return str(cdx) + '\n'

    # keys() must be called: passing the bound method itself only logs
    # its repr, never the actual field names
    logging.info('cdx fields=%s', list(cdx.keys()))
    # TODO: this will result in an exception if fields contain a
    # non-existent field name.
    return ' '.join(cdx[x] for x in fields) + '\n'
class PlainTextResponse(BaseResponse):
    """
    Response whose body is generated lazily by calling
    ``to_text(fields)`` on each cdx object from ``cdxitr``.
    """
    def __init__(self, cdxitr, fields, status=200, content_type='text/plain'):
        text_lines = (cdx.to_text(fields) for cdx in cdxitr)
        super(PlainTextResponse, self).__init__(response=text_lines,
                                                status=status,
                                                content_type=content_type)
# class JsonResponse(Response):
# pass
# class MementoResponse(Response):
# pass
def create_app(config=None):
    """
    Configure logging at DEBUG level and build a WSGICDXServer.

    When ``config`` is empty or omitted, a config whose ``index_paths``
    is the 'cdx/' directory under the test dir is used instead.
    """
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s: [%(levelname)s]: %(message)s')

    if not config:
        config = {'index_paths': get_test_dir() + 'cdx/'}

    return WSGICDXServer(config, RULES_FILE)
if __name__ == "__main__":
    from optparse import OptionParser
    from werkzeug.serving import run_simple

    opt = OptionParser('%prog [OPTIONS]')
    opt.add_option('-p', '--port', type='int', default=None)
    options, args = opt.parse_args()

    # config is read from the yaml file packaged next to this module
    configdata = pkg_resources.resource_string(__name__, CONFIG_FILE)
    config = yaml.load(configdata)

    # port precedence: command line, then config file, then default
    port = options.port
    if port is None:
        configured_port = config and config.get('port')
        port = configured_port or DEFAULT_PORT

    app = create_app(config)

    logging.debug('Starting CDX Server on port %s', port)
    try:
        run_simple('0.0.0.0', port, app,
                   use_reloader=True,
                   use_debugger=True)
    except KeyboardInterrupt as ex:
        # Ctrl-C simply falls through to the shutdown message
        pass
    logging.debug('Stopping CDX Server')
else:
    # XXX pass production config
    application = create_app()

View File

@ -3,28 +3,13 @@ import pkgutil
import mimetypes
import time
from pywb.rewrite.wburl import WbUrl
from pywb.cdx.query import CDXQuery
from pywb.framework.basehandlers import BaseHandler, WbUrlHandler
from pywb.framework.wbrequestresponse import WbResponse
from wbexceptions import WbException, NotFoundException
from pywb.framework.wbexceptions import WbException, NotFoundException
from views import TextCapturesView
#=================================================================
class BaseHandler(object):
def __call__(self, wbrequest):
return wbrequest
def get_wburl_type(self):
return None
#=================================================================
class WbUrlHandler(BaseHandler):
def get_wburl_type(self):
return WbUrl
#=================================================================
# Standard WB Handler
#=================================================================

View File

@ -29,6 +29,7 @@ class IndexReader(object):
params.update(wbrequest.custom_params)
params['allowFuzzy'] = True
params['output'] = 'cdxobject'
cdxlines = self.load_cdx(url=wburl.url, **params)

View File

@ -13,7 +13,12 @@ class ArchivalRouter(object):
home_view=None, error_view=None):
self.routes = routes
self.fallback = ReferRedirect(hostpaths)
if hostpaths:
self.fallback = ReferRedirect(hostpaths)
else:
self.fallback = None
self.abs_path = abs_path
self.home_view = home_view

View File

@ -0,0 +1,23 @@
from pywb.rewrite.wburl import WbUrl
#=================================================================
class BaseHandler(object):
    """
    Root handler class: a handler that accepts any request.
    """
    def __call__(self, wbrequest):
        """Default handling: hand the request back unchanged."""
        return wbrequest

    def get_wburl_type(self):
        """The base handler has no wburl type."""
        return None
#=================================================================
class WbUrlHandler(BaseHandler):
    """
    Handler for requests that are expected to carry a WbUrl.
    Reports WbUrl as its wburl type so that the WbUrl is parsed
    in the request.
    """
    def get_wburl_type(self):
        """Return the WbUrl class itself (not an instance)."""
        return WbUrl

View File

@ -85,7 +85,7 @@ False
"""
from pywb.framework.archivalrouter import Route, ReferRedirect
from pywb.core.handlers import BaseHandler, WbUrlHandler
from pywb.framework.basehandlers import BaseHandler, WbUrlHandler
import pprint
def print_req(req):

View File

@ -3,9 +3,9 @@ from loaders import load_yaml_config
#=================================================================
DEFAULT_RULES_FILE = 'pywb/rules.yaml'
#=================================================================
class RuleSet(object):
DEFAULT_KEY = ''

View File

@ -58,13 +58,13 @@ class BlockLoader(object):
afile = open(url, 'rb')
except IOError:
#if file_only:
# raise
if file_only:
raise
# then, try as package.path/file
pkg_split = url.split('/', 1)
#if len(pkg_split) == 1:
# raise
if len(pkg_split) == 1:
raise
afile = pkg_resources.resource_stream(pkg_split[0],
pkg_split[1])

View File

@ -176,6 +176,6 @@ class ResolvingLoader:
params = {'url': url,
'closest': timestamp,
'filter': 'digest:' + digest,
'output': 'raw'}
'output': 'cdxobject'}
return self.cdx_server.load_cdx(**params)

View File

@ -19,9 +19,8 @@ setup(
'pywb.cdx',
'pywb.warc',
'pywb.rewrite',
'pywb.framework'
'pywb.core',
'pywb.dispatch',
'pywb.bootstrap'
'pywb.apps'
],
package_data={

View File

@ -1,32 +1,26 @@
import os
import re
import webtest
import pytest
from urllib import urlencode
from werkzeug.test import Client
from werkzeug.wrappers import BaseResponse, Response
import yaml
from pywb.cdx.cdxobject import CDXObject
from pywb.cdx.wsgi_cdxserver import create_app
from pywb.apps.cdx_server import application
from tests.fixture import testconfig
import pytest
#================================================================
@pytest.fixture
def client(testconfig):
app = create_app(testconfig)
return Client(app, Response)
def client():
return webtest.TestApp(application)
# ================================================================
def query(client, url, **params):
#================================================================
def query(client, url, is_error=False, **params):
params['url'] = url
return client.get('/cdx?' + urlencode(params, doseq=1))
return client.get('/cdx?' + urlencode(params, doseq=1), expect_errors=is_error)
# ================================================================
#================================================================
def test_exact_url(client):
"""
basic exact match, no filters, etc.
@ -34,48 +28,54 @@ def test_exact_url(client):
resp = query(client, 'http://www.iana.org/')
assert resp.status_code == 200
print resp.data
print resp.body
#================================================================
def test_prefix_match(client):
"""
prefix match test
"""
resp = query(client, 'http://www.iana.org/', matchType='prefix')
print resp.data.splitlines()
print resp.body.splitlines()
assert resp.status_code == 200
suburls = 0
for l in resp.data.splitlines():
for l in resp.body.splitlines():
fields = l.split(' ')
if len(fields[0]) > len('org,iana)/'):
suburls += 1
assert suburls > 0
#================================================================
def test_filters(client):
"""
filter cdxes by mimetype and filename field, exact match.
"""
resp = query(client, 'http://www.iana.org/_css/2013.1/screen.css',
filter=('mimetype:warc/revisit', 'filename:dupes.warc.gz'))
assert resp.status_code == 200
assert resp.mimetype == 'text/plain'
for l in resp.data.splitlines():
assert resp.status_code == 200
assert resp.content_type == 'text/plain'
for l in resp.body.splitlines():
fields = l.split(' ')
assert fields[0] == 'org,iana)/_css/2013.1/screen.css'
assert fields[3] == 'warc/revisit'
assert fields[10] == 'dupes.warc.gz'
#================================================================
def test_limit(client):
resp = query(client, 'http://www.iana.org/_css/2013.1/screen.css',
limit='1')
assert resp.status_code == 200
assert resp.mimetype == 'text/plain'
assert resp.content_type == 'text/plain'
cdxes = resp.data.splitlines()
cdxes = resp.body.splitlines()
assert len(cdxes) == 1
fields = cdxes[0].split(' ')
assert fields[0] == 'org,iana)/_css/2013.1/screen.css'
@ -86,15 +86,17 @@ def test_limit(client):
limit='1', reverse='1')
assert resp.status_code == 200
assert resp.mimetype == 'text/plain'
assert resp.content_type == 'text/plain'
cdxes = resp.data.splitlines()
cdxes = resp.body.splitlines()
assert len(cdxes) == 1
fields = cdxes[0].split(' ')
assert fields[0] == 'org,iana)/_css/2013.1/screen.css'
assert fields[1] == '20140127171239'
assert fields[3] == 'warc/revisit'
#================================================================
def test_fields(client):
"""
retrieve subset of fields with ``fields`` parameter.
@ -104,7 +106,7 @@ def test_fields(client):
assert resp.status_code == 200
cdxes = resp.data.splitlines()
cdxes = resp.body.splitlines()
for cdx in cdxes:
fields = cdx.split(' ')
@ -113,16 +115,21 @@ def test_fields(client):
assert re.match(r'\d{14}$', fields[1])
assert re.match(r'\d{3}|-', fields[2])
#================================================================
def test_fields_undefined(client):
    """
    server shall respond with Bad Request and the name of the undefined
    field when ``fields`` parameter contains undefined name(s).
    """
    resp = query(client, 'http://www.iana.org/_css/2013.1/print.css',
                 is_error=True,
                 fields='urlkey,nosuchfield')

    # the comparison has to be asserted: as a bare expression it is
    # evaluated and discarded, so the test could never fail
    assert resp.status_code == 400
#================================================================
def test_resolveRevisits(client):
"""
with ``resolveRevisits=true``, server adds three fields pointing to
@ -132,9 +139,9 @@ def test_resolveRevisits(client):
resolveRevisits='true'
)
assert resp.status_code == 200
assert resp.mimetype == 'text/plain'
assert resp.content_type == 'text/plain'
cdxes = resp.data.splitlines()
cdxes = resp.body.splitlines()
originals = {}
for cdx in cdxes:
fields = cdx.split(' ')
@ -151,6 +158,8 @@ def test_resolveRevisits(client):
orig = originals.get(sha)
assert orig == (int(orig_size), int(orig_offset), orig_fn)
#================================================================
def test_resolveRevisits_orig_fields(client):
"""
when resolveRevisits=true, extra three fields are named
@ -162,9 +171,9 @@ def test_resolveRevisits_orig_fields(client):
fields='urlkey,orig.length,orig.offset,orig.filename'
)
assert resp.status_code == 200
assert resp.mimetype == 'text/plain'
assert resp.content_type == 'text/plain'
cdxes = resp.data.splitlines()
cdxes = resp.body.splitlines()
for cdx in cdxes:
fields = cdx.split(' ')
assert len(fields) == 4
@ -172,6 +181,8 @@ def test_resolveRevisits_orig_fields(client):
assert (orig_len == '-' and orig_offset == '-' and orig_fn == '-' or
(int(orig_len), int(orig_offset), orig_fn))
#================================================================
def test_collapseTime_resolveRevisits_reverse(client):
resp = query(client, 'http://www.iana.org/_css/2013.1/print.css',
collapseTime='11',
@ -179,11 +190,10 @@ def test_collapseTime_resolveRevisits_reverse(client):
reverse='true'
)
cdxes = [CDXObject(l) for l in resp.data.splitlines()]
cdxes = [CDXObject(l) for l in resp.body.splitlines()]
assert len(cdxes) == 3
# timestamp is in descending order
for i in range(len(cdxes) - 1):
assert cdxes[i]['timestamp'] >= cdxes[i + 1]['timestamp']