diff --git a/pywb/apps/cdx_server.py b/pywb/apps/cdx_server.py new file mode 100644 index 00000000..893531b7 --- /dev/null +++ b/pywb/apps/cdx_server.py @@ -0,0 +1,30 @@ +from pywb.cdx.cdxserver import create_cdx_server + +from pywb.framework.wsgi_wrappers import init_app, start_wsgi_server +from pywb.framework.archivalrouter import ArchivalRouter, Route + +from pywb.core.handlers import CDXHandler + +DEFAULT_RULES = 'pywb/rules.yaml' + +# cdx-server only config +DEFAULT_CONFIG = 'pywb/cdx/config.yaml' + +#================================================================= +# create simple cdx server under '/cdx' using config file +# TODO: support multiple collections like full wayback? + +def create_cdx_server_app(config): + cdx_server = create_cdx_server(config, DEFAULT_RULES) + routes = [Route('cdx', CDXHandler(cdx_server))] + return ArchivalRouter(routes) + +#================================================================= +# init pywb app +#================================================================= +application = init_app(create_cdx_server_app, + load_yaml=True, + config_file=DEFAULT_CONFIG) + +if __name__ == "__main__": + start_wsgi_server(application) diff --git a/pywb/cdx/cdxobject.py b/pywb/cdx/cdxobject.py index 49cd74c5..9ea4a92e 100644 --- a/pywb/cdx/cdxobject.py +++ b/pywb/cdx/cdxobject.py @@ -63,7 +63,7 @@ class CDXObject(OrderedDict): cdxformat = i if not cdxformat: - raise Exception('unknown {0}-field cdx format'.format(len(fields))) + raise CDXException('unknown {0}-field cdx format'.format(len(fields))) for header, field in itertools.izip(cdxformat, fields): self[header] = field @@ -87,8 +87,15 @@ class CDXObject(OrderedDict): """ if fields is None: return str(self) + '\n' - else: - return ' '.join(self[x] for x in fields) + '\n' + + try: + result = ' '.join(self[x] for x in fields) + '\n' + except KeyError as ke: + msg = 'Invalid field "{0}" found in fields= argument' + msg = msg.format(ke.message) + raise CDXException(msg) + + return 
result def __str__(self): if self.cdxline: @@ -111,7 +118,7 @@ class IDXObject(OrderedDict): if len(fields) < self.NUM_REQ_FIELDS: msg = 'invalid idx format: {0} fields found, {1} required' - raise Exception(msg.format(len(fields), self.NUM_REQ_FIELDS)) + raise CDXException(msg.format(len(fields), self.NUM_REQ_FIELDS)) for header, field in itertools.izip(self.FORMAT, fields): self[header] = field diff --git a/pywb/cdx/cdxops.py b/pywb/cdx/cdxops.py index e3a1a13b..6963b28c 100644 --- a/pywb/cdx/cdxops.py +++ b/pywb/cdx/cdxops.py @@ -31,9 +31,18 @@ def cdx_load(sources, query, perms_checker=None, process=True): if perms_checker: cdx_iter = restrict_cdx(cdx_iter, query, perms_checker) + if query.output == 'text': + cdx_iter = cdx_to_text(cdx_iter, query.fields) + return cdx_iter +#================================================================= +def cdx_to_text(cdx_iter, fields): + for cdx in cdx_iter: + yield cdx.to_text(fields) + + #================================================================= def restrict_cdx(cdx_iter, query, perms_checker): """ diff --git a/pywb/cdx/query.py b/pywb/cdx/query.py index dc480836..6449223a 100644 --- a/pywb/cdx/query.py +++ b/pywb/cdx/query.py @@ -1,5 +1,6 @@ from urllib import urlencode from urlparse import parse_qs +from cdxobject import CDXException #================================================================= @@ -62,6 +63,9 @@ class CDXQuery(object): @property def fields(self): v = self.params.get('fields') + # check old param name + if not v: + v = self.params.get('fl') return v.split(',') if v else None @property @@ -105,9 +109,6 @@ class CDXQuery(object): """ params = parse_qs(env['QUERY_STRING']) - if not 'output' in params: - params['output'] = 'text' - # parse_qs produces arrays for single values # cdx processing expects singleton params for all params, # except filters, so convert here @@ -116,4 +117,8 @@ class CDXQuery(object): if name != 'filter': params[name] = val[0] + if not 'output' in params: + 
params['output'] = 'text' + + return params diff --git a/pywb/cdx/test/cdxserver_test.py b/pywb/cdx/test/cdxserver_test.py index e261ead4..f0a3398d 100644 --- a/pywb/cdx/test/cdxserver_test.py +++ b/pywb/cdx/test/cdxserver_test.py @@ -187,6 +187,7 @@ import pytest def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams): kwparams['url'] = url + kwparams['output'] = 'cdxobject' fields = kwparams.get('fields') if fields: fields = fields.split(',') diff --git a/pywb/cdx/test/wsgi_cdxserver_test.py b/pywb/cdx/test/wsgi_cdxserver_test.py deleted file mode 100644 index a7d1ecdb..00000000 --- a/pywb/cdx/test/wsgi_cdxserver_test.py +++ /dev/null @@ -1,15 +0,0 @@ -import webtest -from pywb.cdx.wsgi_cdxserver import create_app -from pywb import get_test_dir - -class TestCdx: - def setup(self): - self.app = create_app(get_test_dir() + 'cdx/') - self.testapp = webtest.TestApp(self.app) - - def test_cdx(self): - resp = self.testapp.get('/cdx?url=http://www.iana.org/_css/2013.1/screen.css') - assert resp.content_type == 'text/plain' - assert resp.content_length > 0 - - diff --git a/pywb/cdx/wsgi_cdxserver.py b/pywb/cdx/wsgi_cdxserver.py deleted file mode 100644 index c9fe11d7..00000000 --- a/pywb/cdx/wsgi_cdxserver.py +++ /dev/null @@ -1,103 +0,0 @@ -from werkzeug.wrappers import BaseResponse -from cdxserver import create_cdx_server -from pywb import get_test_dir -from query import CDXQuery - -import logging -import os -import yaml -import pkg_resources - -#================================================================= -CONFIG_FILE = 'config.yaml' - -RULES_FILE = 'rules.yaml' - -DEFAULT_PORT = 8080 - -#================================================================= - -class CDXQueryRequest(object): - def __init__(self, environ): - self.query = CDXQuery.from_wsgi_env(environ) - - -class WSGICDXServer(object): - def __init__(self, config, rules_file): - self.cdxserver = create_cdx_server(config, rules_file) - - def __call__(self, environ, start_response): - 
request = CDXQueryRequest(environ) - try: - logging.debug('request.args=%s', request.query) - result = self.cdxserver.load_cdx_query(request.query) - - # TODO: select response type by "output" parameter - response = PlainTextResponse(result, request.query.fields) - return response(environ, start_response) - except Exception as exc: - logging.error('load_cdx failed', exc_info=1) - # TODO: error response should be different for each response - # type - start_response('400 Error', [('Content-Type', 'text/plain')]) - return [str(exc)] - -def cdx_text_out(cdx, fields): - if not fields: - return str(cdx) + '\n' - else: - logging.info('cdx fields=%s', cdx.keys) - # TODO: this will results in an exception if fields contain - # non-existent field name. - return ' '.join(cdx[x] for x in fields) + '\n' - -class PlainTextResponse(BaseResponse): - def __init__(self, cdxitr, fields, status=200, content_type='text/plain'): - super(PlainTextResponse, self).__init__( - response=( - cdx.to_text(fields) for cdx in cdxitr - ), - status=status, content_type=content_type) - -# class JsonResponse(Response): -# pass -# class MementoResponse(Response): -# pass - -def create_app(config=None): - logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s', - level=logging.DEBUG) - - if not config: - index_paths = get_test_dir() + 'cdx/' - config = dict(index_paths=index_paths) - - return WSGICDXServer(config, RULES_FILE) - -if __name__ == "__main__": - from optparse import OptionParser - from werkzeug.serving import run_simple - - opt = OptionParser('%prog [OPTIONS]') - opt.add_option('-p', '--port', type='int', default=None) - - options, args = opt.parse_args() - - configdata = pkg_resources.resource_string(__name__, CONFIG_FILE) - config = yaml.load(configdata) - - port = options.port - if port is None: - port = (config and config.get('port')) or DEFAULT_PORT - - app = create_app(config) - - logging.debug('Starting CDX Server on port %s', port) - try: - run_simple('0.0.0.0', 
port, app, use_reloader=True, use_debugger=True) - except KeyboardInterrupt as ex: - pass - logging.debug('Stopping CDX Server') -else: - # XXX pass production config - application = create_app() diff --git a/pywb/core/handlers.py b/pywb/core/handlers.py index 1984a4df..18bd0fc9 100644 --- a/pywb/core/handlers.py +++ b/pywb/core/handlers.py @@ -3,28 +3,13 @@ import pkgutil import mimetypes import time -from pywb.rewrite.wburl import WbUrl from pywb.cdx.query import CDXQuery +from pywb.framework.basehandlers import BaseHandler, WbUrlHandler from pywb.framework.wbrequestresponse import WbResponse -from wbexceptions import WbException, NotFoundException +from pywb.framework.wbexceptions import WbException, NotFoundException from views import TextCapturesView -#================================================================= -class BaseHandler(object): - def __call__(self, wbrequest): - return wbrequest - - def get_wburl_type(self): - return None - - -#================================================================= -class WbUrlHandler(BaseHandler): - def get_wburl_type(self): - return WbUrl - - #================================================================= # Standard WB Handler #================================================================= diff --git a/pywb/core/indexreader.py b/pywb/core/indexreader.py index a422d0b4..b77f8590 100644 --- a/pywb/core/indexreader.py +++ b/pywb/core/indexreader.py @@ -29,6 +29,7 @@ class IndexReader(object): params.update(wbrequest.custom_params) params['allowFuzzy'] = True + params['output'] = 'cdxobject' cdxlines = self.load_cdx(url=wburl.url, **params) diff --git a/pywb/framework/archivalrouter.py b/pywb/framework/archivalrouter.py index 2ae3bb5f..29701fa8 100644 --- a/pywb/framework/archivalrouter.py +++ b/pywb/framework/archivalrouter.py @@ -13,7 +13,12 @@ class ArchivalRouter(object): home_view=None, error_view=None): self.routes = routes - self.fallback = ReferRedirect(hostpaths) + + if hostpaths: + self.fallback = 
ReferRedirect(hostpaths) + else: + self.fallback = None + self.abs_path = abs_path self.home_view = home_view diff --git a/pywb/framework/basehandlers.py b/pywb/framework/basehandlers.py new file mode 100644 index 00000000..8ae4d662 --- /dev/null +++ b/pywb/framework/basehandlers.py @@ -0,0 +1,23 @@ +from pywb.rewrite.wburl import WbUrl + + +#================================================================= +class BaseHandler(object): + """ + Represents a base handler class that handles any request + """ + def __call__(self, wbrequest): + return wbrequest + + def get_wburl_type(self): + return None + + +#================================================================= +class WbUrlHandler(BaseHandler): + """ + Represents a handler which assumes the request contains a WbUrl + Ensure that the WbUrl is parsed in the request + """ + def get_wburl_type(self): + return WbUrl diff --git a/pywb/framework/test/test_archivalrouter.py b/pywb/framework/test/test_archivalrouter.py index 86df528a..706027ba 100644 --- a/pywb/framework/test/test_archivalrouter.py +++ b/pywb/framework/test/test_archivalrouter.py @@ -85,7 +85,7 @@ False """ from pywb.framework.archivalrouter import Route, ReferRedirect -from pywb.core.handlers import BaseHandler, WbUrlHandler +from pywb.framework.basehandlers import BaseHandler, WbUrlHandler import pprint def print_req(req): diff --git a/pywb/utils/dsrules.py b/pywb/utils/dsrules.py index bfbb5a1a..672ce738 100644 --- a/pywb/utils/dsrules.py +++ b/pywb/utils/dsrules.py @@ -3,9 +3,9 @@ from loaders import load_yaml_config #================================================================= - DEFAULT_RULES_FILE = 'pywb/rules.yaml' + #================================================================= class RuleSet(object): DEFAULT_KEY = '' diff --git a/pywb/utils/loaders.py b/pywb/utils/loaders.py index 0f925105..d2ca827f 100644 --- a/pywb/utils/loaders.py +++ b/pywb/utils/loaders.py @@ -58,13 +58,13 @@ class BlockLoader(object): afile = open(url, 'rb') 
except IOError: - #if file_only: - # raise + if file_only: + raise # then, try as package.path/file pkg_split = url.split('/', 1) - #if len(pkg_split) == 1: - # raise + if len(pkg_split) == 1: + raise afile = pkg_resources.resource_stream(pkg_split[0], pkg_split[1]) diff --git a/pywb/warc/resolvingloader.py b/pywb/warc/resolvingloader.py index 041024e7..6a44739d 100644 --- a/pywb/warc/resolvingloader.py +++ b/pywb/warc/resolvingloader.py @@ -176,6 +176,6 @@ class ResolvingLoader: params = {'url': url, 'closest': timestamp, 'filter': 'digest:' + digest, - 'output': 'raw'} + 'output': 'cdxobject'} return self.cdx_server.load_cdx(**params) diff --git a/setup.py b/setup.py index 889fe2a8..54f136b4 100755 --- a/setup.py +++ b/setup.py @@ -19,9 +19,8 @@ setup( 'pywb.cdx', 'pywb.warc', 'pywb.rewrite', + 'pywb.framework', 'pywb.core', - 'pywb.dispatch', - 'pywb.bootstrap' 'pywb.apps' ], package_data={ diff --git a/tests/test_wsgi_cdxserver.py b/tests/test_cdx_server_app.py similarity index 73% rename from tests/test_wsgi_cdxserver.py rename to tests/test_cdx_server_app.py index 8eee2484..613273b5 100644 --- a/tests/test_wsgi_cdxserver.py +++ b/tests/test_cdx_server_app.py @@ -1,32 +1,26 @@ -import os import re +import webtest -import pytest from urllib import urlencode -from werkzeug.test import Client -from werkzeug.wrappers import BaseResponse, Response - -import yaml - from pywb.cdx.cdxobject import CDXObject -from pywb.cdx.wsgi_cdxserver import create_app +from pywb.apps.cdx_server import application -from tests.fixture import testconfig +import pytest +#================================================================ @pytest.fixture -def client(testconfig): - app = create_app(testconfig) - return Client(app, Response) +def client(): + return webtest.TestApp(application) -# ================================================================ -def query(client, url, **params): +#================================================================ +def query(client, url, 
is_error=False, **params): params['url'] = url - return client.get('/cdx?' + urlencode(params, doseq=1)) + return client.get('/cdx?' + urlencode(params, doseq=1), expect_errors=is_error) -# ================================================================ +#================================================================ def test_exact_url(client): """ basic exact match, no filters, etc. @@ -34,48 +28,54 @@ def test_exact_url(client): resp = query(client, 'http://www.iana.org/') assert resp.status_code == 200 - print resp.data + print resp.body + +#================================================================ def test_prefix_match(client): """ prefix match test """ resp = query(client, 'http://www.iana.org/', matchType='prefix') - print resp.data.splitlines() + print resp.body.splitlines() assert resp.status_code == 200 suburls = 0 - for l in resp.data.splitlines(): + for l in resp.body.splitlines(): fields = l.split(' ') if len(fields[0]) > len('org,iana)/'): suburls += 1 assert suburls > 0 - + + +#================================================================ def test_filters(client): """ filter cdxes by mimetype and filename field, exact match. 
""" resp = query(client, 'http://www.iana.org/_css/2013.1/screen.css', filter=('mimetype:warc/revisit', 'filename:dupes.warc.gz')) - - assert resp.status_code == 200 - assert resp.mimetype == 'text/plain' - for l in resp.data.splitlines(): + assert resp.status_code == 200 + assert resp.content_type == 'text/plain' + + for l in resp.body.splitlines(): fields = l.split(' ') assert fields[0] == 'org,iana)/_css/2013.1/screen.css' assert fields[3] == 'warc/revisit' assert fields[10] == 'dupes.warc.gz' + +#================================================================ def test_limit(client): resp = query(client, 'http://www.iana.org/_css/2013.1/screen.css', limit='1') assert resp.status_code == 200 - assert resp.mimetype == 'text/plain' + assert resp.content_type == 'text/plain' - cdxes = resp.data.splitlines() + cdxes = resp.body.splitlines() assert len(cdxes) == 1 fields = cdxes[0].split(' ') assert fields[0] == 'org,iana)/_css/2013.1/screen.css' @@ -86,15 +86,17 @@ def test_limit(client): limit='1', reverse='1') assert resp.status_code == 200 - assert resp.mimetype == 'text/plain' + assert resp.content_type == 'text/plain' - cdxes = resp.data.splitlines() + cdxes = resp.body.splitlines() assert len(cdxes) == 1 fields = cdxes[0].split(' ') assert fields[0] == 'org,iana)/_css/2013.1/screen.css' assert fields[1] == '20140127171239' assert fields[3] == 'warc/revisit' + +#================================================================ def test_fields(client): """ retrieve subset of fields with ``fields`` parameter. 
@@ -104,7 +106,7 @@ def test_fields(client): assert resp.status_code == 200 - cdxes = resp.data.splitlines() + cdxes = resp.body.splitlines() for cdx in cdxes: fields = cdx.split(' ') @@ -113,16 +115,21 @@ def test_fields(client): assert re.match(r'\d{14}$', fields[1]) assert re.match(r'\d{3}|-', fields[2]) + +#================================================================ def test_fields_undefined(client): """ - server shall respond with Bad Request (TODO: with proper explanation), + server shall respond with Bad Request and the name of the undefined field when ``fields`` parameter contains undefined name(s). """ resp = query(client, 'http://www.iana.org/_css/2013.1/print.css', + is_error=True, fields='urlkey,nosuchfield') - resp.status_code == 400 + assert resp.status_code == 400 - + + +#================================================================ def test_resolveRevisits(client): """ with ``resolveRevisits=true``, server adds three fields pointing to @@ -132,9 +139,9 @@ def test_resolveRevisits(client): resolveRevisits='true' ) assert resp.status_code == 200 - assert resp.mimetype == 'text/plain' + assert resp.content_type == 'text/plain' - cdxes = resp.data.splitlines() + cdxes = resp.body.splitlines() originals = {} for cdx in cdxes: fields = cdx.split(' ') @@ -151,6 +158,8 @@ def test_resolveRevisits(client): orig = originals.get(sha) assert orig == (int(orig_size), int(orig_offset), orig_fn) + +#================================================================ def test_resolveRevisits_orig_fields(client): """ when resolveRevisits=true, extra three fields are named @@ -162,9 +171,9 @@ def test_resolveRevisits_orig_fields(client): fields='urlkey,orig.length,orig.offset,orig.filename' ) assert resp.status_code == 200 - assert resp.mimetype == 'text/plain' + assert resp.content_type == 'text/plain' - cdxes = resp.data.splitlines() + cdxes = resp.body.splitlines() for cdx in cdxes: fields = cdx.split(' ') assert len(fields) == 4 @@ -172,6 +181,8 @@ def test_resolveRevisits_orig_fields(client): assert 
(orig_len == '-' and orig_offset == '-' and orig_fn == '-' or (int(orig_len), int(orig_offset), orig_fn)) + +#================================================================ def test_collapseTime_resolveRevisits_reverse(client): resp = query(client, 'http://www.iana.org/_css/2013.1/print.css', collapseTime='11', @@ -179,11 +190,10 @@ def test_collapseTime_resolveRevisits_reverse(client): reverse='true' ) - cdxes = [CDXObject(l) for l in resp.data.splitlines()] - + cdxes = [CDXObject(l) for l in resp.body.splitlines()] + assert len(cdxes) == 3 # timestamp is in descending order for i in range(len(cdxes) - 1): assert cdxes[i]['timestamp'] >= cdxes[i + 1]['timestamp'] -