From 2d4ae62fbe45710402fbdd28c91fb5bea1a0e0ad Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 3 Mar 2014 10:35:57 -0800 Subject: [PATCH] - cdx handler refactoring: factor out CDXHandler and init to separate cdx_handler module - Make wsgi app a class, add port as an optional field in wsgi app and router. (not required to be specified) --- pywb/apps/cdx_server.py | 21 +++------------- pywb/core/cdx_handler.py | 43 ++++++++++++++++++++++++++++++++ pywb/core/handlers.py | 20 --------------- pywb/core/pywb_init.py | 6 ++++- pywb/framework/archivalrouter.py | 11 ++++++-- pywb/framework/proxy.py | 26 +++++++++++++------ pywb/framework/wbexceptions.py | 5 ++-- pywb/framework/wsgi_wrappers.py | 31 ++++++++++++----------- test_config.yaml | 3 +++ 9 files changed, 102 insertions(+), 64 deletions(-) create mode 100644 pywb/core/cdx_handler.py diff --git a/pywb/apps/cdx_server.py b/pywb/apps/cdx_server.py index 893531b7..a16df1fe 100644 --- a/pywb/apps/cdx_server.py +++ b/pywb/apps/cdx_server.py @@ -1,27 +1,14 @@ -from pywb.cdx.cdxserver import create_cdx_server - from pywb.framework.wsgi_wrappers import init_app, start_wsgi_server -from pywb.framework.archivalrouter import ArchivalRouter, Route -from pywb.core.handlers import CDXHandler +from pywb.core.cdx_handler import create_cdx_server_app -DEFAULT_RULES = 'pywb/rules.yaml' +#================================================================= +# init cdx server app +#================================================================= # cdx-server only config DEFAULT_CONFIG = 'pywb/cdx/config.yaml' -#================================================================= -# create simple cdx server under '/cdx' using config file -# TODO: support multiple collections like full wayback? 
- -def create_cdx_server_app(config): - cdx_server = create_cdx_server(config, DEFAULT_RULES) - routes = [Route('cdx', CDXHandler(cdx_server))] - return ArchivalRouter(routes) - -#================================================================= -# init pywb app -#================================================================= application = init_app(create_cdx_server_app, load_yaml=True, config_file=DEFAULT_CONFIG) diff --git a/pywb/core/cdx_handler.py b/pywb/core/cdx_handler.py new file mode 100644 index 00000000..3f5bb2a8 --- /dev/null +++ b/pywb/core/cdx_handler.py @@ -0,0 +1,43 @@ +from pywb.cdx.query import CDXQuery +from pywb.cdx.cdxserver import create_cdx_server + +from pywb.framework.archivalrouter import ArchivalRouter, Route +from pywb.framework.basehandlers import BaseHandler + +from views import TextCapturesView + + +#================================================================= +class CDXHandler(BaseHandler): + """ + Handler which passes wsgi request to cdx server and + returns a text-based cdx response + """ + def __init__(self, index_reader, view=None): + self.index_reader = index_reader + self.view = view if view else TextCapturesView() + + def __call__(self, wbrequest): + params = CDXQuery.extract_params_from_wsgi_env(wbrequest.env) + cdx_lines = self.index_reader.load_cdx(**params) + + return self.view.render_response(wbrequest, cdx_lines) + + def __str__(self): + return 'CDX Handler: ' + str(self.index_reader) + + +#================================================================= +DEFAULT_RULES = 'pywb/rules.yaml' + +#================================================================= +def create_cdx_server_app(config): + """ + Create a cdx server config to be wrapped in a wsgi app + Currently using single access point '/cdx' + TODO: more complex example with multiple collections? 
+ """ + cdx_server = create_cdx_server(config, DEFAULT_RULES) + port = config.get('port') + routes = [Route('cdx', CDXHandler(cdx_server))] + return ArchivalRouter(routes, port=port) diff --git a/pywb/core/handlers.py b/pywb/core/handlers.py index 18bd0fc9..049888df 100644 --- a/pywb/core/handlers.py +++ b/pywb/core/handlers.py @@ -1,9 +1,7 @@ -import urlparse import pkgutil import mimetypes import time -from pywb.cdx.query import CDXQuery from pywb.framework.basehandlers import BaseHandler, WbUrlHandler from pywb.framework.wbrequestresponse import WbResponse from pywb.framework.wbexceptions import WbException, NotFoundException @@ -58,24 +56,6 @@ class WBHandler(WbUrlHandler): return 'WBHandler: ' + str(self.index_reader) + ', ' + str(self.replay) -#================================================================= -# CDX-Server Handler -- pass all params to cdx server -#================================================================= -class CDXHandler(BaseHandler): - def __init__(self, index_reader, view = None): - self.index_reader = index_reader - self.view = view if view else TextCapturesView() - - def __call__(self, wbrequest): - params = CDXQuery.extract_params_from_wsgi_env(wbrequest.env) - cdx_lines = self.index_reader.load_cdx(**params) - - return self.view.render_response(wbrequest, cdx_lines) - - def __str__(self): - return 'Index Reader: ' + str(self.index_reader) - - #================================================================= # Static Content Handler #================================================================= diff --git a/pywb/core/pywb_init.py b/pywb/core/pywb_init.py index 52df9f5f..10c7b999 100644 --- a/pywb/core/pywb_init.py +++ b/pywb/core/pywb_init.py @@ -11,7 +11,8 @@ from views import J2TemplateView, J2HtmlCapturesView from replay_views import ReplayView from handlers import WBHandler -from handlers import CDXHandler, StaticHandler +from handlers import StaticHandler +from cdx_handler import CDXHandler from handlers import 
DebugEchoHandler, DebugEchoEnvHandler @@ -115,6 +116,8 @@ def create_wb_router(passed_config = {}): hostpaths = config.get('hostpaths') + port = config.get('port') + # collections based on cdx source collections = config.get('collections') @@ -169,6 +172,7 @@ def create_wb_router(passed_config = {}): # This will help catch occasionally missed rewrites that fall-through to the host # (See archivalrouter.ReferRedirect) hostpaths = hostpaths, + port = port, abs_path = config.get('absolute_paths', True), diff --git a/pywb/framework/archivalrouter.py b/pywb/framework/archivalrouter.py index 29701fa8..6c901fac 100644 --- a/pywb/framework/archivalrouter.py +++ b/pywb/framework/archivalrouter.py @@ -9,11 +9,18 @@ from wbrequestresponse import WbRequest, WbResponse # ArchivalRouter -- route WB requests in archival mode #================================================================= class ArchivalRouter(object): - def __init__(self, routes, hostpaths=None, abs_path=True, - home_view=None, error_view=None): + def __init__(self, routes, + hostpaths=None, + port=None, + abs_path=True, + home_view=None, + error_view=None): self.routes = routes + # optional port setting may be ignored by wsgi container + self.port = port + if hostpaths: self.fallback = ReferRedirect(hostpaths) else: diff --git a/pywb/framework/proxy.py b/pywb/framework/proxy.py index cbebf4ae..d27b922e 100644 --- a/pywb/framework/proxy.py +++ b/pywb/framework/proxy.py @@ -8,21 +8,31 @@ import urlparse # http proxy mode support is very simple so far: # only latest capture is available currently #================================================================= -class ProxyArchivalRouter: - def __init__(self, routes, hostpaths=None, abs_path=True, - home_view=None, error_view=None): +class ProxyArchivalRouter(ArchivalRouter): + def __init__(self, routes, + hostpaths=None, + port=None, + abs_path=True, + home_view=None, + error_view=None): + + (super(ProxyArchivalRouter, self). 
+ __init__(routes, + hostpaths=hostpaths, + port=port, + abs_path=abs_path, + home_view=home_view, + error_view=error_view)) - self.archival = ArchivalRouter(routes, hostpaths, abs_path, - home_view, error_view) self.proxy = ProxyRouter(routes[0].handler, hostpaths, error_view) - self.error_view = error_view + #self.error_view = error_view def __call__(self, env): - response = self.archival(env) + response = self.proxy(env) if response: return response - response = self.proxy(env) + response = super(ProxyArchivalRouter, self).__call__(env) if response: return response diff --git a/pywb/framework/wbexceptions.py b/pywb/framework/wbexceptions.py index e9b07ad3..6d437a4e 100644 --- a/pywb/framework/wbexceptions.py +++ b/pywb/framework/wbexceptions.py @@ -5,17 +5,18 @@ class NotFoundException(WbException): def status(self): return '404 Not Found' + # Exceptions that effect a specific capture and result in a retry class CaptureException(WbException): def status(self): return '500 Internal Server Error' + class InternalRedirect(WbException): - def __init__(self, location, status = '302 Internal Redirect'): + def __init__(self, location, status='302 Internal Redirect'): WbException.__init__(self, 'Redirecting -> ' + location) self.status = status self.httpHeaders = [('Location', location)] def status(self): return self.status - diff --git a/pywb/framework/wsgi_wrappers.py b/pywb/framework/wsgi_wrappers.py index 2811aa92..1dd433de 100644 --- a/pywb/framework/wsgi_wrappers.py +++ b/pywb/framework/wsgi_wrappers.py @@ -10,6 +10,8 @@ import importlib import logging +DEFAULT_PORT = 8080 + #================================================================= # adapted from wsgiref.request_uri, but doesn't include domain name # and allows all characters which are allowed in the path segment @@ -18,6 +20,7 @@ import logging # http://stackoverflow.com/questions/4669692/ # valid-characters-for-directory-part-of-a-url-for-short-links + def rel_request_uri(environ, include_query=1): """ 
Return the requested path, optionally including the query string @@ -40,14 +43,21 @@ def rel_request_uri(environ, include_query=1): #================================================================= -def create_wb_app(wb_router): +class WSGIApp(object): + def __init__(self, wb_router): + self.wb_router = wb_router + self.port = DEFAULT_PORT + if hasattr(wb_router, 'port'): + self.port = wb_router.port + # Top-level wsgi application - def application(env, start_response): + def __call__(self, env, start_response): if env.get('SCRIPT_NAME') or not env.get('REQUEST_URI'): env['REL_REQUEST_URI'] = rel_request_uri(env) else: env['REL_REQUEST_URI'] = env['REQUEST_URI'] + wb_router = self.wb_router response = None try: @@ -68,8 +78,6 @@ def create_wb_app(wb_router): return response(env, start_response) - return application - #================================================================= def handle_exception(env, error_view, exc, print_trace): @@ -126,13 +134,10 @@ def init_app(init_func, load_yaml=True, config_file=None): msg = '*** pywb app inited with config from "%s"!\n' logging.info(msg, init_func.__name__) - return create_wb_app(wb_router) + return WSGIApp(wb_router) #================================================================= -DEFAULT_PORT = 8080 - - def start_wsgi_server(the_app): from wsgiref.simple_server import make_server from optparse import OptionParser @@ -144,12 +149,10 @@ def start_wsgi_server(the_app): port = options.port - if port is None: - try: - config = load_default_config() - port = config.get('port', DEFAULT_PORT) - except: - port = DEFAULT_PORT + port = the_app.port + + if not port: + port = DEFAULT_PORT logging.debug('Starting CDX Server on port %s', port) diff --git a/test_config.yaml b/test_config.yaml index 20e52933..d6c75650 100644 --- a/test_config.yaml +++ b/test_config.yaml @@ -90,6 +90,9 @@ enable_http_proxy: true # enable cdx server api for querying cdx directly (experimental) enable_cdx_api: true +# test different port +port: 
9000 + # optional reporter callback func # if set, called with request and cdx object reporter: !!python/object/new:tests.fixture.PrintReporter []