1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

- cdx handler refactoring: factor out CDXHandler and init to

separate cdx_handler module
- Make wsgi app a class, add port as an optional field in wsgi app
and router. (not required to be specified)
This commit is contained in:
Ilya Kreymer 2014-03-03 10:35:57 -08:00
parent 0bf651c2e3
commit 2d4ae62fbe
9 changed files with 102 additions and 64 deletions

View File

@ -1,27 +1,14 @@
from pywb.cdx.cdxserver import create_cdx_server
from pywb.framework.wsgi_wrappers import init_app, start_wsgi_server
from pywb.framework.archivalrouter import ArchivalRouter, Route
from pywb.core.handlers import CDXHandler
from pywb.core.cdx_handler import create_cdx_server_app
DEFAULT_RULES = 'pywb/rules.yaml'
#=================================================================
# init cdx server app
#=================================================================
# cdx-server only config
DEFAULT_CONFIG = 'pywb/cdx/config.yaml'
#=================================================================
# create simple cdx server under '/cdx' using config file
# TODO: support multiple collections like full wayback?
def create_cdx_server_app(config):
cdx_server = create_cdx_server(config, DEFAULT_RULES)
routes = [Route('cdx', CDXHandler(cdx_server))]
return ArchivalRouter(routes)
#=================================================================
# init pywb app
#=================================================================
application = init_app(create_cdx_server_app,
load_yaml=True,
config_file=DEFAULT_CONFIG)

43
pywb/core/cdx_handler.py Normal file
View File

@ -0,0 +1,43 @@
from pywb.cdx.query import CDXQuery
from pywb.cdx.cdxserver import create_cdx_server
from pywb.framework.archivalrouter import ArchivalRouter, Route
from pywb.framework.basehandlers import BaseHandler
from views import TextCapturesView
#=================================================================
class CDXHandler(BaseHandler):
    """
    Handler which forwards a wsgi request to a cdx server and
    renders the resulting cdx lines through a view.

    The query params are extracted from the request's wsgi environ
    and passed to ``index_reader.load_cdx()``. When no view is
    supplied, a TextCapturesView is used for rendering.
    """

    def __init__(self, index_reader, view=None):
        self.index_reader = index_reader
        # any falsy view falls back to the default text view
        self.view = view or TextCapturesView()

    def __call__(self, wbrequest):
        query_params = CDXQuery.extract_params_from_wsgi_env(wbrequest.env)
        captures = self.index_reader.load_cdx(**query_params)

        return self.view.render_response(wbrequest, captures)

    def __str__(self):
        reader_desc = str(self.index_reader)
        return 'CDX Handler: ' + reader_desc
#=================================================================
# rules file passed to create_cdx_server() when building the cdx server app
DEFAULT_RULES = 'pywb/rules.yaml'
#=================================================================
def create_cdx_server_app(config):
    """
    Build the router for a cdx server, to be wrapped in a wsgi app.

    A cdx server is created from ``config`` and DEFAULT_RULES and
    exposed through a single access point, '/cdx'. The optional
    'port' entry of ``config`` is forwarded to the router.

    TODO: more complex example with multiple collections?
    """
    server = create_cdx_server(config, DEFAULT_RULES)
    router_port = config.get('port')

    cdx_route = Route('cdx', CDXHandler(server))

    return ArchivalRouter([cdx_route], port=router_port)

View File

@ -1,9 +1,7 @@
import urlparse
import pkgutil
import mimetypes
import time
from pywb.cdx.query import CDXQuery
from pywb.framework.basehandlers import BaseHandler, WbUrlHandler
from pywb.framework.wbrequestresponse import WbResponse
from pywb.framework.wbexceptions import WbException, NotFoundException
@ -58,24 +56,6 @@ class WBHandler(WbUrlHandler):
return 'WBHandler: ' + str(self.index_reader) + ', ' + str(self.replay)
#=================================================================
# CDX-Server Handler -- pass all params to cdx server
#=================================================================
class CDXHandler(BaseHandler):
def __init__(self, index_reader, view = None):
self.index_reader = index_reader
self.view = view if view else TextCapturesView()
def __call__(self, wbrequest):
params = CDXQuery.extract_params_from_wsgi_env(wbrequest.env)
cdx_lines = self.index_reader.load_cdx(**params)
return self.view.render_response(wbrequest, cdx_lines)
def __str__(self):
return 'Index Reader: ' + str(self.index_reader)
#=================================================================
# Static Content Handler
#=================================================================

View File

@ -11,7 +11,8 @@ from views import J2TemplateView, J2HtmlCapturesView
from replay_views import ReplayView
from handlers import WBHandler
from handlers import CDXHandler, StaticHandler
from handlers import StaticHandler
from cdx_handler import CDXHandler
from handlers import DebugEchoHandler, DebugEchoEnvHandler
@ -115,6 +116,8 @@ def create_wb_router(passed_config = {}):
hostpaths = config.get('hostpaths')
port = config.get('port')
# collections based on cdx source
collections = config.get('collections')
@ -169,6 +172,7 @@ def create_wb_router(passed_config = {}):
# This will help catch occasionally missed rewrites that fall-through to the host
# (See archivalrouter.ReferRedirect)
hostpaths = hostpaths,
port = port,
abs_path = config.get('absolute_paths', True),

View File

@ -9,11 +9,18 @@ from wbrequestresponse import WbRequest, WbResponse
# ArchivalRouter -- route WB requests in archival mode
#=================================================================
class ArchivalRouter(object):
def __init__(self, routes, hostpaths=None, abs_path=True,
home_view=None, error_view=None):
def __init__(self, routes,
hostpaths=None,
port=None,
abs_path=True,
home_view=None,
error_view=None):
self.routes = routes
# optional port setting may be ignored by wsgi container
self.port = port
if hostpaths:
self.fallback = ReferRedirect(hostpaths)
else:

View File

@ -8,21 +8,31 @@ import urlparse
# http proxy mode support is very simple so far:
# only latest capture is available currently
#=================================================================
class ProxyArchivalRouter:
def __init__(self, routes, hostpaths=None, abs_path=True,
home_view=None, error_view=None):
class ProxyArchivalRouter(ArchivalRouter):
def __init__(self, routes,
hostpaths=None,
port=None,
abs_path=True,
home_view=None,
error_view=None):
(super(ProxyArchivalRouter, self).
__init__(routes,
hostpaths=hostpaths,
port=port,
abs_path=abs_path,
home_view=home_view,
error_view=error_view))
self.archival = ArchivalRouter(routes, hostpaths, abs_path,
home_view, error_view)
self.proxy = ProxyRouter(routes[0].handler, hostpaths, error_view)
self.error_view = error_view
#self.error_view = error_view
def __call__(self, env):
response = self.archival(env)
response = self.proxy(env)
if response:
return response
response = self.proxy(env)
response = super(ProxyArchivalRouter, self).__call__(env)
if response:
return response

View File

@ -5,17 +5,18 @@ class NotFoundException(WbException):
def status(self):
return '404 Not Found'
# Exceptions that affect a specific capture and result in a retry
class CaptureException(WbException):
def status(self):
return '500 Internal Server Error'
class InternalRedirect(WbException):
def __init__(self, location, status = '302 Internal Redirect'):
def __init__(self, location, status='302 Internal Redirect'):
WbException.__init__(self, 'Redirecting -> ' + location)
self.status = status
self.httpHeaders = [('Location', location)]
def status(self):
return self.status

View File

@ -10,6 +10,8 @@ import importlib
import logging
DEFAULT_PORT = 8080
#=================================================================
# adapted from wsgiref.request_uri, but doesn't include domain name
# and allows all characters which are allowed in the path segment
@ -18,6 +20,7 @@ import logging
# http://stackoverflow.com/questions/4669692/
# valid-characters-for-directory-part-of-a-url-for-short-links
def rel_request_uri(environ, include_query=1):
"""
Return the requested path, optionally including the query string
@ -40,14 +43,21 @@ def rel_request_uri(environ, include_query=1):
#=================================================================
def create_wb_app(wb_router):
class WSGIApp(object):
def __init__(self, wb_router):
self.wb_router = wb_router
self.port = DEFAULT_PORT
if hasattr(wb_router, 'port'):
self.port = wb_router.port
# Top-level wsgi application
def application(env, start_response):
def __call__(self, env, start_response):
if env.get('SCRIPT_NAME') or not env.get('REQUEST_URI'):
env['REL_REQUEST_URI'] = rel_request_uri(env)
else:
env['REL_REQUEST_URI'] = env['REQUEST_URI']
wb_router = self.wb_router
response = None
try:
@ -68,8 +78,6 @@ def create_wb_app(wb_router):
return response(env, start_response)
return application
#=================================================================
def handle_exception(env, error_view, exc, print_trace):
@ -126,13 +134,10 @@ def init_app(init_func, load_yaml=True, config_file=None):
msg = '*** pywb app inited with config from "%s"!\n'
logging.info(msg, init_func.__name__)
return create_wb_app(wb_router)
return WSGIApp(wb_router)
#=================================================================
DEFAULT_PORT = 8080
def start_wsgi_server(the_app):
from wsgiref.simple_server import make_server
from optparse import OptionParser
@ -144,12 +149,10 @@ def start_wsgi_server(the_app):
port = options.port
if port is None:
try:
config = load_default_config()
port = config.get('port', DEFAULT_PORT)
except:
port = DEFAULT_PORT
port = the_app.port
if not port:
port = DEFAULT_PORT
logging.debug('Starting CDX Server on port %s', port)

View File

@ -90,6 +90,9 @@ enable_http_proxy: true
# enable cdx server api for querying cdx directly (experimental)
enable_cdx_api: true
# test different port
port: 9000
# optional reporter callback func
# if set, called with request and cdx object
reporter: !!python/object/new:tests.fixture.PrintReporter []