From 86a093d1649bbfbe1285525a333921db0017b62d Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 1 Feb 2014 00:43:24 -0800 Subject: [PATCH] support cdx server query at (/cdx in default config) also enable /echo_env and /echo_req debug handlers --- config.yaml | 20 ++++++++++++++++++++ pywb/archivalrouter.py | 11 +++++------ pywb/handlers.py | 32 +++++++++++++++++++++++--------- pywb/indexreader.py | 5 +++++ pywb/pywb_init.py | 17 ++++++++++++++++- pywb/wbrequestresponse.py | 6 +++--- 6 files changed, 72 insertions(+), 19 deletions(-) diff --git a/config.yaml b/config.yaml index a2cc6842..838e6b72 100644 --- a/config.yaml +++ b/config.yaml @@ -56,6 +56,26 @@ routes: # this page is displayed when no search url is entered search_html_template: ./ui/search.html + # Sample Debug Handlers (subject to change) + # Echo Request + - name: echo_req + + type: echo_req + + # Echo WSGI Env + - name: echo_env + + type: echo_env + + # CDX Server + - name: cdx + + index_paths: ['./sample_archive/cdx/'] + + type: 'cdx' + + + # list of host names that pywb will be running from to detect # 'fallthrough' requests based on referrer # diff --git a/pywb/archivalrouter.py b/pywb/archivalrouter.py index 398d9c5c..017618f4 100644 --- a/pywb/archivalrouter.py +++ b/pywb/archivalrouter.py @@ -10,18 +10,17 @@ from wburl import WbUrl # ArchivalRequestRouter -- route WB requests in archival mode #================================================================= class ArchivalRequestRouter: - def __init__(self, routes, hostpaths = None, abs_path = True, archivalurl_class = WbUrl, homepage = None, errorpage = None): + def __init__(self, routes, hostpaths = None, abs_path = True, homepage = None, errorpage = None): self.routes = routes self.fallback = ReferRedirect(hostpaths) self.abs_path = abs_path - self.archivalurl_class = archivalurl_class self.homepage = homepage self.errorpage = errorpage def __call__(self, env): for route in self.routes: - result = route(env, self.abs_path, 
self.archivalurl_class) + result = route(env, self.abs_path) if result: return result @@ -51,7 +50,7 @@ class ArchivalRequestRouter: class Route: # match upto next slash - SLASH_LOOKAHEAD ='(?=/|$)' + SLASH_LOOKAHEAD ='(?=/|$|\?)' def __init__(self, regex, handler, coll_group = 0, lookahead = SLASH_LOOKAHEAD): @@ -62,7 +61,7 @@ class Route: self.coll_group = coll_group - def __call__(self, env, use_abs_prefix, archivalurl_class): + def __call__(self, env, use_abs_prefix): request_uri = env['REL_REQUEST_URI'] matcher = self.regex.match(request_uri[1:]) if not matcher: @@ -85,7 +84,7 @@ class Route: wb_url = wb_url, wb_prefix = wb_prefix, use_abs_prefix = use_abs_prefix, - archivalurl_class = archivalurl_class) + wburl_class = self.handler.get_wburl_type()) # Allow for setup of additional filters diff --git a/pywb/handlers.py b/pywb/handlers.py index 6c7026c4..50d3e644 100644 --- a/pywb/handlers.py +++ b/pywb/handlers.py @@ -3,11 +3,20 @@ import utils import urlparse from wbrequestresponse import WbResponse +from wburl import WbUrl +from wbexceptions import WbException + + +class BaseHandler: + @staticmethod + def get_wburl_type(): + return WbUrl + #================================================================= # Standard WB Handler #================================================================= -class WBHandler: +class WBHandler(BaseHandler): def __init__(self, cdx_reader, replay, capturespage = None, searchpage = None): self.cdx_reader = cdx_reader self.replay = replay @@ -44,6 +53,7 @@ class WBHandler: return WbResponse.text_response('No Lookup Url Specified') + def __str__(self): return 'WBHandler: ' + str(self.cdx_reader) + ', ' + str(self.replay) @@ -52,39 +62,43 @@ class WBHandler: #================================================================= # CDX-Server Handler -- pass all params to cdx server #================================================================= -class CDXHandler: +class CDXHandler(BaseHandler): def __init__(self, cdx_reader, view = 
None): self.cdx_reader = cdx_reader self.view = view if view else views.TextCapturesView() def __call__(self, wbrequest): - url = wbrequest.wb_url.url + #url = wbrequest.wb_url.url # use url= param to get actual url params = urlparse.parse_qs(wbrequest.env['QUERY_STRING']) url = params.get('url') if not url: - raise Exception('Must specify a url= param to query cdx server') + raise WbException('Must specify a url= param to query cdx server') url = url[0] cdx_lines = self.cdx_reader.load_cdx(url, params, parsed_cdx = False) - return self.view(wbrequest, cdx_lines) + return self.view.render_response(wbrequest, cdx_lines) + @staticmethod + def get_wburl_type(): + return None + #================================================================= # Debug Handlers #================================================================= -class DebugEchoEnvHandler: +class DebugEchoEnvHandler(BaseHandler): def __call__(self, wbrequest): - return wbrequestresponse.WbResponse.text_response(str(wbrequest.env)) + return WbResponse.text_response(str(wbrequest.env)) #================================================================= -class DebugEchoHandler: +class DebugEchoHandler(BaseHandler): def __call__(self, wbrequest): - return wbrequestresponse.WbResponse.text_response(str(wbrequest)) + return WbResponse.text_response(str(wbrequest)) diff --git a/pywb/indexreader.py b/pywb/indexreader.py index 1daacc82..042b49fc 100644 --- a/pywb/indexreader.py +++ b/pywb/indexreader.py @@ -176,6 +176,11 @@ class RemoteCDXServer(IndexReader): else: return response + + # Note: these params are designed to make pywb compatible with the original Java wayback-cdx-server API: + # https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server + # Soon, this will be switched over to support the native pywb cdx server + # BUG: Setting replayClosest to high number for now, as cdx server sometimes returns wrong result # with lower values if there are too many captures. 
Ideally, should be around 10-20 # The replayClosest is the max number of cdx lines, so max number of retry attempts that WB will make diff --git a/pywb/pywb_init.py b/pywb/pywb_init.py index ea21668e..eebf4a1c 100644 --- a/pywb/pywb_init.py +++ b/pywb/pywb_init.py @@ -105,7 +105,11 @@ def yaml_parse_index_loader(config): # support mixed cdx streams and remote servers? # for now, list implies local sources if isinstance(index_config, list): - return indexreader.LocalCDXServer(index_config, surt_ordered) + if len(index_config) > 1: + return indexreader.LocalCDXServer(index_config, surt_ordered) + else: + # treat as non-list + index_config = index_config[0] if isinstance(index_config, str): uri = index_config @@ -151,11 +155,22 @@ def yaml_parse_calendar_view(config): def yaml_parse_route(config): name = config['name'] + type = config.get('type', 'wb') + + if type == 'echo_env': + return Route(name, handlers.DebugEchoEnvHandler()) + + if type == 'echo_req': + return Route(name, handlers.DebugEchoHandler()) archive_loader = archiveloader.ArchiveLoader() index_loader = yaml_parse_index_loader(config) + if type == 'cdx': + handler = handlers.CDXHandler(index_loader) + return Route(name, handler) + archive_resolvers = map(replay_resolvers.make_best_resolver, config['archive_paths']) head_insert = yaml_parse_head_insert(config) diff --git a/pywb/wbrequestresponse.py b/pywb/wbrequestresponse.py index a09db184..43f46bfa 100644 --- a/pywb/wbrequestresponse.py +++ b/pywb/wbrequestresponse.py @@ -61,7 +61,7 @@ class WbRequest: return rel_prefix - def __init__(self, env, request_uri, wb_prefix, wb_url, coll, use_abs_prefix = False, archivalurl_class = WbUrl): + def __init__(self, env, request_uri, wb_prefix, wb_url, coll, use_abs_prefix = False, wburl_class = WbUrl): self.env = env self.request_uri = request_uri if request_uri else env.get('REL_REQUEST_URI') @@ -69,9 +69,9 @@ class WbRequest: self.wb_prefix = wb_prefix if not use_abs_prefix else WbRequest.make_abs_prefix(env, 
wb_prefix) # wb_url present and not root page - if wb_url != '/' and wb_url != '' and archivalurl_class: + if wb_url != '/' and wb_url != '' and wburl_class: self.wb_url_str = wb_url - self.wb_url = archivalurl_class(wb_url) + self.wb_url = wburl_class(wb_url) else: # no wb_url, just store blank self.wb_url_str = '/'