1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

support cdx server queries (at /cdx in the default config)

also enable /echo_env and /echo_req debug handlers
This commit is contained in:
Ilya Kreymer 2014-02-01 00:43:24 -08:00
parent f00ac826cf
commit 86a093d164
6 changed files with 72 additions and 19 deletions

View File

@ -56,6 +56,26 @@ routes:
# this page is displayed when no search url is entered # this page is displayed when no search url is entered
search_html_template: ./ui/search.html search_html_template: ./ui/search.html
# Sample Debug Handlers (subject to change)
# Echo Request
- name: echo_req
type: echo_req
# Echo WSGI Env
- name: echo_env
type: echo_env
# CDX Server
- name: cdx
index_paths: ['./sample_archive/cdx/']
type: 'cdx'
# list of host names that pywb will be running from to detect # list of host names that pywb will be running from to detect
# 'fallthrough' requests based on referrer # 'fallthrough' requests based on referrer
# #

View File

@ -10,18 +10,17 @@ from wburl import WbUrl
# ArchivalRequestRouter -- route WB requests in archival mode # ArchivalRequestRouter -- route WB requests in archival mode
#================================================================= #=================================================================
class ArchivalRequestRouter: class ArchivalRequestRouter:
def __init__(self, routes, hostpaths = None, abs_path = True, archivalurl_class = WbUrl, homepage = None, errorpage = None): def __init__(self, routes, hostpaths = None, abs_path = True, homepage = None, errorpage = None):
self.routes = routes self.routes = routes
self.fallback = ReferRedirect(hostpaths) self.fallback = ReferRedirect(hostpaths)
self.abs_path = abs_path self.abs_path = abs_path
self.archivalurl_class = archivalurl_class
self.homepage = homepage self.homepage = homepage
self.errorpage = errorpage self.errorpage = errorpage
def __call__(self, env): def __call__(self, env):
for route in self.routes: for route in self.routes:
result = route(env, self.abs_path, self.archivalurl_class) result = route(env, self.abs_path)
if result: if result:
return result return result
@ -51,7 +50,7 @@ class ArchivalRequestRouter:
class Route: class Route:
# match up to next slash # match up to next slash
SLASH_LOOKAHEAD ='(?=/|$)' SLASH_LOOKAHEAD ='(?=/|$|\?)'
def __init__(self, regex, handler, coll_group = 0, lookahead = SLASH_LOOKAHEAD): def __init__(self, regex, handler, coll_group = 0, lookahead = SLASH_LOOKAHEAD):
@ -62,7 +61,7 @@ class Route:
self.coll_group = coll_group self.coll_group = coll_group
def __call__(self, env, use_abs_prefix, archivalurl_class): def __call__(self, env, use_abs_prefix):
request_uri = env['REL_REQUEST_URI'] request_uri = env['REL_REQUEST_URI']
matcher = self.regex.match(request_uri[1:]) matcher = self.regex.match(request_uri[1:])
if not matcher: if not matcher:
@ -85,7 +84,7 @@ class Route:
wb_url = wb_url, wb_url = wb_url,
wb_prefix = wb_prefix, wb_prefix = wb_prefix,
use_abs_prefix = use_abs_prefix, use_abs_prefix = use_abs_prefix,
archivalurl_class = archivalurl_class) wburl_class = self.handler.get_wburl_type())
# Allow for setup of additional filters # Allow for setup of additional filters

View File

@ -3,11 +3,20 @@ import utils
import urlparse import urlparse
from wbrequestresponse import WbResponse from wbrequestresponse import WbResponse
from wburl import WbUrl
from wbexceptions import WbException
class BaseHandler:
@staticmethod
def get_wburl_type():
return WbUrl
#================================================================= #=================================================================
# Standard WB Handler # Standard WB Handler
#================================================================= #=================================================================
class WBHandler: class WBHandler(BaseHandler):
def __init__(self, cdx_reader, replay, capturespage = None, searchpage = None): def __init__(self, cdx_reader, replay, capturespage = None, searchpage = None):
self.cdx_reader = cdx_reader self.cdx_reader = cdx_reader
self.replay = replay self.replay = replay
@ -44,6 +53,7 @@ class WBHandler:
return WbResponse.text_response('No Lookup Url Specified') return WbResponse.text_response('No Lookup Url Specified')
def __str__(self): def __str__(self):
return 'WBHandler: ' + str(self.cdx_reader) + ', ' + str(self.replay) return 'WBHandler: ' + str(self.cdx_reader) + ', ' + str(self.replay)
@ -52,39 +62,43 @@ class WBHandler:
#================================================================= #=================================================================
# CDX-Server Handler -- pass all params to cdx server # CDX-Server Handler -- pass all params to cdx server
#================================================================= #=================================================================
class CDXHandler: class CDXHandler(BaseHandler):
def __init__(self, cdx_reader, view = None): def __init__(self, cdx_reader, view = None):
self.cdx_reader = cdx_reader self.cdx_reader = cdx_reader
self.view = view if view else views.TextCapturesView() self.view = view if view else views.TextCapturesView()
def __call__(self, wbrequest): def __call__(self, wbrequest):
url = wbrequest.wb_url.url #url = wbrequest.wb_url.url
# use url= param to get actual url # use url= param to get actual url
params = urlparse.parse_qs(wbrequest.env['QUERY_STRING']) params = urlparse.parse_qs(wbrequest.env['QUERY_STRING'])
url = params.get('url') url = params.get('url')
if not url: if not url:
raise Exception('Must specify a url= param to query cdx server') raise WbException('Must specify a url= param to query cdx server')
url = url[0] url = url[0]
cdx_lines = self.cdx_reader.load_cdx(url, params, parsed_cdx = False) cdx_lines = self.cdx_reader.load_cdx(url, params, parsed_cdx = False)
return self.view(wbrequest, cdx_lines) return self.view.render_response(wbrequest, cdx_lines)
@staticmethod
def get_wburl_type():
return None
#================================================================= #=================================================================
# Debug Handlers # Debug Handlers
#================================================================= #=================================================================
class DebugEchoEnvHandler: class DebugEchoEnvHandler(BaseHandler):
def __call__(self, wbrequest): def __call__(self, wbrequest):
return wbrequestresponse.WbResponse.text_response(str(wbrequest.env)) return WbResponse.text_response(str(wbrequest.env))
#================================================================= #=================================================================
class DebugEchoHandler: class DebugEchoHandler(BaseHandler):
def __call__(self, wbrequest): def __call__(self, wbrequest):
return wbrequestresponse.WbResponse.text_response(str(wbrequest)) return WbResponse.text_response(str(wbrequest))

View File

@ -176,6 +176,11 @@ class RemoteCDXServer(IndexReader):
else: else:
return response return response
# Note: these params are designed to make pywb compatible with the original Java wayback-cdx-server API:
# https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server
# Soon, this will be switched over to support the native pywb cdx server
# BUG: Setting replayClosest to high number for now, as cdx server sometimes returns wrong result # BUG: Setting replayClosest to high number for now, as cdx server sometimes returns wrong result
# with lower values if there are too many captures. Ideally, should be around 10-20 # with lower values if there are too many captures. Ideally, should be around 10-20
# The replayClosest is the max number of cdx lines, so max number of retry attempts that WB will make # The replayClosest is the max number of cdx lines, so max number of retry attempts that WB will make

View File

@ -105,7 +105,11 @@ def yaml_parse_index_loader(config):
# support mixed cdx streams and remote servers? # support mixed cdx streams and remote servers?
# for now, list implies local sources # for now, list implies local sources
if isinstance(index_config, list): if isinstance(index_config, list):
return indexreader.LocalCDXServer(index_config, surt_ordered) if len(index_config) > 1:
return indexreader.LocalCDXServer(index_config, surt_ordered)
else:
# treat as non-list
index_config = index_config[0]
if isinstance(index_config, str): if isinstance(index_config, str):
uri = index_config uri = index_config
@ -151,11 +155,22 @@ def yaml_parse_calendar_view(config):
def yaml_parse_route(config): def yaml_parse_route(config):
name = config['name'] name = config['name']
type = config.get('type', 'wb')
if type == 'echo_env':
return Route(name, handlers.DebugEchoEnvHandler())
if type == 'echo_req':
return Route(name, handlers.DebugEchoHandler())
archive_loader = archiveloader.ArchiveLoader() archive_loader = archiveloader.ArchiveLoader()
index_loader = yaml_parse_index_loader(config) index_loader = yaml_parse_index_loader(config)
if type == 'cdx':
handler = handlers.CDXHandler(index_loader)
return Route(name, handler)
archive_resolvers = map(replay_resolvers.make_best_resolver, config['archive_paths']) archive_resolvers = map(replay_resolvers.make_best_resolver, config['archive_paths'])
head_insert = yaml_parse_head_insert(config) head_insert = yaml_parse_head_insert(config)

View File

@ -61,7 +61,7 @@ class WbRequest:
return rel_prefix return rel_prefix
def __init__(self, env, request_uri, wb_prefix, wb_url, coll, use_abs_prefix = False, archivalurl_class = WbUrl): def __init__(self, env, request_uri, wb_prefix, wb_url, coll, use_abs_prefix = False, wburl_class = WbUrl):
self.env = env self.env = env
self.request_uri = request_uri if request_uri else env.get('REL_REQUEST_URI') self.request_uri = request_uri if request_uri else env.get('REL_REQUEST_URI')
@ -69,9 +69,9 @@ class WbRequest:
self.wb_prefix = wb_prefix if not use_abs_prefix else WbRequest.make_abs_prefix(env, wb_prefix) self.wb_prefix = wb_prefix if not use_abs_prefix else WbRequest.make_abs_prefix(env, wb_prefix)
# wb_url present and not root page # wb_url present and not root page
if wb_url != '/' and wb_url != '' and archivalurl_class: if wb_url != '/' and wb_url != '' and wburl_class:
self.wb_url_str = wb_url self.wb_url_str = wb_url
self.wb_url = archivalurl_class(wb_url) self.wb_url = wburl_class(wb_url)
else: else:
# no wb_url, just store blank # no wb_url, just store blank
self.wb_url_str = '/' self.wb_url_str = '/'