mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
support cdx server queries (at /cdx in the default config)
also enable /echo_env and /echo_req debug handlers
This commit is contained in:
parent
f00ac826cf
commit
86a093d164
20
config.yaml
20
config.yaml
@ -56,6 +56,26 @@ routes:
|
||||
# this page is displayed when no search url is entered
|
||||
search_html_template: ./ui/search.html
|
||||
|
||||
# Sample Debug Handlers (subject to change)
|
||||
# Echo Request
|
||||
- name: echo_req
|
||||
|
||||
type: echo_req
|
||||
|
||||
# Echo WSGI Env
|
||||
- name: echo_env
|
||||
|
||||
type: echo_env
|
||||
|
||||
# CDX Server
|
||||
- name: cdx
|
||||
|
||||
index_paths: ['./sample_archive/cdx/']
|
||||
|
||||
type: 'cdx'
|
||||
|
||||
|
||||
|
||||
# list of host names that pywb will be running from to detect
|
||||
# 'fallthrough' requests based on referrer
|
||||
#
|
||||
|
@ -10,18 +10,17 @@ from wburl import WbUrl
|
||||
# ArchivalRequestRouter -- route WB requests in archival mode
|
||||
#=================================================================
|
||||
class ArchivalRequestRouter:
|
||||
def __init__(self, routes, hostpaths = None, abs_path = True, archivalurl_class = WbUrl, homepage = None, errorpage = None):
|
||||
def __init__(self, routes, hostpaths = None, abs_path = True, homepage = None, errorpage = None):
|
||||
self.routes = routes
|
||||
self.fallback = ReferRedirect(hostpaths)
|
||||
self.abs_path = abs_path
|
||||
self.archivalurl_class = archivalurl_class
|
||||
|
||||
self.homepage = homepage
|
||||
self.errorpage = errorpage
|
||||
|
||||
def __call__(self, env):
|
||||
for route in self.routes:
|
||||
result = route(env, self.abs_path, self.archivalurl_class)
|
||||
result = route(env, self.abs_path)
|
||||
if result:
|
||||
return result
|
||||
|
||||
@ -51,7 +50,7 @@ class ArchivalRequestRouter:
|
||||
class Route:
|
||||
|
||||
# match up to the next slash
|
||||
SLASH_LOOKAHEAD ='(?=/|$)'
|
||||
SLASH_LOOKAHEAD ='(?=/|$|\?)'
|
||||
|
||||
|
||||
def __init__(self, regex, handler, coll_group = 0, lookahead = SLASH_LOOKAHEAD):
|
||||
@ -62,7 +61,7 @@ class Route:
|
||||
self.coll_group = coll_group
|
||||
|
||||
|
||||
def __call__(self, env, use_abs_prefix, archivalurl_class):
|
||||
def __call__(self, env, use_abs_prefix):
|
||||
request_uri = env['REL_REQUEST_URI']
|
||||
matcher = self.regex.match(request_uri[1:])
|
||||
if not matcher:
|
||||
@ -85,7 +84,7 @@ class Route:
|
||||
wb_url = wb_url,
|
||||
wb_prefix = wb_prefix,
|
||||
use_abs_prefix = use_abs_prefix,
|
||||
archivalurl_class = archivalurl_class)
|
||||
wburl_class = self.handler.get_wburl_type())
|
||||
|
||||
|
||||
# Allow for setup of additional filters
|
||||
|
@ -3,11 +3,20 @@ import utils
|
||||
import urlparse
|
||||
|
||||
from wbrequestresponse import WbResponse
|
||||
from wburl import WbUrl
|
||||
from wbexceptions import WbException
|
||||
|
||||
|
||||
class BaseHandler:
|
||||
@staticmethod
|
||||
def get_wburl_type():
|
||||
return WbUrl
|
||||
|
||||
|
||||
#=================================================================
|
||||
# Standard WB Handler
|
||||
#=================================================================
|
||||
class WBHandler:
|
||||
class WBHandler(BaseHandler):
|
||||
def __init__(self, cdx_reader, replay, capturespage = None, searchpage = None):
|
||||
self.cdx_reader = cdx_reader
|
||||
self.replay = replay
|
||||
@ -44,6 +53,7 @@ class WBHandler:
|
||||
return WbResponse.text_response('No Lookup Url Specified')
|
||||
|
||||
|
||||
|
||||
def __str__(self):
|
||||
return 'WBHandler: ' + str(self.cdx_reader) + ', ' + str(self.replay)
|
||||
|
||||
@ -52,39 +62,43 @@ class WBHandler:
|
||||
#=================================================================
|
||||
# CDX-Server Handler -- pass all params to cdx server
|
||||
#=================================================================
|
||||
class CDXHandler:
|
||||
class CDXHandler(BaseHandler):
|
||||
def __init__(self, cdx_reader, view = None):
|
||||
self.cdx_reader = cdx_reader
|
||||
self.view = view if view else views.TextCapturesView()
|
||||
|
||||
def __call__(self, wbrequest):
|
||||
url = wbrequest.wb_url.url
|
||||
#url = wbrequest.wb_url.url
|
||||
|
||||
# use url= param to get actual url
|
||||
params = urlparse.parse_qs(wbrequest.env['QUERY_STRING'])
|
||||
|
||||
url = params.get('url')
|
||||
if not url:
|
||||
raise Exception('Must specify a url= param to query cdx server')
|
||||
raise WbException('Must specify a url= param to query cdx server')
|
||||
|
||||
url = url[0]
|
||||
|
||||
cdx_lines = self.cdx_reader.load_cdx(url, params, parsed_cdx = False)
|
||||
|
||||
return self.view(wbrequest, cdx_lines)
|
||||
return self.view.render_response(wbrequest, cdx_lines)
|
||||
|
||||
|
||||
@staticmethod
|
||||
def get_wburl_type():
|
||||
return None
|
||||
|
||||
#=================================================================
|
||||
# Debug Handlers
|
||||
#=================================================================
|
||||
class DebugEchoEnvHandler:
|
||||
class DebugEchoEnvHandler(BaseHandler):
|
||||
def __call__(self, wbrequest):
|
||||
return wbrequestresponse.WbResponse.text_response(str(wbrequest.env))
|
||||
return WbResponse.text_response(str(wbrequest.env))
|
||||
|
||||
#=================================================================
|
||||
class DebugEchoHandler:
|
||||
class DebugEchoHandler(BaseHandler):
|
||||
def __call__(self, wbrequest):
|
||||
return wbrequestresponse.WbResponse.text_response(str(wbrequest))
|
||||
return WbResponse.text_response(str(wbrequest))
|
||||
|
||||
|
||||
|
||||
|
@ -176,6 +176,11 @@ class RemoteCDXServer(IndexReader):
|
||||
else:
|
||||
return response
|
||||
|
||||
|
||||
# Note: these params are designed to make pywb compatible with the original Java wayback-cdx-server API:
|
||||
# https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server
|
||||
# Soon, this will be switched over to support the native pywb cdx server
|
||||
|
||||
# BUG: Setting replayClosest to a high number for now, as the cdx server sometimes returns wrong results
|
||||
# with lower values if there are too many captures. Ideally, should be around 10-20
|
||||
# replayClosest is the max number of cdx lines, and therefore the max number of retry attempts that WB will make
|
||||
|
@ -105,7 +105,11 @@ def yaml_parse_index_loader(config):
|
||||
# support mixed cdx streams and remote servers?
|
||||
# for now, list implies local sources
|
||||
if isinstance(index_config, list):
|
||||
if len(index_config) > 1:
|
||||
return indexreader.LocalCDXServer(index_config, surt_ordered)
|
||||
else:
|
||||
# treat as non-list
|
||||
index_config = index_config[0]
|
||||
|
||||
if isinstance(index_config, str):
|
||||
uri = index_config
|
||||
@ -151,11 +155,22 @@ def yaml_parse_calendar_view(config):
|
||||
|
||||
def yaml_parse_route(config):
|
||||
name = config['name']
|
||||
type = config.get('type', 'wb')
|
||||
|
||||
if type == 'echo_env':
|
||||
return Route(name, handlers.DebugEchoEnvHandler())
|
||||
|
||||
if type == 'echo_req':
|
||||
return Route(name, handlers.DebugEchoHandler())
|
||||
|
||||
archive_loader = archiveloader.ArchiveLoader()
|
||||
|
||||
index_loader = yaml_parse_index_loader(config)
|
||||
|
||||
if type == 'cdx':
|
||||
handler = handlers.CDXHandler(index_loader)
|
||||
return Route(name, handler)
|
||||
|
||||
archive_resolvers = map(replay_resolvers.make_best_resolver, config['archive_paths'])
|
||||
|
||||
head_insert = yaml_parse_head_insert(config)
|
||||
|
@ -61,7 +61,7 @@ class WbRequest:
|
||||
return rel_prefix
|
||||
|
||||
|
||||
def __init__(self, env, request_uri, wb_prefix, wb_url, coll, use_abs_prefix = False, archivalurl_class = WbUrl):
|
||||
def __init__(self, env, request_uri, wb_prefix, wb_url, coll, use_abs_prefix = False, wburl_class = WbUrl):
|
||||
self.env = env
|
||||
|
||||
self.request_uri = request_uri if request_uri else env.get('REL_REQUEST_URI')
|
||||
@ -69,9 +69,9 @@ class WbRequest:
|
||||
self.wb_prefix = wb_prefix if not use_abs_prefix else WbRequest.make_abs_prefix(env, wb_prefix)
|
||||
|
||||
# wb_url present and not root page
|
||||
if wb_url != '/' and wb_url != '' and archivalurl_class:
|
||||
if wb_url != '/' and wb_url != '' and wburl_class:
|
||||
self.wb_url_str = wb_url
|
||||
self.wb_url = archivalurl_class(wb_url)
|
||||
self.wb_url = wburl_class(wb_url)
|
||||
else:
|
||||
# no wb_url, just store blank
|
||||
self.wb_url_str = '/'
|
||||
|
Loading…
x
Reference in New Issue
Block a user