1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

support cdx server query (at /cdx in default config)

also enable /echo_env and /echo_req debug handlers
This commit is contained in:
Ilya Kreymer 2014-02-01 00:43:24 -08:00
parent f00ac826cf
commit 86a093d164
6 changed files with 72 additions and 19 deletions

View File

@ -56,6 +56,26 @@ routes:
# this page is displayed when no search url is entered
search_html_template: ./ui/search.html
# Sample Debug Handlers (subject to change)
# Echo Request
- name: echo_req
type: echo_req
# Echo WSGI Env
- name: echo_env
type: echo_env
# CDX Server
- name: cdx
index_paths: ['./sample_archive/cdx/']
type: 'cdx'
# list of host names that pywb will be running from to detect
# 'fallthrough' requests based on referrer
#

View File

@ -10,18 +10,17 @@ from wburl import WbUrl
# ArchivalRequestRouter -- route WB requests in archival mode
#=================================================================
class ArchivalRequestRouter:
def __init__(self, routes, hostpaths = None, abs_path = True, archivalurl_class = WbUrl, homepage = None, errorpage = None):
def __init__(self, routes, hostpaths = None, abs_path = True, homepage = None, errorpage = None):
self.routes = routes
self.fallback = ReferRedirect(hostpaths)
self.abs_path = abs_path
self.archivalurl_class = archivalurl_class
self.homepage = homepage
self.errorpage = errorpage
def __call__(self, env):
for route in self.routes:
result = route(env, self.abs_path, self.archivalurl_class)
result = route(env, self.abs_path)
if result:
return result
@ -51,7 +50,7 @@ class ArchivalRequestRouter:
class Route:
# match up to next slash
SLASH_LOOKAHEAD ='(?=/|$)'
SLASH_LOOKAHEAD ='(?=/|$|\?)'
def __init__(self, regex, handler, coll_group = 0, lookahead = SLASH_LOOKAHEAD):
@ -62,7 +61,7 @@ class Route:
self.coll_group = coll_group
def __call__(self, env, use_abs_prefix, archivalurl_class):
def __call__(self, env, use_abs_prefix):
request_uri = env['REL_REQUEST_URI']
matcher = self.regex.match(request_uri[1:])
if not matcher:
@ -85,7 +84,7 @@ class Route:
wb_url = wb_url,
wb_prefix = wb_prefix,
use_abs_prefix = use_abs_prefix,
archivalurl_class = archivalurl_class)
wburl_class = self.handler.get_wburl_type())
# Allow for setup of additional filters

View File

@ -3,11 +3,20 @@ import utils
import urlparse
from wbrequestresponse import WbResponse
from wburl import WbUrl
from wbexceptions import WbException
class BaseHandler:
@staticmethod
def get_wburl_type():
return WbUrl
#=================================================================
# Standard WB Handler
#=================================================================
class WBHandler:
class WBHandler(BaseHandler):
def __init__(self, cdx_reader, replay, capturespage = None, searchpage = None):
self.cdx_reader = cdx_reader
self.replay = replay
@ -44,6 +53,7 @@ class WBHandler:
return WbResponse.text_response('No Lookup Url Specified')
def __str__(self):
return 'WBHandler: ' + str(self.cdx_reader) + ', ' + str(self.replay)
@ -52,39 +62,43 @@ class WBHandler:
#=================================================================
# CDX-Server Handler -- pass all params to cdx server
#=================================================================
class CDXHandler:
class CDXHandler(BaseHandler):
def __init__(self, cdx_reader, view = None):
self.cdx_reader = cdx_reader
self.view = view if view else views.TextCapturesView()
def __call__(self, wbrequest):
url = wbrequest.wb_url.url
#url = wbrequest.wb_url.url
# use url= param to get actual url
params = urlparse.parse_qs(wbrequest.env['QUERY_STRING'])
url = params.get('url')
if not url:
raise Exception('Must specify a url= param to query cdx server')
raise WbException('Must specify a url= param to query cdx server')
url = url[0]
cdx_lines = self.cdx_reader.load_cdx(url, params, parsed_cdx = False)
return self.view(wbrequest, cdx_lines)
return self.view.render_response(wbrequest, cdx_lines)
@staticmethod
def get_wburl_type():
return None
#=================================================================
# Debug Handlers
#=================================================================
class DebugEchoEnvHandler:
class DebugEchoEnvHandler(BaseHandler):
def __call__(self, wbrequest):
return wbrequestresponse.WbResponse.text_response(str(wbrequest.env))
return WbResponse.text_response(str(wbrequest.env))
#=================================================================
class DebugEchoHandler:
class DebugEchoHandler(BaseHandler):
def __call__(self, wbrequest):
return wbrequestresponse.WbResponse.text_response(str(wbrequest))
return WbResponse.text_response(str(wbrequest))

View File

@ -176,6 +176,11 @@ class RemoteCDXServer(IndexReader):
else:
return response
# Note: these params are designed to make pywb compatible with the original Java wayback-cdx-server API:
# https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server
# Soon, this will be switched over to support the native pywb cdx server
# BUG: Setting replayClosest to a high number for now, as the cdx server sometimes returns a wrong result
# with lower values if there are too many captures. Ideally, it should be around 10-20.
# replayClosest is the max number of cdx lines, and therefore the max number of retry attempts that WB will make

View File

@ -105,7 +105,11 @@ def yaml_parse_index_loader(config):
# support mixed cdx streams and remote servers?
# for now, list implies local sources
if isinstance(index_config, list):
if len(index_config) > 1:
return indexreader.LocalCDXServer(index_config, surt_ordered)
else:
# treat as non-list
index_config = index_config[0]
if isinstance(index_config, str):
uri = index_config
@ -151,11 +155,22 @@ def yaml_parse_calendar_view(config):
def yaml_parse_route(config):
name = config['name']
type = config.get('type', 'wb')
if type == 'echo_env':
return Route(name, handlers.DebugEchoEnvHandler())
if type == 'echo_req':
return Route(name, handlers.DebugEchoHandler())
archive_loader = archiveloader.ArchiveLoader()
index_loader = yaml_parse_index_loader(config)
if type == 'cdx':
handler = handlers.CDXHandler(index_loader)
return Route(name, handler)
archive_resolvers = map(replay_resolvers.make_best_resolver, config['archive_paths'])
head_insert = yaml_parse_head_insert(config)

View File

@ -61,7 +61,7 @@ class WbRequest:
return rel_prefix
def __init__(self, env, request_uri, wb_prefix, wb_url, coll, use_abs_prefix = False, archivalurl_class = WbUrl):
def __init__(self, env, request_uri, wb_prefix, wb_url, coll, use_abs_prefix = False, wburl_class = WbUrl):
self.env = env
self.request_uri = request_uri if request_uri else env.get('REL_REQUEST_URI')
@ -69,9 +69,9 @@ class WbRequest:
self.wb_prefix = wb_prefix if not use_abs_prefix else WbRequest.make_abs_prefix(env, wb_prefix)
# wb_url present and not root page
if wb_url != '/' and wb_url != '' and archivalurl_class:
if wb_url != '/' and wb_url != '' and wburl_class:
self.wb_url_str = wb_url
self.wb_url = archivalurl_class(wb_url)
self.wb_url = wburl_class(wb_url)
else:
# no wb_url, just store blank
self.wb_url_str = '/'