1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-25 23:47:47 +01:00
pywb/pywb/pywb_init.py
Ilya Kreymer 86a093d164 support cdx server query at (/cdx in default config)
also enable /echo_env and /echo_req debug handlers
2014-02-01 00:43:24 -08:00

198 lines
6.4 KiB
Python

import archiveloader
import views
import handlers
import indexreader
import replay_views
import replay_resolvers
import cdxserve
from archivalrouter import ArchivalRequestRouter, Route
import os
import yaml
import utils
import logging
#=================================================================
## Reference non-YAML config
#=================================================================
def pywb_config_manual():
    """Build the reference router in code, without reading a YAML file.

    Wires the sample archive under ``./sample_archive/`` to three routes
    (``echo_req``, ``pywb`` and ``cdx``) and returns the resulting
    ``ArchivalRequestRouter``.
    """
    # Current test dir
    #test_dir = utils.test_data_dir()
    test_dir = './sample_archive/'

    # Standard loader which supports WARC/ARC files
    aloader = archiveloader.ArchiveLoader()

    # Source for the cdx index
    #query_h = query.QueryHandler(indexreader.RemoteCDXServer('http://cdx.example.com/cdx'))
    #test_cdx = [test_dir + 'iana.cdx', test_dir + 'example.cdx', test_dir + 'dupes.cdx']
    indexs = indexreader.LocalCDXServer([test_dir + 'cdx/'])

    # Loads warcs specified in cdx from these locations
    prefixes = [replay_resolvers.PrefixResolver(test_dir + 'warcs/')]

    # Jinja2 head insert
    head_insert = views.J2TemplateView('./ui/head_insert.html')

    # Create rewriting replay handler to rewrite records
    replayer = replay_views.RewritingReplayView(
        resolvers=prefixes,
        archiveloader=aloader,
        head_insert_view=head_insert,
        buffer_response=True)

    # Create Jinja2 based html query view
    html_view = views.J2HtmlCapturesView('./ui/query.html')

    # WB handler which uses the index reader, replayer, and html_view
    wb_handler = handlers.WBHandler(indexs, replayer, html_view)

    # cdx handler
    cdx_handler = handlers.CDXHandler(indexs)

    # Finally, create wb router.
    # The routes are given as a list rather than a set literal: a set has
    # no defined iteration order, so the route order would otherwise be
    # arbitrary from run to run.
    routes = [
        # Debug ex: just echo parsed request
        Route('echo_req', handlers.DebugEchoHandler()),
        Route('pywb', wb_handler),
        Route('cdx', cdx_handler),
    ]

    return ArchivalRequestRouter(
        routes,
        # Specify hostnames that pywb will be running on
        # This will help catch occasionally missed rewrites that fall-through to the host
        # (See archivalrouter.ReferRedirect)
        hostpaths=['http://localhost:8080/'])
#=================================================================
# YAML config loader
#=================================================================
DEFAULT_CONFIG_FILE = 'config.yaml'


def pywb_config(config_file=None):
    """Build an ``ArchivalRequestRouter`` from a YAML config file.

    config_file: path of the YAML file. When empty or None, the path is
        taken from the ``PYWB_CONFIG`` environment variable, falling back
        to ``DEFAULT_CONFIG_FILE``.

    The config must contain a ``routes`` entry (KeyError otherwise); each
    element is turned into a route by ``yaml_parse_route``. The optional
    ``home_html_template``, ``error_html_template`` and ``hostpaths``
    entries are also read.
    """
    if not config_file:
        config_file = os.environ.get('PYWB_CONFIG', DEFAULT_CONFIG_FILE)

    # Use a context manager so the file handle is closed once parsed.
    # NOTE(review): yaml.load can construct arbitrary Python objects; that
    # is only acceptable while the config file is trusted. Consider
    # yaml.safe_load if nothing relies on custom YAML tags.
    with open(config_file) as config_stream:
        config = yaml.load(config_stream)

    # Build a real list: on Python 3 map() would yield a one-shot iterator.
    routes = [yaml_parse_route(route_config) for route_config in config['routes']]

    homepage = yaml_load_template(config, 'home_html_template', 'Home Page Template')
    errorpage = yaml_load_template(config, 'error_html_template', 'Error Page Template')

    hostpaths = config.get('hostpaths', ['http://localhost:8080/'])

    return ArchivalRequestRouter(routes, hostpaths, homepage=homepage, errorpage=errorpage)
def yaml_load_template(config, name, desc=None):
    """Return a template view for the template file configured under *name*.

    config: mapping holding the parsed configuration.
    name: key whose value is the template file path.
    desc: optional label used in the log message instead of *name*.

    Returns ``views.J2TemplateView(path)`` when a non-empty path is
    configured. Otherwise the falsy configured value is returned unchanged
    (``None`` when the key is absent).
    """
    # Named template_path (not `file`) to avoid shadowing the Python 2
    # builtin and to keep the path distinct from the view built from it.
    template_path = config.get(name)
    if not template_path:
        return template_path

    logging.info('Adding {0}: {1}'.format(desc if desc else name, template_path))
    return views.J2TemplateView(template_path)
def yaml_parse_index_loader(config):
    """Create the cdx index reader described by ``config['index_paths']``.

    ``index_paths`` may be:
      * a list with more than one entry -> ``LocalCDXServer`` over the list
      * a one-element list              -> handled like its single element
      * a string                        -> a uri / path
      * a dict                          -> ``url`` plus optional ``cookie``

    An http(s) uri that does not end in ``.cdx`` yields a
    ``RemoteCDXServer``; anything else yields a ``LocalCDXServer``. The
    ``surt_ordered`` setting (default True) is passed to every local
    server.

    Raises KeyError when ``index_paths`` (or ``url`` in a dict) is missing,
    IndexError for an empty list, and Exception for any other value type.
    """
    index_config = config['index_paths']

    surt_ordered = config.get('surt_ordered', True)

    # support mixed cdx streams and remote servers?
    # for now, list implies local sources
    if isinstance(index_config, list):
        if len(index_config) > 1:
            return indexreader.LocalCDXServer(index_config, surt_ordered)

        # treat as non-list
        index_config = index_config[0]

    if isinstance(index_config, str):
        uri = index_config
        cookie = None
    elif isinstance(index_config, dict):
        uri = index_config['url']
        # The cookie is optional, as it already is for the plain-string form
        cookie = index_config.get('cookie')
    else:
        raise Exception('Invalid Index Reader Config: ' + str(index_config))

    # Check for remote cdx server
    is_remote = uri.startswith(('http://', 'https://')) and not uri.endswith('.cdx')
    if is_remote:
        return indexreader.RemoteCDXServer(uri, cookie=cookie)

    # Pass surt_ordered here too, so a single local path honours the
    # setting exactly like the multi-path case above.
    return indexreader.LocalCDXServer([uri], surt_ordered)
def yaml_parse_head_insert(config):
    """Return the head-insert view selected by *config*.

    A non-empty ``head_insert_html_template`` entry takes precedence and
    yields ``views.J2TemplateView``. Otherwise the ``head_insert_text``
    entry (default: empty string) is wrapped in ``views.StaticTextView``.
    """
    template_file = config.get('head_insert_html_template')

    if not template_file:
        # No template file configured: fall back to static head_insert text
        static_text = config.get('head_insert_text', '')
        logging.info('Adding Head-Insert Text: ' + static_text)
        return views.StaticTextView(static_text)

    logging.info('Adding Head-Insert Template: ' + template_file)
    return views.J2TemplateView(template_file)
def yaml_parse_calendar_view(config):
    """Return the HTML calendar view for *config*, or None if not configured.

    Reads the ``calendar_html_template`` entry; a non-empty value is passed
    to ``views.J2HtmlCapturesView``. Either outcome is logged at info level.
    """
    template_file = config.get('calendar_html_template')

    if not template_file:
        logging.info('No HTML Calendar View Present')
        return None

    logging.info('Adding HTML Calendar Template: ' + template_file)
    return views.J2HtmlCapturesView(template_file)
def yaml_parse_route(config):
    """Build a single ``Route`` from one entry of the ``routes`` config list.

    config: mapping for the route. ``name`` is required (KeyError if
        missing). ``type`` selects the handler and defaults to ``'wb'``:

        * ``echo_env`` -> ``handlers.DebugEchoEnvHandler``
        * ``echo_req`` -> ``handlers.DebugEchoHandler``
        * ``cdx``      -> ``handlers.CDXHandler`` over the configured index
        * any other    -> ``handlers.WBHandler`` (index + rewriting replay);
                          this also requires ``archive_paths``.
    """
    name = config['name']

    # Named route_type so the builtin `type` is not shadowed
    route_type = config.get('type', 'wb')

    # Debug handlers need neither an index nor an archive
    if route_type == 'echo_env':
        return Route(name, handlers.DebugEchoEnvHandler())

    if route_type == 'echo_req':
        return Route(name, handlers.DebugEchoHandler())

    index_loader = yaml_parse_index_loader(config)

    if route_type == 'cdx':
        return Route(name, handlers.CDXHandler(index_loader))

    # Only the replay route reads archives, so create the loader here
    archive_loader = archiveloader.ArchiveLoader()

    # Build a real list: on Python 3 map() would yield a one-shot iterator
    archive_resolvers = [replay_resolvers.make_best_resolver(path)
                         for path in config['archive_paths']]

    head_insert = yaml_parse_head_insert(config)

    replayer = replay_views.RewritingReplayView(
        resolvers=archive_resolvers,
        archiveloader=archive_loader,
        head_insert_view=head_insert,
        buffer_response=config.get('buffer_response', False))

    html_view = yaml_parse_calendar_view(config)

    searchpage = yaml_load_template(config, 'search_html_template', 'Search Page Template')

    wb_handler = handlers.WBHandler(index_loader, replayer, html_view, searchpage=searchpage)

    return Route(name, wb_handler)
if __name__ == "__main__" or utils.enable_doctests():
    # Just test for execution for now: build the router once from the
    # config.yaml one directory above this file, then once from the
    # in-code reference configuration.
    pywb_config(
        os.path.dirname(os.path.realpath(__file__)) + '/../config.yaml'
    )
    pywb_config_manual()