1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

cleanup pywb_init, work on documenting config.yaml!

This commit is contained in:
Ilya Kreymer 2014-01-29 00:03:24 -08:00
parent 43a46b373d
commit 411e7fe8a3
5 changed files with 152 additions and 68 deletions

View File

@ -1,17 +1,67 @@
# pywb config file
# ========================================
#
# Settings for each route are defined below
# Each route may be an archival collection or other handler
#
routes:
pywb:
index_paths:
- ./sample_archive/cdx/
# route name (eg /pywb)
- name: pywb
archive_paths:
- ./sample_archive/warcs/
# list of paths to search cdx files
# * local .cdx file
# * local dir, will include all .cdx files in dir
#
# or a string value indicating remote http cdx server
index_paths:
- ./sample_archive/cdx/
head_insert_template: ./ui/head_insert.html
# indicate if cdx files are sorted by SURT keys -- eg: com,example)/
# SURT keys are recommended for future indices, but non-SURT cdxs
# are also supported
#
# * Set to true if cdxs start with surts: com,example)/
# * Set to false if cdxs start with urls: example.com)/
surt_ordered: True
html_query_template: ./ui/query.html
# list of path prefixes that pywb uses to 'resolve' WARC and ARC filenames
# in the cdx to their absolute path
#
# if path is:
# * local dir, use path as prefix
# * local file, lookup prefix in tab-delimited sorted index
# * http:// path, use path as remote prefix
# * redis:// path, use redis to lookup full path for w:<warc> as key
archive_paths:
- ./sample_archive/warcs/
# ui: optional Jinja2 template to insert into <head> of each replay
head_insert_html_template: ./ui/head_insert.html
# ui: optional text to directly insert into <head>
# only loaded if head_insert_html_template is not specified
#head_insert_text: <script src='example.js'></script>
hostpaths: http://localhost:8080/
# ui: optional Jinja2 template to use for 'calendar' query,
# eg, a listing of captures in response to a ../*/<url>
#
# may be a simple listing or a more complex 'calendar' UI
# if omitted, the capture listing lists raw index
calendar_html_template: ./ui/query.html
# list of host names that pywb will be running from to detect
# 'fallthrough' requests based on referrer
#
# eg: given an incorrect request for http://localhost:8080/image.gif with a referrer
# of http://localhost:8080/pywb/index.html, pywb can correctly redirect
# to http://localhost:8080/pywb/image.gif
#
hostpaths: ['http://localhost:8080/']

View File

@ -11,16 +11,18 @@ import yaml
import utils
import logging
## ===========
default_head_insert = """
#=================================================================
## Reference non-YAML config
#=================================================================
def pywb_config_manual():
default_head_insert = """
<!-- WB Insert -->
<script src='/static/wb.js'> </script>
<link rel='stylesheet' href='/static/wb.css'/>
<!-- End WB Insert -->
"""
<!-- WB Insert -->
<script src='/static/wb.js'> </script>
<link rel='stylesheet' href='/static/wb.css'/>
<!-- End WB Insert -->
"""
def pywb_config2():
# Current test dir
#test_dir = utils.test_data_dir()
test_dir = './sample_archive/'
@ -64,10 +66,20 @@ def pywb_config2():
hostpaths = ['http://localhost:8080/'])
def pywb_config(filename = './pywb/config.yaml'):
config = yaml.load(open(filename))
routes = map(yaml_parse_route, config['routes'].iteritems())
#=================================================================
# YAML config loader
#=================================================================
DEFAULT_CONFIG_FILE = 'config.yaml'
def pywb_config(config_file = None):
if not config_file:
config_file = os.environ.get('PYWB_CONFIG', DEFAULT_CONFIG_FILE)
config = yaml.load(open(config_file))
routes = map(yaml_parse_route, config['routes'])
hostpaths = config.get('hostpaths', ['http://localhost:8080/'])
@ -75,9 +87,6 @@ def pywb_config(filename = './pywb/config.yaml'):
def yaml_parse_route((route_name, handler_def)):
return Route(route_name, yaml_parse_handler(handler_def))
def yaml_parse_index_loader(index_config):
# support mixed cdx streams and remote servers?
@ -101,60 +110,56 @@ def yaml_parse_index_loader(index_config):
return indexreader.LocalCDXServer([uri])
def yaml_parse_archive_resolvers(archive_paths):
    """Create one archive resolver for each entry in ``archive_paths``.

    Each path is inspected and mapped to a resolver from ``replay_resolvers``:

    * a path starting with ``redis://`` -> ``RedisResolver``
    * a path naming an existing local file -> ``PathIndexResolver``
    * anything else -> ``PrefixResolver`` (the choice is logged)

    Returns the result of ``map()`` over ``archive_paths``, in input order.
    """
    #TODO: more options (remote files, contains param, etc..)
    def pick_resolver(location):
        # redis:// locations get the redis-backed resolver
        if location.startswith('redis://'):
            return replay_resolvers.RedisResolver(location)

        # an existing local file gets the path-index resolver
        if os.path.isfile(location):
            return replay_resolvers.PathIndexResolver(location)

        # everything else is used as a plain prefix
        logging.info('Adding Archive Source: ' + location)
        return replay_resolvers.PrefixResolver(location)

    return map(pick_resolver, archive_paths)
def yaml_parse_head_insert(handler_def):
def yaml_parse_head_insert(config):
# First, try a template file
head_insert_file = handler_def.get('head_insert_template')
head_insert_file = config.get('head_insert_html_template')
if head_insert_file:
logging.info('Adding Head-Insert Template: ' + head_insert_file)
return views.J2HeadInsertView(head_insert_file)
# Then, static head_insert text
head_insert_text = handler_def.get('head_insert_text', '')
head_insert_text = config.get('head_insert_text', '')
logging.info('Adding Head-Insert Text: ' + head_insert_text)
return head_insert_text
def yaml_parse_handler(handler_def):
archive_loader = archiveloader.ArchiveLoader()
index_loader = yaml_parse_index_loader(handler_def['index_paths'])
archive_resolvers = yaml_parse_archive_resolvers(handler_def['archive_paths'])
head_insert = yaml_parse_head_insert(handler_def)
replayer = replay_views.RewritingReplayView(resolvers = archive_resolvers,
archiveloader = archive_loader,
head_insert = head_insert,
buffer_response = handler_def.get('buffer_response', False))
html_view_file = handler_def.get('html_query_template')
def yaml_parse_calendar_view(config):
html_view_file = config.get('calendar_html_template')
if html_view_file:
logging.info('Adding HTML Calendar Template: ' + html_view_file)
else:
logging.info('No HTML Calendar View Present')
html_view = views.J2QueryView(html_view_file) if html_view_file else None
return views.J2QueryView(html_view_file) if html_view_file else None
def yaml_parse_route(config):
name = config['name']
archive_loader = archiveloader.ArchiveLoader()
index_loader = yaml_parse_index_loader(config['index_paths'])
archive_resolvers = map(replay_resolvers.make_best_resolver, config['archive_paths'])
head_insert = yaml_parse_head_insert(config)
replayer = replay_views.RewritingReplayView(resolvers = archive_resolvers,
archiveloader = archive_loader,
head_insert = head_insert,
buffer_response = config.get('buffer_response', False))
html_view = yaml_parse_calendar_view(config)
wb_handler = handlers.WBHandler(index_loader, replayer, html_view)
return wb_handler
return Route(name, wb_handler)
if __name__ == "__main__" or utils.enable_doctests():
pass
#print pywb_config('config.yaml')
# Just test for execution for now
pywb_config(os.path.dirname(os.path.realpath(__file__)) + '/../config.yaml')
pywb_config_manual()

View File

@ -1,5 +1,10 @@
import redis
import binsearch
import urlparse
import os
import logging
#======================================
# PrefixResolver - convert cdx file entry to url with prefix if url contains specified string
#======================================
@ -41,3 +46,26 @@ class PathIndexResolver:
return gen_list(result)
#TODO: more options (remote files, contains param, etc..)
# find best resolver given the path
def make_best_resolver(path):
    """Return the resolver that best fits ``path``.

    The choice is made from the url scheme and the local filesystem:

    * ``redis`` scheme -> ``RedisResolver`` built from the full ``path``
    * ``file`` scheme -> the scheme is dropped and only the url's path
      component is used for the checks below
    * an existing local file -> ``PathIndexResolver``
    * anything else -> ``PrefixResolver``

    Every choice is logged at info level.
    """
    parsed = urlparse.urlsplit(path)
    scheme = parsed.scheme

    if scheme == 'redis':
        logging.info('Adding Redis Index: ' + path)
        return RedisResolver(path)

    # file:// urls are reduced to their plain path component
    target = parsed.path if scheme == 'file' else path

    # non-file paths always treated as prefix for now
    if not os.path.isfile(target):
        logging.info('Adding Archive Path Source: ' + target)
        return PrefixResolver(target)

    logging.info('Adding Path Index: ' + target)
    return PathIndexResolver(target)

View File

@ -84,18 +84,16 @@ def main():
# Attempt to load real settings from globalwb module
logging.basicConfig(format = '%(asctime)s: [%(levelname)s]: %(message)s', level = logging.DEBUG)
config_name = os.environ.get('PYWB_CONFIG')
config_name = os.environ.get('PYWB_CONFIG_MODULE')
if not config_name:
config_name = 'pywb.pywb_init'
logging.info('PYWB_CONFIG not specified, loading default settings from module "{0}"'.format(config_name))
logging.info('Loading from default config module "{0}"'.format(config_name))
logging.info('')
module = importlib.import_module(config_name)
config_file = DEFAULT_CONFIG_FILE
app = create_wb_app(module.pywb_config(config_file))
app = create_wb_app(module.pywb_config())
logging.info('')
logging.info('*** pywb inited with settings from {0}.pywb_config()!\n'.format(config_name))
return app
@ -107,8 +105,6 @@ def main():
#=================================================================
if __name__ == "__main__" or utils.enable_doctests():
import pywb_init
# Test sample settings
application = create_wb_app(pywb_init.pywb_config('../' + DEFAULT_CONFIG_FILE))
pass
else:
application = main()

9
run.sh
View File

@ -2,8 +2,13 @@
mypath=$(cd `dirname $0` && pwd)
# Setup init module
#export 'PYWB_CONFIG=globalwb'
# Set a different config file
#export 'PYWB_CONFIG=myconfig.yaml'
# Set alternate init module
# The module's pywb_config() function is called to obtain the config
# ex: my_pywb.pywb_config()
#export 'PYWB_CONFIG_MODULE=my_pywb'
app="pywb.wbapp"