From 411e7fe8a3628db121068efca9e6c0030d48b72e Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 29 Jan 2014 00:03:24 -0800 Subject: [PATCH] cleanup pywb_init, work on documenting config.yaml! --- config.yaml | 66 +++++++++++++++++++++--- pywb/pywb_init.py | 105 ++++++++++++++++++++------------------- pywb/replay_resolvers.py | 28 +++++++++++ pywb/wbapp.py | 12 ++--- run.sh | 9 +++- 5 files changed, 152 insertions(+), 68 deletions(-) diff --git a/config.yaml b/config.yaml index 8abf7f84..ad36bcf3 100644 --- a/config.yaml +++ b/config.yaml @@ -1,17 +1,67 @@ +# pywb config file +# ======================================== +# +# Settings for each route are defined below +# Each route may be an archival collection or other handler +# routes: - pywb: - index_paths: - - ./sample_archive/cdx/ + # route name (eg /pywb) + - name: pywb - archive_paths: - - ./sample_archive/warcs/ + # list of paths to search cdx files + # * local .cdx file + # * local dir, will include all .cdx files in dir + # + # or a string value indicating remote http cdx server + index_paths: + - ./sample_archive/cdx/ - head_insert_template: ./ui/head_insert.html + # indicate if cdx files are sorted by SURT keys -- eg: com,example)/ + # SURT keys are recommended for future indices, but non-SURT cdxs + # are also supported + # + # * Set to true if cdxs start with surts: com,example)/ + # * Set to false if cdx start with urls: example.com)/ + surt_ordered: True - html_query_template: ./ui/query.html + # list of path prefixes pywb uses to 'resolve' WARC and ARC filenames + # in the cdx to their absolute path + # + # if path is: + # * local dir, use path as prefix + # * local file, lookup prefix in tab-delimited sorted index + # * http:// path, use path as remote prefix + # * redis:// path, use redis to lookup full path for w:<filename> as key + + archive_paths: + - ./sample_archive/warcs/ + + # ui: optional Jinja2 template to insert into <head> of each replay + head_insert_html_template: ./ui/head_insert.html + + # 
ui: optional text to directly insert into <head> + # only loaded if head_insert_html_template is not specified + + #head_insert_text: -hostpaths: http://localhost:8080/ + # ui: optional Jinja2 template to use for 'calendar' query, + # eg, a listing of captures in response to a ../*/<url> + # + # may be a simple listing or a more complex 'calendar' UI + # if omitted, the capture listing lists raw index + calendar_html_template: ./ui/query.html + + +# list of host names that pywb will be running from to detect +# 'fallthrough' requests based on referrer +# +# eg: an incorrect request for http://localhost:8080/image.gif with a referrer +# of http://localhost:8080/pywb/index.html, pywb can correctly redirect +# to http://localhost:8080/pywb/image.gif +# + +hostpaths: ['http://localhost:8080/'] diff --git a/pywb/pywb_init.py b/pywb/pywb_init.py index 382f3d63..4d1e5c48 100644 --- a/pywb/pywb_init.py +++ b/pywb/pywb_init.py @@ -11,16 +11,18 @@ import yaml import utils import logging -## =========== -default_head_insert = """ +#================================================================= +## Reference non-YAML config +#================================================================= +def pywb_config_manual(): + default_head_insert = """ - - - - -""" + + + + + """ -def pywb_config2(): # Current test dir #test_dir = utils.test_data_dir() test_dir = './sample_archive/' @@ -64,10 +66,20 @@ def pywb_config2(): hostpaths = ['http://localhost:8080/']) -def pywb_config(filename = './pywb/config.yaml'): - config = yaml.load(open(filename)) - routes = map(yaml_parse_route, config['routes'].iteritems()) +#================================================================= +# YAML config loader +#================================================================= +DEFAULT_CONFIG_FILE = 'config.yaml' + + +def pywb_config(config_file = None): + if not config_file: + config_file = os.environ.get('PYWB_CONFIG', DEFAULT_CONFIG_FILE) + + config = yaml.load(open(config_file)) + + routes = 
map(yaml_parse_route, config['routes']) hostpaths = config.get('hostpaths', ['http://localhost:8080/']) @@ -75,9 +87,6 @@ def pywb_config(filename = './pywb/config.yaml'): -def yaml_parse_route((route_name, handler_def)): - return Route(route_name, yaml_parse_handler(handler_def)) - def yaml_parse_index_loader(index_config): # support mixed cdx streams and remote servers? @@ -101,60 +110,56 @@ def yaml_parse_index_loader(index_config): return indexreader.LocalCDXServer([uri]) -def yaml_parse_archive_resolvers(archive_paths): - - #TODO: more options (remote files, contains param, etc..) - def make_resolver(path): - if path.startswith('redis://'): - return replay_resolvers.RedisResolver(path) - elif os.path.isfile(path): - return replay_resolvers.PathIndexResolver(path) - else: - logging.info('Adding Archive Source: ' + path) - return replay_resolvers.PrefixResolver(path) - - return map(make_resolver, archive_paths) - -def yaml_parse_head_insert(handler_def): +def yaml_parse_head_insert(config): # First, try a template file - head_insert_file = handler_def.get('head_insert_template') + head_insert_file = config.get('head_insert_html_template') if head_insert_file: logging.info('Adding Head-Insert Template: ' + head_insert_file) return views.J2HeadInsertView(head_insert_file) # Then, static head_insert text - head_insert_text = handler_def.get('head_insert_text', '') + head_insert_text = config.get('head_insert_text', '') logging.info('Adding Head-Insert Text: ' + head_insert_text) return head_insert_text -def yaml_parse_handler(handler_def): - archive_loader = archiveloader.ArchiveLoader() - - index_loader = yaml_parse_index_loader(handler_def['index_paths']) - - archive_resolvers = yaml_parse_archive_resolvers(handler_def['archive_paths']) - - head_insert = yaml_parse_head_insert(handler_def) - - replayer = replay_views.RewritingReplayView(resolvers = archive_resolvers, - archiveloader = archive_loader, - head_insert = head_insert, - buffer_response = 
handler_def.get('buffer_response', False)) - - html_view_file = handler_def.get('html_query_template') +def yaml_parse_calendar_view(config): + html_view_file = config.get('calendar_html_template') if html_view_file: logging.info('Adding HTML Calendar Template: ' + html_view_file) else: logging.info('No HTML Calendar View Present') - html_view = views.J2QueryView(html_view_file) if html_view_file else None + return views.J2QueryView(html_view_file) if html_view_file else None + + + +def yaml_parse_route(config): + name = config['name'] + + archive_loader = archiveloader.ArchiveLoader() + + index_loader = yaml_parse_index_loader(config['index_paths']) + + archive_resolvers = map(replay_resolvers.make_best_resolver, config['archive_paths']) + + head_insert = yaml_parse_head_insert(config) + + replayer = replay_views.RewritingReplayView(resolvers = archive_resolvers, + archiveloader = archive_loader, + head_insert = head_insert, + buffer_response = config.get('buffer_response', False)) + + html_view = yaml_parse_calendar_view(config) wb_handler = handlers.WBHandler(index_loader, replayer, html_view) - return wb_handler + + return Route(name, wb_handler) + if __name__ == "__main__" or utils.enable_doctests(): - pass - #print pywb_config('config.yaml') + # Just test for execution for now + pywb_config(os.path.dirname(os.path.realpath(__file__)) + '/../config.yaml') + pywb_config_manual() diff --git a/pywb/replay_resolvers.py b/pywb/replay_resolvers.py index f437d0bd..98bcc89d 100644 --- a/pywb/replay_resolvers.py +++ b/pywb/replay_resolvers.py @@ -1,5 +1,10 @@ import redis import binsearch + +import urlparse +import os +import logging + #====================================== # PrefixResolver - convert cdx file entry to url with prefix if url contains specified string #====================================== @@ -41,3 +46,26 @@ class PathIndexResolver: return gen_list(result) + +#TODO: more options (remote files, contains param, etc..) 
+# find best resolver given the path +def make_best_resolver(path): + url_parts = urlparse.urlsplit(path) + + if url_parts.scheme == 'redis': + logging.info('Adding Redis Index: ' + path) + return RedisResolver(path) + + if url_parts.scheme == 'file': + path = url_parts.path + + if os.path.isfile(path): + logging.info('Adding Path Index: ' + path) + return PathIndexResolver(path) + + # non-file paths always treated as prefix for now + else: + logging.info('Adding Archive Path Source: ' + path) + return PrefixResolver(path) + + diff --git a/pywb/wbapp.py b/pywb/wbapp.py index 89d2bfef..14857003 100644 --- a/pywb/wbapp.py +++ b/pywb/wbapp.py @@ -84,18 +84,16 @@ def main(): # Attempt to load real settings from globalwb module logging.basicConfig(format = '%(asctime)s: [%(levelname)s]: %(message)s', level = logging.DEBUG) - config_name = os.environ.get('PYWB_CONFIG') + config_name = os.environ.get('PYWB_CONFIG_MODULE') if not config_name: config_name = 'pywb.pywb_init' - logging.info('PYWB_CONFIG not specified, loading default settings from module "{0}"'.format(config_name)) + logging.info('Loading from default config module "{0}"'.format(config_name)) logging.info('') module = importlib.import_module(config_name) - config_file = DEFAULT_CONFIG_FILE - - app = create_wb_app(module.pywb_config(config_file)) + app = create_wb_app(module.pywb_config()) logging.info('') logging.info('*** pywb inited with settings from {0}.pywb_config()!\n'.format(config_name)) return app @@ -107,8 +105,6 @@ def main(): #================================================================= if __name__ == "__main__" or utils.enable_doctests(): - import pywb_init - # Test sample settings - application = create_wb_app(pywb_init.pywb_config('../' + DEFAULT_CONFIG_FILE)) + pass else: application = main() diff --git a/run.sh b/run.sh index 0d2d803c..c4af9c4a 100755 --- a/run.sh +++ b/run.sh @@ -2,8 +2,13 @@ mypath=$(cd `dirname $0` && pwd) -# Setup init module -#export 'PYWB_CONFIG=globalwb' +# Set a 
different config file +#export 'PYWB_CONFIG=myconfig.yaml' + +# Set alternate init module +# The module's pywb_config() is called to load settings +# ex: my_pywb.pywb_config() +#export 'PYWB_CONFIG_MODULE=my_pywb' + app="pywb.wbapp"