From 43a46b373d91408f0e45311deb4734eb0202c70b Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 28 Jan 2014 22:03:01 -0800 Subject: [PATCH] move sample/test data to ./sample_archive/warcs and ./sample_archive/cdx pywb_init now driven by config.yaml! (#14) Not yet supporting customized handlers, views, etc... --- config.yaml | 18 +++ pywb/archiveloader.py | 2 +- pywb/binsearch.py | 2 +- pywb/cdxserve.py | 2 +- pywb/indexreader.py | 2 +- pywb/pywb_init.py | 111 ++++++++++++++++-- pywb/utils.py | 4 + pywb/views.py | 8 +- pywb/wbapp.py | 8 +- {test => sample_archive/cdx}/dupes.cdx | 0 {test => sample_archive/cdx}/example.cdx | 0 {test => sample_archive/cdx}/iana.cdx | 0 {test => sample_archive/warcs}/dupes.warc.gz | Bin .../warcs}/example.warc.gz | Bin {test => sample_archive/warcs}/iana.warc.gz | Bin 15 files changed, 142 insertions(+), 15 deletions(-) create mode 100644 config.yaml rename {test => sample_archive/cdx}/dupes.cdx (100%) rename {test => sample_archive/cdx}/example.cdx (100%) rename {test => sample_archive/cdx}/iana.cdx (100%) rename {test => sample_archive/warcs}/dupes.warc.gz (100%) rename {test => sample_archive/warcs}/example.warc.gz (100%) rename {test => sample_archive/warcs}/iana.warc.gz (100%) diff --git a/config.yaml b/config.yaml new file mode 100644 index 00000000..8abf7f84 --- /dev/null +++ b/config.yaml @@ -0,0 +1,18 @@ +routes: + pywb: + index_paths: + - ./sample_archive/cdx/ + + archive_paths: + - ./sample_archive/warcs/ + + head_insert_template: ./ui/head_insert.html + + html_query_template: ./ui/query.html + + +hostpaths: http://localhost:8080/ + + + + diff --git a/pywb/archiveloader.py b/pywb/archiveloader.py index 586c359a..dbba1449 100644 --- a/pywb/archiveloader.py +++ b/pywb/archiveloader.py @@ -380,7 +380,7 @@ if __name__ == "__main__" or utils.enable_doctests(): testloader = ArchiveLoader() def load_test_archive(test_file, offset, length): - path = os.path.dirname(os.path.realpath(__file__)) + '/../test/' + test_file + path = utils.test_data_dir() + 'warcs/' + test_file archive = testloader.load(path, offset, length) pprint.pprint((archive.type, archive.rec_headers, archive.status_headers)) diff --git a/pywb/binsearch.py b/pywb/binsearch.py index 2027e204..563a1e32 100644 --- a/pywb/binsearch.py +++ b/pywb/binsearch.py @@ -127,7 +127,7 @@ import utils if __name__ == "__main__" or utils.enable_doctests(): def create_test_cdx(test_file): - path = os.path.dirname(os.path.realpath(__file__)) + '/../test/' + test_file + path = utils.test_data_dir() + 'cdx/' + test_file return FileReader(path) test_cdx = create_test_cdx('iana.cdx') diff --git a/pywb/cdxserve.py b/pywb/cdxserve.py index 8f53b7b0..e4922516 100644 --- a/pywb/cdxserve.py +++ b/pywb/cdxserve.py @@ -332,7 +332,7 @@ if __name__ == "__main__" or utils.enable_doctests(): import os import sys - test_dir = os.path.dirname(os.path.realpath(__file__)) + '/../test/' + test_dir = utils.test_data_dir() + 'cdx/' def test_cdx(key, match_func = binsearch.iter_exact, sources = [test_dir + 'iana.cdx'], **kwparams): for x in cdx_serve(key, kwparams, sources, match_func): diff --git a/pywb/indexreader.py b/pywb/indexreader.py index 00a1050c..ea692a05 100644 --- a/pywb/indexreader.py +++ b/pywb/indexreader.py @@ -251,7 +251,7 @@ import utils if __name__ == "__main__" or utils.enable_doctests(): from pprint import pprint - test_dir = os.path.dirname(os.path.realpath(__file__)) + '/../test/' + test_dir = utils.test_data_dir() + 'cdx/' import doctest doctest.testmod() diff --git a/pywb/pywb_init.py b/pywb/pywb_init.py index f6e58951..382f3d63 100644 --- a/pywb/pywb_init.py +++ b/pywb/pywb_init.py @@ -7,7 +7,9 @@ import replay_resolvers import cdxserve from archivalrouter import ArchivalRequestRouter, Route import os - +import yaml +import utils +import logging ## =========== default_head_insert = """ @@ -18,9 +20,10 @@ default_head_insert = """ """ -def pywb_config(): +def pywb_config2(): # Current test dir - test_dir = os.path.dirname(os.path.realpath(__file__)) + '/../test/' + #test_dir = utils.test_data_dir() + test_dir = './sample_archive/' # Standard loader which supports WARC/ARC files aloader = archiveloader.ArchiveLoader() @@ -28,19 +31,19 @@ def pywb_config(): # Source for cdx source #query_h = query.QueryHandler(indexreader.RemoteCDXServer('http://cdx.example.com/cdx')) #test_cdx = [test_dir + 'iana.cdx', test_dir + 'example.cdx', test_dir + 'dupes.cdx'] - indexs = indexreader.LocalCDXServer([test_dir]) + indexs = indexreader.LocalCDXServer([test_dir + 'cdx/']) # Loads warcs specified in cdx from these locations - prefixes = [replay_resolvers.PrefixResolver(test_dir)] + prefixes = [replay_resolvers.PrefixResolver(test_dir + 'warcs/')] # Jinja2 head insert - head_insert = views.J2HeadInsertView('./ui/', 'head_insert.html') + head_insert = views.J2HeadInsertView('./ui/head_insert.html') # Create rewriting replay handler to rewrite records replayer = replay_views.RewritingReplayView(resolvers = prefixes, archiveloader = aloader, head_insert = head_insert, buffer_response = True) # Create Jinja2 based html query view - html_view = views.J2QueryView('./ui/', 'query.html') + html_view = views.J2QueryView('./ui/query.html') # WB handler which uses the index reader, replayer, and html_view wb_handler = handlers.WBHandler(indexs, replayer, html_view) @@ -61,3 +64,97 @@ def pywb_config(): hostpaths = ['http://localhost:8080/']) +def pywb_config(filename = './pywb/config.yaml'): + config = yaml.load(open(filename)) + + routes = map(yaml_parse_route, config['routes'].iteritems()) + + hostpaths = config.get('hostpaths', ['http://localhost:8080/']) + + return ArchivalRequestRouter(routes, hostpaths) + + + +def yaml_parse_route((route_name, handler_def)): + return Route(route_name, yaml_parse_handler(handler_def)) + + +def yaml_parse_index_loader(index_config): + # support mixed cdx streams and remote servers? + # for now, list implies local sources + if isinstance(index_config, list): + return indexreader.LocalCDXServer(index_config) + + if isinstance(index_config, str): + uri = index_config + cookie = None + elif isinstance(index_config, dict): + uri = index_config['url'] + cookie = index_config['cookie'] + else: + raise Exception('Invalid Index Reader Config: ' + str(index_config)) + + # Check for remote cdx server + if (uri.startswith('http://') or uri.startswith('https://')) and not uri.endswith('.cdx'): + return indexreader.RemoteCDXServer(uri, cookie = cookie) + else: + return indexreader.LocalCDXServer([uri]) + + +def yaml_parse_archive_resolvers(archive_paths): + + #TODO: more options (remote files, contains param, etc..) + def make_resolver(path): + if path.startswith('redis://'): + return replay_resolvers.RedisResolver(path) + elif os.path.isfile(path): + return replay_resolvers.PathIndexResolver(path) + else: + logging.info('Adding Archive Source: ' + path) + return replay_resolvers.PrefixResolver(path) + + return map(make_resolver, archive_paths) + +def yaml_parse_head_insert(handler_def): + # First, try a template file + head_insert_file = handler_def.get('head_insert_template') + if head_insert_file: + logging.info('Adding Head-Insert Template: ' + head_insert_file) + return views.J2HeadInsertView(head_insert_file) + + # Then, static head_insert text + head_insert_text = handler_def.get('head_insert_text', '') + logging.info('Adding Head-Insert Text: ' + head_insert_text) + return head_insert_text + + +def yaml_parse_handler(handler_def): + archive_loader = archiveloader.ArchiveLoader() + + index_loader = yaml_parse_index_loader(handler_def['index_paths']) + + archive_resolvers = yaml_parse_archive_resolvers(handler_def['archive_paths']) + + head_insert = yaml_parse_head_insert(handler_def) + + replayer = replay_views.RewritingReplayView(resolvers = archive_resolvers, + archiveloader = archive_loader, + head_insert = head_insert, + buffer_response = handler_def.get('buffer_response', False)) + + html_view_file = handler_def.get('html_query_template') + if html_view_file: + logging.info('Adding HTML Calendar Template: ' + html_view_file) + else: + logging.info('No HTML Calendar View Present') + + html_view = views.J2QueryView(html_view_file) if html_view_file else None + + wb_handler = handlers.WBHandler(index_loader, replayer, html_view) + return wb_handler + +if __name__ == "__main__" or utils.enable_doctests(): + pass + #print pywb_config('config.yaml') + + diff --git a/pywb/utils.py b/pywb/utils.py index 02375376..496b5f6a 100644 --- a/pywb/utils.py +++ b/pywb/utils.py @@ -203,6 +203,10 @@ is_in_nose = sys.argv[0].endswith('nosetests') def enable_doctests(): return is_in_nose +def test_data_dir(): + import os + return os.path.dirname(os.path.realpath(__file__)) + '/../sample_archive/' + #============================================ if __name__ == "__main__" or enable_doctests(): diff --git a/pywb/views.py b/pywb/views.py index da7c7920..380e427f 100644 --- a/pywb/views.py +++ b/pywb/views.py @@ -4,6 +4,7 @@ import wbrequestresponse import wbexceptions import time +from os import path from itertools import imap from jinja2 import Environment, FileSystemLoader @@ -16,7 +17,9 @@ class TextQueryView: #================================================================= class J2QueryView: - def __init__(self, template_dir, template_file, buffer_index = True): + def __init__(self, filename, buffer_index = True): + template_dir, template_file = path.split(filename) + self.template_file = template_file self.buffer_index = buffer_index @@ -41,7 +44,8 @@ class J2QueryView: # Render the head insert (eg. banner) #================================================================= class J2HeadInsertView: - def __init__(self, template_dir, template_file, buffer_index = True): + def __init__(self, filename, buffer_index = True): + template_dir, template_file = path.split(filename) self.template_file = template_file self.jinja_env = make_jinja_env(template_dir) diff --git a/pywb/wbapp.py b/pywb/wbapp.py index 77b8d34f..89d2bfef 100644 --- a/pywb/wbapp.py +++ b/pywb/wbapp.py @@ -77,6 +77,8 @@ def handle_exception(env, exc): #================================================================= +DEFAULT_CONFIG_FILE = 'config.yaml' + def main(): try: # Attempt to load real settings from globalwb module @@ -91,7 +93,9 @@ def main(): module = importlib.import_module(config_name) - app = create_wb_app(module.pywb_config()) + config_file = DEFAULT_CONFIG_FILE + + app = create_wb_app(module.pywb_config(config_file)) logging.info('') logging.info('*** pywb inited with settings from {0}.pywb_config()!\n'.format(config_name)) return app @@ -105,6 +109,6 @@ def main(): if __name__ == "__main__" or utils.enable_doctests(): import pywb_init # Test sample settings - application = create_wb_app(pywb_init.pywb_config()) + application = create_wb_app(pywb_init.pywb_config('../' + DEFAULT_CONFIG_FILE)) else: application = main() diff --git a/test/dupes.cdx b/sample_archive/cdx/dupes.cdx similarity index 100% rename from test/dupes.cdx rename to sample_archive/cdx/dupes.cdx diff --git a/test/example.cdx b/sample_archive/cdx/example.cdx similarity index 100% rename from test/example.cdx rename to sample_archive/cdx/example.cdx diff --git a/test/iana.cdx b/sample_archive/cdx/iana.cdx similarity index 100% rename from test/iana.cdx rename to sample_archive/cdx/iana.cdx diff --git a/test/dupes.warc.gz b/sample_archive/warcs/dupes.warc.gz similarity index 100% rename from test/dupes.warc.gz rename to sample_archive/warcs/dupes.warc.gz diff --git a/test/example.warc.gz b/sample_archive/warcs/example.warc.gz similarity index 100% rename from test/example.warc.gz rename to sample_archive/warcs/example.warc.gz diff --git a/test/iana.warc.gz b/sample_archive/warcs/iana.warc.gz similarity index 100% rename from test/iana.warc.gz rename to sample_archive/warcs/iana.warc.gz