From e4f409b2a446eb6f653e9d9301fbbb7b270b173e Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 11 Feb 2014 14:10:40 -0800 Subject: [PATCH] simplify pywb_init config: - add defaults dictionary, chain dictionaries rather than copying - allow custom classes to be loaded explicitly via yaml - for LineReader, assume ungzipped if first decompress fails - properly ignore bad local paths - add optional reporter object --- pywb/archiveloader.py | 18 ++++++++---- pywb/config_utils.py | 56 +++++++++++++++++++++++-------------- pywb/indexreader.py | 2 +- pywb/pywb_init.py | 60 +++++++++++++++++++++++++--------------- pywb/replay_resolvers.py | 13 +++++---- pywb/replay_views.py | 19 ++++++++----- pywb/utils.py | 31 --------------------- run-tests.py | 6 ++++ test_config.yaml | 6 +++- 9 files changed, 117 insertions(+), 94 deletions(-) diff --git a/pywb/archiveloader.py b/pywb/archiveloader.py index c1193dce..86e35149 100644 --- a/pywb/archiveloader.py +++ b/pywb/archiveloader.py @@ -190,9 +190,8 @@ class ArchiveLoader: def load(self, url, offset, length): url_parts = urlparse.urlsplit(url) - try: - loader = self.loaders.get(url_parts.scheme) - except Exception: + loader = self.loaders.get(url_parts.scheme) + if not loader: raise wbexceptions.UnknownLoaderProtocolException(url) the_format = None @@ -319,11 +318,18 @@ class LineReader: self._process_read(data) def _process_read(self, data): - self.num_read += len(data) - if self.decomp and data: - data = self.decomp.decompress(data) + try: + data = self.decomp.decompress(data) + except Exception: + # if first read attempt, assume non-gzipped stream + if self.num_read == 0: + self.decomp = False + # otherwise (partly decompressed), something is wrong + else: + raise + self.num_read += len(data) self.buff = StringIO.StringIO(data) diff --git a/pywb/config_utils.py b/pywb/config_utils.py index c3ebcf84..999eba75 100644 --- a/pywb/config_utils.py +++ b/pywb/config_utils.py @@ -5,7 +5,8 @@ import indexreader import replay_views 
import replay_resolvers import logging - +import hmac +import time #================================================================= # Config Loading @@ -17,25 +18,55 @@ def load_template_file(file, desc = None, view_class = views.J2TemplateView): return file +#================================================================= +# Cookie Signing +#================================================================= + +class HMACCookieMaker: + def __init__(self, key, name): + self.key = key + self.name = name + + def __call__(self, duration, extra_id = ''): + expire = str(long(time.time() + duration)) + + if extra_id: + msg = extra_id + '-' + expire + else: + msg = expire + + hmacdigest = hmac.new(self.key, msg) + hexdigest = hmacdigest.hexdigest() + + if extra_id: + cookie = '{0}-{1}={2}-{3}'.format(self.name, extra_id, expire, hexdigest) + else: + cookie = '{0}={1}-{2}'.format(self.name, expire, hexdigest) + + return cookie + #================================================================= -def create_wb_handler(**config): +def create_wb_handler(cdx_source, config): + replayer = replay_views.RewritingReplayView( resolvers = replay_resolvers.make_best_resolvers(config.get('archive_paths')), - loader = archiveloader.ArchiveLoader(hmac = config.get('hmac', None)), + loader = archiveloader.ArchiveLoader(hmac = config.get('hmac')), - head_insert_view = load_template_file(config.get('head_html'), 'Head Insert'), + head_insert_view = load_template_file(config.get('head_insert_html'), 'Head Insert'), buffer_response = config.get('buffer_response', True), redir_to_exact = config.get('redir_to_exact', True), + + reporter = config.get('reporter') ) wb_handler = handlers.WBHandler( - config['cdx_source'], + cdx_source, replayer, @@ -46,18 +77,3 @@ def create_wb_handler(**config): return wb_handler - -#================================================================= -def load_class(name): - result = name.rsplit('.', 1) - - if len(result) == 1: - modname == '' - klass = 
result[0] - else: - modname = result[0] - klass = result[1] - - mod = __import__(modname, fromlist=[klass]) - return getattr(mod, klass) - diff --git a/pywb/indexreader.py b/pywb/indexreader.py index 2f404c9e..580e5705 100644 --- a/pywb/indexreader.py +++ b/pywb/indexreader.py @@ -45,7 +45,7 @@ class IndexReader: raise NotImplementedError('Override in subclasses') @staticmethod - def make_best_cdx_source(paths, **config): + def make_best_cdx_source(paths, config): # may be a string or list surt_ordered = config.get('surt_ordered', True) diff --git a/pywb/pywb_init.py b/pywb/pywb_init.py index 953a9a38..a781d601 100644 --- a/pywb/pywb_init.py +++ b/pywb/pywb_init.py @@ -8,24 +8,45 @@ import logging import proxy #================================================================= -DEFAULT_HEAD_INSERT = 'ui/head_insert.html' -DEFAULT_QUERY = 'ui/query.html' -DEFAULT_SEARCH = 'ui/search.html' -DEFAULT_INDEX = 'ui/index.html' -DEFAULT_ERROR = 'ui/error.html' +DEFAULTS = { + 'hostpaths': ['http://localhost:8080'], + 'collections': {'pywb': './sample_archive/cdx/'}, + 'archive_paths': './sample_archive/warcs/', + + 'head_insert_html': 'ui/head_insert.html', + 'query_html': 'ui/query.html', + 'search_html': 'ui/search.html', + 'home_html': 'ui/index.html', + 'error_html': 'ui/error.html', + + 'static_routes': {'static/default': 'static/'}, +} + +class DictChain: + def __init__(self, *dicts): + self.dicts = dicts + + def get(self, key, default_val=None): + for d in self.dicts: + val = d.get(key) + if val: + return val + return default_val #================================================================= ## Reference non-YAML config #================================================================= -def pywb_config_manual(config = {}): +def pywb_config_manual(passed_config = {}): + + config = DictChain(passed_config, DEFAULTS) routes = [] - hostpaths = config.get('hostpaths', ['http://localhost:8080']) + hostpaths = config.get('hostpaths') # collections based on cdx source - 
collections = config.get('collections', {'pywb': './sample_archive/cdx/'}) + collections = config.get('collections') for name, value in collections.iteritems(): route_config = config @@ -33,28 +54,21 @@ def pywb_config_manual(config = {}): if isinstance(value, dict): # if a dict, extend with base properies index_paths = value['index_paths'] - value.update(route_config) - route_config = value + route_config = DictChain(value, config) else: index_paths = str(value) - cdx_source = indexreader.IndexReader.make_best_cdx_source(index_paths, **config) + cdx_source = indexreader.IndexReader.make_best_cdx_source(index_paths, route_config) + wb_handler = config_utils.create_wb_handler( cdx_source = cdx_source, - archive_paths = route_config.get('archive_paths', './sample_archive/warcs/'), - head_html = route_config.get('head_insert_html', DEFAULT_HEAD_INSERT), - query_html = route_config.get('query_html', DEFAULT_QUERY), - search_html = route_config.get('search_html', DEFAULT_SEARCH), + config = route_config, ) logging.info('Adding Collection: ' + name) - route_class = route_config.get('route_class', None) - if route_class: - route_class = config_utils.load_class(route_class) - else: - route_class = archivalrouter.Route + route_class = route_config.get('route_class', archivalrouter.Route) routes.append(route_class(name, wb_handler, config = route_config)) @@ -70,7 +84,7 @@ def pywb_config_manual(config = {}): routes.append(archivalrouter.Route('echo_req', handlers.DebugEchoHandler())) - static_routes = config.get('static_routes', {'static/default': 'static/'}) + static_routes = config.get('static_routes') for static_name, static_path in static_routes.iteritems(): routes.append(archivalrouter.Route(static_name, handlers.StaticHandler(static_path))) @@ -91,8 +105,8 @@ def pywb_config_manual(config = {}): abs_path = config.get('absolute_paths', True), - home_view = config_utils.load_template_file(config.get('home_html', DEFAULT_INDEX), 'Home Page'), - error_view = 
config_utils.load_template_file(config.get('error_html', DEFAULT_ERROR), 'Error Page') + home_view = config_utils.load_template_file(config.get('home_html'), 'Home Page'), + error_view = config_utils.load_template_file(config.get('error_html'), 'Error Page') ) diff --git a/pywb/replay_resolvers.py b/pywb/replay_resolvers.py index cd1958dc..306675f8 100644 --- a/pywb/replay_resolvers.py +++ b/pywb/replay_resolvers.py @@ -72,7 +72,7 @@ def make_best_resolver(param): PrefixResolver('http://myhost.example.com/warcs/') # http path w/ contains param - >>> make_best_resolver(('http://myhost.example.com/warcs/', '/')) + >>> make_best_resolver(['http://myhost.example.com/warcs/', '/']) PrefixResolver('http://myhost.example.com/warcs/', contains = '/') # redis path @@ -89,7 +89,7 @@ def make_best_resolver(param): """ - if isinstance(param, tuple): + if isinstance(param, list): path = param[0] arg = param[1] else: @@ -116,12 +116,15 @@ def make_best_resolver(param): #================================================================= -def make_best_resolvers(*paths): +def make_best_resolvers(paths): """ - >>> make_best_resolvers('http://myhost.example.com/warcs/', 'redis://myhost.example.com:1234/1') + >>> make_best_resolvers(['http://myhost.example.com/warcs/', 'redis://myhost.example.com:1234/1']) [PrefixResolver('http://myhost.example.com/warcs/'), RedisResolver('redis://myhost.example.com:1234/1')] """ - return map(make_best_resolver, paths) + if hasattr(paths, '__iter__'): + return map(make_best_resolver, paths) + else: + return [make_best_resolver(paths)] import utils diff --git a/pywb/replay_views.py b/pywb/replay_views.py index 0f076e02..78c097b2 100644 --- a/pywb/replay_views.py +++ b/pywb/replay_views.py @@ -18,9 +18,10 @@ import wbexceptions #================================================================= class ReplayView: - def __init__(self, resolvers, loader = None): + def __init__(self, resolvers, loader = None, reporter = None): self.resolvers = resolvers 
self.loader = loader if loader else archiveloader.ArchiveLoader() + self._reporter = reporter def __call__(self, wbrequest, cdx_lines, cdx_reader): @@ -41,7 +42,13 @@ class ReplayView: (cdx, status_headers, stream) = self.resolve_headers_and_payload(cdx, wbrequest, cdx_reader, failed_files) - return self.make_response(wbrequest, cdx, status_headers, stream) + response = self.make_response(wbrequest, cdx, status_headers, stream) + + # notify reporter callback, if any + if self._reporter: + self._reporter(wbrequest, cdx, response) + + return response except wbexceptions.CaptureException as ce: @@ -83,7 +90,7 @@ class ReplayView: try: return self.loader.load(path, offset, length) - except URLError as ue: + except Exception as ue: last_exc = ue print last_exc pass @@ -231,8 +238,8 @@ class ReplayView: #================================================================= class RewritingReplayView(ReplayView): - def __init__(self, resolvers, loader = None, head_insert_view = None, header_rewriter = None, redir_to_exact = True, buffer_response = False): - ReplayView.__init__(self, resolvers, loader) + def __init__(self, resolvers, loader = None, head_insert_view = None, header_rewriter = None, redir_to_exact = True, buffer_response = False, reporter = None): + ReplayView.__init__(self, resolvers, loader, reporter) self.head_insert_view = head_insert_view self.header_rewriter = header_rewriter if header_rewriter else HeaderRewriter() self.redir_to_exact = redir_to_exact @@ -241,7 +248,6 @@ class RewritingReplayView(ReplayView): self.buffer_response = buffer_response - def _text_content_type(self, content_type): for ctype, mimelist in self.REWRITE_TYPES.iteritems(): if any ((mime in content_type) for mime in mimelist): @@ -411,4 +417,3 @@ class RewritingReplayView(ReplayView): if (UrlRewriter.strip_protocol(request_url) == UrlRewriter.strip_protocol(location_url)): raise wbexceptions.CaptureException('Self Redirect: ' + str(cdx)) - diff --git a/pywb/utils.py b/pywb/utils.py 
index 9041db09..934dd818 100644 --- a/pywb/utils.py +++ b/pywb/utils.py @@ -1,5 +1,4 @@ import itertools -import hmac import time import zlib import time @@ -26,36 +25,6 @@ def split_prefix(key, prefixs): def create_decompressor(): return zlib.decompressobj(16 + zlib.MAX_WBITS) -#================================================================= -# Cookie Signing -#================================================================= - -class HMACCookieMaker: - def __init__(self, key, name): - self.key = key - self.name = name - - - def __call__(self, duration, extra_id = ''): - expire = str(long(time.time() + duration)) - - if extra_id: - msg = extra_id + '-' + expire - else: - msg = expire - - hmacdigest = hmac.new(self.key, msg) - hexdigest = hmacdigest.hexdigest() - - if extra_id: - cookie = '{0}-{1}={2}-{3}'.format(self.name, extra_id, expire, hexdigest) - else: - cookie = '{0}={1}-{2}'.format(self.name, expire, hexdigest) - - return cookie - - #return cookie + hexdigest - #================================================================= # Adapted from example at diff --git a/run-tests.py b/run-tests.py index 014eb753..28282013 100644 --- a/run-tests.py +++ b/run-tests.py @@ -161,3 +161,9 @@ class TestWb: resp = self.testapp.get('/pywb/?abc', status = 400) assert resp.status_int == 400 assert 'Bad Request Url: http://?abc' in resp.body + +# Reporter callback for replay view +def print_reporter(wbrequest, cdx, response): + print wbrequest + print cdx + pass diff --git a/test_config.yaml b/test_config.yaml index 7460c825..38a15f37 100644 --- a/test_config.yaml +++ b/test_config.yaml @@ -33,7 +33,7 @@ surt_ordered: true # * http:// path, use path as remote prefix # * redis:// path, use redis to lookup full path for w: as key -archive_paths: ./sample_archive/warcs/ +archive_paths: ['./invalid/path/to/ignore/', './sample_archive/warcs/'] # ==== Optional UI: HTML/Jinja2 Templates ==== @@ -89,3 +89,7 @@ enable_http_proxy: true # enable cdx server api for querying cdx 
directly (experimental) enable_cdx_api: true + +# optional reporter callback func +# if set, called with request, cdx and response objects +reporter_func: pywb.run-tests.print_reporter