1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

simplify pywb_init config:

- add defaults dictionary, chain dictionaries rather than copying
 - allow custom classes to be loaded explicitly via yaml
 - for LineReader, assume ungzipped if first decompress fails
 - properly ignore bad local paths
 - add optional reporter object
This commit is contained in:
Ilya Kreymer 2014-02-11 14:10:40 -08:00
parent 8b2bfa570c
commit e4f409b2a4
9 changed files with 117 additions and 94 deletions

View File

@ -190,9 +190,8 @@ class ArchiveLoader:
def load(self, url, offset, length):
url_parts = urlparse.urlsplit(url)
try:
loader = self.loaders.get(url_parts.scheme)
except Exception:
loader = self.loaders.get(url_parts.scheme)
if not loader:
raise wbexceptions.UnknownLoaderProtocolException(url)
the_format = None
@ -319,11 +318,18 @@ class LineReader:
self._process_read(data)
def _process_read(self, data):
self.num_read += len(data)
if self.decomp and data:
data = self.decomp.decompress(data)
try:
data = self.decomp.decompress(data)
except Exception:
# if first read attempt, assume non-gzipped stream
if self.num_read == 0:
self.decomp = False
# otherwise (partly decompressed), something is wrong
else:
raise
self.num_read += len(data)
self.buff = StringIO.StringIO(data)

View File

@ -5,7 +5,8 @@ import indexreader
import replay_views
import replay_resolvers
import logging
import hmac
import time
#=================================================================
# Config Loading
@ -17,25 +18,55 @@ def load_template_file(file, desc = None, view_class = views.J2TemplateView):
return file
#=================================================================
# Cookie Signing
#=================================================================
class HMACCookieMaker:
    """Callable that builds signed, expiring cookie strings.

    The cookie value carries an expiry timestamp and an HMAC hex digest
    computed over that timestamp (optionally prefixed by ``extra_id``).

    key  -- secret used to sign the cookie
    name -- cookie name placed on the left of the '='
    """

    def __init__(self, key, name):
        self.key = key
        self.name = name

    def __call__(self, duration, extra_id = ''):
        """Return a cookie string expiring ``duration`` seconds from now.

        Result is ``name=expire-digest`` or, when ``extra_id`` is
        given, ``name-extra_id=expire-digest``.
        """
        # local import: hashlib is not among this module's imports
        import hashlib

        # int() auto-promotes on Python 2 and, unlike long(), also
        # exists on Python 3; the string form is identical
        expire = str(int(time.time() + duration))

        msg = extra_id + '-' + expire if extra_id else expire

        # MD5 was the implicit default of hmac.new() on Python 2; it is
        # spelled out because newer Pythons require an explicit digestmod.
        # NOTE(review): kept as MD5 so previously issued cookies still
        # verify -- consider moving signer and verifier to SHA-256 together.
        hexdigest = hmac.new(self._as_bytes(self.key),
                             self._as_bytes(msg),
                             hashlib.md5).hexdigest()

        if extra_id:
            return '{0}-{1}={2}-{3}'.format(self.name, extra_id, expire, hexdigest)

        return '{0}={1}-{2}'.format(self.name, expire, hexdigest)

    @staticmethod
    def _as_bytes(value):
        # hmac needs byte strings; encode text, pass bytes through untouched
        if isinstance(value, bytes):
            return value
        return value.encode('utf-8')
#=================================================================
def create_wb_handler(**config):
def create_wb_handler(cdx_source, config):
replayer = replay_views.RewritingReplayView(
resolvers = replay_resolvers.make_best_resolvers(config.get('archive_paths')),
loader = archiveloader.ArchiveLoader(hmac = config.get('hmac', None)),
loader = archiveloader.ArchiveLoader(hmac = config.get('hmac')),
head_insert_view = load_template_file(config.get('head_html'), 'Head Insert'),
head_insert_view = load_template_file(config.get('head_insert_html'), 'Head Insert'),
buffer_response = config.get('buffer_response', True),
redir_to_exact = config.get('redir_to_exact', True),
reporter = config.get('reporter')
)
wb_handler = handlers.WBHandler(
config['cdx_source'],
cdx_source,
replayer,
@ -46,18 +77,3 @@ def create_wb_handler(**config):
return wb_handler
#=================================================================
def load_class(name):
    """Import and return the attribute named by a dotted path.

    ``name`` must have the form ``package.module.Attr``: everything
    before the last dot is imported as a module and the final component
    is looked up on it.

    Raises ValueError if ``name`` has no module part, ImportError if the
    module cannot be imported and AttributeError if the module lacks the
    requested attribute.
    """
    modname, sep, klass = name.rpartition('.')

    # a bare name gives nothing to import; fail with a clear message
    # instead of tripping over an unbound local
    if not sep or not modname:
        raise ValueError('Expected a dotted "module.Class" path, got: ' + repr(name))

    # a non-empty fromlist makes __import__ return the leaf module
    # rather than the top-level package
    mod = __import__(modname, fromlist=[klass])
    return getattr(mod, klass)

View File

@ -45,7 +45,7 @@ class IndexReader:
raise NotImplementedError('Override in subclasses')
@staticmethod
def make_best_cdx_source(paths, **config):
def make_best_cdx_source(paths, config):
# may be a string or list
surt_ordered = config.get('surt_ordered', True)

View File

@ -8,24 +8,45 @@ import logging
import proxy
#=================================================================
DEFAULT_HEAD_INSERT = 'ui/head_insert.html'
DEFAULT_QUERY = 'ui/query.html'
DEFAULT_SEARCH = 'ui/search.html'
DEFAULT_INDEX = 'ui/index.html'
DEFAULT_ERROR = 'ui/error.html'
# Fallback settings: pywb_config_manual() chains the passed-in config in
# front of this dict, so a key is read from here only when the caller's
# config does not supply it.
DEFAULTS = {
'hostpaths': ['http://localhost:8080'],
# collection name -> cdx index path (or a dict of per-collection settings)
'collections': {'pywb': './sample_archive/cdx/'},
# handed to make_best_resolvers() to locate archive files
'archive_paths': './sample_archive/warcs/',
# UI template files
'head_insert_html': 'ui/head_insert.html',
'query_html': 'ui/query.html',
'search_html': 'ui/search.html',
'home_html': 'ui/index.html',
'error_html': 'ui/error.html',
# route name -> path served by a StaticHandler
'static_routes': {'static/default': 'static/'},
}
class DictChain:
    """Read-only lookup across several dict-like objects, in order.

    Each member only needs a ``get(key)`` method, so a DictChain may
    itself be a member of another DictChain.
    """

    def __init__(self, *dicts):
        self.dicts = dicts

    def get(self, key, default_val=None):
        """Return the first non-None value stored under ``key``.

        Members are searched in the order given to the constructor.
        Explicit falsy values (False, 0, '', empty containers) count as
        set and are returned; only a missing key or a None value falls
        through to the next member. ``default_val`` is returned when no
        member provides a value.
        """
        for d in self.dicts:
            val = d.get(key)
            # 'is not None' rather than truthiness, so that an explicit
            # False/0 override is not replaced by a later default
            if val is not None:
                return val

        return default_val
#=================================================================
## Reference non-YAML config
#=================================================================
def pywb_config_manual(config = {}):
def pywb_config_manual(passed_config = {}):
config = DictChain(passed_config, DEFAULTS)
routes = []
hostpaths = config.get('hostpaths', ['http://localhost:8080'])
hostpaths = config.get('hostpaths')
# collections based on cdx source
collections = config.get('collections', {'pywb': './sample_archive/cdx/'})
collections = config.get('collections')
for name, value in collections.iteritems():
route_config = config
@ -33,28 +54,21 @@ def pywb_config_manual(config = {}):
if isinstance(value, dict):
# if a dict, extend with base properties
index_paths = value['index_paths']
value.update(route_config)
route_config = value
route_config = DictChain(value, config)
else:
index_paths = str(value)
cdx_source = indexreader.IndexReader.make_best_cdx_source(index_paths, **config)
cdx_source = indexreader.IndexReader.make_best_cdx_source(index_paths, route_config)
wb_handler = config_utils.create_wb_handler(
cdx_source = cdx_source,
archive_paths = route_config.get('archive_paths', './sample_archive/warcs/'),
head_html = route_config.get('head_insert_html', DEFAULT_HEAD_INSERT),
query_html = route_config.get('query_html', DEFAULT_QUERY),
search_html = route_config.get('search_html', DEFAULT_SEARCH),
config = route_config,
)
logging.info('Adding Collection: ' + name)
route_class = route_config.get('route_class', None)
if route_class:
route_class = config_utils.load_class(route_class)
else:
route_class = archivalrouter.Route
route_class = route_config.get('route_class', archivalrouter.Route)
routes.append(route_class(name, wb_handler, config = route_config))
@ -70,7 +84,7 @@ def pywb_config_manual(config = {}):
routes.append(archivalrouter.Route('echo_req', handlers.DebugEchoHandler()))
static_routes = config.get('static_routes', {'static/default': 'static/'})
static_routes = config.get('static_routes')
for static_name, static_path in static_routes.iteritems():
routes.append(archivalrouter.Route(static_name, handlers.StaticHandler(static_path)))
@ -91,8 +105,8 @@ def pywb_config_manual(config = {}):
abs_path = config.get('absolute_paths', True),
home_view = config_utils.load_template_file(config.get('home_html', DEFAULT_INDEX), 'Home Page'),
error_view = config_utils.load_template_file(config.get('error_html', DEFAULT_ERROR), 'Error Page')
home_view = config_utils.load_template_file(config.get('home_html'), 'Home Page'),
error_view = config_utils.load_template_file(config.get('error_html'), 'Error Page')
)

View File

@ -72,7 +72,7 @@ def make_best_resolver(param):
PrefixResolver('http://myhost.example.com/warcs/')
# http path w/ contains param
>>> make_best_resolver(('http://myhost.example.com/warcs/', '/'))
>>> make_best_resolver(['http://myhost.example.com/warcs/', '/'])
PrefixResolver('http://myhost.example.com/warcs/', contains = '/')
# redis path
@ -89,7 +89,7 @@ def make_best_resolver(param):
"""
if isinstance(param, tuple):
if isinstance(param, list):
path = param[0]
arg = param[1]
else:
@ -116,12 +116,15 @@ def make_best_resolver(param):
#=================================================================
def make_best_resolvers(*paths):
def make_best_resolvers(paths):
"""
>>> make_best_resolvers('http://myhost.example.com/warcs/', 'redis://myhost.example.com:1234/1')
>>> make_best_resolvers(['http://myhost.example.com/warcs/', 'redis://myhost.example.com:1234/1'])
[PrefixResolver('http://myhost.example.com/warcs/'), RedisResolver('redis://myhost.example.com:1234/1')]
"""
return map(make_best_resolver, paths)
if hasattr(paths, '__iter__'):
return map(make_best_resolver, paths)
else:
return [make_best_resolver(paths)]
import utils

View File

@ -18,9 +18,10 @@ import wbexceptions
#=================================================================
class ReplayView:
def __init__(self, resolvers, loader = None):
def __init__(self, resolvers, loader = None, reporter = None):
self.resolvers = resolvers
self.loader = loader if loader else archiveloader.ArchiveLoader()
self._reporter = reporter
def __call__(self, wbrequest, cdx_lines, cdx_reader):
@ -41,7 +42,13 @@ class ReplayView:
(cdx, status_headers, stream) = self.resolve_headers_and_payload(cdx, wbrequest, cdx_reader, failed_files)
return self.make_response(wbrequest, cdx, status_headers, stream)
response = self.make_response(wbrequest, cdx, status_headers, stream)
# notify reporter callback, if any
if self._reporter:
self._reporter(wbrequest, cdx, response)
return response
except wbexceptions.CaptureException as ce:
@ -83,7 +90,7 @@ class ReplayView:
try:
return self.loader.load(path, offset, length)
except URLError as ue:
except Exception as ue:
last_exc = ue
print last_exc
pass
@ -231,8 +238,8 @@ class ReplayView:
#=================================================================
class RewritingReplayView(ReplayView):
def __init__(self, resolvers, loader = None, head_insert_view = None, header_rewriter = None, redir_to_exact = True, buffer_response = False):
ReplayView.__init__(self, resolvers, loader)
def __init__(self, resolvers, loader = None, head_insert_view = None, header_rewriter = None, redir_to_exact = True, buffer_response = False, reporter = None):
ReplayView.__init__(self, resolvers, loader, reporter)
self.head_insert_view = head_insert_view
self.header_rewriter = header_rewriter if header_rewriter else HeaderRewriter()
self.redir_to_exact = redir_to_exact
@ -241,7 +248,6 @@ class RewritingReplayView(ReplayView):
self.buffer_response = buffer_response
def _text_content_type(self, content_type):
for ctype, mimelist in self.REWRITE_TYPES.iteritems():
if any ((mime in content_type) for mime in mimelist):
@ -411,4 +417,3 @@ class RewritingReplayView(ReplayView):
if (UrlRewriter.strip_protocol(request_url) == UrlRewriter.strip_protocol(location_url)):
raise wbexceptions.CaptureException('Self Redirect: ' + str(cdx))

View File

@ -1,5 +1,4 @@
import itertools
import hmac
import time
import zlib
import time
@ -26,36 +25,6 @@ def split_prefix(key, prefixs):
def create_decompressor():
    """Return a fresh zlib decompression object for gzip-framed data."""
    # adding 16 to the window bits tells zlib to expect a gzip header/trailer
    gzip_wbits = zlib.MAX_WBITS + 16
    return zlib.decompressobj(gzip_wbits)
#=================================================================
# Cookie Signing
#=================================================================
class HMACCookieMaker:
    """Callable that builds signed, expiring cookie strings.

    The cookie value carries an expiry timestamp and an HMAC hex digest
    computed over that timestamp (optionally prefixed by ``extra_id``).

    key  -- secret used to sign the cookie
    name -- cookie name placed on the left of the '='
    """

    def __init__(self, key, name):
        self.key = key
        self.name = name

    def __call__(self, duration, extra_id = ''):
        """Return a cookie string expiring ``duration`` seconds from now.

        Result is ``name=expire-digest`` or, when ``extra_id`` is
        given, ``name-extra_id=expire-digest``.
        """
        # local import: hashlib is not among this module's imports
        import hashlib

        # int() auto-promotes on Python 2 and, unlike long(), also
        # exists on Python 3; the string form is identical
        expire = str(int(time.time() + duration))

        msg = extra_id + '-' + expire if extra_id else expire

        # MD5 was the implicit default of hmac.new() on Python 2; it is
        # spelled out because newer Pythons require an explicit digestmod.
        # NOTE(review): kept as MD5 so previously issued cookies still
        # verify -- consider moving signer and verifier to SHA-256 together.
        hexdigest = hmac.new(self._as_bytes(self.key),
                             self._as_bytes(msg),
                             hashlib.md5).hexdigest()

        if extra_id:
            return '{0}-{1}={2}-{3}'.format(self.name, extra_id, expire, hexdigest)

        return '{0}={1}-{2}'.format(self.name, expire, hexdigest)

    @staticmethod
    def _as_bytes(value):
        # hmac needs byte strings; encode text, pass bytes through untouched
        if isinstance(value, bytes):
            return value
        return value.encode('utf-8')
#=================================================================
# Adapted from example at

View File

@ -161,3 +161,9 @@ class TestWb:
resp = self.testapp.get('/pywb/?abc', status = 400)
assert resp.status_int == 400
assert 'Bad Request Url: http://?abc' in resp.body
# Reporter callback for replay view
def print_reporter(wbrequest, cdx, response):
    """Sample reporter callback: print the request and the cdx entry.

    ``response`` is accepted to match the reporter call signature but
    is not used.
    """
    # single-argument print(...) behaves the same as a Python 2 print
    # statement and also runs on Python 3
    print(wbrequest)
    print(cdx)

View File

@ -33,7 +33,7 @@ surt_ordered: true
# * http:// path, use path as remote prefix
# * redis:// path, use redis to lookup full path for w:<warc> as key
archive_paths: ./sample_archive/warcs/
archive_paths: ['./invalid/path/to/ignore/', './sample_archive/warcs/']
# ==== Optional UI: HTML/Jinja2 Templates ====
@ -89,3 +89,7 @@ enable_http_proxy: true
# enable cdx server api for querying cdx directly (experimental)
enable_cdx_api: true
# optional reporter callback func
# if set, called with the request, cdx object and response
reporter_func: pywb.run-tests.print_reporter