mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
simplify pywb_init config:
- add defaults dictionary, chain dictionaries rather than copying - allow custom classes to be loaded explicitly via yaml - for LineReader, assume ungzipped if first decompress fails - properly ignore bad local paths - add optional reporter object
This commit is contained in:
parent
8b2bfa570c
commit
e4f409b2a4
@ -190,9 +190,8 @@ class ArchiveLoader:
|
||||
def load(self, url, offset, length):
|
||||
url_parts = urlparse.urlsplit(url)
|
||||
|
||||
try:
|
||||
loader = self.loaders.get(url_parts.scheme)
|
||||
except Exception:
|
||||
if not loader:
|
||||
raise wbexceptions.UnknownLoaderProtocolException(url)
|
||||
|
||||
the_format = None
|
||||
@ -319,11 +318,18 @@ class LineReader:
|
||||
self._process_read(data)
|
||||
|
||||
def _process_read(self, data):
|
||||
self.num_read += len(data)
|
||||
|
||||
if self.decomp and data:
|
||||
try:
|
||||
data = self.decomp.decompress(data)
|
||||
except Exception:
|
||||
# if first read attempt, assume non-gzipped stream
|
||||
if self.num_read == 0:
|
||||
self.decomp = False
|
||||
# otherwise (partly decompressed), something is wrong
|
||||
else:
|
||||
raise
|
||||
|
||||
self.num_read += len(data)
|
||||
self.buff = StringIO.StringIO(data)
|
||||
|
||||
|
||||
|
@ -5,7 +5,8 @@ import indexreader
|
||||
import replay_views
|
||||
import replay_resolvers
|
||||
import logging
|
||||
|
||||
import hmac
|
||||
import time
|
||||
|
||||
#=================================================================
|
||||
# Config Loading
|
||||
@ -17,25 +18,55 @@ def load_template_file(file, desc = None, view_class = views.J2TemplateView):
|
||||
|
||||
return file
|
||||
|
||||
#=================================================================
|
||||
# Cookie Signing
|
||||
#=================================================================
|
||||
|
||||
class HMACCookieMaker:
    """Callable that builds HMAC-signed, expiring cookie strings.

    key  -- secret key handed to hmac.new() for signing
    name -- cookie name used as the prefix of the produced string
    """

    def __init__(self, key, name):
        self.key = key
        self.name = name

    def __call__(self, duration, extra_id = ''):
        """Return a signed cookie string expiring ``duration`` after now.

        duration -- offset added to time.time() to get the expiry timestamp
        extra_id -- optional id; when non-empty it is included both in the
                    signed message and in the cookie name

        Result is ``name=expire-hexdigest`` or, with an extra id,
        ``name-extra_id=expire-hexdigest``.
        """
        expire = str(long(time.time() + duration))

        # the signed message is the expiry, prefixed by the extra id if any
        msg = (extra_id + '-' + expire) if extra_id else expire

        # no digestmod is passed, so hmac's default digest is used
        hexdigest = hmac.new(self.key, msg).hexdigest()

        if not extra_id:
            return '{0}={1}-{2}'.format(self.name, expire, hexdigest)

        return '{0}-{1}={2}-{3}'.format(self.name, extra_id, expire, hexdigest)
|
||||
|
||||
|
||||
#=================================================================
|
||||
def create_wb_handler(**config):
|
||||
def create_wb_handler(cdx_source, config):
|
||||
|
||||
replayer = replay_views.RewritingReplayView(
|
||||
|
||||
resolvers = replay_resolvers.make_best_resolvers(config.get('archive_paths')),
|
||||
|
||||
loader = archiveloader.ArchiveLoader(hmac = config.get('hmac', None)),
|
||||
loader = archiveloader.ArchiveLoader(hmac = config.get('hmac')),
|
||||
|
||||
head_insert_view = load_template_file(config.get('head_html'), 'Head Insert'),
|
||||
head_insert_view = load_template_file(config.get('head_insert_html'), 'Head Insert'),
|
||||
|
||||
buffer_response = config.get('buffer_response', True),
|
||||
|
||||
redir_to_exact = config.get('redir_to_exact', True),
|
||||
|
||||
reporter = config.get('reporter')
|
||||
)
|
||||
|
||||
|
||||
wb_handler = handlers.WBHandler(
|
||||
config['cdx_source'],
|
||||
cdx_source,
|
||||
|
||||
replayer,
|
||||
|
||||
@ -46,18 +77,3 @@ def create_wb_handler(**config):
|
||||
|
||||
return wb_handler
|
||||
|
||||
|
||||
#=================================================================
|
||||
def load_class(name):
    """Import and return the attribute named by a dotted path.

    name -- fully qualified name such as 'package.module.ClassName';
            everything before the last dot is imported as a module and
            the final component is looked up on that module.

    Raises ValueError if ``name`` has no module part (there is nothing
    to import). ImportError from the import and AttributeError from the
    attribute lookup propagate unchanged.
    """
    modname, _, klass = name.rpartition('.')

    # A bare 'ClassName' previously hit an unbound local (a stray '=='
    # stood where an assignment was intended); fail with a clear message.
    if not modname:
        raise ValueError('Expected a fully qualified "module.Class" name, got: ' + repr(name))

    # a non-empty fromlist makes __import__ return the leaf module
    # rather than the top-level package
    mod = __import__(modname, fromlist=[klass])
    return getattr(mod, klass)
|
||||
|
||||
|
@ -45,7 +45,7 @@ class IndexReader:
|
||||
raise NotImplementedError('Override in subclasses')
|
||||
|
||||
@staticmethod
|
||||
def make_best_cdx_source(paths, **config):
|
||||
def make_best_cdx_source(paths, config):
|
||||
# may be a string or list
|
||||
surt_ordered = config.get('surt_ordered', True)
|
||||
|
||||
|
@ -8,24 +8,45 @@ import logging
|
||||
import proxy
|
||||
|
||||
#=================================================================
|
||||
DEFAULT_HEAD_INSERT = 'ui/head_insert.html'
|
||||
DEFAULT_QUERY = 'ui/query.html'
|
||||
DEFAULT_SEARCH = 'ui/search.html'
|
||||
DEFAULT_INDEX = 'ui/index.html'
|
||||
DEFAULT_ERROR = 'ui/error.html'
|
||||
DEFAULTS = {
|
||||
'hostpaths': ['http://localhost:8080'],
|
||||
'collections': {'pywb': './sample_archive/cdx/'},
|
||||
'archive_paths': './sample_archive/warcs/',
|
||||
|
||||
'head_insert_html': 'ui/head_insert.html',
|
||||
'query_html': 'ui/query.html',
|
||||
'search_html': 'ui/search.html',
|
||||
'home_html': 'ui/index.html',
|
||||
'error_html': 'ui/error.html',
|
||||
|
||||
'static_routes': {'static/default': 'static/'},
|
||||
}
|
||||
|
||||
class DictChain:
    """Read-only lookup over several dict-like objects, searched in order.

    Each link only needs a ``get(key)`` method, so a DictChain can itself
    be used as a link in another chain.
    """

    def __init__(self, *dicts):
        # earlier dicts take precedence over later ones
        self.dicts = dicts

    def get(self, key, default_val=None):
        """Return the first non-None value stored under ``key``.

        Links are consulted in the order given to the constructor. A
        value of None is treated as "not set" and the search continues;
        any other value -- including False, 0 and '' -- is returned
        as-is, so an explicit ``False`` setting is not overridden by a
        later link or by ``default_val``.

        Returns ``default_val`` when no link provides a value.
        """
        for d in self.dicts:
            val = d.get(key)
            if val is not None:
                return val

        return default_val
|
||||
|
||||
|
||||
#=================================================================
|
||||
## Reference non-YAML config
|
||||
#=================================================================
|
||||
def pywb_config_manual(config = {}):
|
||||
def pywb_config_manual(passed_config = {}):
|
||||
|
||||
config = DictChain(passed_config, DEFAULTS)
|
||||
|
||||
routes = []
|
||||
|
||||
hostpaths = config.get('hostpaths', ['http://localhost:8080'])
|
||||
hostpaths = config.get('hostpaths')
|
||||
|
||||
# collections based on cdx source
|
||||
collections = config.get('collections', {'pywb': './sample_archive/cdx/'})
|
||||
collections = config.get('collections')
|
||||
|
||||
for name, value in collections.iteritems():
|
||||
route_config = config
|
||||
@ -33,28 +54,21 @@ def pywb_config_manual(config = {}):
|
||||
if isinstance(value, dict):
|
||||
# if a dict, extend with base properties
|
||||
index_paths = value['index_paths']
|
||||
value.update(route_config)
|
||||
route_config = value
|
||||
route_config = DictChain(value, config)
|
||||
else:
|
||||
index_paths = str(value)
|
||||
|
||||
cdx_source = indexreader.IndexReader.make_best_cdx_source(index_paths, **config)
|
||||
cdx_source = indexreader.IndexReader.make_best_cdx_source(index_paths, route_config)
|
||||
|
||||
|
||||
wb_handler = config_utils.create_wb_handler(
|
||||
cdx_source = cdx_source,
|
||||
archive_paths = route_config.get('archive_paths', './sample_archive/warcs/'),
|
||||
head_html = route_config.get('head_insert_html', DEFAULT_HEAD_INSERT),
|
||||
query_html = route_config.get('query_html', DEFAULT_QUERY),
|
||||
search_html = route_config.get('search_html', DEFAULT_SEARCH),
|
||||
config = route_config,
|
||||
)
|
||||
|
||||
logging.info('Adding Collection: ' + name)
|
||||
|
||||
route_class = route_config.get('route_class', None)
|
||||
if route_class:
|
||||
route_class = config_utils.load_class(route_class)
|
||||
else:
|
||||
route_class = archivalrouter.Route
|
||||
route_class = route_config.get('route_class', archivalrouter.Route)
|
||||
|
||||
routes.append(route_class(name, wb_handler, config = route_config))
|
||||
|
||||
@ -70,7 +84,7 @@ def pywb_config_manual(config = {}):
|
||||
routes.append(archivalrouter.Route('echo_req', handlers.DebugEchoHandler()))
|
||||
|
||||
|
||||
static_routes = config.get('static_routes', {'static/default': 'static/'})
|
||||
static_routes = config.get('static_routes')
|
||||
|
||||
for static_name, static_path in static_routes.iteritems():
|
||||
routes.append(archivalrouter.Route(static_name, handlers.StaticHandler(static_path)))
|
||||
@ -91,8 +105,8 @@ def pywb_config_manual(config = {}):
|
||||
|
||||
abs_path = config.get('absolute_paths', True),
|
||||
|
||||
home_view = config_utils.load_template_file(config.get('home_html', DEFAULT_INDEX), 'Home Page'),
|
||||
error_view = config_utils.load_template_file(config.get('error_html', DEFAULT_ERROR), 'Error Page')
|
||||
home_view = config_utils.load_template_file(config.get('home_html'), 'Home Page'),
|
||||
error_view = config_utils.load_template_file(config.get('error_html'), 'Error Page')
|
||||
)
|
||||
|
||||
|
||||
|
@ -72,7 +72,7 @@ def make_best_resolver(param):
|
||||
PrefixResolver('http://myhost.example.com/warcs/')
|
||||
|
||||
# http path w/ contains param
|
||||
>>> make_best_resolver(('http://myhost.example.com/warcs/', '/'))
|
||||
>>> make_best_resolver(['http://myhost.example.com/warcs/', '/'])
|
||||
PrefixResolver('http://myhost.example.com/warcs/', contains = '/')
|
||||
|
||||
# redis path
|
||||
@ -89,7 +89,7 @@ def make_best_resolver(param):
|
||||
|
||||
"""
|
||||
|
||||
if isinstance(param, tuple):
|
||||
if isinstance(param, list):
|
||||
path = param[0]
|
||||
arg = param[1]
|
||||
else:
|
||||
@ -116,12 +116,15 @@ def make_best_resolver(param):
|
||||
|
||||
|
||||
#=================================================================
|
||||
def make_best_resolvers(*paths):
|
||||
def make_best_resolvers(paths):
|
||||
"""
|
||||
>>> make_best_resolvers('http://myhost.example.com/warcs/', 'redis://myhost.example.com:1234/1')
|
||||
>>> make_best_resolvers(['http://myhost.example.com/warcs/', 'redis://myhost.example.com:1234/1'])
|
||||
[PrefixResolver('http://myhost.example.com/warcs/'), RedisResolver('redis://myhost.example.com:1234/1')]
|
||||
"""
|
||||
if hasattr(paths, '__iter__'):
|
||||
return map(make_best_resolver, paths)
|
||||
else:
|
||||
return [make_best_resolver(paths)]
|
||||
|
||||
|
||||
import utils
|
||||
|
@ -18,9 +18,10 @@ import wbexceptions
|
||||
|
||||
#=================================================================
|
||||
class ReplayView:
|
||||
def __init__(self, resolvers, loader = None):
|
||||
def __init__(self, resolvers, loader = None, reporter = None):
|
||||
self.resolvers = resolvers
|
||||
self.loader = loader if loader else archiveloader.ArchiveLoader()
|
||||
self._reporter = reporter
|
||||
|
||||
|
||||
def __call__(self, wbrequest, cdx_lines, cdx_reader):
|
||||
@ -41,7 +42,13 @@ class ReplayView:
|
||||
|
||||
(cdx, status_headers, stream) = self.resolve_headers_and_payload(cdx, wbrequest, cdx_reader, failed_files)
|
||||
|
||||
return self.make_response(wbrequest, cdx, status_headers, stream)
|
||||
response = self.make_response(wbrequest, cdx, status_headers, stream)
|
||||
|
||||
# notify reporter callback, if any
|
||||
if self._reporter:
|
||||
self._reporter(wbrequest, cdx, response)
|
||||
|
||||
return response
|
||||
|
||||
|
||||
except wbexceptions.CaptureException as ce:
|
||||
@ -83,7 +90,7 @@ class ReplayView:
|
||||
try:
|
||||
return self.loader.load(path, offset, length)
|
||||
|
||||
except URLError as ue:
|
||||
except Exception as ue:
|
||||
last_exc = ue
|
||||
print last_exc
|
||||
pass
|
||||
@ -231,8 +238,8 @@ class ReplayView:
|
||||
#=================================================================
|
||||
class RewritingReplayView(ReplayView):
|
||||
|
||||
def __init__(self, resolvers, loader = None, head_insert_view = None, header_rewriter = None, redir_to_exact = True, buffer_response = False):
|
||||
ReplayView.__init__(self, resolvers, loader)
|
||||
def __init__(self, resolvers, loader = None, head_insert_view = None, header_rewriter = None, redir_to_exact = True, buffer_response = False, reporter = None):
|
||||
ReplayView.__init__(self, resolvers, loader, reporter)
|
||||
self.head_insert_view = head_insert_view
|
||||
self.header_rewriter = header_rewriter if header_rewriter else HeaderRewriter()
|
||||
self.redir_to_exact = redir_to_exact
|
||||
@ -241,7 +248,6 @@ class RewritingReplayView(ReplayView):
|
||||
self.buffer_response = buffer_response
|
||||
|
||||
|
||||
|
||||
def _text_content_type(self, content_type):
|
||||
for ctype, mimelist in self.REWRITE_TYPES.iteritems():
|
||||
if any ((mime in content_type) for mime in mimelist):
|
||||
@ -411,4 +417,3 @@ class RewritingReplayView(ReplayView):
|
||||
if (UrlRewriter.strip_protocol(request_url) == UrlRewriter.strip_protocol(location_url)):
|
||||
raise wbexceptions.CaptureException('Self Redirect: ' + str(cdx))
|
||||
|
||||
|
||||
|
@ -1,5 +1,4 @@
|
||||
import itertools
|
||||
import hmac
|
||||
import time
|
||||
import zlib
|
||||
import time
|
||||
@ -26,36 +25,6 @@ def split_prefix(key, prefixs):
|
||||
def create_decompressor():
|
||||
return zlib.decompressobj(16 + zlib.MAX_WBITS)
|
||||
|
||||
#=================================================================
|
||||
# Cookie Signing
|
||||
#=================================================================
|
||||
|
||||
class HMACCookieMaker:
    """Callable that builds HMAC-signed, expiring cookie strings.

    key  -- secret key handed to hmac.new() for signing
    name -- cookie name used as the prefix of the produced string
    """

    def __init__(self, key, name):
        self.key = key
        self.name = name

    def __call__(self, duration, extra_id = ''):
        """Return a signed cookie string expiring ``duration`` after now.

        duration -- offset added to time.time() to get the expiry timestamp
        extra_id -- optional id; when non-empty it is included both in the
                    signed message and in the cookie name

        Result is ``name=expire-hexdigest`` or, with an extra id,
        ``name-extra_id=expire-hexdigest``.
        """
        expire = str(long(time.time() + duration))

        # the signed message is the expiry, prefixed by the extra id if any
        msg = (extra_id + '-' + expire) if extra_id else expire

        # no digestmod is passed, so hmac's default digest is used
        hexdigest = hmac.new(self.key, msg).hexdigest()

        if not extra_id:
            return '{0}={1}-{2}'.format(self.name, expire, hexdigest)

        return '{0}-{1}={2}-{3}'.format(self.name, extra_id, expire, hexdigest)
|
||||
|
||||
#return cookie + hexdigest
|
||||
|
||||
|
||||
#=================================================================
|
||||
# Adapted from example at
|
||||
|
@ -161,3 +161,9 @@ class TestWb:
|
||||
resp = self.testapp.get('/pywb/?abc', status = 400)
|
||||
assert resp.status_int == 400
|
||||
assert 'Bad Request Url: http://?abc' in resp.body
|
||||
|
||||
# Reporter callback for replay view
|
||||
def print_reporter(wbrequest, cdx, response):
    """Sample reporter callback: print the request and the cdx entry.

    ``response`` is accepted to match the reporter call signature but
    is not used. Returns None.
    """
    # single-argument print(...) behaves the same as the print statement
    print(wbrequest)
    print(cdx)
|
||||
|
@ -33,7 +33,7 @@ surt_ordered: true
|
||||
# * http:// path, use path as remote prefix
|
||||
# * redis:// path, use redis to lookup full path for w:<warc> as key
|
||||
|
||||
archive_paths: ./sample_archive/warcs/
|
||||
archive_paths: ['./invalid/path/to/ignore/', './sample_archive/warcs/']
|
||||
|
||||
# ==== Optional UI: HTML/Jinja2 Templates ====
|
||||
|
||||
@ -89,3 +89,7 @@ enable_http_proxy: true
|
||||
|
||||
# enable cdx server api for querying cdx directly (experimental)
|
||||
enable_cdx_api: true
|
||||
|
||||
# optional reporter callback func
|
||||
# if set, called with the request, cdx and response objects
|
||||
reporter_func: pywb.run-tests.print_reporter
|
||||
|
Loading…
x
Reference in New Issue
Block a user