mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
Simplify pywb_init config:
- add a defaults dictionary; chain dictionaries rather than copying them
- allow custom classes to be loaded explicitly via YAML
- for LineReader, assume an ungzipped stream if the first decompress attempt fails
- properly ignore bad local paths
- add an optional reporter object
This commit is contained in:
parent
8b2bfa570c
commit
e4f409b2a4
@ -190,9 +190,8 @@ class ArchiveLoader:
|
|||||||
def load(self, url, offset, length):
|
def load(self, url, offset, length):
|
||||||
url_parts = urlparse.urlsplit(url)
|
url_parts = urlparse.urlsplit(url)
|
||||||
|
|
||||||
try:
|
loader = self.loaders.get(url_parts.scheme)
|
||||||
loader = self.loaders.get(url_parts.scheme)
|
if not loader:
|
||||||
except Exception:
|
|
||||||
raise wbexceptions.UnknownLoaderProtocolException(url)
|
raise wbexceptions.UnknownLoaderProtocolException(url)
|
||||||
|
|
||||||
the_format = None
|
the_format = None
|
||||||
@ -319,11 +318,18 @@ class LineReader:
|
|||||||
self._process_read(data)
|
self._process_read(data)
|
||||||
|
|
||||||
def _process_read(self, data):
|
def _process_read(self, data):
|
||||||
self.num_read += len(data)
|
|
||||||
|
|
||||||
if self.decomp and data:
|
if self.decomp and data:
|
||||||
data = self.decomp.decompress(data)
|
try:
|
||||||
|
data = self.decomp.decompress(data)
|
||||||
|
except Exception:
|
||||||
|
# if first read attempt, assume non-gzipped stream
|
||||||
|
if self.num_read == 0:
|
||||||
|
self.decomp = False
|
||||||
|
# otherwise (partly decompressed), something is wrong
|
||||||
|
else:
|
||||||
|
raise
|
||||||
|
|
||||||
|
self.num_read += len(data)
|
||||||
self.buff = StringIO.StringIO(data)
|
self.buff = StringIO.StringIO(data)
|
||||||
|
|
||||||
|
|
||||||
|
@ -5,7 +5,8 @@ import indexreader
|
|||||||
import replay_views
|
import replay_views
|
||||||
import replay_resolvers
|
import replay_resolvers
|
||||||
import logging
|
import logging
|
||||||
|
import hmac
|
||||||
|
import time
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
# Config Loading
|
# Config Loading
|
||||||
@ -17,25 +18,55 @@ def load_template_file(file, desc = None, view_class = views.J2TemplateView):
|
|||||||
|
|
||||||
return file
|
return file
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
# Cookie Signing
|
||||||
|
#=================================================================
|
||||||
|
|
||||||
|
class HMACCookieMaker:
|
||||||
|
def __init__(self, key, name):
|
||||||
|
self.key = key
|
||||||
|
self.name = name
|
||||||
|
|
||||||
|
def __call__(self, duration, extra_id = ''):
|
||||||
|
expire = str(long(time.time() + duration))
|
||||||
|
|
||||||
|
if extra_id:
|
||||||
|
msg = extra_id + '-' + expire
|
||||||
|
else:
|
||||||
|
msg = expire
|
||||||
|
|
||||||
|
hmacdigest = hmac.new(self.key, msg)
|
||||||
|
hexdigest = hmacdigest.hexdigest()
|
||||||
|
|
||||||
|
if extra_id:
|
||||||
|
cookie = '{0}-{1}={2}-{3}'.format(self.name, extra_id, expire, hexdigest)
|
||||||
|
else:
|
||||||
|
cookie = '{0}={1}-{2}'.format(self.name, expire, hexdigest)
|
||||||
|
|
||||||
|
return cookie
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
def create_wb_handler(**config):
|
def create_wb_handler(cdx_source, config):
|
||||||
|
|
||||||
replayer = replay_views.RewritingReplayView(
|
replayer = replay_views.RewritingReplayView(
|
||||||
|
|
||||||
resolvers = replay_resolvers.make_best_resolvers(config.get('archive_paths')),
|
resolvers = replay_resolvers.make_best_resolvers(config.get('archive_paths')),
|
||||||
|
|
||||||
loader = archiveloader.ArchiveLoader(hmac = config.get('hmac', None)),
|
loader = archiveloader.ArchiveLoader(hmac = config.get('hmac')),
|
||||||
|
|
||||||
head_insert_view = load_template_file(config.get('head_html'), 'Head Insert'),
|
head_insert_view = load_template_file(config.get('head_insert_html'), 'Head Insert'),
|
||||||
|
|
||||||
buffer_response = config.get('buffer_response', True),
|
buffer_response = config.get('buffer_response', True),
|
||||||
|
|
||||||
redir_to_exact = config.get('redir_to_exact', True),
|
redir_to_exact = config.get('redir_to_exact', True),
|
||||||
|
|
||||||
|
reporter = config.get('reporter')
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
wb_handler = handlers.WBHandler(
|
wb_handler = handlers.WBHandler(
|
||||||
config['cdx_source'],
|
cdx_source,
|
||||||
|
|
||||||
replayer,
|
replayer,
|
||||||
|
|
||||||
@ -46,18 +77,3 @@ def create_wb_handler(**config):
|
|||||||
|
|
||||||
return wb_handler
|
return wb_handler
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
def load_class(name):
|
|
||||||
result = name.rsplit('.', 1)
|
|
||||||
|
|
||||||
if len(result) == 1:
|
|
||||||
modname == ''
|
|
||||||
klass = result[0]
|
|
||||||
else:
|
|
||||||
modname = result[0]
|
|
||||||
klass = result[1]
|
|
||||||
|
|
||||||
mod = __import__(modname, fromlist=[klass])
|
|
||||||
return getattr(mod, klass)
|
|
||||||
|
|
||||||
|
@ -45,7 +45,7 @@ class IndexReader:
|
|||||||
raise NotImplementedError('Override in subclasses')
|
raise NotImplementedError('Override in subclasses')
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def make_best_cdx_source(paths, **config):
|
def make_best_cdx_source(paths, config):
|
||||||
# may be a string or list
|
# may be a string or list
|
||||||
surt_ordered = config.get('surt_ordered', True)
|
surt_ordered = config.get('surt_ordered', True)
|
||||||
|
|
||||||
|
@ -8,24 +8,45 @@ import logging
|
|||||||
import proxy
|
import proxy
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
DEFAULT_HEAD_INSERT = 'ui/head_insert.html'
|
DEFAULTS = {
|
||||||
DEFAULT_QUERY = 'ui/query.html'
|
'hostpaths': ['http://localhost:8080'],
|
||||||
DEFAULT_SEARCH = 'ui/search.html'
|
'collections': {'pywb': './sample_archive/cdx/'},
|
||||||
DEFAULT_INDEX = 'ui/index.html'
|
'archive_paths': './sample_archive/warcs/',
|
||||||
DEFAULT_ERROR = 'ui/error.html'
|
|
||||||
|
'head_insert_html': 'ui/head_insert.html',
|
||||||
|
'query_html': 'ui/query.html',
|
||||||
|
'search_html': 'ui/search.html',
|
||||||
|
'home_html': 'ui/index.html',
|
||||||
|
'error_html': 'ui/error.html',
|
||||||
|
|
||||||
|
'static_routes': {'static/default': 'static/'},
|
||||||
|
}
|
||||||
|
|
||||||
|
class DictChain:
    """Read-only view over several dict-like objects, consulted in order.

    Each object only needs to provide a ``get(key)`` method, so a
    ``DictChain`` may itself be used as a link in another ``DictChain``.
    Earlier objects take priority over later ones.
    """

    def __init__(self, *dicts):
        # Stored in priority order: the first dict wins over the ones after it.
        self.dicts = dicts

    def get(self, key, default_val=None):
        """Return the first value set for ``key``, else ``default_val``.

        A value counts as set when it is not ``None``. Explicit falsy
        values such as ``False``, ``0`` or ``''`` are returned as-is, so a
        setting like ``buffer_response: False`` is not silently replaced by
        a lower-priority dict or by ``default_val``.
        """
        for d in self.dicts:
            val = d.get(key)
            # Compare against None rather than truthiness: False/0/'' are
            # legitimate configured values and must not fall through.
            if val is not None:
                return val
        return default_val
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
## Reference non-YAML config
|
## Reference non-YAML config
|
||||||
#=================================================================
|
#=================================================================
|
||||||
def pywb_config_manual(config = {}):
|
def pywb_config_manual(passed_config = {}):
|
||||||
|
|
||||||
|
config = DictChain(passed_config, DEFAULTS)
|
||||||
|
|
||||||
routes = []
|
routes = []
|
||||||
|
|
||||||
hostpaths = config.get('hostpaths', ['http://localhost:8080'])
|
hostpaths = config.get('hostpaths')
|
||||||
|
|
||||||
# collections based on cdx source
|
# collections based on cdx source
|
||||||
collections = config.get('collections', {'pywb': './sample_archive/cdx/'})
|
collections = config.get('collections')
|
||||||
|
|
||||||
for name, value in collections.iteritems():
|
for name, value in collections.iteritems():
|
||||||
route_config = config
|
route_config = config
|
||||||
@ -33,28 +54,21 @@ def pywb_config_manual(config = {}):
|
|||||||
if isinstance(value, dict):
|
if isinstance(value, dict):
|
||||||
# if a dict, extend with base properies
|
# if a dict, extend with base properies
|
||||||
index_paths = value['index_paths']
|
index_paths = value['index_paths']
|
||||||
value.update(route_config)
|
route_config = DictChain(value, config)
|
||||||
route_config = value
|
|
||||||
else:
|
else:
|
||||||
index_paths = str(value)
|
index_paths = str(value)
|
||||||
|
|
||||||
cdx_source = indexreader.IndexReader.make_best_cdx_source(index_paths, **config)
|
cdx_source = indexreader.IndexReader.make_best_cdx_source(index_paths, route_config)
|
||||||
|
|
||||||
|
|
||||||
wb_handler = config_utils.create_wb_handler(
|
wb_handler = config_utils.create_wb_handler(
|
||||||
cdx_source = cdx_source,
|
cdx_source = cdx_source,
|
||||||
archive_paths = route_config.get('archive_paths', './sample_archive/warcs/'),
|
config = route_config,
|
||||||
head_html = route_config.get('head_insert_html', DEFAULT_HEAD_INSERT),
|
|
||||||
query_html = route_config.get('query_html', DEFAULT_QUERY),
|
|
||||||
search_html = route_config.get('search_html', DEFAULT_SEARCH),
|
|
||||||
)
|
)
|
||||||
|
|
||||||
logging.info('Adding Collection: ' + name)
|
logging.info('Adding Collection: ' + name)
|
||||||
|
|
||||||
route_class = route_config.get('route_class', None)
|
route_class = route_config.get('route_class', archivalrouter.Route)
|
||||||
if route_class:
|
|
||||||
route_class = config_utils.load_class(route_class)
|
|
||||||
else:
|
|
||||||
route_class = archivalrouter.Route
|
|
||||||
|
|
||||||
routes.append(route_class(name, wb_handler, config = route_config))
|
routes.append(route_class(name, wb_handler, config = route_config))
|
||||||
|
|
||||||
@ -70,7 +84,7 @@ def pywb_config_manual(config = {}):
|
|||||||
routes.append(archivalrouter.Route('echo_req', handlers.DebugEchoHandler()))
|
routes.append(archivalrouter.Route('echo_req', handlers.DebugEchoHandler()))
|
||||||
|
|
||||||
|
|
||||||
static_routes = config.get('static_routes', {'static/default': 'static/'})
|
static_routes = config.get('static_routes')
|
||||||
|
|
||||||
for static_name, static_path in static_routes.iteritems():
|
for static_name, static_path in static_routes.iteritems():
|
||||||
routes.append(archivalrouter.Route(static_name, handlers.StaticHandler(static_path)))
|
routes.append(archivalrouter.Route(static_name, handlers.StaticHandler(static_path)))
|
||||||
@ -91,8 +105,8 @@ def pywb_config_manual(config = {}):
|
|||||||
|
|
||||||
abs_path = config.get('absolute_paths', True),
|
abs_path = config.get('absolute_paths', True),
|
||||||
|
|
||||||
home_view = config_utils.load_template_file(config.get('home_html', DEFAULT_INDEX), 'Home Page'),
|
home_view = config_utils.load_template_file(config.get('home_html'), 'Home Page'),
|
||||||
error_view = config_utils.load_template_file(config.get('error_html', DEFAULT_ERROR), 'Error Page')
|
error_view = config_utils.load_template_file(config.get('error_html'), 'Error Page')
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -72,7 +72,7 @@ def make_best_resolver(param):
|
|||||||
PrefixResolver('http://myhost.example.com/warcs/')
|
PrefixResolver('http://myhost.example.com/warcs/')
|
||||||
|
|
||||||
# http path w/ contains param
|
# http path w/ contains param
|
||||||
>>> make_best_resolver(('http://myhost.example.com/warcs/', '/'))
|
>>> make_best_resolver(['http://myhost.example.com/warcs/', '/'])
|
||||||
PrefixResolver('http://myhost.example.com/warcs/', contains = '/')
|
PrefixResolver('http://myhost.example.com/warcs/', contains = '/')
|
||||||
|
|
||||||
# redis path
|
# redis path
|
||||||
@ -89,7 +89,7 @@ def make_best_resolver(param):
|
|||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if isinstance(param, tuple):
|
if isinstance(param, list):
|
||||||
path = param[0]
|
path = param[0]
|
||||||
arg = param[1]
|
arg = param[1]
|
||||||
else:
|
else:
|
||||||
@ -116,12 +116,15 @@ def make_best_resolver(param):
|
|||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
def make_best_resolvers(*paths):
|
def make_best_resolvers(paths):
|
||||||
"""
|
"""
|
||||||
>>> make_best_resolvers('http://myhost.example.com/warcs/', 'redis://myhost.example.com:1234/1')
|
>>> make_best_resolvers(['http://myhost.example.com/warcs/', 'redis://myhost.example.com:1234/1'])
|
||||||
[PrefixResolver('http://myhost.example.com/warcs/'), RedisResolver('redis://myhost.example.com:1234/1')]
|
[PrefixResolver('http://myhost.example.com/warcs/'), RedisResolver('redis://myhost.example.com:1234/1')]
|
||||||
"""
|
"""
|
||||||
return map(make_best_resolver, paths)
|
if hasattr(paths, '__iter__'):
|
||||||
|
return map(make_best_resolver, paths)
|
||||||
|
else:
|
||||||
|
return [make_best_resolver(paths)]
|
||||||
|
|
||||||
|
|
||||||
import utils
|
import utils
|
||||||
|
@ -18,9 +18,10 @@ import wbexceptions
|
|||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class ReplayView:
|
class ReplayView:
|
||||||
def __init__(self, resolvers, loader = None):
|
def __init__(self, resolvers, loader = None, reporter = None):
|
||||||
self.resolvers = resolvers
|
self.resolvers = resolvers
|
||||||
self.loader = loader if loader else archiveloader.ArchiveLoader()
|
self.loader = loader if loader else archiveloader.ArchiveLoader()
|
||||||
|
self._reporter = reporter
|
||||||
|
|
||||||
|
|
||||||
def __call__(self, wbrequest, cdx_lines, cdx_reader):
|
def __call__(self, wbrequest, cdx_lines, cdx_reader):
|
||||||
@ -41,7 +42,13 @@ class ReplayView:
|
|||||||
|
|
||||||
(cdx, status_headers, stream) = self.resolve_headers_and_payload(cdx, wbrequest, cdx_reader, failed_files)
|
(cdx, status_headers, stream) = self.resolve_headers_and_payload(cdx, wbrequest, cdx_reader, failed_files)
|
||||||
|
|
||||||
return self.make_response(wbrequest, cdx, status_headers, stream)
|
response = self.make_response(wbrequest, cdx, status_headers, stream)
|
||||||
|
|
||||||
|
# notify reporter callback, if any
|
||||||
|
if self._reporter:
|
||||||
|
self._reporter(wbrequest, cdx, response)
|
||||||
|
|
||||||
|
return response
|
||||||
|
|
||||||
|
|
||||||
except wbexceptions.CaptureException as ce:
|
except wbexceptions.CaptureException as ce:
|
||||||
@ -83,7 +90,7 @@ class ReplayView:
|
|||||||
try:
|
try:
|
||||||
return self.loader.load(path, offset, length)
|
return self.loader.load(path, offset, length)
|
||||||
|
|
||||||
except URLError as ue:
|
except Exception as ue:
|
||||||
last_exc = ue
|
last_exc = ue
|
||||||
print last_exc
|
print last_exc
|
||||||
pass
|
pass
|
||||||
@ -231,8 +238,8 @@ class ReplayView:
|
|||||||
#=================================================================
|
#=================================================================
|
||||||
class RewritingReplayView(ReplayView):
|
class RewritingReplayView(ReplayView):
|
||||||
|
|
||||||
def __init__(self, resolvers, loader = None, head_insert_view = None, header_rewriter = None, redir_to_exact = True, buffer_response = False):
|
def __init__(self, resolvers, loader = None, head_insert_view = None, header_rewriter = None, redir_to_exact = True, buffer_response = False, reporter = None):
|
||||||
ReplayView.__init__(self, resolvers, loader)
|
ReplayView.__init__(self, resolvers, loader, reporter)
|
||||||
self.head_insert_view = head_insert_view
|
self.head_insert_view = head_insert_view
|
||||||
self.header_rewriter = header_rewriter if header_rewriter else HeaderRewriter()
|
self.header_rewriter = header_rewriter if header_rewriter else HeaderRewriter()
|
||||||
self.redir_to_exact = redir_to_exact
|
self.redir_to_exact = redir_to_exact
|
||||||
@ -241,7 +248,6 @@ class RewritingReplayView(ReplayView):
|
|||||||
self.buffer_response = buffer_response
|
self.buffer_response = buffer_response
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def _text_content_type(self, content_type):
|
def _text_content_type(self, content_type):
|
||||||
for ctype, mimelist in self.REWRITE_TYPES.iteritems():
|
for ctype, mimelist in self.REWRITE_TYPES.iteritems():
|
||||||
if any ((mime in content_type) for mime in mimelist):
|
if any ((mime in content_type) for mime in mimelist):
|
||||||
@ -411,4 +417,3 @@ class RewritingReplayView(ReplayView):
|
|||||||
if (UrlRewriter.strip_protocol(request_url) == UrlRewriter.strip_protocol(location_url)):
|
if (UrlRewriter.strip_protocol(request_url) == UrlRewriter.strip_protocol(location_url)):
|
||||||
raise wbexceptions.CaptureException('Self Redirect: ' + str(cdx))
|
raise wbexceptions.CaptureException('Self Redirect: ' + str(cdx))
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,5 +1,4 @@
|
|||||||
import itertools
|
import itertools
|
||||||
import hmac
|
|
||||||
import time
|
import time
|
||||||
import zlib
|
import zlib
|
||||||
import time
|
import time
|
||||||
@ -26,36 +25,6 @@ def split_prefix(key, prefixs):
|
|||||||
def create_decompressor():
|
def create_decompressor():
|
||||||
return zlib.decompressobj(16 + zlib.MAX_WBITS)
|
return zlib.decompressobj(16 + zlib.MAX_WBITS)
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
# Cookie Signing
|
|
||||||
#=================================================================
|
|
||||||
|
|
||||||
class HMACCookieMaker:
|
|
||||||
def __init__(self, key, name):
|
|
||||||
self.key = key
|
|
||||||
self.name = name
|
|
||||||
|
|
||||||
|
|
||||||
def __call__(self, duration, extra_id = ''):
|
|
||||||
expire = str(long(time.time() + duration))
|
|
||||||
|
|
||||||
if extra_id:
|
|
||||||
msg = extra_id + '-' + expire
|
|
||||||
else:
|
|
||||||
msg = expire
|
|
||||||
|
|
||||||
hmacdigest = hmac.new(self.key, msg)
|
|
||||||
hexdigest = hmacdigest.hexdigest()
|
|
||||||
|
|
||||||
if extra_id:
|
|
||||||
cookie = '{0}-{1}={2}-{3}'.format(self.name, extra_id, expire, hexdigest)
|
|
||||||
else:
|
|
||||||
cookie = '{0}={1}-{2}'.format(self.name, expire, hexdigest)
|
|
||||||
|
|
||||||
return cookie
|
|
||||||
|
|
||||||
#return cookie + hexdigest
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
# Adapted from example at
|
# Adapted from example at
|
||||||
|
@ -161,3 +161,9 @@ class TestWb:
|
|||||||
resp = self.testapp.get('/pywb/?abc', status = 400)
|
resp = self.testapp.get('/pywb/?abc', status = 400)
|
||||||
assert resp.status_int == 400
|
assert resp.status_int == 400
|
||||||
assert 'Bad Request Url: http://?abc' in resp.body
|
assert 'Bad Request Url: http://?abc' in resp.body
|
||||||
|
|
||||||
|
# Reporter callback for replay view
|
||||||
|
def print_reporter(wbrequest, cdx, response):
|
||||||
|
print wbrequest
|
||||||
|
print cdx
|
||||||
|
pass
|
||||||
|
@ -33,7 +33,7 @@ surt_ordered: true
|
|||||||
# * http:// path, use path as remote prefix
|
# * http:// path, use path as remote prefix
|
||||||
# * redis:// path, use redis to lookup full path for w:<warc> as key
|
# * redis:// path, use redis to lookup full path for w:<warc> as key
|
||||||
|
|
||||||
archive_paths: ./sample_archive/warcs/
|
archive_paths: ['./invalid/path/to/ignore/', './sample_archive/warcs/']
|
||||||
|
|
||||||
# ==== Optional UI: HTML/Jinja2 Templates ====
|
# ==== Optional UI: HTML/Jinja2 Templates ====
|
||||||
|
|
||||||
@ -89,3 +89,7 @@ enable_http_proxy: true
|
|||||||
|
|
||||||
# enable cdx server api for querying cdx directly (experimental)
|
# enable cdx server api for querying cdx directly (experimental)
|
||||||
enable_cdx_api: true
|
enable_cdx_api: true
|
||||||
|
|
||||||
|
# optional reporter callback func
|
||||||
|
# if set, called with request and cdx object
|
||||||
|
reporter_func: pywb.run-tests.print_reporter
|
||||||
|
Loading…
x
Reference in New Issue
Block a user