1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

config cleanup:

- auto/dyn collections: use overridable 'index_paths' and 'archive_paths', support list for archive_paths
- all-auto collection: supported at warcserver layer via special '$all' index
- cleanup default_config.yaml and config.yaml, remove obsolete properties
- remove obsolete docker-compose.yaml
- default_config: simplify list of managed properties
- test_cli: add tests for cli options
This commit is contained in:
Ilya Kreymer 2017-10-03 15:31:08 -07:00
parent 16ede7abbb
commit b631a24a0e
13 changed files with 144 additions and 293 deletions

View File

@ -1,123 +1,14 @@
# pywb config file # pywb config file
# ======================================== # ========================================
# #
# Settings for each collection
use_js_obj_proxy: true
collections: collections:
# <name>: <cdx_path> pywb:
# collection will be accessed via /<name> index: ./sample_archive/cdx/
# <cdx_path> is a string or list of: resource: ./sample_archive/warcs/
# - string or list of one or more local .cdx file
# - string or list of one or more local dirs with .cdx files
# - a string value indicating remote http cdx server
pywb: ./sample_archive/cdx/
# ex with filtering: filter CDX lines by filename starting with 'dupe' # Settings for each collection
#pywb-filt: {'index_paths': './sample_archive/cdx/', 'filters': ['filename:dupe*']} use_js_obj_proxy: true
# indicate if cdx files are sorted by SURT keys -- eg: com,example)/
# SURT keys are recommended for future indices, but non-SURT cdxs
# are also supported
#
# * Set to true if cdxs start with surts: com,example)/
# * Set to false if cdx start with urls: example.com)/
#
# default:
# surt_ordered: true
# list of paths prefixes for pywb look to 'resolve' WARC and ARC filenames
# in the cdx to their absolute path
#
# if path is:
# * local dir, use path as prefix
# * local file, lookup prefix in tab-delimited sorted index
# * http:// path, use path as remote prefix
# * redis:// path, use redis to lookup full path for w:<warc> as key
archive_paths: ./sample_archive/warcs/
# ==== Proxy Mode ====
# Enable simple http proxy mode
enable_http_proxy: true
# Additional proxy options (defaults)
# proxy_options:
# enable HTTPS proxy support (requires openssl library)
# enable_https_proxy: false
#
# use cookies to switch collections and capture times
# if not enabled, requires use of proxy auth
# cookie_resolver: true
#
# default collection to start out in proxy mode
# if not set, will ask the first time
# use_default_coll: pywb
# use wildcard certificates when creating certs in proxy mode
# helps lower number of certs created, but may not be compatible
# with older libraries
# use_wildcard_certs: true
# if true, will not add any banner to proxy mode replay
# unaltered_replay: false
# Default settings for CA used by proxy mode:
# root_ca_file: ./ca/pywb-ca.pem
# root_ca_name: pywb https proxy replay CA
# certs_dir: ./ca/certs
# ==== UI: HTML/Jinja2 Templates ====
# The following are default settings -- uncomment to change
# Set to '' to disable the ui
# template for <head> insert into replayed html content
#head_insert_html: ui/head_insert.html
#
#
# template for just the banner modifications
# set to False to disable completely
#banner_html: banner.html
# template for 'calendar' query,
# eg, a listing of captures in response to a ../*/<url>
#
# may be a simple listing or a more complex 'calendar' UI
# if omitted, will list raw cdx in plain text
#query_html: ui/query.html
# template for search page, which is displayed when no search url is entered
# in a collection
#search_html: ui/search.html
# template for home page.
# if no other route is set, this will be rendered at /, /index.htm and /index.html
#home_html: ui/index.html
# error page template for formatting error message and details
# if omitted, a text response is returned
#error_html: ui/error.html
# ==== Other Paths ====
# Rewrite urls with absolute paths instead of relative
#absoulte_paths: true
# List of route names:
# <route>: <package or file path>
# default route static/__pywb for pywb bundled static files
#static_routes:
# static/__pywb: pywb/static/
# enable cdx server api for querying cdx directly (experimental)
enable_cdx_api: true
# custom rules for domain specific matching
# set to false to disable
#domain_specific_rules: rules.yaml
# Memento support, enable # Memento support, enable
enable_memento: true enable_memento: true
@ -125,5 +16,3 @@ enable_memento: true
# Replay content in an iframe # Replay content in an iframe
framed_replay: true framed_replay: true
# debugging utility -- echo request data
# debug_echo_env: false

View File

@ -1,19 +0,0 @@
version: '2'
services:
proxy:
build: ./proxy/
links:
- webagg:webagg
environment:
- "WEBAGG=http://webrecplatform_webagg_1:8080"
ports:
- 9080:9080
volumes:
- ${HOME}/.mitmproxy/:/root/.mitmproxy/
webagg:
build: ./webagg/

View File

@ -98,7 +98,9 @@ class ReplayCli(BaseCli):
super(ReplayCli, self).load() super(ReplayCli, self).load()
if self.r.all_coll: if self.r.all_coll:
self.extra_config['all_coll'] = self.r.all_coll if 'collections' not in self.extra_config:
self.extra_config['collections'] = {}
self.extra_config['collections'][self.r.all_coll] = '$all'
import os import os
if self.r.directory: #pragma: no cover if self.r.directory: #pragma: no cover

View File

@ -44,7 +44,6 @@ class FrontEndApp(object):
PROXY_CA_PATH = os.path.join('proxy-certs', 'pywb-ca.pem') PROXY_CA_PATH = os.path.join('proxy-certs', 'pywb-ca.pem')
def __init__(self, config_file='./config.yaml', custom_config=None): def __init__(self, config_file='./config.yaml', custom_config=None):
print('CUSTOM', custom_config)
self.handler = self.handle_request self.handler = self.handle_request
self.warcserver = WarcServer(config_file=config_file, self.warcserver = WarcServer(config_file=config_file,
custom_config=custom_config) custom_config=custom_config)
@ -59,7 +58,7 @@ class FrontEndApp(object):
self.init_recorder(config.get('recorder')) self.init_recorder(config.get('recorder'))
static_path = config.get('static_path', 'pywb/static/').replace('/', os.path.sep) static_path = config.get('static_url_path', 'pywb/static/').replace('/', os.path.sep)
self.static_handler = StaticHandler(static_path) self.static_handler = StaticHandler(static_path)
self.all_coll = config.get('all_coll', None) self.all_coll = config.get('all_coll', None)
@ -125,7 +124,7 @@ class FrontEndApp(object):
# TODO: support dedup # TODO: support dedup
dedup_index = None dedup_index = None
warc_writer = MultiFileWARCWriter(self.warcserver.archive_templ, warc_writer = MultiFileWARCWriter(self.warcserver.archive_paths,
max_size=int(recorder_config.get('max_size', 1000000000)), max_size=int(recorder_config.get('max_size', 1000000000)),
max_idle_secs=int(recorder_config.get('max_idle_secs', 600)), max_idle_secs=int(recorder_config.get('max_idle_secs', 600)),
filename_template=recorder_config.get('filename_template'), filename_template=recorder_config.get('filename_template'),
@ -166,8 +165,8 @@ class FrontEndApp(object):
self.raise_not_found(environ, 'Static File Not Found: {0}'.format(filepath)) self.raise_not_found(environ, 'Static File Not Found: {0}'.format(filepath))
def get_metadata(self, coll): def get_metadata(self, coll):
if coll == self.all_coll: #if coll == self.all_coll:
coll = '*' # coll = '*'
metadata = {'coll': coll, metadata = {'coll': coll,
'type': 'replay'} 'type': 'replay'}
@ -205,8 +204,8 @@ class FrontEndApp(object):
def serve_cdx(self, environ, coll='$root'): def serve_cdx(self, environ, coll='$root'):
base_url = self.rewriterapp.paths['cdx-server'] base_url = self.rewriterapp.paths['cdx-server']
if coll == self.all_coll: #if coll == self.all_coll:
coll = '*' # coll = '*'
cdx_url = base_url.format(coll=coll) cdx_url = base_url.format(coll=coll)
@ -280,8 +279,8 @@ class FrontEndApp(object):
return WbResponse.json_response(result) return WbResponse.json_response(result)
def is_valid_coll(self, coll): def is_valid_coll(self, coll):
if coll == self.all_coll: #if coll == self.all_coll:
return True # return True
return (coll in self.warcserver.list_fixed_routes() or return (coll in self.warcserver.list_fixed_routes() or
coll in self.warcserver.list_dynamic_routes()) coll in self.warcserver.list_dynamic_routes())

View File

@ -1,68 +1,48 @@
collections_root: collections collections_root: collections
paths: # Per-Collection Paths
archive_paths: archive archive_paths: archive
index_paths: indexes index_paths: indexes
static_path: static static_path: static
templates_dir: templates templates_dir: templates
template_files:
banner_html: banner.html
head_insert_html: head_insert.html
frame_insert_html: frame_insert.html
query_html: query.html
search_html: search.html
not_found_html: not_found.html
shared_template_files:
home_html: index.html
error_html: error.html
proxy_cert_download_html: proxy_cert_download.html
proxy_select_html: proxy_select.html
info_json: collinfo.json
templates_dirs:
- templates
- '.'
- '/'
template_packages:
- pywb
# Template HTML
banner_html: banner.html
head_insert_html: head_insert.html head_insert_html: head_insert.html
frame_insert_html: frame_insert.html frame_insert_html: frame_insert.html
banner_html: banner.html
#archive_paths: ./
home_html: index.html
query_html: query.html query_html: query.html
search_html: search.html search_html: search.html
error_html: error.html
not_found_html: not_found.html not_found_html: not_found.html
home_html: index.html
error_html: error.html
proxy_cert_download_html: proxy_cert_download.html proxy_cert_download_html: proxy_cert_download.html
proxy_select_html: proxy_select.html proxy_select_html: proxy_select.html
# Info JSON
info_json: collinfo.json info_json: collinfo.json
static_default_prefix: &static_default_prefix static/__pywb # HTML Templates List
static_shared_prefix: static/__shared html_templates:
- banner_html
- head_insert_html
- frame_insert_html
template_globals: - query_html
static_path: *static_default_prefix - search_html
- not_found_html
static_routes: - home_html
*static_default_prefix: pywb/static/ - error_html
- proxy_cert_download_html
- proxy_select_html
# Other Settings
enable_memento: true enable_memento: true
domain_specific_rules: pywb/rules.yaml rules_config: pkg://pywb/rules.yaml
framed_replay: inverse

View File

@ -74,11 +74,11 @@ directory structure expected by pywb
def _get_root_dir(self, name): def _get_root_dir(self, name):
return os.path.join(os.getcwd(), return os.path.join(os.getcwd(),
self.default_config['paths'][name]) self.default_config[name])
def _get_dir(self, name): def _get_dir(self, name):
return os.path.join(self.curr_coll_dir, return os.path.join(self.curr_coll_dir,
self.default_config['paths'][name]) self.default_config[name])
def _create_dir(self, dirname): def _create_dir(self, dirname):
if not os.path.isdir(dirname): if not os.path.isdir(dirname):
@ -208,33 +208,22 @@ directory structure expected by pywb
def _load_templates_map(self): def _load_templates_map(self):
defaults = load_yaml_config(DEFAULT_CONFIG) defaults = load_yaml_config(DEFAULT_CONFIG)
temp_dir = defaults['paths']['templates_dir'] temp_dir = defaults['templates_dir']
# Coll Templates # Coll Templates
templates = defaults['paths']['template_files'] templates = defaults['html_templates']
for name, _ in six.iteritems(templates): for name in templates:
templates[name] = os.path.join(temp_dir, defaults[name]) defaults[name] = os.path.join(temp_dir, defaults[name])
# Shared Templates return defaults, templates
shared_templates = defaults['paths']['shared_template_files']
for name, _ in six.iteritems(shared_templates):
shared_templates[name] = os.path.join(temp_dir, defaults[name])
return templates, shared_templates
def list_templates(self): def list_templates(self):
templates, shared_templates = self._load_templates_map() defaults, templates = self._load_templates_map()
print('Shared Templates') print('HTML Shared and Per-Collection Templates')
for n, v in six.iteritems(shared_templates): for n in templates:
print('- {0}: (pywb/{1})'.format(n, v)) v = defaults[n]
print('')
print('Collection Templates')
for n, v in six.iteritems(templates):
print('- {0}: (pywb/{1})'.format(n, v)) print('- {0}: (pywb/{1})'.format(n, v))
def _confirm_overwrite(self, full_path, msg): def _confirm_overwrite(self, full_path, msg):
@ -251,10 +240,10 @@ directory structure expected by pywb
raise IOError('Skipping, {0} already exists'.format(full_path)) raise IOError('Skipping, {0} already exists'.format(full_path))
def _get_template_path(self, template_name, verb): def _get_template_path(self, template_name, verb):
templates, shared_templates = self._load_templates_map() defaults, templates = self._load_templates_map()
try: try:
filename = templates[template_name] filename = defaults[template_name]
if not self.coll_name: if not self.coll_name:
full_path = os.path.join(os.getcwd(), filename) full_path = os.path.join(os.getcwd(), filename)
else: else:
@ -262,14 +251,9 @@ directory structure expected by pywb
os.path.basename(filename)) os.path.basename(filename))
except KeyError: except KeyError:
try: msg = 'template name must be one of {0}'
filename = shared_templates[template_name] msg = msg.format(templates)
full_path = os.path.join(os.getcwd(), filename) raise KeyError(msg)
except KeyError:
msg = 'template name must be one of {0} or {1}'
msg = msg.format(templates.keys(), shared_templates.keys())
raise KeyError(msg)
return full_path, filename return full_path, filename

View File

@ -26,8 +26,8 @@ class BaseWarcServer(object):
self.url_map.add(Rule('/', endpoint=list_routes)) self.url_map.add(Rule('/', endpoint=list_routes))
def add_route(self, path, handler, path_param_name=''): def add_route(self, path, handler, path_param_name='', default_value=''):
def direct_input_request(environ, mode='', path_param_value=''): def direct_input_request(environ, mode='', path_param_value=default_value):
params = self.get_query_dict(environ) params = self.get_query_dict(environ)
params['mode'] = mode params['mode'] = mode
if path_param_value: if path_param_value:
@ -35,7 +35,7 @@ class BaseWarcServer(object):
params['_input_req'] = DirectWSGIInputRequest(environ) params['_input_req'] = DirectWSGIInputRequest(environ)
return handler(params) return handler(params)
def post_fullrequest(environ, mode='', path_param_value=''): def post_fullrequest(environ, mode='', path_param_value=default_value):
params = self.get_query_dict(environ) params = self.get_query_dict(environ)
params['mode'] = mode params['mode'] = mode
if path_param_value: if path_param_value:

View File

@ -14,4 +14,4 @@ endif =
gevent = 100 gevent = 100
gevent-monkey-patch = gevent-monkey-patch =
wsgi = webagg.test.live wsgi = warcserver.test.live

View File

@ -32,8 +32,7 @@ SOURCE_LIST = [LiveIndexSource,
# ============================================================================ # ============================================================================
class WarcServer(BaseWarcServer): class WarcServer(BaseWarcServer):
AUTO_DIR_INDEX_PATH = '{coll}/indexes/' AUTO_COLL_TEMPL = '{coll}'
AUTO_DIR_ARCHIVE_PATH = '{coll}/archive/'
def __init__(self, config_file='./config.yaml', custom_config=None): def __init__(self, config_file='./config.yaml', custom_config=None):
config = load_yaml_config(DEFAULT_CONFIG) config = load_yaml_config(DEFAULT_CONFIG)
@ -55,47 +54,49 @@ class WarcServer(BaseWarcServer):
super(WarcServer, self).__init__(debug=config.get('debug', False)) super(WarcServer, self).__init__(debug=config.get('debug', False))
self.config = config self.config = config
self.fixed_routes = self.load_colls() self.root_dir = self.config.get('collections_root', '')
self.index_paths = self.init_paths('index_paths')
self.archive_paths = self.init_paths('archive_paths', self.root_dir)
self.archive_templ = None self.auto_handler = None
self.indexes_templ = None
for name, route in iteritems(self.fixed_routes):
self.add_route('/' + name, route)
if self.config.get('enable_auto_colls', True): if self.config.get('enable_auto_colls', True):
auto_handler = self.load_auto_colls() self.auto_handler = self.load_auto_colls()
self.add_route('/<path:path_param_value>', auto_handler, path_param_name='param.coll')
def _lookup(self, environ, path): self.fixed_routes = self.load_colls()
urls = self.url_map.bind(environ['HTTP_HOST'], path_info=path)
try: for name, route in iteritems(self.fixed_routes):
endpoint, args = urls.match() if route == self.auto_handler:
result = endpoint(environ, **args) self.add_route('/' + name, route, path_param_name='param.coll', default_value='*')
return result else:
except Exception as e: self.add_route('/' + name, route)
print(e)
return None if self.auto_handler:
self.add_route('/<path:path_param_value>', self.auto_handler, path_param_name='param.coll')
def init_paths(self, name, abs_path=None):
templ = self.config.get(name)
def get_full_path(path):
path = os.path.join(self.AUTO_COLL_TEMPL, path, '')
if abs_path and '://' not in path:
path = os.path.join(abs_path, path)
return path
if isinstance(templ, str):
return get_full_path(templ)
else:
return [get_full_path(t) for t in templ]
def load_auto_colls(self): def load_auto_colls(self):
self.root_dir = self.config.get('collections_root', '')
if not self.root_dir: if not self.root_dir:
print('No Root Dir, Skip Auto Colls!') print('No Root Dir, Skip Auto Colls!')
return return
self.indexes_templ = self.config.get('dyn_index_path', self.AUTO_DIR_INDEX_PATH).replace('/', os.path.sep)
dir_source = CacheDirectoryIndexSource(base_prefix=self.root_dir, dir_source = CacheDirectoryIndexSource(base_prefix=self.root_dir,
base_dir=self.indexes_templ) base_dir=self.index_paths)
self.archive_templ = self.config.get('dyn_archive_path', self.AUTO_DIR_ARCHIVE_PATH).replace('/', os.path.sep) return DefaultResourceHandler(dir_source, self.archive_paths)
if '://' not in self.archive_templ:
self.archive_templ = os.path.join(self.root_dir, self.archive_templ)
handler = DefaultResourceHandler(dir_source, self.archive_templ)
return handler
def list_fixed_routes(self): def list_fixed_routes(self):
return list(self.fixed_routes.keys()) return list(self.fixed_routes.keys())
@ -126,8 +127,6 @@ class WarcServer(BaseWarcServer):
if not colls: if not colls:
return routes return routes
self.default_archive_paths = self.config.get('archive_paths')
for name, coll_config in iteritems(colls): for name, coll_config in iteritems(colls):
try: try:
handler = self.load_coll(name, coll_config) handler = self.load_coll(name, coll_config)
@ -143,6 +142,9 @@ class WarcServer(BaseWarcServer):
return routes return routes
def load_coll(self, name, coll_config): def load_coll(self, name, coll_config):
if coll_config == '$all' and self.auto_handler:
return self.auto_handler
if isinstance(coll_config, str): if isinstance(coll_config, str):
index = coll_config index = coll_config
resource = None resource = None
@ -176,7 +178,7 @@ class WarcServer(BaseWarcServer):
agg = init_index_agg(index_group, True, timeout) agg = init_index_agg(index_group, True, timeout)
if not resource: if not resource:
resource = self.default_archive_paths resource = self.config.get('archive_paths')
return DefaultResourceHandler(agg, resource) return DefaultResourceHandler(agg, resource)

View File

@ -5,7 +5,9 @@ debug: true
collections_root: _test_colls collections_root: _test_colls
collections: collections:
pywb: ./sample_archive/cdx/ pywb:
index: ./sample_archive/cdx/
resource: ./sample_archive/warcs/
with-js-proxy: with-js-proxy:
index: ./sample_archive/cdx/ index: ./sample_archive/cdx/
@ -31,10 +33,10 @@ collections:
pywb-cdxj: pywb-cdxj:
index_paths: ./sample_archive/cdxj/ index_paths: ./sample_archive/cdxj/
archive_paths: archive_paths:
- ./invalid/path/to/ignore/ - ./invalid/path/to/ignore/
- ./sample_archive/warcs/ - ./sample_archive/warcs/
- archive
enable_memento: true enable_memento: true

View File

@ -2,11 +2,10 @@ debug: true
collections_root: _test_colls collections_root: _test_colls
all_coll: all
recorder: live recorder: live
collections: collections:
'live': '$live' 'live': '$live'
'all': '$all'

42
tests/test_cli.py Normal file
View File

@ -0,0 +1,42 @@
import os
from mock import patch
import pytest
from pywb.apps.cli import wayback
from .base_config_test import CollsDirMixin, BaseTestClass
# ============================================================================
def _run_patch(self):
return self
@patch('pywb.apps.cli.ReplayCli.run', _run_patch)
class TestProxyCLIConfig(CollsDirMixin, BaseTestClass):
def test_proxy_cli(self):
res = wayback(['--proxy', 'test'])
exp = {'ca_file_cache': os.path.join('proxy-certs', 'pywb-ca.pem'),
'ca_name': 'pywb HTTPS Proxy CA',
'coll': 'test',
'recording': False}
assert res.extra_config['proxy'] == exp
def test_proxy_cli_rec(self):
res = wayback(['--proxy', 'test', '--proxy-record'])
assert res.extra_config['proxy']['recording'] == True
assert res.extra_config['collections']['live'] == {'index': '$live', 'use_js_obj_proxy': True}
def test_proxy_cli_err_coll(self):
with pytest.raises(Exception):
res = wayback(['--proxy', 'test/foo'])
def test_all_cli(self):
res = wayback(['--all-coll', 'all'])
assert res.extra_config['collections']['all'] == '$all'
def test_live_all_cli(self):
res = wayback(['--all-coll', 'all', '--live'])
assert res.extra_config['collections'] == {'live': {'index': '$live', 'use_js_obj_proxy': True},
'all': '$all'}

View File

@ -3,11 +3,8 @@ from pywb.warcserver.test.testutils import BaseTestClass, TempDirTests
from .base_config_test import CollsDirMixin from .base_config_test import CollsDirMixin
from pywb.utils.geventserver import GeventServer from pywb.utils.geventserver import GeventServer
from pywb.apps.frontendapp import FrontEndApp from pywb.apps.frontendapp import FrontEndApp
from pywb.apps.cli import wayback
from pywb.manager.manager import main as manager from pywb.manager.manager import main as manager
from mock import patch
import os import os
import requests import requests
import pytest import pytest
@ -105,29 +102,3 @@ class TestRecordingProxy(CollsDirMixin, BaseTestProxy):
assert 'is_live = false' in res.text assert 'is_live = false' in res.text
assert 'httpbin(1)' in res.text assert 'httpbin(1)' in res.text
# ============================================================================
def _run_patch(self):
return self
@patch('pywb.apps.cli.ReplayCli.run', _run_patch)
class TestProxyCLIConfig(object):
def test_proxy_cli(self):
res = wayback(['--proxy', 'test'])
exp = {'ca_file_cache': os.path.join('proxy-certs', 'pywb-ca.pem'),
'ca_name': 'pywb HTTPS Proxy CA',
'coll': 'test',
'recording': False}
assert res.extra_config['proxy'] == exp
def test_proxy_cli_rec(self):
res = wayback(['--proxy', 'test', '--proxy-record'])
assert res.extra_config['proxy']['recording'] == True
assert res.extra_config['collections']['live'] == {'index': '$live', 'use_js_obj_proxy': True}
def test_proxy_cli_err_coll(self):
with pytest.raises(Exception):
res = wayback(['--proxy', 'test/foo'])