1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

config cleanup:

- auto/dyn collections: use overridable 'index_paths' and 'archive_paths', support list for archive_paths
- all-auto collection: supported at warcserver layer via special '$all' index
- cleanup default_config.yaml and config.yaml, remove obsolete properties
- remove obsolete docker-compose.yaml
- default_config: simplify list of managed properties
- test_cli: add tests for cli options
This commit is contained in:
Ilya Kreymer 2017-10-03 15:31:08 -07:00
parent 16ede7abbb
commit b631a24a0e
13 changed files with 144 additions and 293 deletions

View File

@ -1,123 +1,14 @@
# pywb config file
# ========================================
#
# Settings for each collection
use_js_obj_proxy: true
collections:
# <name>: <cdx_path>
# collection will be accessed via /<name>
# <cdx_path> is a string or list of:
# - string or list of one or more local .cdx file
# - string or list of one or more local dirs with .cdx files
# - a string value indicating remote http cdx server
pywb: ./sample_archive/cdx/
pywb:
index: ./sample_archive/cdx/
resource: ./sample_archive/warcs/
# ex with filtering: filter CDX lines by filename starting with 'dupe'
#pywb-filt: {'index_paths': './sample_archive/cdx/', 'filters': ['filename:dupe*']}
# indicate if cdx files are sorted by SURT keys -- eg: com,example)/
# SURT keys are recommended for future indices, but non-SURT cdxs
# are also supported
#
# * Set to true if cdxs start with surts: com,example)/
# * Set to false if cdx start with urls: example.com)/
#
# default:
# surt_ordered: true
# list of path prefixes that pywb uses to 'resolve' WARC and ARC filenames
# in the cdx to their absolute path
#
# if path is:
# * local dir, use path as prefix
# * local file, lookup prefix in tab-delimited sorted index
# * http:// path, use path as remote prefix
# * redis:// path, use redis to lookup full path for w:<warc> as key
archive_paths: ./sample_archive/warcs/
# ==== Proxy Mode ====
# Enable simple http proxy mode
enable_http_proxy: true
# Additional proxy options (defaults)
# proxy_options:
# enable HTTPS proxy support (requires openssl library)
# enable_https_proxy: false
#
# use cookies to switch collections and capture times
# if not enabled, requires use of proxy auth
# cookie_resolver: true
#
# default collection to start out in proxy mode
# if not set, will ask the first time
# use_default_coll: pywb
# use wildcard certificates when creating certs in proxy mode
# helps lower the number of certs created, but may not be compatible
# with older libraries
# use_wildcard_certs: true
# if true, will not add any banner to proxy mode replay
# unaltered_replay: false
# Default settings for CA used by proxy mode:
# root_ca_file: ./ca/pywb-ca.pem
# root_ca_name: pywb https proxy replay CA
# certs_dir: ./ca/certs
# ==== UI: HTML/Jinja2 Templates ====
# The following are default settings -- uncomment to change
# Set to '' to disable the ui
# template for <head> insert into replayed html content
#head_insert_html: ui/head_insert.html
#
#
# template for just the banner modifications
# set to False to disable completely
#banner_html: banner.html
# template for 'calendar' query,
# eg, a listing of captures in response to a ../*/<url>
#
# may be a simple listing or a more complex 'calendar' UI
# if omitted, will list raw cdx in plain text
#query_html: ui/query.html
# template for search page, which is displayed when no search url is entered
# in a collection
#search_html: ui/search.html
# template for home page.
# if no other route is set, this will be rendered at /, /index.htm and /index.html
#home_html: ui/index.html
# error page template for formatting the error message and details
# if omitted, a text response is returned
#error_html: ui/error.html
# ==== Other Paths ====
# Rewrite urls with absolute paths instead of relative
#absoulte_paths: true
# List of route names:
# <route>: <package or file path>
# default route static/__pywb for pywb bundled static files
#static_routes:
# static/__pywb: pywb/static/
# enable cdx server api for querying cdx directly (experimental)
enable_cdx_api: true
# custom rules for domain specific matching
# set to false to disable
#domain_specific_rules: rules.yaml
# Settings for each collection
use_js_obj_proxy: true
# Memento support, enable
enable_memento: true
@ -125,5 +16,3 @@ enable_memento: true
# Replay content in an iframe
framed_replay: true
# debugging utility -- echo request data
# debug_echo_env: false

View File

@ -1,19 +0,0 @@
version: '2'
services:
proxy:
build: ./proxy/
links:
- webagg:webagg
environment:
- "WEBAGG=http://webrecplatform_webagg_1:8080"
ports:
- 9080:9080
volumes:
- ${HOME}/.mitmproxy/:/root/.mitmproxy/
webagg:
build: ./webagg/

View File

@ -98,7 +98,9 @@ class ReplayCli(BaseCli):
super(ReplayCli, self).load()
if self.r.all_coll:
self.extra_config['all_coll'] = self.r.all_coll
if 'collections' not in self.extra_config:
self.extra_config['collections'] = {}
self.extra_config['collections'][self.r.all_coll] = '$all'
import os
if self.r.directory: #pragma: no cover

View File

@ -44,7 +44,6 @@ class FrontEndApp(object):
PROXY_CA_PATH = os.path.join('proxy-certs', 'pywb-ca.pem')
def __init__(self, config_file='./config.yaml', custom_config=None):
print('CUSTOM', custom_config)
self.handler = self.handle_request
self.warcserver = WarcServer(config_file=config_file,
custom_config=custom_config)
@ -59,7 +58,7 @@ class FrontEndApp(object):
self.init_recorder(config.get('recorder'))
static_path = config.get('static_path', 'pywb/static/').replace('/', os.path.sep)
static_path = config.get('static_url_path', 'pywb/static/').replace('/', os.path.sep)
self.static_handler = StaticHandler(static_path)
self.all_coll = config.get('all_coll', None)
@ -125,7 +124,7 @@ class FrontEndApp(object):
# TODO: support dedup
dedup_index = None
warc_writer = MultiFileWARCWriter(self.warcserver.archive_templ,
warc_writer = MultiFileWARCWriter(self.warcserver.archive_paths,
max_size=int(recorder_config.get('max_size', 1000000000)),
max_idle_secs=int(recorder_config.get('max_idle_secs', 600)),
filename_template=recorder_config.get('filename_template'),
@ -166,8 +165,8 @@ class FrontEndApp(object):
self.raise_not_found(environ, 'Static File Not Found: {0}'.format(filepath))
def get_metadata(self, coll):
if coll == self.all_coll:
coll = '*'
#if coll == self.all_coll:
# coll = '*'
metadata = {'coll': coll,
'type': 'replay'}
@ -205,8 +204,8 @@ class FrontEndApp(object):
def serve_cdx(self, environ, coll='$root'):
base_url = self.rewriterapp.paths['cdx-server']
if coll == self.all_coll:
coll = '*'
#if coll == self.all_coll:
# coll = '*'
cdx_url = base_url.format(coll=coll)
@ -280,8 +279,8 @@ class FrontEndApp(object):
return WbResponse.json_response(result)
def is_valid_coll(self, coll):
if coll == self.all_coll:
return True
#if coll == self.all_coll:
# return True
return (coll in self.warcserver.list_fixed_routes() or
coll in self.warcserver.list_dynamic_routes())

View File

@ -1,68 +1,48 @@
collections_root: collections
paths:
archive_paths: archive
index_paths: indexes
static_path: static
# Per-Collection Paths
archive_paths: archive
index_paths: indexes
static_path: static
templates_dir: templates
template_files:
banner_html: banner.html
head_insert_html: head_insert.html
frame_insert_html: frame_insert.html
query_html: query.html
search_html: search.html
not_found_html: not_found.html
shared_template_files:
home_html: index.html
error_html: error.html
proxy_cert_download_html: proxy_cert_download.html
proxy_select_html: proxy_select.html
info_json: collinfo.json
templates_dirs:
- templates
- '.'
- '/'
template_packages:
- pywb
templates_dir: templates
# Template HTML
banner_html: banner.html
head_insert_html: head_insert.html
frame_insert_html: frame_insert.html
banner_html: banner.html
#archive_paths: ./
home_html: index.html
query_html: query.html
search_html: search.html
error_html: error.html
not_found_html: not_found.html
home_html: index.html
error_html: error.html
proxy_cert_download_html: proxy_cert_download.html
proxy_select_html: proxy_select.html
# Info JSON
info_json: collinfo.json
static_default_prefix: &static_default_prefix static/__pywb
static_shared_prefix: static/__shared
# HTML Templates List
html_templates:
- banner_html
- head_insert_html
- frame_insert_html
template_globals:
static_path: *static_default_prefix
- query_html
- search_html
- not_found_html
static_routes:
*static_default_prefix: pywb/static/
- home_html
- error_html
- proxy_cert_download_html
- proxy_select_html
# Other Settings
enable_memento: true
domain_specific_rules: pywb/rules.yaml
rules_config: pkg://pywb/rules.yaml
framed_replay: inverse

View File

@ -74,11 +74,11 @@ directory structure expected by pywb
def _get_root_dir(self, name):
return os.path.join(os.getcwd(),
self.default_config['paths'][name])
self.default_config[name])
def _get_dir(self, name):
return os.path.join(self.curr_coll_dir,
self.default_config['paths'][name])
self.default_config[name])
def _create_dir(self, dirname):
if not os.path.isdir(dirname):
@ -208,33 +208,22 @@ directory structure expected by pywb
def _load_templates_map(self):
defaults = load_yaml_config(DEFAULT_CONFIG)
temp_dir = defaults['paths']['templates_dir']
temp_dir = defaults['templates_dir']
# Coll Templates
templates = defaults['paths']['template_files']
templates = defaults['html_templates']
for name, _ in six.iteritems(templates):
templates[name] = os.path.join(temp_dir, defaults[name])
for name in templates:
defaults[name] = os.path.join(temp_dir, defaults[name])
# Shared Templates
shared_templates = defaults['paths']['shared_template_files']
for name, _ in six.iteritems(shared_templates):
shared_templates[name] = os.path.join(temp_dir, defaults[name])
return templates, shared_templates
return defaults, templates
def list_templates(self):
templates, shared_templates = self._load_templates_map()
defaults, templates = self._load_templates_map()
print('Shared Templates')
for n, v in six.iteritems(shared_templates):
print('- {0}: (pywb/{1})'.format(n, v))
print('')
print('Collection Templates')
for n, v in six.iteritems(templates):
print('HTML Shared and Per-Collection Templates')
for n in templates:
v = defaults[n]
print('- {0}: (pywb/{1})'.format(n, v))
def _confirm_overwrite(self, full_path, msg):
@ -251,10 +240,10 @@ directory structure expected by pywb
raise IOError('Skipping, {0} already exists'.format(full_path))
def _get_template_path(self, template_name, verb):
templates, shared_templates = self._load_templates_map()
defaults, templates = self._load_templates_map()
try:
filename = templates[template_name]
filename = defaults[template_name]
if not self.coll_name:
full_path = os.path.join(os.getcwd(), filename)
else:
@ -262,14 +251,9 @@ directory structure expected by pywb
os.path.basename(filename))
except KeyError:
try:
filename = shared_templates[template_name]
full_path = os.path.join(os.getcwd(), filename)
except KeyError:
msg = 'template name must be one of {0} or {1}'
msg = msg.format(templates.keys(), shared_templates.keys())
raise KeyError(msg)
msg = 'template name must be one of {0}'
msg = msg.format(templates)
raise KeyError(msg)
return full_path, filename

View File

@ -26,8 +26,8 @@ class BaseWarcServer(object):
self.url_map.add(Rule('/', endpoint=list_routes))
def add_route(self, path, handler, path_param_name=''):
def direct_input_request(environ, mode='', path_param_value=''):
def add_route(self, path, handler, path_param_name='', default_value=''):
def direct_input_request(environ, mode='', path_param_value=default_value):
params = self.get_query_dict(environ)
params['mode'] = mode
if path_param_value:
@ -35,7 +35,7 @@ class BaseWarcServer(object):
params['_input_req'] = DirectWSGIInputRequest(environ)
return handler(params)
def post_fullrequest(environ, mode='', path_param_value=''):
def post_fullrequest(environ, mode='', path_param_value=default_value):
params = self.get_query_dict(environ)
params['mode'] = mode
if path_param_value:

View File

@ -14,4 +14,4 @@ endif =
gevent = 100
gevent-monkey-patch =
wsgi = webagg.test.live
wsgi = warcserver.test.live

View File

@ -32,8 +32,7 @@ SOURCE_LIST = [LiveIndexSource,
# ============================================================================
class WarcServer(BaseWarcServer):
AUTO_DIR_INDEX_PATH = '{coll}/indexes/'
AUTO_DIR_ARCHIVE_PATH = '{coll}/archive/'
AUTO_COLL_TEMPL = '{coll}'
def __init__(self, config_file='./config.yaml', custom_config=None):
config = load_yaml_config(DEFAULT_CONFIG)
@ -55,47 +54,49 @@ class WarcServer(BaseWarcServer):
super(WarcServer, self).__init__(debug=config.get('debug', False))
self.config = config
self.fixed_routes = self.load_colls()
self.root_dir = self.config.get('collections_root', '')
self.index_paths = self.init_paths('index_paths')
self.archive_paths = self.init_paths('archive_paths', self.root_dir)
self.archive_templ = None
self.indexes_templ = None
for name, route in iteritems(self.fixed_routes):
self.add_route('/' + name, route)
self.auto_handler = None
if self.config.get('enable_auto_colls', True):
auto_handler = self.load_auto_colls()
self.add_route('/<path:path_param_value>', auto_handler, path_param_name='param.coll')
self.auto_handler = self.load_auto_colls()
def _lookup(self, environ, path):
urls = self.url_map.bind(environ['HTTP_HOST'], path_info=path)
self.fixed_routes = self.load_colls()
try:
endpoint, args = urls.match()
result = endpoint(environ, **args)
return result
except Exception as e:
print(e)
return None
for name, route in iteritems(self.fixed_routes):
if route == self.auto_handler:
self.add_route('/' + name, route, path_param_name='param.coll', default_value='*')
else:
self.add_route('/' + name, route)
if self.auto_handler:
self.add_route('/<path:path_param_value>', self.auto_handler, path_param_name='param.coll')
def init_paths(self, name, abs_path=None):
templ = self.config.get(name)
def get_full_path(path):
path = os.path.join(self.AUTO_COLL_TEMPL, path, '')
if abs_path and '://' not in path:
path = os.path.join(abs_path, path)
return path
if isinstance(templ, str):
return get_full_path(templ)
else:
return [get_full_path(t) for t in templ]
def load_auto_colls(self):
self.root_dir = self.config.get('collections_root', '')
if not self.root_dir:
print('No Root Dir, Skip Auto Colls!')
return
self.indexes_templ = self.config.get('dyn_index_path', self.AUTO_DIR_INDEX_PATH).replace('/', os.path.sep)
dir_source = CacheDirectoryIndexSource(base_prefix=self.root_dir,
base_dir=self.indexes_templ)
base_dir=self.index_paths)
self.archive_templ = self.config.get('dyn_archive_path', self.AUTO_DIR_ARCHIVE_PATH).replace('/', os.path.sep)
if '://' not in self.archive_templ:
self.archive_templ = os.path.join(self.root_dir, self.archive_templ)
handler = DefaultResourceHandler(dir_source, self.archive_templ)
return handler
return DefaultResourceHandler(dir_source, self.archive_paths)
def list_fixed_routes(self):
return list(self.fixed_routes.keys())
@ -126,8 +127,6 @@ class WarcServer(BaseWarcServer):
if not colls:
return routes
self.default_archive_paths = self.config.get('archive_paths')
for name, coll_config in iteritems(colls):
try:
handler = self.load_coll(name, coll_config)
@ -143,6 +142,9 @@ class WarcServer(BaseWarcServer):
return routes
def load_coll(self, name, coll_config):
if coll_config == '$all' and self.auto_handler:
return self.auto_handler
if isinstance(coll_config, str):
index = coll_config
resource = None
@ -176,7 +178,7 @@ class WarcServer(BaseWarcServer):
agg = init_index_agg(index_group, True, timeout)
if not resource:
resource = self.default_archive_paths
resource = self.config.get('archive_paths')
return DefaultResourceHandler(agg, resource)

View File

@ -5,7 +5,9 @@ debug: true
collections_root: _test_colls
collections:
pywb: ./sample_archive/cdx/
pywb:
index: ./sample_archive/cdx/
resource: ./sample_archive/warcs/
with-js-proxy:
index: ./sample_archive/cdx/
@ -31,10 +33,10 @@ collections:
pywb-cdxj:
index_paths: ./sample_archive/cdxj/
archive_paths:
- ./invalid/path/to/ignore/
- ./sample_archive/warcs/
- archive
enable_memento: true

View File

@ -2,11 +2,10 @@ debug: true
collections_root: _test_colls
all_coll: all
recorder: live
collections:
'live': '$live'
'all': '$all'

42
tests/test_cli.py Normal file
View File

@ -0,0 +1,42 @@
import os
from mock import patch
import pytest
from pywb.apps.cli import wayback
from .base_config_test import CollsDirMixin, BaseTestClass
# ============================================================================
def _run_patch(self):
return self
@patch('pywb.apps.cli.ReplayCli.run', _run_patch)
class TestProxyCLIConfig(CollsDirMixin, BaseTestClass):
    """Check the ``extra_config`` produced by the ``wayback`` CLI flags.

    ``ReplayCli.run`` is replaced by ``_run_patch`` for the whole class; the
    assertions below rely on ``wayback(...)`` returning an object that
    exposes ``extra_config``.
    """

    def test_proxy_cli(self):
        """``--proxy <coll>`` yields the default proxy settings, not recording."""
        res = wayback(['--proxy', 'test'])
        exp = {'ca_file_cache': os.path.join('proxy-certs', 'pywb-ca.pem'),
               'ca_name': 'pywb HTTPS Proxy CA',
               'coll': 'test',
               'recording': False}

        assert res.extra_config['proxy'] == exp

    def test_proxy_cli_rec(self):
        """``--proxy-record`` turns recording on and adds a 'live' collection."""
        res = wayback(['--proxy', 'test', '--proxy-record'])

        assert res.extra_config['proxy']['recording'] is True
        assert res.extra_config['collections']['live'] == {'index': '$live', 'use_js_obj_proxy': True}

    def test_proxy_cli_err_coll(self):
        """A proxy collection name containing '/' is rejected with an exception."""
        with pytest.raises(Exception):
            wayback(['--proxy', 'test/foo'])

    def test_all_cli(self):
        """``--all-coll <name>`` maps that collection name to the '$all' index."""
        res = wayback(['--all-coll', 'all'])

        assert res.extra_config['collections']['all'] == '$all'

    def test_live_all_cli(self):
        """``--all-coll`` combined with ``--live`` registers both collections."""
        res = wayback(['--all-coll', 'all', '--live'])

        assert res.extra_config['collections'] == {'live': {'index': '$live', 'use_js_obj_proxy': True},
                                                   'all': '$all'}

View File

@ -3,11 +3,8 @@ from pywb.warcserver.test.testutils import BaseTestClass, TempDirTests
from .base_config_test import CollsDirMixin
from pywb.utils.geventserver import GeventServer
from pywb.apps.frontendapp import FrontEndApp
from pywb.apps.cli import wayback
from pywb.manager.manager import main as manager
from mock import patch
import os
import requests
import pytest
@ -105,29 +102,3 @@ class TestRecordingProxy(CollsDirMixin, BaseTestProxy):
assert 'is_live = false' in res.text
assert 'httpbin(1)' in res.text
# ============================================================================
def _run_patch(self):
return self
@patch('pywb.apps.cli.ReplayCli.run', _run_patch)
class TestProxyCLIConfig(object):
    """Check the proxy-related ``extra_config`` produced by the ``wayback`` CLI.

    ``ReplayCli.run`` is replaced by ``_run_patch`` for the whole class; the
    assertions below rely on ``wayback(...)`` returning an object that
    exposes ``extra_config``.
    """

    def test_proxy_cli(self):
        """``--proxy <coll>`` yields the default proxy settings, not recording."""
        res = wayback(['--proxy', 'test'])
        exp = {'ca_file_cache': os.path.join('proxy-certs', 'pywb-ca.pem'),
               'ca_name': 'pywb HTTPS Proxy CA',
               'coll': 'test',
               'recording': False}

        assert res.extra_config['proxy'] == exp

    def test_proxy_cli_rec(self):
        """``--proxy-record`` turns recording on and adds a 'live' collection."""
        res = wayback(['--proxy', 'test', '--proxy-record'])

        assert res.extra_config['proxy']['recording'] is True
        assert res.extra_config['collections']['live'] == {'index': '$live', 'use_js_obj_proxy': True}

    def test_proxy_cli_err_coll(self):
        """A proxy collection name containing '/' is rejected with an exception."""
        with pytest.raises(Exception):
            wayback(['--proxy', 'test/foo'])