From b631a24a0eac661972d2735a95d4e4312f1f6642 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 3 Oct 2017 15:31:08 -0700 Subject: [PATCH] config cleanup: - auto/dyn collections: use overridable 'index_paths' and 'archive_paths', support list for archive_paths - all-auto collection: supported at warcserver layer via special '$all' index - cleanup default_config.yaml and config.yaml, remove obsolete properties - remove obsolete docker-compose.yml - default_config: simplify list of managed properties - test_cli: add tests for cli options --- config.yaml | 121 ++---------------------------- docker-compose.yml | 19 ----- pywb/apps/cli.py | 4 +- pywb/apps/frontendapp.py | 17 ++--- pywb/default_config.yaml | 72 +++++++----------- pywb/manager/manager.py | 48 ++++-------- pywb/warcserver/basewarcserver.py | 6 +- pywb/warcserver/test/live.ini | 2 +- pywb/warcserver/warcserver.py | 68 +++++++++-------- tests/config_test.yaml | 6 +- tests/config_test_record.yaml | 3 +- tests/test_cli.py | 42 +++++++++++ tests/test_proxy.py | 29 ------- 13 files changed, 144 insertions(+), 293 deletions(-) delete mode 100644 docker-compose.yml create mode 100644 tests/test_cli.py diff --git a/config.yaml b/config.yaml index 752c8a12..bb0b27b5 100644 --- a/config.yaml +++ b/config.yaml @@ -1,123 +1,14 @@ # pywb config file # ======================================== # -# Settings for each collection -use_js_obj_proxy: true - collections: - # : - # collection will be accessed via / - # is a string or list of: - # - string or list of one or more local .cdx file - # - string or list of one or more local dirs with .cdx files - # - a string value indicating remote http cdx server - pywb: ./sample_archive/cdx/ + pywb: + index: ./sample_archive/cdx/ + resource: ./sample_archive/warcs/ - # ex with filtering: filter CDX lines by filename starting with 'dupe' - #pywb-filt: {'index_paths': './sample_archive/cdx/', 'filters': ['filename:dupe*']} - -# indicate if cdx files are sorted by SURT keys -- 
eg: com,example)/ -# SURT keys are recommended for future indices, but non-SURT cdxs -# are also supported -# -# * Set to true if cdxs start with surts: com,example)/ -# * Set to false if cdx start with urls: example.com)/ -# -# default: -# surt_ordered: true - -# list of paths prefixes for pywb look to 'resolve' WARC and ARC filenames -# in the cdx to their absolute path -# -# if path is: -# * local dir, use path as prefix -# * local file, lookup prefix in tab-delimited sorted index -# * http:// path, use path as remote prefix -# * redis:// path, use redis to lookup full path for w: as key - -archive_paths: ./sample_archive/warcs/ - -# ==== Proxy Mode ==== -# Enable simple http proxy mode -enable_http_proxy: true - -# Additional proxy options (defaults) -# proxy_options: -# enable HTTPS proxy support (requires openssl library) -# enable_https_proxy: false -# -# use cookies to switch collections and capture times -# if not enabled, requires use of proxy auth -# cookie_resolver: true -# -# default collection to start out in proxy mode -# if not set, will ask the first time -# use_default_coll: pywb - -# use wildcard certificates when creating certs in proxy mode -# helps lower number of certs created, by may not be compatible -# with older libraries -# use_wildcard_certs: true - -# if true, will not add any banner to proxy mode replay -# unaltered_replay: false - -# Default settings for CA used by proxy mode: -# root_ca_file: ./ca/pywb-ca.pem -# root_ca_name: pywb https proxy replay CA -# certs_dir: ./ca/certs - -# ==== UI: HTML/Jinja2 Templates ==== - -# The following are default settings -- uncomment to change -# Set to '' to disable the ui - -# template for insert into replayed html content -#head_insert_html: ui/head_insert.html -# -# -# template for just the banner modifications -# set to False to disable completely -#banner_html: banner.html - -# template to for 'calendar' query, -# eg, a listing of captures in response to a ../*/ -# -# may be a simple listing 
or a more complex 'calendar' UI -# if omitted, will list raw cdx in plain text -#query_html: ui/query.html - -# template for search page, which is displayed when no search url is entered -# in a collection -#search_html: ui/search.html - -# template for home page. -# if no other route is set, this will be rendered at /, /index.htm and /index.html -#home_html: ui/index.html - - -# error page temlpate for may formatting error message and details -# if omitted, a text response is returned -#error_html: ui/error.html - -# ==== Other Paths ==== - -# Rewrite urls with absolute paths instead of relative -#absoulte_paths: true - -# List of route names: -# : -# default route static/__pywb for pywb bundled static files -#static_routes: -# static/__pywb: pywb/static/ - -# enable cdx server api for querying cdx directly (experimental) -enable_cdx_api: true - -# custom rules for domain specific matching -# set to false to disable -#domain_specific_rules: rules.yaml +# Settings for each collection +use_js_obj_proxy: true # Memento support, enable enable_memento: true @@ -125,5 +16,3 @@ enable_memento: true # Replay content in an iframe framed_replay: true -# debugging utility -- echo request data -# debug_echo_env: false diff --git a/docker-compose.yml b/docker-compose.yml deleted file mode 100644 index 463ec243..00000000 --- a/docker-compose.yml +++ /dev/null @@ -1,19 +0,0 @@ -version: '2' - -services: - proxy: - build: ./proxy/ - links: - - webagg:webagg - - environment: - - "WEBAGG=http://webrecplatform_webagg_1:8080" - - ports: - - 9080:9080 - - volumes: - - ${HOME}/.mitmproxy/:/root/.mitmproxy/ - - webagg: - build: ./webagg/ diff --git a/pywb/apps/cli.py b/pywb/apps/cli.py index 9c4b80c5..4abce689 100644 --- a/pywb/apps/cli.py +++ b/pywb/apps/cli.py @@ -98,7 +98,9 @@ class ReplayCli(BaseCli): super(ReplayCli, self).load() if self.r.all_coll: - self.extra_config['all_coll'] = self.r.all_coll + if 'collections' not in self.extra_config: + self.extra_config['collections'] = {} 
+ self.extra_config['collections'][self.r.all_coll] = '$all' import os if self.r.directory: #pragma: no cover diff --git a/pywb/apps/frontendapp.py b/pywb/apps/frontendapp.py index 0ee86397..b7255939 100644 --- a/pywb/apps/frontendapp.py +++ b/pywb/apps/frontendapp.py @@ -44,7 +44,6 @@ class FrontEndApp(object): PROXY_CA_PATH = os.path.join('proxy-certs', 'pywb-ca.pem') def __init__(self, config_file='./config.yaml', custom_config=None): - print('CUSTOM', custom_config) self.handler = self.handle_request self.warcserver = WarcServer(config_file=config_file, custom_config=custom_config) @@ -59,7 +58,7 @@ class FrontEndApp(object): self.init_recorder(config.get('recorder')) - static_path = config.get('static_path', 'pywb/static/').replace('/', os.path.sep) + static_path = config.get('static_url_path', 'pywb/static/').replace('/', os.path.sep) self.static_handler = StaticHandler(static_path) self.all_coll = config.get('all_coll', None) @@ -125,7 +124,7 @@ class FrontEndApp(object): # TODO: support dedup dedup_index = None - warc_writer = MultiFileWARCWriter(self.warcserver.archive_templ, + warc_writer = MultiFileWARCWriter(self.warcserver.archive_paths, max_size=int(recorder_config.get('max_size', 1000000000)), max_idle_secs=int(recorder_config.get('max_idle_secs', 600)), filename_template=recorder_config.get('filename_template'), @@ -166,8 +165,8 @@ class FrontEndApp(object): self.raise_not_found(environ, 'Static File Not Found: {0}'.format(filepath)) def get_metadata(self, coll): - if coll == self.all_coll: - coll = '*' + #if coll == self.all_coll: + # coll = '*' metadata = {'coll': coll, 'type': 'replay'} @@ -205,8 +204,8 @@ class FrontEndApp(object): def serve_cdx(self, environ, coll='$root'): base_url = self.rewriterapp.paths['cdx-server'] - if coll == self.all_coll: - coll = '*' + #if coll == self.all_coll: + # coll = '*' cdx_url = base_url.format(coll=coll) @@ -280,8 +279,8 @@ class FrontEndApp(object): return WbResponse.json_response(result) def 
is_valid_coll(self, coll): - if coll == self.all_coll: - return True + #if coll == self.all_coll: + # return True return (coll in self.warcserver.list_fixed_routes() or coll in self.warcserver.list_dynamic_routes()) diff --git a/pywb/default_config.yaml b/pywb/default_config.yaml index d8fa5f45..812ac9b3 100644 --- a/pywb/default_config.yaml +++ b/pywb/default_config.yaml @@ -1,68 +1,48 @@ collections_root: collections -paths: - archive_paths: archive - index_paths: indexes - static_path: static +# Per-Collection Paths +archive_paths: archive +index_paths: indexes +static_path: static - templates_dir: templates - - template_files: - banner_html: banner.html - head_insert_html: head_insert.html - frame_insert_html: frame_insert.html - - query_html: query.html - search_html: search.html - not_found_html: not_found.html - - shared_template_files: - home_html: index.html - error_html: error.html - - proxy_cert_download_html: proxy_cert_download.html - proxy_select_html: proxy_select.html - - info_json: collinfo.json - -templates_dirs: - - templates - - '.' 
- - '/' - -template_packages: - - pywb +templates_dir: templates +# Template HTML +banner_html: banner.html head_insert_html: head_insert.html frame_insert_html: frame_insert.html -banner_html: banner.html - -#archive_paths: ./ - -home_html: index.html query_html: query.html search_html: search.html - -error_html: error.html not_found_html: not_found.html +home_html: index.html +error_html: error.html + proxy_cert_download_html: proxy_cert_download.html proxy_select_html: proxy_select.html +# Info JSON info_json: collinfo.json -static_default_prefix: &static_default_prefix static/__pywb -static_shared_prefix: static/__shared +# HTML Templates List +html_templates: + - banner_html + - head_insert_html + - frame_insert_html -template_globals: - static_path: *static_default_prefix + - query_html + - search_html + - not_found_html -static_routes: - *static_default_prefix: pywb/static/ + - home_html + - error_html + - proxy_cert_download_html + - proxy_select_html +# Other Settings enable_memento: true -domain_specific_rules: pywb/rules.yaml +rules_config: pkg://pywb/rules.yaml + -framed_replay: inverse diff --git a/pywb/manager/manager.py b/pywb/manager/manager.py index d2e0e62a..2a0776ab 100644 --- a/pywb/manager/manager.py +++ b/pywb/manager/manager.py @@ -74,11 +74,11 @@ directory structure expected by pywb def _get_root_dir(self, name): return os.path.join(os.getcwd(), - self.default_config['paths'][name]) + self.default_config[name]) def _get_dir(self, name): return os.path.join(self.curr_coll_dir, - self.default_config['paths'][name]) + self.default_config[name]) def _create_dir(self, dirname): if not os.path.isdir(dirname): @@ -208,33 +208,22 @@ directory structure expected by pywb def _load_templates_map(self): defaults = load_yaml_config(DEFAULT_CONFIG) - temp_dir = defaults['paths']['templates_dir'] + temp_dir = defaults['templates_dir'] # Coll Templates - templates = defaults['paths']['template_files'] + templates = defaults['html_templates'] - for name, _ 
in six.iteritems(templates): - templates[name] = os.path.join(temp_dir, defaults[name]) + for name in templates: + defaults[name] = os.path.join(temp_dir, defaults[name]) - # Shared Templates - shared_templates = defaults['paths']['shared_template_files'] - - for name, _ in six.iteritems(shared_templates): - shared_templates[name] = os.path.join(temp_dir, defaults[name]) - - return templates, shared_templates + return defaults, templates def list_templates(self): - templates, shared_templates = self._load_templates_map() + defaults, templates = self._load_templates_map() - print('Shared Templates') - for n, v in six.iteritems(shared_templates): - print('- {0}: (pywb/{1})'.format(n, v)) - - print('') - - print('Collection Templates') - for n, v in six.iteritems(templates): + print('HTML Shared and Per-Collection Templates') + for n in templates: + v = defaults[n] print('- {0}: (pywb/{1})'.format(n, v)) def _confirm_overwrite(self, full_path, msg): @@ -251,10 +240,10 @@ directory structure expected by pywb raise IOError('Skipping, {0} already exists'.format(full_path)) def _get_template_path(self, template_name, verb): - templates, shared_templates = self._load_templates_map() + defaults, templates = self._load_templates_map() try: - filename = templates[template_name] + filename = defaults[template_name] if not self.coll_name: full_path = os.path.join(os.getcwd(), filename) else: @@ -262,14 +251,9 @@ directory structure expected by pywb os.path.basename(filename)) except KeyError: - try: - filename = shared_templates[template_name] - full_path = os.path.join(os.getcwd(), filename) - - except KeyError: - msg = 'template name must be one of {0} or {1}' - msg = msg.format(templates.keys(), shared_templates.keys()) - raise KeyError(msg) + msg = 'template name must be one of {0}' + msg = msg.format(templates) + raise KeyError(msg) return full_path, filename diff --git a/pywb/warcserver/basewarcserver.py b/pywb/warcserver/basewarcserver.py index 5b159aaf..c9aa8819 100644 
--- a/pywb/warcserver/basewarcserver.py +++ b/pywb/warcserver/basewarcserver.py @@ -26,8 +26,8 @@ class BaseWarcServer(object): self.url_map.add(Rule('/', endpoint=list_routes)) - def add_route(self, path, handler, path_param_name=''): - def direct_input_request(environ, mode='', path_param_value=''): + def add_route(self, path, handler, path_param_name='', default_value=''): + def direct_input_request(environ, mode='', path_param_value=default_value): params = self.get_query_dict(environ) params['mode'] = mode if path_param_value: @@ -35,7 +35,7 @@ class BaseWarcServer(object): params['_input_req'] = DirectWSGIInputRequest(environ) return handler(params) - def post_fullrequest(environ, mode='', path_param_value=''): + def post_fullrequest(environ, mode='', path_param_value=default_value): params = self.get_query_dict(environ) params['mode'] = mode if path_param_value: diff --git a/pywb/warcserver/test/live.ini b/pywb/warcserver/test/live.ini index f63d5896..558ae0d8 100644 --- a/pywb/warcserver/test/live.ini +++ b/pywb/warcserver/test/live.ini @@ -14,4 +14,4 @@ endif = gevent = 100 gevent-monkey-patch = -wsgi = webagg.test.live +wsgi = warcserver.test.live diff --git a/pywb/warcserver/warcserver.py b/pywb/warcserver/warcserver.py index f416690a..f9df3cb2 100644 --- a/pywb/warcserver/warcserver.py +++ b/pywb/warcserver/warcserver.py @@ -32,8 +32,7 @@ SOURCE_LIST = [LiveIndexSource, # ============================================================================ class WarcServer(BaseWarcServer): - AUTO_DIR_INDEX_PATH = '{coll}/indexes/' - AUTO_DIR_ARCHIVE_PATH = '{coll}/archive/' + AUTO_COLL_TEMPL = '{coll}' def __init__(self, config_file='./config.yaml', custom_config=None): config = load_yaml_config(DEFAULT_CONFIG) @@ -55,47 +54,49 @@ class WarcServer(BaseWarcServer): super(WarcServer, self).__init__(debug=config.get('debug', False)) self.config = config - self.fixed_routes = self.load_colls() + self.root_dir = self.config.get('collections_root', '') + 
self.index_paths = self.init_paths('index_paths') + self.archive_paths = self.init_paths('archive_paths', self.root_dir) - self.archive_templ = None - self.indexes_templ = None - - for name, route in iteritems(self.fixed_routes): - self.add_route('/' + name, route) + self.auto_handler = None if self.config.get('enable_auto_colls', True): - auto_handler = self.load_auto_colls() - self.add_route('/', auto_handler, path_param_name='param.coll') + self.auto_handler = self.load_auto_colls() - def _lookup(self, environ, path): - urls = self.url_map.bind(environ['HTTP_HOST'], path_info=path) + self.fixed_routes = self.load_colls() - try: - endpoint, args = urls.match() - result = endpoint(environ, **args) - return result - except Exception as e: - print(e) - return None + for name, route in iteritems(self.fixed_routes): + if route == self.auto_handler: + self.add_route('/' + name, route, path_param_name='param.coll', default_value='*') + else: + self.add_route('/' + name, route) + + if self.auto_handler: + self.add_route('/', self.auto_handler, path_param_name='param.coll') + + def init_paths(self, name, abs_path=None): + templ = self.config.get(name) + + def get_full_path(path): + path = os.path.join(self.AUTO_COLL_TEMPL, path, '') + if abs_path and '://' not in path: + path = os.path.join(abs_path, path) + return path + + if isinstance(templ, str): + return get_full_path(templ) + else: + return [get_full_path(t) for t in templ] def load_auto_colls(self): - self.root_dir = self.config.get('collections_root', '') if not self.root_dir: print('No Root Dir, Skip Auto Colls!') return - self.indexes_templ = self.config.get('dyn_index_path', self.AUTO_DIR_INDEX_PATH).replace('/', os.path.sep) - dir_source = CacheDirectoryIndexSource(base_prefix=self.root_dir, - base_dir=self.indexes_templ) + base_dir=self.index_paths) - self.archive_templ = self.config.get('dyn_archive_path', self.AUTO_DIR_ARCHIVE_PATH).replace('/', os.path.sep) - if '://' not in self.archive_templ: - 
self.archive_templ = os.path.join(self.root_dir, self.archive_templ) - - handler = DefaultResourceHandler(dir_source, self.archive_templ) - - return handler + return DefaultResourceHandler(dir_source, self.archive_paths) def list_fixed_routes(self): return list(self.fixed_routes.keys()) @@ -126,8 +127,6 @@ class WarcServer(BaseWarcServer): if not colls: return routes - self.default_archive_paths = self.config.get('archive_paths') - for name, coll_config in iteritems(colls): try: handler = self.load_coll(name, coll_config) @@ -143,6 +142,9 @@ class WarcServer(BaseWarcServer): return routes def load_coll(self, name, coll_config): + if coll_config == '$all' and self.auto_handler: + return self.auto_handler + if isinstance(coll_config, str): index = coll_config resource = None @@ -176,7 +178,7 @@ class WarcServer(BaseWarcServer): agg = init_index_agg(index_group, True, timeout) if not resource: - resource = self.default_archive_paths + resource = self.config.get('archive_paths') return DefaultResourceHandler(agg, resource) diff --git a/tests/config_test.yaml b/tests/config_test.yaml index 2fb8f04b..ca67e2e2 100644 --- a/tests/config_test.yaml +++ b/tests/config_test.yaml @@ -5,7 +5,9 @@ debug: true collections_root: _test_colls collections: - pywb: ./sample_archive/cdx/ + pywb: + index: ./sample_archive/cdx/ + resource: ./sample_archive/warcs/ with-js-proxy: index: ./sample_archive/cdx/ @@ -31,10 +33,10 @@ collections: pywb-cdxj: index_paths: ./sample_archive/cdxj/ - archive_paths: - ./invalid/path/to/ignore/ - ./sample_archive/warcs/ + - archive enable_memento: true diff --git a/tests/config_test_record.yaml b/tests/config_test_record.yaml index a98034db..9f4050a4 100644 --- a/tests/config_test_record.yaml +++ b/tests/config_test_record.yaml @@ -2,11 +2,10 @@ debug: true collections_root: _test_colls -all_coll: all - recorder: live collections: 'live': '$live' + 'all': '$all' diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 
00000000..4437b6fd --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,42 @@ +import os +from mock import patch + +import pytest + +from pywb.apps.cli import wayback +from .base_config_test import CollsDirMixin, BaseTestClass + + +# ============================================================================ +def _run_patch(self): + return self + + +@patch('pywb.apps.cli.ReplayCli.run', _run_patch) +class TestProxyCLIConfig(CollsDirMixin, BaseTestClass): + def test_proxy_cli(self): + res = wayback(['--proxy', 'test']) + exp = {'ca_file_cache': os.path.join('proxy-certs', 'pywb-ca.pem'), + 'ca_name': 'pywb HTTPS Proxy CA', + 'coll': 'test', + 'recording': False} + assert res.extra_config['proxy'] == exp + + def test_proxy_cli_rec(self): + res = wayback(['--proxy', 'test', '--proxy-record']) + assert res.extra_config['proxy']['recording'] == True + assert res.extra_config['collections']['live'] == {'index': '$live', 'use_js_obj_proxy': True} + + def test_proxy_cli_err_coll(self): + with pytest.raises(Exception): + res = wayback(['--proxy', 'test/foo']) + + def test_all_cli(self): + res = wayback(['--all-coll', 'all']) + assert res.extra_config['collections']['all'] == '$all' + + def test_live_all_cli(self): + res = wayback(['--all-coll', 'all', '--live']) + assert res.extra_config['collections'] == {'live': {'index': '$live', 'use_js_obj_proxy': True}, + 'all': '$all'} + diff --git a/tests/test_proxy.py b/tests/test_proxy.py index 500572a5..9cc68e67 100644 --- a/tests/test_proxy.py +++ b/tests/test_proxy.py @@ -3,11 +3,8 @@ from pywb.warcserver.test.testutils import BaseTestClass, TempDirTests from .base_config_test import CollsDirMixin from pywb.utils.geventserver import GeventServer from pywb.apps.frontendapp import FrontEndApp -from pywb.apps.cli import wayback from pywb.manager.manager import main as manager -from mock import patch - import os import requests import pytest @@ -105,29 +102,3 @@ class TestRecordingProxy(CollsDirMixin, BaseTestProxy): assert 'is_live = 
false' in res.text assert 'httpbin(1)' in res.text - -# ============================================================================ -def _run_patch(self): - return self - - -@patch('pywb.apps.cli.ReplayCli.run', _run_patch) -class TestProxyCLIConfig(object): - def test_proxy_cli(self): - res = wayback(['--proxy', 'test']) - exp = {'ca_file_cache': os.path.join('proxy-certs', 'pywb-ca.pem'), - 'ca_name': 'pywb HTTPS Proxy CA', - 'coll': 'test', - 'recording': False} - assert res.extra_config['proxy'] == exp - - def test_proxy_cli_rec(self): - res = wayback(['--proxy', 'test', '--proxy-record']) - assert res.extra_config['proxy']['recording'] == True - assert res.extra_config['collections']['live'] == {'index': '$live', 'use_js_obj_proxy': True} - - def test_proxy_cli_err_coll(self): - with pytest.raises(Exception): - res = wayback(['--proxy', 'test/foo']) - -