From 1bfba09c94ae1b3834c1da54d1396719a521e56d Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 1 Oct 2017 09:46:54 -0700 Subject: [PATCH] config: proxy and recorder improvements - proxy and recorder config loaded from 'proxy' and 'recorder' string or dicts in config - proxy settings loaded from config, wsgiproxmiddleware applied within main init path - cli --proxy-record add to indicate recording, optional dict to set options - optional recorder dict to configure other recorder options, file max_size, filename_template, etc.. - proxy tests: add proxy cli tests - recorder tests: add recorder custom config test --- pywb/apps/cli.py | 30 ++++---- pywb/apps/frontendapp.py | 134 +++++++++++++++++++++++++----------- tests/base_config_test.py | 12 ++-- tests/test_proxy.py | 48 ++++++++++--- tests/test_record_replay.py | 22 ++++++ 5 files changed, 181 insertions(+), 65 deletions(-) diff --git a/pywb/apps/cli.py b/pywb/apps/cli.py index 12ee7cbd..9c4b80c5 100644 --- a/pywb/apps/cli.py +++ b/pywb/apps/cli.py @@ -6,23 +6,23 @@ import logging #============================================================================= def warcserver(args=None): - WarcServerCli(args=args, - default_port=8070, - desc='pywb WarcServer').run() + return WarcServerCli(args=args, + default_port=8070, + desc='pywb WarcServer').run() #============================================================================= def wayback(args=None): - WaybackCli(args=args, - default_port=8080, - desc='pywb Wayback Machine Server').run() + return WaybackCli(args=args, + default_port=8080, + desc='pywb Wayback Machine Server').run() #============================================================================= def live_rewrite_server(args=None): - LiveCli(args=args, - default_port=8090, - desc='pywb Live Rewrite Proxy Server').run() + return LiveCli(args=args, + default_port=8090, + desc='pywb Live Rewrite Proxy Server').run() #============================================================================= @@ -37,6 +37,7 @@ class BaseCli(object): parser.add_argument('--live', action='store_true', help='Add live-web handler at /live') parser.add_argument('--proxy', help='Enable HTTP/S Proxy on specified collection') + parser.add_argument('--proxy-record', action='store_true', help='Enable Proxy Recording into specified collection') self.desc = desc self.extra_config = {} @@ -48,10 +49,12 @@ class BaseCli(object): logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s', level=logging.DEBUG if self.r.debug else logging.INFO) - self.application = self.load() - if self.r.proxy: - self.application = self.application.init_proxy(self.r.proxy) + self.extra_config['proxy'] = {'coll': self.r.proxy, + 'recording': self.r.proxy_record} + self.r.live = True + + self.application = self.load() if self.r.profile: from werkzeug.contrib.profiler import ProfilerMiddleware @@ -71,6 +74,7 @@ class BaseCli(object): def run(self): self.run_gevent() + return self def run_gevent(self): from gevent.pywsgi import WSGIServer @@ -116,7 +120,7 @@ class ReplayCli(BaseCli): logging.info(msg.format(indexer.root_path, self.r.auto_interval)) indexer.start() - super(ReplayCli, self).run() + return super(ReplayCli, self).run() #============================================================================= diff --git a/pywb/apps/frontendapp.py b/pywb/apps/frontendapp.py index 12f1aa56..0ee86397 100644 --- a/pywb/apps/frontendapp.py +++ b/pywb/apps/frontendapp.py @@ -27,27 +27,59 @@ from pywb.apps.wbrequestresponse import WbResponse import os import traceback import requests +import logging # ============================================================================ class FrontEndApp(object): + REPLAY_API = 'http://localhost:%s/{coll}/resource/postreq' + CDX_API = 'http://localhost:%s/{coll}/index' + RECORD_SERVER = 'http://localhost:%s' + RECORD_API = 'http://localhost:%s/%s/resource/postreq?param.recorder.coll={coll}' + + RECORD_ROUTE = '/record' + + PROXY_CA_NAME = 'pywb HTTPS Proxy CA' + + PROXY_CA_PATH = os.path.join('proxy-certs', 'pywb-ca.pem') + def __init__(self, config_file='./config.yaml', custom_config=None): - self.debug = True + print('CUSTOM', custom_config) + self.handler = self.handle_request self.warcserver = WarcServer(config_file=config_file, custom_config=custom_config) config = self.warcserver.config - framed_replay = config.get('framed_replay', True) + self.debug = config.get('debug', False) self.warcserver_server = GeventServer(self.warcserver, port=0) - self.init_recorder(config) + self.init_proxy(config) - self.static_handler = StaticHandler('pywb/static/') + self.init_recorder(config.get('recorder')) + + static_path = config.get('static_path', 'pywb/static/').replace('/', os.path.sep) + self.static_handler = StaticHandler(static_path) self.all_coll = config.get('all_coll', None) + self._init_routes() + + upstream_paths = self.get_upstream_paths(self.warcserver_server.port) + + framed_replay = config.get('framed_replay', True) + self.rewriterapp = RewriterApp(framed_replay, + config=config, + paths=upstream_paths) + + self.templates_dir = config.get('templates_dir', 'templates') + self.static_dir = config.get('static_dir', 'static') + + metadata_templ = os.path.join(self.warcserver.root_dir, '{coll}', 'metadata.yaml') + self.metadata_cache = MetadataCache(metadata_templ) + + def _init_routes(self): self.url_map = Map() self.url_map.add(Rule('/static/_//', endpoint=self.serve_static)) self.url_map.add(Rule('/static/', endpoint=self.serve_static)) @@ -63,50 +95,47 @@ class FrontEndApp(object): self.url_map.add(Rule(coll_prefix + '/timemap//', endpoint=self.serve_content)) self.url_map.add(Rule(coll_prefix + '/cdx', endpoint=self.serve_cdx)) - if self.recorder: - self.url_map.add(Rule(coll_prefix + '/record/', endpoint=self.serve_record)) + if self.recorder_path: + self.url_map.add(Rule(coll_prefix + self.RECORD_ROUTE + '/', endpoint=self.serve_record)) self.url_map.add(Rule(coll_prefix + '/', endpoint=self.serve_content)) - upstream_paths = self.get_upstream_paths(self.warcserver_server.port) - - self.rewriterapp = RewriterApp(framed_replay, - config=config, - paths=upstream_paths) - - self.templates_dir = config.get('templates_dir', 'templates') - self.static_dir = config.get('static_dir', 'static') - - metadata_templ = os.path.join(self.warcserver.root_dir, '{coll}', 'metadata.yaml') - self.metadata_cache = MetadataCache(metadata_templ) - def get_upstream_paths(self, port): base_paths = { - 'replay': 'http://localhost:%s/{coll}/resource/postreq' % port, - 'cdx-server': 'http://localhost:%s/{coll}/index' % port, + 'replay': self.REPLAY_API % port, + 'cdx-server': self.CDX_API % port, } - if self.recorder: - base_paths['record'] = 'http://localhost:%s/%s/resource/postreq?param.recorder.coll={coll}' % (self.recorder_port, self.recorder_source) + if self.recorder_path: + base_paths['record'] = self.recorder_path return base_paths - def init_recorder(self, config): - self.recorder_source = config.get('recorder') - - if not self.recorder_source: + def init_recorder(self, recorder_config): + if not recorder_config: self.recorder = None - self.recorder_server = None - self.recorder_port = 0 + self.recorder_path = None return + if isinstance(recorder_config, str): + recorder_coll = recorder_config + recorder_config = {} + else: + recorder_coll = recorder_config['source_coll'] + + # TODO: support dedup dedup_index = None - warc_writer = MultiFileWARCWriter(self.warcserver.archive_templ, max_size=1000000000, max_idle_secs=600, + warc_writer = MultiFileWARCWriter(self.warcserver.archive_templ, + max_size=int(recorder_config.get('max_size', 1000000000)), + max_idle_secs=int(recorder_config.get('max_idle_secs', 600)), + filename_template=recorder_config.get('filename_template'), dedup_index=dedup_index) - self.recorder = RecorderApp('http://localhost:' + str(self.warcserver_server.port), warc_writer) - self.recorder_server = GeventServer(self.recorder, port=0) - self.recorder_port = self.recorder_server.port + self.recorder = RecorderApp(self.RECORD_SERVER % str(self.warcserver_server.port), warc_writer) + + recorder_server = GeventServer(self.recorder, port=0) + + self.recorder_path = self.RECORD_API % (recorder_server.port, recorder_coll) def serve_home(self, environ): home_view = BaseInsertView(self.rewriterapp.jinja_env, 'index.html') @@ -289,6 +318,9 @@ class FrontEndApp(object): return WbResponse.redir_response(full_url, '307 Redirect') def __call__(self, environ, start_response): + return self.handler(environ, start_response) + + def handle_request(self, environ, start_response): urls = self.url_map.bind_to_environ(environ) try: endpoint, args = urls.match() @@ -316,16 +348,40 @@ class FrontEndApp(object): app_server = GeventServer(app, port=port, hostname='0.0.0.0') return app_server - def init_proxy(self, proxy_coll, opts=None): - if not opts: - opts = {'ca_name': 'pywb HTTPS Proxy CA', - 'ca_file_cache': os.path.join('proxy-certs', 'pywb-ca.pem')} + def init_proxy(self, config): + proxy_config = config.get('proxy') + if not proxy_config: + return + + if isinstance(proxy_config, str): + proxy_coll = proxy_config + proxy_config = {} + else: + proxy_coll = proxy_config['coll'] + + if '/' in proxy_coll: + raise Exception('Proxy collection can not contain "/"') + + proxy_config['ca_name'] = proxy_config.get('ca_name', self.PROXY_CA_NAME) + proxy_config['ca_file_cache'] = proxy_config.get('ca_file_cache', self.PROXY_CA_PATH) + + if proxy_config.get('recording'): + logging.info('Proxy recording into collection "{0}"'.format(proxy_coll)) + if proxy_coll in self.warcserver.list_fixed_routes(): + raise Exception('Can not record into fixed collection') + + proxy_coll += self.RECORD_ROUTE + if not config.get('recorder'): + config['recorder'] = 'live' + + else: + logging.info('Proxy enabled for collection "{0}"'.format(proxy_coll)) prefix = '/{0}/bn_/'.format(proxy_coll) - return WSGIProxMiddleware(self, prefix, - proxy_host='pywb.proxy', - proxy_options=opts) + self.handler = WSGIProxMiddleware(self.handle_request, prefix, + proxy_host=proxy_config.get('host', 'pywb.proxy'), + proxy_options=proxy_config) # ============================================================================ diff --git a/tests/base_config_test.py b/tests/base_config_test.py index a8c1a7df..bbc83a21 100644 --- a/tests/base_config_test.py +++ b/tests/base_config_test.py @@ -23,19 +23,21 @@ def fmod_sl(request): # ============================================================================ class BaseConfigTest(BaseTestClass): @classmethod - def get_test_app(cls, config_file, override=None): + def get_test_app(cls, config_file, custom_config=None): config_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), config_file) - app = FrontEndApp(config_file=config_file, custom_config=override) + app = FrontEndApp(config_file=config_file, custom_config=custom_config) return app, webtest.TestApp(app) @classmethod - def setup_class(cls, config_file, include_non_frame=True): + def setup_class(cls, config_file, include_non_frame=True, custom_config=None): super(BaseConfigTest, cls).setup_class() - cls.app, cls.testapp = cls.get_test_app(config_file) + cls.app, cls.testapp = cls.get_test_app(config_file, custom_config) if include_non_frame: + custom_config = custom_config or {} + custom_config['framed_replay'] = False cls.app_non_frame, cls.testapp_non_frame = cls.get_test_app(config_file, - override={'framed_replay': False}) + custom_config) @classmethod def teardown_class(cls): diff --git a/tests/test_proxy.py b/tests/test_proxy.py index 6b22f683..500572a5 100644 --- a/tests/test_proxy.py +++ b/tests/test_proxy.py @@ -3,8 +3,11 @@ from pywb.warcserver.test.testutils import BaseTestClass, TempDirTests from .base_config_test import CollsDirMixin from pywb.utils.geventserver import GeventServer from pywb.apps.frontendapp import FrontEndApp +from pywb.apps.cli import wayback from pywb.manager.manager import main as manager +from mock import patch + import os import requests import pytest @@ -19,19 +22,22 @@ def scheme(request): # ============================================================================ class BaseTestProxy(TempDirTests, BaseTestClass): @classmethod - def setup_class(cls, coll='pywb', config_file='config_test.yaml'): + def setup_class(cls, coll='pywb', config_file='config_test.yaml', recording=False): super(BaseTestProxy, cls).setup_class() config_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), config_file) cls.root_ca_file = os.path.join(cls.root_dir, 'pywb-ca-test.pem') - cls.app = FrontEndApp(config_file=config_file) - opts = {'ca_name': 'pywb HTTPS Proxy CA', - 'ca_file_cache': cls.root_ca_file} + opts = {'ca_name': 'pywb test HTTPS Proxy CA', + 'ca_file_cache': cls.root_ca_file, + 'coll': coll, + 'recording': recording, + } - cls.proxy_app = cls.app.init_proxy(coll, opts) + cls.app = FrontEndApp(config_file=config_file, + custom_config={'proxy': opts}) - cls.server = GeventServer(cls.proxy_app) + cls.server = GeventServer(cls.app) cls.proxies = cls.proxy_dict(cls.server.port) @classmethod @@ -65,7 +71,7 @@ class TestProxy(BaseTestProxy): class TestRecordingProxy(CollsDirMixin, BaseTestProxy): @classmethod def setup_class(cls, coll='pywb', config_file='config_test.yaml'): - super(TestRecordingProxy, cls).setup_class('test/record', 'config_test_record.yaml') + super(TestRecordingProxy, cls).setup_class('test', 'config_test_record.yaml', recording=True) manager(['init', 'test']) @classmethod @@ -90,7 +96,7 @@ class TestRecordingProxy(CollsDirMixin, BaseTestProxy): def test_proxy_replay_recorded(self, scheme): manager(['reindex', 'test']) - self.proxy_app.prefix_resolver.fixed_prefix = '/test/bn_/' + self.app.handler.prefix_resolver.fixed_prefix = '/test/bn_/' res = requests.get('{0}://httpbin.org/'.format(scheme), proxies=self.proxies, @@ -99,3 +105,29 @@ class TestRecordingProxy(CollsDirMixin, BaseTestProxy): assert 'is_live = false' in res.text assert 'httpbin(1)' in res.text + +# ============================================================================ +def _run_patch(self): + return self + + +@patch('pywb.apps.cli.ReplayCli.run', _run_patch) +class TestProxyCLIConfig(object): + def test_proxy_cli(self): + res = wayback(['--proxy', 'test']) + exp = {'ca_file_cache': os.path.join('proxy-certs', 'pywb-ca.pem'), + 'ca_name': 'pywb HTTPS Proxy CA', + 'coll': 'test', + 'recording': False} + assert res.extra_config['proxy'] == exp + + def test_proxy_cli_rec(self): + res = wayback(['--proxy', 'test', '--proxy-record']) + assert res.extra_config['proxy']['recording'] == True + assert res.extra_config['collections']['live'] == {'index': '$live', 'use_js_obj_proxy': True} + + def test_proxy_cli_err_coll(self): + with pytest.raises(Exception): + res = wayback(['--proxy', 'test/foo']) + + diff --git a/tests/test_record_replay.py b/tests/test_record_replay.py index 6ccd2a1e..c2bd6815 100644 --- a/tests/test_record_replay.py +++ b/tests/test_record_replay.py @@ -108,3 +108,25 @@ class TestRecordReplay(CollsDirMixin, BaseConfigTest): assert to_path('test/indexes/autoindex.cdxj') in link_lines[4] +# ============================================================================ +class TestRecordCustomConfig(CollsDirMixin, BaseConfigTest): + @classmethod + def setup_class(cls): + rec_custom = {'recorder': {'source_coll': 'live', + 'filename_template': 'pywb-rec-test-{timestamp}.warcgz'}} + super(TestRecordCustomConfig, cls).setup_class('config_test_record.yaml', custom_config=rec_custom) + + def test_init_and_rec(self): + manager(['init', 'test-new']) + dir_name = os.path.join(self.root_dir, '_test_colls', 'test-new', 'archive') + assert os.path.isdir(dir_name) + + res = self.testapp.get('/test-new/record/mp_/http://httpbin.org/get?A=B') + assert '"A": "B"' in res.text + + names = os.listdir(dir_name) + assert len(names) == 1 + assert names[0].startswith('pywb-rec-test-') + assert names[0].endswith('.warcgz') + +