1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

config: proxy and recorder improvements

- proxy and recorder config loaded from 'proxy' and 'recorder' strings or dicts in config
- proxy settings loaded from config, WSGIProxMiddleware applied within main init path
- cli: add --proxy-record to indicate recording, optional dict to set options
- optional recorder dict to configure other recorder options: file max_size, filename_template, etc.
- proxy tests: add proxy cli tests
- recorder tests: add recorder custom config test
This commit is contained in:
Ilya Kreymer 2017-10-01 09:46:54 -07:00
parent 903fa6c6a2
commit 1bfba09c94
5 changed files with 181 additions and 65 deletions

View File

@ -6,23 +6,23 @@ import logging
#============================================================================= #=============================================================================
def warcserver(args=None): def warcserver(args=None):
WarcServerCli(args=args, return WarcServerCli(args=args,
default_port=8070, default_port=8070,
desc='pywb WarcServer').run() desc='pywb WarcServer').run()
#============================================================================= #=============================================================================
def wayback(args=None): def wayback(args=None):
WaybackCli(args=args, return WaybackCli(args=args,
default_port=8080, default_port=8080,
desc='pywb Wayback Machine Server').run() desc='pywb Wayback Machine Server').run()
#============================================================================= #=============================================================================
def live_rewrite_server(args=None): def live_rewrite_server(args=None):
LiveCli(args=args, return LiveCli(args=args,
default_port=8090, default_port=8090,
desc='pywb Live Rewrite Proxy Server').run() desc='pywb Live Rewrite Proxy Server').run()
#============================================================================= #=============================================================================
@ -37,6 +37,7 @@ class BaseCli(object):
parser.add_argument('--live', action='store_true', help='Add live-web handler at /live') parser.add_argument('--live', action='store_true', help='Add live-web handler at /live')
parser.add_argument('--proxy', help='Enable HTTP/S Proxy on specified collection') parser.add_argument('--proxy', help='Enable HTTP/S Proxy on specified collection')
parser.add_argument('--proxy-record', action='store_true', help='Enable Proxy Recording into specified collection')
self.desc = desc self.desc = desc
self.extra_config = {} self.extra_config = {}
@ -48,10 +49,12 @@ class BaseCli(object):
logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s', logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s',
level=logging.DEBUG if self.r.debug else logging.INFO) level=logging.DEBUG if self.r.debug else logging.INFO)
self.application = self.load()
if self.r.proxy: if self.r.proxy:
self.application = self.application.init_proxy(self.r.proxy) self.extra_config['proxy'] = {'coll': self.r.proxy,
'recording': self.r.proxy_record}
self.r.live = True
self.application = self.load()
if self.r.profile: if self.r.profile:
from werkzeug.contrib.profiler import ProfilerMiddleware from werkzeug.contrib.profiler import ProfilerMiddleware
@ -71,6 +74,7 @@ class BaseCli(object):
def run(self): def run(self):
self.run_gevent() self.run_gevent()
return self
def run_gevent(self): def run_gevent(self):
from gevent.pywsgi import WSGIServer from gevent.pywsgi import WSGIServer
@ -116,7 +120,7 @@ class ReplayCli(BaseCli):
logging.info(msg.format(indexer.root_path, self.r.auto_interval)) logging.info(msg.format(indexer.root_path, self.r.auto_interval))
indexer.start() indexer.start()
super(ReplayCli, self).run() return super(ReplayCli, self).run()
#============================================================================= #=============================================================================

View File

@ -27,27 +27,59 @@ from pywb.apps.wbrequestresponse import WbResponse
import os import os
import traceback import traceback
import requests import requests
import logging
# ============================================================================ # ============================================================================
class FrontEndApp(object): class FrontEndApp(object):
REPLAY_API = 'http://localhost:%s/{coll}/resource/postreq'
CDX_API = 'http://localhost:%s/{coll}/index'
RECORD_SERVER = 'http://localhost:%s'
RECORD_API = 'http://localhost:%s/%s/resource/postreq?param.recorder.coll={coll}'
RECORD_ROUTE = '/record'
PROXY_CA_NAME = 'pywb HTTPS Proxy CA'
PROXY_CA_PATH = os.path.join('proxy-certs', 'pywb-ca.pem')
def __init__(self, config_file='./config.yaml', custom_config=None): def __init__(self, config_file='./config.yaml', custom_config=None):
self.debug = True print('CUSTOM', custom_config)
self.handler = self.handle_request
self.warcserver = WarcServer(config_file=config_file, self.warcserver = WarcServer(config_file=config_file,
custom_config=custom_config) custom_config=custom_config)
config = self.warcserver.config config = self.warcserver.config
framed_replay = config.get('framed_replay', True) self.debug = config.get('debug', False)
self.warcserver_server = GeventServer(self.warcserver, port=0) self.warcserver_server = GeventServer(self.warcserver, port=0)
self.init_recorder(config) self.init_proxy(config)
self.static_handler = StaticHandler('pywb/static/') self.init_recorder(config.get('recorder'))
static_path = config.get('static_path', 'pywb/static/').replace('/', os.path.sep)
self.static_handler = StaticHandler(static_path)
self.all_coll = config.get('all_coll', None) self.all_coll = config.get('all_coll', None)
self._init_routes()
upstream_paths = self.get_upstream_paths(self.warcserver_server.port)
framed_replay = config.get('framed_replay', True)
self.rewriterapp = RewriterApp(framed_replay,
config=config,
paths=upstream_paths)
self.templates_dir = config.get('templates_dir', 'templates')
self.static_dir = config.get('static_dir', 'static')
metadata_templ = os.path.join(self.warcserver.root_dir, '{coll}', 'metadata.yaml')
self.metadata_cache = MetadataCache(metadata_templ)
def _init_routes(self):
self.url_map = Map() self.url_map = Map()
self.url_map.add(Rule('/static/_/<coll>/<path:filepath>', endpoint=self.serve_static)) self.url_map.add(Rule('/static/_/<coll>/<path:filepath>', endpoint=self.serve_static))
self.url_map.add(Rule('/static/<path:filepath>', endpoint=self.serve_static)) self.url_map.add(Rule('/static/<path:filepath>', endpoint=self.serve_static))
@ -63,50 +95,47 @@ class FrontEndApp(object):
self.url_map.add(Rule(coll_prefix + '/timemap/<timemap_output>/<path:url>', endpoint=self.serve_content)) self.url_map.add(Rule(coll_prefix + '/timemap/<timemap_output>/<path:url>', endpoint=self.serve_content))
self.url_map.add(Rule(coll_prefix + '/cdx', endpoint=self.serve_cdx)) self.url_map.add(Rule(coll_prefix + '/cdx', endpoint=self.serve_cdx))
if self.recorder: if self.recorder_path:
self.url_map.add(Rule(coll_prefix + '/record/<path:url>', endpoint=self.serve_record)) self.url_map.add(Rule(coll_prefix + self.RECORD_ROUTE + '/<path:url>', endpoint=self.serve_record))
self.url_map.add(Rule(coll_prefix + '/<path:url>', endpoint=self.serve_content)) self.url_map.add(Rule(coll_prefix + '/<path:url>', endpoint=self.serve_content))
upstream_paths = self.get_upstream_paths(self.warcserver_server.port)
self.rewriterapp = RewriterApp(framed_replay,
config=config,
paths=upstream_paths)
self.templates_dir = config.get('templates_dir', 'templates')
self.static_dir = config.get('static_dir', 'static')
metadata_templ = os.path.join(self.warcserver.root_dir, '{coll}', 'metadata.yaml')
self.metadata_cache = MetadataCache(metadata_templ)
def get_upstream_paths(self, port): def get_upstream_paths(self, port):
base_paths = { base_paths = {
'replay': 'http://localhost:%s/{coll}/resource/postreq' % port, 'replay': self.REPLAY_API % port,
'cdx-server': 'http://localhost:%s/{coll}/index' % port, 'cdx-server': self.CDX_API % port,
} }
if self.recorder: if self.recorder_path:
base_paths['record'] = 'http://localhost:%s/%s/resource/postreq?param.recorder.coll={coll}' % (self.recorder_port, self.recorder_source) base_paths['record'] = self.recorder_path
return base_paths return base_paths
def init_recorder(self, config): def init_recorder(self, recorder_config):
self.recorder_source = config.get('recorder') if not recorder_config:
if not self.recorder_source:
self.recorder = None self.recorder = None
self.recorder_server = None self.recorder_path = None
self.recorder_port = 0
return return
if isinstance(recorder_config, str):
recorder_coll = recorder_config
recorder_config = {}
else:
recorder_coll = recorder_config['source_coll']
# TODO: support dedup
dedup_index = None dedup_index = None
warc_writer = MultiFileWARCWriter(self.warcserver.archive_templ, max_size=1000000000, max_idle_secs=600, warc_writer = MultiFileWARCWriter(self.warcserver.archive_templ,
max_size=int(recorder_config.get('max_size', 1000000000)),
max_idle_secs=int(recorder_config.get('max_idle_secs', 600)),
filename_template=recorder_config.get('filename_template'),
dedup_index=dedup_index) dedup_index=dedup_index)
self.recorder = RecorderApp('http://localhost:' + str(self.warcserver_server.port), warc_writer) self.recorder = RecorderApp(self.RECORD_SERVER % str(self.warcserver_server.port), warc_writer)
self.recorder_server = GeventServer(self.recorder, port=0)
self.recorder_port = self.recorder_server.port recorder_server = GeventServer(self.recorder, port=0)
self.recorder_path = self.RECORD_API % (recorder_server.port, recorder_coll)
def serve_home(self, environ): def serve_home(self, environ):
home_view = BaseInsertView(self.rewriterapp.jinja_env, 'index.html') home_view = BaseInsertView(self.rewriterapp.jinja_env, 'index.html')
@ -289,6 +318,9 @@ class FrontEndApp(object):
return WbResponse.redir_response(full_url, '307 Redirect') return WbResponse.redir_response(full_url, '307 Redirect')
def __call__(self, environ, start_response): def __call__(self, environ, start_response):
return self.handler(environ, start_response)
def handle_request(self, environ, start_response):
urls = self.url_map.bind_to_environ(environ) urls = self.url_map.bind_to_environ(environ)
try: try:
endpoint, args = urls.match() endpoint, args = urls.match()
@ -316,16 +348,40 @@ class FrontEndApp(object):
app_server = GeventServer(app, port=port, hostname='0.0.0.0') app_server = GeventServer(app, port=port, hostname='0.0.0.0')
return app_server return app_server
def init_proxy(self, proxy_coll, opts=None): def init_proxy(self, config):
if not opts: proxy_config = config.get('proxy')
opts = {'ca_name': 'pywb HTTPS Proxy CA', if not proxy_config:
'ca_file_cache': os.path.join('proxy-certs', 'pywb-ca.pem')} return
if isinstance(proxy_config, str):
proxy_coll = proxy_config
proxy_config = {}
else:
proxy_coll = proxy_config['coll']
if '/' in proxy_coll:
raise Exception('Proxy collection can not contain "/"')
proxy_config['ca_name'] = proxy_config.get('ca_name', self.PROXY_CA_NAME)
proxy_config['ca_file_cache'] = proxy_config.get('ca_file_cache', self.PROXY_CA_PATH)
if proxy_config.get('recording'):
logging.info('Proxy recording into collection "{0}"'.format(proxy_coll))
if proxy_coll in self.warcserver.list_fixed_routes():
raise Exception('Can not record into fixed collection')
proxy_coll += self.RECORD_ROUTE
if not config.get('recorder'):
config['recorder'] = 'live'
else:
logging.info('Proxy enabled for collection "{0}"'.format(proxy_coll))
prefix = '/{0}/bn_/'.format(proxy_coll) prefix = '/{0}/bn_/'.format(proxy_coll)
return WSGIProxMiddleware(self, prefix, self.handler = WSGIProxMiddleware(self.handle_request, prefix,
proxy_host='pywb.proxy', proxy_host=proxy_config.get('host', 'pywb.proxy'),
proxy_options=opts) proxy_options=proxy_config)
# ============================================================================ # ============================================================================

View File

@ -23,19 +23,21 @@ def fmod_sl(request):
# ============================================================================ # ============================================================================
class BaseConfigTest(BaseTestClass): class BaseConfigTest(BaseTestClass):
@classmethod @classmethod
def get_test_app(cls, config_file, override=None): def get_test_app(cls, config_file, custom_config=None):
config_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), config_file) config_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), config_file)
app = FrontEndApp(config_file=config_file, custom_config=override) app = FrontEndApp(config_file=config_file, custom_config=custom_config)
return app, webtest.TestApp(app) return app, webtest.TestApp(app)
@classmethod @classmethod
def setup_class(cls, config_file, include_non_frame=True): def setup_class(cls, config_file, include_non_frame=True, custom_config=None):
super(BaseConfigTest, cls).setup_class() super(BaseConfigTest, cls).setup_class()
cls.app, cls.testapp = cls.get_test_app(config_file) cls.app, cls.testapp = cls.get_test_app(config_file, custom_config)
if include_non_frame: if include_non_frame:
custom_config = custom_config or {}
custom_config['framed_replay'] = False
cls.app_non_frame, cls.testapp_non_frame = cls.get_test_app(config_file, cls.app_non_frame, cls.testapp_non_frame = cls.get_test_app(config_file,
override={'framed_replay': False}) custom_config)
@classmethod @classmethod
def teardown_class(cls): def teardown_class(cls):

View File

@ -3,8 +3,11 @@ from pywb.warcserver.test.testutils import BaseTestClass, TempDirTests
from .base_config_test import CollsDirMixin from .base_config_test import CollsDirMixin
from pywb.utils.geventserver import GeventServer from pywb.utils.geventserver import GeventServer
from pywb.apps.frontendapp import FrontEndApp from pywb.apps.frontendapp import FrontEndApp
from pywb.apps.cli import wayback
from pywb.manager.manager import main as manager from pywb.manager.manager import main as manager
from mock import patch
import os import os
import requests import requests
import pytest import pytest
@ -19,19 +22,22 @@ def scheme(request):
# ============================================================================ # ============================================================================
class BaseTestProxy(TempDirTests, BaseTestClass): class BaseTestProxy(TempDirTests, BaseTestClass):
@classmethod @classmethod
def setup_class(cls, coll='pywb', config_file='config_test.yaml'): def setup_class(cls, coll='pywb', config_file='config_test.yaml', recording=False):
super(BaseTestProxy, cls).setup_class() super(BaseTestProxy, cls).setup_class()
config_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), config_file) config_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), config_file)
cls.root_ca_file = os.path.join(cls.root_dir, 'pywb-ca-test.pem') cls.root_ca_file = os.path.join(cls.root_dir, 'pywb-ca-test.pem')
cls.app = FrontEndApp(config_file=config_file) opts = {'ca_name': 'pywb test HTTPS Proxy CA',
opts = {'ca_name': 'pywb HTTPS Proxy CA', 'ca_file_cache': cls.root_ca_file,
'ca_file_cache': cls.root_ca_file} 'coll': coll,
'recording': recording,
}
cls.proxy_app = cls.app.init_proxy(coll, opts) cls.app = FrontEndApp(config_file=config_file,
custom_config={'proxy': opts})
cls.server = GeventServer(cls.proxy_app) cls.server = GeventServer(cls.app)
cls.proxies = cls.proxy_dict(cls.server.port) cls.proxies = cls.proxy_dict(cls.server.port)
@classmethod @classmethod
@ -65,7 +71,7 @@ class TestProxy(BaseTestProxy):
class TestRecordingProxy(CollsDirMixin, BaseTestProxy): class TestRecordingProxy(CollsDirMixin, BaseTestProxy):
@classmethod @classmethod
def setup_class(cls, coll='pywb', config_file='config_test.yaml'): def setup_class(cls, coll='pywb', config_file='config_test.yaml'):
super(TestRecordingProxy, cls).setup_class('test/record', 'config_test_record.yaml') super(TestRecordingProxy, cls).setup_class('test', 'config_test_record.yaml', recording=True)
manager(['init', 'test']) manager(['init', 'test'])
@classmethod @classmethod
@ -90,7 +96,7 @@ class TestRecordingProxy(CollsDirMixin, BaseTestProxy):
def test_proxy_replay_recorded(self, scheme): def test_proxy_replay_recorded(self, scheme):
manager(['reindex', 'test']) manager(['reindex', 'test'])
self.proxy_app.prefix_resolver.fixed_prefix = '/test/bn_/' self.app.handler.prefix_resolver.fixed_prefix = '/test/bn_/'
res = requests.get('{0}://httpbin.org/'.format(scheme), res = requests.get('{0}://httpbin.org/'.format(scheme),
proxies=self.proxies, proxies=self.proxies,
@ -99,3 +105,29 @@ class TestRecordingProxy(CollsDirMixin, BaseTestProxy):
assert 'is_live = false' in res.text assert 'is_live = false' in res.text
assert 'httpbin(1)' in res.text assert 'httpbin(1)' in res.text
# ============================================================================
def _run_patch(self):
return self
@patch('pywb.apps.cli.ReplayCli.run', _run_patch)
class TestProxyCLIConfig(object):
def test_proxy_cli(self):
res = wayback(['--proxy', 'test'])
exp = {'ca_file_cache': os.path.join('proxy-certs', 'pywb-ca.pem'),
'ca_name': 'pywb HTTPS Proxy CA',
'coll': 'test',
'recording': False}
assert res.extra_config['proxy'] == exp
def test_proxy_cli_rec(self):
res = wayback(['--proxy', 'test', '--proxy-record'])
assert res.extra_config['proxy']['recording'] == True
assert res.extra_config['collections']['live'] == {'index': '$live', 'use_js_obj_proxy': True}
def test_proxy_cli_err_coll(self):
with pytest.raises(Exception):
res = wayback(['--proxy', 'test/foo'])

View File

@ -108,3 +108,25 @@ class TestRecordReplay(CollsDirMixin, BaseConfigTest):
assert to_path('test/indexes/autoindex.cdxj') in link_lines[4] assert to_path('test/indexes/autoindex.cdxj') in link_lines[4]
# ============================================================================
class TestRecordCustomConfig(CollsDirMixin, BaseConfigTest):
@classmethod
def setup_class(cls):
rec_custom = {'recorder': {'source_coll': 'live',
'filename_template': 'pywb-rec-test-{timestamp}.warcgz'}}
super(TestRecordCustomConfig, cls).setup_class('config_test_record.yaml', custom_config=rec_custom)
def test_init_and_rec(self):
manager(['init', 'test-new'])
dir_name = os.path.join(self.root_dir, '_test_colls', 'test-new', 'archive')
assert os.path.isdir(dir_name)
res = self.testapp.get('/test-new/record/mp_/http://httpbin.org/get?A=B')
assert '"A": "B"' in res.text
names = os.listdir(dir_name)
assert len(names) == 1
assert names[0].startswith('pywb-rec-test-')
assert names[0].endswith('.warcgz')