mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
config: proxy and recorder improvements
- proxy and recorder config are loaded from the 'proxy' and 'recorder' config entries, each of which may be a string or a dict
- proxy settings are loaded from config, and WSGIProxMiddleware is applied within the main init path
- cli: add --proxy-record to indicate recording, optional dict to set options
- optional recorder dict to configure other recorder options: file max_size, filename_template, etc.
- proxy tests: add proxy cli tests
- recorder tests: add recorder custom config test
This commit is contained in:
parent
903fa6c6a2
commit
1bfba09c94
@ -6,23 +6,23 @@ import logging
|
||||
|
||||
#=============================================================================
|
||||
def warcserver(args=None):
|
||||
WarcServerCli(args=args,
|
||||
default_port=8070,
|
||||
desc='pywb WarcServer').run()
|
||||
return WarcServerCli(args=args,
|
||||
default_port=8070,
|
||||
desc='pywb WarcServer').run()
|
||||
|
||||
|
||||
#=============================================================================
|
||||
def wayback(args=None):
|
||||
WaybackCli(args=args,
|
||||
default_port=8080,
|
||||
desc='pywb Wayback Machine Server').run()
|
||||
return WaybackCli(args=args,
|
||||
default_port=8080,
|
||||
desc='pywb Wayback Machine Server').run()
|
||||
|
||||
|
||||
#=============================================================================
|
||||
def live_rewrite_server(args=None):
|
||||
LiveCli(args=args,
|
||||
default_port=8090,
|
||||
desc='pywb Live Rewrite Proxy Server').run()
|
||||
return LiveCli(args=args,
|
||||
default_port=8090,
|
||||
desc='pywb Live Rewrite Proxy Server').run()
|
||||
|
||||
|
||||
#=============================================================================
|
||||
@ -37,6 +37,7 @@ class BaseCli(object):
|
||||
parser.add_argument('--live', action='store_true', help='Add live-web handler at /live')
|
||||
|
||||
parser.add_argument('--proxy', help='Enable HTTP/S Proxy on specified collection')
|
||||
parser.add_argument('--proxy-record', action='store_true', help='Enable Proxy Recording into specified collection')
|
||||
|
||||
self.desc = desc
|
||||
self.extra_config = {}
|
||||
@ -48,10 +49,12 @@ class BaseCli(object):
|
||||
logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s',
|
||||
level=logging.DEBUG if self.r.debug else logging.INFO)
|
||||
|
||||
self.application = self.load()
|
||||
|
||||
if self.r.proxy:
|
||||
self.application = self.application.init_proxy(self.r.proxy)
|
||||
self.extra_config['proxy'] = {'coll': self.r.proxy,
|
||||
'recording': self.r.proxy_record}
|
||||
self.r.live = True
|
||||
|
||||
self.application = self.load()
|
||||
|
||||
if self.r.profile:
|
||||
from werkzeug.contrib.profiler import ProfilerMiddleware
|
||||
@ -71,6 +74,7 @@ class BaseCli(object):
|
||||
|
||||
def run(self):
|
||||
self.run_gevent()
|
||||
return self
|
||||
|
||||
def run_gevent(self):
|
||||
from gevent.pywsgi import WSGIServer
|
||||
@ -116,7 +120,7 @@ class ReplayCli(BaseCli):
|
||||
logging.info(msg.format(indexer.root_path, self.r.auto_interval))
|
||||
indexer.start()
|
||||
|
||||
super(ReplayCli, self).run()
|
||||
return super(ReplayCli, self).run()
|
||||
|
||||
|
||||
#=============================================================================
|
||||
|
@ -27,27 +27,59 @@ from pywb.apps.wbrequestresponse import WbResponse
|
||||
import os
|
||||
import traceback
|
||||
import requests
|
||||
import logging
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class FrontEndApp(object):
|
||||
REPLAY_API = 'http://localhost:%s/{coll}/resource/postreq'
|
||||
CDX_API = 'http://localhost:%s/{coll}/index'
|
||||
RECORD_SERVER = 'http://localhost:%s'
|
||||
RECORD_API = 'http://localhost:%s/%s/resource/postreq?param.recorder.coll={coll}'
|
||||
|
||||
RECORD_ROUTE = '/record'
|
||||
|
||||
PROXY_CA_NAME = 'pywb HTTPS Proxy CA'
|
||||
|
||||
PROXY_CA_PATH = os.path.join('proxy-certs', 'pywb-ca.pem')
|
||||
|
||||
def __init__(self, config_file='./config.yaml', custom_config=None):
|
||||
self.debug = True
|
||||
print('CUSTOM', custom_config)
|
||||
self.handler = self.handle_request
|
||||
self.warcserver = WarcServer(config_file=config_file,
|
||||
custom_config=custom_config)
|
||||
|
||||
config = self.warcserver.config
|
||||
|
||||
framed_replay = config.get('framed_replay', True)
|
||||
self.debug = config.get('debug', False)
|
||||
|
||||
self.warcserver_server = GeventServer(self.warcserver, port=0)
|
||||
|
||||
self.init_recorder(config)
|
||||
self.init_proxy(config)
|
||||
|
||||
self.static_handler = StaticHandler('pywb/static/')
|
||||
self.init_recorder(config.get('recorder'))
|
||||
|
||||
static_path = config.get('static_path', 'pywb/static/').replace('/', os.path.sep)
|
||||
self.static_handler = StaticHandler(static_path)
|
||||
|
||||
self.all_coll = config.get('all_coll', None)
|
||||
|
||||
self._init_routes()
|
||||
|
||||
upstream_paths = self.get_upstream_paths(self.warcserver_server.port)
|
||||
|
||||
framed_replay = config.get('framed_replay', True)
|
||||
self.rewriterapp = RewriterApp(framed_replay,
|
||||
config=config,
|
||||
paths=upstream_paths)
|
||||
|
||||
self.templates_dir = config.get('templates_dir', 'templates')
|
||||
self.static_dir = config.get('static_dir', 'static')
|
||||
|
||||
metadata_templ = os.path.join(self.warcserver.root_dir, '{coll}', 'metadata.yaml')
|
||||
self.metadata_cache = MetadataCache(metadata_templ)
|
||||
|
||||
def _init_routes(self):
|
||||
self.url_map = Map()
|
||||
self.url_map.add(Rule('/static/_/<coll>/<path:filepath>', endpoint=self.serve_static))
|
||||
self.url_map.add(Rule('/static/<path:filepath>', endpoint=self.serve_static))
|
||||
@ -63,50 +95,47 @@ class FrontEndApp(object):
|
||||
self.url_map.add(Rule(coll_prefix + '/timemap/<timemap_output>/<path:url>', endpoint=self.serve_content))
|
||||
self.url_map.add(Rule(coll_prefix + '/cdx', endpoint=self.serve_cdx))
|
||||
|
||||
if self.recorder:
|
||||
self.url_map.add(Rule(coll_prefix + '/record/<path:url>', endpoint=self.serve_record))
|
||||
if self.recorder_path:
|
||||
self.url_map.add(Rule(coll_prefix + self.RECORD_ROUTE + '/<path:url>', endpoint=self.serve_record))
|
||||
|
||||
self.url_map.add(Rule(coll_prefix + '/<path:url>', endpoint=self.serve_content))
|
||||
|
||||
upstream_paths = self.get_upstream_paths(self.warcserver_server.port)
|
||||
|
||||
self.rewriterapp = RewriterApp(framed_replay,
|
||||
config=config,
|
||||
paths=upstream_paths)
|
||||
|
||||
self.templates_dir = config.get('templates_dir', 'templates')
|
||||
self.static_dir = config.get('static_dir', 'static')
|
||||
|
||||
metadata_templ = os.path.join(self.warcserver.root_dir, '{coll}', 'metadata.yaml')
|
||||
self.metadata_cache = MetadataCache(metadata_templ)
|
||||
|
||||
def get_upstream_paths(self, port):
|
||||
base_paths = {
|
||||
'replay': 'http://localhost:%s/{coll}/resource/postreq' % port,
|
||||
'cdx-server': 'http://localhost:%s/{coll}/index' % port,
|
||||
'replay': self.REPLAY_API % port,
|
||||
'cdx-server': self.CDX_API % port,
|
||||
}
|
||||
|
||||
if self.recorder:
|
||||
base_paths['record'] = 'http://localhost:%s/%s/resource/postreq?param.recorder.coll={coll}' % (self.recorder_port, self.recorder_source)
|
||||
if self.recorder_path:
|
||||
base_paths['record'] = self.recorder_path
|
||||
|
||||
return base_paths
|
||||
|
||||
def init_recorder(self, config):
|
||||
self.recorder_source = config.get('recorder')
|
||||
|
||||
if not self.recorder_source:
|
||||
def init_recorder(self, recorder_config):
|
||||
if not recorder_config:
|
||||
self.recorder = None
|
||||
self.recorder_server = None
|
||||
self.recorder_port = 0
|
||||
self.recorder_path = None
|
||||
return
|
||||
|
||||
if isinstance(recorder_config, str):
|
||||
recorder_coll = recorder_config
|
||||
recorder_config = {}
|
||||
else:
|
||||
recorder_coll = recorder_config['source_coll']
|
||||
|
||||
# TODO: support dedup
|
||||
dedup_index = None
|
||||
warc_writer = MultiFileWARCWriter(self.warcserver.archive_templ, max_size=1000000000, max_idle_secs=600,
|
||||
warc_writer = MultiFileWARCWriter(self.warcserver.archive_templ,
|
||||
max_size=int(recorder_config.get('max_size', 1000000000)),
|
||||
max_idle_secs=int(recorder_config.get('max_idle_secs', 600)),
|
||||
filename_template=recorder_config.get('filename_template'),
|
||||
dedup_index=dedup_index)
|
||||
|
||||
self.recorder = RecorderApp('http://localhost:' + str(self.warcserver_server.port), warc_writer)
|
||||
self.recorder_server = GeventServer(self.recorder, port=0)
|
||||
self.recorder_port = self.recorder_server.port
|
||||
self.recorder = RecorderApp(self.RECORD_SERVER % str(self.warcserver_server.port), warc_writer)
|
||||
|
||||
recorder_server = GeventServer(self.recorder, port=0)
|
||||
|
||||
self.recorder_path = self.RECORD_API % (recorder_server.port, recorder_coll)
|
||||
|
||||
def serve_home(self, environ):
|
||||
home_view = BaseInsertView(self.rewriterapp.jinja_env, 'index.html')
|
||||
@ -289,6 +318,9 @@ class FrontEndApp(object):
|
||||
return WbResponse.redir_response(full_url, '307 Redirect')
|
||||
|
||||
def __call__(self, environ, start_response):
|
||||
return self.handler(environ, start_response)
|
||||
|
||||
def handle_request(self, environ, start_response):
|
||||
urls = self.url_map.bind_to_environ(environ)
|
||||
try:
|
||||
endpoint, args = urls.match()
|
||||
@ -316,16 +348,40 @@ class FrontEndApp(object):
|
||||
app_server = GeventServer(app, port=port, hostname='0.0.0.0')
|
||||
return app_server
|
||||
|
||||
def init_proxy(self, proxy_coll, opts=None):
|
||||
if not opts:
|
||||
opts = {'ca_name': 'pywb HTTPS Proxy CA',
|
||||
'ca_file_cache': os.path.join('proxy-certs', 'pywb-ca.pem')}
|
||||
def init_proxy(self, config):
|
||||
proxy_config = config.get('proxy')
|
||||
if not proxy_config:
|
||||
return
|
||||
|
||||
if isinstance(proxy_config, str):
|
||||
proxy_coll = proxy_config
|
||||
proxy_config = {}
|
||||
else:
|
||||
proxy_coll = proxy_config['coll']
|
||||
|
||||
if '/' in proxy_coll:
|
||||
raise Exception('Proxy collection can not contain "/"')
|
||||
|
||||
proxy_config['ca_name'] = proxy_config.get('ca_name', self.PROXY_CA_NAME)
|
||||
proxy_config['ca_file_cache'] = proxy_config.get('ca_file_cache', self.PROXY_CA_PATH)
|
||||
|
||||
if proxy_config.get('recording'):
|
||||
logging.info('Proxy recording into collection "{0}"'.format(proxy_coll))
|
||||
if proxy_coll in self.warcserver.list_fixed_routes():
|
||||
raise Exception('Can not record into fixed collection')
|
||||
|
||||
proxy_coll += self.RECORD_ROUTE
|
||||
if not config.get('recorder'):
|
||||
config['recorder'] = 'live'
|
||||
|
||||
else:
|
||||
logging.info('Proxy enabled for collection "{0}"'.format(proxy_coll))
|
||||
|
||||
prefix = '/{0}/bn_/'.format(proxy_coll)
|
||||
|
||||
return WSGIProxMiddleware(self, prefix,
|
||||
proxy_host='pywb.proxy',
|
||||
proxy_options=opts)
|
||||
self.handler = WSGIProxMiddleware(self.handle_request, prefix,
|
||||
proxy_host=proxy_config.get('host', 'pywb.proxy'),
|
||||
proxy_options=proxy_config)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
|
@ -23,19 +23,21 @@ def fmod_sl(request):
|
||||
# ============================================================================
|
||||
class BaseConfigTest(BaseTestClass):
|
||||
@classmethod
|
||||
def get_test_app(cls, config_file, override=None):
|
||||
def get_test_app(cls, config_file, custom_config=None):
|
||||
config_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), config_file)
|
||||
app = FrontEndApp(config_file=config_file, custom_config=override)
|
||||
app = FrontEndApp(config_file=config_file, custom_config=custom_config)
|
||||
return app, webtest.TestApp(app)
|
||||
|
||||
@classmethod
|
||||
def setup_class(cls, config_file, include_non_frame=True):
|
||||
def setup_class(cls, config_file, include_non_frame=True, custom_config=None):
|
||||
super(BaseConfigTest, cls).setup_class()
|
||||
cls.app, cls.testapp = cls.get_test_app(config_file)
|
||||
cls.app, cls.testapp = cls.get_test_app(config_file, custom_config)
|
||||
|
||||
if include_non_frame:
|
||||
custom_config = custom_config or {}
|
||||
custom_config['framed_replay'] = False
|
||||
cls.app_non_frame, cls.testapp_non_frame = cls.get_test_app(config_file,
|
||||
override={'framed_replay': False})
|
||||
custom_config)
|
||||
|
||||
@classmethod
|
||||
def teardown_class(cls):
|
||||
|
@ -3,8 +3,11 @@ from pywb.warcserver.test.testutils import BaseTestClass, TempDirTests
|
||||
from .base_config_test import CollsDirMixin
|
||||
from pywb.utils.geventserver import GeventServer
|
||||
from pywb.apps.frontendapp import FrontEndApp
|
||||
from pywb.apps.cli import wayback
|
||||
from pywb.manager.manager import main as manager
|
||||
|
||||
from mock import patch
|
||||
|
||||
import os
|
||||
import requests
|
||||
import pytest
|
||||
@ -19,19 +22,22 @@ def scheme(request):
|
||||
# ============================================================================
|
||||
class BaseTestProxy(TempDirTests, BaseTestClass):
|
||||
@classmethod
|
||||
def setup_class(cls, coll='pywb', config_file='config_test.yaml'):
|
||||
def setup_class(cls, coll='pywb', config_file='config_test.yaml', recording=False):
|
||||
super(BaseTestProxy, cls).setup_class()
|
||||
config_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), config_file)
|
||||
|
||||
cls.root_ca_file = os.path.join(cls.root_dir, 'pywb-ca-test.pem')
|
||||
|
||||
cls.app = FrontEndApp(config_file=config_file)
|
||||
opts = {'ca_name': 'pywb HTTPS Proxy CA',
|
||||
'ca_file_cache': cls.root_ca_file}
|
||||
opts = {'ca_name': 'pywb test HTTPS Proxy CA',
|
||||
'ca_file_cache': cls.root_ca_file,
|
||||
'coll': coll,
|
||||
'recording': recording,
|
||||
}
|
||||
|
||||
cls.proxy_app = cls.app.init_proxy(coll, opts)
|
||||
cls.app = FrontEndApp(config_file=config_file,
|
||||
custom_config={'proxy': opts})
|
||||
|
||||
cls.server = GeventServer(cls.proxy_app)
|
||||
cls.server = GeventServer(cls.app)
|
||||
cls.proxies = cls.proxy_dict(cls.server.port)
|
||||
|
||||
@classmethod
|
||||
@ -65,7 +71,7 @@ class TestProxy(BaseTestProxy):
|
||||
class TestRecordingProxy(CollsDirMixin, BaseTestProxy):
|
||||
@classmethod
|
||||
def setup_class(cls, coll='pywb', config_file='config_test.yaml'):
|
||||
super(TestRecordingProxy, cls).setup_class('test/record', 'config_test_record.yaml')
|
||||
super(TestRecordingProxy, cls).setup_class('test', 'config_test_record.yaml', recording=True)
|
||||
manager(['init', 'test'])
|
||||
|
||||
@classmethod
|
||||
@ -90,7 +96,7 @@ class TestRecordingProxy(CollsDirMixin, BaseTestProxy):
|
||||
def test_proxy_replay_recorded(self, scheme):
|
||||
manager(['reindex', 'test'])
|
||||
|
||||
self.proxy_app.prefix_resolver.fixed_prefix = '/test/bn_/'
|
||||
self.app.handler.prefix_resolver.fixed_prefix = '/test/bn_/'
|
||||
|
||||
res = requests.get('{0}://httpbin.org/'.format(scheme),
|
||||
proxies=self.proxies,
|
||||
@ -99,3 +105,29 @@ class TestRecordingProxy(CollsDirMixin, BaseTestProxy):
|
||||
assert 'is_live = false' in res.text
|
||||
assert 'httpbin(1)' in res.text
|
||||
|
||||
|
||||
# ============================================================================
|
||||
def _run_patch(self):
|
||||
return self
|
||||
|
||||
|
||||
@patch('pywb.apps.cli.ReplayCli.run', _run_patch)
|
||||
class TestProxyCLIConfig(object):
|
||||
def test_proxy_cli(self):
|
||||
res = wayback(['--proxy', 'test'])
|
||||
exp = {'ca_file_cache': os.path.join('proxy-certs', 'pywb-ca.pem'),
|
||||
'ca_name': 'pywb HTTPS Proxy CA',
|
||||
'coll': 'test',
|
||||
'recording': False}
|
||||
assert res.extra_config['proxy'] == exp
|
||||
|
||||
def test_proxy_cli_rec(self):
|
||||
res = wayback(['--proxy', 'test', '--proxy-record'])
|
||||
assert res.extra_config['proxy']['recording'] == True
|
||||
assert res.extra_config['collections']['live'] == {'index': '$live', 'use_js_obj_proxy': True}
|
||||
|
||||
def test_proxy_cli_err_coll(self):
|
||||
with pytest.raises(Exception):
|
||||
res = wayback(['--proxy', 'test/foo'])
|
||||
|
||||
|
||||
|
@ -108,3 +108,25 @@ class TestRecordReplay(CollsDirMixin, BaseConfigTest):
|
||||
assert to_path('test/indexes/autoindex.cdxj') in link_lines[4]
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class TestRecordCustomConfig(CollsDirMixin, BaseConfigTest):
|
||||
@classmethod
|
||||
def setup_class(cls):
|
||||
rec_custom = {'recorder': {'source_coll': 'live',
|
||||
'filename_template': 'pywb-rec-test-{timestamp}.warcgz'}}
|
||||
super(TestRecordCustomConfig, cls).setup_class('config_test_record.yaml', custom_config=rec_custom)
|
||||
|
||||
def test_init_and_rec(self):
|
||||
manager(['init', 'test-new'])
|
||||
dir_name = os.path.join(self.root_dir, '_test_colls', 'test-new', 'archive')
|
||||
assert os.path.isdir(dir_name)
|
||||
|
||||
res = self.testapp.get('/test-new/record/mp_/http://httpbin.org/get?A=B')
|
||||
assert '"A": "B"' in res.text
|
||||
|
||||
names = os.listdir(dir_name)
|
||||
assert len(names) == 1
|
||||
assert names[0].startswith('pywb-rec-test-')
|
||||
assert names[0].endswith('.warcgz')
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user