1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

config: proxy and recorder improvements

- proxy and recorder config are loaded from the 'proxy' and 'recorder' entries in config, each given as either a string or a dict
- proxy settings are loaded from config, and WSGIProxMiddleware is applied within the main init path
- cli: add --proxy-record to indicate recording; an optional dict can set further options
- an optional recorder dict configures other recorder options: file max_size, filename_template, etc.
- proxy tests: add proxy cli tests
- recorder tests: add recorder custom config test
This commit is contained in:
Ilya Kreymer 2017-10-01 09:46:54 -07:00
parent 903fa6c6a2
commit 1bfba09c94
5 changed files with 181 additions and 65 deletions

View File

@ -6,23 +6,23 @@ import logging
#=============================================================================
def warcserver(args=None):
WarcServerCli(args=args,
default_port=8070,
desc='pywb WarcServer').run()
return WarcServerCli(args=args,
default_port=8070,
desc='pywb WarcServer').run()
#=============================================================================
def wayback(args=None):
WaybackCli(args=args,
default_port=8080,
desc='pywb Wayback Machine Server').run()
return WaybackCli(args=args,
default_port=8080,
desc='pywb Wayback Machine Server').run()
#=============================================================================
def live_rewrite_server(args=None):
LiveCli(args=args,
default_port=8090,
desc='pywb Live Rewrite Proxy Server').run()
return LiveCli(args=args,
default_port=8090,
desc='pywb Live Rewrite Proxy Server').run()
#=============================================================================
@ -37,6 +37,7 @@ class BaseCli(object):
parser.add_argument('--live', action='store_true', help='Add live-web handler at /live')
parser.add_argument('--proxy', help='Enable HTTP/S Proxy on specified collection')
parser.add_argument('--proxy-record', action='store_true', help='Enable Proxy Recording into specified collection')
self.desc = desc
self.extra_config = {}
@ -48,10 +49,12 @@ class BaseCli(object):
logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s',
level=logging.DEBUG if self.r.debug else logging.INFO)
self.application = self.load()
if self.r.proxy:
self.application = self.application.init_proxy(self.r.proxy)
self.extra_config['proxy'] = {'coll': self.r.proxy,
'recording': self.r.proxy_record}
self.r.live = True
self.application = self.load()
if self.r.profile:
from werkzeug.contrib.profiler import ProfilerMiddleware
@ -71,6 +74,7 @@ class BaseCli(object):
def run(self):
    """Run the server via ``run_gevent()`` and return this CLI object.

    Returning ``self`` lets the module-level entry points hand the
    configured CLI instance back to their caller.
    """
    self.run_gevent()
    return self
def run_gevent(self):
from gevent.pywsgi import WSGIServer
@ -116,7 +120,7 @@ class ReplayCli(BaseCli):
logging.info(msg.format(indexer.root_path, self.r.auto_interval))
indexer.start()
super(ReplayCli, self).run()
return super(ReplayCli, self).run()
#=============================================================================

View File

@ -27,27 +27,59 @@ from pywb.apps.wbrequestresponse import WbResponse
import os
import traceback
import requests
import logging
# ============================================================================
class FrontEndApp(object):
REPLAY_API = 'http://localhost:%s/{coll}/resource/postreq'
CDX_API = 'http://localhost:%s/{coll}/index'
RECORD_SERVER = 'http://localhost:%s'
RECORD_API = 'http://localhost:%s/%s/resource/postreq?param.recorder.coll={coll}'
RECORD_ROUTE = '/record'
PROXY_CA_NAME = 'pywb HTTPS Proxy CA'
PROXY_CA_PATH = os.path.join('proxy-certs', 'pywb-ca.pem')
def __init__(self, config_file='./config.yaml', custom_config=None):
self.debug = True
print('CUSTOM', custom_config)
self.handler = self.handle_request
self.warcserver = WarcServer(config_file=config_file,
custom_config=custom_config)
config = self.warcserver.config
framed_replay = config.get('framed_replay', True)
self.debug = config.get('debug', False)
self.warcserver_server = GeventServer(self.warcserver, port=0)
self.init_recorder(config)
self.init_proxy(config)
self.static_handler = StaticHandler('pywb/static/')
self.init_recorder(config.get('recorder'))
static_path = config.get('static_path', 'pywb/static/').replace('/', os.path.sep)
self.static_handler = StaticHandler(static_path)
self.all_coll = config.get('all_coll', None)
self._init_routes()
upstream_paths = self.get_upstream_paths(self.warcserver_server.port)
framed_replay = config.get('framed_replay', True)
self.rewriterapp = RewriterApp(framed_replay,
config=config,
paths=upstream_paths)
self.templates_dir = config.get('templates_dir', 'templates')
self.static_dir = config.get('static_dir', 'static')
metadata_templ = os.path.join(self.warcserver.root_dir, '{coll}', 'metadata.yaml')
self.metadata_cache = MetadataCache(metadata_templ)
def _init_routes(self):
self.url_map = Map()
self.url_map.add(Rule('/static/_/<coll>/<path:filepath>', endpoint=self.serve_static))
self.url_map.add(Rule('/static/<path:filepath>', endpoint=self.serve_static))
@ -63,50 +95,47 @@ class FrontEndApp(object):
self.url_map.add(Rule(coll_prefix + '/timemap/<timemap_output>/<path:url>', endpoint=self.serve_content))
self.url_map.add(Rule(coll_prefix + '/cdx', endpoint=self.serve_cdx))
if self.recorder:
self.url_map.add(Rule(coll_prefix + '/record/<path:url>', endpoint=self.serve_record))
if self.recorder_path:
self.url_map.add(Rule(coll_prefix + self.RECORD_ROUTE + '/<path:url>', endpoint=self.serve_record))
self.url_map.add(Rule(coll_prefix + '/<path:url>', endpoint=self.serve_content))
upstream_paths = self.get_upstream_paths(self.warcserver_server.port)
self.rewriterapp = RewriterApp(framed_replay,
config=config,
paths=upstream_paths)
self.templates_dir = config.get('templates_dir', 'templates')
self.static_dir = config.get('static_dir', 'static')
metadata_templ = os.path.join(self.warcserver.root_dir, '{coll}', 'metadata.yaml')
self.metadata_cache = MetadataCache(metadata_templ)
def get_upstream_paths(self, port):
base_paths = {
'replay': 'http://localhost:%s/{coll}/resource/postreq' % port,
'cdx-server': 'http://localhost:%s/{coll}/index' % port,
'replay': self.REPLAY_API % port,
'cdx-server': self.CDX_API % port,
}
if self.recorder:
base_paths['record'] = 'http://localhost:%s/%s/resource/postreq?param.recorder.coll={coll}' % (self.recorder_port, self.recorder_source)
if self.recorder_path:
base_paths['record'] = self.recorder_path
return base_paths
def init_recorder(self, config):
self.recorder_source = config.get('recorder')
if not self.recorder_source:
def init_recorder(self, recorder_config):
if not recorder_config:
self.recorder = None
self.recorder_server = None
self.recorder_port = 0
self.recorder_path = None
return
if isinstance(recorder_config, str):
recorder_coll = recorder_config
recorder_config = {}
else:
recorder_coll = recorder_config['source_coll']
# TODO: support dedup
dedup_index = None
warc_writer = MultiFileWARCWriter(self.warcserver.archive_templ, max_size=1000000000, max_idle_secs=600,
warc_writer = MultiFileWARCWriter(self.warcserver.archive_templ,
max_size=int(recorder_config.get('max_size', 1000000000)),
max_idle_secs=int(recorder_config.get('max_idle_secs', 600)),
filename_template=recorder_config.get('filename_template'),
dedup_index=dedup_index)
self.recorder = RecorderApp('http://localhost:' + str(self.warcserver_server.port), warc_writer)
self.recorder_server = GeventServer(self.recorder, port=0)
self.recorder_port = self.recorder_server.port
self.recorder = RecorderApp(self.RECORD_SERVER % str(self.warcserver_server.port), warc_writer)
recorder_server = GeventServer(self.recorder, port=0)
self.recorder_path = self.RECORD_API % (recorder_server.port, recorder_coll)
def serve_home(self, environ):
home_view = BaseInsertView(self.rewriterapp.jinja_env, 'index.html')
@ -289,6 +318,9 @@ class FrontEndApp(object):
return WbResponse.redir_response(full_url, '307 Redirect')
def __call__(self, environ, start_response):
    """WSGI entry point: delegate the request to ``self.handler``.

    :param environ: the WSGI environ for the request
    :param start_response: the WSGI ``start_response`` callable
    :return: whatever ``self.handler`` returns for this request
    """
    return self.handler(environ, start_response)
def handle_request(self, environ, start_response):
urls = self.url_map.bind_to_environ(environ)
try:
endpoint, args = urls.match()
@ -316,16 +348,40 @@ class FrontEndApp(object):
app_server = GeventServer(app, port=port, hostname='0.0.0.0')
return app_server
def init_proxy(self, proxy_coll, opts=None):
if not opts:
opts = {'ca_name': 'pywb HTTPS Proxy CA',
'ca_file_cache': os.path.join('proxy-certs', 'pywb-ca.pem')}
def init_proxy(self, config):
proxy_config = config.get('proxy')
if not proxy_config:
return
if isinstance(proxy_config, str):
proxy_coll = proxy_config
proxy_config = {}
else:
proxy_coll = proxy_config['coll']
if '/' in proxy_coll:
raise Exception('Proxy collection can not contain "/"')
proxy_config['ca_name'] = proxy_config.get('ca_name', self.PROXY_CA_NAME)
proxy_config['ca_file_cache'] = proxy_config.get('ca_file_cache', self.PROXY_CA_PATH)
if proxy_config.get('recording'):
logging.info('Proxy recording into collection "{0}"'.format(proxy_coll))
if proxy_coll in self.warcserver.list_fixed_routes():
raise Exception('Can not record into fixed collection')
proxy_coll += self.RECORD_ROUTE
if not config.get('recorder'):
config['recorder'] = 'live'
else:
logging.info('Proxy enabled for collection "{0}"'.format(proxy_coll))
prefix = '/{0}/bn_/'.format(proxy_coll)
return WSGIProxMiddleware(self, prefix,
proxy_host='pywb.proxy',
proxy_options=opts)
self.handler = WSGIProxMiddleware(self.handle_request, prefix,
proxy_host=proxy_config.get('host', 'pywb.proxy'),
proxy_options=proxy_config)
# ============================================================================

View File

@ -23,19 +23,21 @@ def fmod_sl(request):
# ============================================================================
class BaseConfigTest(BaseTestClass):
@classmethod
def get_test_app(cls, config_file, override=None):
def get_test_app(cls, config_file, custom_config=None):
config_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), config_file)
app = FrontEndApp(config_file=config_file, custom_config=override)
app = FrontEndApp(config_file=config_file, custom_config=custom_config)
return app, webtest.TestApp(app)
@classmethod
def setup_class(cls, config_file, include_non_frame=True):
def setup_class(cls, config_file, include_non_frame=True, custom_config=None):
super(BaseConfigTest, cls).setup_class()
cls.app, cls.testapp = cls.get_test_app(config_file)
cls.app, cls.testapp = cls.get_test_app(config_file, custom_config)
if include_non_frame:
custom_config = custom_config or {}
custom_config['framed_replay'] = False
cls.app_non_frame, cls.testapp_non_frame = cls.get_test_app(config_file,
override={'framed_replay': False})
custom_config)
@classmethod
def teardown_class(cls):

View File

@ -3,8 +3,11 @@ from pywb.warcserver.test.testutils import BaseTestClass, TempDirTests
from .base_config_test import CollsDirMixin
from pywb.utils.geventserver import GeventServer
from pywb.apps.frontendapp import FrontEndApp
from pywb.apps.cli import wayback
from pywb.manager.manager import main as manager
from mock import patch
import os
import requests
import pytest
@ -19,19 +22,22 @@ def scheme(request):
# ============================================================================
class BaseTestProxy(TempDirTests, BaseTestClass):
@classmethod
def setup_class(cls, coll='pywb', config_file='config_test.yaml'):
def setup_class(cls, coll='pywb', config_file='config_test.yaml', recording=False):
super(BaseTestProxy, cls).setup_class()
config_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), config_file)
cls.root_ca_file = os.path.join(cls.root_dir, 'pywb-ca-test.pem')
cls.app = FrontEndApp(config_file=config_file)
opts = {'ca_name': 'pywb HTTPS Proxy CA',
'ca_file_cache': cls.root_ca_file}
opts = {'ca_name': 'pywb test HTTPS Proxy CA',
'ca_file_cache': cls.root_ca_file,
'coll': coll,
'recording': recording,
}
cls.proxy_app = cls.app.init_proxy(coll, opts)
cls.app = FrontEndApp(config_file=config_file,
custom_config={'proxy': opts})
cls.server = GeventServer(cls.proxy_app)
cls.server = GeventServer(cls.app)
cls.proxies = cls.proxy_dict(cls.server.port)
@classmethod
@ -65,7 +71,7 @@ class TestProxy(BaseTestProxy):
class TestRecordingProxy(CollsDirMixin, BaseTestProxy):
@classmethod
def setup_class(cls, coll='pywb', config_file='config_test.yaml'):
super(TestRecordingProxy, cls).setup_class('test/record', 'config_test_record.yaml')
super(TestRecordingProxy, cls).setup_class('test', 'config_test_record.yaml', recording=True)
manager(['init', 'test'])
@classmethod
@ -90,7 +96,7 @@ class TestRecordingProxy(CollsDirMixin, BaseTestProxy):
def test_proxy_replay_recorded(self, scheme):
manager(['reindex', 'test'])
self.proxy_app.prefix_resolver.fixed_prefix = '/test/bn_/'
self.app.handler.prefix_resolver.fixed_prefix = '/test/bn_/'
res = requests.get('{0}://httpbin.org/'.format(scheme),
proxies=self.proxies,
@ -99,3 +105,29 @@ class TestRecordingProxy(CollsDirMixin, BaseTestProxy):
assert 'is_live = false' in res.text
assert 'httpbin(1)' in res.text
# ============================================================================
def _run_patch(self):
    """Stand-in for ``ReplayCli.run``: return the CLI object without running a server."""
    return self
@patch('pywb.apps.cli.ReplayCli.run', _run_patch)
class TestProxyCLIConfig(object):
    """Check the proxy config that the ``wayback`` CLI builds from its flags.

    ``ReplayCli.run`` is replaced by ``_run_patch``, which simply returns
    the CLI object, so each test can inspect ``extra_config`` on the value
    returned by ``wayback()``.
    """

    def test_proxy_cli(self):
        """``--proxy`` alone gives a non-recording proxy with the default CA settings."""
        res = wayback(['--proxy', 'test'])
        exp = {'ca_file_cache': os.path.join('proxy-certs', 'pywb-ca.pem'),
               'ca_name': 'pywb HTTPS Proxy CA',
               'coll': 'test',
               'recording': False}

        assert res.extra_config['proxy'] == exp

    def test_proxy_cli_rec(self):
        """``--proxy-record`` turns recording on and a 'live' collection is configured."""
        res = wayback(['--proxy', 'test', '--proxy-record'])

        # identity check: the flag must be the real boolean, not merely truthy
        assert res.extra_config['proxy']['recording'] is True
        assert res.extra_config['collections']['live'] == {'index': '$live', 'use_js_obj_proxy': True}

    def test_proxy_cli_err_coll(self):
        """A proxy collection name containing '/' is rejected."""
        with pytest.raises(Exception):
            wayback(['--proxy', 'test/foo'])

View File

@ -108,3 +108,25 @@ class TestRecordReplay(CollsDirMixin, BaseConfigTest):
assert to_path('test/indexes/autoindex.cdxj') in link_lines[4]
# ============================================================================
class TestRecordCustomConfig(CollsDirMixin, BaseConfigTest):
    """Record through an app whose recorder is configured with a dict.

    The custom config sets the recorder's source collection and a custom
    WARC ``filename_template``; the test checks the written file name.
    """

    @classmethod
    def setup_class(cls):
        """Build the test app with a dict-valued 'recorder' custom config."""
        rec_custom = {'recorder': {'source_coll': 'live',
                                   'filename_template': 'pywb-rec-test-{timestamp}.warcgz'}}

        super(TestRecordCustomConfig, cls).setup_class('config_test_record.yaml',
                                                       custom_config=rec_custom)

    def test_init_and_rec(self):
        """Init a collection, record one URL, and check the archive file name."""
        manager(['init', 'test-new'])
        dir_name = os.path.join(self.root_dir, '_test_colls', 'test-new', 'archive')
        assert os.path.isdir(dir_name)

        res = self.testapp.get('/test-new/record/mp_/http://httpbin.org/get?A=B')
        assert '"A": "B"' in res.text

        # exactly one file is expected, named per the custom filename_template
        names = os.listdir(dir_name)
        assert len(names) == 1
        assert names[0].startswith('pywb-rec-test-')
        assert names[0].endswith('.warcgz')