1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

new-wayback cli script, using new FrontEndApp (rewriting) + AutoConfigApp (config-driven aggregator)

support for dynamic collections: accessing /<coll> checks all .cdxj files in <coll>/indexes/
support for fixed routes: specified in config.yaml as per https://github.com/ikreymer/pywb/wiki/Distributed-Archive-Config
werkzeug routing in FrontEndApp: default query, replay, search pages working
route listing: /_coll_info.json for listing fixed + dynamic routes
autoindexing enabled: WARCs added to a collection's archive directory are indexed into its .cdxj index
Addresses #196
This commit is contained in:
Ilya Kreymer 2017-02-17 18:04:07 -08:00
parent 531422fc1b
commit 31bf7a47f1
13 changed files with 270 additions and 123 deletions

View File

@ -1,4 +1,6 @@
from argparse import ArgumentParser from argparse import ArgumentParser
import logging
#================================================================= #=================================================================
def cdx_server(args=None): #pragma: no cover def cdx_server(args=None): #pragma: no cover
@ -26,6 +28,11 @@ def webagg():
WebaggCli().run() WebaggCli().run()
#=============================================================================
def new_wayback():
    """Console entry point: build a NewWaybackCli and run it."""
    cli = NewWaybackCli()
    cli.run()
#============================================================================= #=============================================================================
class BaseCli(object): class BaseCli(object):
def __init__(self, args=None, default_port=8080, desc=''): def __init__(self, args=None, default_port=8080, desc=''):
@ -33,6 +40,7 @@ class BaseCli(object):
parser.add_argument('-p', '--port', type=int, default=default_port) parser.add_argument('-p', '--port', type=int, default=default_port)
parser.add_argument('-t', '--threads', type=int, default=4) parser.add_argument('-t', '--threads', type=int, default=4)
parser.add_argument('-s', '--server', default='gevent') parser.add_argument('-s', '--server', default='gevent')
parser.add_argument('--debug', action='store_true')
self.desc = desc self.desc = desc
@ -40,12 +48,15 @@ class BaseCli(object):
self.r = parser.parse_args(args) self.r = parser.parse_args(args)
logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s',
level=logging.DEBUG if self.r.debug else logging.INFO)
if self.r.server == 'gevent': if self.r.server == 'gevent':
try: try:
from gevent.monkey import patch_all; patch_all() from gevent.monkey import patch_all; patch_all()
print('Using Gevent') logging.debug('Using Gevent')
except: except:
print('No Gevent') logging.debug('No Gevent')
self.r.server = 'wsgiref' self.r.server = 'wsgiref'
from pywb.framework.wsgi_wrappers import init_app from pywb.framework.wsgi_wrappers import init_app
@ -69,7 +80,7 @@ class BaseCli(object):
def run_waitress(self): #pragma: no cover def run_waitress(self): #pragma: no cover
from waitress import serve from waitress import serve
print(self.desc) logging.debug(str(self.desc))
serve(self.application, port=self.r.port, threads=self.r.threads) serve(self.application, port=self.r.port, threads=self.r.threads)
def run_wsgiref(self): #pragma: no cover def run_wsgiref(self): #pragma: no cover
@ -78,7 +89,7 @@ class BaseCli(object):
def run_gevent(self): def run_gevent(self):
from gevent.pywsgi import WSGIServer from gevent.pywsgi import WSGIServer
print('Starting Gevent Server on ' + str(self.r.port)) logging.info('Starting Gevent Server on ' + str(self.r.port))
WSGIServer(('', self.r.port), self.application).serve_forever() WSGIServer(('', self.r.port), self.application).serve_forever()
@ -105,6 +116,7 @@ class LiveCli(BaseCli):
class ReplayCli(BaseCli): class ReplayCli(BaseCli):
def _extend_parser(self, parser): def _extend_parser(self, parser):
parser.add_argument('-a', '--autoindex', action='store_true') parser.add_argument('-a', '--autoindex', action='store_true')
parser.add_argument('--auto-interval', type=int, default=30)
help_dir='Specify root archive dir (default is current working directory)' help_dir='Specify root archive dir (default is current working directory)'
parser.add_argument('-d', '--directory', help=help_dir) parser.add_argument('-d', '--directory', help=help_dir)
@ -118,7 +130,6 @@ class ReplayCli(BaseCli):
if self.r.autoindex: if self.r.autoindex:
from pywb.manager.manager import CollectionsManager from pywb.manager.manager import CollectionsManager
import os import os
import logging
m = CollectionsManager('', must_exist=False) m = CollectionsManager('', must_exist=False)
if not os.path.isdir(m.colls_dir): if not os.path.isdir(m.colls_dir):
@ -127,12 +138,13 @@ class ReplayCli(BaseCli):
import sys import sys
sys.exit(2) sys.exit(2)
else: else:
msg = 'Auto-Indexing Enabled on "{0}"' msg = 'Auto-Indexing Enabled on "{0}", checking every {1} secs'
logging.info(msg.format(m.colls_dir)) logging.info(msg.format(m.colls_dir, self.r.auto_interval))
m.autoindex(do_loop=False) m.autoindex(interval=self.r.auto_interval, do_loop=False)
super(ReplayCli, self).run() super(ReplayCli, self).run()
#============================================================================= #=============================================================================
class CdxCli(ReplayCli): #pragma: no cover class CdxCli(ReplayCli): #pragma: no cover
def load(self): def load(self):
@ -161,6 +173,18 @@ class WebaggCli(BaseCli):
self.run_gevent() self.run_gevent()
#=============================================================================
class NewWaybackCli(ReplayCli):
    """Command-line runner for the application in pywb.apps.newwayback."""

    def load(self):
        """Import pywb.apps.newwayback on first use and return its application."""
        import pywb.apps.newwayback as newwayback_module
        return newwayback_module.application

    def run(self):
        """Set the parsed server option to 'gevent', then defer to ReplayCli.run()."""
        # Overrides whatever -s/--server value was parsed.
        self.r.server = 'gevent'
        super(NewWaybackCli, self).run()
#============================================================================= #=============================================================================
if __name__ == "__main__": if __name__ == "__main__":
wayback() wayback()

6
pywb/apps/newwayback.py Normal file
View File

@ -0,0 +1,6 @@
# gevent monkey-patching is applied first, before the pywb import below.
from gevent.monkey import patch_all; patch_all()
from pywb.urlrewrite.frontendapp import FrontEndApp
# Module-level WSGI application, built with FrontEndApp's default arguments.
application = FrontEndApp()

View File

@ -1,6 +1,6 @@
from gevent.monkey import patch_all; patch_all() from gevent.monkey import patch_all; patch_all()
from pywb.webagg.autoapp import AutoConfigApp from pywb.webagg.autoapp import AutoConfigApp
application = AutoConfigApp().init() application = AutoConfigApp()

View File

@ -4,6 +4,7 @@ from pywb.utils.loaders import extract_post_query, append_post_query
from io import BytesIO from io import BytesIO
import pprint import pprint
import re import re
import json
#================================================================= #=================================================================
@ -246,6 +247,10 @@ class WbResponse(object):
return WbResponse(status_headers, value=[encoded_text]) return WbResponse(status_headers, value=[encoded_text])
@staticmethod
def json_response(obj, status='200 OK', content_type='application/json; charset=utf-8'):
    """Serialize ``obj`` with json.dumps and wrap it via WbResponse.text_response.

    ``status`` and ``content_type`` are forwarded positionally, unchanged.
    """
    body = json.dumps(obj)
    return WbResponse.text_response(body, status, content_type)
@staticmethod @staticmethod
def redir_response(location, status='302 Redirect', headers=None): def redir_response(location, status='302 Redirect', headers=None):
redir_headers = [('Location', location), ('Content-Length', '0')] redir_headers = [('Location', location), ('Content-Length', '0')]

View File

@ -133,10 +133,6 @@ DEFAULT_CONFIG_FILE = 'config.yaml'
#================================================================= #=================================================================
def init_app(init_func, load_yaml=True, config_file=None, config=None): def init_app(init_func, load_yaml=True, config_file=None, config=None):
logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s',
level=logging.DEBUG)
logging.debug('')
try: try:
config = config or {} config = config or {}
if load_yaml: if load_yaml:

View File

@ -0,0 +1,123 @@
from gevent.monkey import patch_all; patch_all()
#from bottle import run, Bottle, request, response, debug
from werkzeug.routing import Map, Rule
from werkzeug.exceptions import HTTPException
from werkzeug.wsgi import pop_path_info
from pywb.webagg.autoapp import AutoConfigApp
from pywb.webapp.handlers import StaticHandler
from pywb.framework.wbrequestresponse import WbResponse
from pywb.urlrewrite.geventserver import GeventServer
from pywb.urlrewrite.templateview import BaseInsertView
from pywb.urlrewrite.rewriterapp import RewriterApp, UpstreamException
import traceback
# ============================================================================
class NewWbRequest(object):
    """Minimal request holder handed to the static handler and to templates.

    Keeps the WSGI environ, the wayback url string and the full prefix
    exactly as given, plus a fresh, empty ``user_metadata`` dict.
    """

    def __init__(self, env, wb_url_str, full_prefix):
        self.user_metadata = dict()
        self.full_prefix = full_prefix
        self.wb_url_str = wb_url_str
        self.env = env
# ============================================================================
class FrontEndApp(RewriterApp):
    """WSGI front end that routes requests with werkzeug and replays content
    through an AutoConfigApp aggregator running on its own GeventServer.
    """

    def __init__(self, config_file='./config.yaml', custom_config=None):
        """Create the aggregator, start it on a local server and build routes.

        :param config_file: forwarded unchanged to AutoConfigApp
        :param custom_config: forwarded unchanged to AutoConfigApp
        """
        super(FrontEndApp, self).__init__(True)

        self.debug = True

        self.webagg = AutoConfigApp(config_file=config_file,
                                    custom_config=custom_config)

        # port=0 is requested here; the port that was actually bound is read
        # back from webagg_server.port when the upstream urls are built below
        self.webagg_server = GeventServer(self.webagg, port=0)

        self.static_handler = StaticHandler('pywb/static/')

        self.url_map = Map()
        self.url_map.add(Rule('/static/__pywb/<path:filepath>', endpoint=self.serve_static))
        self.url_map.add(Rule('/<coll>/', endpoint=self.serve_coll_page))
        self.url_map.add(Rule('/<coll>/<path:url>', endpoint=self.serve_content))
        self.url_map.add(Rule('/_coll_info.json', endpoint=self.serve_listing))

        self.paths = self.get_upstream_paths(self.webagg_server.port)

    def get_upstream_paths(self, port):
        """Return the upstream url templates, keyed by replay type.

        ``{coll}`` is left as a placeholder in both templates; only the
        port is filled in here.
        """
        return {'replay-dyn': 'http://localhost:%s/_/resource/postreq?param.coll={coll}' % port,
                'replay-fixed': 'http://localhost:%s/{coll}/resource/postreq' % port
               }

    def serve_static(self, environ, filepath=''):
        """Serve ``filepath`` through the static handler."""
        return self.static_handler(NewWbRequest(environ, filepath, ''))

    def serve_coll_page(self, environ, coll):
        """Render the 'search.html' template as the collection page.

        NOTE(review): ``coll`` is not passed to the template here.
        """
        view = BaseInsertView(self.jinja_env, 'search.html')

        wbrequest = NewWbRequest(environ, '', '/')
        return WbResponse.text_response(view.render_to_string(environ, wbrequest=wbrequest),
                                        content_type='text/html; charset="utf-8"')

    def serve_listing(self, environ):
        """Return a JSON listing of the fixed and dynamic routes."""
        result = {'fixed': self.webagg.list_fixed_routes(),
                  'dynamic': self.webagg.list_dynamic_routes()
                 }

        return WbResponse.json_response(result)

    def serve_content(self, environ, coll='', url=''):
        """Replay content for ``coll``, choosing the fixed or dynamic upstream.

        ``url`` is captured by the route but not used directly; the wayback
        url is derived from the environ after the first path segment is popped.
        """
        pop_path_info(environ)
        wb_url = self.get_wburl(environ)

        is_fixed = coll in self.webagg.list_fixed_routes()
        kwargs = {'coll': coll,
                  'type': 'replay-fixed' if is_fixed else 'replay-dyn'}

        try:
            response = self.render_content(wb_url, kwargs, environ)
        except UpstreamException as ue:
            response = self.handle_error(environ, ue)

        return response

    def __call__(self, environ, start_response):
        """WSGI entry point: match the route, call its endpoint, send the result.

        Routing failures are answered with werkzeug's own HTTPException
        response. Any other exception is answered with a plain-text 500
        response instead of returning None, which is not a valid WSGI result.
        """
        urls = self.url_map.bind_to_environ(environ)
        try:
            endpoint, args = urls.match()
        except HTTPException as e:
            return e(environ, start_response)

        try:
            response = endpoint(environ, **args)
            return response(environ, start_response)

        except Exception as e:
            message = 'Internal Error'
            if self.debug:
                traceback.print_exc()
                # only expose exception details while debugging
                message += ': ' + str(e)

            error = WbResponse.text_response(message,
                                             '500 Internal Server Error',
                                             'text/plain; charset=utf-8')
            return error(environ, start_response)

    @classmethod
    def create_app(cls, port):
        """Create an app instance and return a GeventServer serving it.

        Uses ``cls`` so that subclasses get an instance of themselves.
        """
        app = cls()
        app_server = GeventServer(app, port=port, hostname='0.0.0.0')
        return app_server
# ============================================================================
if __name__ == "__main__":
    # Standalone mode: serve the front end on port 8080.
    app_server = FrontEndApp.create_app(port=8080)
    # join() waits on the greenlet that runs the server.
    app_server.join()

View File

@ -0,0 +1,36 @@
from gevent.wsgi import WSGIServer
from gevent import spawn
import logging
# ============================================================================
class GeventServer(object):
    """Run a WSGI app on a gevent WSGIServer inside a spawned greenlet."""

    def __init__(self, app, port=0, hostname='localhost', handler_class=None):
        """Create the server socket and start serving in the background.

        :param app: WSGI application to serve
        :param port: port to bind; ``self.port`` is replaced with the port
            reported by the server once its socket is initialised
        :param hostname: interface to bind
        :param handler_class: forwarded to WSGIServer
        """
        self.port = port
        # defined before make_server() so stop() and join() never hit a
        # missing attribute
        self.server = None
        self.ge = None
        self.make_server(app, port, hostname, handler_class)

    def stop(self):
        """Stop the server if one was created."""
        if self.server:
            logging.debug('stopping server on ' + str(self.port))
            self.server.stop()

    def _run(self, server, port):
        """Greenlet body: serve until stopped, logging any failure."""
        logging.debug('starting server on ' + str(port))
        try:
            server.serve_forever()
        except Exception:
            # logging.exception records the traceback itself; the previous
            # traceback.print_exc() call raised NameError because this
            # module never imports traceback
            logging.exception('server failed to start on ' + str(port))

    def make_server(self, app, port, hostname, handler_class):
        """Bind the socket, record the bound port and spawn the serving greenlet."""
        server = WSGIServer((hostname, port), app, handler_class=handler_class)
        server.init_socket()
        self.port = server.address[1]
        self.server = server
        self.ge = spawn(self._run, server, self.port)

    def join(self):
        """Wait for the serving greenlet to finish, if one was spawned."""
        if self.ge is not None:
            self.ge.join()

View File

@ -41,7 +41,7 @@ class RewriterApp(object):
self.loader = ArcWarcRecordLoader() self.loader = ArcWarcRecordLoader()
config = config or {} config = config or {}
self.paths = config['url_templates'] self.paths = {}
self.framed_replay = framed_replay self.framed_replay = framed_replay
self.frame_mod = '' self.frame_mod = ''
@ -395,13 +395,14 @@ class RewriterApp(object):
def get_base_url(self, wb_url, kwargs): def get_base_url(self, wb_url, kwargs):
type = kwargs.get('type') type = kwargs.get('type')
return self.paths[type] return self.paths[type].format(**kwargs)
def get_upstream_url(self, wb_url, kwargs, params): def get_upstream_url(self, wb_url, kwargs, params):
base_url = self.get_base_url(wb_url, kwargs) base_url = self.get_base_url(wb_url, kwargs)
param_str = urlencode(params, True) param_str = urlencode(params, True)
if param_str: if param_str:
base_url += '&' + param_str q_char = '&' if '?' in base_url else '?'
base_url += q_char + param_str
return base_url return base_url
def get_cookie_key(self, kwargs): def get_cookie_key(self, kwargs):

View File

@ -1,74 +0,0 @@
from gevent.monkey import patch_all; patch_all()
from bottle import run, Bottle, request, response, debug
from six.moves.urllib.parse import quote
from pywb.utils.loaders import LocalFileLoader
import mimetypes
import redis
from pywb.urlrewrite.rewriterapp import RewriterApp
from pywb.urlrewrite.cookies import CookieTracker
# ============================================================================
class RWApp(RewriterApp):
    """Bottle-based rewriter app with live, record and replay mounts."""

    def __init__(self, upstream_urls, cookie_key_templ, redis):
        """Set up the bottle app, routes, cookie tracker and error handler.

        :param upstream_urls: stored as config['url_templates']
        :param cookie_key_templ: format template used by get_cookie_key()
        :param redis: client object handed to CookieTracker
        """
        config = {'url_templates': upstream_urls}

        self.cookie_key_templ = cookie_key_templ

        self.app = Bottle()
        self.block_loader = LocalFileLoader()
        self.init_routes()

        super(RWApp, self).__init__(True, config=config)

        self.cookie_tracker = CookieTracker(redis)

        # keep the previous handler so err_handler() can delegate to it
        self.orig_error_handler = self.app.default_error_handler
        self.app.default_error_handler = self.err_handler

    def err_handler(self, exc):
        """Print the error and its traceback, then call the original handler."""
        print(exc)
        import traceback
        traceback.print_exc()
        return self.orig_error_handler(exc)

    def get_cookie_key(self, kwargs):
        """Format the cookie key template with ``kwargs``."""
        return self.cookie_key_templ.format(**kwargs)

    def init_routes(self):
        """Register the static file route and the three mounted sub-apps."""
        @self.app.get('/static/__pywb/<filepath:path>')
        def server_static(filepath):
            data = self.block_loader.load('pywb/static/' + filepath)
            mime_type = mimetypes.guess_type(filepath)[0]
            if mime_type:
                response.headers['Content-Type'] = mime_type

            return data

        mounts = (('/live/', 'live'),
                  ('/record/', 'record'),
                  ('/replay/', 'replay'))
        for prefix, kind in mounts:
            self.app.mount(prefix, self.call_with_params(type=kind))

    @staticmethod
    def create_app(replay_port=8080, record_port=8010):
        """Build an RWApp wired to local upstream servers and a local redis."""
        upstream_urls = {'live': 'http://localhost:%s/live/resource/postreq?' % replay_port,
                         'record': 'http://localhost:%s/live/resource/postreq?' % record_port,
                         'replay': 'http://localhost:%s/replay/resource/postreq?' % replay_port,
                        }

        redis_client = redis.StrictRedis.from_url('redis://localhost/2')
        return RWApp(upstream_urls, 'cookies:', redis_client)
# ============================================================================
if __name__ == "__main__":
    # Standalone mode: build the app with its default ports.
    application = RWApp.create_app()
    # Run the bottle app on port 8090 with the gevent server.
    application.app.run(port=8090, server='gevent')

View File

@ -1,24 +1,26 @@
from gevent import monkey; monkey.patch_all(thread=False)
from pywb.webagg.test.testutils import LiveServerTests, BaseTestClass from pywb.webagg.test.testutils import LiveServerTests, BaseTestClass
from pywb.webagg.test.testutils import FakeRedisTests from pywb.webagg.test.testutils import FakeRedisTests
from .simpleapp import RWApp, debug from pywb.urlrewrite.frontendapp import FrontEndApp
import os import os
import webtest import webtest
class TestRewriter(LiveServerTests, FakeRedisTests, BaseTestClass): LIVE_CONFIG = {'collections': {'live': '$live'}}
class TestRewriter(FakeRedisTests, BaseTestClass):
@classmethod @classmethod
def setup_class(cls): def setup_class(cls):
super(TestRewriter, cls).setup_class() super(TestRewriter, cls).setup_class()
#cls.upstream_url = 'http://localhost:{0}'.format(cls.server.port)
#cls.upstream_url += '/{type}/resource/postreq?url={url}&closest={closest}'
#cls.app = RWApp(cls.upstream_url)
cls.app = RWApp.create_app(replay_port=cls.server.port) #cls.app = RWApp.create_app(replay_port=cls.server.port)
cls.testapp = webtest.TestApp(cls.app.app) #cls.testapp = webtest.TestApp(cls.app.app)
debug(True) cls.testapp = webtest.TestApp(FrontEndApp(custom_config=LIVE_CONFIG,
config_file=None))
def test_replay(self): def test_replay(self):
resp = self.testapp.get('/live/mp_/http://example.com/') resp = self.testapp.get('/live/mp_/http://example.com/')
@ -36,8 +38,8 @@ class TestRewriter(LiveServerTests, FakeRedisTests, BaseTestClass):
assert 'wbinfo.capture_url = "http://example.com/"' in resp.text assert 'wbinfo.capture_url = "http://example.com/"' in resp.text
def test_cookie_track_1(self): #def test_cookie_track_1(self):
resp = self.testapp.get('/live/mp_/https://twitter.com/') # resp = self.testapp.get('/live/mp_/https://twitter.com/')
assert resp.headers['set-cookie'] != None # assert resp.headers['set-cookie'] != None

View File

@ -31,34 +31,37 @@ SOURCE_LIST = [LiveIndexSource,
# ============================================================================ # ============================================================================
class AutoConfigApp(ResAggApp): class AutoConfigApp(ResAggApp):
def __init__(self, config_file='./config.yaml'): AUTO_DIR_INDEX_PATH = '{coll}/indexes/'
AUTO_DIR_ARCHIVE_PATH = '{coll}/archive/'
def __init__(self, config_file='./config.yaml', custom_config=None):
config = load_yaml_config(DEFAULT_CONFIG) config = load_yaml_config(DEFAULT_CONFIG)
if config_file:
try: try:
new_config = load_config('PYWB_CONFIG_FILE', config_file) custom_config = load_config('PYWB_CONFIG_FILE', config_file)
except Exception as e: except Exception as e:
new_config = {} if not custom_config:
custom_config = {'debug': True}
print(e) print(e)
if new_config: if custom_config:
config.update(new_config) config.update(custom_config)
super(AutoConfigApp, self).__init__(debug=config.get('debug', False)) super(AutoConfigApp, self).__init__(debug=config.get('debug', False))
self.config = config self.config = config
def init(self):
if self.config.get('enable_auto_colls', True): if self.config.get('enable_auto_colls', True):
auto_handler = self.load_auto_colls() auto_handler = self.load_auto_colls()
self.add_route('/_', auto_handler) self.add_route('/_', auto_handler)
routes = self.load_colls() self.fixed_routes = self.load_colls()
for name, route in iteritems(routes):
for name, route in iteritems(self.fixed_routes):
self.add_route('/' + name, route) self.add_route('/' + name, route)
self._add_simple_route('/<coll>-cdx', self.cdx_compat) self._add_simple_route('/<coll>-cdx', self.cdx_compat)
return self
def _lookup(self, environ, path): def _lookup(self, environ, path):
urls = self.url_map.bind(environ['HTTP_HOST'], path_info=path) urls = self.url_map.bind(environ['HTTP_HOST'], path_info=path)
@ -82,21 +85,37 @@ class AutoConfigApp(ResAggApp):
return result return result
def load_auto_colls(self): def load_auto_colls(self):
root_dir = self.config.get('collections_root', '') self.root_dir = self.config.get('collections_root', '')
if not root_dir: if not self.root_dir:
print('No Root Dir, Skip Auto Colls!') print('No Root Dir, Skip Auto Colls!')
return return
indexes_templ = os.path.join('{coll}', 'indexes') + os.path.sep #indexes_templ = os.path.join('{coll}', 'indexes') + os.path.sep
dir_source = CacheDirectoryIndexSource(root_dir, indexes_templ) indexes_templ = self.AUTO_DIR_INDEX_PATH.replace('/', os.path.sep)
dir_source = CacheDirectoryIndexSource(self.root_dir, indexes_templ)
archive_templ = self.config.get('archive_paths') archive_templ = self.config.get('archive_paths')
if not archive_templ: if not archive_templ:
archive_templ = os.path.join('.', root_dir, '{coll}', 'archive') + os.path.sep archive_templ = self.AUTO_DIR_ARCHIVE_PATH.replace('/', os.path.sep)
archive_templ = os.path.join(self.root_dir, archive_templ)
#archive_templ = os.path.join('.', root_dir, '{coll}', 'archive') + os.path.sep
handler = DefaultResourceHandler(dir_source, archive_templ) handler = DefaultResourceHandler(dir_source, archive_templ)
return handler return handler
def list_fixed_routes(self):
    """Return the names of the fixed routes as a new list."""
    return [name for name in self.fixed_routes]
def list_dynamic_routes(self):
    """Return the sorted names of the directories under the collections root.

    Returns an empty list when no root dir is configured (including when
    load_auto_colls() never ran, so ``root_dir`` was never set) or when
    the root dir cannot be listed.
    """
    root_dir = getattr(self, 'root_dir', None)
    if not root_dir:
        return []

    try:
        names = os.listdir(root_dir)
    except (IOError, OSError):
        # os.listdir raises OSError, which IOError alone does not cover
        # on Python 2
        return []

    # Only directories can hold '<coll>/indexes/'; sorting gives a stable
    # order because os.listdir returns entries in arbitrary order.
    return sorted(name for name in names
                  if os.path.isdir(os.path.join(root_dir, name)))
def load_colls(self): def load_colls(self):
routes = {} routes = {}

View File

@ -17,6 +17,10 @@ class TestAutoConfigApp(TempDirTests, BaseTestClass):
os.mkdir('./local') os.mkdir('./local')
os.mkdir('./local/indexes') os.mkdir('./local/indexes')
os.mkdir('collections')
os.mkdir('collections/auto1')
os.mkdir('collections/auto2')
with open(os.path.join('local', 'indexes', 'file.cdxj'), 'a') as fh: with open(os.path.join('local', 'indexes', 'file.cdxj'), 'a') as fh:
fh.write('foo') fh.write('foo')
@ -28,8 +32,6 @@ class TestAutoConfigApp(TempDirTests, BaseTestClass):
cls.loader = AutoConfigApp(os.path.join(cls.get_curr_dir(), 'test_autoapp.yaml')) cls.loader = AutoConfigApp(os.path.join(cls.get_curr_dir(), 'test_autoapp.yaml'))
cls.colls = cls.loader.load_colls()
@classmethod @classmethod
def teardown_class(cls): def teardown_class(cls):
os.chdir(cls.orig_cwd) os.chdir(cls.orig_cwd)
@ -41,11 +43,17 @@ class TestAutoConfigApp(TempDirTests, BaseTestClass):
def _get_sources(self, coll_name='', handler=None): def _get_sources(self, coll_name='', handler=None):
if not handler: if not handler:
handler = self.colls.get(coll_name) handler = self.loader.fixed_routes.get(coll_name)
assert isinstance(handler, ResourceHandler) assert isinstance(handler, ResourceHandler)
assert isinstance(handler.index_source, BaseSourceListAggregator) assert isinstance(handler.index_source, BaseSourceListAggregator)
return handler.index_source.sources return handler.index_source.sources
def test_list_static(self):
    """The loader reports twelve fixed routes."""
    fixed_routes = self.loader.list_fixed_routes()
    assert len(fixed_routes) == 12
def test_list_dynamic(self):
    """The loader reports both auto collections, regardless of listing order."""
    # list_dynamic_routes() is backed by os.listdir, whose order is
    # arbitrary, so compare a sorted copy
    assert sorted(self.loader.list_dynamic_routes()) == ['auto1', 'auto2']
def test_remote_cdx(self): def test_remote_cdx(self):
sources = self._get_sources('ait') sources = self._get_sources('ait')
assert isinstance(sources['ait'], RemoteIndexSource) assert isinstance(sources['ait'], RemoteIndexSource)
@ -90,7 +98,7 @@ class TestAutoConfigApp(TempDirTests, BaseTestClass):
assert isinstance(sources['local_file'], FileIndexSource) assert isinstance(sources['local_file'], FileIndexSource)
def test_sequence(self): def test_sequence(self):
seq = self.colls.get('many_seq') seq = self.loader.fixed_routes.get('many_seq')
assert isinstance(seq, HandlerSeq) assert isinstance(seq, HandlerSeq)
assert len(seq.handlers) == 3 assert len(seq.handlers) == 3

View File

@ -107,6 +107,7 @@ setup(
cdx-indexer = pywb.warc.cdxindexer:main cdx-indexer = pywb.warc.cdxindexer:main
wb-manager = pywb.manager.manager:main_wrap_exc wb-manager = pywb.manager.manager:main_wrap_exc
webagg-server = pywb.apps.cli:webagg webagg-server = pywb.apps.cli:webagg
new-wayback = pywb.apps.cli:new_wayback
""", """,
classifiers=[ classifiers=[
'Development Status :: 4 - Beta', 'Development Status :: 4 - Beta',