1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

new-wayback cli script, using new FrontEndApp (rewriting) + AutoConfigApp (config-driven aggregator)

support for dynamic collections: check all .cdxj files in /<coll>/indexes/*.cdxj when accessing /<coll>
support for fixed routes: specified in config.yaml as per https://github.com/ikreymer/pywb/wiki/Distributed-Archive-Config
werkzeug routing in FrontEndApp: default query, replay, search pages working
route listing: /_coll_info.json for listing fixed + dynamic routes
autoindexing enabled: WARCs added to the archive directory are indexed into the .cdxj index
Addresses #196
This commit is contained in:
Ilya Kreymer 2017-02-17 18:04:07 -08:00
parent 531422fc1b
commit 31bf7a47f1
13 changed files with 270 additions and 123 deletions

View File

@ -1,4 +1,6 @@
from argparse import ArgumentParser
import logging
#=================================================================
def cdx_server(args=None): #pragma: no cover
@ -26,6 +28,11 @@ def webagg():
WebaggCli().run()
#=============================================================================
def new_wayback():
    """Entry point for the ``new-wayback`` script: build the CLI and run it."""
    cli = NewWaybackCli()
    cli.run()
#=============================================================================
class BaseCli(object):
def __init__(self, args=None, default_port=8080, desc=''):
@ -33,6 +40,7 @@ class BaseCli(object):
parser.add_argument('-p', '--port', type=int, default=default_port)
parser.add_argument('-t', '--threads', type=int, default=4)
parser.add_argument('-s', '--server', default='gevent')
parser.add_argument('--debug', action='store_true')
self.desc = desc
@ -40,12 +48,15 @@ class BaseCli(object):
self.r = parser.parse_args(args)
logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s',
level=logging.DEBUG if self.r.debug else logging.INFO)
if self.r.server == 'gevent':
try:
from gevent.monkey import patch_all; patch_all()
print('Using Gevent')
logging.debug('Using Gevent')
except:
print('No Gevent')
logging.debug('No Gevent')
self.r.server = 'wsgiref'
from pywb.framework.wsgi_wrappers import init_app
@ -69,7 +80,7 @@ class BaseCli(object):
def run_waitress(self): #pragma: no cover
from waitress import serve
print(self.desc)
logging.debug(str(self.desc))
serve(self.application, port=self.r.port, threads=self.r.threads)
def run_wsgiref(self): #pragma: no cover
@ -78,7 +89,7 @@ class BaseCli(object):
def run_gevent(self):
from gevent.pywsgi import WSGIServer
print('Starting Gevent Server on ' + str(self.r.port))
logging.info('Starting Gevent Server on ' + str(self.r.port))
WSGIServer(('', self.r.port), self.application).serve_forever()
@ -105,6 +116,7 @@ class LiveCli(BaseCli):
class ReplayCli(BaseCli):
def _extend_parser(self, parser):
parser.add_argument('-a', '--autoindex', action='store_true')
parser.add_argument('--auto-interval', type=int, default=30)
help_dir='Specify root archive dir (default is current working directory)'
parser.add_argument('-d', '--directory', help=help_dir)
@ -118,7 +130,6 @@ class ReplayCli(BaseCli):
if self.r.autoindex:
from pywb.manager.manager import CollectionsManager
import os
import logging
m = CollectionsManager('', must_exist=False)
if not os.path.isdir(m.colls_dir):
@ -127,12 +138,13 @@ class ReplayCli(BaseCli):
import sys
sys.exit(2)
else:
msg = 'Auto-Indexing Enabled on "{0}"'
logging.info(msg.format(m.colls_dir))
m.autoindex(do_loop=False)
msg = 'Auto-Indexing Enabled on "{0}", checking every {1} secs'
logging.info(msg.format(m.colls_dir, self.r.auto_interval))
m.autoindex(interval=self.r.auto_interval, do_loop=False)
super(ReplayCli, self).run()
#=============================================================================
class CdxCli(ReplayCli): #pragma: no cover
def load(self):
@ -161,6 +173,18 @@ class WebaggCli(BaseCli):
self.run_gevent()
#=============================================================================
class NewWaybackCli(ReplayCli):
    """Replay CLI that serves the application from ``pywb.apps.newwayback``."""

    def load(self):
        """Import the newwayback module at call time and return its application."""
        from pywb.apps import newwayback
        return newwayback.application

    def run(self):
        """Select the gevent server, then hand off to ``ReplayCli.run()``."""
        # Replaces whatever value was parsed for the -s/--server option.
        self.r.server = 'gevent'
        super(NewWaybackCli, self).run()
#=============================================================================
# Script entry point: invoke the wayback() command when this module is run directly.
if __name__ == "__main__":
wayback()

6
pywb/apps/newwayback.py Normal file
View File

@ -0,0 +1,6 @@
# gevent monkey-patching is applied first, before the application module is imported.
from gevent.monkey import patch_all; patch_all()
from pywb.urlrewrite.frontendapp import FrontEndApp
# Module-level application object, built with FrontEndApp's default arguments.
application = FrontEndApp()

View File

@ -1,6 +1,6 @@
from gevent.monkey import patch_all; patch_all()
from pywb.webagg.autoapp import AutoConfigApp
application = AutoConfigApp().init()
application = AutoConfigApp()

View File

@ -4,6 +4,7 @@ from pywb.utils.loaders import extract_post_query, append_post_query
from io import BytesIO
import pprint
import re
import json
#=================================================================
@ -246,6 +247,10 @@ class WbResponse(object):
return WbResponse(status_headers, value=[encoded_text])
@staticmethod
def json_response(obj, status='200 OK', content_type='application/json; charset=utf-8'):
    """Serialize ``obj`` with ``json.dumps`` and wrap it via ``WbResponse.text_response``.

    :param obj: value to serialize
    :param status: HTTP status line passed through to ``text_response``
    :param content_type: content type passed through to ``text_response``
    """
    payload = json.dumps(obj)
    return WbResponse.text_response(payload, status, content_type)
@staticmethod
def redir_response(location, status='302 Redirect', headers=None):
redir_headers = [('Location', location), ('Content-Length', '0')]

View File

@ -133,10 +133,6 @@ DEFAULT_CONFIG_FILE = 'config.yaml'
#=================================================================
def init_app(init_func, load_yaml=True, config_file=None, config=None):
logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s',
level=logging.DEBUG)
logging.debug('')
try:
config = config or {}
if load_yaml:

View File

@ -0,0 +1,123 @@
from gevent.monkey import patch_all; patch_all()
#from bottle import run, Bottle, request, response, debug
from werkzeug.routing import Map, Rule
from werkzeug.exceptions import HTTPException
from werkzeug.wsgi import pop_path_info
from pywb.webagg.autoapp import AutoConfigApp
from pywb.webapp.handlers import StaticHandler
from pywb.framework.wbrequestresponse import WbResponse
from pywb.urlrewrite.geventserver import GeventServer
from pywb.urlrewrite.templateview import BaseInsertView
from pywb.urlrewrite.rewriterapp import RewriterApp, UpstreamException
import traceback
# ============================================================================
class NewWbRequest(object):
    """Plain holder for the per-request values passed to handlers and views."""

    def __init__(self, env, wb_url_str, full_prefix):
        """Store the given values and start with an empty metadata dict.

        :param env: stored as ``self.env``
        :param wb_url_str: stored as ``self.wb_url_str``
        :param full_prefix: stored as ``self.full_prefix``
        """
        # A fresh dict per instance, never shared between requests.
        self.user_metadata = {}
        self.full_prefix = full_prefix
        self.wb_url_str = wb_url_str
        self.env = env
# ============================================================================
class FrontEndApp(RewriterApp):
    """WSGI front end that routes requests with werkzeug and replays content
    by way of an AutoConfigApp instance served on a local GeventServer.
    """

    def __init__(self, config_file='./config.yaml', custom_config=None):
        """Create the aggregator app, start its server and build the url map.

        :param config_file: forwarded to AutoConfigApp
        :param custom_config: forwarded to AutoConfigApp
        """
        super(FrontEndApp, self).__init__(True)

        # NOTE(review): debug is hard-coded on; while it is True, tracebacks
        # are printed and exception text is included in 500 responses.
        self.debug = True

        self.webagg = AutoConfigApp(config_file=config_file,
                                    custom_config=custom_config)

        # Requested with port=0; the port to use is read back from the
        # server object when the upstream paths are built below.
        self.webagg_server = GeventServer(self.webagg, port=0)

        self.static_handler = StaticHandler('pywb/static/')

        # Endpoints are the bound methods themselves; __call__ invokes them
        # directly with the matched url arguments.
        self.url_map = Map()
        self.url_map.add(Rule('/static/__pywb/<path:filepath>', endpoint=self.serve_static))
        self.url_map.add(Rule('/<coll>/', endpoint=self.serve_coll_page))
        self.url_map.add(Rule('/<coll>/<path:url>', endpoint=self.serve_content))
        self.url_map.add(Rule('/_coll_info.json', endpoint=self.serve_listing))

        self.paths = self.get_upstream_paths(self.webagg_server.port)

    def get_upstream_paths(self, port):
        """Return the upstream url templates, keyed by replay type.

        Only the port is filled in here; the ``{coll}`` placeholder is left
        in each template for later formatting.
        """
        return {'replay-dyn': 'http://localhost:%s/_/resource/postreq?param.coll={coll}' % port,
                'replay-fixed': 'http://localhost:%s/{coll}/resource/postreq' % port
               }

    def serve_static(self, environ, filepath=''):
        """Serve a static file through the StaticHandler."""
        return self.static_handler(NewWbRequest(environ, filepath, ''))

    def serve_coll_page(self, environ, coll):
        """Render the ``search.html`` template (``coll`` is not used here)."""
        view = BaseInsertView(self.jinja_env, 'search.html')

        wbrequest = NewWbRequest(environ, '', '/')
        return WbResponse.text_response(view.render_to_string(environ, wbrequest=wbrequest),
                                        content_type='text/html; charset="utf-8"')

    def serve_listing(self, environ):
        """Return a JSON listing of the fixed and dynamic routes."""
        result = {'fixed': self.webagg.list_fixed_routes(),
                  'dynamic': self.webagg.list_dynamic_routes()
                 }

        return WbResponse.json_response(result)

    def serve_content(self, environ, coll='', url=''):
        """Replay content for ``coll``, choosing the fixed or dynamic upstream.

        ``url`` is accepted because the route captures it, but the wb url is
        taken from ``environ`` after the leading path segment is popped.
        """
        pop_path_info(environ)
        wb_url = self.get_wburl(environ)

        kwargs = {'coll': coll}

        # Collections named in the fixed routes use their own upstream path;
        # anything else goes through the dynamic ('/_') path.
        if coll in self.webagg.list_fixed_routes():
            kwargs['type'] = 'replay-fixed'
        else:
            kwargs['type'] = 'replay-dyn'

        try:
            response = self.render_content(wb_url, kwargs, environ)
        except UpstreamException as ue:
            response = self.handle_error(environ, ue)

        return response

    def _internal_error_response(self, exc):
        """Build a plain-text 500 response for an unexpected exception."""
        message = 'Internal Error'
        # Exception details are only exposed while debugging is enabled.
        if self.debug:
            message += ': ' + str(exc)

        return WbResponse.text_response(message,
                                        '500 Internal Server Error',
                                        'text/plain; charset=utf-8')

    def __call__(self, environ, start_response):
        """WSGI entry point: match the url, run the endpoint, send its response.

        Routing failures are answered with the werkzeug HTTPException itself.
        Any other exception is answered with a 500 response, so the server
        always receives a response rather than ``None``.
        """
        urls = self.url_map.bind_to_environ(environ)
        try:
            endpoint, args = urls.match()
        except HTTPException as e:
            return e(environ, start_response)

        try:
            response = endpoint(environ, **args)
            return response(environ, start_response)
        except Exception as e:
            if self.debug:
                traceback.print_exc()

            return self._internal_error_response(e)(environ, start_response)

    @classmethod
    def create_app(cls, port):
        """Create an app instance and return a GeventServer serving it.

        :param port: port for the front-end server, bound on 0.0.0.0
        """
        # cls() rather than a hard-coded class, so subclasses get their own type.
        app = cls()
        app_server = GeventServer(app, port=port, hostname='0.0.0.0')
        return app_server
# ============================================================================
# Script entry point: serve FrontEndApp on port 8080 and wait on the server via join().
if __name__ == "__main__":
app_server = FrontEndApp.create_app(port=8080)
app_server.join()

View File

@ -0,0 +1,36 @@
from gevent.wsgi import WSGIServer
from gevent import spawn
import logging
# ============================================================================
class GeventServer(object):
    """Run a WSGI app on a gevent WSGIServer inside a spawned greenlet."""

    def __init__(self, app, port=0, hostname='localhost', handler_class=None):
        """Create the server for ``app`` and start serving immediately.

        :param app: WSGI application handed to WSGIServer
        :param port: requested port; ``self.port`` is later replaced by the
            value read back from the server address
        :param hostname: address to bind
        :param handler_class: passed through to WSGIServer
        """
        self.port = port
        self.make_server(app, port, hostname, handler_class)

    def stop(self):
        """Stop the underlying server, if there is one."""
        if self.server:
            logging.debug('stopping server on ' + str(self.port))
            self.server.stop()

    def _run(self, server, port):
        """Greenlet body: serve until stopped, reporting any failure."""
        # Imported here because traceback is not among this module's imports;
        # without it the except branch itself failed with a NameError.
        import traceback

        logging.debug('starting server on ' + str(port))
        try:
            server.serve_forever()
        except Exception as e:
            logging.debug('server failed to start on ' + str(port))
            traceback.print_exc()

    def make_server(self, app, port, hostname, handler_class):
        """Build the WSGIServer, record its port and spawn the serving greenlet."""
        server = WSGIServer((hostname, port), app, handler_class=handler_class)
        server.init_socket()
        # Read the port back from the server once its socket is initialized.
        self.port = server.address[1]
        self.server = server
        self.ge = spawn(self._run, server, self.port)

    def join(self):
        """Block until the serving greenlet finishes."""
        self.ge.join()

View File

@ -41,7 +41,7 @@ class RewriterApp(object):
self.loader = ArcWarcRecordLoader()
config = config or {}
self.paths = config['url_templates']
self.paths = {}
self.framed_replay = framed_replay
self.frame_mod = ''
@ -395,13 +395,14 @@ class RewriterApp(object):
def get_base_url(self, wb_url, kwargs):
type = kwargs.get('type')
return self.paths[type]
return self.paths[type].format(**kwargs)
def get_upstream_url(self, wb_url, kwargs, params):
base_url = self.get_base_url(wb_url, kwargs)
param_str = urlencode(params, True)
if param_str:
base_url += '&' + param_str
q_char = '&' if '?' in base_url else '?'
base_url += q_char + param_str
return base_url
def get_cookie_key(self, kwargs):

View File

@ -1,74 +0,0 @@
from gevent.monkey import patch_all; patch_all()
from bottle import run, Bottle, request, response, debug
from six.moves.urllib.parse import quote
from pywb.utils.loaders import LocalFileLoader
import mimetypes
import redis
from pywb.urlrewrite.rewriterapp import RewriterApp
from pywb.urlrewrite.cookies import CookieTracker
# ============================================================================
# Bottle-based application wrapping RewriterApp: serves static files and mounts
# the live / record / replay handlers.
class RWApp(RewriterApp):
# upstream_urls is stored under config['url_templates'] and handed to the base
# class; cookie_key_templ is formatted by get_cookie_key(); redis is passed to
# CookieTracker.
def __init__(self, upstream_urls, cookie_key_templ, redis):
config = {}
config['url_templates'] = upstream_urls
self.cookie_key_templ = cookie_key_templ
self.app = Bottle()
self.block_loader = LocalFileLoader()
# NOTE(review): routes are registered before the base-class initializer runs;
# the statement order is kept exactly as written.
self.init_routes()
super(RWApp, self).__init__(True, config=config)
self.cookie_tracker = CookieTracker(redis)
# Keep Bottle's original error handler so err_handler() can delegate to it.
self.orig_error_handler = self.app.default_error_handler
self.app.default_error_handler = self.err_handler
# Print the exception and its traceback, then defer to the saved Bottle handler.
def err_handler(self, exc):
print(exc)
import traceback
traceback.print_exc()
return self.orig_error_handler(exc)
# Build the cookie key by formatting the template with the request kwargs.
def get_cookie_key(self, kwargs):
return self.cookie_key_templ.format(**kwargs)
# Register the static-file route and mount the three typed handlers.
def init_routes(self):
@self.app.get('/static/__pywb/<filepath:path>')
def server_static(filepath):
data = self.block_loader.load('pywb/static/' + filepath)
# Set Content-Type only when mimetypes can guess one from the file name.
guessed = mimetypes.guess_type(filepath)
if guessed[0]:
response.headers['Content-Type'] = guessed[0]
return data
self.app.mount('/live/', self.call_with_params(type='live'))
self.app.mount('/record/', self.call_with_params(type='record'))
self.app.mount('/replay/', self.call_with_params(type='replay'))
# Build an RWApp with localhost upstream urls and a redis client on database 2.
@staticmethod
def create_app(replay_port=8080, record_port=8010):
# NOTE(review): 'record' uses record_port but the same /live/ path as 'live' -- confirm intended.
upstream_urls = {'live': 'http://localhost:%s/live/resource/postreq?' % replay_port,
'record': 'http://localhost:%s/live/resource/postreq?' % record_port,
'replay': 'http://localhost:%s/replay/resource/postreq?' % replay_port,
}
r = redis.StrictRedis.from_url('redis://localhost/2')
rwapp = RWApp(upstream_urls, 'cookies:', r)
return rwapp
# ============================================================================
# Script entry point: build the app with its default ports and run Bottle on port 8090 with the gevent server.
if __name__ == "__main__":
application = RWApp.create_app()
application.app.run(port=8090, server='gevent')

View File

@ -1,24 +1,26 @@
from gevent import monkey; monkey.patch_all(thread=False)
from pywb.webagg.test.testutils import LiveServerTests, BaseTestClass
from pywb.webagg.test.testutils import FakeRedisTests
from .simpleapp import RWApp, debug
from pywb.urlrewrite.frontendapp import FrontEndApp
import os
import webtest
class TestRewriter(LiveServerTests, FakeRedisTests, BaseTestClass):
LIVE_CONFIG = {'collections': {'live': '$live'}}
class TestRewriter(FakeRedisTests, BaseTestClass):
@classmethod
def setup_class(cls):
super(TestRewriter, cls).setup_class()
#cls.upstream_url = 'http://localhost:{0}'.format(cls.server.port)
#cls.upstream_url += '/{type}/resource/postreq?url={url}&closest={closest}'
#cls.app = RWApp(cls.upstream_url)
cls.app = RWApp.create_app(replay_port=cls.server.port)
cls.testapp = webtest.TestApp(cls.app.app)
debug(True)
#cls.app = RWApp.create_app(replay_port=cls.server.port)
#cls.testapp = webtest.TestApp(cls.app.app)
cls.testapp = webtest.TestApp(FrontEndApp(custom_config=LIVE_CONFIG,
config_file=None))
def test_replay(self):
resp = self.testapp.get('/live/mp_/http://example.com/')
@ -36,8 +38,8 @@ class TestRewriter(LiveServerTests, FakeRedisTests, BaseTestClass):
assert 'wbinfo.capture_url = "http://example.com/"' in resp.text
def test_cookie_track_1(self):
resp = self.testapp.get('/live/mp_/https://twitter.com/')
#def test_cookie_track_1(self):
# resp = self.testapp.get('/live/mp_/https://twitter.com/')
assert resp.headers['set-cookie'] != None
# assert resp.headers['set-cookie'] != None

View File

@ -31,34 +31,37 @@ SOURCE_LIST = [LiveIndexSource,
# ============================================================================
class AutoConfigApp(ResAggApp):
def __init__(self, config_file='./config.yaml'):
AUTO_DIR_INDEX_PATH = '{coll}/indexes/'
AUTO_DIR_ARCHIVE_PATH = '{coll}/archive/'
def __init__(self, config_file='./config.yaml', custom_config=None):
config = load_yaml_config(DEFAULT_CONFIG)
try:
new_config = load_config('PYWB_CONFIG_FILE', config_file)
except Exception as e:
new_config = {}
print(e)
if config_file:
try:
custom_config = load_config('PYWB_CONFIG_FILE', config_file)
except Exception as e:
if not custom_config:
custom_config = {'debug': True}
print(e)
if new_config:
config.update(new_config)
if custom_config:
config.update(custom_config)
super(AutoConfigApp, self).__init__(debug=config.get('debug', False))
self.config = config
def init(self):
if self.config.get('enable_auto_colls', True):
auto_handler = self.load_auto_colls()
self.add_route('/_', auto_handler)
routes = self.load_colls()
for name, route in iteritems(routes):
self.fixed_routes = self.load_colls()
for name, route in iteritems(self.fixed_routes):
self.add_route('/' + name, route)
self._add_simple_route('/<coll>-cdx', self.cdx_compat)
return self
def _lookup(self, environ, path):
urls = self.url_map.bind(environ['HTTP_HOST'], path_info=path)
@ -82,21 +85,37 @@ class AutoConfigApp(ResAggApp):
return result
def load_auto_colls(self):
root_dir = self.config.get('collections_root', '')
if not root_dir:
self.root_dir = self.config.get('collections_root', '')
if not self.root_dir:
print('No Root Dir, Skip Auto Colls!')
return
indexes_templ = os.path.join('{coll}', 'indexes') + os.path.sep
dir_source = CacheDirectoryIndexSource(root_dir, indexes_templ)
#indexes_templ = os.path.join('{coll}', 'indexes') + os.path.sep
indexes_templ = self.AUTO_DIR_INDEX_PATH.replace('/', os.path.sep)
dir_source = CacheDirectoryIndexSource(self.root_dir, indexes_templ)
archive_templ = self.config.get('archive_paths')
if not archive_templ:
archive_templ = os.path.join('.', root_dir, '{coll}', 'archive') + os.path.sep
archive_templ = self.AUTO_DIR_ARCHIVE_PATH.replace('/', os.path.sep)
archive_templ = os.path.join(self.root_dir, archive_templ)
#archive_templ = os.path.join('.', root_dir, '{coll}', 'archive') + os.path.sep
handler = DefaultResourceHandler(dir_source, archive_templ)
return handler
def list_fixed_routes(self):
    """Return the names of the fixed routes as a new list."""
    return list(self.fixed_routes)
def list_dynamic_routes(self):
    """Return the sorted entry names found in the collections root dir.

    Returns an empty list when no root dir is configured, when ``root_dir``
    was never set on this instance, or when the directory cannot be listed.
    """
    # root_dir is only assigned by load_auto_colls(), which may not have run.
    root_dir = getattr(self, 'root_dir', None)
    if not root_dir:
        return []

    try:
        # os.listdir order is arbitrary; sort for a stable listing.
        return sorted(os.listdir(root_dir))
    except (IOError, OSError):
        # os.listdir raises OSError, which is distinct from IOError on Python 2.
        return []
def load_colls(self):
routes = {}

View File

@ -17,6 +17,10 @@ class TestAutoConfigApp(TempDirTests, BaseTestClass):
os.mkdir('./local')
os.mkdir('./local/indexes')
os.mkdir('collections')
os.mkdir('collections/auto1')
os.mkdir('collections/auto2')
with open(os.path.join('local', 'indexes', 'file.cdxj'), 'a') as fh:
fh.write('foo')
@ -28,8 +32,6 @@ class TestAutoConfigApp(TempDirTests, BaseTestClass):
cls.loader = AutoConfigApp(os.path.join(cls.get_curr_dir(), 'test_autoapp.yaml'))
cls.colls = cls.loader.load_colls()
@classmethod
def teardown_class(cls):
os.chdir(cls.orig_cwd)
@ -41,11 +43,17 @@ class TestAutoConfigApp(TempDirTests, BaseTestClass):
def _get_sources(self, coll_name='', handler=None):
if not handler:
handler = self.colls.get(coll_name)
handler = self.loader.fixed_routes.get(coll_name)
assert isinstance(handler, ResourceHandler)
assert isinstance(handler.index_source, BaseSourceListAggregator)
return handler.index_source.sources
def test_list_static(self):
    """The loaded config is expected to define exactly 12 fixed routes."""
    fixed = self.loader.list_fixed_routes()
    assert len(fixed) == 12
def test_list_dynamic(self):
    """Both collection dirs created during setup are reported as dynamic routes."""
    # Directory listing order is not guaranteed, so compare sorted names.
    assert sorted(self.loader.list_dynamic_routes()) == ['auto1', 'auto2']
def test_remote_cdx(self):
sources = self._get_sources('ait')
assert isinstance(sources['ait'], RemoteIndexSource)
@ -90,7 +98,7 @@ class TestAutoConfigApp(TempDirTests, BaseTestClass):
assert isinstance(sources['local_file'], FileIndexSource)
def test_sequence(self):
seq = self.colls.get('many_seq')
seq = self.loader.fixed_routes.get('many_seq')
assert isinstance(seq, HandlerSeq)
assert len(seq.handlers) == 3

View File

@ -107,6 +107,7 @@ setup(
cdx-indexer = pywb.warc.cdxindexer:main
wb-manager = pywb.manager.manager:main_wrap_exc
webagg-server = pywb.apps.cli:webagg
new-wayback = pywb.apps.cli:new_wayback
""",
classifiers=[
'Development Status :: 4 - Beta',