1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-26 15:59:23 +01:00
Ilya Kreymer 31bf7a47f1 new-wayback cli script, using new FrontEndApp (rewriting) + AutoConfigApp (config-driven aggregator)
support for dynamic collections: check all .cdxj files in /<coll>/indexes/*.cdxj when accessing /<coll>
support for fixed routes: specified in config.yaml as per https://github.com/ikreymer/pywb/wiki/Distributed-Archive-Config
werkzeug routing in FrontEndApp: default query, replay, search pages working
route listing: /_coll_info.json for listing fixed + dynamic routes
autoindexing enabled, indexing WARCs added to archives directory to .cdxj index
Addresses 
2017-02-17 18:04:07 -08:00

191 lines
6.3 KiB
Python

from argparse import ArgumentParser
import logging
#=================================================================
def cdx_server(args=None): #pragma: no cover
CdxCli(args=args,
default_port=8080,
desc='pywb CDX Index Server').run()
#=================================================================
def live_rewrite_server(args=None): #pragma: no cover
LiveCli(args=args,
default_port=8090,
desc='pywb Live Rewrite Proxy Server').run()
#=================================================================
def wayback(args=None):
WaybackCli(args=args,
default_port=8080,
desc='pywb Wayback Web Archive Replay').run()
#=============================================================================
def webagg():
WebaggCli().run()
#=============================================================================
def new_wayback():
NewWaybackCli().run()
#=============================================================================
class BaseCli(object):
def __init__(self, args=None, default_port=8080, desc=''):
parser = ArgumentParser(description=desc)
parser.add_argument('-p', '--port', type=int, default=default_port)
parser.add_argument('-t', '--threads', type=int, default=4)
parser.add_argument('-s', '--server', default='gevent')
parser.add_argument('--debug', action='store_true')
self.desc = desc
self._extend_parser(parser)
self.r = parser.parse_args(args)
logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s',
level=logging.DEBUG if self.r.debug else logging.INFO)
if self.r.server == 'gevent':
try:
from gevent.monkey import patch_all; patch_all()
logging.debug('Using Gevent')
except:
logging.debug('No Gevent')
self.r.server = 'wsgiref'
from pywb.framework.wsgi_wrappers import init_app
self.init_app = init_app
self.application = self.load()
def _extend_parser(self, parser): #pragma: no cover
pass
def load(self): #pragma: no cover
pass
def run(self):
if self.r.server == 'gevent': #pragma: no cover
self.run_gevent()
elif self.r.server == 'waitress': #pragma: no cover
self.run_waitress()
else:
self.run_wsgiref()
def run_waitress(self): #pragma: no cover
from waitress import serve
logging.debug(str(self.desc))
serve(self.application, port=self.r.port, threads=self.r.threads)
def run_wsgiref(self): #pragma: no cover
from pywb.framework.wsgi_wrappers import start_wsgi_ref_server
start_wsgi_ref_server(self.application, self.desc, port=self.r.port)
def run_gevent(self):
from gevent.pywsgi import WSGIServer
logging.info('Starting Gevent Server on ' + str(self.r.port))
WSGIServer(('', self.r.port), self.application).serve_forever()
#=============================================================================
class LiveCli(BaseCli):
def _extend_parser(self, parser):
parser.add_argument('-x', '--proxy',
help='Specify host:port to use as HTTP/S proxy')
parser.add_argument('-f', '--framed', action='store_true',
help='Replay using framed wrapping mode')
def load(self):
config = dict(proxyhostport=self.r.proxy,
framed_replay='inverse' if self.r.framed else False,
enable_auto_colls=False,
collections={'live': '$liveweb'})
from pywb.webapp.pywb_init import create_wb_router
return self.init_app(create_wb_router, load_yaml=False, config=config)
#=============================================================================
class ReplayCli(BaseCli):
def _extend_parser(self, parser):
parser.add_argument('-a', '--autoindex', action='store_true')
parser.add_argument('--auto-interval', type=int, default=30)
help_dir='Specify root archive dir (default is current working directory)'
parser.add_argument('-d', '--directory', help=help_dir)
def load(self):
if self.r.directory: #pragma: no cover
os.chdir(self.r.directory)
def run(self):
if self.r.autoindex:
from pywb.manager.manager import CollectionsManager
import os
m = CollectionsManager('', must_exist=False)
if not os.path.isdir(m.colls_dir):
msg = 'No managed directory "{0}" for auto-indexing'
logging.error(msg.format(m.colls_dir))
import sys
sys.exit(2)
else:
msg = 'Auto-Indexing Enabled on "{0}", checking every {1} secs'
logging.info(msg.format(m.colls_dir, self.r.auto_interval))
m.autoindex(interval=self.r.auto_interval, do_loop=False)
super(ReplayCli, self).run()
#=============================================================================
class CdxCli(ReplayCli): #pragma: no cover
def load(self):
from pywb.webapp.pywb_init import create_cdx_server_app
super(CdxCli, self).load()
return self.init_app(create_cdx_server_app,
load_yaml=True)
#=============================================================================
class WaybackCli(ReplayCli):
def load(self):
from pywb.webapp.pywb_init import create_wb_router
super(WaybackCli, self).load()
return self.init_app(create_wb_router,
load_yaml=True)
#=============================================================================
class WebaggCli(BaseCli):
def load(self):
from pywb.apps.webagg import application
return application
def run(self):
self.run_gevent()
#=============================================================================
class NewWaybackCli(ReplayCli):
def load(self):
from pywb.apps.newwayback import application
return application
def run(self):
self.r.server = 'gevent'
super(NewWaybackCli, self).run()
#self.run_gevent()
#=============================================================================
if __name__ == "__main__":
wayback()