mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
Recorder App Support (#241)
recording support: now available for dynamic collections via config
- config.yaml 'recorder: live' entry enables the /record/ subpath, which records to any dynamic collection (can record from any collection, though usually live)
- autoindex refactor: simplified, standalone AutoIndexer() -- indexes any changed warc files to autoindex.cdxj
- windows autoindex support: also check for changed file size, as last modified time may not change
- manager: remove autoindex, now part of main cli
- tests: updated test_auto_colls with autoindex changes
- tests: add record/replay tests for recording and replay
This commit is contained in:
parent
a05916617d
commit
93921aadb7
@ -88,19 +88,19 @@ class ReplayCli(BaseCli):
|
|||||||
|
|
||||||
def run(self):
|
def run(self):
|
||||||
if self.r.autoindex:
|
if self.r.autoindex:
|
||||||
from pywb.manager.manager import CollectionsManager
|
from pywb.manager.autoindex import AutoIndexer
|
||||||
import os
|
import os
|
||||||
|
|
||||||
m = CollectionsManager('', must_exist=False)
|
indexer = AutoIndexer(interval=self.r.auto_interval)
|
||||||
if not os.path.isdir(m.colls_dir):
|
if not os.path.isdir(indexer.root_path):
|
||||||
msg = 'No managed directory "{0}" for auto-indexing'
|
msg = 'No managed directory "{0}" for auto-indexing'
|
||||||
logging.error(msg.format(m.colls_dir))
|
logging.error(msg.format(indexer.root_path))
|
||||||
import sys
|
import sys
|
||||||
sys.exit(2)
|
sys.exit(2)
|
||||||
else:
|
|
||||||
msg = 'Auto-Indexing Enabled on "{0}", checking every {1} secs'
|
msg = 'Auto-Indexing Enabled on "{0}", checking every {1} secs'
|
||||||
logging.info(msg.format(m.colls_dir, self.r.auto_interval))
|
logging.info(msg.format(indexer.root_path, self.r.auto_interval))
|
||||||
m.autoindex(interval=self.r.auto_interval, do_loop=False)
|
indexer.start()
|
||||||
|
|
||||||
super(ReplayCli, self).run()
|
super(ReplayCli, self).run()
|
||||||
|
|
||||||
|
@ -9,6 +9,9 @@ from six import iteritems
|
|||||||
|
|
||||||
from warcio.utils import to_native_str
|
from warcio.utils import to_native_str
|
||||||
|
|
||||||
|
from pywb.recorder.multifilewarcwriter import MultiFileWARCWriter
|
||||||
|
from pywb.recorder.recorderapp import RecorderApp
|
||||||
|
|
||||||
from pywb.utils.loaders import load_yaml_config
|
from pywb.utils.loaders import load_yaml_config
|
||||||
from pywb.utils.geventserver import GeventServer
|
from pywb.utils.geventserver import GeventServer
|
||||||
|
|
||||||
@ -32,10 +35,14 @@ class FrontEndApp(object):
|
|||||||
self.warcserver = WarcServer(config_file=config_file,
|
self.warcserver = WarcServer(config_file=config_file,
|
||||||
custom_config=custom_config)
|
custom_config=custom_config)
|
||||||
|
|
||||||
framed_replay = self.warcserver.config.get('framed_replay', True)
|
config = self.warcserver.config
|
||||||
|
|
||||||
|
framed_replay = config.get('framed_replay', True)
|
||||||
|
|
||||||
self.warcserver_server = GeventServer(self.warcserver, port=0)
|
self.warcserver_server = GeventServer(self.warcserver, port=0)
|
||||||
|
|
||||||
|
self.init_recorder(config)
|
||||||
|
|
||||||
self.static_handler = StaticHandler('pywb/static/')
|
self.static_handler = StaticHandler('pywb/static/')
|
||||||
|
|
||||||
self.url_map = Map()
|
self.url_map = Map()
|
||||||
@ -44,36 +51,59 @@ class FrontEndApp(object):
|
|||||||
self.url_map.add(Rule('/collinfo.json', endpoint=self.serve_listing))
|
self.url_map.add(Rule('/collinfo.json', endpoint=self.serve_listing))
|
||||||
|
|
||||||
if self.is_valid_coll('$root'):
|
if self.is_valid_coll('$root'):
|
||||||
self.url_map.add(Rule('/', endpoint=self.serve_coll_page))
|
coll_prefix = ''
|
||||||
self.url_map.add(Rule('/timemap/<timemap_output>/<path:url>', endpoint=self.serve_content))
|
|
||||||
self.url_map.add(Rule('/cdx', endpoint=self.serve_cdx))
|
|
||||||
self.url_map.add(Rule('/<path:url>', endpoint=self.serve_content))
|
|
||||||
|
|
||||||
else:
|
else:
|
||||||
self.url_map.add(Rule('/<coll>/', endpoint=self.serve_coll_page))
|
coll_prefix = '/<coll>'
|
||||||
self.url_map.add(Rule('/<coll>/timemap/<timemap_output>/<path:url>', endpoint=self.serve_content))
|
|
||||||
self.url_map.add(Rule('/<coll>/cdx', endpoint=self.serve_cdx))
|
|
||||||
self.url_map.add(Rule('/<coll>/<path:url>', endpoint=self.serve_content))
|
|
||||||
|
|
||||||
self.url_map.add(Rule('/', endpoint=self.serve_home))
|
self.url_map.add(Rule('/', endpoint=self.serve_home))
|
||||||
|
|
||||||
|
self.url_map.add(Rule(coll_prefix + '/', endpoint=self.serve_coll_page))
|
||||||
|
self.url_map.add(Rule(coll_prefix + '/timemap/<timemap_output>/<path:url>', endpoint=self.serve_content))
|
||||||
|
self.url_map.add(Rule(coll_prefix + '/cdx', endpoint=self.serve_cdx))
|
||||||
|
if self.recorder:
|
||||||
|
self.url_map.add(Rule(coll_prefix + '/record/<path:url>', endpoint=self.serve_record))
|
||||||
|
|
||||||
|
self.url_map.add(Rule(coll_prefix + '/<path:url>', endpoint=self.serve_content))
|
||||||
|
|
||||||
upstream_paths = self.get_upstream_paths(self.warcserver_server.port)
|
upstream_paths = self.get_upstream_paths(self.warcserver_server.port)
|
||||||
|
|
||||||
self.rewriterapp = RewriterApp(framed_replay,
|
self.rewriterapp = RewriterApp(framed_replay,
|
||||||
config=self.warcserver.config,
|
config=config,
|
||||||
paths=upstream_paths)
|
paths=upstream_paths)
|
||||||
|
|
||||||
self.templates_dir = self.warcserver.config.get('templates_dir', 'templates')
|
self.templates_dir = config.get('templates_dir', 'templates')
|
||||||
self.static_dir = self.warcserver.config.get('static_dir', 'static')
|
self.static_dir = config.get('static_dir', 'static')
|
||||||
|
|
||||||
metadata_templ = os.path.join(self.warcserver.root_dir, '{coll}', 'metadata.yaml')
|
metadata_templ = os.path.join(self.warcserver.root_dir, '{coll}', 'metadata.yaml')
|
||||||
self.metadata_cache = MetadataCache(metadata_templ)
|
self.metadata_cache = MetadataCache(metadata_templ)
|
||||||
|
|
||||||
def get_upstream_paths(self, port):
|
def get_upstream_paths(self, port):
|
||||||
return {
|
base_paths = {
|
||||||
'replay': 'http://localhost:%s/{coll}/resource/postreq' % port,
|
'replay': 'http://localhost:%s/{coll}/resource/postreq' % port,
|
||||||
'cdx-server': 'http://localhost:%s/{coll}/index' % port,
|
'cdx-server': 'http://localhost:%s/{coll}/index' % port,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if self.recorder:
|
||||||
|
base_paths['record'] = 'http://localhost:%s/%s/resource/postreq?param.recorder.coll={coll}' % (self.recorder_port, self.recorder_source)
|
||||||
|
|
||||||
|
return base_paths
|
||||||
|
|
||||||
|
def init_recorder(self, config):
|
||||||
|
self.recorder_source = config.get('recorder')
|
||||||
|
|
||||||
|
if not self.recorder_source:
|
||||||
|
self.recorder = None
|
||||||
|
self.recorder_server = None
|
||||||
|
self.recorder_port = 0
|
||||||
|
return
|
||||||
|
|
||||||
|
dedup_index = None
|
||||||
|
warc_writer = MultiFileWARCWriter(self.warcserver.archive_templ, max_size=1000000000, max_idle_secs=600,
|
||||||
|
dedup_index=dedup_index)
|
||||||
|
|
||||||
|
self.recorder = RecorderApp('http://localhost:' + str(self.warcserver_server.port), warc_writer)
|
||||||
|
self.recorder_server = GeventServer(self.recorder, port=0)
|
||||||
|
self.recorder_port = self.recorder_server.port
|
||||||
|
|
||||||
def serve_home(self, environ):
|
def serve_home(self, environ):
|
||||||
home_view = BaseInsertView(self.rewriterapp.jinja_env, 'index.html')
|
home_view = BaseInsertView(self.rewriterapp.jinja_env, 'index.html')
|
||||||
fixed_routes = self.warcserver.list_fixed_routes()
|
fixed_routes = self.warcserver.list_fixed_routes()
|
||||||
@ -150,13 +180,19 @@ class FrontEndApp(object):
|
|||||||
return WbResponse.bin_stream(res.raw, content_type=res.headers.get('Content-Type'))
|
return WbResponse.bin_stream(res.raw, content_type=res.headers.get('Content-Type'))
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return WbResponse.text_content('Error: ' + str(e), status='400 Bad Request')
|
return WbResponse.text_response('Error: ' + str(e), status='400 Bad Request')
|
||||||
|
|
||||||
def serve_content(self, environ, coll='$root', url='', timemap_output=''):
|
def serve_record(self, environ, coll='$root', url=''):
|
||||||
|
if coll in self.warcserver.list_fixed_routes():
|
||||||
|
return WbResponse.text_response('Error: Can Not Record Into Custom Collection "{0}"'.format(coll))
|
||||||
|
|
||||||
|
return self.serve_content(environ, coll, url, record=True)
|
||||||
|
|
||||||
|
def serve_content(self, environ, coll='$root', url='', timemap_output='', record=False):
|
||||||
if not self.is_valid_coll(coll):
|
if not self.is_valid_coll(coll):
|
||||||
self.raise_not_found(environ, 'No handler for "/{0}"'.format(coll))
|
self.raise_not_found(environ, 'No handler for "/{0}"'.format(coll))
|
||||||
|
|
||||||
self.setup_paths(environ, coll)
|
self.setup_paths(environ, coll, record)
|
||||||
|
|
||||||
wb_url_str = to_native_str(url)
|
wb_url_str = to_native_str(url)
|
||||||
|
|
||||||
@ -164,6 +200,10 @@ class FrontEndApp(object):
|
|||||||
wb_url_str += '?' + environ.get('QUERY_STRING')
|
wb_url_str += '?' + environ.get('QUERY_STRING')
|
||||||
|
|
||||||
metadata = self.get_metadata(coll)
|
metadata = self.get_metadata(coll)
|
||||||
|
if record:
|
||||||
|
metadata['type'] = 'record'
|
||||||
|
print('RECORD')
|
||||||
|
|
||||||
if timemap_output:
|
if timemap_output:
|
||||||
metadata['output'] = timemap_output
|
metadata['output'] = timemap_output
|
||||||
|
|
||||||
@ -175,12 +215,14 @@ class FrontEndApp(object):
|
|||||||
|
|
||||||
return response
|
return response
|
||||||
|
|
||||||
def setup_paths(self, environ, coll):
|
def setup_paths(self, environ, coll, record=False):
|
||||||
if not coll or not self.warcserver.root_dir:
|
if not coll or not self.warcserver.root_dir:
|
||||||
return
|
return
|
||||||
|
|
||||||
if coll != '$root':
|
if coll != '$root':
|
||||||
pop_path_info(environ)
|
pop_path_info(environ)
|
||||||
|
if record:
|
||||||
|
pop_path_info(environ)
|
||||||
|
|
||||||
paths = [self.warcserver.root_dir]
|
paths = [self.warcserver.root_dir]
|
||||||
|
|
||||||
|
@ -2,59 +2,106 @@ import gevent
|
|||||||
import time
|
import time
|
||||||
import re
|
import re
|
||||||
import os
|
import os
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from pywb.manager.manager import CollectionsManager
|
||||||
|
|
||||||
|
|
||||||
#=============================================================================
|
#=============================================================================
|
||||||
EXT_RX = re.compile('.*\.w?arc(\.gz)?$')
|
class AutoIndexer(object):
|
||||||
|
EXT_RX = re.compile('.*\.w?arc(\.gz)?$')
|
||||||
|
AUTO_INDEX_FILE = 'autoindex.cdxj'
|
||||||
|
|
||||||
keep_running = True
|
def __init__(self, interval=30, keep_running=True):
|
||||||
|
self.manager = CollectionsManager('', must_exist=False)
|
||||||
|
|
||||||
|
self.root_path = self.manager.colls_dir
|
||||||
|
|
||||||
#=============================================================================
|
self.keep_running = keep_running
|
||||||
class CDXAutoIndexer(object):
|
|
||||||
def __init__(self, updater, path):
|
|
||||||
self.updater = updater
|
|
||||||
self.root_path = path
|
|
||||||
|
|
||||||
self.mtimes = {}
|
self.interval = interval
|
||||||
|
|
||||||
def has_changed(self, *paths):
|
self.last_size = {}
|
||||||
full_path = os.path.join(*paths)
|
|
||||||
|
def is_newer_than(self, path1, path2, track=False):
|
||||||
try:
|
try:
|
||||||
mtime = os.path.getmtime(full_path)
|
mtime1 = os.path.getmtime(path1)
|
||||||
|
mtime2 = os.path.getmtime(path2)
|
||||||
|
newer = mtime1 > mtime2
|
||||||
except:
|
except:
|
||||||
return False
|
newer = True
|
||||||
|
|
||||||
if mtime == self.mtimes.get(full_path):
|
if track:
|
||||||
return False
|
size = os.path.getsize(path1)
|
||||||
|
try:
|
||||||
|
if size != self.last_size[path1]:
|
||||||
|
newer = True
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
self.mtimes[full_path] = mtime
|
self.last_size[path1] = size
|
||||||
return full_path
|
|
||||||
|
return newer
|
||||||
|
|
||||||
|
def do_index(self, files):
|
||||||
|
logging.info('Auto-Indexing... ' + str(files))
|
||||||
|
self.manager.index_merge(files, self.AUTO_INDEX_FILE)
|
||||||
|
logging.info('...Done')
|
||||||
|
|
||||||
def check_path(self):
|
def check_path(self):
|
||||||
for dirName, subdirList, fileList in os.walk(self.root_path):
|
for coll in os.listdir(self.root_path):
|
||||||
if not subdirList and not self.has_changed(dirName):
|
coll_dir = os.path.join(self.root_path, coll)
|
||||||
return False
|
if not os.path.isdir(coll_dir):
|
||||||
|
continue
|
||||||
|
|
||||||
for filename in fileList:
|
self.manager.change_collection(coll)
|
||||||
if not EXT_RX.match(filename):
|
|
||||||
|
archive_dir = self.manager.archive_dir
|
||||||
|
|
||||||
|
if not os.path.isdir(archive_dir):
|
||||||
|
continue
|
||||||
|
|
||||||
|
index_file = os.path.join(self.manager.indexes_dir, self.AUTO_INDEX_FILE)
|
||||||
|
|
||||||
|
if os.path.isfile(index_file):
|
||||||
|
if os.name != 'nt' and self.is_newer_than(archive_dir, index_file):
|
||||||
continue
|
continue
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
os.makedirs(self.manager.indexes_dir)
|
||||||
|
except Exception as e:
|
||||||
|
pass
|
||||||
|
|
||||||
path = self.has_changed(self.root_path, dirName, filename)
|
logging.info('Collection Possibly Changed: ' + coll)
|
||||||
if not path:
|
to_index = []
|
||||||
continue
|
for dirpath, dirnames, filenames in os.walk(archive_dir):
|
||||||
|
for filename in filenames:
|
||||||
|
if not self.EXT_RX.match(filename):
|
||||||
|
continue
|
||||||
|
|
||||||
self.updater(os.path.join(dirName, filename))
|
full_filename = os.path.join(dirpath, filename)
|
||||||
|
|
||||||
def do_loop(self, interval):
|
if self.is_newer_than(full_filename, index_file, True):
|
||||||
|
to_index.append(full_filename)
|
||||||
|
|
||||||
|
if to_index:
|
||||||
|
self.do_index(to_index)
|
||||||
|
|
||||||
|
def run(self):
|
||||||
try:
|
try:
|
||||||
while keep_running:
|
while self.keep_running:
|
||||||
self.check_path()
|
self.check_path()
|
||||||
time.sleep(interval)
|
if not self.interval:
|
||||||
|
break
|
||||||
|
|
||||||
|
time.sleep(self.interval)
|
||||||
except KeyboardInterrupt: # pragma: no cover
|
except KeyboardInterrupt: # pragma: no cover
|
||||||
return
|
return
|
||||||
|
|
||||||
def start(self, interval):
|
def start(self):
|
||||||
self.ge = gevent.spawn(self.do_loop, interval)
|
self.ge = gevent.spawn(self.run)
|
||||||
|
|
||||||
|
def stop(self):
|
||||||
|
self.interval = 0
|
||||||
|
self.keep_running = False
|
||||||
|
|
||||||
|
@ -33,7 +33,6 @@ It may be used via cmdline to setup and maintain the
|
|||||||
directory structure expected by pywb
|
directory structure expected by pywb
|
||||||
"""
|
"""
|
||||||
DEF_INDEX_FILE = 'index.cdxj'
|
DEF_INDEX_FILE = 'index.cdxj'
|
||||||
AUTO_INDEX_FILE = 'autoindex.cdxj'
|
|
||||||
|
|
||||||
COLL_RX = re.compile('^[\w][-\w]*$')
|
COLL_RX = re.compile('^[\w][-\w]*$')
|
||||||
|
|
||||||
@ -48,12 +47,12 @@ directory structure expected by pywb
|
|||||||
|
|
||||||
self.colls_dir = os.path.join(os.getcwd(), colls_dir)
|
self.colls_dir = os.path.join(os.getcwd(), colls_dir)
|
||||||
|
|
||||||
self._set_coll_dirs(coll_name)
|
self.change_collection(coll_name)
|
||||||
|
|
||||||
if must_exist:
|
if must_exist:
|
||||||
self._assert_coll_exists()
|
self._assert_coll_exists()
|
||||||
|
|
||||||
def _set_coll_dirs(self, coll_name):
|
def change_collection(self, coll_name):
|
||||||
self.coll_name = coll_name
|
self.coll_name = coll_name
|
||||||
self.curr_coll_dir = os.path.join(self.colls_dir, coll_name)
|
self.curr_coll_dir = os.path.join(self.colls_dir, coll_name)
|
||||||
|
|
||||||
@ -330,35 +329,6 @@ directory structure expected by pywb
|
|||||||
|
|
||||||
migrate.convert_to_cdxj()
|
migrate.convert_to_cdxj()
|
||||||
|
|
||||||
def autoindex(self, interval=30.0, do_loop=True):
|
|
||||||
from pywb.manager.autoindex import CDXAutoIndexer
|
|
||||||
|
|
||||||
if self.coll_name:
|
|
||||||
any_coll = False
|
|
||||||
path = self.archive_dir
|
|
||||||
else:
|
|
||||||
path = self.colls_dir
|
|
||||||
any_coll = True
|
|
||||||
|
|
||||||
def do_index(warc):
|
|
||||||
if any_coll:
|
|
||||||
coll_name = warc.split(self.colls_dir + os.path.sep)
|
|
||||||
coll_name = coll_name[-1].split(os.path.sep)[0]
|
|
||||||
|
|
||||||
if coll_name != self.coll_name:
|
|
||||||
self._set_coll_dirs(coll_name)
|
|
||||||
|
|
||||||
print('Auto-Indexing: ' + warc)
|
|
||||||
self.index_merge([warc], self.AUTO_INDEX_FILE)
|
|
||||||
print('Done.. Waiting for file updates')
|
|
||||||
|
|
||||||
|
|
||||||
indexer = CDXAutoIndexer(do_index, path)
|
|
||||||
indexer.start(interval)
|
|
||||||
#indexer.start_watch()
|
|
||||||
if do_loop:
|
|
||||||
indexer.do_loop(interval)
|
|
||||||
|
|
||||||
|
|
||||||
#=============================================================================
|
#=============================================================================
|
||||||
def main(args=None):
|
def main(args=None):
|
||||||
@ -469,17 +439,7 @@ Create manage file based web archive collections
|
|||||||
migrate.add_argument('-f', '--force', action='store_true')
|
migrate.add_argument('-f', '--force', action='store_true')
|
||||||
migrate.set_defaults(func=do_migrate)
|
migrate.set_defaults(func=do_migrate)
|
||||||
|
|
||||||
# Auto Index
|
# Parse
|
||||||
def do_autoindex(r):
|
|
||||||
m = CollectionsManager(r.coll_name, must_exist=False)
|
|
||||||
m.autoindex(r.interval, True)
|
|
||||||
|
|
||||||
autoindex_help = 'Automatically index any change archive files'
|
|
||||||
autoindex = subparsers.add_parser('autoindex', help=autoindex_help)
|
|
||||||
autoindex.add_argument('coll_name', nargs='?', default='')
|
|
||||||
autoindex.add_argument('--interval', type=float, default=30.0)
|
|
||||||
autoindex.set_defaults(func=do_autoindex)
|
|
||||||
|
|
||||||
r = parser.parse_args(args=args)
|
r = parser.parse_args(args=args)
|
||||||
r.func(r)
|
r.func(r)
|
||||||
|
|
||||||
|
@ -58,7 +58,7 @@ class BaseLoader(object):
|
|||||||
cdx['recorder_skip'] = '1'
|
cdx['recorder_skip'] = '1'
|
||||||
|
|
||||||
out_headers['WebAgg-Cdx'] = to_native_str(cdx.to_cdxj().rstrip())
|
out_headers['WebAgg-Cdx'] = to_native_str(cdx.to_cdxj().rstrip())
|
||||||
out_headers['WebAgg-Source-Coll'] = source
|
out_headers['WebAgg-Source-Coll'] = to_native_str(source)
|
||||||
|
|
||||||
if not warc_headers:
|
if not warc_headers:
|
||||||
if other_headers:
|
if other_headers:
|
||||||
|
@ -85,8 +85,8 @@ class FakeRedisTests(object):
|
|||||||
# ============================================================================
|
# ============================================================================
|
||||||
class TempDirTests(object):
|
class TempDirTests(object):
|
||||||
@classmethod
|
@classmethod
|
||||||
def setup_class(cls):
|
def setup_class(cls, *args, **kwargs):
|
||||||
super(TempDirTests, cls).setup_class()
|
super(TempDirTests, cls).setup_class(*args, **kwargs)
|
||||||
cls.root_dir = tempfile.mkdtemp()
|
cls.root_dir = tempfile.mkdtemp()
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
@ -57,6 +57,9 @@ class WarcServer(BaseWarcServer):
|
|||||||
|
|
||||||
self.fixed_routes = self.load_colls()
|
self.fixed_routes = self.load_colls()
|
||||||
|
|
||||||
|
self.archive_templ = None
|
||||||
|
self.indexes_templ = None
|
||||||
|
|
||||||
for name, route in iteritems(self.fixed_routes):
|
for name, route in iteritems(self.fixed_routes):
|
||||||
self.add_route('/' + name, route)
|
self.add_route('/' + name, route)
|
||||||
|
|
||||||
@ -82,13 +85,13 @@ class WarcServer(BaseWarcServer):
|
|||||||
return
|
return
|
||||||
|
|
||||||
#indexes_templ = os.path.join('{coll}', 'indexes') + os.path.sep
|
#indexes_templ = os.path.join('{coll}', 'indexes') + os.path.sep
|
||||||
indexes_templ = self.AUTO_DIR_INDEX_PATH.replace('/', os.path.sep)
|
self.indexes_templ = self.AUTO_DIR_INDEX_PATH.replace('/', os.path.sep)
|
||||||
dir_source = CacheDirectoryIndexSource(self.root_dir, indexes_templ)
|
dir_source = CacheDirectoryIndexSource(self.root_dir, self.indexes_templ)
|
||||||
|
|
||||||
archive_templ = self.AUTO_DIR_ARCHIVE_PATH.replace('/', os.path.sep)
|
self.archive_templ = self.AUTO_DIR_ARCHIVE_PATH.replace('/', os.path.sep)
|
||||||
archive_templ = os.path.join(self.root_dir, archive_templ)
|
self.archive_templ = os.path.join(self.root_dir, self.archive_templ)
|
||||||
|
|
||||||
handler = DefaultResourceHandler(dir_source, archive_templ)
|
handler = DefaultResourceHandler(dir_source, self.archive_templ)
|
||||||
|
|
||||||
return handler
|
return handler
|
||||||
|
|
||||||
|
@ -3,7 +3,8 @@ from gevent import monkey; monkey.patch_all(thread=False)
|
|||||||
import pytest
|
import pytest
|
||||||
import webtest
|
import webtest
|
||||||
|
|
||||||
from pywb.warcserver.test.testutils import BaseTestClass
|
from pywb.warcserver.test.testutils import BaseTestClass, TempDirTests
|
||||||
|
from pywb.manager.manager import main, CollectionsManager
|
||||||
|
|
||||||
from pywb.apps.frontendapp import FrontEndApp
|
from pywb.apps.frontendapp import FrontEndApp
|
||||||
import os
|
import os
|
||||||
@ -24,16 +25,28 @@ class BaseConfigTest(BaseTestClass):
|
|||||||
@classmethod
|
@classmethod
|
||||||
def get_test_app(cls, config_file, override=None):
|
def get_test_app(cls, config_file, override=None):
|
||||||
config_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), config_file)
|
config_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), config_file)
|
||||||
return webtest.TestApp(FrontEndApp(config_file=config_file, custom_config=override))
|
app = FrontEndApp(config_file=config_file, custom_config=override)
|
||||||
|
return app, webtest.TestApp(app)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def setup_class(cls, config_file, include_non_frame=True):
|
def setup_class(cls, config_file, include_non_frame=True):
|
||||||
super(BaseConfigTest, cls).setup_class()
|
super(BaseConfigTest, cls).setup_class()
|
||||||
cls.testapp = cls.get_test_app(config_file)
|
cls.app, cls.testapp = cls.get_test_app(config_file)
|
||||||
|
|
||||||
if include_non_frame:
|
if include_non_frame:
|
||||||
cls.testapp_non_frame = cls.get_test_app(config_file,
|
cls.app_non_frame, cls.testapp_non_frame = cls.get_test_app(config_file,
|
||||||
override={'framed_replay': False})
|
override={'framed_replay': False})
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def teardown_class(cls):
|
||||||
|
if cls.app.recorder:
|
||||||
|
cls.app.recorder.writer.close()
|
||||||
|
|
||||||
|
if cls.app_non_frame.recorder:
|
||||||
|
cls.app_non_frame.recorder.writer.close()
|
||||||
|
|
||||||
|
super(BaseConfigTest, cls).teardown_class()
|
||||||
|
|
||||||
def _assert_basic_html(self, resp):
|
def _assert_basic_html(self, resp):
|
||||||
assert resp.status_int == 200
|
assert resp.status_int == 200
|
||||||
assert resp.content_type == 'text/html'
|
assert resp.content_type == 'text/html'
|
||||||
@ -61,3 +74,21 @@ class BaseConfigTest(BaseTestClass):
|
|||||||
return app.head(url.format(fmod), *args, **kwargs)
|
return app.head(url.format(fmod), *args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
#=============================================================================
|
||||||
|
class CollsDirMixin(TempDirTests):
|
||||||
|
COLLS_DIR = '_test_colls'
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def setup_class(cls, *args, **kwargs):
|
||||||
|
super(CollsDirMixin, cls).setup_class(*args, **kwargs)
|
||||||
|
cls.orig_cwd = os.getcwd()
|
||||||
|
cls.root_dir = os.path.realpath(cls.root_dir)
|
||||||
|
os.chdir(cls.root_dir)
|
||||||
|
cls.orig_collections = CollectionsManager.COLLS_DIR
|
||||||
|
CollectionsManager.COLLS_DIR = cls.COLLS_DIR
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def teardown_class(cls):
|
||||||
|
os.chdir(cls.orig_cwd)
|
||||||
|
CollectionsManager.COLLS_DIR = cls.orig_collections
|
||||||
|
super(CollsDirMixin, cls).teardown_class()
|
||||||
|
10
tests/config_test_record.yaml
Normal file
10
tests/config_test_record.yaml
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
debug: true
|
||||||
|
|
||||||
|
collections_root: _test_colls
|
||||||
|
|
||||||
|
recorder: live
|
||||||
|
|
||||||
|
collections:
|
||||||
|
'live': '$live'
|
||||||
|
|
||||||
|
|
@ -1,4 +1,4 @@
|
|||||||
from gevent.monkey import patch_all; patch_all()
|
from .base_config_test import CollsDirMixin
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import tempfile
|
import tempfile
|
||||||
@ -17,11 +17,10 @@ from pytest import raises
|
|||||||
from mock import patch
|
from mock import patch
|
||||||
|
|
||||||
from pywb import get_test_dir
|
from pywb import get_test_dir
|
||||||
from pywb.warcserver.test.testutils import TempDirTests, BaseTestClass
|
from pywb.warcserver.test.testutils import BaseTestClass
|
||||||
|
|
||||||
from pywb.manager.manager import main, CollectionsManager
|
from pywb.manager.autoindex import AutoIndexer
|
||||||
|
from pywb.manager.manager import main
|
||||||
import pywb.manager.autoindex
|
|
||||||
|
|
||||||
from pywb.indexer.cdxindexer import main as cdxindexer_main
|
from pywb.indexer.cdxindexer import main as cdxindexer_main
|
||||||
from pywb.warcserver.index.cdxobject import CDXObject
|
from pywb.warcserver.index.cdxobject import CDXObject
|
||||||
@ -34,26 +33,12 @@ ARCHIVE_DIR = 'archive'
|
|||||||
INDEX_DIR = 'indexes'
|
INDEX_DIR = 'indexes'
|
||||||
COLLECTIONS = '_test_colls'
|
COLLECTIONS = '_test_colls'
|
||||||
|
|
||||||
CollectionsManager.COLLS_DIR = COLLECTIONS
|
|
||||||
|
|
||||||
INDEX_FILE = 'index.cdxj'
|
INDEX_FILE = 'index.cdxj'
|
||||||
AUTOINDEX_FILE = 'autoindex.cdxj'
|
AUTOINDEX_FILE = 'autoindex.cdxj'
|
||||||
|
|
||||||
|
|
||||||
#=============================================================================
|
#=============================================================================
|
||||||
class TestManagedColls(TempDirTests, BaseTestClass):
|
class TestManagedColls(CollsDirMixin, BaseTestClass):
|
||||||
@classmethod
|
|
||||||
def setup_class(cls):
|
|
||||||
super(TestManagedColls, cls).setup_class()
|
|
||||||
cls.orig_cwd = os.getcwd()
|
|
||||||
cls.root_dir = os.path.realpath(cls.root_dir)
|
|
||||||
os.chdir(cls.root_dir)
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def teardown_class(cls):
|
|
||||||
os.chdir(cls.orig_cwd)
|
|
||||||
super(TestManagedColls, cls).teardown_class()
|
|
||||||
|
|
||||||
def _check_dirs(self, base, dirlist):
|
def _check_dirs(self, base, dirlist):
|
||||||
for dir_ in dirlist:
|
for dir_ in dirlist:
|
||||||
assert os.path.isdir(os.path.join(base, dir_))
|
assert os.path.isdir(os.path.join(base, dir_))
|
||||||
@ -82,8 +67,7 @@ class TestManagedColls(TempDirTests, BaseTestClass):
|
|||||||
colls = os.path.join(self.root_dir, COLLECTIONS)
|
colls = os.path.join(self.root_dir, COLLECTIONS)
|
||||||
os.mkdir(colls)
|
os.mkdir(colls)
|
||||||
|
|
||||||
pywb.manager.autoindex.keep_running = False
|
wayback(['-a', '-p', '0', '--auto-interval', '0'])
|
||||||
wayback(['-a', '-p', '0'])
|
|
||||||
|
|
||||||
def test_create_first_coll(self):
|
def test_create_first_coll(self):
|
||||||
""" Test first collection creation, with all required dirs
|
""" Test first collection creation, with all required dirs
|
||||||
@ -472,8 +456,6 @@ class TestManagedColls(TempDirTests, BaseTestClass):
|
|||||||
archive_sub_dir = os.path.join(archive_dir, 'sub')
|
archive_sub_dir = os.path.join(archive_dir, 'sub')
|
||||||
os.makedirs(archive_sub_dir)
|
os.makedirs(archive_sub_dir)
|
||||||
|
|
||||||
pywb.manager.autoindex.keep_running = True
|
|
||||||
|
|
||||||
def do_copy():
|
def do_copy():
|
||||||
try:
|
try:
|
||||||
time.sleep(1.0)
|
time.sleep(1.0)
|
||||||
@ -481,16 +463,12 @@ class TestManagedColls(TempDirTests, BaseTestClass):
|
|||||||
shutil.copy(self._get_sample_warc('example-extra.warc'), archive_sub_dir)
|
shutil.copy(self._get_sample_warc('example-extra.warc'), archive_sub_dir)
|
||||||
time.sleep(1.0)
|
time.sleep(1.0)
|
||||||
finally:
|
finally:
|
||||||
pywb.manager.autoindex.keep_running = False
|
indexer.interval = 0
|
||||||
|
|
||||||
|
indexer = AutoIndexer(interval=0.25)
|
||||||
|
indexer.start()
|
||||||
|
|
||||||
#thread = threading.Thread(target=do_copy)
|
|
||||||
#thread.daemon = True
|
|
||||||
#thread.start()
|
|
||||||
ge = gevent.spawn(do_copy)
|
ge = gevent.spawn(do_copy)
|
||||||
|
|
||||||
main(['autoindex', 'auto', '--interval', '0.25'])
|
|
||||||
|
|
||||||
#thread.join()
|
|
||||||
ge.join()
|
ge.join()
|
||||||
|
|
||||||
index_file = os.path.join(auto_dir, INDEX_DIR, AUTOINDEX_FILE)
|
index_file = os.path.join(auto_dir, INDEX_DIR, AUTOINDEX_FILE)
|
||||||
@ -505,7 +483,8 @@ class TestManagedColls(TempDirTests, BaseTestClass):
|
|||||||
mtime = os.path.getmtime(index_file)
|
mtime = os.path.getmtime(index_file)
|
||||||
|
|
||||||
# Update
|
# Update
|
||||||
pywb.manager.autoindex.keep_running = True
|
indexer.interval = 0.25
|
||||||
|
indexer.start()
|
||||||
|
|
||||||
os.remove(index_file)
|
os.remove(index_file)
|
||||||
|
|
||||||
@ -514,7 +493,7 @@ class TestManagedColls(TempDirTests, BaseTestClass):
|
|||||||
#thread.start()
|
#thread.start()
|
||||||
ge = gevent.spawn(do_copy)
|
ge = gevent.spawn(do_copy)
|
||||||
|
|
||||||
main(['autoindex', 'auto', '--interval', '0.25'])
|
#wayback(['-p', '0', '-a', '--auto-interval', '0.25'])
|
||||||
|
|
||||||
#thread.join()
|
#thread.join()
|
||||||
ge.join()
|
ge.join()
|
||||||
|
69
tests/test_record_replay.py
Normal file
69
tests/test_record_replay.py
Normal file
@ -0,0 +1,69 @@
|
|||||||
|
from .base_config_test import BaseConfigTest, fmod, CollsDirMixin
|
||||||
|
from pywb.manager.manager import main as manager
|
||||||
|
from pywb.manager.autoindex import AutoIndexer
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
class TestRecordReplay(CollsDirMixin, BaseConfigTest):
|
||||||
|
@classmethod
|
||||||
|
def setup_class(cls):
|
||||||
|
super(TestRecordReplay, cls).setup_class('config_test_record.yaml')
|
||||||
|
cls.indexer = AutoIndexer(interval=0.25)
|
||||||
|
cls.indexer.start()
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def teardown_class(cls):
|
||||||
|
cls.indexer.stop()
|
||||||
|
super(TestRecordReplay, cls).teardown_class()
|
||||||
|
|
||||||
|
def test_init_coll(self):
|
||||||
|
manager(['init', 'test'])
|
||||||
|
assert os.path.isdir(os.path.join(self.root_dir, '_test_colls', 'test', 'archive'))
|
||||||
|
|
||||||
|
manager(['init', 'test2'])
|
||||||
|
assert os.path.isdir(os.path.join(self.root_dir, '_test_colls', 'test2', 'archive'))
|
||||||
|
|
||||||
|
def test_record_1(self, fmod):
|
||||||
|
fmod_slash = fmod + '/' if fmod else ''
|
||||||
|
res = self.get('/test/record/mp_/http://httpbin.org/get?A=B', fmod_slash)
|
||||||
|
assert '"A": "B"' in res.text
|
||||||
|
|
||||||
|
def test_replay_1(self, fmod):
|
||||||
|
time.sleep(0.5)
|
||||||
|
|
||||||
|
fmod_slash = fmod + '/' if fmod else ''
|
||||||
|
res = self.get('/test/mp_/http://httpbin.org/get?A=B', fmod_slash)
|
||||||
|
assert '"A": "B"' in res.text
|
||||||
|
|
||||||
|
def test_record_2(self, fmod):
|
||||||
|
fmod_slash = fmod + '/' if fmod else ''
|
||||||
|
res = self.get('/test2/record/{0}http://httpbin.org/get?C=D', fmod_slash)
|
||||||
|
assert '"C": "D"' in res.text
|
||||||
|
|
||||||
|
def test_replay_2(self, fmod):
|
||||||
|
time.sleep(0.5)
|
||||||
|
|
||||||
|
fmod_slash = fmod + '/' if fmod else ''
|
||||||
|
res = self.get('/test2/{0}http://httpbin.org/get?C=D', fmod_slash)
|
||||||
|
assert '"C": "D"' in res.text
|
||||||
|
|
||||||
|
def test_record_again_1(self, fmod):
|
||||||
|
fmod_slash = fmod + '/' if fmod else ''
|
||||||
|
res = self.get('/test/record/{0}http://httpbin.org/get?C=D', fmod_slash)
|
||||||
|
assert '"C": "D"' in res.text
|
||||||
|
|
||||||
|
def test_replay_again_1(self, fmod):
|
||||||
|
time.sleep(0.5)
|
||||||
|
|
||||||
|
fmod_slash = fmod + '/' if fmod else ''
|
||||||
|
res = self.get('/test/{0}http://httpbin.org/get?C=D', fmod_slash)
|
||||||
|
assert '"C": "D"' in res.text
|
||||||
|
|
||||||
|
# two warcs, for framed and non-framed capture
|
||||||
|
assert len(os.listdir(os.path.join(self.root_dir, '_test_colls', 'test', 'archive'))) == 2
|
||||||
|
|
||||||
|
assert len(os.listdir(os.path.join(self.root_dir, '_test_colls', 'test', 'indexes'))) == 1
|
||||||
|
|
||||||
|
|
Loading…
x
Reference in New Issue
Block a user