mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
Recorder App Support (#241)
Recording support: now available for dynamic collections via config - config.yaml: a 'recorder: live' entry enables the /record/ subpath, which records into any dynamic collection (can record from any collection, though usually 'live') - autoindex refactor: simplified, standalone AutoIndexer() which indexes any changed WARC files into autoindex.cdxj - Windows autoindex support: also check for a changed file size, as the last-modified time may not change - manager: remove the autoindex command, now part of the main CLI - tests: update test_auto_colls for the autoindex changes - tests: add record/replay tests for recording and replay
This commit is contained in:
parent
a05916617d
commit
93921aadb7
@ -88,19 +88,19 @@ class ReplayCli(BaseCli):
|
||||
|
||||
def run(self):
|
||||
if self.r.autoindex:
|
||||
from pywb.manager.manager import CollectionsManager
|
||||
from pywb.manager.autoindex import AutoIndexer
|
||||
import os
|
||||
|
||||
m = CollectionsManager('', must_exist=False)
|
||||
if not os.path.isdir(m.colls_dir):
|
||||
indexer = AutoIndexer(interval=self.r.auto_interval)
|
||||
if not os.path.isdir(indexer.root_path):
|
||||
msg = 'No managed directory "{0}" for auto-indexing'
|
||||
logging.error(msg.format(m.colls_dir))
|
||||
logging.error(msg.format(indexer.root_path))
|
||||
import sys
|
||||
sys.exit(2)
|
||||
else:
|
||||
msg = 'Auto-Indexing Enabled on "{0}", checking every {1} secs'
|
||||
logging.info(msg.format(m.colls_dir, self.r.auto_interval))
|
||||
m.autoindex(interval=self.r.auto_interval, do_loop=False)
|
||||
|
||||
msg = 'Auto-Indexing Enabled on "{0}", checking every {1} secs'
|
||||
logging.info(msg.format(indexer.root_path, self.r.auto_interval))
|
||||
indexer.start()
|
||||
|
||||
super(ReplayCli, self).run()
|
||||
|
||||
|
@ -9,6 +9,9 @@ from six import iteritems
|
||||
|
||||
from warcio.utils import to_native_str
|
||||
|
||||
from pywb.recorder.multifilewarcwriter import MultiFileWARCWriter
|
||||
from pywb.recorder.recorderapp import RecorderApp
|
||||
|
||||
from pywb.utils.loaders import load_yaml_config
|
||||
from pywb.utils.geventserver import GeventServer
|
||||
|
||||
@ -32,10 +35,14 @@ class FrontEndApp(object):
|
||||
self.warcserver = WarcServer(config_file=config_file,
|
||||
custom_config=custom_config)
|
||||
|
||||
framed_replay = self.warcserver.config.get('framed_replay', True)
|
||||
config = self.warcserver.config
|
||||
|
||||
framed_replay = config.get('framed_replay', True)
|
||||
|
||||
self.warcserver_server = GeventServer(self.warcserver, port=0)
|
||||
|
||||
self.init_recorder(config)
|
||||
|
||||
self.static_handler = StaticHandler('pywb/static/')
|
||||
|
||||
self.url_map = Map()
|
||||
@ -44,36 +51,59 @@ class FrontEndApp(object):
|
||||
self.url_map.add(Rule('/collinfo.json', endpoint=self.serve_listing))
|
||||
|
||||
if self.is_valid_coll('$root'):
|
||||
self.url_map.add(Rule('/', endpoint=self.serve_coll_page))
|
||||
self.url_map.add(Rule('/timemap/<timemap_output>/<path:url>', endpoint=self.serve_content))
|
||||
self.url_map.add(Rule('/cdx', endpoint=self.serve_cdx))
|
||||
self.url_map.add(Rule('/<path:url>', endpoint=self.serve_content))
|
||||
|
||||
coll_prefix = ''
|
||||
else:
|
||||
self.url_map.add(Rule('/<coll>/', endpoint=self.serve_coll_page))
|
||||
self.url_map.add(Rule('/<coll>/timemap/<timemap_output>/<path:url>', endpoint=self.serve_content))
|
||||
self.url_map.add(Rule('/<coll>/cdx', endpoint=self.serve_cdx))
|
||||
self.url_map.add(Rule('/<coll>/<path:url>', endpoint=self.serve_content))
|
||||
|
||||
coll_prefix = '/<coll>'
|
||||
self.url_map.add(Rule('/', endpoint=self.serve_home))
|
||||
|
||||
self.url_map.add(Rule(coll_prefix + '/', endpoint=self.serve_coll_page))
|
||||
self.url_map.add(Rule(coll_prefix + '/timemap/<timemap_output>/<path:url>', endpoint=self.serve_content))
|
||||
self.url_map.add(Rule(coll_prefix + '/cdx', endpoint=self.serve_cdx))
|
||||
if self.recorder:
|
||||
self.url_map.add(Rule(coll_prefix + '/record/<path:url>', endpoint=self.serve_record))
|
||||
|
||||
self.url_map.add(Rule(coll_prefix + '/<path:url>', endpoint=self.serve_content))
|
||||
|
||||
upstream_paths = self.get_upstream_paths(self.warcserver_server.port)
|
||||
|
||||
self.rewriterapp = RewriterApp(framed_replay,
|
||||
config=self.warcserver.config,
|
||||
config=config,
|
||||
paths=upstream_paths)
|
||||
|
||||
self.templates_dir = self.warcserver.config.get('templates_dir', 'templates')
|
||||
self.static_dir = self.warcserver.config.get('static_dir', 'static')
|
||||
self.templates_dir = config.get('templates_dir', 'templates')
|
||||
self.static_dir = config.get('static_dir', 'static')
|
||||
|
||||
metadata_templ = os.path.join(self.warcserver.root_dir, '{coll}', 'metadata.yaml')
|
||||
self.metadata_cache = MetadataCache(metadata_templ)
|
||||
|
||||
def get_upstream_paths(self, port):
|
||||
return {
|
||||
base_paths = {
|
||||
'replay': 'http://localhost:%s/{coll}/resource/postreq' % port,
|
||||
'cdx-server': 'http://localhost:%s/{coll}/index' % port,
|
||||
}
|
||||
|
||||
if self.recorder:
|
||||
base_paths['record'] = 'http://localhost:%s/%s/resource/postreq?param.recorder.coll={coll}' % (self.recorder_port, self.recorder_source)
|
||||
|
||||
return base_paths
|
||||
|
||||
def init_recorder(self, config):
    """Set up the optional recorder from the 'recorder' config entry.

    The value of config['recorder'] is stored as self.recorder_source.
    When it is missing or falsy, recording is disabled: self.recorder and
    self.recorder_server are set to None and self.recorder_port to 0.

    Otherwise a RecorderApp is created that points at the local warcserver
    (via self.warcserver_server.port) and writes through a
    MultiFileWARCWriter using self.warcserver.archive_templ. The app is
    wrapped in its own GeventServer, and the port reported by that server
    is stored as self.recorder_port.

    :param config: mapping-like config object; only its get() is used here
    """
    source = config.get('recorder')
    self.recorder_source = source

    if source:
        # No dedup index is configured here, so None is passed explicitly.
        writer = MultiFileWARCWriter(self.warcserver.archive_templ,
                                     max_size=1000000000,
                                     max_idle_secs=600,
                                     dedup_index=None)

        upstream_url = 'http://localhost:' + str(self.warcserver_server.port)
        self.recorder = RecorderApp(upstream_url, writer)

        # port=0 is requested; the actual port is read back from the server.
        self.recorder_server = GeventServer(self.recorder, port=0)
        self.recorder_port = self.recorder_server.port
    else:
        self.recorder = None
        self.recorder_server = None
        self.recorder_port = 0
|
||||
|
||||
def serve_home(self, environ):
|
||||
home_view = BaseInsertView(self.rewriterapp.jinja_env, 'index.html')
|
||||
fixed_routes = self.warcserver.list_fixed_routes()
|
||||
@ -150,13 +180,19 @@ class FrontEndApp(object):
|
||||
return WbResponse.bin_stream(res.raw, content_type=res.headers.get('Content-Type'))
|
||||
|
||||
except Exception as e:
|
||||
return WbResponse.text_content('Error: ' + str(e), status='400 Bad Request')
|
||||
return WbResponse.text_response('Error: ' + str(e), status='400 Bad Request')
|
||||
|
||||
def serve_content(self, environ, coll='$root', url='', timemap_output=''):
|
||||
def serve_record(self, environ, coll='$root', url=''):
    """Serve `url` through serve_content() with recording enabled.

    Collections listed by self.warcserver.list_fixed_routes() can not be
    recorded into; a request for one of them is answered with a plain-text
    error instead of being passed on.

    :param environ: the WSGI environ of the request
    :param coll: name of the collection to record into
    :param url: the url portion of the request path
    :return: the response from serve_content(), or an error WbResponse
    """
    if coll in self.warcserver.list_fixed_routes():
        msg = 'Error: Can Not Record Into Custom Collection "{0}"'.format(coll)
        # Send an explicit client-error status, matching the other error
        # response in this class, instead of relying on text_response's
        # default status.
        return WbResponse.text_response(msg, status='400 Bad Request')

    return self.serve_content(environ, coll, url, record=True)
|
||||
|
||||
def serve_content(self, environ, coll='$root', url='', timemap_output='', record=False):
|
||||
if not self.is_valid_coll(coll):
|
||||
self.raise_not_found(environ, 'No handler for "/{0}"'.format(coll))
|
||||
|
||||
self.setup_paths(environ, coll)
|
||||
self.setup_paths(environ, coll, record)
|
||||
|
||||
wb_url_str = to_native_str(url)
|
||||
|
||||
@ -164,6 +200,10 @@ class FrontEndApp(object):
|
||||
wb_url_str += '?' + environ.get('QUERY_STRING')
|
||||
|
||||
metadata = self.get_metadata(coll)
|
||||
if record:
|
||||
metadata['type'] = 'record'
|
||||
print('RECORD')
|
||||
|
||||
if timemap_output:
|
||||
metadata['output'] = timemap_output
|
||||
|
||||
@ -175,12 +215,14 @@ class FrontEndApp(object):
|
||||
|
||||
return response
|
||||
|
||||
def setup_paths(self, environ, coll):
|
||||
def setup_paths(self, environ, coll, record=False):
|
||||
if not coll or not self.warcserver.root_dir:
|
||||
return
|
||||
|
||||
if coll != '$root':
|
||||
pop_path_info(environ)
|
||||
if record:
|
||||
pop_path_info(environ)
|
||||
|
||||
paths = [self.warcserver.root_dir]
|
||||
|
||||
|
@ -2,59 +2,106 @@ import gevent
|
||||
import time
|
||||
import re
|
||||
import os
|
||||
import logging
|
||||
|
||||
from pywb.manager.manager import CollectionsManager
|
||||
|
||||
|
||||
#=============================================================================
|
||||
EXT_RX = re.compile('.*\.w?arc(\.gz)?$')
|
||||
class AutoIndexer(object):
|
||||
EXT_RX = re.compile('.*\.w?arc(\.gz)?$')
|
||||
AUTO_INDEX_FILE = 'autoindex.cdxj'
|
||||
|
||||
keep_running = True
|
||||
def __init__(self, interval=30, keep_running=True):
|
||||
self.manager = CollectionsManager('', must_exist=False)
|
||||
|
||||
self.root_path = self.manager.colls_dir
|
||||
|
||||
#=============================================================================
|
||||
class CDXAutoIndexer(object):
|
||||
def __init__(self, updater, path):
|
||||
self.updater = updater
|
||||
self.root_path = path
|
||||
self.keep_running = keep_running
|
||||
|
||||
self.mtimes = {}
|
||||
self.interval = interval
|
||||
|
||||
def has_changed(self, *paths):
|
||||
full_path = os.path.join(*paths)
|
||||
self.last_size = {}
|
||||
|
||||
def is_newer_than(self, path1, path2, track=False):
|
||||
try:
|
||||
mtime = os.path.getmtime(full_path)
|
||||
mtime1 = os.path.getmtime(path1)
|
||||
mtime2 = os.path.getmtime(path2)
|
||||
newer = mtime1 > mtime2
|
||||
except:
|
||||
return False
|
||||
newer = True
|
||||
|
||||
if mtime == self.mtimes.get(full_path):
|
||||
return False
|
||||
if track:
|
||||
size = os.path.getsize(path1)
|
||||
try:
|
||||
if size != self.last_size[path1]:
|
||||
newer = True
|
||||
except:
|
||||
pass
|
||||
|
||||
self.mtimes[full_path] = mtime
|
||||
return full_path
|
||||
self.last_size[path1] = size
|
||||
|
||||
return newer
|
||||
|
||||
def do_index(self, files):
|
||||
logging.info('Auto-Indexing... ' + str(files))
|
||||
self.manager.index_merge(files, self.AUTO_INDEX_FILE)
|
||||
logging.info('...Done')
|
||||
|
||||
def check_path(self):
|
||||
for dirName, subdirList, fileList in os.walk(self.root_path):
|
||||
if not subdirList and not self.has_changed(dirName):
|
||||
return False
|
||||
for coll in os.listdir(self.root_path):
|
||||
coll_dir = os.path.join(self.root_path, coll)
|
||||
if not os.path.isdir(coll_dir):
|
||||
continue
|
||||
|
||||
for filename in fileList:
|
||||
if not EXT_RX.match(filename):
|
||||
self.manager.change_collection(coll)
|
||||
|
||||
archive_dir = self.manager.archive_dir
|
||||
|
||||
if not os.path.isdir(archive_dir):
|
||||
continue
|
||||
|
||||
index_file = os.path.join(self.manager.indexes_dir, self.AUTO_INDEX_FILE)
|
||||
|
||||
if os.path.isfile(index_file):
|
||||
if os.name != 'nt' and self.is_newer_than(archive_dir, index_file):
|
||||
continue
|
||||
else:
|
||||
try:
|
||||
os.makedirs(self.manager.indexes_dir)
|
||||
except Exception as e:
|
||||
pass
|
||||
|
||||
path = self.has_changed(self.root_path, dirName, filename)
|
||||
if not path:
|
||||
continue
|
||||
logging.info('Collection Possibly Changed: ' + coll)
|
||||
to_index = []
|
||||
for dirpath, dirnames, filenames in os.walk(archive_dir):
|
||||
for filename in filenames:
|
||||
if not self.EXT_RX.match(filename):
|
||||
continue
|
||||
|
||||
self.updater(os.path.join(dirName, filename))
|
||||
full_filename = os.path.join(dirpath, filename)
|
||||
|
||||
def do_loop(self, interval):
|
||||
if self.is_newer_than(full_filename, index_file, True):
|
||||
to_index.append(full_filename)
|
||||
|
||||
if to_index:
|
||||
self.do_index(to_index)
|
||||
|
||||
def run(self):
|
||||
try:
|
||||
while keep_running:
|
||||
while self.keep_running:
|
||||
self.check_path()
|
||||
time.sleep(interval)
|
||||
if not self.interval:
|
||||
break
|
||||
|
||||
time.sleep(self.interval)
|
||||
except KeyboardInterrupt: # pragma: no cover
|
||||
return
|
||||
|
||||
def start(self, interval):
|
||||
self.ge = gevent.spawn(self.do_loop, interval)
|
||||
def start(self):
|
||||
self.ge = gevent.spawn(self.run)
|
||||
|
||||
def stop(self):
|
||||
self.interval = 0
|
||||
self.keep_running = False
|
||||
|
||||
|
@ -33,7 +33,6 @@ It may be used via cmdline to setup and maintain the
|
||||
directory structure expected by pywb
|
||||
"""
|
||||
DEF_INDEX_FILE = 'index.cdxj'
|
||||
AUTO_INDEX_FILE = 'autoindex.cdxj'
|
||||
|
||||
COLL_RX = re.compile('^[\w][-\w]*$')
|
||||
|
||||
@ -48,12 +47,12 @@ directory structure expected by pywb
|
||||
|
||||
self.colls_dir = os.path.join(os.getcwd(), colls_dir)
|
||||
|
||||
self._set_coll_dirs(coll_name)
|
||||
self.change_collection(coll_name)
|
||||
|
||||
if must_exist:
|
||||
self._assert_coll_exists()
|
||||
|
||||
def _set_coll_dirs(self, coll_name):
|
||||
def change_collection(self, coll_name):
|
||||
self.coll_name = coll_name
|
||||
self.curr_coll_dir = os.path.join(self.colls_dir, coll_name)
|
||||
|
||||
@ -330,35 +329,6 @@ directory structure expected by pywb
|
||||
|
||||
migrate.convert_to_cdxj()
|
||||
|
||||
def autoindex(self, interval=30.0, do_loop=True):
|
||||
from pywb.manager.autoindex import CDXAutoIndexer
|
||||
|
||||
if self.coll_name:
|
||||
any_coll = False
|
||||
path = self.archive_dir
|
||||
else:
|
||||
path = self.colls_dir
|
||||
any_coll = True
|
||||
|
||||
def do_index(warc):
|
||||
if any_coll:
|
||||
coll_name = warc.split(self.colls_dir + os.path.sep)
|
||||
coll_name = coll_name[-1].split(os.path.sep)[0]
|
||||
|
||||
if coll_name != self.coll_name:
|
||||
self._set_coll_dirs(coll_name)
|
||||
|
||||
print('Auto-Indexing: ' + warc)
|
||||
self.index_merge([warc], self.AUTO_INDEX_FILE)
|
||||
print('Done.. Waiting for file updates')
|
||||
|
||||
|
||||
indexer = CDXAutoIndexer(do_index, path)
|
||||
indexer.start(interval)
|
||||
#indexer.start_watch()
|
||||
if do_loop:
|
||||
indexer.do_loop(interval)
|
||||
|
||||
|
||||
#=============================================================================
|
||||
def main(args=None):
|
||||
@ -469,17 +439,7 @@ Create manage file based web archive collections
|
||||
migrate.add_argument('-f', '--force', action='store_true')
|
||||
migrate.set_defaults(func=do_migrate)
|
||||
|
||||
# Auto Index
|
||||
def do_autoindex(r):
|
||||
m = CollectionsManager(r.coll_name, must_exist=False)
|
||||
m.autoindex(r.interval, True)
|
||||
|
||||
autoindex_help = 'Automatically index any change archive files'
|
||||
autoindex = subparsers.add_parser('autoindex', help=autoindex_help)
|
||||
autoindex.add_argument('coll_name', nargs='?', default='')
|
||||
autoindex.add_argument('--interval', type=float, default=30.0)
|
||||
autoindex.set_defaults(func=do_autoindex)
|
||||
|
||||
# Parse
|
||||
r = parser.parse_args(args=args)
|
||||
r.func(r)
|
||||
|
||||
|
@ -58,7 +58,7 @@ class BaseLoader(object):
|
||||
cdx['recorder_skip'] = '1'
|
||||
|
||||
out_headers['WebAgg-Cdx'] = to_native_str(cdx.to_cdxj().rstrip())
|
||||
out_headers['WebAgg-Source-Coll'] = source
|
||||
out_headers['WebAgg-Source-Coll'] = to_native_str(source)
|
||||
|
||||
if not warc_headers:
|
||||
if other_headers:
|
||||
|
@ -85,8 +85,8 @@ class FakeRedisTests(object):
|
||||
# ============================================================================
|
||||
class TempDirTests(object):
|
||||
@classmethod
|
||||
def setup_class(cls):
|
||||
super(TempDirTests, cls).setup_class()
|
||||
def setup_class(cls, *args, **kwargs):
|
||||
super(TempDirTests, cls).setup_class(*args, **kwargs)
|
||||
cls.root_dir = tempfile.mkdtemp()
|
||||
|
||||
@classmethod
|
||||
|
@ -57,6 +57,9 @@ class WarcServer(BaseWarcServer):
|
||||
|
||||
self.fixed_routes = self.load_colls()
|
||||
|
||||
self.archive_templ = None
|
||||
self.indexes_templ = None
|
||||
|
||||
for name, route in iteritems(self.fixed_routes):
|
||||
self.add_route('/' + name, route)
|
||||
|
||||
@ -82,13 +85,13 @@ class WarcServer(BaseWarcServer):
|
||||
return
|
||||
|
||||
#indexes_templ = os.path.join('{coll}', 'indexes') + os.path.sep
|
||||
indexes_templ = self.AUTO_DIR_INDEX_PATH.replace('/', os.path.sep)
|
||||
dir_source = CacheDirectoryIndexSource(self.root_dir, indexes_templ)
|
||||
self.indexes_templ = self.AUTO_DIR_INDEX_PATH.replace('/', os.path.sep)
|
||||
dir_source = CacheDirectoryIndexSource(self.root_dir, self.indexes_templ)
|
||||
|
||||
archive_templ = self.AUTO_DIR_ARCHIVE_PATH.replace('/', os.path.sep)
|
||||
archive_templ = os.path.join(self.root_dir, archive_templ)
|
||||
self.archive_templ = self.AUTO_DIR_ARCHIVE_PATH.replace('/', os.path.sep)
|
||||
self.archive_templ = os.path.join(self.root_dir, self.archive_templ)
|
||||
|
||||
handler = DefaultResourceHandler(dir_source, archive_templ)
|
||||
handler = DefaultResourceHandler(dir_source, self.archive_templ)
|
||||
|
||||
return handler
|
||||
|
||||
|
@ -3,7 +3,8 @@ from gevent import monkey; monkey.patch_all(thread=False)
|
||||
import pytest
|
||||
import webtest
|
||||
|
||||
from pywb.warcserver.test.testutils import BaseTestClass
|
||||
from pywb.warcserver.test.testutils import BaseTestClass, TempDirTests
|
||||
from pywb.manager.manager import main, CollectionsManager
|
||||
|
||||
from pywb.apps.frontendapp import FrontEndApp
|
||||
import os
|
||||
@ -24,16 +25,28 @@ class BaseConfigTest(BaseTestClass):
|
||||
@classmethod
|
||||
def get_test_app(cls, config_file, override=None):
|
||||
config_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), config_file)
|
||||
return webtest.TestApp(FrontEndApp(config_file=config_file, custom_config=override))
|
||||
app = FrontEndApp(config_file=config_file, custom_config=override)
|
||||
return app, webtest.TestApp(app)
|
||||
|
||||
@classmethod
|
||||
def setup_class(cls, config_file, include_non_frame=True):
|
||||
super(BaseConfigTest, cls).setup_class()
|
||||
cls.testapp = cls.get_test_app(config_file)
|
||||
cls.app, cls.testapp = cls.get_test_app(config_file)
|
||||
|
||||
if include_non_frame:
|
||||
cls.testapp_non_frame = cls.get_test_app(config_file,
|
||||
override={'framed_replay': False})
|
||||
cls.app_non_frame, cls.testapp_non_frame = cls.get_test_app(config_file,
|
||||
override={'framed_replay': False})
|
||||
|
||||
@classmethod
|
||||
def teardown_class(cls):
|
||||
if cls.app.recorder:
|
||||
cls.app.recorder.writer.close()
|
||||
|
||||
if cls.app_non_frame.recorder:
|
||||
cls.app_non_frame.recorder.writer.close()
|
||||
|
||||
super(BaseConfigTest, cls).teardown_class()
|
||||
|
||||
def _assert_basic_html(self, resp):
|
||||
assert resp.status_int == 200
|
||||
assert resp.content_type == 'text/html'
|
||||
@ -61,3 +74,21 @@ class BaseConfigTest(BaseTestClass):
|
||||
return app.head(url.format(fmod), *args, **kwargs)
|
||||
|
||||
|
||||
#=============================================================================
|
||||
class CollsDirMixin(TempDirTests):
    """Test mixin that runs a test class from inside its temp root dir.

    On setup, the working directory is switched to the (real path of the)
    class root_dir and CollectionsManager.COLLS_DIR is pointed at
    COLLS_DIR. On teardown, both are put back to their previous values.
    """
    # Collections directory name used while the test class is active
    COLLS_DIR = '_test_colls'

    @classmethod
    def setup_class(cls, *args, **kwargs):
        """Run the parent setup, then chdir into root_dir and override
        CollectionsManager.COLLS_DIR, remembering the original values."""
        super(CollsDirMixin, cls).setup_class(*args, **kwargs)

        # remember the current state so teardown_class can restore it
        cls.orig_cwd = os.getcwd()
        cls.orig_collections = CollectionsManager.COLLS_DIR

        real_root = os.path.realpath(cls.root_dir)
        cls.root_dir = real_root
        os.chdir(real_root)

        CollectionsManager.COLLS_DIR = cls.COLLS_DIR

    @classmethod
    def teardown_class(cls):
        """Restore the working directory and COLLS_DIR, then run the
        parent teardown."""
        os.chdir(cls.orig_cwd)
        CollectionsManager.COLLS_DIR = cls.orig_collections
        super(CollsDirMixin, cls).teardown_class()
|
||||
|
10
tests/config_test_record.yaml
Normal file
10
tests/config_test_record.yaml
Normal file
@ -0,0 +1,10 @@
|
||||
debug: true
|
||||
|
||||
collections_root: _test_colls
|
||||
|
||||
recorder: live
|
||||
|
||||
collections:
|
||||
'live': '$live'
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
from gevent.monkey import patch_all; patch_all()
|
||||
from .base_config_test import CollsDirMixin
|
||||
|
||||
import os
|
||||
import tempfile
|
||||
@ -17,11 +17,10 @@ from pytest import raises
|
||||
from mock import patch
|
||||
|
||||
from pywb import get_test_dir
|
||||
from pywb.warcserver.test.testutils import TempDirTests, BaseTestClass
|
||||
from pywb.warcserver.test.testutils import BaseTestClass
|
||||
|
||||
from pywb.manager.manager import main, CollectionsManager
|
||||
|
||||
import pywb.manager.autoindex
|
||||
from pywb.manager.autoindex import AutoIndexer
|
||||
from pywb.manager.manager import main
|
||||
|
||||
from pywb.indexer.cdxindexer import main as cdxindexer_main
|
||||
from pywb.warcserver.index.cdxobject import CDXObject
|
||||
@ -34,26 +33,12 @@ ARCHIVE_DIR = 'archive'
|
||||
INDEX_DIR = 'indexes'
|
||||
COLLECTIONS = '_test_colls'
|
||||
|
||||
CollectionsManager.COLLS_DIR = COLLECTIONS
|
||||
|
||||
INDEX_FILE = 'index.cdxj'
|
||||
AUTOINDEX_FILE = 'autoindex.cdxj'
|
||||
|
||||
|
||||
#=============================================================================
|
||||
class TestManagedColls(TempDirTests, BaseTestClass):
|
||||
@classmethod
|
||||
def setup_class(cls):
|
||||
super(TestManagedColls, cls).setup_class()
|
||||
cls.orig_cwd = os.getcwd()
|
||||
cls.root_dir = os.path.realpath(cls.root_dir)
|
||||
os.chdir(cls.root_dir)
|
||||
|
||||
@classmethod
|
||||
def teardown_class(cls):
|
||||
os.chdir(cls.orig_cwd)
|
||||
super(TestManagedColls, cls).teardown_class()
|
||||
|
||||
class TestManagedColls(CollsDirMixin, BaseTestClass):
|
||||
def _check_dirs(self, base, dirlist):
|
||||
for dir_ in dirlist:
|
||||
assert os.path.isdir(os.path.join(base, dir_))
|
||||
@ -82,8 +67,7 @@ class TestManagedColls(TempDirTests, BaseTestClass):
|
||||
colls = os.path.join(self.root_dir, COLLECTIONS)
|
||||
os.mkdir(colls)
|
||||
|
||||
pywb.manager.autoindex.keep_running = False
|
||||
wayback(['-a', '-p', '0'])
|
||||
wayback(['-a', '-p', '0', '--auto-interval', '0'])
|
||||
|
||||
def test_create_first_coll(self):
|
||||
""" Test first collection creation, with all required dirs
|
||||
@ -472,8 +456,6 @@ class TestManagedColls(TempDirTests, BaseTestClass):
|
||||
archive_sub_dir = os.path.join(archive_dir, 'sub')
|
||||
os.makedirs(archive_sub_dir)
|
||||
|
||||
pywb.manager.autoindex.keep_running = True
|
||||
|
||||
def do_copy():
|
||||
try:
|
||||
time.sleep(1.0)
|
||||
@ -481,16 +463,12 @@ class TestManagedColls(TempDirTests, BaseTestClass):
|
||||
shutil.copy(self._get_sample_warc('example-extra.warc'), archive_sub_dir)
|
||||
time.sleep(1.0)
|
||||
finally:
|
||||
pywb.manager.autoindex.keep_running = False
|
||||
indexer.interval = 0
|
||||
|
||||
indexer = AutoIndexer(interval=0.25)
|
||||
indexer.start()
|
||||
|
||||
#thread = threading.Thread(target=do_copy)
|
||||
#thread.daemon = True
|
||||
#thread.start()
|
||||
ge = gevent.spawn(do_copy)
|
||||
|
||||
main(['autoindex', 'auto', '--interval', '0.25'])
|
||||
|
||||
#thread.join()
|
||||
ge.join()
|
||||
|
||||
index_file = os.path.join(auto_dir, INDEX_DIR, AUTOINDEX_FILE)
|
||||
@ -505,7 +483,8 @@ class TestManagedColls(TempDirTests, BaseTestClass):
|
||||
mtime = os.path.getmtime(index_file)
|
||||
|
||||
# Update
|
||||
pywb.manager.autoindex.keep_running = True
|
||||
indexer.interval = 0.25
|
||||
indexer.start()
|
||||
|
||||
os.remove(index_file)
|
||||
|
||||
@ -514,7 +493,7 @@ class TestManagedColls(TempDirTests, BaseTestClass):
|
||||
#thread.start()
|
||||
ge = gevent.spawn(do_copy)
|
||||
|
||||
main(['autoindex', 'auto', '--interval', '0.25'])
|
||||
#wayback(['-p', '0', '-a', '--auto-interval', '0.25'])
|
||||
|
||||
#thread.join()
|
||||
ge.join()
|
||||
|
69
tests/test_record_replay.py
Normal file
69
tests/test_record_replay.py
Normal file
@ -0,0 +1,69 @@
|
||||
from .base_config_test import BaseConfigTest, fmod, CollsDirMixin
|
||||
from pywb.manager.manager import main as manager
|
||||
from pywb.manager.autoindex import AutoIndexer
|
||||
import os
|
||||
import time
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class TestRecordReplay(CollsDirMixin, BaseConfigTest):
    """Record pages into dynamic collections, then replay them.

    An AutoIndexer with a 0.25 second interval is started for the whole
    class. Each replay test first sleeps 0.5 seconds and then requests a
    url that an earlier record test fetched, so the tests are written to
    run in definition order.
    """

    @classmethod
    def setup_class(cls):
        """Load config_test_record.yaml and start the auto-indexer."""
        super(TestRecordReplay, cls).setup_class('config_test_record.yaml')

        cls.indexer = AutoIndexer(interval=0.25)
        cls.indexer.start()

    @classmethod
    def teardown_class(cls):
        """Stop the auto-indexer before the parent teardown runs."""
        cls.indexer.stop()
        super(TestRecordReplay, cls).teardown_class()

    @staticmethod
    def _mod_prefix(fmod):
        """Return fmod followed by a slash, or '' when fmod is empty."""
        if fmod:
            return fmod + '/'
        return ''

    def _coll_dir(self, coll, subdir):
        """Return the path of `subdir` inside collection `coll`."""
        return os.path.join(self.root_dir, '_test_colls', coll, subdir)

    def test_init_coll(self):
        for coll in ('test', 'test2'):
            manager(['init', coll])
            assert os.path.isdir(self._coll_dir(coll, 'archive'))

    def test_record_1(self, fmod):
        prefix = self._mod_prefix(fmod)
        res = self.get('/test/record/mp_/http://httpbin.org/get?A=B', prefix)
        assert '"A": "B"' in res.text

    def test_replay_1(self, fmod):
        # wait before replaying what test_record_1 recorded
        time.sleep(0.5)

        prefix = self._mod_prefix(fmod)
        res = self.get('/test/mp_/http://httpbin.org/get?A=B', prefix)
        assert '"A": "B"' in res.text

    def test_record_2(self, fmod):
        prefix = self._mod_prefix(fmod)
        res = self.get('/test2/record/{0}http://httpbin.org/get?C=D', prefix)
        assert '"C": "D"' in res.text

    def test_replay_2(self, fmod):
        # wait before replaying what test_record_2 recorded
        time.sleep(0.5)

        prefix = self._mod_prefix(fmod)
        res = self.get('/test2/{0}http://httpbin.org/get?C=D', prefix)
        assert '"C": "D"' in res.text

    def test_record_again_1(self, fmod):
        prefix = self._mod_prefix(fmod)
        res = self.get('/test/record/{0}http://httpbin.org/get?C=D', prefix)
        assert '"C": "D"' in res.text

    def test_replay_again_1(self, fmod):
        # wait before replaying what test_record_again_1 recorded
        time.sleep(0.5)

        prefix = self._mod_prefix(fmod)
        res = self.get('/test/{0}http://httpbin.org/get?C=D', prefix)
        assert '"C": "D"' in res.text

        # two warcs, for framed and non-framed capture
        archive_files = os.listdir(self._coll_dir('test', 'archive'))
        assert len(archive_files) == 2

        index_files = os.listdir(self._coll_dir('test', 'indexes'))
        assert len(index_files) == 1
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user