1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Recorder App Support (#241)

recording support: now available for dynamic collections via config
- config.yaml 'recorder: live' entry enables the /record/ subpath, which records into any dynamic collection (the recording source can be any collection, though it is usually live)
- autoindex refactor: simplified, standalone AutoIndexer() -- indexes any changed warc files to autoindex.cdxj
- windows autoindex support: also check for changed file size, as last modified time may not be changing
- manager: remove autoindex, now part of main cli
- tests: updated test_auto_colls with autoindex changes
- tests: add record/replay tests for recording and replay
This commit is contained in:
Ilya Kreymer 2017-09-21 22:12:57 -07:00 committed by GitHub
parent a05916617d
commit 93921aadb7
11 changed files with 288 additions and 147 deletions

View File

@ -88,19 +88,19 @@ class ReplayCli(BaseCli):
def run(self):
if self.r.autoindex:
from pywb.manager.manager import CollectionsManager
from pywb.manager.autoindex import AutoIndexer
import os
m = CollectionsManager('', must_exist=False)
if not os.path.isdir(m.colls_dir):
indexer = AutoIndexer(interval=self.r.auto_interval)
if not os.path.isdir(indexer.root_path):
msg = 'No managed directory "{0}" for auto-indexing'
logging.error(msg.format(m.colls_dir))
logging.error(msg.format(indexer.root_path))
import sys
sys.exit(2)
else:
msg = 'Auto-Indexing Enabled on "{0}", checking every {1} secs'
logging.info(msg.format(m.colls_dir, self.r.auto_interval))
m.autoindex(interval=self.r.auto_interval, do_loop=False)
msg = 'Auto-Indexing Enabled on "{0}", checking every {1} secs'
logging.info(msg.format(indexer.root_path, self.r.auto_interval))
indexer.start()
super(ReplayCli, self).run()

View File

@ -9,6 +9,9 @@ from six import iteritems
from warcio.utils import to_native_str
from pywb.recorder.multifilewarcwriter import MultiFileWARCWriter
from pywb.recorder.recorderapp import RecorderApp
from pywb.utils.loaders import load_yaml_config
from pywb.utils.geventserver import GeventServer
@ -32,10 +35,14 @@ class FrontEndApp(object):
self.warcserver = WarcServer(config_file=config_file,
custom_config=custom_config)
framed_replay = self.warcserver.config.get('framed_replay', True)
config = self.warcserver.config
framed_replay = config.get('framed_replay', True)
self.warcserver_server = GeventServer(self.warcserver, port=0)
self.init_recorder(config)
self.static_handler = StaticHandler('pywb/static/')
self.url_map = Map()
@ -44,36 +51,59 @@ class FrontEndApp(object):
self.url_map.add(Rule('/collinfo.json', endpoint=self.serve_listing))
if self.is_valid_coll('$root'):
self.url_map.add(Rule('/', endpoint=self.serve_coll_page))
self.url_map.add(Rule('/timemap/<timemap_output>/<path:url>', endpoint=self.serve_content))
self.url_map.add(Rule('/cdx', endpoint=self.serve_cdx))
self.url_map.add(Rule('/<path:url>', endpoint=self.serve_content))
coll_prefix = ''
else:
self.url_map.add(Rule('/<coll>/', endpoint=self.serve_coll_page))
self.url_map.add(Rule('/<coll>/timemap/<timemap_output>/<path:url>', endpoint=self.serve_content))
self.url_map.add(Rule('/<coll>/cdx', endpoint=self.serve_cdx))
self.url_map.add(Rule('/<coll>/<path:url>', endpoint=self.serve_content))
coll_prefix = '/<coll>'
self.url_map.add(Rule('/', endpoint=self.serve_home))
self.url_map.add(Rule(coll_prefix + '/', endpoint=self.serve_coll_page))
self.url_map.add(Rule(coll_prefix + '/timemap/<timemap_output>/<path:url>', endpoint=self.serve_content))
self.url_map.add(Rule(coll_prefix + '/cdx', endpoint=self.serve_cdx))
if self.recorder:
self.url_map.add(Rule(coll_prefix + '/record/<path:url>', endpoint=self.serve_record))
self.url_map.add(Rule(coll_prefix + '/<path:url>', endpoint=self.serve_content))
upstream_paths = self.get_upstream_paths(self.warcserver_server.port)
self.rewriterapp = RewriterApp(framed_replay,
config=self.warcserver.config,
config=config,
paths=upstream_paths)
self.templates_dir = self.warcserver.config.get('templates_dir', 'templates')
self.static_dir = self.warcserver.config.get('static_dir', 'static')
self.templates_dir = config.get('templates_dir', 'templates')
self.static_dir = config.get('static_dir', 'static')
metadata_templ = os.path.join(self.warcserver.root_dir, '{coll}', 'metadata.yaml')
self.metadata_cache = MetadataCache(metadata_templ)
def get_upstream_paths(self, port):
return {
base_paths = {
'replay': 'http://localhost:%s/{coll}/resource/postreq' % port,
'cdx-server': 'http://localhost:%s/{coll}/index' % port,
}
if self.recorder:
base_paths['record'] = 'http://localhost:%s/%s/resource/postreq?param.recorder.coll={coll}' % (self.recorder_port, self.recorder_source)
return base_paths
def init_recorder(self, config):
    """Set up the optional recording endpoint described by ``config``.

    The ``recorder`` entry of ``config`` is stored on
    ``self.recorder_source``.  When that entry is missing or empty,
    recording is disabled: ``self.recorder`` and ``self.recorder_server``
    are set to ``None`` and ``self.recorder_port`` to ``0``.

    Otherwise a ``RecorderApp`` pointed at the local warcserver is created,
    writing through a ``MultiFileWARCWriter`` built from the warcserver's
    archive template, and is wrapped in its own ``GeventServer``.

    :param config: mapping supporting ``get()``; only ``recorder`` is read
    """
    source = config.get('recorder')
    self.recorder_source = source

    if not source:
        self.recorder = self.recorder_server = None
        self.recorder_port = 0
        return

    # no dedup index is passed to the writer
    writer_options = dict(max_size=1000000000,
                          max_idle_secs=600,
                          dedup_index=None)

    warc_writer = MultiFileWARCWriter(self.warcserver.archive_templ,
                                      **writer_options)

    upstream_url = 'http://localhost:' + str(self.warcserver_server.port)
    self.recorder = RecorderApp(upstream_url, warc_writer)

    # the server is requested with port=0; the port actually in use is
    # read back from the server object
    server = GeventServer(self.recorder, port=0)
    self.recorder_server = server
    self.recorder_port = server.port
def serve_home(self, environ):
home_view = BaseInsertView(self.rewriterapp.jinja_env, 'index.html')
fixed_routes = self.warcserver.list_fixed_routes()
@ -150,13 +180,19 @@ class FrontEndApp(object):
return WbResponse.bin_stream(res.raw, content_type=res.headers.get('Content-Type'))
except Exception as e:
return WbResponse.text_content('Error: ' + str(e), status='400 Bad Request')
return WbResponse.text_response('Error: ' + str(e), status='400 Bad Request')
def serve_content(self, environ, coll='$root', url='', timemap_output=''):
def serve_record(self, environ, coll='$root', url=''):
    """Serve ``url`` through the recording path of collection ``coll``.

    Collections listed by ``self.warcserver.list_fixed_routes()`` cannot be
    recorded into; for those an error response is returned.  Every other
    request is delegated to ``serve_content`` with ``record=True``.

    :param environ: request environ, passed through to ``serve_content``
    :param coll: name of the collection to record into
    :param url: the url to record
    :return: the response from ``serve_content``, or a
             ``400 Bad Request`` text response for a fixed-route collection
    """
    if coll in self.warcserver.list_fixed_routes():
        msg = 'Error: Can Not Record Into Custom Collection "{0}"'.format(coll)
        # report the refusal as a client error instead of the default status
        return WbResponse.text_response(msg, status='400 Bad Request')

    return self.serve_content(environ, coll, url, record=True)
def serve_content(self, environ, coll='$root', url='', timemap_output='', record=False):
if not self.is_valid_coll(coll):
self.raise_not_found(environ, 'No handler for "/{0}"'.format(coll))
self.setup_paths(environ, coll)
self.setup_paths(environ, coll, record)
wb_url_str = to_native_str(url)
@ -164,6 +200,10 @@ class FrontEndApp(object):
wb_url_str += '?' + environ.get('QUERY_STRING')
metadata = self.get_metadata(coll)
if record:
metadata['type'] = 'record'
print('RECORD')
if timemap_output:
metadata['output'] = timemap_output
@ -175,12 +215,14 @@ class FrontEndApp(object):
return response
def setup_paths(self, environ, coll):
def setup_paths(self, environ, coll, record=False):
if not coll or not self.warcserver.root_dir:
return
if coll != '$root':
pop_path_info(environ)
if record:
pop_path_info(environ)
paths = [self.warcserver.root_dir]

View File

@ -2,59 +2,106 @@ import gevent
import time
import re
import os
import logging
from pywb.manager.manager import CollectionsManager
#=============================================================================
EXT_RX = re.compile('.*\.w?arc(\.gz)?$')
class AutoIndexer(object):
EXT_RX = re.compile('.*\.w?arc(\.gz)?$')
AUTO_INDEX_FILE = 'autoindex.cdxj'
keep_running = True
def __init__(self, interval=30, keep_running=True):
self.manager = CollectionsManager('', must_exist=False)
self.root_path = self.manager.colls_dir
#=============================================================================
class CDXAutoIndexer(object):
def __init__(self, updater, path):
self.updater = updater
self.root_path = path
self.keep_running = keep_running
self.mtimes = {}
self.interval = interval
def has_changed(self, *paths):
full_path = os.path.join(*paths)
self.last_size = {}
def is_newer_than(self, path1, path2, track=False):
try:
mtime = os.path.getmtime(full_path)
mtime1 = os.path.getmtime(path1)
mtime2 = os.path.getmtime(path2)
newer = mtime1 > mtime2
except:
return False
newer = True
if mtime == self.mtimes.get(full_path):
return False
if track:
size = os.path.getsize(path1)
try:
if size != self.last_size[path1]:
newer = True
except:
pass
self.mtimes[full_path] = mtime
return full_path
self.last_size[path1] = size
return newer
def do_index(self, files):
    """Merge the given archive files into the collection's auto-index.

    Logs before and after, and hands ``files`` to
    ``self.manager.index_merge`` with ``self.AUTO_INDEX_FILE`` as the
    target index name.

    :param files: list of archive file paths to index
    """
    start_msg = 'Auto-Indexing... ' + str(files)
    logging.info(start_msg)

    target_index = self.AUTO_INDEX_FILE
    self.manager.index_merge(files, target_index)

    logging.info('...Done')
def check_path(self):
for dirName, subdirList, fileList in os.walk(self.root_path):
if not subdirList and not self.has_changed(dirName):
return False
for coll in os.listdir(self.root_path):
coll_dir = os.path.join(self.root_path, coll)
if not os.path.isdir(coll_dir):
continue
for filename in fileList:
if not EXT_RX.match(filename):
self.manager.change_collection(coll)
archive_dir = self.manager.archive_dir
if not os.path.isdir(archive_dir):
continue
index_file = os.path.join(self.manager.indexes_dir, self.AUTO_INDEX_FILE)
if os.path.isfile(index_file):
if os.name != 'nt' and self.is_newer_than(archive_dir, index_file):
continue
else:
try:
os.makedirs(self.manager.indexes_dir)
except Exception as e:
pass
path = self.has_changed(self.root_path, dirName, filename)
if not path:
continue
logging.info('Collection Possibly Changed: ' + coll)
to_index = []
for dirpath, dirnames, filenames in os.walk(archive_dir):
for filename in filenames:
if not self.EXT_RX.match(filename):
continue
self.updater(os.path.join(dirName, filename))
full_filename = os.path.join(dirpath, filename)
def do_loop(self, interval):
if self.is_newer_than(full_filename, index_file, True):
to_index.append(full_filename)
if to_index:
self.do_index(to_index)
def run(self):
try:
while keep_running:
while self.keep_running:
self.check_path()
time.sleep(interval)
if not self.interval:
break
time.sleep(self.interval)
except KeyboardInterrupt: # pragma: no cover
return
def start(self, interval):
self.ge = gevent.spawn(self.do_loop, interval)
def start(self):
    """Launch ``self.run`` via ``gevent.spawn`` and keep the result on ``self.ge``."""
    spawned = gevent.spawn(self.run)
    self.ge = spawned
def stop(self):
    """Ask the indexing loop to finish.

    Clears ``keep_running`` and zeroes ``interval``; both are checked by
    the loop in ``run``.
    """
    self.interval, self.keep_running = 0, False

View File

@ -33,7 +33,6 @@ It may be used via cmdline to setup and maintain the
directory structure expected by pywb
"""
DEF_INDEX_FILE = 'index.cdxj'
AUTO_INDEX_FILE = 'autoindex.cdxj'
COLL_RX = re.compile('^[\w][-\w]*$')
@ -48,12 +47,12 @@ directory structure expected by pywb
self.colls_dir = os.path.join(os.getcwd(), colls_dir)
self._set_coll_dirs(coll_name)
self.change_collection(coll_name)
if must_exist:
self._assert_coll_exists()
def _set_coll_dirs(self, coll_name):
def change_collection(self, coll_name):
self.coll_name = coll_name
self.curr_coll_dir = os.path.join(self.colls_dir, coll_name)
@ -330,35 +329,6 @@ directory structure expected by pywb
migrate.convert_to_cdxj()
def autoindex(self, interval=30.0, do_loop=True):
from pywb.manager.autoindex import CDXAutoIndexer
if self.coll_name:
any_coll = False
path = self.archive_dir
else:
path = self.colls_dir
any_coll = True
def do_index(warc):
if any_coll:
coll_name = warc.split(self.colls_dir + os.path.sep)
coll_name = coll_name[-1].split(os.path.sep)[0]
if coll_name != self.coll_name:
self._set_coll_dirs(coll_name)
print('Auto-Indexing: ' + warc)
self.index_merge([warc], self.AUTO_INDEX_FILE)
print('Done.. Waiting for file updates')
indexer = CDXAutoIndexer(do_index, path)
indexer.start(interval)
#indexer.start_watch()
if do_loop:
indexer.do_loop(interval)
#=============================================================================
def main(args=None):
@ -469,17 +439,7 @@ Create manage file based web archive collections
migrate.add_argument('-f', '--force', action='store_true')
migrate.set_defaults(func=do_migrate)
# Auto Index
def do_autoindex(r):
m = CollectionsManager(r.coll_name, must_exist=False)
m.autoindex(r.interval, True)
autoindex_help = 'Automatically index any change archive files'
autoindex = subparsers.add_parser('autoindex', help=autoindex_help)
autoindex.add_argument('coll_name', nargs='?', default='')
autoindex.add_argument('--interval', type=float, default=30.0)
autoindex.set_defaults(func=do_autoindex)
# Parse
r = parser.parse_args(args=args)
r.func(r)

View File

@ -58,7 +58,7 @@ class BaseLoader(object):
cdx['recorder_skip'] = '1'
out_headers['WebAgg-Cdx'] = to_native_str(cdx.to_cdxj().rstrip())
out_headers['WebAgg-Source-Coll'] = source
out_headers['WebAgg-Source-Coll'] = to_native_str(source)
if not warc_headers:
if other_headers:

View File

@ -85,8 +85,8 @@ class FakeRedisTests(object):
# ============================================================================
class TempDirTests(object):
@classmethod
def setup_class(cls):
super(TempDirTests, cls).setup_class()
def setup_class(cls, *args, **kwargs):
super(TempDirTests, cls).setup_class(*args, **kwargs)
cls.root_dir = tempfile.mkdtemp()
@classmethod

View File

@ -57,6 +57,9 @@ class WarcServer(BaseWarcServer):
self.fixed_routes = self.load_colls()
self.archive_templ = None
self.indexes_templ = None
for name, route in iteritems(self.fixed_routes):
self.add_route('/' + name, route)
@ -82,13 +85,13 @@ class WarcServer(BaseWarcServer):
return
#indexes_templ = os.path.join('{coll}', 'indexes') + os.path.sep
indexes_templ = self.AUTO_DIR_INDEX_PATH.replace('/', os.path.sep)
dir_source = CacheDirectoryIndexSource(self.root_dir, indexes_templ)
self.indexes_templ = self.AUTO_DIR_INDEX_PATH.replace('/', os.path.sep)
dir_source = CacheDirectoryIndexSource(self.root_dir, self.indexes_templ)
archive_templ = self.AUTO_DIR_ARCHIVE_PATH.replace('/', os.path.sep)
archive_templ = os.path.join(self.root_dir, archive_templ)
self.archive_templ = self.AUTO_DIR_ARCHIVE_PATH.replace('/', os.path.sep)
self.archive_templ = os.path.join(self.root_dir, self.archive_templ)
handler = DefaultResourceHandler(dir_source, archive_templ)
handler = DefaultResourceHandler(dir_source, self.archive_templ)
return handler

View File

@ -3,7 +3,8 @@ from gevent import monkey; monkey.patch_all(thread=False)
import pytest
import webtest
from pywb.warcserver.test.testutils import BaseTestClass
from pywb.warcserver.test.testutils import BaseTestClass, TempDirTests
from pywb.manager.manager import main, CollectionsManager
from pywb.apps.frontendapp import FrontEndApp
import os
@ -24,16 +25,28 @@ class BaseConfigTest(BaseTestClass):
@classmethod
def get_test_app(cls, config_file, override=None):
config_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), config_file)
return webtest.TestApp(FrontEndApp(config_file=config_file, custom_config=override))
app = FrontEndApp(config_file=config_file, custom_config=override)
return app, webtest.TestApp(app)
@classmethod
def setup_class(cls, config_file, include_non_frame=True):
super(BaseConfigTest, cls).setup_class()
cls.testapp = cls.get_test_app(config_file)
cls.app, cls.testapp = cls.get_test_app(config_file)
if include_non_frame:
cls.testapp_non_frame = cls.get_test_app(config_file,
override={'framed_replay': False})
cls.app_non_frame, cls.testapp_non_frame = cls.get_test_app(config_file,
override={'framed_replay': False})
@classmethod
def teardown_class(cls):
    """Close the WARC writer of any recorder created by the test apps.

    ``app_non_frame`` is only assigned by ``setup_class`` when
    ``include_non_frame`` is true, so each app is looked up defensively
    rather than assumed to exist.  The parent teardown runs afterwards.
    """
    for app_attr in ('app', 'app_non_frame'):
        app = getattr(cls, app_attr, None)
        if app is not None and app.recorder:
            app.recorder.writer.close()

    super(BaseConfigTest, cls).teardown_class()
def _assert_basic_html(self, resp):
assert resp.status_int == 200
assert resp.content_type == 'text/html'
@ -61,3 +74,21 @@ class BaseConfigTest(BaseTestClass):
return app.head(url.format(fmod), *args, **kwargs)
#=============================================================================
class CollsDirMixin(TempDirTests):
    """Run a test class from inside its temp dir with a test collections dir.

    On setup the working directory is switched to the (real path of the)
    temp ``root_dir`` and ``CollectionsManager.COLLS_DIR`` is overridden
    with ``COLLS_DIR``; both are restored on teardown.
    """
    COLLS_DIR = '_test_colls'

    @classmethod
    def setup_class(cls, *args, **kwargs):
        super(CollsDirMixin, cls).setup_class(*args, **kwargs)

        # remember what has to be restored in teardown_class
        cls.orig_cwd = os.getcwd()
        cls.orig_collections = CollectionsManager.COLLS_DIR

        real_root = os.path.realpath(cls.root_dir)
        cls.root_dir = real_root
        os.chdir(real_root)

        CollectionsManager.COLLS_DIR = cls.COLLS_DIR

    @classmethod
    def teardown_class(cls):
        os.chdir(cls.orig_cwd)
        CollectionsManager.COLLS_DIR = cls.orig_collections

        super(CollsDirMixin, cls).teardown_class()

View File

@ -0,0 +1,10 @@
debug: true
collections_root: _test_colls
recorder: live
collections:
'live': '$live'

View File

@ -1,4 +1,4 @@
from gevent.monkey import patch_all; patch_all()
from .base_config_test import CollsDirMixin
import os
import tempfile
@ -17,11 +17,10 @@ from pytest import raises
from mock import patch
from pywb import get_test_dir
from pywb.warcserver.test.testutils import TempDirTests, BaseTestClass
from pywb.warcserver.test.testutils import BaseTestClass
from pywb.manager.manager import main, CollectionsManager
import pywb.manager.autoindex
from pywb.manager.autoindex import AutoIndexer
from pywb.manager.manager import main
from pywb.indexer.cdxindexer import main as cdxindexer_main
from pywb.warcserver.index.cdxobject import CDXObject
@ -34,26 +33,12 @@ ARCHIVE_DIR = 'archive'
INDEX_DIR = 'indexes'
COLLECTIONS = '_test_colls'
CollectionsManager.COLLS_DIR = COLLECTIONS
INDEX_FILE = 'index.cdxj'
AUTOINDEX_FILE = 'autoindex.cdxj'
#=============================================================================
class TestManagedColls(TempDirTests, BaseTestClass):
@classmethod
def setup_class(cls):
super(TestManagedColls, cls).setup_class()
cls.orig_cwd = os.getcwd()
cls.root_dir = os.path.realpath(cls.root_dir)
os.chdir(cls.root_dir)
@classmethod
def teardown_class(cls):
os.chdir(cls.orig_cwd)
super(TestManagedColls, cls).teardown_class()
class TestManagedColls(CollsDirMixin, BaseTestClass):
def _check_dirs(self, base, dirlist):
for dir_ in dirlist:
assert os.path.isdir(os.path.join(base, dir_))
@ -82,8 +67,7 @@ class TestManagedColls(TempDirTests, BaseTestClass):
colls = os.path.join(self.root_dir, COLLECTIONS)
os.mkdir(colls)
pywb.manager.autoindex.keep_running = False
wayback(['-a', '-p', '0'])
wayback(['-a', '-p', '0', '--auto-interval', '0'])
def test_create_first_coll(self):
""" Test first collection creation, with all required dirs
@ -472,8 +456,6 @@ class TestManagedColls(TempDirTests, BaseTestClass):
archive_sub_dir = os.path.join(archive_dir, 'sub')
os.makedirs(archive_sub_dir)
pywb.manager.autoindex.keep_running = True
def do_copy():
try:
time.sleep(1.0)
@ -481,16 +463,12 @@ class TestManagedColls(TempDirTests, BaseTestClass):
shutil.copy(self._get_sample_warc('example-extra.warc'), archive_sub_dir)
time.sleep(1.0)
finally:
pywb.manager.autoindex.keep_running = False
indexer.interval = 0
indexer = AutoIndexer(interval=0.25)
indexer.start()
#thread = threading.Thread(target=do_copy)
#thread.daemon = True
#thread.start()
ge = gevent.spawn(do_copy)
main(['autoindex', 'auto', '--interval', '0.25'])
#thread.join()
ge.join()
index_file = os.path.join(auto_dir, INDEX_DIR, AUTOINDEX_FILE)
@ -505,7 +483,8 @@ class TestManagedColls(TempDirTests, BaseTestClass):
mtime = os.path.getmtime(index_file)
# Update
pywb.manager.autoindex.keep_running = True
indexer.interval = 0.25
indexer.start()
os.remove(index_file)
@ -514,7 +493,7 @@ class TestManagedColls(TempDirTests, BaseTestClass):
#thread.start()
ge = gevent.spawn(do_copy)
main(['autoindex', 'auto', '--interval', '0.25'])
#wayback(['-p', '0', '-a', '--auto-interval', '0.25'])
#thread.join()
ge.join()

View File

@ -0,0 +1,69 @@
from .base_config_test import BaseConfigTest, fmod, CollsDirMixin
from pywb.manager.manager import main as manager
from pywb.manager.autoindex import AutoIndexer
import os
import time
# ============================================================================
class TestRecordReplay(CollsDirMixin, BaseConfigTest):
@classmethod
def setup_class(cls):
super(TestRecordReplay, cls).setup_class('config_test_record.yaml')
cls.indexer = AutoIndexer(interval=0.25)
cls.indexer.start()
@classmethod
def teardown_class(cls):
cls.indexer.stop()
super(TestRecordReplay, cls).teardown_class()
def test_init_coll(self):
manager(['init', 'test'])
assert os.path.isdir(os.path.join(self.root_dir, '_test_colls', 'test', 'archive'))
manager(['init', 'test2'])
assert os.path.isdir(os.path.join(self.root_dir, '_test_colls', 'test2', 'archive'))
def test_record_1(self, fmod):
fmod_slash = fmod + '/' if fmod else ''
res = self.get('/test/record/mp_/http://httpbin.org/get?A=B', fmod_slash)
assert '"A": "B"' in res.text
def test_replay_1(self, fmod):
time.sleep(0.5)
fmod_slash = fmod + '/' if fmod else ''
res = self.get('/test/mp_/http://httpbin.org/get?A=B', fmod_slash)
assert '"A": "B"' in res.text
def test_record_2(self, fmod):
fmod_slash = fmod + '/' if fmod else ''
res = self.get('/test2/record/{0}http://httpbin.org/get?C=D', fmod_slash)
assert '"C": "D"' in res.text
def test_replay_2(self, fmod):
time.sleep(0.5)
fmod_slash = fmod + '/' if fmod else ''
res = self.get('/test2/{0}http://httpbin.org/get?C=D', fmod_slash)
assert '"C": "D"' in res.text
def test_record_again_1(self, fmod):
fmod_slash = fmod + '/' if fmod else ''
res = self.get('/test/record/{0}http://httpbin.org/get?C=D', fmod_slash)
assert '"C": "D"' in res.text
def test_replay_again_1(self, fmod):
time.sleep(0.5)
fmod_slash = fmod + '/' if fmod else ''
res = self.get('/test/{0}http://httpbin.org/get?C=D', fmod_slash)
assert '"C": "D"' in res.text
# two warcs, for framed and non-framed capture
assert len(os.listdir(os.path.join(self.root_dir, '_test_colls', 'test', 'archive'))) == 2
assert len(os.listdir(os.path.join(self.root_dir, '_test_colls', 'test', 'indexes'))) == 1