From b417b4783525575021ff61bdffb9369303352a6f Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 14 Mar 2015 14:56:15 -0700 Subject: [PATCH] collections manager: support for merge when adding warc, explicit --index-warcs option to index and merge instead of reindexing whole dir, #74 additional testing for recursive indexing, index merge timeutils: add timestamp20_now() function --- pywb/manager/manager.py | 70 +++++++++++++++++- pywb/utils/timeutils.py | 16 ++++ pywb/webapp/pywb_init.py | 2 +- tests/test_auto_colls.py | 153 +++++++++++++++++++++++++++++++++++++-- 4 files changed, 233 insertions(+), 8 deletions(-) diff --git a/pywb/manager/manager.py b/pywb/manager/manager.py index e746d277..12bde6c3 100644 --- a/pywb/manager/manager.py +++ b/pywb/manager/manager.py @@ -4,8 +4,11 @@ import sys import logging from pywb.utils.loaders import load_yaml_config +from pywb.utils.timeutils import timestamp20_now from pywb.warc.cdxindexer import main as cdxindexer_main + from argparse import ArgumentParser, RawTextHelpFormatter +import heapq #============================================================================= @@ -49,23 +52,83 @@ directory structure expected by pywb def add_warcs(self, warcs): if not os.path.isdir(self.warc_dir): - raise Exception('Directory ' + warcdir + ' does not exist') + if not os.path.isdir(self.coll_dir): + raise IOError('Collection {0} does not exist'. + format(self.coll_name)) + else: + raise IOError('Directory {0} does not exist'. 
+ format(self.warc_dir)) if not warcs: logging.info('No WARCs specified') return + full_paths = [] for filename in warcs: shutil.copy2(filename, self.warc_dir) + full_paths.append(os.path.join(self.warc_dir, filename)) logging.info('Copied ' + filename + ' to ' + self.warc_dir) - self.reindex() + self._index_merge_warcs(full_paths) def reindex(self): cdx_file = os.path.join(self.cdx_dir, 'index.cdx') logging.info('Indexing ' + self.warc_dir + ' to ' + cdx_file) cdxindexer_main(['-p', '-s', '-r', cdx_file, self.warc_dir]) + def index_merge(self, filelist): + wrongdir = 'Skipping {0}, must be in {1} archive directory' + notfound = 'Skipping {0}, file not found' + + filtered_warcs = [] + + # Check that warcs are actually in warcs dir + abs_warc_dir = os.path.abspath(self.warc_dir) + + for f in filelist: + abs_filepath = os.path.abspath(f) + prefix = os.path.commonprefix([abs_warc_dir, abs_filepath]) + + if prefix != abs_warc_dir: + raise IOError(wrongdir.format(abs_filepath, abs_warc_dir)) + elif not os.path.isfile(abs_filepath): + raise IOError(notfound.format(f)) + else: + filtered_warcs.append(abs_filepath.split(prefix)[1]) + + self._index_merge_warcs(filtered_warcs) + + def _index_merge_warcs(self, new_warcs): + if not new_warcs: + return + + cdx_file = os.path.join(self.cdx_dir, 'index.cdx') + + # no existing file, just reindex all + if not os.path.isfile(cdx_file): + return self.reindex() + + temp_file = cdx_file + '.tmp.' 
+ timestamp20_now() + args = ['-p', '-s', '-r', temp_file] + args.extend(new_warcs) + cdxindexer_main(args) + + merged_file = temp_file + '.merged' + + last_line = None + + with open(cdx_file) as orig_index: + with open(temp_file) as new_index: + with open(merged_file, 'w+b') as merged: + for line in heapq.merge(orig_index, new_index): + if last_line != line: + merged.write(line) + last_line = line + + os.rename(merged_file, cdx_file) + os.remove(temp_file) + + def main(args=None): description = """ Create manage file based web archive collections @@ -93,6 +156,7 @@ Some examples: group.add_argument('--init', action='store_true') group.add_argument('--addwarc', action='store_true') group.add_argument('--reindex', action='store_true') + group.add_argument('--index-warcs', action='store_true') parser.add_argument('name') parser.add_argument('files', nargs='*') @@ -104,6 +168,8 @@ Some examples: m.add_collection() elif r.addwarc: m.add_warcs(r.files) + elif r.index_warcs: + m.index_merge(r.files) elif r.reindex: m.reindex() diff --git a/pywb/utils/timeutils.py b/pywb/utils/timeutils.py index dcca13d6..30d1007f 100644 --- a/pywb/utils/timeutils.py +++ b/pywb/utils/timeutils.py @@ -80,6 +80,22 @@ def timestamp_now(): return datetime_to_timestamp(datetime.datetime.utcnow()) +def timestamp20_now(): + """ + Create 20-digit timestamp, useful for timestamping temp files + + >>> n = timestamp20_now() + >>> timestamp20_now() >= n + True + + >>> len(n) + 20 + + """ + now = datetime.datetime.utcnow() + return now.strftime('%Y%m%d%H%M%S%f') + + def iso_date_to_timestamp(string): """ >>> iso_date_to_timestamp('2013-12-26T10:11:12Z') diff --git a/pywb/webapp/pywb_init.py b/pywb/webapp/pywb_init.py index d14600a7..eef703d0 100644 --- a/pywb/webapp/pywb_init.py +++ b/pywb/webapp/pywb_init.py @@ -171,7 +171,7 @@ class DirectoryCollsLoader(object): def load_dir(self, root_dir, name): config_file = os.path.join(root_dir, 'config.yaml') if os.path.isfile(config_file): - coll =
load_yaml_file(config_file) + coll = load_yaml_config(config_file) else: coll = {} diff --git a/tests/test_auto_colls.py b/tests/test_auto_colls.py index 0b892d1b..01fb78ec 100644 --- a/tests/test_auto_colls.py +++ b/tests/test_auto_colls.py @@ -25,6 +25,9 @@ def setup_module(): orig_cwd = os.getcwd() os.chdir(root_dir) + # use actually set dir + root_dir = os.getcwd() + def teardown_module(): global root_dir shutil.rmtree(root_dir) @@ -48,6 +51,8 @@ class TestManagedColls(object): assert os.path.isdir(os.path.join(base, dir_)) def test_create_first_coll(self): + """ Test first collection creation, with all required dirs + """ main(['--init', 'test']) colls = os.path.join(self.root_dir, 'collections') @@ -59,6 +64,8 @@ class TestManagedColls(object): self._check_dirs(test, ['cdx', 'warcs', 'static', 'templates']) def test_add_warcs(self): + """ Test adding warc to new coll, check replay + """ warc1 = os.path.join(get_test_dir(), 'warcs', 'example.warc.gz') main(['--addwarc', 'test', warc1]) @@ -67,7 +74,22 @@ class TestManagedColls(object): resp = self.testapp.get('/test/20140103030321/http://example.com?example=1') assert resp.status_int == 200 + def test_another_coll(self): + """ Test adding warc to a new coll, check replay + """ + warc1 = os.path.join(get_test_dir(), 'warcs', 'example.warc.gz') + + main(['--init', 'foo']) + + main(['--addwarc', 'foo', warc1]) + + self._create_app() + resp = self.testapp.get('/foo/20140103030321/http://example.com?example=1') + assert resp.status_int == 200 + def test_add_more_warcs(self): + """ Test adding additional warcs, check replay of added content + """ warc1 = os.path.join(get_test_dir(), 'warcs', 'iana.warc.gz') warc2 = os.path.join(get_test_dir(), 'warcs', 'example-extra.warc') @@ -80,15 +102,80 @@ class TestManagedColls(object): with raises(IOError): main(['--addwarc', 'test', 'non-existent-file.warc.gz']) + # check adding no warc -- no op main(['--addwarc', 'test']) + # check new cdx + self._create_app() + resp = 
self.testapp.get('/test/20140126200624/http://www.iana.org/') + assert resp.status_int == 200 + + def test_add_custom_nested_warcs(self): + """ Test recursive indexing of custom created WARC hierarchy, + warcs/A/..., warcs/B/sub/... + Ensure CDX is relative to root archive dir, test replay + """ + + main(['--init', 'nested']) + + nested_root = os.path.join(self.root_dir, 'collections', 'nested', 'warcs') + nested_a = os.path.join(nested_root, 'A') + nested_b = os.path.join(nested_root, 'B', 'sub') + + os.makedirs(nested_a) + os.makedirs(nested_b) + + warc1 = os.path.join(get_test_dir(), 'warcs', 'iana.warc.gz') + warc2 = os.path.join(get_test_dir(), 'warcs', 'example.warc.gz') + + shutil.copy2(warc1, nested_a) + shutil.copy2(warc2, nested_b) + + main(['--index-warcs', + 'nested', + os.path.join(nested_a, 'iana.warc.gz'), + os.path.join(nested_b, 'example.warc.gz') + ]) + + nested_cdx = os.path.join(self.root_dir, 'collections', 'nested', 'cdx', 'index.cdx') + with open(nested_cdx) as fh: + nested_cdx_index = fh.read() + + assert '- 1043 333 B/sub/example.warc.gz' in nested_cdx_index + assert '- 2258 334 A/iana.warc.gz' in nested_cdx_index + + self._create_app() + resp = self.testapp.get('/nested/20140126200624/http://www.iana.org/') + assert resp.status_int == 200 + + resp = self.testapp.get('/nested/20140103030321/http://example.com?example=1') + assert resp.status_int == 200 + + def test_merge_vs_reindex_equality(self): + """ Test full reindex vs merged update when adding warcs + to ensure equality of indexes + """ + # ensure merged index is same as full reindex + coll_dir = os.path.join(self.root_dir, 'collections', 'test', 'cdx') + orig = os.path.join(coll_dir, 'index.cdx') + bak = os.path.join(coll_dir, 'index.bak') + + shutil.copy(orig, bak) + main(['--reindex', 'test']) - self._create_app() - resp = self.testapp.get('/test/20140103030321/http://example.com?example=1') - assert resp.status_int == 200 + with open(orig) as orig_fh: + merged_cdx = orig_fh.read() 
+ + with open(bak) as bak_fh: + reindex_cdx = bak_fh.read() + + assert len(reindex_cdx.splitlines()) == len(merged_cdx.splitlines()) + assert merged_cdx == reindex_cdx def test_add_static(self): + """ Test adding static file to collection, check access + """ a_static = os.path.join(self.root_dir, 'collections', 'test', 'static', 'abc.js') with open(a_static, 'w+b') as fh: @@ -100,7 +187,9 @@ class TestManagedColls(object): assert resp.content_type == 'application/javascript' assert '/* Some JS File */' in resp.body - def test_custom_search(self): + def test_custom_template_search(self): + """ Test manually added custom search template search.html + """ a_static = os.path.join(self.root_dir, 'collections', 'test', 'templates', 'search.html') with open(a_static, 'w+b') as fh: @@ -112,7 +201,28 @@ class TestManagedColls(object): assert resp.content_type == 'text/html' assert 'pywb custom search page' in resp.body + def test_custom_config(self): + """ Test custom created config.yaml which overrides auto settings + Template relative to root dir, not collection-specific so far + """ + config_path = os.path.join(self.root_dir, 'collections', 'test', 'config.yaml') + with open(config_path, 'w+b') as fh: + fh.write('search_html: ./custom_search.html\n') + + custom_search = os.path.join(self.root_dir, 'custom_search.html') + with open(custom_search, 'w+b') as fh: + fh.write('config.yaml overriden search page') + + self._create_app() + resp = self.testapp.get('/test/') + assert resp.status_int == 200 + assert resp.content_type == 'text/html' + assert 'config.yaml overriden search page' in resp.body + + def test_no_templates(self): + """ Test removing templates dir, using default template again + """ shutil.rmtree(os.path.join(self.root_dir, 'collections', 'test', 'templates')) self._create_app() @@ -122,11 +232,44 @@ class TestManagedColls(object): assert resp.content_type == 'text/html' assert 'pywb custom search page' not in resp.body + def test_err_no_such_coll(self): + 
""" Test error adding warc to non-existent collection + """ + warc1 = os.path.join(get_test_dir(), 'warcs', 'example.warc.gz') + + with raises(IOError): + main(['--addwarc', 'bar', warc1]) + + def test_err_wrong_warcs(self): + warc1 = os.path.join(get_test_dir(), 'warcs', 'example.warc.gz') + invalid_warc = os.path.join(self.root_dir, 'collections', 'test', 'warcs', 'invalid.warc.gz') + + # Empty + main(['--index-warcs', 'test']) + + # Wrong paths not in collection + with raises(IOError): + main(['--index-warcs', 'test', warc1]) + + # Non-existent + with raises(IOError): + main(['--index-warcs', 'test', invalid_warc]) + def test_err_missing_dirs(self): + """ Test various errors with missing warcs dir, + missing cdx dir, non dir cdx file, and missing collections root + """ colls = os.path.join(self.root_dir, 'collections') + # No WARCS + warcs_path = os.path.join(colls, 'foo', 'warcs') + shutil.rmtree(warcs_path) + + with raises(IOError): + main(['--addwarc', 'foo', 'somewarc']) + # No CDX - cdx_path = os.path.join(colls, 'test', 'cdx') + cdx_path = os.path.join(colls, 'foo', 'cdx') shutil.rmtree(cdx_path) with raises(Exception):