1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

collections manager: support for merge when adding warc, explicit --index-warcs

option to index and merge instead of reindexing whole dir, #74
additional testing for recursive indexing, index merge
timeutils: add timestamp20_now() function
This commit is contained in:
Ilya Kreymer 2015-03-14 14:56:15 -07:00
parent 759d151551
commit b417b47835
4 changed files with 233 additions and 8 deletions

View File

@ -4,8 +4,11 @@ import sys
import logging
from pywb.utils.loaders import load_yaml_config
from pywb.utils.timeutils import timestamp20_now
from pywb.warc.cdxindexer import main as cdxindexer_main
from argparse import ArgumentParser, RawTextHelpFormatter
import heapq
#=============================================================================
@ -49,23 +52,83 @@ directory structure expected by pywb
def add_warcs(self, warcs):
    """Copy WARC files into this collection and merge them into its index.

    :param warcs: paths of the WARC files to add. When empty, an info
        message is logged and nothing else happens.
    :raises IOError: if the collection directory or its WARC directory
        does not exist. Errors raised by ``shutil.copy2`` (for example
        a missing source file) propagate unchanged.
    """
    if not os.path.isdir(self.warc_dir):
        # Distinguish "no such collection" from "collection exists but
        # its WARC directory is missing" for a more useful message
        if not os.path.isdir(self.coll_dir):
            raise IOError('Collection {0} does not exist'.
                          format(self.coll_name))
        else:
            raise IOError('Directory {0} does not exist'.
                          format(self.warc_dir))

    if not warcs:
        logging.info('No WARCs specified')
        return

    full_paths = []
    for filename in warcs:
        shutil.copy2(filename, self.warc_dir)
        # copy2 stores the file under its base name; joining the full
        # source path would point back at the source (absolute path) or
        # at a non-existent nested path (relative path with directories)
        full_paths.append(os.path.join(self.warc_dir,
                                       os.path.basename(filename)))
        logging.info('Copied ' + filename + ' to ' + self.warc_dir)

    # Index only the newly copied files and merge into the existing index
    self._index_merge_warcs(full_paths)
def reindex(self):
    """Rebuild the collection index from the entire WARC directory.

    The output is written to ``index.cdx`` inside ``self.cdx_dir``.
    """
    index_path = os.path.join(self.cdx_dir, 'index.cdx')
    logging.info('Indexing ' + self.warc_dir + ' to ' + index_path)

    indexer_args = ['-p', '-s', '-r']
    indexer_args.append(index_path)
    indexer_args.append(self.warc_dir)
    cdxindexer_main(indexer_args)
def index_merge(self, filelist):
    """Index WARCs already stored under the collection's WARC directory
    and merge the result into the collection index.

    :param filelist: paths of WARC files located inside
        ``self.warc_dir`` (sub-directories are allowed).
    :raises IOError: if a path lies outside the WARC directory, or is
        not an existing file.
    """
    wrongdir = 'Skipping {0}, must be in {1} archive directory'
    notfound = 'Skipping {0}, file not found'

    abs_warc_dir = os.path.abspath(self.warc_dir)
    # Compare against the directory *with* a trailing separator: a plain
    # string/commonprefix comparison would accept a sibling directory
    # such as "warcs2" as being inside "warcs"
    warc_root = os.path.join(abs_warc_dir, '')

    filtered_warcs = []
    for f in filelist:
        abs_filepath = os.path.abspath(f)

        if not abs_filepath.startswith(warc_root):
            raise IOError(wrongdir.format(abs_filepath, abs_warc_dir))

        if not os.path.isfile(abs_filepath):
            raise IOError(notfound.format(f))

        # Pass on a path that can be opened as-is (same convention as
        # the full paths add_warcs() hands to _index_merge_warcs)
        filtered_warcs.append(abs_filepath)

    self._index_merge_warcs(filtered_warcs)
def _index_merge_warcs(self, new_warcs):
    """Index ``new_warcs`` and merge the result into ``index.cdx``.

    If the collection has no index yet, a full ``reindex()`` is done
    instead. Otherwise the new WARCs are indexed into a timestamped
    temp file, which is merge-sorted with the existing index; lines
    identical to the previously written line are dropped.

    :param new_warcs: paths of the WARC files to index; nothing is
        done when empty.
    """
    if not new_warcs:
        return

    cdx_file = os.path.join(self.cdx_dir, 'index.cdx')

    # no existing file, just reindex all
    if not os.path.isfile(cdx_file):
        return self.reindex()

    temp_file = cdx_file + '.tmp.' + timestamp20_now()
    merged_file = temp_file + '.merged'

    args = ['-p', '-s', '-r', temp_file]
    args.extend(new_warcs)

    try:
        cdxindexer_main(args)

        # Binary mode throughout: lines are copied byte-for-byte, with
        # no newline translation between what is read and written
        last_line = None
        with open(cdx_file, 'rb') as orig_index:
            with open(temp_file, 'rb') as new_index:
                with open(merged_file, 'wb') as merged:
                    # both inputs are sorted, so merge() yields a
                    # sorted stream where duplicates are adjacent
                    for line in heapq.merge(orig_index, new_index):
                        if last_line != line:
                            merged.write(line)
                            last_line = line

        os.rename(merged_file, cdx_file)
    finally:
        # Remove whatever intermediate files are left, also when
        # indexing or merging failed part-way
        for leftover in (temp_file, merged_file):
            if os.path.isfile(leftover):
                os.remove(leftover)
def main(args=None):
description = """
Create manage file based web archive collections
@ -93,6 +156,7 @@ Some examples:
group.add_argument('--init', action='store_true')
group.add_argument('--addwarc', action='store_true')
group.add_argument('--reindex', action='store_true')
group.add_argument('--index-warcs', action='store_true')
parser.add_argument('name')
parser.add_argument('files', nargs='*')
@ -104,6 +168,8 @@ Some examples:
m.add_collection()
elif r.addwarc:
m.add_warcs(r.files)
elif r.index_warcs:
m.index_merge(r.files)
elif r.reindex:
m.reindex()

View File

@ -80,6 +80,22 @@ def timestamp_now():
return datetime_to_timestamp(datetime.datetime.utcnow())
def timestamp20_now():
    """
    Return the current UTC time as a 20-digit timestamp string
    (year through microseconds), useful for timestamping temp files

    >>> n = timestamp20_now()
    >>> timestamp20_now() >= n
    True

    >>> len(n)
    20

    """
    # 14 digits of date/time followed by 6 digits of microseconds
    return datetime.datetime.utcnow().strftime('%Y%m%d%H%M%S%f')
def iso_date_to_timestamp(string):
"""
>>> iso_date_to_timestamp('2013-12-26T10:11:12Z')

View File

@ -171,7 +171,7 @@ class DirectoryCollsLoader(object):
def load_dir(self, root_dir, name):
config_file = os.path.join(root_dir, 'config.yaml')
if os.path.isfile(config_file):
coll = load_yaml_file(config_file)
coll = load_yaml_config(config_file)
else:
coll = {}

View File

@ -25,6 +25,9 @@ def setup_module():
orig_cwd = os.getcwd()
os.chdir(root_dir)
# use the actual working directory after chdir
root_dir = os.getcwd()
def teardown_module():
global root_dir
shutil.rmtree(root_dir)
@ -48,6 +51,8 @@ class TestManagedColls(object):
assert os.path.isdir(os.path.join(base, dir_))
def test_create_first_coll(self):
""" Test first collection creation, with all required dirs
"""
main(['--init', 'test'])
colls = os.path.join(self.root_dir, 'collections')
@ -59,6 +64,8 @@ class TestManagedColls(object):
self._check_dirs(test, ['cdx', 'warcs', 'static', 'templates'])
def test_add_warcs(self):
""" Test adding warc to new coll, check replay
"""
warc1 = os.path.join(get_test_dir(), 'warcs', 'example.warc.gz')
main(['--addwarc', 'test', warc1])
@ -67,7 +74,22 @@ class TestManagedColls(object):
resp = self.testapp.get('/test/20140103030321/http://example.com?example=1')
assert resp.status_int == 200
def test_another_coll(self):
    """ Create a second collection 'foo', add a warc to it
    and check that the capture replays
    """
    example_warc = os.path.join(get_test_dir(), 'warcs', 'example.warc.gz')

    for cli_args in (['--init', 'foo'],
                     ['--addwarc', 'foo', example_warc]):
        main(cli_args)

    self._create_app()
    replay = self.testapp.get('/foo/20140103030321/http://example.com?example=1')
    assert replay.status_int == 200
def test_add_more_warcs(self):
""" Test adding additional warcs, check replay of added content
"""
warc1 = os.path.join(get_test_dir(), 'warcs', 'iana.warc.gz')
warc2 = os.path.join(get_test_dir(), 'warcs', 'example-extra.warc')
@ -80,15 +102,80 @@ class TestManagedColls(object):
with raises(IOError):
main(['--addwarc', 'test', 'non-existent-file.warc.gz'])
# check adding no warc -- no op
main(['--addwarc', 'test'])
# check new cdx
self._create_app()
resp = self.testapp.get('/test/20140126200624/http://www.iana.org/')
assert resp.status_int == 200
def test_add_custom_nested_warcs(self):
    """ Index WARCs copied by hand into a nested hierarchy
    (warcs/A/..., warcs/B/sub/...) of a new collection.
    Check CDX filenames are relative to the archive root dir,
    then check replay of both captures
    """
    main(['--init', 'nested'])

    coll_root = os.path.join(self.root_dir, 'collections', 'nested')
    warcs_root = os.path.join(coll_root, 'warcs')
    dir_a = os.path.join(warcs_root, 'A')
    dir_b_sub = os.path.join(warcs_root, 'B', 'sub')
    os.makedirs(dir_a)
    os.makedirs(dir_b_sub)

    iana_warc = os.path.join(get_test_dir(), 'warcs', 'iana.warc.gz')
    example_warc = os.path.join(get_test_dir(), 'warcs', 'example.warc.gz')
    shutil.copy2(iana_warc, dir_a)
    shutil.copy2(example_warc, dir_b_sub)

    index_args = ['--index-warcs', 'nested']
    index_args.append(os.path.join(dir_a, 'iana.warc.gz'))
    index_args.append(os.path.join(dir_b_sub, 'example.warc.gz'))
    main(index_args)

    cdx_path = os.path.join(coll_root, 'cdx', 'index.cdx')
    with open(cdx_path) as fh:
        cdx_text = fh.read()

    # filenames must be recorded relative to the warcs root
    for entry in ('- 1043 333 B/sub/example.warc.gz',
                  '- 2258 334 A/iana.warc.gz'):
        assert entry in cdx_text

    self._create_app()
    for url in ('/nested/20140126200624/http://www.iana.org/',
                '/nested/20140103030321/http://example.com?example=1'):
        resp = self.testapp.get(url)
        assert resp.status_int == 200
def test_merge_vs_reindex_equality(self):
    """ Back up the current index of 'test', run a full reindex,
    and check the reindexed file is identical to the backup
    """
    cdx_dir = os.path.join(self.root_dir, 'collections', 'test', 'cdx')
    index_path = os.path.join(cdx_dir, 'index.cdx')
    backup_path = os.path.join(cdx_dir, 'index.bak')

    # keep the index as it was before the full reindex
    shutil.copy(index_path, backup_path)

    main(['--reindex', 'test'])

    self._create_app()
    replay = self.testapp.get('/test/20140103030321/http://example.com?example=1')
    assert replay.status_int == 200

    with open(index_path) as fh:
        after_reindex = fh.read()

    with open(backup_path) as fh:
        before_reindex = fh.read()

    assert len(before_reindex.splitlines()) == len(after_reindex.splitlines())
    assert after_reindex == before_reindex
def test_add_static(self):
""" Test adding static file to collection, check access
"""
a_static = os.path.join(self.root_dir, 'collections', 'test', 'static', 'abc.js')
with open(a_static, 'w+b') as fh:
@ -100,7 +187,9 @@ class TestManagedColls(object):
assert resp.content_type == 'application/javascript'
assert '/* Some JS File */' in resp.body
def test_custom_search(self):
def test_custom_template_search(self):
""" Test manually added custom search template search.html
"""
a_static = os.path.join(self.root_dir, 'collections', 'test', 'templates', 'search.html')
with open(a_static, 'w+b') as fh:
@ -112,7 +201,28 @@ class TestManagedColls(object):
assert resp.content_type == 'text/html'
assert 'pywb custom search page' in resp.body
def test_custom_config(self):
    """ Write a config.yaml into the 'test' collection that sets
    search_html, plus the template it points to (placed in the
    root dir), and check the search page serves that template
    """
    config_path = os.path.join(self.root_dir, 'collections', 'test', 'config.yaml')
    custom_search = os.path.join(self.root_dir, 'custom_search.html')

    fixtures = [
        (config_path, 'search_html: ./custom_search.html\n'),
        (custom_search, 'config.yaml overriden search page'),
    ]
    for path, content in fixtures:
        with open(path, 'w+b') as fh:
            fh.write(content)

    self._create_app()
    page = self.testapp.get('/test/')
    assert page.status_int == 200
    assert page.content_type == 'text/html'
    assert 'config.yaml overriden search page' in page.body
def test_no_templates(self):
""" Test removing templates dir, using default template again
"""
shutil.rmtree(os.path.join(self.root_dir, 'collections', 'test', 'templates'))
self._create_app()
@ -122,11 +232,44 @@ class TestManagedColls(object):
assert resp.content_type == 'text/html'
assert 'pywb custom search page' not in resp.body
def test_err_no_such_coll(self):
    """ Adding a warc to the non-existent collection 'bar'
    must raise IOError
    """
    example_warc = os.path.join(get_test_dir(), 'warcs', 'example.warc.gz')
    cli_args = ['--addwarc', 'bar', example_warc]

    with raises(IOError):
        main(cli_args)
def test_err_wrong_warcs(self):
    """ --index-warcs with no files is accepted; a warc outside the
    collection's warcs dir, or a missing file inside it, raises IOError
    """
    outside_warc = os.path.join(get_test_dir(), 'warcs', 'example.warc.gz')
    missing_warc = os.path.join(self.root_dir, 'collections', 'test', 'warcs', 'invalid.warc.gz')

    # Empty
    main(['--index-warcs', 'test'])

    # First path is not in the collection, second does not exist
    for bad_path in (outside_warc, missing_warc):
        with raises(IOError):
            main(['--index-warcs', 'test', bad_path])
def test_err_missing_dirs(self):
""" Test various errors with missing warcs dir,
missing cdx dir, non dir cdx file, and missing collections root
"""
colls = os.path.join(self.root_dir, 'collections')
# No WARCS
warcs_path = os.path.join(colls, 'foo', 'warcs')
shutil.rmtree(warcs_path)
with raises(IOError):
main(['--addwarc', 'foo', 'somewarc'])
# No CDX
cdx_path = os.path.join(colls, 'test', 'cdx')
cdx_path = os.path.join(colls, 'foo', 'cdx')
shutil.rmtree(cdx_path)
with raises(Exception):