mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
collections manager: support merging into the existing index when adding a WARC; add explicit --index-warcs
option to index and merge instead of reindexing the whole dir, #74; additional testing for recursive indexing and index merge. timeutils: add timestamp20_now() function.
This commit is contained in:
parent
759d151551
commit
b417b47835
@ -4,8 +4,11 @@ import sys
|
||||
import logging
|
||||
|
||||
from pywb.utils.loaders import load_yaml_config
|
||||
from pywb.utils.timeutils import timestamp20_now
|
||||
from pywb.warc.cdxindexer import main as cdxindexer_main
|
||||
|
||||
from argparse import ArgumentParser, RawTextHelpFormatter
|
||||
import heapq
|
||||
|
||||
|
||||
#=============================================================================
|
||||
@ -49,23 +52,83 @@ directory structure expected by pywb
|
||||
|
||||
def add_warcs(self, warcs):
    """Copy WARC files into the collection and merge them into the index.

    :param warcs: paths of the WARC files to add. An empty (or ``None``)
        value is logged and treated as a no-op.
    :raises IOError: if the collection directory, or its archive
        directory, does not exist. Errors raised by ``shutil.copy2``
        while copying are not caught here and propagate to the caller.
    """
    if not os.path.isdir(self.warc_dir):
        # Tell a missing collection apart from a collection whose
        # archive directory is missing, so the message is actionable.
        if not os.path.isdir(self.coll_dir):
            raise IOError('Collection {0} does not exist'.
                          format(self.coll_name))
        else:
            raise IOError('Directory {0} does not exist'.
                          format(self.warc_dir))

    if not warcs:
        logging.info('No WARCs specified')
        return

    full_paths = []
    for filename in warcs:
        shutil.copy2(filename, self.warc_dir)
        # copy2 stores the file directly under warc_dir using the source's
        # base name. Joining the unmodified source path would return the
        # source itself whenever it is absolute, so the index would
        # point outside the collection.
        copied = os.path.join(self.warc_dir, os.path.basename(filename))
        full_paths.append(copied)
        logging.info('Copied ' + filename + ' to ' + self.warc_dir)

    # Only the newly copied files are indexed and merged; a full reindex
    # of the archive directory is not needed here.
    self._index_merge_warcs(full_paths)
|
||||
|
||||
def reindex(self):
    """Rebuild ``index.cdx`` in the cdx directory from the whole archive
    directory by invoking the cdx indexer entry point.
    """
    target = os.path.join(self.cdx_dir, 'index.cdx')
    logging.info(''.join(['Indexing ', self.warc_dir, ' to ', target]))

    indexer_args = ['-p', '-s', '-r']
    indexer_args.append(target)
    indexer_args.append(self.warc_dir)
    cdxindexer_main(indexer_args)
|
||||
|
||||
def index_merge(self, filelist):
    """Validate WARC paths and merge their index into the collection.

    Every path must point to an existing file located inside the
    collection's archive directory (at any depth).

    :param filelist: paths of WARC files already placed in the archive
        directory.
    :raises IOError: if a path lies outside the archive directory or
        does not refer to an existing file.
    """
    wrongdir = 'Skipping {0}, must be in {1} archive directory'
    notfound = 'Skipping {0}, file not found'

    filtered_warcs = []

    # Check that warcs are actually in warcs dir.
    abs_warc_dir = os.path.abspath(self.warc_dir)

    # Compare against the directory *with* a trailing separator:
    # os.path.commonprefix works character by character, so a sibling
    # such as "warcs-other" would otherwise be accepted as being inside
    # "warcs". join(..., '') adds the separator only when it is missing.
    dir_prefix = os.path.join(abs_warc_dir, '')

    for f in filelist:
        abs_filepath = os.path.abspath(f)

        if not abs_filepath.startswith(dir_prefix):
            raise IOError(wrongdir.format(abs_filepath, abs_warc_dir))

        if not os.path.isfile(abs_filepath):
            raise IOError(notfound.format(f))

        # Pass the absolute path on. Splitting the path on the prefix
        # produced a string with a leading separator (which reads as a
        # path at the filesystem root) and returned the wrong segment
        # when the prefix text occurred again further down the path.
        # NOTE(review): confirm the indexer still records these paths
        # relative to the archive directory when merging.
        filtered_warcs.append(abs_filepath)

    self._index_merge_warcs(filtered_warcs)
|
||||
|
||||
def _index_merge_warcs(self, new_warcs):
|
||||
if not new_warcs:
|
||||
return
|
||||
|
||||
cdx_file = os.path.join(self.cdx_dir, 'index.cdx')
|
||||
|
||||
# no existing file, just reindex all
|
||||
if not os.path.isfile(cdx_file):
|
||||
return self.reindex()
|
||||
|
||||
temp_file = cdx_file + '.tmp.' + timestamp20_now()
|
||||
args = ['-p', '-s', '-r', temp_file]
|
||||
args.extend(new_warcs)
|
||||
cdxindexer_main(args)
|
||||
|
||||
merged_file = temp_file + '.merged'
|
||||
|
||||
last_line = None
|
||||
|
||||
with open(cdx_file) as orig_index:
|
||||
with open(temp_file) as new_index:
|
||||
with open(merged_file, 'w+b') as merged:
|
||||
for line in heapq.merge(orig_index, new_index):
|
||||
if last_line != line:
|
||||
merged.write(line)
|
||||
last_line = line
|
||||
|
||||
os.rename(merged_file, cdx_file)
|
||||
os.remove(temp_file)
|
||||
|
||||
|
||||
def main(args=None):
|
||||
description = """
|
||||
Create manage file based web archive collections
|
||||
@ -93,6 +156,7 @@ Some examples:
|
||||
group.add_argument('--init', action='store_true')
|
||||
group.add_argument('--addwarc', action='store_true')
|
||||
group.add_argument('--reindex', action='store_true')
|
||||
group.add_argument('--index-warcs', action='store_true')
|
||||
|
||||
parser.add_argument('name')
|
||||
parser.add_argument('files', nargs='*')
|
||||
@ -104,6 +168,8 @@ Some examples:
|
||||
m.add_collection()
|
||||
elif r.addwarc:
|
||||
m.add_warcs(r.files)
|
||||
elif r.index_warcs:
|
||||
m.index_merge(r.files)
|
||||
elif r.reindex:
|
||||
m.reindex()
|
||||
|
||||
|
@ -80,6 +80,22 @@ def timestamp_now():
|
||||
return datetime_to_timestamp(datetime.datetime.utcnow())
|
||||
|
||||
|
||||
def timestamp20_now():
    """
    Return the current UTC time as a 20-digit timestamp string
    (year through microseconds), useful for timestamping temp files

    >>> n = timestamp20_now()
    >>> timestamp20_now() >= n
    True

    >>> len(n)
    20

    """
    return datetime.datetime.utcnow().strftime('%Y%m%d%H%M%S%f')
|
||||
|
||||
|
||||
def iso_date_to_timestamp(string):
|
||||
"""
|
||||
>>> iso_date_to_timestamp('2013-12-26T10:11:12Z')
|
||||
|
@ -171,7 +171,7 @@ class DirectoryCollsLoader(object):
|
||||
def load_dir(self, root_dir, name):
|
||||
config_file = os.path.join(root_dir, 'config.yaml')
|
||||
if os.path.isfile(config_file):
|
||||
coll = load_yaml_file(config_file)
|
||||
coll = load_yaml_config(config_file)
|
||||
else:
|
||||
coll = {}
|
||||
|
||||
|
@ -25,6 +25,9 @@ def setup_module():
|
||||
orig_cwd = os.getcwd()
|
||||
os.chdir(root_dir)
|
||||
|
||||
# use actually set dir
|
||||
root_dir = os.getcwd()
|
||||
|
||||
def teardown_module():
|
||||
global root_dir
|
||||
shutil.rmtree(root_dir)
|
||||
@ -48,6 +51,8 @@ class TestManagedColls(object):
|
||||
assert os.path.isdir(os.path.join(base, dir_))
|
||||
|
||||
def test_create_first_coll(self):
|
||||
""" Test first collection creation, with all required dirs
|
||||
"""
|
||||
main(['--init', 'test'])
|
||||
|
||||
colls = os.path.join(self.root_dir, 'collections')
|
||||
@ -59,6 +64,8 @@ class TestManagedColls(object):
|
||||
self._check_dirs(test, ['cdx', 'warcs', 'static', 'templates'])
|
||||
|
||||
def test_add_warcs(self):
|
||||
""" Test adding warc to new coll, check replay
|
||||
"""
|
||||
warc1 = os.path.join(get_test_dir(), 'warcs', 'example.warc.gz')
|
||||
|
||||
main(['--addwarc', 'test', warc1])
|
||||
@ -67,7 +74,22 @@ class TestManagedColls(object):
|
||||
resp = self.testapp.get('/test/20140103030321/http://example.com?example=1')
|
||||
assert resp.status_int == 200
|
||||
|
||||
def test_another_coll(self):
    """ Create a second collection 'foo', add the example warc to it
    and check that the capture replays from that collection
    """
    example_warc = os.path.join(get_test_dir(), 'warcs', 'example.warc.gz')

    for cli_args in (['--init', 'foo'],
                     ['--addwarc', 'foo', example_warc]):
        main(cli_args)

    self._create_app()
    replay_url = '/foo/20140103030321/http://example.com?example=1'
    response = self.testapp.get(replay_url)
    assert response.status_int == 200
|
||||
|
||||
def test_add_more_warcs(self):
|
||||
""" Test adding additional warcs, check replay of added content
|
||||
"""
|
||||
warc1 = os.path.join(get_test_dir(), 'warcs', 'iana.warc.gz')
|
||||
warc2 = os.path.join(get_test_dir(), 'warcs', 'example-extra.warc')
|
||||
|
||||
@ -80,15 +102,80 @@ class TestManagedColls(object):
|
||||
with raises(IOError):
|
||||
main(['--addwarc', 'test', 'non-existent-file.warc.gz'])
|
||||
|
||||
# check adding no warc -- no op
|
||||
main(['--addwarc', 'test'])
|
||||
|
||||
# check new cdx
|
||||
self._create_app()
|
||||
resp = self.testapp.get('/test/20140126200624/http://www.iana.org/')
|
||||
assert resp.status_int == 200
|
||||
|
||||
def test_add_custom_nested_warcs(self):
    """ Index warcs placed by hand into a nested layout,
    warcs/A/... and warcs/B/sub/..., via --index-warcs
    Check that CDX filenames are relative to the archive root
    and that both captures replay
    """
    main(['--init', 'nested'])

    coll_root = os.path.join(self.root_dir, 'collections', 'nested')
    archive_root = os.path.join(coll_root, 'warcs')
    dir_a = os.path.join(archive_root, 'A')
    dir_b = os.path.join(archive_root, 'B', 'sub')

    for new_dir in (dir_a, dir_b):
        os.makedirs(new_dir)

    iana_src = os.path.join(get_test_dir(), 'warcs', 'iana.warc.gz')
    example_src = os.path.join(get_test_dir(), 'warcs', 'example.warc.gz')

    for src, dest in ((iana_src, dir_a), (example_src, dir_b)):
        shutil.copy2(src, dest)

    cli_args = ['--index-warcs', 'nested']
    cli_args.append(os.path.join(dir_a, 'iana.warc.gz'))
    cli_args.append(os.path.join(dir_b, 'example.warc.gz'))
    main(cli_args)

    with open(os.path.join(coll_root, 'cdx', 'index.cdx')) as index_fh:
        index_text = index_fh.read()

    for expected in ('- 1043 333 B/sub/example.warc.gz',
                     '- 2258 334 A/iana.warc.gz'):
        assert expected in index_text

    self._create_app()
    for replay_url in ('/nested/20140126200624/http://www.iana.org/',
                       '/nested/20140103030321/http://example.com?example=1'):
        response = self.testapp.get(replay_url)
        assert response.status_int == 200
|
||||
|
||||
def test_merge_vs_reindex_equality(self):
    """ Compare the index built up by merging on each added warc
    with the index produced by a full reindex; they must be identical
    """
    cdx_dir = os.path.join(self.root_dir, 'collections', 'test', 'cdx')
    live_index = os.path.join(cdx_dir, 'index.cdx')
    saved_copy = os.path.join(cdx_dir, 'index.bak')

    # keep the merge-built index before the full reindex overwrites it
    shutil.copy(live_index, saved_copy)

    main(['--reindex', 'test'])

    self._create_app()
    response = self.testapp.get('/test/20140103030321/http://example.com?example=1')
    assert response.status_int == 200

    with open(live_index) as live_fh:
        after_reindex = live_fh.read()

    with open(saved_copy) as saved_fh:
        before_reindex = saved_fh.read()

    assert len(before_reindex.splitlines()) == len(after_reindex.splitlines())
    assert after_reindex == before_reindex
|
||||
|
||||
def test_add_static(self):
|
||||
""" Test adding static file to collection, check access
|
||||
"""
|
||||
a_static = os.path.join(self.root_dir, 'collections', 'test', 'static', 'abc.js')
|
||||
|
||||
with open(a_static, 'w+b') as fh:
|
||||
@ -100,7 +187,9 @@ class TestManagedColls(object):
|
||||
assert resp.content_type == 'application/javascript'
|
||||
assert '/* Some JS File */' in resp.body
|
||||
|
||||
def test_custom_search(self):
|
||||
def test_custom_template_search(self):
|
||||
""" Test manually added custom search template search.html
|
||||
"""
|
||||
a_static = os.path.join(self.root_dir, 'collections', 'test', 'templates', 'search.html')
|
||||
|
||||
with open(a_static, 'w+b') as fh:
|
||||
@ -112,7 +201,28 @@ class TestManagedColls(object):
|
||||
assert resp.content_type == 'text/html'
|
||||
assert 'pywb custom search page' in resp.body
|
||||
|
||||
def test_custom_config(self):
    """ Write a config.yaml into the collection that overrides the
    automatic settings and check the overriding search page is served
    The template path is relative to the root dir, not the collection
    """
    coll_root = os.path.join(self.root_dir, 'collections', 'test')

    with open(os.path.join(coll_root, 'config.yaml'), 'w+b') as config_fh:
        config_fh.write('search_html: ./custom_search.html\n')

    with open(os.path.join(self.root_dir, 'custom_search.html'), 'w+b') as page_fh:
        page_fh.write('config.yaml overriden search page')

    self._create_app()
    response = self.testapp.get('/test/')
    assert response.status_int == 200
    assert response.content_type == 'text/html'
    assert 'config.yaml overriden search page' in response.body
|
||||
|
||||
|
||||
def test_no_templates(self):
|
||||
""" Test removing templates dir, using default template again
|
||||
"""
|
||||
shutil.rmtree(os.path.join(self.root_dir, 'collections', 'test', 'templates'))
|
||||
|
||||
self._create_app()
|
||||
@ -122,11 +232,44 @@ class TestManagedColls(object):
|
||||
assert resp.content_type == 'text/html'
|
||||
assert 'pywb custom search page' not in resp.body
|
||||
|
||||
def test_err_no_such_coll(self):
    """ Adding a warc to a non-existent collection must raise IOError
    """
    example_warc = os.path.join(get_test_dir(), 'warcs', 'example.warc.gz')
    cli_args = ['--addwarc', 'bar', example_warc]

    with raises(IOError):
        main(cli_args)
|
||||
|
||||
def test_err_wrong_warcs(self):
    """ --index-warcs with no files is accepted, while a warc outside
    the collection or a missing warc inside it raises IOError
    """
    outside_warc = os.path.join(get_test_dir(), 'warcs', 'example.warc.gz')
    missing_warc = os.path.join(self.root_dir, 'collections', 'test',
                                'warcs', 'invalid.warc.gz')

    # Empty
    main(['--index-warcs', 'test'])

    # First a path not in the collection, then a non-existent file
    for rejected in (outside_warc, missing_warc):
        with raises(IOError):
            main(['--index-warcs', 'test', rejected])
|
||||
|
||||
def test_err_missing_dirs(self):
|
||||
""" Test various errors with missing warcs dir,
|
||||
missing cdx dir, non dir cdx file, and missing collections root
|
||||
"""
|
||||
colls = os.path.join(self.root_dir, 'collections')
|
||||
|
||||
# No WARCS
|
||||
warcs_path = os.path.join(colls, 'foo', 'warcs')
|
||||
shutil.rmtree(warcs_path)
|
||||
|
||||
with raises(IOError):
|
||||
main(['--addwarc', 'foo', 'somewarc'])
|
||||
|
||||
# No CDX
|
||||
cdx_path = os.path.join(colls, 'test', 'cdx')
|
||||
cdx_path = os.path.join(colls, 'foo', 'cdx')
|
||||
shutil.rmtree(cdx_path)
|
||||
|
||||
with raises(Exception):
|
||||
|
Loading…
x
Reference in New Issue
Block a user