mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
Rename --uncompress-wacz to --unpack-wacz
This commit is contained in:
parent
4a645eee02
commit
abe755b1de
@ -46,7 +46,7 @@ It can be used to:
|
|||||||
|
|
||||||
* Create a new collection -- ``wb-manager init <coll>``
|
* Create a new collection -- ``wb-manager init <coll>``
|
||||||
* Add WARCs to collection -- ``wb-manager add <coll> <warc>``
|
* Add WARCs to collection -- ``wb-manager add <coll> <warc>``
|
||||||
* Unpack WACZs to add their WARCs and indices to collection -- ``wb-manager add --uncompress-wacz <coll> <wacz>``
|
* Unpack WACZs to add their WARCs and indices to collection -- ``wb-manager add --unpack-wacz <coll> <wacz>``
|
||||||
* Add override templates
|
* Add override templates
|
||||||
* Add or remove metadata in a collection's ``metadata.yaml``
|
* Add or remove metadata in a collection's ``metadata.yaml``
|
||||||
* List all collections
|
* List all collections
|
||||||
|
@ -114,7 +114,7 @@ Using Existing Web Archive Collections
|
|||||||
Existing archives of WARC/ARC files can be used with pywb with a minimal amount of setup. By using ``wb-manager add``,
|
Existing archives of WARC/ARC files can be used with pywb with a minimal amount of setup. By using ``wb-manager add``,
|
||||||
WARC/ARC files will automatically be placed in the collection archive directory and indexed.
|
WARC/ARC files will automatically be placed in the collection archive directory and indexed.
|
||||||
|
|
||||||
In pywb 2.8.0 and later, preliminary support for WACZ files is also added with ``wb-manager add --uncompress-wacz``. This will unpack the provided WACZ file, adding its WARCs and indices to the collection.
|
In pywb 2.8.0 and later, preliminary support for WACZ files is also added with ``wb-manager add --unpack-wacz``. This will unpack the provided WACZ file, adding its WARCs and indices to the collection.
|
||||||
|
|
||||||
By default, ``wb-manager`` places new collections in the ``collections/<coll name>`` subdirectory of the current working directory. To specify a different root directory, use ``wb-manager -d <dir>``. Other options can be set in the config file.
|
By default, ``wb-manager`` places new collections in the ``collections/<coll name>`` subdirectory of the current working directory. To specify a different root directory, use ``wb-manager -d <dir>``. Other options can be set in the config file.
|
||||||
|
|
||||||
|
@ -121,7 +121,7 @@ directory structure expected by pywb
|
|||||||
'To create a new collection, run\n\n{1} init {0}')
|
'To create a new collection, run\n\n{1} init {0}')
|
||||||
raise IOError(msg.format(self.coll_name, sys.argv[0]))
|
raise IOError(msg.format(self.coll_name, sys.argv[0]))
|
||||||
|
|
||||||
def add_archives(self, archives, uncompress_wacz=False):
|
def add_archives(self, archives, unpack_wacz=False):
|
||||||
if not os.path.isdir(self.archive_dir):
|
if not os.path.isdir(self.archive_dir):
|
||||||
raise IOError('Directory {0} does not exist'.
|
raise IOError('Directory {0} does not exist'.
|
||||||
format(self.archive_dir))
|
format(self.archive_dir))
|
||||||
@ -134,11 +134,11 @@ directory structure expected by pywb
|
|||||||
if full_path:
|
if full_path:
|
||||||
warc_paths.append(full_path)
|
warc_paths.append(full_path)
|
||||||
elif self.WACZ_RX.match(archive):
|
elif self.WACZ_RX.match(archive):
|
||||||
if uncompress_wacz:
|
if unpack_wacz:
|
||||||
self._add_wacz_uncompressed(archive)
|
self._add_wacz_unpacked(archive)
|
||||||
else:
|
else:
|
||||||
raise NotImplementedError('Adding waczs without unpacking is not yet implemented. Use '
|
raise NotImplementedError('Adding waczs without unpacking is not yet implemented. Use '
|
||||||
'\'--uncompress-wacz\' flag to add the wacz\'s content.')
|
'\'--unpack-wacz\' flag to add the wacz\'s content.')
|
||||||
else:
|
else:
|
||||||
invalid_archives.append(archive)
|
invalid_archives.append(archive)
|
||||||
|
|
||||||
@ -160,7 +160,7 @@ directory structure expected by pywb
|
|||||||
logging.info('Copied ' + filename + ' to ' + self.archive_dir)
|
logging.info('Copied ' + filename + ' to ' + self.archive_dir)
|
||||||
return full_path
|
return full_path
|
||||||
|
|
||||||
def _add_wacz_uncompressed(self, wacz):
|
def _add_wacz_unpacked(self, wacz):
|
||||||
wacz = os.path.abspath(wacz)
|
wacz = os.path.abspath(wacz)
|
||||||
temp_dir = mkdtemp()
|
temp_dir = mkdtemp()
|
||||||
warc_regex = re.compile(r'.+\.warc(\.gz)?$')
|
warc_regex = re.compile(r'.+\.warc(\.gz)?$')
|
||||||
@ -494,11 +494,17 @@ Create manage file based web archive collections
|
|||||||
# Add Warcs or Waczs
|
# Add Warcs or Waczs
|
||||||
def do_add(r):
|
def do_add(r):
|
||||||
m = CollectionsManager(r.coll_name)
|
m = CollectionsManager(r.coll_name)
|
||||||
m.add_archives(r.files, r.uncompress_wacz)
|
m.add_archives(r.files, r.unpack_wacz)
|
||||||
|
|
||||||
add_archives_help = 'Copy ARCS/WARCS/WACZ to collection directory and reindex'
|
add_archives_help = 'Copy ARCs/WARCs to collection directory and reindex'
|
||||||
|
add_unpack_wacz_help = 'Copy WARCs and indices from WACZ to collection directory'
|
||||||
add_archives = subparsers.add_parser('add', help=add_archives_help)
|
add_archives = subparsers.add_parser('add', help=add_archives_help)
|
||||||
add_archives.add_argument('--uncompress-wacz', dest='uncompress_wacz', action='store_true')
|
add_archives.add_argument(
|
||||||
|
'--unpack-wacz',
|
||||||
|
dest='unpack_wacz',
|
||||||
|
action='store_true',
|
||||||
|
help=add_unpack_wacz_help
|
||||||
|
)
|
||||||
add_archives.add_argument('coll_name')
|
add_archives.add_argument('coll_name')
|
||||||
add_archives.add_argument('files', nargs='+')
|
add_archives.add_argument('files', nargs='+')
|
||||||
add_archives.set_defaults(func=do_add)
|
add_archives.set_defaults(func=do_add)
|
||||||
|
@ -11,19 +11,19 @@ TEST_COLLECTION_NAME = 'test-col'
|
|||||||
|
|
||||||
|
|
||||||
class TestManager:
|
class TestManager:
|
||||||
def test_add_valid_wacz_uncompressed(self, tmp_path):
|
def test_add_valid_wacz_unpacked(self, tmp_path):
|
||||||
"""Test if adding a valid wacz file to a collection succeeds"""
|
"""Test if adding a valid wacz file to a collection succeeds"""
|
||||||
manager = self.get_test_collections_manager(tmp_path)
|
manager = self.get_test_collections_manager(tmp_path)
|
||||||
manager._add_wacz_uncompressed(VALID_WACZ_PATH)
|
manager._add_wacz_unpacked(VALID_WACZ_PATH)
|
||||||
assert 'valid_example_1-0.warc' in os.listdir(manager.archive_dir)
|
assert 'valid_example_1-0.warc' in os.listdir(manager.archive_dir)
|
||||||
assert manager.DEF_INDEX_FILE in os.listdir(manager.indexes_dir)
|
assert manager.DEF_INDEX_FILE in os.listdir(manager.indexes_dir)
|
||||||
with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f:
|
with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f:
|
||||||
assert '"filename": "valid_example_1-0.warc"' in f.read()
|
assert '"filename": "valid_example_1-0.warc"' in f.read()
|
||||||
|
|
||||||
def test_add_invalid_wacz_uncompressed(self, tmp_path, caplog):
|
def test_add_invalid_wacz_unpacked(self, tmp_path, caplog):
|
||||||
"""Test if adding an invalid wacz file to a collection fails"""
|
"""Test if adding an invalid wacz file to a collection fails"""
|
||||||
manager = self.get_test_collections_manager(tmp_path)
|
manager = self.get_test_collections_manager(tmp_path)
|
||||||
manager._add_wacz_uncompressed(INVALID_WACZ_PATH)
|
manager._add_wacz_unpacked(INVALID_WACZ_PATH)
|
||||||
assert 'invalid_example_1-0.warc' not in os.listdir(manager.archive_dir)
|
assert 'invalid_example_1-0.warc' not in os.listdir(manager.archive_dir)
|
||||||
assert 'sample_archive/waczs/invalid_example_1.wacz does not contain any warc files.' in caplog.text
|
assert 'sample_archive/waczs/invalid_example_1.wacz does not contain any warc files.' in caplog.text
|
||||||
|
|
||||||
@ -32,12 +32,12 @@ class TestManager:
|
|||||||
with open(index_path, 'r') as f:
|
with open(index_path, 'r') as f:
|
||||||
assert '"filename": "invalid_example_1-0.warc"' not in f.read()
|
assert '"filename": "invalid_example_1-0.warc"' not in f.read()
|
||||||
|
|
||||||
def test_add_valid_archives_uncompressed_wacz(self, tmp_path):
|
def test_add_valid_archives_unpack_wacz(self, tmp_path):
|
||||||
manager = self.get_test_collections_manager(tmp_path)
|
manager = self.get_test_collections_manager(tmp_path)
|
||||||
archives = ['sample_archive/warcs/example.arc', 'sample_archive/warcs/example.arc.gz',
|
archives = ['sample_archive/warcs/example.arc', 'sample_archive/warcs/example.arc.gz',
|
||||||
'sample_archive/warcs/example.warc', 'sample_archive/warcs/example.warc.gz',
|
'sample_archive/warcs/example.warc', 'sample_archive/warcs/example.warc.gz',
|
||||||
'sample_archive/waczs/valid_example_1.wacz']
|
'sample_archive/waczs/valid_example_1.wacz']
|
||||||
manager.add_archives(archives, uncompress_wacz=True)
|
manager.add_archives(archives, unpack_wacz=True)
|
||||||
|
|
||||||
with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f:
|
with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f:
|
||||||
index_text = f.read()
|
index_text = f.read()
|
||||||
@ -51,19 +51,19 @@ class TestManager:
|
|||||||
assert archive in os.listdir(manager.archive_dir)
|
assert archive in os.listdir(manager.archive_dir)
|
||||||
assert archive in index_text
|
assert archive in index_text
|
||||||
|
|
||||||
def test_add_valid_archives_dont_uncompress_wacz(self, tmp_path):
|
def test_add_valid_archives_dont_unpack_wacz(self, tmp_path):
|
||||||
manager = self.get_test_collections_manager(tmp_path)
|
manager = self.get_test_collections_manager(tmp_path)
|
||||||
archives = ['sample_archive/warcs/example.arc', 'sample_archive/warcs/example.arc.gz',
|
archives = ['sample_archive/warcs/example.arc', 'sample_archive/warcs/example.arc.gz',
|
||||||
'sample_archive/warcs/example.warc', 'sample_archive/warcs/example.warc.gz',
|
'sample_archive/warcs/example.warc', 'sample_archive/warcs/example.warc.gz',
|
||||||
'sample_archive/waczs/valid_example_1.wacz']
|
'sample_archive/waczs/valid_example_1.wacz']
|
||||||
|
|
||||||
with pytest.raises(NotImplementedError):
|
with pytest.raises(NotImplementedError):
|
||||||
manager.add_archives(archives, uncompress_wacz=False)
|
manager.add_archives(archives, unpack_wacz=False)
|
||||||
|
|
||||||
def test_add_invalid_archives_uncompress_wacz(self, tmp_path, caplog):
|
def test_add_invalid_archives_unpack_wacz(self, tmp_path, caplog):
|
||||||
manager = self.get_test_collections_manager(tmp_path)
|
manager = self.get_test_collections_manager(tmp_path)
|
||||||
manager.add_archives(['sample_archive/warcs/example.warc', 'sample_archive/text_content/sample.html'],
|
manager.add_archives(['sample_archive/warcs/example.warc', 'sample_archive/text_content/sample.html'],
|
||||||
uncompress_wacz=True)
|
unpack_wacz=True)
|
||||||
assert 'sample.html' not in os.listdir(manager.archive_dir)
|
assert 'sample.html' not in os.listdir(manager.archive_dir)
|
||||||
assert 'example.warc' in os.listdir(manager.archive_dir)
|
assert 'example.warc' in os.listdir(manager.archive_dir)
|
||||||
assert "Invalid archives weren't added: sample_archive/text_content/sample.html" in caplog.messages
|
assert "Invalid archives weren't added: sample_archive/text_content/sample.html" in caplog.messages
|
||||||
@ -95,6 +95,10 @@ class TestManager:
|
|||||||
assert 'example-collection.warc' not in index_content
|
assert 'example-collection.warc' not in index_content
|
||||||
assert 'rewritten.warc' in index_content
|
assert 'rewritten.warc' in index_content
|
||||||
|
|
||||||
|
# check that collection index is sorted
|
||||||
|
index_lines = index_content.split('\n')
|
||||||
|
assert sorted(index_lines) == index_lines
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_test_collections_manager(collections_path):
|
def get_test_collections_manager(collections_path):
|
||||||
manager = CollectionsManager(TEST_COLLECTION_NAME, colls_dir=collections_path, must_exist=False)
|
manager = CollectionsManager(TEST_COLLECTION_NAME, colls_dir=collections_path, must_exist=False)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user