1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Rename --uncompress-wacz to --unpack-wacz

This commit is contained in:
Tessa Walsh 2024-04-24 01:18:51 +02:00
parent 4a645eee02
commit abe755b1de
4 changed files with 30 additions and 20 deletions

View File

@ -46,7 +46,7 @@ It can be used to:
* Create a new collection -- ``wb-manager init <coll>`` * Create a new collection -- ``wb-manager init <coll>``
* Add WARCs to collection -- ``wb-manager add <coll> <warc>`` * Add WARCs to collection -- ``wb-manager add <coll> <warc>``
* Unpack WACZs to add their WARCs and indices to collection -- ``wb-manager add --uncompress-wacz <coll> <wacz>`` * Unpack WACZs to add their WARCs and indices to collection -- ``wb-manager add --unpack-wacz <coll> <wacz>``
* Add override templates * Add override templates
* Add or remove metadata in a collection's ``metadata.yaml`` * Add or remove metadata in a collection's ``metadata.yaml``
* List all collections * List all collections

View File

@ -114,7 +114,7 @@ Using Existing Web Archive Collections
Existing archives of WARC/ARC files can be used with pywb with a minimal amount of setup. By using ``wb-manager add``, Existing archives of WARC/ARC files can be used with pywb with a minimal amount of setup. By using ``wb-manager add``,
WARC/ARC files will automatically be placed in the collection archive directory and indexed. WARC/ARC files will automatically be placed in the collection archive directory and indexed.
In pywb 2.8.0 and later, preliminary support for WACZ files is also added with ``wb-manager add --uncompress-wacz``. This will unpack the provided WACZ file, adding its WARCs and indices to the collection. In pywb 2.8.0 and later, preliminary support for WACZ files is also added with ``wb-manager add --unpack-wacz``. This will unpack the provided WACZ file, adding its WARCs and indices to the collection.
By default, ``wb-manager`` places new collections in the ``collections/<coll name>`` subdirectory of the current working directory. To specify a different root directory, use ``wb-manager -d <dir>``. Other options can be set in the config file. By default, ``wb-manager`` places new collections in the ``collections/<coll name>`` subdirectory of the current working directory. To specify a different root directory, use ``wb-manager -d <dir>``. Other options can be set in the config file.

View File

@ -121,7 +121,7 @@ directory structure expected by pywb
'To create a new collection, run\n\n{1} init {0}') 'To create a new collection, run\n\n{1} init {0}')
raise IOError(msg.format(self.coll_name, sys.argv[0])) raise IOError(msg.format(self.coll_name, sys.argv[0]))
def add_archives(self, archives, uncompress_wacz=False): def add_archives(self, archives, unpack_wacz=False):
if not os.path.isdir(self.archive_dir): if not os.path.isdir(self.archive_dir):
raise IOError('Directory {0} does not exist'. raise IOError('Directory {0} does not exist'.
format(self.archive_dir)) format(self.archive_dir))
@ -134,11 +134,11 @@ directory structure expected by pywb
if full_path: if full_path:
warc_paths.append(full_path) warc_paths.append(full_path)
elif self.WACZ_RX.match(archive): elif self.WACZ_RX.match(archive):
if uncompress_wacz: if unpack_wacz:
self._add_wacz_uncompressed(archive) self._add_wacz_unpacked(archive)
else: else:
raise NotImplementedError('Adding waczs without unpacking is not yet implemented. Use ' raise NotImplementedError('Adding waczs without unpacking is not yet implemented. Use '
'\'--uncompress-wacz\' flag to add the wacz\'s content.') '\'--unpack-wacz\' flag to add the wacz\'s content.')
else: else:
invalid_archives.append(archive) invalid_archives.append(archive)
@ -160,7 +160,7 @@ directory structure expected by pywb
logging.info('Copied ' + filename + ' to ' + self.archive_dir) logging.info('Copied ' + filename + ' to ' + self.archive_dir)
return full_path return full_path
def _add_wacz_uncompressed(self, wacz): def _add_wacz_unpacked(self, wacz):
wacz = os.path.abspath(wacz) wacz = os.path.abspath(wacz)
temp_dir = mkdtemp() temp_dir = mkdtemp()
warc_regex = re.compile(r'.+\.warc(\.gz)?$') warc_regex = re.compile(r'.+\.warc(\.gz)?$')
@ -494,11 +494,17 @@ Create manage file based web archive collections
# Add Warcs or Waczs # Add Warcs or Waczs
def do_add(r): def do_add(r):
m = CollectionsManager(r.coll_name) m = CollectionsManager(r.coll_name)
m.add_archives(r.files, r.uncompress_wacz) m.add_archives(r.files, r.unpack_wacz)
add_archives_help = 'Copy ARCS/WARCS/WACZ to collection directory and reindex' add_archives_help = 'Copy ARCs/WARCs to collection directory and reindex'
add_unpack_wacz_help = 'Copy WARCs and indices from WACZ to collection directory'
add_archives = subparsers.add_parser('add', help=add_archives_help) add_archives = subparsers.add_parser('add', help=add_archives_help)
add_archives.add_argument('--uncompress-wacz', dest='uncompress_wacz', action='store_true') add_archives.add_argument(
'--unpack-wacz',
dest='unpack_wacz',
action='store_true',
help=add_unpack_wacz_help
)
add_archives.add_argument('coll_name') add_archives.add_argument('coll_name')
add_archives.add_argument('files', nargs='+') add_archives.add_argument('files', nargs='+')
add_archives.set_defaults(func=do_add) add_archives.set_defaults(func=do_add)

View File

@ -11,19 +11,19 @@ TEST_COLLECTION_NAME = 'test-col'
class TestManager: class TestManager:
def test_add_valid_wacz_uncompressed(self, tmp_path): def test_add_valid_wacz_unpacked(self, tmp_path):
"""Test if adding a valid wacz file to a collection succeeds""" """Test if adding a valid wacz file to a collection succeeds"""
manager = self.get_test_collections_manager(tmp_path) manager = self.get_test_collections_manager(tmp_path)
manager._add_wacz_uncompressed(VALID_WACZ_PATH) manager._add_wacz_unpacked(VALID_WACZ_PATH)
assert 'valid_example_1-0.warc' in os.listdir(manager.archive_dir) assert 'valid_example_1-0.warc' in os.listdir(manager.archive_dir)
assert manager.DEF_INDEX_FILE in os.listdir(manager.indexes_dir) assert manager.DEF_INDEX_FILE in os.listdir(manager.indexes_dir)
with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f: with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f:
assert '"filename": "valid_example_1-0.warc"' in f.read() assert '"filename": "valid_example_1-0.warc"' in f.read()
def test_add_invalid_wacz_uncompressed(self, tmp_path, caplog): def test_add_invalid_wacz_unpacked(self, tmp_path, caplog):
"""Test if adding an invalid wacz file to a collection fails""" """Test if adding an invalid wacz file to a collection fails"""
manager = self.get_test_collections_manager(tmp_path) manager = self.get_test_collections_manager(tmp_path)
manager._add_wacz_uncompressed(INVALID_WACZ_PATH) manager._add_wacz_unpacked(INVALID_WACZ_PATH)
assert 'invalid_example_1-0.warc' not in os.listdir(manager.archive_dir) assert 'invalid_example_1-0.warc' not in os.listdir(manager.archive_dir)
assert 'sample_archive/waczs/invalid_example_1.wacz does not contain any warc files.' in caplog.text assert 'sample_archive/waczs/invalid_example_1.wacz does not contain any warc files.' in caplog.text
@ -32,12 +32,12 @@ class TestManager:
with open(index_path, 'r') as f: with open(index_path, 'r') as f:
assert '"filename": "invalid_example_1-0.warc"' not in f.read() assert '"filename": "invalid_example_1-0.warc"' not in f.read()
def test_add_valid_archives_uncompressed_wacz(self, tmp_path): def test_add_valid_archives_unpack_wacz(self, tmp_path):
manager = self.get_test_collections_manager(tmp_path) manager = self.get_test_collections_manager(tmp_path)
archives = ['sample_archive/warcs/example.arc', 'sample_archive/warcs/example.arc.gz', archives = ['sample_archive/warcs/example.arc', 'sample_archive/warcs/example.arc.gz',
'sample_archive/warcs/example.warc', 'sample_archive/warcs/example.warc.gz', 'sample_archive/warcs/example.warc', 'sample_archive/warcs/example.warc.gz',
'sample_archive/waczs/valid_example_1.wacz'] 'sample_archive/waczs/valid_example_1.wacz']
manager.add_archives(archives, uncompress_wacz=True) manager.add_archives(archives, unpack_wacz=True)
with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f: with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f:
index_text = f.read() index_text = f.read()
@ -51,19 +51,19 @@ class TestManager:
assert archive in os.listdir(manager.archive_dir) assert archive in os.listdir(manager.archive_dir)
assert archive in index_text assert archive in index_text
def test_add_valid_archives_dont_uncompress_wacz(self, tmp_path): def test_add_valid_archives_dont_unpack_wacz(self, tmp_path):
manager = self.get_test_collections_manager(tmp_path) manager = self.get_test_collections_manager(tmp_path)
archives = ['sample_archive/warcs/example.arc', 'sample_archive/warcs/example.arc.gz', archives = ['sample_archive/warcs/example.arc', 'sample_archive/warcs/example.arc.gz',
'sample_archive/warcs/example.warc', 'sample_archive/warcs/example.warc.gz', 'sample_archive/warcs/example.warc', 'sample_archive/warcs/example.warc.gz',
'sample_archive/waczs/valid_example_1.wacz'] 'sample_archive/waczs/valid_example_1.wacz']
with pytest.raises(NotImplementedError): with pytest.raises(NotImplementedError):
manager.add_archives(archives, uncompress_wacz=False) manager.add_archives(archives, unpack_wacz=False)
def test_add_invalid_archives_uncompress_wacz(self, tmp_path, caplog): def test_add_invalid_archives_unpack_wacz(self, tmp_path, caplog):
manager = self.get_test_collections_manager(tmp_path) manager = self.get_test_collections_manager(tmp_path)
manager.add_archives(['sample_archive/warcs/example.warc', 'sample_archive/text_content/sample.html'], manager.add_archives(['sample_archive/warcs/example.warc', 'sample_archive/text_content/sample.html'],
uncompress_wacz=True) unpack_wacz=True)
assert 'sample.html' not in os.listdir(manager.archive_dir) assert 'sample.html' not in os.listdir(manager.archive_dir)
assert 'example.warc' in os.listdir(manager.archive_dir) assert 'example.warc' in os.listdir(manager.archive_dir)
assert "Invalid archives weren't added: sample_archive/text_content/sample.html" in caplog.messages assert "Invalid archives weren't added: sample_archive/text_content/sample.html" in caplog.messages
@ -95,6 +95,10 @@ class TestManager:
assert 'example-collection.warc' not in index_content assert 'example-collection.warc' not in index_content
assert 'rewritten.warc' in index_content assert 'rewritten.warc' in index_content
# check that collection index is sorted
index_lines = index_content.split('\n')
assert sorted(index_lines) == index_lines
@staticmethod @staticmethod
def get_test_collections_manager(collections_path): def get_test_collections_manager(collections_path):
manager = CollectionsManager(TEST_COLLECTION_NAME, colls_dir=collections_path, must_exist=False) manager = CollectionsManager(TEST_COLLECTION_NAME, colls_dir=collections_path, must_exist=False)