1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-14 15:53:28 +01:00

Rename --uncompress-wacz to --unpack-wacz and add docs (#901)

Also adds help text for wb-manager add --unpack-wacz option in CLI
This commit is contained in:
Tessa Walsh 2024-04-24 05:02:26 -04:00 committed by GitHub
parent b4c91c6633
commit e89924bd39
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 33 additions and 19 deletions

View File

@ -45,7 +45,8 @@ The tool can be used while ``wayback`` is running, and pywb will detect many cha
It can be used to:
* Create a new collection -- ``wb-manager init <coll>``
* Add WARCs or WACZs to collection -- ``wb-manager add <coll> <warc/wacz>``
* Add WARCs to collection -- ``wb-manager add <coll> <warc>``
* Unpack WACZs to add their WARCs and indices to collection -- ``wb-manager add --unpack-wacz <coll> <wacz>``
* Add override templates
* Add and remove metadata to a collection's ``metadata.yaml``
* List all collections

View File

@ -114,6 +114,8 @@ Using Existing Web Archive Collections
Existing archives of WARC/ARC files can be used with pywb with a minimal amount of setup. By using ``wb-manager add``,
WARC/ARC files will automatically be placed in the collection archive directory and indexed.
In pywb 2.8.0 and later, preliminary support for WACZ files is also added with ``wb-manager add --unpack-wacz``. This will unpack the provided WACZ file, adding its WARCs and indices to the collection.
By default, ``wb-manager`` places new collections in the ``collections/<coll name>`` subdirectory of the current working directory. To specify a different root directory, use ``wb-manager -d <dir>``. Other options can be set in the config file.
If you have a large number of existing CDX index files, pywb will be able to read them as well after running through a simple conversion process.

View File

@ -121,7 +121,7 @@ directory structure expected by pywb
'To create a new collection, run\n\n{1} init {0}')
raise IOError(msg.format(self.coll_name, sys.argv[0]))
def add_archives(self, archives, uncompress_wacz=False):
def add_archives(self, archives, unpack_wacz=False):
if not os.path.isdir(self.archive_dir):
raise IOError('Directory {0} does not exist'.
format(self.archive_dir))
@ -134,11 +134,11 @@ directory structure expected by pywb
if full_path:
warc_paths.append(full_path)
elif self.WACZ_RX.match(archive):
if uncompress_wacz:
self._add_wacz_uncompressed(archive)
if unpack_wacz:
self._add_wacz_unpacked(archive)
else:
raise NotImplementedError('Adding waczs without unpacking is not yet implemented. Use '
'\'--uncompress-wacz\' flag to add the wacz\'s content.')
'\'--unpack-wacz\' flag to add the wacz\'s content.')
else:
invalid_archives.append(archive)
@ -160,7 +160,7 @@ directory structure expected by pywb
logging.info('Copied ' + filename + ' to ' + self.archive_dir)
return full_path
def _add_wacz_uncompressed(self, wacz):
def _add_wacz_unpacked(self, wacz):
wacz = os.path.abspath(wacz)
temp_dir = mkdtemp()
warc_regex = re.compile(r'.+\.warc(\.gz)?$')
@ -494,11 +494,17 @@ Create manage file based web archive collections
# Add Warcs or Waczs
def do_add(r):
m = CollectionsManager(r.coll_name)
m.add_archives(r.files, r.uncompress_wacz)
m.add_archives(r.files, r.unpack_wacz)
add_archives_help = 'Copy ARCS/WARCS/WACZ to collection directory and reindex'
add_archives_help = 'Copy ARCs/WARCs to collection directory and reindex'
add_unpack_wacz_help = 'Copy WARCs from WACZ to collection directory and reindex'
add_archives = subparsers.add_parser('add', help=add_archives_help)
add_archives.add_argument('--uncompress-wacz', dest='uncompress_wacz', action='store_true')
add_archives.add_argument(
'--unpack-wacz',
dest='unpack_wacz',
action='store_true',
help=add_unpack_wacz_help
)
add_archives.add_argument('coll_name')
add_archives.add_argument('files', nargs='+')
add_archives.set_defaults(func=do_add)

View File

@ -11,19 +11,19 @@ TEST_COLLECTION_NAME = 'test-col'
class TestManager:
def test_add_valid_wacz_uncompressed(self, tmp_path):
def test_add_valid_wacz_unpacked(self, tmp_path):
"""Test if adding a valid wacz file to a collection succeeds"""
manager = self.get_test_collections_manager(tmp_path)
manager._add_wacz_uncompressed(VALID_WACZ_PATH)
manager._add_wacz_unpacked(VALID_WACZ_PATH)
assert 'valid_example_1-0.warc' in os.listdir(manager.archive_dir)
assert manager.DEF_INDEX_FILE in os.listdir(manager.indexes_dir)
with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f:
assert '"filename": "valid_example_1-0.warc"' in f.read()
def test_add_invalid_wacz_uncompressed(self, tmp_path, caplog):
def test_add_invalid_wacz_unpacked(self, tmp_path, caplog):
"""Test if adding an invalid wacz file to a collection fails"""
manager = self.get_test_collections_manager(tmp_path)
manager._add_wacz_uncompressed(INVALID_WACZ_PATH)
manager._add_wacz_unpacked(INVALID_WACZ_PATH)
assert 'invalid_example_1-0.warc' not in os.listdir(manager.archive_dir)
assert 'sample_archive/waczs/invalid_example_1.wacz does not contain any warc files.' in caplog.text
@ -32,12 +32,12 @@ class TestManager:
with open(index_path, 'r') as f:
assert '"filename": "invalid_example_1-0.warc"' not in f.read()
def test_add_valid_archives_uncompressed_wacz(self, tmp_path):
def test_add_valid_archives_unpack_wacz(self, tmp_path):
manager = self.get_test_collections_manager(tmp_path)
archives = ['sample_archive/warcs/example.arc', 'sample_archive/warcs/example.arc.gz',
'sample_archive/warcs/example.warc', 'sample_archive/warcs/example.warc.gz',
'sample_archive/waczs/valid_example_1.wacz']
manager.add_archives(archives, uncompress_wacz=True)
manager.add_archives(archives, unpack_wacz=True)
with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f:
index_text = f.read()
@ -51,19 +51,19 @@ class TestManager:
assert archive in os.listdir(manager.archive_dir)
assert archive in index_text
def test_add_valid_archives_dont_uncompress_wacz(self, tmp_path):
def test_add_valid_archives_dont_unpack_wacz(self, tmp_path):
manager = self.get_test_collections_manager(tmp_path)
archives = ['sample_archive/warcs/example.arc', 'sample_archive/warcs/example.arc.gz',
'sample_archive/warcs/example.warc', 'sample_archive/warcs/example.warc.gz',
'sample_archive/waczs/valid_example_1.wacz']
with pytest.raises(NotImplementedError):
manager.add_archives(archives, uncompress_wacz=False)
manager.add_archives(archives, unpack_wacz=False)
def test_add_invalid_archives_uncompress_wacz(self, tmp_path, caplog):
def test_add_invalid_archives_unpack_wacz(self, tmp_path, caplog):
manager = self.get_test_collections_manager(tmp_path)
manager.add_archives(['sample_archive/warcs/example.warc', 'sample_archive/text_content/sample.html'],
uncompress_wacz=True)
unpack_wacz=True)
assert 'sample.html' not in os.listdir(manager.archive_dir)
assert 'example.warc' in os.listdir(manager.archive_dir)
assert "Invalid archives weren't added: sample_archive/text_content/sample.html" in caplog.messages
@ -91,10 +91,15 @@ class TestManager:
{'example-collection.warc': 'rewritten.warc'})
with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f:
index_content = f.read()
index_content = index_content.strip()
assert 'example-collection.warc' not in index_content
assert 'rewritten.warc' in index_content
# check that collection index is sorted
index_lines = index_content.split('\n')
assert sorted(index_lines) == index_lines
@staticmethod
def get_test_collections_manager(collections_path):
manager = CollectionsManager(TEST_COLLECTION_NAME, colls_dir=collections_path, must_exist=False)