[#799] wb-manager: Add wacz archives to collection with --uncompress-wacz (#800)

Add WACZ support for `wb-manager add` by unpacking WACZ files with --uncompress-wacz. A future commit will add pywb support for WACZ files without requiring them to be unpacked.
2025-03-15 00:03:28 +01:00 · 2023-02-15 23:00:38 +01:00 · 2023-02-15 23:00:38 +01:00 · 454486bf75
commit 454486bf75
parent b8693307d1
6 changed files with 224 additions and 22 deletions
--- a/docs/manual/apps.rst
+++ b/docs/manual/apps.rst
@ -45,7 +45,7 @@ The tool can be used while ``wayback`` is running, and pywb will detect many cha
 It can be used to:
 * Create a new collection --  ``wb-manager init <coll>``
-* Add WARCs to collection -- ``wb-manager add <coll> <warc>``
+* Add WARCs or WACZs to collection -- ``wb-manager add <coll> <warc/wacz>``
 * Add override templates
 * Add and remove metadata to a collections ``metadata.yaml``
 * List all collections
--- a/pywb/manager/manager.py
+++ b/pywb/manager/manager.py
@ -5,12 +5,15 @@ import logging
 import heapq
 import yaml
 import re
 import gzip
 import six
 from distutils.util import strtobool
 from pkg_resources import resource_string, get_distribution
 from argparse import ArgumentParser, RawTextHelpFormatter
 from tempfile import mkdtemp
 from zipfile import ZipFile
 from pywb.utils.loaders import load_yaml_config
 from warcio.timeutils import timestamp20_now
@ -47,6 +50,9 @@ directory structure expected by pywb
    COLLS_DIR = 'collections'
    WARC_RX = re.compile(r'.*\.w?arc(\.gz)?$')
    WACZ_RX = re.compile(r'.*\.wacz$')
    def __init__(self, coll_name, colls_dir=None, must_exist=True):
        colls_dir = colls_dir or self.COLLS_DIR
        self.default_config = load_yaml_config(DEFAULT_CONFIG)
@ -115,29 +121,127 @@ directory structure expected by pywb
                   'To create a new collection, run\n\n{1} init {0}')
            raise IOError(msg.format(self.coll_name, sys.argv[0]))
-    def add_warcs(self, warcs):
+    def add_archives(self, archives, uncompress_wacz=False):
        if not os.path.isdir(self.archive_dir):
            raise IOError('Directory {0} does not exist'.
                          format(self.archive_dir))
-        full_paths = []
+        invalid_archives = []
-        duplicate_warcs = []
+        warc_paths = []
-        for filename in warcs:
+        for archive in archives:
-            filename = os.path.abspath(filename)
+            if self.WARC_RX.match(archive):
                full_path = self._add_warc(archive)
                if full_path:
                    warc_paths.append(full_path)
            elif self.WACZ_RX.match(archive):
                if uncompress_wacz:
                    self._add_wacz_uncompressed(archive)
                else:
                    raise NotImplementedError('Adding waczs without unpacking is not yet implemented. Use '
                                              '\'--uncompress-wacz\' flag to add the wacz\'s content.')
            else:
                invalid_archives.append(archive)
        self._index_merge_warcs(warc_paths, self.DEF_INDEX_FILE)
        if invalid_archives:
            logging.warning(f'Invalid archives weren\'t added: {", ".join(invalid_archives)}')
    def _add_warc(self, warc):
        filename = os.path.abspath(warc)
        # don't overwrite existing warcs with duplicate names
        if os.path.exists(os.path.join(self.archive_dir, os.path.basename(filename))):
-                duplicate_warcs.append(filename)
+            logging.warning(f'Warc {filename} wasn\'t added because of duplicate name.')
-                continue
+            return None
        shutil.copy2(filename, self.archive_dir)
-            full_paths.append(os.path.join(self.archive_dir, filename))
+        full_path = os.path.join(self.archive_dir, filename)
        logging.info('Copied ' + filename + ' to ' + self.archive_dir)
        return full_path
-        self._index_merge_warcs(full_paths, self.DEF_INDEX_FILE)
+    def _add_wacz_uncompressed(self, wacz):
        wacz = os.path.abspath(wacz)
        temp_dir = mkdtemp()
        warc_regex = re.compile(r'.+\.warc(\.gz)?$')
        cdx_regex = re.compile(r'.+\.cdx(\.gz)?$')
        with ZipFile(wacz, 'r') as wacz_zip_file:
            archive_members = wacz_zip_file.namelist()
            warc_files = [file for file in archive_members if warc_regex.match(file)]
            if not warc_files:
                logging.warning(f'WACZ {wacz} does not contain any warc files.')
                return
-        if duplicate_warcs:
+            # extract warc files
-            logging.warning(f'Warcs {", ".join(duplicate_warcs)} weren\'t added because of duplicate names.')
+            for warc_file in warc_files:
                wacz_zip_file.extract(warc_file, temp_dir)
            cdx_files = [file for file in archive_members if cdx_regex.match(file)]
            if not cdx_files:
                logging.warning(f'WACZ {wacz} does not contain any indices.')
                return
            for cdx_file in cdx_files:
                wacz_zip_file.extract(cdx_file, temp_dir)
        # copy extracted warc files to collections archive dir, use wacz filename as filename with added index if
        # multiple warc files exist
        warc_filename_mapping = {}
        full_paths = []
        for idx, extracted_warc_file in enumerate(warc_files):
            _, warc_ext = os.path.splitext(extracted_warc_file)
            if warc_ext == '.gz':
                warc_ext = '.warc.gz'
            warc_filename = os.path.basename(wacz)
            warc_filename, _ = os.path.splitext(warc_filename)
            warc_filename = f'{warc_filename}-{idx}{warc_ext}'
            warc_destination_path = os.path.join(self.archive_dir, warc_filename)
            if os.path.exists(warc_destination_path):
                logging.warning(f'Warc {warc_filename} wasn\'t added because of duplicate name.')
                continue
            warc_filename_mapping[os.path.basename(extracted_warc_file)] = warc_filename
            shutil.copy2(os.path.join(temp_dir, extracted_warc_file), warc_destination_path)
            full_paths.append(warc_destination_path)
        # rewrite filenames in wacz indices and merge them with collection index file
        for cdx_file in cdx_files:
            self._add_wacz_index(os.path.join(self.indexes_dir, self.DEF_INDEX_FILE), os.path.join(temp_dir, cdx_file),
                                 warc_filename_mapping)
        # delete temporary files
        shutil.rmtree(temp_dir)
    @staticmethod
    def _add_wacz_index(collection_index_path, wacz_index_path, filename_mapping):
        from pywb.warcserver.index.cdxobject import CDXObject
        # copy collection index to temporary directory
        tempdir = mkdtemp()
        collection_index_name = os.path.basename(collection_index_path)
        collection_index_temp_path = os.path.join(tempdir, collection_index_name)
        if os.path.exists(collection_index_path):
            shutil.copy2(collection_index_path, collection_index_temp_path)
        with open(collection_index_temp_path, 'a') as collection_index_temp_file:
            if wacz_index_path.endswith('.gz'):
                wacz_index_file = gzip.open(wacz_index_path, 'rb')
            else:
                wacz_index_file = open(wacz_index_path, 'rb')
            collection_index_temp_file.write('\n')
            for line in wacz_index_file.readlines():
                cdx_object = CDXObject(cdxline=line)
                if cdx_object['filename'] in filename_mapping:
                    cdx_object['filename'] = filename_mapping[cdx_object['filename']]
                collection_index_temp_file.write(cdx_object.to_cdxj())
            wacz_index_file.close()
        # copy temporary index back to original location and delete temporary directory
        shutil.move(collection_index_temp_path, collection_index_path)
        shutil.rmtree(tempdir)
    def reindex(self):
        cdx_file = os.path.join(self.indexes_dir, self.DEF_INDEX_FILE)
@ -383,16 +487,17 @@ Create manage file based web archive collections
    listcmd = subparsers.add_parser('list', help=list_help)
    listcmd.set_defaults(func=do_list)
-    # Add Warcs
+    # Add Warcs or Waczs
    def do_add(r):
        m = CollectionsManager(r.coll_name)
-        m.add_warcs(r.files)
+        m.add_archives(r.files, r.uncompress_wacz)
-    addwarc_help = 'Copy ARCS/WARCS to collection directory and reindex'
+    add_archives_help = 'Copy ARCS/WARCS/WACZ to collection directory and reindex'
-    addwarc = subparsers.add_parser('add', help=addwarc_help)
+    add_archives = subparsers.add_parser('add', help=add_archives_help)
-    addwarc.add_argument('coll_name')
+    add_archives.add_argument('--uncompress-wacz', dest='uncompress_wacz', action='store_true')
-    addwarc.add_argument('files', nargs='+')
+    add_archives.add_argument('coll_name')
-    addwarc.set_defaults(func=do_add)
+    add_archives.add_argument('files', nargs='+')
    add_archives.set_defaults(func=do_add)
    # Reindex All
    def do_reindex(r):
--- a/sample_archive/cdxj/example.cdx.gz
+++ b/sample_archive/cdxj/example.cdx.gz
--- a/sample_archive/waczs/invalid_example_1.wacz
+++ b/sample_archive/waczs/invalid_example_1.wacz
--- a/sample_archive/waczs/valid_example_1.wacz
+++ b/sample_archive/waczs/valid_example_1.wacz
--- a/tests/test_manager.py
+++ b/tests/test_manager.py
@ -0,0 +1,97 @@
 import os
 import pytest
 from pywb.manager.manager import CollectionsManager
 VALID_WACZ_PATH = 'sample_archive/waczs/valid_example_1.wacz'
 INVALID_WACZ_PATH = 'sample_archive/waczs/invalid_example_1.wacz'
 TEST_COLLECTION_NAME = 'test-col'
 class TestManager:
    def test_add_valid_wacz_uncompressed(self, tmp_path):
        """Test if adding a valid wacz file to a collection succeeds"""
        manager = self.get_test_collections_manager(tmp_path)
        manager._add_wacz_uncompressed(VALID_WACZ_PATH)
        assert 'valid_example_1-0.warc' in os.listdir(manager.archive_dir)
        assert manager.DEF_INDEX_FILE in os.listdir(manager.indexes_dir)
        with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f:
            assert '"filename": "valid_example_1-0.warc"' in f.read()
    def test_add_invalid_wacz_uncompressed(self, tmp_path, caplog):
        """Test if adding an invalid wacz file to a collection fails"""
        manager = self.get_test_collections_manager(tmp_path)
        manager._add_wacz_uncompressed(INVALID_WACZ_PATH)
        assert 'invalid_example_1-0.warc' not in os.listdir(manager.archive_dir)
        assert 'sample_archive/waczs/invalid_example_1.wacz does not contain any warc files.' in caplog.text
        index_path = os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE)
        if os.path.exists(index_path):
            with open(index_path, 'r') as f:
                assert '"filename": "invalid_example_1-0.warc"' not in f.read()
    def test_add_valid_archives_uncompressed_wacz(self, tmp_path):
        manager = self.get_test_collections_manager(tmp_path)
        archives = ['sample_archive/warcs/example.arc', 'sample_archive/warcs/example.arc.gz',
                    'sample_archive/warcs/example.warc', 'sample_archive/warcs/example.warc.gz',
                    'sample_archive/waczs/valid_example_1.wacz']
        manager.add_archives(archives, uncompress_wacz=True)
        with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f:
            index_text = f.read()
        for archive in archives:
            archive = os.path.basename(archive)
            if archive.endswith('wacz'):
                archive = 'valid_example_1-0.warc'
            assert archive in os.listdir(manager.archive_dir)
            assert archive in index_text
    def test_add_valid_archives_dont_uncompress_wacz(self, tmp_path):
        manager = self.get_test_collections_manager(tmp_path)
        archives = ['sample_archive/warcs/example.arc', 'sample_archive/warcs/example.arc.gz',
                    'sample_archive/warcs/example.warc', 'sample_archive/warcs/example.warc.gz',
                    'sample_archive/waczs/valid_example_1.wacz']
        with pytest.raises(NotImplementedError):
            manager.add_archives(archives, uncompress_wacz=False)
    def test_add_invalid_archives_uncompress_wacz(self, tmp_path, caplog):
        manager = self.get_test_collections_manager(tmp_path)
        manager.add_archives(['sample_archive/warcs/example.warc', 'sample_archive/text_content/sample.html'],
                             uncompress_wacz=True)
        assert 'sample.html' not in os.listdir(manager.archive_dir)
        assert 'example.warc' in os.listdir(manager.archive_dir)
        assert "Invalid archives weren't added: sample_archive/text_content/sample.html" in caplog.messages
    def test_merge_wacz_index(self, tmp_path):
        manager = self.get_test_collections_manager(tmp_path)
        manager._add_wacz_index(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE),
                                'sample_archive/cdxj/example.cdxj',
                                {'example.warc.gz': 'rewritten.warc.gz'})
        with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f:
            index_content = f.read()
        assert 'example.warc.gz' not in index_content
        assert 'rewritten.warc.gz' in index_content
    def test_merge_wacz_index_gzip(self, tmp_path):
        manager = self.get_test_collections_manager(tmp_path)
        manager._add_wacz_index(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE),
                                'sample_archive/cdxj/example.cdx.gz',
                                {'example-collection.warc': 'rewritten.warc'})
        with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f:
            index_content = f.read()
        assert 'example-collection.warc' not in index_content
        assert 'rewritten.warc' in index_content
    @staticmethod
    def get_test_collections_manager(collections_path):
        manager = CollectionsManager(TEST_COLLECTION_NAME, colls_dir=collections_path, must_exist=False)
        manager.add_collection()
        return manager