import os

import pytest

from pywb.manager.manager import CollectionsManager

VALID_WACZ_PATH = 'sample_archive/waczs/valid_example_1.wacz'
INVALID_WACZ_PATH = 'sample_archive/waczs/invalid_example_1.wacz'

TEST_COLLECTION_NAME = 'test-col'


class TestManager:
    """Tests for adding archives and WACZ indexes through CollectionsManager."""

    def test_add_valid_wacz_unpacked(self, tmp_path):
        """Adding a valid wacz stores its warc and records it in the index."""
        manager = self.get_test_collections_manager(tmp_path)
        manager._add_wacz_unpacked(VALID_WACZ_PATH)

        assert 'valid_example_1-0.warc' in os.listdir(manager.archive_dir)
        assert manager.DEF_INDEX_FILE in os.listdir(manager.indexes_dir)

        index_text = self._read_index(manager)
        assert '"filename": "valid_example_1-0.warc"' in index_text

    def test_add_valid_wacz_unpacked_dupe_name(self, tmp_path):
        """Adding the same wacz twice keeps both warcs, the second one renamed."""
        manager = self.get_test_collections_manager(tmp_path)
        manager._add_wacz_unpacked(VALID_WACZ_PATH)
        # Add it again to see if there are name conflicts
        manager._add_wacz_unpacked(VALID_WACZ_PATH)

        stored_archives = os.listdir(manager.archive_dir)
        for warc_name in ('valid_example_1-0.warc', 'valid_example_1-0-1.warc'):
            assert warc_name in stored_archives

        assert manager.DEF_INDEX_FILE in os.listdir(manager.indexes_dir)

        index_text = self._read_index(manager)
        expected_entries = (
            '"filename": "valid_example_1-0.warc"',
            '"filename": "valid_example_1-0-1.warc"',
        )
        for entry in expected_entries:
            assert entry in index_text

    def test_add_invalid_wacz_unpacked(self, tmp_path, caplog):
        """Adding an invalid wacz stores nothing, indexes nothing, and logs why."""
        manager = self.get_test_collections_manager(tmp_path)
        manager._add_wacz_unpacked(INVALID_WACZ_PATH)

        assert 'invalid_example_1-0.warc' not in os.listdir(manager.archive_dir)

        expected_log = 'sample_archive/waczs/invalid_example_1.wacz does not contain any warc files.'
        assert expected_log in caplog.text

        # The index file may not exist at all; only inspect it when it does.
        index_path = os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE)
        if os.path.exists(index_path):
            index_text = self._read_index(manager)
            assert '"filename": "invalid_example_1-0.warc"' not in index_text

    def test_add_valid_archives_unpack_wacz(self, tmp_path):
        """add_archives with unpack_wacz=True stores and indexes every archive."""
        manager = self.get_test_collections_manager(tmp_path)
        archives = [
            'sample_archive/warcs/example.arc',
            'sample_archive/warcs/example.arc.gz',
            'sample_archive/warcs/example.warc',
            'sample_archive/warcs/example.warc.gz',
            'sample_archive/waczs/valid_example_1.wacz',
        ]
        manager.add_archives(archives, unpack_wacz=True)

        index_text = self._read_index(manager)
        stored_archives = os.listdir(manager.archive_dir)

        for archive_path in archives:
            base_name = os.path.basename(archive_path)
            # The wacz itself is not stored; the warc it contains is.
            stored_name = 'valid_example_1-0.warc' if base_name.endswith('wacz') else base_name
            assert stored_name in stored_archives
            assert stored_name in index_text

    def test_add_valid_archives_dupe_name(self, tmp_path):
        """Adding the same warc twice stores the second copy under a -1 name."""
        manager = self.get_test_collections_manager(tmp_path)
        warc_filename = 'sample_archive/warcs/example.warc.gz'
        manager.add_archives([warc_filename, warc_filename])

        index_text = self._read_index(manager)
        stored_archives = os.listdir(manager.archive_dir)

        for expected_name in ('example.warc.gz', 'example-1.warc.gz'):
            assert expected_name in stored_archives
            assert expected_name in index_text

    def test_add_valid_archives_dont_unpack_wacz(self, tmp_path):
        """add_archives with unpack_wacz=False raises NotImplementedError for a wacz."""
        manager = self.get_test_collections_manager(tmp_path)
        archives = [
            'sample_archive/warcs/example.arc',
            'sample_archive/warcs/example.arc.gz',
            'sample_archive/warcs/example.warc',
            'sample_archive/warcs/example.warc.gz',
            'sample_archive/waczs/valid_example_1.wacz',
        ]

        with pytest.raises(NotImplementedError):
            manager.add_archives(archives, unpack_wacz=False)

    def test_add_invalid_archives_unpack_wacz(self, tmp_path, caplog):
        """A non-archive input is skipped and reported; the valid warc is added."""
        manager = self.get_test_collections_manager(tmp_path)
        inputs = [
            'sample_archive/warcs/example.warc',
            'sample_archive/text_content/sample.html',
        ]
        manager.add_archives(inputs, unpack_wacz=True)

        assert 'sample.html' not in os.listdir(manager.archive_dir)
        assert 'example.warc' in os.listdir(manager.archive_dir)

        expected_message = "Invalid archives weren't added: sample_archive/text_content/sample.html"
        assert expected_message in caplog.messages

    def test_merge_wacz_index(self, tmp_path):
        """Merging a cdxj index rewrites filenames and leaves the index sorted."""
        manager = self.get_test_collections_manager(tmp_path)
        collection_index = os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE)
        filename_mapping = {'example.warc.gz': 'rewritten.warc.gz'}
        manager._add_wacz_index(collection_index,
                                'sample_archive/cdxj/example.cdxj',
                                filename_mapping)

        index_content = self._read_index(manager).strip()

        assert 'example.warc.gz' not in index_content
        assert 'rewritten.warc.gz' in index_content

        # check that collection index is sorted
        index_lines = index_content.split('\n')
        assert sorted(index_lines) == index_lines

    def test_merge_wacz_index_gzip(self, tmp_path):
        """Merging a .cdx.gz index rewrites filenames and leaves the index sorted."""
        manager = self.get_test_collections_manager(tmp_path)
        collection_index = os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE)
        filename_mapping = {'example-collection.warc': 'rewritten.warc'}
        manager._add_wacz_index(collection_index,
                                'sample_archive/cdxj/example.cdx.gz',
                                filename_mapping)

        index_content = self._read_index(manager).strip()

        assert 'example-collection.warc' not in index_content
        assert 'rewritten.warc' in index_content

        # check that collection index is sorted
        index_lines = index_content.split('\n')
        assert sorted(index_lines) == index_lines

    @staticmethod
    def _read_index(manager):
        """Return the full text of the manager's default index file."""
        index_path = os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE)
        with open(index_path, 'r') as index_file:
            return index_file.read()

    @staticmethod
    def get_test_collections_manager(collections_path):
        """Build a CollectionsManager for the test collection and add the collection.

        The manager is created for TEST_COLLECTION_NAME with
        ``colls_dir=collections_path`` and ``must_exist=False``;
        ``add_collection()`` is called before it is returned.
        """
        manager = CollectionsManager(
            TEST_COLLECTION_NAME,
            colls_dir=collections_path,
            must_exist=False,
        )
        manager.add_collection()
        return manager