1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00
pywb/tests/test_manager.py
Tessa Walsh b9f1609df9
Handle WARC filename conflicts with wb-manager add (#902)
Append -index to end of filename prior to extension until there is no conflict

Also makes sure this behavior is documented in tests
2024-04-24 08:09:02 -04:00

136 lines
6.4 KiB
Python

import os
import pytest
from pywb.manager.manager import CollectionsManager
# Sample WACZ that the tests below expect to unpack into 'valid_example_1-0.warc'.
# NOTE(review): the sample paths are relative, so the tests presumably have to
# be run from the repository root -- confirm.
VALID_WACZ_PATH = 'sample_archive/waczs/valid_example_1.wacz'
# Sample WACZ that the tests below expect to be reported as containing no WARC files.
INVALID_WACZ_PATH = 'sample_archive/waczs/invalid_example_1.wacz'
# Name of the collection that get_test_collections_manager() creates for each test.
TEST_COLLECTION_NAME = 'test-col'
class TestManager:
    """Tests for ``CollectionsManager``: adding archives/WACZs and merging indexes.

    Every test builds its own collection inside pytest's ``tmp_path`` via
    :meth:`get_test_collections_manager`.
    """

    def test_add_valid_wacz_unpacked(self, tmp_path):
        """Unpacking a valid WACZ puts its WARC in the archive dir and the index."""
        mgr = self.get_test_collections_manager(tmp_path)
        mgr._add_wacz_unpacked(VALID_WACZ_PATH)

        assert 'valid_example_1-0.warc' in os.listdir(mgr.archive_dir)
        assert mgr.DEF_INDEX_FILE in os.listdir(mgr.indexes_dir)

        index_path = os.path.join(mgr.indexes_dir, mgr.DEF_INDEX_FILE)
        with open(index_path, 'r') as index_file:
            index_text = index_file.read()
        assert '"filename": "valid_example_1-0.warc"' in index_text

    def test_add_valid_wacz_unpacked_dupe_name(self, tmp_path):
        """Unpacking the same WACZ twice keeps both WARCs, the second one suffixed."""
        mgr = self.get_test_collections_manager(tmp_path)

        # Two identical adds: the second one must hit a filename conflict.
        mgr._add_wacz_unpacked(VALID_WACZ_PATH)
        mgr._add_wacz_unpacked(VALID_WACZ_PATH)

        assert 'valid_example_1-0.warc' in os.listdir(mgr.archive_dir)
        assert 'valid_example_1-0-1.warc' in os.listdir(mgr.archive_dir)
        assert mgr.DEF_INDEX_FILE in os.listdir(mgr.indexes_dir)

        index_path = os.path.join(mgr.indexes_dir, mgr.DEF_INDEX_FILE)
        with open(index_path, 'r') as index_file:
            index_text = index_file.read()
        assert '"filename": "valid_example_1-0.warc"' in index_text
        assert '"filename": "valid_example_1-0-1.warc"' in index_text

    def test_add_invalid_wacz_unpacked(self, tmp_path, caplog):
        """An invalid WACZ adds no WARC, logs a message, and leaves the index clean."""
        mgr = self.get_test_collections_manager(tmp_path)
        mgr._add_wacz_unpacked(INVALID_WACZ_PATH)

        assert 'invalid_example_1-0.warc' not in os.listdir(mgr.archive_dir)
        assert 'sample_archive/waczs/invalid_example_1.wacz does not contain any warc files.' in caplog.text

        # The index file may not exist at all; only inspect it when it does.
        index_path = os.path.join(mgr.indexes_dir, mgr.DEF_INDEX_FILE)
        if not os.path.exists(index_path):
            return
        with open(index_path, 'r') as index_file:
            index_text = index_file.read()
        assert '"filename": "invalid_example_1-0.warc"' not in index_text

    def test_add_valid_archives_unpack_wacz(self, tmp_path):
        """ARC/WARC files and an unpacked WACZ all end up archived and indexed."""
        mgr = self.get_test_collections_manager(tmp_path)
        sources = [
            'sample_archive/warcs/example.arc',
            'sample_archive/warcs/example.arc.gz',
            'sample_archive/warcs/example.warc',
            'sample_archive/warcs/example.warc.gz',
            'sample_archive/waczs/valid_example_1.wacz',
        ]
        mgr.add_archives(sources, unpack_wacz=True)

        index_path = os.path.join(mgr.indexes_dir, mgr.DEF_INDEX_FILE)
        with open(index_path, 'r') as index_file:
            index_text = index_file.read()

        for source in sources:
            name = os.path.basename(source)
            # The WACZ itself is not stored; the WARC it contains is.
            expected = 'valid_example_1-0.warc' if name.endswith('wacz') else name
            assert expected in os.listdir(mgr.archive_dir)
            assert expected in index_text

    def test_add_valid_archives_dupe_name(self, tmp_path):
        """Adding the same WARC twice stores the second copy under a '-1' name."""
        mgr = self.get_test_collections_manager(tmp_path)
        warc_path = 'sample_archive/warcs/example.warc.gz'
        mgr.add_archives([warc_path] * 2)

        index_path = os.path.join(mgr.indexes_dir, mgr.DEF_INDEX_FILE)
        with open(index_path, 'r') as index_file:
            index_text = index_file.read()

        for expected in ('example.warc.gz', 'example-1.warc.gz'):
            assert expected in os.listdir(mgr.archive_dir)
            assert expected in index_text

    def test_add_valid_archives_dont_unpack_wacz(self, tmp_path):
        """Adding a list that includes a WACZ with ``unpack_wacz=False`` raises."""
        mgr = self.get_test_collections_manager(tmp_path)
        sources = [
            'sample_archive/warcs/example.arc',
            'sample_archive/warcs/example.arc.gz',
            'sample_archive/warcs/example.warc',
            'sample_archive/warcs/example.warc.gz',
            'sample_archive/waczs/valid_example_1.wacz',
        ]
        with pytest.raises(NotImplementedError):
            mgr.add_archives(sources, unpack_wacz=False)

    def test_add_invalid_archives_unpack_wacz(self, tmp_path, caplog):
        """A non-archive file is skipped and logged; the valid WARC is still added."""
        mgr = self.get_test_collections_manager(tmp_path)
        sources = [
            'sample_archive/warcs/example.warc',
            'sample_archive/text_content/sample.html',
        ]
        mgr.add_archives(sources, unpack_wacz=True)

        assert 'sample.html' not in os.listdir(mgr.archive_dir)
        assert 'example.warc' in os.listdir(mgr.archive_dir)
        assert "Invalid archives weren't added: sample_archive/text_content/sample.html" in caplog.messages

    def test_merge_wacz_index(self, tmp_path):
        """Merging a CDXJ index swaps in the mapped filename and stays sorted."""
        mgr = self.get_test_collections_manager(tmp_path)
        index_path = os.path.join(mgr.indexes_dir, mgr.DEF_INDEX_FILE)
        mgr._add_wacz_index(
            index_path,
            'sample_archive/cdxj/example.cdxj',
            {'example.warc.gz': 'rewritten.warc.gz'},
        )

        with open(index_path, 'r') as index_file:
            merged = index_file.read().strip()

        assert 'example.warc.gz' not in merged
        assert 'rewritten.warc.gz' in merged

        # The collection index has to remain in sorted line order.
        merged_lines = merged.split('\n')
        assert sorted(merged_lines) == merged_lines

    def test_merge_wacz_index_gzip(self, tmp_path):
        """Same as the plain merge test, but the source index is a ``.cdx.gz`` file."""
        mgr = self.get_test_collections_manager(tmp_path)
        index_path = os.path.join(mgr.indexes_dir, mgr.DEF_INDEX_FILE)
        mgr._add_wacz_index(
            index_path,
            'sample_archive/cdxj/example.cdx.gz',
            {'example-collection.warc': 'rewritten.warc'},
        )

        with open(index_path, 'r') as index_file:
            merged = index_file.read().strip()

        assert 'example-collection.warc' not in merged
        assert 'rewritten.warc' in merged

        # The collection index has to remain in sorted line order.
        merged_lines = merged.split('\n')
        assert sorted(merged_lines) == merged_lines

    @staticmethod
    def get_test_collections_manager(collections_path):
        """Return a manager for ``TEST_COLLECTION_NAME`` rooted at *collections_path*.

        The manager is constructed with ``must_exist=False`` and
        ``add_collection()`` is called on it before it is returned.
        """
        mgr = CollectionsManager(
            TEST_COLLECTION_NAME,
            colls_dir=collections_path,
            must_exist=False,
        )
        mgr.add_collection()
        return mgr