mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-14 15:53:28 +01:00
Append -index to end of filename prior to extension until there is no conflict. Also makes sure this behavior is documented in tests.
136 lines
6.4 KiB
Python
136 lines
6.4 KiB
Python
import os
|
|
|
|
import pytest
|
|
|
|
from pywb.manager.manager import CollectionsManager
|
|
|
|
# WACZ fixture the tests expect to unpack into 'valid_example_1-0.warc'.
VALID_WACZ_PATH = 'sample_archive/waczs/valid_example_1.wacz'
|
|
# WACZ fixture the tests expect to be rejected with a
# "does not contain any warc files." log message.
INVALID_WACZ_PATH = 'sample_archive/waczs/invalid_example_1.wacz'
|
|
|
|
# Collection name handed to CollectionsManager by get_test_collections_manager.
TEST_COLLECTION_NAME = 'test-col'
|
|
|
|
|
|
class TestManager:
    """Tests for adding archives and WACZ indexes via ``CollectionsManager``.

    Every test builds a fresh collection under pytest's ``tmp_path`` and
    reads sample inputs from the relative ``sample_archive/`` directory.
    """

    def test_add_valid_wacz_unpacked(self, tmp_path):
        """A valid WACZ yields its WARC in the archive dir and in the index."""
        mgr = self.get_test_collections_manager(tmp_path)
        mgr._add_wacz_unpacked(VALID_WACZ_PATH)

        assert 'valid_example_1-0.warc' in os.listdir(mgr.archive_dir)
        assert mgr.DEF_INDEX_FILE in os.listdir(mgr.indexes_dir)

        index_text = self._read_index(mgr)
        assert '"filename": "valid_example_1-0.warc"' in index_text

    def test_add_valid_wacz_unpacked_dupe_name(self, tmp_path):
        """Adding the same WACZ twice stores the second WARC with a -1 suffix."""
        mgr = self.get_test_collections_manager(tmp_path)

        # The second call collides with the WARC name produced by the first.
        mgr._add_wacz_unpacked(VALID_WACZ_PATH)
        mgr._add_wacz_unpacked(VALID_WACZ_PATH)

        stored = os.listdir(mgr.archive_dir)
        assert 'valid_example_1-0.warc' in stored
        assert 'valid_example_1-0-1.warc' in stored
        assert mgr.DEF_INDEX_FILE in os.listdir(mgr.indexes_dir)

        index_text = self._read_index(mgr)
        assert '"filename": "valid_example_1-0.warc"' in index_text
        assert '"filename": "valid_example_1-0-1.warc"' in index_text

    def test_add_invalid_wacz_unpacked(self, tmp_path, caplog):
        """An invalid WACZ adds no WARC, logs the reason, and is not indexed."""
        mgr = self.get_test_collections_manager(tmp_path)
        mgr._add_wacz_unpacked(INVALID_WACZ_PATH)

        assert 'invalid_example_1-0.warc' not in os.listdir(mgr.archive_dir)
        assert 'sample_archive/waczs/invalid_example_1.wacz does not contain any warc files.' in caplog.text

        # The index file may not exist at all; only inspect it when it does.
        if not os.path.exists(self._index_path(mgr)):
            return
        assert '"filename": "invalid_example_1-0.warc"' not in self._read_index(mgr)

    def test_add_valid_archives_unpack_wacz(self, tmp_path):
        """ARC/WARC files and an unpacked WACZ all land in archive dir and index."""
        mgr = self.get_test_collections_manager(tmp_path)
        sources = [
            'sample_archive/warcs/example.arc',
            'sample_archive/warcs/example.arc.gz',
            'sample_archive/warcs/example.warc',
            'sample_archive/warcs/example.warc.gz',
            'sample_archive/waczs/valid_example_1.wacz',
        ]
        mgr.add_archives(sources, unpack_wacz=True)

        index_text = self._read_index(mgr)

        # A WACZ is represented by the WARC unpacked from it, not by itself.
        expected_names = [
            'valid_example_1-0.warc' if name.endswith('wacz') else name
            for name in map(os.path.basename, sources)
        ]
        for name in expected_names:
            assert name in os.listdir(mgr.archive_dir)
            assert name in index_text

    def test_add_valid_archives_dupe_name(self, tmp_path):
        """Adding the same WARC twice keeps both, the second with a -1 suffix."""
        mgr = self.get_test_collections_manager(tmp_path)
        warc_path = 'sample_archive/warcs/example.warc.gz'
        mgr.add_archives([warc_path] * 2)

        index_text = self._read_index(mgr)

        for name in ('example.warc.gz', 'example-1.warc.gz'):
            assert name in os.listdir(mgr.archive_dir)
            assert name in index_text

    def test_add_valid_archives_dont_unpack_wacz(self, tmp_path):
        """A WACZ passed with ``unpack_wacz=False`` raises NotImplementedError."""
        mgr = self.get_test_collections_manager(tmp_path)
        sources = [
            'sample_archive/warcs/example.arc',
            'sample_archive/warcs/example.arc.gz',
            'sample_archive/warcs/example.warc',
            'sample_archive/warcs/example.warc.gz',
            'sample_archive/waczs/valid_example_1.wacz',
        ]

        with pytest.raises(NotImplementedError):
            mgr.add_archives(sources, unpack_wacz=False)

    def test_add_invalid_archives_unpack_wacz(self, tmp_path, caplog):
        """A non-archive input is skipped and reported; the valid WARC is added."""
        mgr = self.get_test_collections_manager(tmp_path)
        sources = [
            'sample_archive/warcs/example.warc',
            'sample_archive/text_content/sample.html',
        ]
        mgr.add_archives(sources, unpack_wacz=True)

        assert 'sample.html' not in os.listdir(mgr.archive_dir)
        assert 'example.warc' in os.listdir(mgr.archive_dir)
        assert "Invalid archives weren't added: sample_archive/text_content/sample.html" in caplog.messages

    def test_merge_wacz_index(self, tmp_path):
        """Merging a CDXJ index applies the filename mapping and stays sorted."""
        mgr = self.get_test_collections_manager(tmp_path)
        renames = {'example.warc.gz': 'rewritten.warc.gz'}
        mgr._add_wacz_index(self._index_path(mgr),
                            'sample_archive/cdxj/example.cdxj',
                            renames)

        merged = self._read_index(mgr).strip()

        assert 'example.warc.gz' not in merged
        assert 'rewritten.warc.gz' in merged

        # The merged collection index must come out in sorted line order.
        lines = merged.split('\n')
        assert sorted(lines) == lines

    def test_merge_wacz_index_gzip(self, tmp_path):
        """Merging the ``.gz`` index fixture applies the mapping and stays sorted."""
        mgr = self.get_test_collections_manager(tmp_path)
        renames = {'example-collection.warc': 'rewritten.warc'}
        mgr._add_wacz_index(self._index_path(mgr),
                            'sample_archive/cdxj/example.cdx.gz',
                            renames)

        merged = self._read_index(mgr).strip()

        assert 'example-collection.warc' not in merged
        assert 'rewritten.warc' in merged

        # The merged collection index must come out in sorted line order.
        lines = merged.split('\n')
        assert sorted(lines) == lines

    @staticmethod
    def _index_path(manager):
        """Return the path of the default index file in *manager*'s indexes dir."""
        return os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE)

    @staticmethod
    def _read_index(manager):
        """Return the full text of *manager*'s default index file."""
        with open(TestManager._index_path(manager), 'r') as index_file:
            return index_file.read()

    @staticmethod
    def get_test_collections_manager(collections_path):
        """Create the test collection under *collections_path*; return its manager."""
        test_manager = CollectionsManager(
            TEST_COLLECTION_NAME,
            colls_dir=collections_path,
            must_exist=False,
        )
        test_manager.add_collection()
        return test_manager
|