mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
98 lines
4.7 KiB
Python
98 lines
4.7 KiB
Python
|
import os
|
||
|
|
||
|
import pytest
|
||
|
|
||
|
from pywb.manager.manager import CollectionsManager
|
||
|
|
||
|
VALID_WACZ_PATH = 'sample_archive/waczs/valid_example_1.wacz'
|
||
|
INVALID_WACZ_PATH = 'sample_archive/waczs/invalid_example_1.wacz'
|
||
|
|
||
|
TEST_COLLECTION_NAME = 'test-col'
|
||
|
|
||
|
|
||
|
class TestManager:
|
||
|
def test_add_valid_wacz_uncompressed(self, tmp_path):
|
||
|
"""Test if adding a valid wacz file to a collection succeeds"""
|
||
|
manager = self.get_test_collections_manager(tmp_path)
|
||
|
manager._add_wacz_uncompressed(VALID_WACZ_PATH)
|
||
|
assert 'valid_example_1-0.warc' in os.listdir(manager.archive_dir)
|
||
|
assert manager.DEF_INDEX_FILE in os.listdir(manager.indexes_dir)
|
||
|
with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f:
|
||
|
assert '"filename": "valid_example_1-0.warc"' in f.read()
|
||
|
|
||
|
def test_add_invalid_wacz_uncompressed(self, tmp_path, caplog):
|
||
|
"""Test if adding an invalid wacz file to a collection fails"""
|
||
|
manager = self.get_test_collections_manager(tmp_path)
|
||
|
manager._add_wacz_uncompressed(INVALID_WACZ_PATH)
|
||
|
assert 'invalid_example_1-0.warc' not in os.listdir(manager.archive_dir)
|
||
|
assert 'sample_archive/waczs/invalid_example_1.wacz does not contain any warc files.' in caplog.text
|
||
|
|
||
|
index_path = os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE)
|
||
|
if os.path.exists(index_path):
|
||
|
with open(index_path, 'r') as f:
|
||
|
assert '"filename": "invalid_example_1-0.warc"' not in f.read()
|
||
|
|
||
|
def test_add_valid_archives_uncompressed_wacz(self, tmp_path):
|
||
|
manager = self.get_test_collections_manager(tmp_path)
|
||
|
archives = ['sample_archive/warcs/example.arc', 'sample_archive/warcs/example.arc.gz',
|
||
|
'sample_archive/warcs/example.warc', 'sample_archive/warcs/example.warc.gz',
|
||
|
'sample_archive/waczs/valid_example_1.wacz']
|
||
|
manager.add_archives(archives, uncompress_wacz=True)
|
||
|
|
||
|
with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f:
|
||
|
index_text = f.read()
|
||
|
|
||
|
for archive in archives:
|
||
|
archive = os.path.basename(archive)
|
||
|
|
||
|
if archive.endswith('wacz'):
|
||
|
archive = 'valid_example_1-0.warc'
|
||
|
|
||
|
assert archive in os.listdir(manager.archive_dir)
|
||
|
assert archive in index_text
|
||
|
|
||
|
def test_add_valid_archives_dont_uncompress_wacz(self, tmp_path):
|
||
|
manager = self.get_test_collections_manager(tmp_path)
|
||
|
archives = ['sample_archive/warcs/example.arc', 'sample_archive/warcs/example.arc.gz',
|
||
|
'sample_archive/warcs/example.warc', 'sample_archive/warcs/example.warc.gz',
|
||
|
'sample_archive/waczs/valid_example_1.wacz']
|
||
|
|
||
|
with pytest.raises(NotImplementedError):
|
||
|
manager.add_archives(archives, uncompress_wacz=False)
|
||
|
|
||
|
def test_add_invalid_archives_uncompress_wacz(self, tmp_path, caplog):
|
||
|
manager = self.get_test_collections_manager(tmp_path)
|
||
|
manager.add_archives(['sample_archive/warcs/example.warc', 'sample_archive/text_content/sample.html'],
|
||
|
uncompress_wacz=True)
|
||
|
assert 'sample.html' not in os.listdir(manager.archive_dir)
|
||
|
assert 'example.warc' in os.listdir(manager.archive_dir)
|
||
|
assert "Invalid archives weren't added: sample_archive/text_content/sample.html" in caplog.messages
|
||
|
|
||
|
def test_merge_wacz_index(self, tmp_path):
|
||
|
manager = self.get_test_collections_manager(tmp_path)
|
||
|
manager._add_wacz_index(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE),
|
||
|
'sample_archive/cdxj/example.cdxj',
|
||
|
{'example.warc.gz': 'rewritten.warc.gz'})
|
||
|
with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f:
|
||
|
index_content = f.read()
|
||
|
|
||
|
assert 'example.warc.gz' not in index_content
|
||
|
assert 'rewritten.warc.gz' in index_content
|
||
|
|
||
|
def test_merge_wacz_index_gzip(self, tmp_path):
|
||
|
manager = self.get_test_collections_manager(tmp_path)
|
||
|
manager._add_wacz_index(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE),
|
||
|
'sample_archive/cdxj/example.cdx.gz',
|
||
|
{'example-collection.warc': 'rewritten.warc'})
|
||
|
with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f:
|
||
|
index_content = f.read()
|
||
|
|
||
|
assert 'example-collection.warc' not in index_content
|
||
|
assert 'rewritten.warc' in index_content
|
||
|
|
||
|
@staticmethod
|
||
|
def get_test_collections_manager(collections_path):
|
||
|
manager = CollectionsManager(TEST_COLLECTION_NAME, colls_dir=collections_path, must_exist=False)
|
||
|
manager.add_collection()
|
||
|
return manager
|