1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-14 15:53:28 +01:00

Handle WARC filename conflicts with wb-manager add (#902)

Append a numeric suffix (-1, -2, ...) to the filename, just before its extension, incrementing it until the name no longer conflicts with an existing file

Also adds tests that document this behavior
This commit is contained in:
Tessa Walsh 2024-04-24 08:09:02 -04:00 committed by GitHub
parent e89924bd39
commit b9f1609df9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 54 additions and 10 deletions

View File

@ -7,6 +7,7 @@ import yaml
import re import re
import gzip import gzip
import six import six
import pathlib
from distutils.util import strtobool from distutils.util import strtobool
from pkg_resources import resource_string, get_distribution from pkg_resources import resource_string, get_distribution
@ -147,18 +148,32 @@ directory structure expected by pywb
if invalid_archives: if invalid_archives:
logging.warning(f'Invalid archives weren\'t added: {", ".join(invalid_archives)}') logging.warning(f'Invalid archives weren\'t added: {", ".join(invalid_archives)}')
def _rename_warc(self, warc_basename):
dupe_idx = 1
ext = ''.join(pathlib.Path(warc_basename).suffixes)
pre_ext_name = warc_basename.split(ext)[0]
while True:
new_basename = f'{pre_ext_name}-{dupe_idx}{ext}'
if not os.path.exists(os.path.join(self.archive_dir, new_basename)):
break
dupe_idx += 1
return new_basename
def _add_warc(self, warc): def _add_warc(self, warc):
filename = os.path.abspath(warc) warc_source = os.path.abspath(warc)
source_dir, warc_basename = os.path.split(warc_source)
# don't overwrite existing warcs with duplicate names # don't overwrite existing warcs with duplicate names
if os.path.exists(os.path.join(self.archive_dir, os.path.basename(filename))): if os.path.exists(os.path.join(self.archive_dir, warc_basename)):
logging.warning(f'Warc {filename} wasn\'t added because of duplicate name.') warc_basename = self._rename_warc(warc_basename)
return None logging.info(f'Warc {os.path.basename(warc)} already exists - renamed to {warc_basename}.')
shutil.copy2(filename, self.archive_dir) warc_dest = os.path.join(self.archive_dir, warc_basename)
full_path = os.path.join(self.archive_dir, filename) shutil.copy2(warc_source, warc_dest)
logging.info('Copied ' + filename + ' to ' + self.archive_dir) logging.info(f'Copied {warc} to {self.archive_dir} as {warc_basename}')
return full_path return warc_dest
def _add_wacz_unpacked(self, wacz): def _add_wacz_unpacked(self, wacz):
wacz = os.path.abspath(wacz) wacz = os.path.abspath(wacz)
@ -198,8 +213,9 @@ directory structure expected by pywb
warc_destination_path = os.path.join(self.archive_dir, warc_filename) warc_destination_path = os.path.join(self.archive_dir, warc_filename)
if os.path.exists(warc_destination_path): if os.path.exists(warc_destination_path):
logging.warning(f'Warc {warc_filename} wasn\'t added because of duplicate name.') warc_filename = self._rename_warc(warc_filename)
continue logging.info(f'Warc {warc_destination_path} already exists - renamed to {warc_filename}.')
warc_destination_path = os.path.join(self.archive_dir, warc_filename)
warc_filename_mapping[os.path.basename(extracted_warc_file)] = warc_filename warc_filename_mapping[os.path.basename(extracted_warc_file)] = warc_filename
shutil.copy2(os.path.join(temp_dir, extracted_warc_file), warc_destination_path) shutil.copy2(os.path.join(temp_dir, extracted_warc_file), warc_destination_path)

View File

@ -20,6 +20,20 @@ class TestManager:
with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f: with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f:
assert '"filename": "valid_example_1-0.warc"' in f.read() assert '"filename": "valid_example_1-0.warc"' in f.read()
def test_add_valid_wacz_unpacked_dupe_name(self, tmp_path):
    """Adding the same WACZ twice keeps both WARCs, the second with a -1 suffix."""
    manager = self.get_test_collections_manager(tmp_path)

    # The second pass hits a file name that already exists in the archive dir.
    for _ in range(2):
        manager._add_wacz_unpacked(VALID_WACZ_PATH)

    archived = os.listdir(manager.archive_dir)
    assert 'valid_example_1-0.warc' in archived
    assert 'valid_example_1-0-1.warc' in archived
    assert manager.DEF_INDEX_FILE in os.listdir(manager.indexes_dir)

    index_path = os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE)
    with open(index_path, 'r') as f:
        index_text = f.read()

    # Both the original and the renamed WARC must be referenced by the index.
    assert '"filename": "valid_example_1-0.warc"' in index_text
    assert '"filename": "valid_example_1-0-1.warc"' in index_text
def test_add_invalid_wacz_unpacked(self, tmp_path, caplog): def test_add_invalid_wacz_unpacked(self, tmp_path, caplog):
"""Test if adding an invalid wacz file to a collection fails""" """Test if adding an invalid wacz file to a collection fails"""
manager = self.get_test_collections_manager(tmp_path) manager = self.get_test_collections_manager(tmp_path)
@ -51,6 +65,20 @@ class TestManager:
assert archive in os.listdir(manager.archive_dir) assert archive in os.listdir(manager.archive_dir)
assert archive in index_text assert archive in index_text
def test_add_valid_archives_dupe_name(self, tmp_path):
    """Adding the same WARC path twice stores and indexes a renamed second copy."""
    manager = self.get_test_collections_manager(tmp_path)
    warc_filename = 'sample_archive/warcs/example.warc.gz'
    manager.add_archives([warc_filename] * 2)

    index_path = os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE)
    with open(index_path, 'r') as f:
        index_text = f.read()

    # The first copy keeps its name; the duplicate gets the -1 suffix.
    for archive in ('example.warc.gz', 'example-1.warc.gz'):
        assert archive in os.listdir(manager.archive_dir)
        assert archive in index_text
def test_add_valid_archives_dont_unpack_wacz(self, tmp_path): def test_add_valid_archives_dont_unpack_wacz(self, tmp_path):
manager = self.get_test_collections_manager(tmp_path) manager = self.get_test_collections_manager(tmp_path)
archives = ['sample_archive/warcs/example.arc', 'sample_archive/warcs/example.arc.gz', archives = ['sample_archive/warcs/example.arc', 'sample_archive/warcs/example.arc.gz',