mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
Avoid name conflicts when adding WARCs to collection
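When a WARC with the same name already exists in the collection, append a numeric suffix (-1, -2, ...) to the new file's name, incrementing it until there is no conflict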
This commit is contained in:
parent
e89924bd39
commit
8d5b2be4c4
@@ -147,18 +147,29 @@ directory structure expected by pywb
|
|||||||
if invalid_archives:
|
if invalid_archives:
|
||||||
logging.warning(f'Invalid archives weren\'t added: {", ".join(invalid_archives)}')
|
logging.warning(f'Invalid archives weren\'t added: {", ".join(invalid_archives)}')
|
||||||
|
|
||||||
|
def _rename_warc(self, source_dir, warc_basename):
    """Return a variant of ``warc_basename`` that is unused in the archive dir.

    A ``-<n>`` suffix is inserted *before* the file extension (so
    ``example.warc`` becomes ``example-1.warc`` and ``example.warc.gz``
    becomes ``example-1.warc.gz``), with ``n`` counting up from 1 until no
    file of that name exists in ``self.archive_dir``.

    :param source_dir: directory the WARC is being copied from; accepted
        for interface compatibility but not consulted here.
    :param warc_basename: file name (no directory part) that conflicts
        with an existing file.
    :return: the first candidate file name that does not yet exist.
    """
    stem, extension = os.path.splitext(warc_basename)
    if extension.lower() == '.gz':
        # Treat a compressed archive's double extension (".warc.gz") as a
        # unit, so the suffix does not land between ".warc" and ".gz".
        stem, inner_extension = os.path.splitext(stem)
        extension = inner_extension + extension

    dupe_idx = 1
    while True:
        new_basename = f'{stem}-{dupe_idx}{extension}'
        if not os.path.exists(os.path.join(self.archive_dir, new_basename)):
            return new_basename
        dupe_idx += 1
|
||||||
|
|
||||||
def _add_warc(self, warc):
|
def _add_warc(self, warc):
|
||||||
filename = os.path.abspath(warc)
|
warc_source = os.path.abspath(warc)
|
||||||
|
source_dir, warc_basename = os.path.split(warc_source)
|
||||||
|
|
||||||
# don't overwrite existing warcs with duplicate names
|
# don't overwrite existing warcs with duplicate names
|
||||||
if os.path.exists(os.path.join(self.archive_dir, os.path.basename(filename))):
|
if os.path.exists(os.path.join(self.archive_dir, warc_basename)):
|
||||||
logging.warning(f'Warc {filename} wasn\'t added because of duplicate name.')
|
warc_basename = self._rename_warc(source_dir, warc_basename)
|
||||||
return None
|
logging.info(f'Warc {os.path.basename(warc)} already exists - renamed to {warc_basename}.')
|
||||||
|
|
||||||
shutil.copy2(filename, self.archive_dir)
|
warc_dest = os.path.join(self.archive_dir, warc_basename)
|
||||||
full_path = os.path.join(self.archive_dir, filename)
|
shutil.copy2(warc_source, warc_dest)
|
||||||
logging.info('Copied ' + filename + ' to ' + self.archive_dir)
|
logging.info(f'Copied {warc} to {self.archive_dir} as {warc_basename}')
|
||||||
return full_path
|
return warc_dest
|
||||||
|
|
||||||
def _add_wacz_unpacked(self, wacz):
|
def _add_wacz_unpacked(self, wacz):
|
||||||
wacz = os.path.abspath(wacz)
|
wacz = os.path.abspath(wacz)
|
||||||
|
@@ -20,6 +20,20 @@ class TestManager:
|
|||||||
with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f:
|
with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f:
|
||||||
assert '"filename": "valid_example_1-0.warc"' in f.read()
|
assert '"filename": "valid_example_1-0.warc"' in f.read()
|
||||||
|
|
||||||
|
def test_add_valid_wacz_unpacked_dupe_name(self, tmp_path):
    """Adding the same WACZ twice keeps both WARCs, the second one renamed."""
    manager = self.get_test_collections_manager(tmp_path)

    # Unpack the same WACZ twice so the second pass hits a name conflict.
    for _ in range(2):
        manager._add_wacz_unpacked(VALID_WACZ_PATH)

    archived_names = os.listdir(manager.archive_dir)
    assert 'valid_example_1-0.warc' in archived_names
    assert 'valid_example_1-0-1.warc' in archived_names

    assert manager.DEF_INDEX_FILE in os.listdir(manager.indexes_dir)

    index_path = os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE)
    with open(index_path, 'r') as index_file:
        index_text = index_file.read()

    # Both the original and the renamed copy must be indexed.
    assert '"filename": "valid_example_1-0.warc"' in index_text
    assert '"filename": "valid_example_1-0-1.warc"' in index_text
|
||||||
|
|
||||||
def test_add_invalid_wacz_unpacked(self, tmp_path, caplog):
|
def test_add_invalid_wacz_unpacked(self, tmp_path, caplog):
|
||||||
"""Test if adding an invalid wacz file to a collection fails"""
|
"""Test if adding an invalid wacz file to a collection fails"""
|
||||||
manager = self.get_test_collections_manager(tmp_path)
|
manager = self.get_test_collections_manager(tmp_path)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user