1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Fix dupe renaming and add additional test for warc.gz

This commit is contained in:
Tessa Walsh 2024-04-24 12:23:35 +02:00
parent ee15a3e06f
commit 52c5b84b1f
2 changed files with 20 additions and 1 deletions

View File

@ -7,6 +7,7 @@ import yaml
import re import re
import gzip import gzip
import six import six
import pathlib
from distutils.util import strtobool from distutils.util import strtobool
from pkg_resources import resource_string, get_distribution from pkg_resources import resource_string, get_distribution
@ -149,8 +150,11 @@ directory structure expected by pywb
def _rename_warc(self, warc_basename): def _rename_warc(self, warc_basename):
dupe_idx = 1 dupe_idx = 1
ext = ''.join(pathlib.Path(warc_basename).suffixes)
pre_ext_name = warc_basename.split(ext)[0]
while True: while True:
new_basename = f'{warc_basename}-{dupe_idx}' new_basename = f'{pre_ext_name}-{dupe_idx}{ext}'
if not os.path.exists(os.path.join(self.archive_dir, new_basename)): if not os.path.exists(os.path.join(self.archive_dir, new_basename)):
break break
dupe_idx += 1 dupe_idx += 1

View File

@ -65,6 +65,21 @@ class TestManager:
assert archive in os.listdir(manager.archive_dir) assert archive in os.listdir(manager.archive_dir)
assert archive in index_text assert archive in index_text
def test_add_valid_archives_dupe_name(self, tmp_path):
    """Add the same WARC twice and check both copies are kept and indexed.

    The second copy is expected to be stored with a ``-1`` counter placed
    before the full ``.warc.gz`` extension (``example-1.warc.gz``), and
    both file names must show up in the collection's default index file.
    """
    source_warc = 'sample_archive/warcs/example.warc.gz'
    manager = self.get_test_collections_manager(tmp_path)

    # Two identical adds: the second one is the duplicate-name case.
    for _ in range(2):
        manager.add_archives(source_warc)

    index_path = os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE)
    with open(index_path, 'r') as index_file:
        index_text = index_file.read()

    # Nothing modifies the archive directory past this point, so one
    # listing is enough for both expected names.
    stored_names = os.listdir(manager.archive_dir)
    for expected_name in ('example.warc.gz', 'example-1.warc.gz'):
        assert expected_name in stored_names
        assert expected_name in index_text
def test_add_valid_archives_dont_unpack_wacz(self, tmp_path): def test_add_valid_archives_dont_unpack_wacz(self, tmp_path):
manager = self.get_test_collections_manager(tmp_path) manager = self.get_test_collections_manager(tmp_path)
archives = ['sample_archive/warcs/example.arc', 'sample_archive/warcs/example.arc.gz', archives = ['sample_archive/warcs/example.arc', 'sample_archive/warcs/example.arc.gz',