1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Sort index when adding wacz archives (#820)

This commit is contained in:
kuechensofa 2023-11-23 18:10:52 +01:00 committed by GitHub
parent 6b4f9b323e
commit f40e7ef18c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 38 additions and 29 deletions

View File

@ -12,7 +12,7 @@ from distutils.util import strtobool
from pkg_resources import resource_string, get_distribution from pkg_resources import resource_string, get_distribution
from argparse import ArgumentParser, RawTextHelpFormatter from argparse import ArgumentParser, RawTextHelpFormatter
from tempfile import mkdtemp from tempfile import mkdtemp, TemporaryDirectory
from zipfile import ZipFile from zipfile import ZipFile
from pywb.utils.loaders import load_yaml_config from pywb.utils.loaders import load_yaml_config
@ -213,35 +213,35 @@ directory structure expected by pywb
# delete temporary files # delete temporary files
shutil.rmtree(temp_dir) shutil.rmtree(temp_dir)
@staticmethod def _add_wacz_index(self, collection_index_path, wacz_index_path, filename_mapping):
def _add_wacz_index(collection_index_path, wacz_index_path, filename_mapping):
from pywb.warcserver.index.cdxobject import CDXObject from pywb.warcserver.index.cdxobject import CDXObject
# copy collection index to temporary directory # rewrite wacz index to temporary index file
tempdir = mkdtemp() tempdir = TemporaryDirectory()
collection_index_name = os.path.basename(collection_index_path) wacz_index_name = os.path.basename(wacz_index_path)
collection_index_temp_path = os.path.join(tempdir, collection_index_name) rewritten_index_path = os.path.join(tempdir.name, wacz_index_name)
if os.path.exists(collection_index_path): with open(rewritten_index_path, 'w') as rewritten_index:
shutil.copy2(collection_index_path, collection_index_temp_path)
with open(collection_index_temp_path, 'a') as collection_index_temp_file:
if wacz_index_path.endswith('.gz'): if wacz_index_path.endswith('.gz'):
wacz_index_file = gzip.open(wacz_index_path, 'rb') wacz_index = gzip.open(wacz_index_path, 'rb')
else: else:
wacz_index_file = open(wacz_index_path, 'rb') wacz_index = open(wacz_index_path, 'rb')
collection_index_temp_file.write('\n')
for line in wacz_index_file.readlines(): for line in wacz_index:
cdx_object = CDXObject(cdxline=line) cdx_object = CDXObject(cdxline=line)
if cdx_object['filename'] in filename_mapping: if cdx_object['filename'] in filename_mapping:
cdx_object['filename'] = filename_mapping[cdx_object['filename']] cdx_object['filename'] = filename_mapping[cdx_object['filename']]
collection_index_temp_file.write(cdx_object.to_cdxj()) rewritten_index.write(cdx_object.to_cdxj())
wacz_index_file.close() if not os.path.isfile(collection_index_path):
shutil.move(rewritten_index_path, collection_index_path)
return
# copy temporary index back to original location and delete temporary directory temp_coll_index_path = collection_index_path + '.tmp.' + timestamp20_now()
shutil.move(collection_index_temp_path, collection_index_path) self._merge_indices(collection_index_path, rewritten_index_path, temp_coll_index_path)
shutil.rmtree(tempdir) shutil.move(temp_coll_index_path, collection_index_path)
tempdir.cleanup()
def reindex(self): def reindex(self):
cdx_file = os.path.join(self.indexes_dir, self.DEF_INDEX_FILE) cdx_file = os.path.join(self.indexes_dir, self.DEF_INDEX_FILE)
@ -294,20 +294,24 @@ directory structure expected by pywb
merged_file = temp_file + '.merged' merged_file = temp_file + '.merged'
last_line = None self._merge_indices(cdx_file, temp_file, merged_file)
with open(cdx_file, 'rb') as orig_index:
with open(temp_file, 'rb') as new_index:
with open(merged_file, 'w+b') as merged:
for line in heapq.merge(orig_index, new_index):
if last_line != line:
merged.write(line)
last_line = line
shutil.move(merged_file, cdx_file) shutil.move(merged_file, cdx_file)
#os.rename(merged_file, cdx_file) #os.rename(merged_file, cdx_file)
os.remove(temp_file) os.remove(temp_file)
@staticmethod
def _merge_indices(index1, index2, dest):
last_line = None
with open(index1, 'rb') as index1_f:
with open(index2, 'rb') as index2_f:
with open(dest, 'wb') as dest_f:
for line in heapq.merge(index1_f, index2_f):
if last_line != line:
dest_f.write(line)
last_line = line
def set_metadata(self, namevalue_pairs): def set_metadata(self, namevalue_pairs):
metadata_yaml = os.path.join(self.curr_coll_dir, 'metadata.yaml') metadata_yaml = os.path.join(self.curr_coll_dir, 'metadata.yaml')
metadata = None metadata = None

View File

@ -75,10 +75,15 @@ class TestManager:
{'example.warc.gz': 'rewritten.warc.gz'}) {'example.warc.gz': 'rewritten.warc.gz'})
with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f: with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f:
index_content = f.read() index_content = f.read()
index_content = index_content.strip()
assert 'example.warc.gz' not in index_content assert 'example.warc.gz' not in index_content
assert 'rewritten.warc.gz' in index_content assert 'rewritten.warc.gz' in index_content
# check that collection index is sorted
index_lines = index_content.split('\n')
assert sorted(index_lines) == index_lines
def test_merge_wacz_index_gzip(self, tmp_path): def test_merge_wacz_index_gzip(self, tmp_path):
manager = self.get_test_collections_manager(tmp_path) manager = self.get_test_collections_manager(tmp_path)
manager._add_wacz_index(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), manager._add_wacz_index(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE),