1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

[#799] wb-manager: Add wacz archives to collection with --uncompress-wacz (#800)

Add WACZ support for `wb-manager add` by unpacking WACZ files with --uncompress-wacz.

A future commit will add pywb support for WACZ files without requiring them to be unpacked.
This commit is contained in:
kuechensofa 2023-02-15 23:00:38 +01:00 committed by GitHub
parent b8693307d1
commit 454486bf75
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 224 additions and 22 deletions

View File

@ -45,7 +45,7 @@ The tool can be used while ``wayback`` is running, and pywb will detect many cha
It can be used to:
* Create a new collection -- ``wb-manager init <coll>``
* Add WARCs to collection -- ``wb-manager add <coll> <warc>``
* Add WARCs or WACZs to collection -- ``wb-manager add <coll> <warc/wacz>``
* Add override templates
* Add and remove metadata in a collection's ``metadata.yaml``
* List all collections

View File

@ -5,12 +5,15 @@ import logging
import heapq
import yaml
import re
import gzip
import six
from distutils.util import strtobool
from pkg_resources import resource_string, get_distribution
from argparse import ArgumentParser, RawTextHelpFormatter
from tempfile import mkdtemp
from zipfile import ZipFile
from pywb.utils.loaders import load_yaml_config
from warcio.timeutils import timestamp20_now
@ -47,6 +50,9 @@ directory structure expected by pywb
# Directory name used for collections when no ``colls_dir`` is passed to ``__init__``
COLLS_DIR = 'collections'
# Matches names ending in ``.arc`` or ``.warc``, optionally followed by ``.gz``
WARC_RX = re.compile(r'.*\.w?arc(\.gz)?$')
# Matches names ending in ``.wacz``
WACZ_RX = re.compile(r'.*\.wacz$')
def __init__(self, coll_name, colls_dir=None, must_exist=True):
colls_dir = colls_dir or self.COLLS_DIR
self.default_config = load_yaml_config(DEFAULT_CONFIG)
@ -115,29 +121,127 @@ directory structure expected by pywb
'To create a new collection, run\n\n{1} init {0}')
raise IOError(msg.format(self.coll_name, sys.argv[0]))
def add_warcs(self, warcs):
def add_archives(self, archives, uncompress_wacz=False):
if not os.path.isdir(self.archive_dir):
raise IOError('Directory {0} does not exist'.
format(self.archive_dir))
full_paths = []
duplicate_warcs = []
for filename in warcs:
filename = os.path.abspath(filename)
invalid_archives = []
warc_paths = []
for archive in archives:
if self.WARC_RX.match(archive):
full_path = self._add_warc(archive)
if full_path:
warc_paths.append(full_path)
elif self.WACZ_RX.match(archive):
if uncompress_wacz:
self._add_wacz_uncompressed(archive)
else:
raise NotImplementedError('Adding waczs without unpacking is not yet implemented. Use '
'\'--uncompress-wacz\' flag to add the wacz\'s content.')
else:
invalid_archives.append(archive)
# don't overwrite existing warcs with duplicate names
if os.path.exists(os.path.join(self.archive_dir, os.path.basename(filename))):
duplicate_warcs.append(filename)
self._index_merge_warcs(warc_paths, self.DEF_INDEX_FILE)
if invalid_archives:
logging.warning(f'Invalid archives weren\'t added: {", ".join(invalid_archives)}')
def _add_warc(self, warc):
    """Copy one ARC/WARC file into the collection's archive directory.

    An existing file with the same base name is never overwritten.

    :param warc: path of the file to copy
    :return: path of the copy inside ``self.archive_dir``, or ``None``
        when the file was skipped because of a duplicate name
    """
    filename = os.path.abspath(warc)
    # ``filename`` is absolute, so joining it directly onto archive_dir
    # would just return the source path; join the base name instead.
    full_path = os.path.join(self.archive_dir, os.path.basename(filename))

    # don't overwrite existing warcs with duplicate names
    if os.path.exists(full_path):
        logging.warning(f'Warc {filename} wasn\'t added because of duplicate name.')
        return None

    shutil.copy2(filename, self.archive_dir)
    logging.info('Copied ' + filename + ' to ' + self.archive_dir)
    return full_path
def _add_wacz_uncompressed(self, wacz):
    """Unpack a WACZ file and add its WARCs and indexes to the collection.

    WARC and CDX members are extracted to a temporary directory. Each WARC
    is copied into ``self.archive_dir`` under the name
    ``<wacz name>-<n><warc extension>``. The WACZ's own indexes are then
    passed to ``_add_wacz_index`` with a mapping from the original WARC
    names to the new ones. A WACZ with no WARC or no CDX members is
    skipped with a warning.

    :param wacz: path to the WACZ file
    """
    wacz = os.path.abspath(wacz)
    warc_regex = re.compile(r'.+\.warc(\.gz)?$')
    cdx_regex = re.compile(r'.+\.cdx(\.gz)?$')

    temp_dir = mkdtemp()
    try:
        with ZipFile(wacz, 'r') as wacz_zip_file:
            archive_members = wacz_zip_file.namelist()

            warc_files = [file for file in archive_members if warc_regex.match(file)]
            if not warc_files:
                logging.warning(f'WACZ {wacz} does not contain any warc files.')
                return

            cdx_files = [file for file in archive_members if cdx_regex.match(file)]
            if not cdx_files:
                logging.warning(f'WACZ {wacz} does not contain any indices.')
                return

            # extract warc and index files
            for member in warc_files + cdx_files:
                wacz_zip_file.extract(member, temp_dir)

        # copy extracted warc files to collections archive dir, use wacz filename as
        # filename with added index if multiple warc files exist
        wacz_basename, _ = os.path.splitext(os.path.basename(wacz))
        warc_filename_mapping = {}
        for idx, extracted_warc_file in enumerate(warc_files):
            _, warc_ext = os.path.splitext(extracted_warc_file)
            if warc_ext == '.gz':
                # splitext only returns the last suffix; restore the full one
                warc_ext = '.warc.gz'
            warc_filename = f'{wacz_basename}-{idx}{warc_ext}'
            warc_destination_path = os.path.join(self.archive_dir, warc_filename)

            if os.path.exists(warc_destination_path):
                logging.warning(f'Warc {warc_filename} wasn\'t added because of duplicate name.')
                continue

            warc_filename_mapping[os.path.basename(extracted_warc_file)] = warc_filename
            shutil.copy2(os.path.join(temp_dir, extracted_warc_file), warc_destination_path)

        # rewrite filenames in wacz indices and merge them with collection index file
        collection_index_path = os.path.join(self.indexes_dir, self.DEF_INDEX_FILE)
        for cdx_file in cdx_files:
            self._add_wacz_index(collection_index_path, os.path.join(temp_dir, cdx_file),
                                 warc_filename_mapping)
    finally:
        # delete temporary files, also on early return or error
        shutil.rmtree(temp_dir, ignore_errors=True)
@staticmethod
def _add_wacz_index(collection_index_path, wacz_index_path, filename_mapping):
    """Append a WACZ index to the collection index, rewriting WARC filenames.

    The collection index is copied to a temporary directory. Every line of
    the WACZ index is parsed as a ``CDXObject``, its ``filename`` is
    replaced when it appears in ``filename_mapping``, and it is appended in
    CDXJ form. The temporary file is then moved over the collection index.

    :param collection_index_path: path of the collection index file; it is
        created if it does not exist yet
    :param wacz_index_path: path of the extracted WACZ index; read through
        gzip when the name ends in ``.gz``
    :param filename_mapping: dict mapping WARC names used in the WACZ index
        to the names the WARCs have in the collection
    """
    from pywb.warcserver.index.cdxobject import CDXObject

    # work on a copy of the collection index in a temporary directory
    tempdir = mkdtemp()
    try:
        collection_index_name = os.path.basename(collection_index_path)
        collection_index_temp_path = os.path.join(tempdir, collection_index_name)

        if os.path.exists(collection_index_path):
            shutil.copy2(collection_index_path, collection_index_temp_path)

        open_index = gzip.open if wacz_index_path.endswith('.gz') else open

        # NOTE(review): lines are appended, not merged in sorted order --
        # confirm whether readers of this index expect sorted CDXJ.
        with open_index(wacz_index_path, 'rb') as wacz_index_file, \
                open(collection_index_temp_path, 'a') as collection_index_temp_file:
            collection_index_temp_file.write('\n')
            for line in wacz_index_file:
                cdx_object = CDXObject(cdxline=line)
                if cdx_object['filename'] in filename_mapping:
                    cdx_object['filename'] = filename_mapping[cdx_object['filename']]
                collection_index_temp_file.write(cdx_object.to_cdxj())

        # move the completed index back to its original location
        shutil.move(collection_index_temp_path, collection_index_path)
    finally:
        # remove the temporary directory even when parsing or I/O failed
        shutil.rmtree(tempdir, ignore_errors=True)
def reindex(self):
cdx_file = os.path.join(self.indexes_dir, self.DEF_INDEX_FILE)
@ -383,16 +487,17 @@ Create manage file based web archive collections
listcmd = subparsers.add_parser('list', help=list_help)
listcmd.set_defaults(func=do_list)
# Add Warcs or Waczs
def do_add(r):
    """Handle ``wb-manager add``: add the given files to the named collection."""
    m = CollectionsManager(r.coll_name)
    m.add_archives(r.files, r.uncompress_wacz)

# Register the 'add' subcommand exactly once; a second add_parser('add')
# call would conflict with the first.
add_archives_help = 'Copy ARCS/WARCS/WACZ to collection directory and reindex'
add_archives = subparsers.add_parser('add', help=add_archives_help)
add_archives.add_argument('--uncompress-wacz', dest='uncompress_wacz', action='store_true',
                          help='Unpack WACZ files and add their WARCs and indexes to the collection')
add_archives.add_argument('coll_name')
add_archives.add_argument('files', nargs='+')
add_archives.set_defaults(func=do_add)
# Reindex All
def do_reindex(r):

Binary file not shown.

Binary file not shown.

Binary file not shown.

97
tests/test_manager.py Normal file
View File

@ -0,0 +1,97 @@
import os
import pytest
from pywb.manager.manager import CollectionsManager
# Sample WACZ that the tests expect to be added successfully
VALID_WACZ_PATH = 'sample_archive/waczs/valid_example_1.wacz'
# Sample WACZ that the tests expect to be rejected for containing no warc files
INVALID_WACZ_PATH = 'sample_archive/waczs/invalid_example_1.wacz'
# Name of the collection created for each test
TEST_COLLECTION_NAME = 'test-col'
class TestManager:
    """Tests for adding WARC/WACZ archives through ``CollectionsManager``."""

    # Archive set shared by the mixed WARC/WACZ tests.
    MIXED_ARCHIVES = ['sample_archive/warcs/example.arc', 'sample_archive/warcs/example.arc.gz',
                      'sample_archive/warcs/example.warc', 'sample_archive/warcs/example.warc.gz',
                      'sample_archive/waczs/valid_example_1.wacz']

    @staticmethod
    def get_test_collections_manager(collections_path):
        """Create the test collection below ``collections_path`` and return its manager."""
        mgr = CollectionsManager(TEST_COLLECTION_NAME, colls_dir=collections_path, must_exist=False)
        mgr.add_collection()
        return mgr

    @staticmethod
    def _index_path(mgr):
        """Return the path of the collection's default index file."""
        return os.path.join(mgr.indexes_dir, mgr.DEF_INDEX_FILE)

    @classmethod
    def _read_index(cls, mgr):
        """Return the full text of the collection's default index file."""
        with open(cls._index_path(mgr), 'r') as index_file:
            return index_file.read()

    def test_add_valid_wacz_uncompressed(self, tmp_path):
        """A valid wacz yields a renamed warc and a matching index entry."""
        mgr = self.get_test_collections_manager(tmp_path)
        mgr._add_wacz_uncompressed(VALID_WACZ_PATH)

        assert 'valid_example_1-0.warc' in os.listdir(mgr.archive_dir)
        assert mgr.DEF_INDEX_FILE in os.listdir(mgr.indexes_dir)
        assert '"filename": "valid_example_1-0.warc"' in self._read_index(mgr)

    def test_add_invalid_wacz_uncompressed(self, tmp_path, caplog):
        """A wacz without warcs adds nothing and logs a warning."""
        mgr = self.get_test_collections_manager(tmp_path)
        mgr._add_wacz_uncompressed(INVALID_WACZ_PATH)

        assert 'invalid_example_1-0.warc' not in os.listdir(mgr.archive_dir)
        assert 'sample_archive/waczs/invalid_example_1.wacz does not contain any warc files.' in caplog.text

        # The index may not exist at all; if it does, it must not mention the warc.
        if os.path.exists(self._index_path(mgr)):
            assert '"filename": "invalid_example_1-0.warc"' not in self._read_index(mgr)

    def test_add_valid_archives_uncompressed_wacz(self, tmp_path):
        """Mixed warcs and an unpacked wacz are all copied and indexed."""
        mgr = self.get_test_collections_manager(tmp_path)
        archives = list(self.MIXED_ARCHIVES)
        mgr.add_archives(archives, uncompress_wacz=True)

        index_text = self._read_index(mgr)
        for path in archives:
            name = os.path.basename(path)
            # the wacz itself is not copied; its warc is stored under a derived name
            expected = 'valid_example_1-0.warc' if name.endswith('wacz') else name
            assert expected in os.listdir(mgr.archive_dir)
            assert expected in index_text

    def test_add_valid_archives_dont_uncompress_wacz(self, tmp_path):
        """Adding a wacz without unpacking is rejected."""
        mgr = self.get_test_collections_manager(tmp_path)
        archives = list(self.MIXED_ARCHIVES)

        with pytest.raises(NotImplementedError):
            mgr.add_archives(archives, uncompress_wacz=False)

    def test_add_invalid_archives_uncompress_wacz(self, tmp_path, caplog):
        """Non-archive files are skipped and reported; valid warcs are still added."""
        mgr = self.get_test_collections_manager(tmp_path)
        mgr.add_archives(['sample_archive/warcs/example.warc', 'sample_archive/text_content/sample.html'],
                         uncompress_wacz=True)

        archived = os.listdir(mgr.archive_dir)
        assert 'sample.html' not in archived
        assert 'example.warc' in os.listdir(mgr.archive_dir)
        assert "Invalid archives weren't added: sample_archive/text_content/sample.html" in caplog.messages

    def test_merge_wacz_index(self, tmp_path):
        """Filenames from a plain cdxj index are rewritten on merge."""
        mgr = self.get_test_collections_manager(tmp_path)
        mgr._add_wacz_index(self._index_path(mgr),
                            'sample_archive/cdxj/example.cdxj',
                            {'example.warc.gz': 'rewritten.warc.gz'})

        index_content = self._read_index(mgr)
        assert 'example.warc.gz' not in index_content
        assert 'rewritten.warc.gz' in index_content

    def test_merge_wacz_index_gzip(self, tmp_path):
        """Filenames from a gzip-compressed cdx index are rewritten on merge."""
        mgr = self.get_test_collections_manager(tmp_path)
        mgr._add_wacz_index(self._index_path(mgr),
                            'sample_archive/cdxj/example.cdx.gz',
                            {'example-collection.warc': 'rewritten.warc'})

        index_content = self._read_index(mgr)
        assert 'example-collection.warc' not in index_content
        assert 'rewritten.warc' in index_content