mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
Add WACZ support for `wb-manager add` by unpacking WACZ files with --uncompress-wacz. A future commit will add pywb support for WACZ files without requiring them to be unpacked.
This commit is contained in:
parent
b8693307d1
commit
454486bf75
@ -45,7 +45,7 @@ The tool can be used while ``wayback`` is running, and pywb will detect many cha
|
||||
It can be used to:
|
||||
|
||||
* Create a new collection -- ``wb-manager init <coll>``
|
||||
* Add WARCs to collection -- ``wb-manager add <coll> <warc>``
|
||||
* Add WARCs or WACZs to collection -- ``wb-manager add <coll> <warc/wacz>``
|
||||
* Add override templates
|
||||
* Add and remove metadata to a collections ``metadata.yaml``
|
||||
* List all collections
|
||||
|
@ -5,12 +5,15 @@ import logging
|
||||
import heapq
|
||||
import yaml
|
||||
import re
|
||||
import gzip
|
||||
import six
|
||||
|
||||
from distutils.util import strtobool
|
||||
from pkg_resources import resource_string, get_distribution
|
||||
|
||||
from argparse import ArgumentParser, RawTextHelpFormatter
|
||||
from tempfile import mkdtemp
|
||||
from zipfile import ZipFile
|
||||
|
||||
from pywb.utils.loaders import load_yaml_config
|
||||
from warcio.timeutils import timestamp20_now
|
||||
@ -47,6 +50,9 @@ directory structure expected by pywb
|
||||
|
||||
COLLS_DIR = 'collections'
|
||||
|
||||
WARC_RX = re.compile(r'.*\.w?arc(\.gz)?$')
|
||||
WACZ_RX = re.compile(r'.*\.wacz$')
|
||||
|
||||
def __init__(self, coll_name, colls_dir=None, must_exist=True):
|
||||
colls_dir = colls_dir or self.COLLS_DIR
|
||||
self.default_config = load_yaml_config(DEFAULT_CONFIG)
|
||||
@ -115,29 +121,127 @@ directory structure expected by pywb
|
||||
'To create a new collection, run\n\n{1} init {0}')
|
||||
raise IOError(msg.format(self.coll_name, sys.argv[0]))
|
||||
|
||||
def add_warcs(self, warcs):
|
||||
def add_archives(self, archives, uncompress_wacz=False):
|
||||
if not os.path.isdir(self.archive_dir):
|
||||
raise IOError('Directory {0} does not exist'.
|
||||
format(self.archive_dir))
|
||||
|
||||
full_paths = []
|
||||
duplicate_warcs = []
|
||||
for filename in warcs:
|
||||
filename = os.path.abspath(filename)
|
||||
invalid_archives = []
|
||||
warc_paths = []
|
||||
for archive in archives:
|
||||
if self.WARC_RX.match(archive):
|
||||
full_path = self._add_warc(archive)
|
||||
if full_path:
|
||||
warc_paths.append(full_path)
|
||||
elif self.WACZ_RX.match(archive):
|
||||
if uncompress_wacz:
|
||||
self._add_wacz_uncompressed(archive)
|
||||
else:
|
||||
raise NotImplementedError('Adding waczs without unpacking is not yet implemented. Use '
|
||||
'\'--uncompress-wacz\' flag to add the wacz\'s content.')
|
||||
else:
|
||||
invalid_archives.append(archive)
|
||||
|
||||
self._index_merge_warcs(warc_paths, self.DEF_INDEX_FILE)
|
||||
|
||||
if invalid_archives:
|
||||
logging.warning(f'Invalid archives weren\'t added: {", ".join(invalid_archives)}')
|
||||
|
||||
def _add_warc(self, warc):
|
||||
filename = os.path.abspath(warc)
|
||||
|
||||
# don't overwrite existing warcs with duplicate names
|
||||
if os.path.exists(os.path.join(self.archive_dir, os.path.basename(filename))):
|
||||
duplicate_warcs.append(filename)
|
||||
continue
|
||||
logging.warning(f'Warc {filename} wasn\'t added because of duplicate name.')
|
||||
return None
|
||||
|
||||
shutil.copy2(filename, self.archive_dir)
|
||||
full_paths.append(os.path.join(self.archive_dir, filename))
|
||||
full_path = os.path.join(self.archive_dir, filename)
|
||||
logging.info('Copied ' + filename + ' to ' + self.archive_dir)
|
||||
return full_path
|
||||
|
||||
self._index_merge_warcs(full_paths, self.DEF_INDEX_FILE)
|
||||
def _add_wacz_uncompressed(self, wacz):
|
||||
wacz = os.path.abspath(wacz)
|
||||
temp_dir = mkdtemp()
|
||||
warc_regex = re.compile(r'.+\.warc(\.gz)?$')
|
||||
cdx_regex = re.compile(r'.+\.cdx(\.gz)?$')
|
||||
with ZipFile(wacz, 'r') as wacz_zip_file:
|
||||
archive_members = wacz_zip_file.namelist()
|
||||
warc_files = [file for file in archive_members if warc_regex.match(file)]
|
||||
if not warc_files:
|
||||
logging.warning(f'WACZ {wacz} does not contain any warc files.')
|
||||
return
|
||||
|
||||
if duplicate_warcs:
|
||||
logging.warning(f'Warcs {", ".join(duplicate_warcs)} weren\'t added because of duplicate names.')
|
||||
# extract warc files
|
||||
for warc_file in warc_files:
|
||||
wacz_zip_file.extract(warc_file, temp_dir)
|
||||
|
||||
cdx_files = [file for file in archive_members if cdx_regex.match(file)]
|
||||
if not cdx_files:
|
||||
logging.warning(f'WACZ {wacz} does not contain any indices.')
|
||||
return
|
||||
|
||||
for cdx_file in cdx_files:
|
||||
wacz_zip_file.extract(cdx_file, temp_dir)
|
||||
|
||||
# copy extracted warc files to collections archive dir, use wacz filename as filename with added index if
|
||||
# multiple warc files exist
|
||||
warc_filename_mapping = {}
|
||||
full_paths = []
|
||||
for idx, extracted_warc_file in enumerate(warc_files):
|
||||
_, warc_ext = os.path.splitext(extracted_warc_file)
|
||||
if warc_ext == '.gz':
|
||||
warc_ext = '.warc.gz'
|
||||
warc_filename = os.path.basename(wacz)
|
||||
warc_filename, _ = os.path.splitext(warc_filename)
|
||||
warc_filename = f'{warc_filename}-{idx}{warc_ext}'
|
||||
warc_destination_path = os.path.join(self.archive_dir, warc_filename)
|
||||
|
||||
if os.path.exists(warc_destination_path):
|
||||
logging.warning(f'Warc {warc_filename} wasn\'t added because of duplicate name.')
|
||||
continue
|
||||
|
||||
warc_filename_mapping[os.path.basename(extracted_warc_file)] = warc_filename
|
||||
shutil.copy2(os.path.join(temp_dir, extracted_warc_file), warc_destination_path)
|
||||
full_paths.append(warc_destination_path)
|
||||
|
||||
# rewrite filenames in wacz indices and merge them with collection index file
|
||||
for cdx_file in cdx_files:
|
||||
self._add_wacz_index(os.path.join(self.indexes_dir, self.DEF_INDEX_FILE), os.path.join(temp_dir, cdx_file),
|
||||
warc_filename_mapping)
|
||||
|
||||
# delete temporary files
|
||||
shutil.rmtree(temp_dir)
|
||||
|
||||
@staticmethod
|
||||
def _add_wacz_index(collection_index_path, wacz_index_path, filename_mapping):
|
||||
from pywb.warcserver.index.cdxobject import CDXObject
|
||||
|
||||
# copy collection index to temporary directory
|
||||
tempdir = mkdtemp()
|
||||
collection_index_name = os.path.basename(collection_index_path)
|
||||
collection_index_temp_path = os.path.join(tempdir, collection_index_name)
|
||||
|
||||
if os.path.exists(collection_index_path):
|
||||
shutil.copy2(collection_index_path, collection_index_temp_path)
|
||||
|
||||
with open(collection_index_temp_path, 'a') as collection_index_temp_file:
|
||||
if wacz_index_path.endswith('.gz'):
|
||||
wacz_index_file = gzip.open(wacz_index_path, 'rb')
|
||||
else:
|
||||
wacz_index_file = open(wacz_index_path, 'rb')
|
||||
collection_index_temp_file.write('\n')
|
||||
for line in wacz_index_file.readlines():
|
||||
cdx_object = CDXObject(cdxline=line)
|
||||
if cdx_object['filename'] in filename_mapping:
|
||||
cdx_object['filename'] = filename_mapping[cdx_object['filename']]
|
||||
collection_index_temp_file.write(cdx_object.to_cdxj())
|
||||
|
||||
wacz_index_file.close()
|
||||
|
||||
# copy temporary index back to original location and delete temporary directory
|
||||
shutil.move(collection_index_temp_path, collection_index_path)
|
||||
shutil.rmtree(tempdir)
|
||||
|
||||
def reindex(self):
|
||||
cdx_file = os.path.join(self.indexes_dir, self.DEF_INDEX_FILE)
|
||||
@ -383,16 +487,17 @@ Create manage file based web archive collections
|
||||
listcmd = subparsers.add_parser('list', help=list_help)
|
||||
listcmd.set_defaults(func=do_list)
|
||||
|
||||
# Add Warcs
|
||||
# Add Warcs or Waczs
|
||||
def do_add(r):
|
||||
m = CollectionsManager(r.coll_name)
|
||||
m.add_warcs(r.files)
|
||||
m.add_archives(r.files, r.uncompress_wacz)
|
||||
|
||||
addwarc_help = 'Copy ARCS/WARCS to collection directory and reindex'
|
||||
addwarc = subparsers.add_parser('add', help=addwarc_help)
|
||||
addwarc.add_argument('coll_name')
|
||||
addwarc.add_argument('files', nargs='+')
|
||||
addwarc.set_defaults(func=do_add)
|
||||
add_archives_help = 'Copy ARCS/WARCS/WACZ to collection directory and reindex'
|
||||
add_archives = subparsers.add_parser('add', help=add_archives_help)
|
||||
add_archives.add_argument('--uncompress-wacz', dest='uncompress_wacz', action='store_true')
|
||||
add_archives.add_argument('coll_name')
|
||||
add_archives.add_argument('files', nargs='+')
|
||||
add_archives.set_defaults(func=do_add)
|
||||
|
||||
# Reindex All
|
||||
def do_reindex(r):
|
||||
|
BIN
sample_archive/cdxj/example.cdx.gz
Normal file
BIN
sample_archive/cdxj/example.cdx.gz
Normal file
Binary file not shown.
BIN
sample_archive/waczs/invalid_example_1.wacz
Normal file
BIN
sample_archive/waczs/invalid_example_1.wacz
Normal file
Binary file not shown.
BIN
sample_archive/waczs/valid_example_1.wacz
Normal file
BIN
sample_archive/waczs/valid_example_1.wacz
Normal file
Binary file not shown.
97
tests/test_manager.py
Normal file
97
tests/test_manager.py
Normal file
@ -0,0 +1,97 @@
|
||||
import os
|
||||
|
||||
import pytest
|
||||
|
||||
from pywb.manager.manager import CollectionsManager
|
||||
|
||||
VALID_WACZ_PATH = 'sample_archive/waczs/valid_example_1.wacz'
|
||||
INVALID_WACZ_PATH = 'sample_archive/waczs/invalid_example_1.wacz'
|
||||
|
||||
TEST_COLLECTION_NAME = 'test-col'
|
||||
|
||||
|
||||
class TestManager:
|
||||
def test_add_valid_wacz_uncompressed(self, tmp_path):
|
||||
"""Test if adding a valid wacz file to a collection succeeds"""
|
||||
manager = self.get_test_collections_manager(tmp_path)
|
||||
manager._add_wacz_uncompressed(VALID_WACZ_PATH)
|
||||
assert 'valid_example_1-0.warc' in os.listdir(manager.archive_dir)
|
||||
assert manager.DEF_INDEX_FILE in os.listdir(manager.indexes_dir)
|
||||
with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f:
|
||||
assert '"filename": "valid_example_1-0.warc"' in f.read()
|
||||
|
||||
def test_add_invalid_wacz_uncompressed(self, tmp_path, caplog):
|
||||
"""Test if adding an invalid wacz file to a collection fails"""
|
||||
manager = self.get_test_collections_manager(tmp_path)
|
||||
manager._add_wacz_uncompressed(INVALID_WACZ_PATH)
|
||||
assert 'invalid_example_1-0.warc' not in os.listdir(manager.archive_dir)
|
||||
assert 'sample_archive/waczs/invalid_example_1.wacz does not contain any warc files.' in caplog.text
|
||||
|
||||
index_path = os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE)
|
||||
if os.path.exists(index_path):
|
||||
with open(index_path, 'r') as f:
|
||||
assert '"filename": "invalid_example_1-0.warc"' not in f.read()
|
||||
|
||||
def test_add_valid_archives_uncompressed_wacz(self, tmp_path):
|
||||
manager = self.get_test_collections_manager(tmp_path)
|
||||
archives = ['sample_archive/warcs/example.arc', 'sample_archive/warcs/example.arc.gz',
|
||||
'sample_archive/warcs/example.warc', 'sample_archive/warcs/example.warc.gz',
|
||||
'sample_archive/waczs/valid_example_1.wacz']
|
||||
manager.add_archives(archives, uncompress_wacz=True)
|
||||
|
||||
with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f:
|
||||
index_text = f.read()
|
||||
|
||||
for archive in archives:
|
||||
archive = os.path.basename(archive)
|
||||
|
||||
if archive.endswith('wacz'):
|
||||
archive = 'valid_example_1-0.warc'
|
||||
|
||||
assert archive in os.listdir(manager.archive_dir)
|
||||
assert archive in index_text
|
||||
|
||||
def test_add_valid_archives_dont_uncompress_wacz(self, tmp_path):
|
||||
manager = self.get_test_collections_manager(tmp_path)
|
||||
archives = ['sample_archive/warcs/example.arc', 'sample_archive/warcs/example.arc.gz',
|
||||
'sample_archive/warcs/example.warc', 'sample_archive/warcs/example.warc.gz',
|
||||
'sample_archive/waczs/valid_example_1.wacz']
|
||||
|
||||
with pytest.raises(NotImplementedError):
|
||||
manager.add_archives(archives, uncompress_wacz=False)
|
||||
|
||||
def test_add_invalid_archives_uncompress_wacz(self, tmp_path, caplog):
|
||||
manager = self.get_test_collections_manager(tmp_path)
|
||||
manager.add_archives(['sample_archive/warcs/example.warc', 'sample_archive/text_content/sample.html'],
|
||||
uncompress_wacz=True)
|
||||
assert 'sample.html' not in os.listdir(manager.archive_dir)
|
||||
assert 'example.warc' in os.listdir(manager.archive_dir)
|
||||
assert "Invalid archives weren't added: sample_archive/text_content/sample.html" in caplog.messages
|
||||
|
||||
def test_merge_wacz_index(self, tmp_path):
|
||||
manager = self.get_test_collections_manager(tmp_path)
|
||||
manager._add_wacz_index(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE),
|
||||
'sample_archive/cdxj/example.cdxj',
|
||||
{'example.warc.gz': 'rewritten.warc.gz'})
|
||||
with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f:
|
||||
index_content = f.read()
|
||||
|
||||
assert 'example.warc.gz' not in index_content
|
||||
assert 'rewritten.warc.gz' in index_content
|
||||
|
||||
def test_merge_wacz_index_gzip(self, tmp_path):
|
||||
manager = self.get_test_collections_manager(tmp_path)
|
||||
manager._add_wacz_index(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE),
|
||||
'sample_archive/cdxj/example.cdx.gz',
|
||||
{'example-collection.warc': 'rewritten.warc'})
|
||||
with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f:
|
||||
index_content = f.read()
|
||||
|
||||
assert 'example-collection.warc' not in index_content
|
||||
assert 'rewritten.warc' in index_content
|
||||
|
||||
@staticmethod
|
||||
def get_test_collections_manager(collections_path):
|
||||
manager = CollectionsManager(TEST_COLLECTION_NAME, colls_dir=collections_path, must_exist=False)
|
||||
manager.add_collection()
|
||||
return manager
|
Loading…
x
Reference in New Issue
Block a user