mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-14 15:53:28 +01:00
Rename --uncompress-wacz to --unpack-wacz and add docs (#901)
Also adds help text for wb-manager add --unpack-wacz option in CLI
This commit is contained in:
parent
b4c91c6633
commit
e89924bd39
@ -45,7 +45,8 @@ The tool can be used while ``wayback`` is running, and pywb will detect many cha
|
||||
It can be used to:
|
||||
|
||||
* Create a new collection -- ``wb-manager init <coll>``
|
||||
* Add WARCs or WACZs to collection -- ``wb-manager add <coll> <warc/wacz>``
|
||||
* Add WARCs to collection -- ``wb-manager add <coll> <warc>``
|
||||
* Unpack WACZs to add their WARCs and indices to collection -- ``wb-manager add --unpack-wacz <coll> <wacz>``
|
||||
* Add override templates
|
||||
* Add and remove metadata to a collection's ``metadata.yaml``
|
||||
* List all collections
|
||||
|
@ -114,6 +114,8 @@ Using Existing Web Archive Collections
|
||||
Existing archives of WARC/ARC files can be used with pywb with a minimal amount of setup. By using ``wb-manager add``,
|
||||
WARC/ARC files will automatically be placed in the collection archive directory and indexed.
|
||||
|
||||
In pywb 2.8.0 and later, preliminary support for WACZ files is also added with ``wb-manager add --unpack-wacz``. This will unpack the provided WACZ file, adding its WARCs and indices to the collection.
|
||||
|
||||
By default, ``wb-manager`` places new collections in the ``collections/<coll name>`` subdirectory of the current working directory. To specify a different root directory, use ``wb-manager -d <dir>``. Other options can be set in the config file.
|
||||
|
||||
If you have a large number of existing CDX index files, pywb will be able to read them as well after running through a simple conversion process.
|
||||
|
@ -121,7 +121,7 @@ directory structure expected by pywb
|
||||
'To create a new collection, run\n\n{1} init {0}')
|
||||
raise IOError(msg.format(self.coll_name, sys.argv[0]))
|
||||
|
||||
def add_archives(self, archives, uncompress_wacz=False):
|
||||
def add_archives(self, archives, unpack_wacz=False):
|
||||
if not os.path.isdir(self.archive_dir):
|
||||
raise IOError('Directory {0} does not exist'.
|
||||
format(self.archive_dir))
|
||||
@ -134,11 +134,11 @@ directory structure expected by pywb
|
||||
if full_path:
|
||||
warc_paths.append(full_path)
|
||||
elif self.WACZ_RX.match(archive):
|
||||
if uncompress_wacz:
|
||||
self._add_wacz_uncompressed(archive)
|
||||
if unpack_wacz:
|
||||
self._add_wacz_unpacked(archive)
|
||||
else:
|
||||
raise NotImplementedError('Adding waczs without unpacking is not yet implemented. Use '
|
||||
'\'--uncompress-wacz\' flag to add the wacz\'s content.')
|
||||
'\'--unpack-wacz\' flag to add the wacz\'s content.')
|
||||
else:
|
||||
invalid_archives.append(archive)
|
||||
|
||||
@ -160,7 +160,7 @@ directory structure expected by pywb
|
||||
logging.info('Copied ' + filename + ' to ' + self.archive_dir)
|
||||
return full_path
|
||||
|
||||
def _add_wacz_uncompressed(self, wacz):
|
||||
def _add_wacz_unpacked(self, wacz):
|
||||
wacz = os.path.abspath(wacz)
|
||||
temp_dir = mkdtemp()
|
||||
warc_regex = re.compile(r'.+\.warc(\.gz)?$')
|
||||
@ -494,11 +494,17 @@ Create manage file based web archive collections
|
||||
# Add Warcs or Waczs
|
||||
def do_add(r):
|
||||
m = CollectionsManager(r.coll_name)
|
||||
m.add_archives(r.files, r.uncompress_wacz)
|
||||
m.add_archives(r.files, r.unpack_wacz)
|
||||
|
||||
add_archives_help = 'Copy ARCS/WARCS/WACZ to collection directory and reindex'
|
||||
add_archives_help = 'Copy ARCs/WARCs to collection directory and reindex'
|
||||
add_unpack_wacz_help = 'Copy WARCs from WACZ to collection directory and reindex'
|
||||
add_archives = subparsers.add_parser('add', help=add_archives_help)
|
||||
add_archives.add_argument('--uncompress-wacz', dest='uncompress_wacz', action='store_true')
|
||||
add_archives.add_argument(
|
||||
'--unpack-wacz',
|
||||
dest='unpack_wacz',
|
||||
action='store_true',
|
||||
help=add_unpack_wacz_help
|
||||
)
|
||||
add_archives.add_argument('coll_name')
|
||||
add_archives.add_argument('files', nargs='+')
|
||||
add_archives.set_defaults(func=do_add)
|
||||
|
@ -11,19 +11,19 @@ TEST_COLLECTION_NAME = 'test-col'
|
||||
|
||||
|
||||
class TestManager:
|
||||
def test_add_valid_wacz_uncompressed(self, tmp_path):
|
||||
def test_add_valid_wacz_unpacked(self, tmp_path):
|
||||
"""Test if adding a valid wacz file to a collection succeeds"""
|
||||
manager = self.get_test_collections_manager(tmp_path)
|
||||
manager._add_wacz_uncompressed(VALID_WACZ_PATH)
|
||||
manager._add_wacz_unpacked(VALID_WACZ_PATH)
|
||||
assert 'valid_example_1-0.warc' in os.listdir(manager.archive_dir)
|
||||
assert manager.DEF_INDEX_FILE in os.listdir(manager.indexes_dir)
|
||||
with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f:
|
||||
assert '"filename": "valid_example_1-0.warc"' in f.read()
|
||||
|
||||
def test_add_invalid_wacz_uncompressed(self, tmp_path, caplog):
|
||||
def test_add_invalid_wacz_unpacked(self, tmp_path, caplog):
|
||||
"""Test if adding an invalid wacz file to a collection fails"""
|
||||
manager = self.get_test_collections_manager(tmp_path)
|
||||
manager._add_wacz_uncompressed(INVALID_WACZ_PATH)
|
||||
manager._add_wacz_unpacked(INVALID_WACZ_PATH)
|
||||
assert 'invalid_example_1-0.warc' not in os.listdir(manager.archive_dir)
|
||||
assert 'sample_archive/waczs/invalid_example_1.wacz does not contain any warc files.' in caplog.text
|
||||
|
||||
@ -32,12 +32,12 @@ class TestManager:
|
||||
with open(index_path, 'r') as f:
|
||||
assert '"filename": "invalid_example_1-0.warc"' not in f.read()
|
||||
|
||||
def test_add_valid_archives_uncompressed_wacz(self, tmp_path):
|
||||
def test_add_valid_archives_unpack_wacz(self, tmp_path):
|
||||
manager = self.get_test_collections_manager(tmp_path)
|
||||
archives = ['sample_archive/warcs/example.arc', 'sample_archive/warcs/example.arc.gz',
|
||||
'sample_archive/warcs/example.warc', 'sample_archive/warcs/example.warc.gz',
|
||||
'sample_archive/waczs/valid_example_1.wacz']
|
||||
manager.add_archives(archives, uncompress_wacz=True)
|
||||
manager.add_archives(archives, unpack_wacz=True)
|
||||
|
||||
with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f:
|
||||
index_text = f.read()
|
||||
@ -51,19 +51,19 @@ class TestManager:
|
||||
assert archive in os.listdir(manager.archive_dir)
|
||||
assert archive in index_text
|
||||
|
||||
def test_add_valid_archives_dont_uncompress_wacz(self, tmp_path):
|
||||
def test_add_valid_archives_dont_unpack_wacz(self, tmp_path):
|
||||
manager = self.get_test_collections_manager(tmp_path)
|
||||
archives = ['sample_archive/warcs/example.arc', 'sample_archive/warcs/example.arc.gz',
|
||||
'sample_archive/warcs/example.warc', 'sample_archive/warcs/example.warc.gz',
|
||||
'sample_archive/waczs/valid_example_1.wacz']
|
||||
|
||||
with pytest.raises(NotImplementedError):
|
||||
manager.add_archives(archives, uncompress_wacz=False)
|
||||
manager.add_archives(archives, unpack_wacz=False)
|
||||
|
||||
def test_add_invalid_archives_uncompress_wacz(self, tmp_path, caplog):
|
||||
def test_add_invalid_archives_unpack_wacz(self, tmp_path, caplog):
|
||||
manager = self.get_test_collections_manager(tmp_path)
|
||||
manager.add_archives(['sample_archive/warcs/example.warc', 'sample_archive/text_content/sample.html'],
|
||||
uncompress_wacz=True)
|
||||
unpack_wacz=True)
|
||||
assert 'sample.html' not in os.listdir(manager.archive_dir)
|
||||
assert 'example.warc' in os.listdir(manager.archive_dir)
|
||||
assert "Invalid archives weren't added: sample_archive/text_content/sample.html" in caplog.messages
|
||||
@ -91,10 +91,15 @@ class TestManager:
|
||||
{'example-collection.warc': 'rewritten.warc'})
|
||||
with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f:
|
||||
index_content = f.read()
|
||||
index_content = index_content.strip()
|
||||
|
||||
assert 'example-collection.warc' not in index_content
|
||||
assert 'rewritten.warc' in index_content
|
||||
|
||||
# check that collection index is sorted
|
||||
index_lines = index_content.split('\n')
|
||||
assert sorted(index_lines) == index_lines
|
||||
|
||||
@staticmethod
|
||||
def get_test_collections_manager(collections_path):
|
||||
manager = CollectionsManager(TEST_COLLECTION_NAME, colls_dir=collections_path, must_exist=False)
|
||||
|
Loading…
x
Reference in New Issue
Block a user