From 454486bf75ec57abbf3427fd4c4e356aa7106424 Mon Sep 17 00:00:00 2001 From: kuechensofa <89413714+kuechensofa@users.noreply.github.com> Date: Wed, 15 Feb 2023 23:00:38 +0100 Subject: [PATCH] [#799] wb-manager: Add wacz archives to collection with --uncompress-wacz (#800) Add WACZ support for `wb-manager add` by unpacking WACZ files with --uncompress-wacz. A future commit will add pywb support for WACZ files without requiring them to be unpacked. --- docs/manual/apps.rst | 2 +- pywb/manager/manager.py | 147 +++++++++++++++++--- sample_archive/cdxj/example.cdx.gz | Bin 0 -> 194 bytes sample_archive/waczs/invalid_example_1.wacz | Bin 0 -> 485 bytes sample_archive/waczs/valid_example_1.wacz | Bin 0 -> 4186 bytes tests/test_manager.py | 97 +++++++++++++ 6 files changed, 224 insertions(+), 22 deletions(-) create mode 100644 sample_archive/cdxj/example.cdx.gz create mode 100644 sample_archive/waczs/invalid_example_1.wacz create mode 100644 sample_archive/waczs/valid_example_1.wacz create mode 100644 tests/test_manager.py diff --git a/docs/manual/apps.rst b/docs/manual/apps.rst index 630d5657..4c7f1b99 100644 --- a/docs/manual/apps.rst +++ b/docs/manual/apps.rst @@ -45,7 +45,7 @@ The tool can be used while ``wayback`` is running, and pywb will detect many cha It can be used to: * Create a new collection -- ``wb-manager init `` -* Add WARCs to collection -- ``wb-manager add `` +* Add WARCs or WACZs to collection -- ``wb-manager add `` * Add override templates * Add and remove metadata to a collections ``metadata.yaml`` * List all collections diff --git a/pywb/manager/manager.py b/pywb/manager/manager.py index 18c76ee7..40a8bef8 100644 --- a/pywb/manager/manager.py +++ b/pywb/manager/manager.py @@ -5,12 +5,15 @@ import logging import heapq import yaml import re +import gzip import six from distutils.util import strtobool from pkg_resources import resource_string, get_distribution from argparse import ArgumentParser, RawTextHelpFormatter +from tempfile import mkdtemp +from zipfile import ZipFile from pywb.utils.loaders import load_yaml_config from warcio.timeutils import timestamp20_now @@ -47,6 +50,9 @@ directory structure expected by pywb COLLS_DIR = 'collections' + WARC_RX = re.compile(r'.*\.w?arc(\.gz)?$') + WACZ_RX = re.compile(r'.*\.wacz$') + def __init__(self, coll_name, colls_dir=None, must_exist=True): colls_dir = colls_dir or self.COLLS_DIR self.default_config = load_yaml_config(DEFAULT_CONFIG) @@ -115,29 +121,127 @@ directory structure expected by pywb 'To create a new collection, run\n\n{1} init {0}') raise IOError(msg.format(self.coll_name, sys.argv[0])) - def add_warcs(self, warcs): + def add_archives(self, archives, uncompress_wacz=False): if not os.path.isdir(self.archive_dir): raise IOError('Directory {0} does not exist'. format(self.archive_dir)) - full_paths = [] - duplicate_warcs = [] - for filename in warcs: - filename = os.path.abspath(filename) + invalid_archives = [] + warc_paths = [] + for archive in archives: + if self.WARC_RX.match(archive): + full_path = self._add_warc(archive) + if full_path: + warc_paths.append(full_path) + elif self.WACZ_RX.match(archive): + if uncompress_wacz: + self._add_wacz_uncompressed(archive) + else: + raise NotImplementedError('Adding waczs without unpacking is not yet implemented. Use ' + '\'--uncompress-wacz\' flag to add the wacz\'s content.') + else: + invalid_archives.append(archive) - # don't overwrite existing warcs with duplicate names - if os.path.exists(os.path.join(self.archive_dir, os.path.basename(filename))): - duplicate_warcs.append(filename) + self._index_merge_warcs(warc_paths, self.DEF_INDEX_FILE) + + if invalid_archives: + logging.warning(f'Invalid archives weren\'t added: {", ".join(invalid_archives)}') + + def _add_warc(self, warc): + filename = os.path.abspath(warc) + + # don't overwrite existing warcs with duplicate names + if os.path.exists(os.path.join(self.archive_dir, os.path.basename(filename))): + logging.warning(f'Warc {filename} wasn\'t added because of duplicate name.') + return None + + shutil.copy2(filename, self.archive_dir) + full_path = os.path.join(self.archive_dir, filename) + logging.info('Copied ' + filename + ' to ' + self.archive_dir) + return full_path + + def _add_wacz_uncompressed(self, wacz): + wacz = os.path.abspath(wacz) + temp_dir = mkdtemp() + warc_regex = re.compile(r'.+\.warc(\.gz)?$') + cdx_regex = re.compile(r'.+\.cdx(\.gz)?$') + with ZipFile(wacz, 'r') as wacz_zip_file: + archive_members = wacz_zip_file.namelist() + warc_files = [file for file in archive_members if warc_regex.match(file)] + if not warc_files: + logging.warning(f'WACZ {wacz} does not contain any warc files.') + return + + # extract warc files + for warc_file in warc_files: + wacz_zip_file.extract(warc_file, temp_dir) + + cdx_files = [file for file in archive_members if cdx_regex.match(file)] + if not cdx_files: + logging.warning(f'WACZ {wacz} does not contain any indices.') + return + + for cdx_file in cdx_files: + wacz_zip_file.extract(cdx_file, temp_dir) + + # copy extracted warc files to collections archive dir, use wacz filename as filename with added index if + # multiple warc files exist + warc_filename_mapping = {} + full_paths = [] + for idx, extracted_warc_file in enumerate(warc_files): + _, warc_ext = os.path.splitext(extracted_warc_file) + if warc_ext == '.gz': + warc_ext = '.warc.gz' + warc_filename = os.path.basename(wacz) + warc_filename, _ = os.path.splitext(warc_filename) + warc_filename = f'{warc_filename}-{idx}{warc_ext}' + warc_destination_path = os.path.join(self.archive_dir, warc_filename) + + if os.path.exists(warc_destination_path): + logging.warning(f'Warc {warc_filename} wasn\'t added because of duplicate name.') continue - shutil.copy2(filename, self.archive_dir) - full_paths.append(os.path.join(self.archive_dir, filename)) - logging.info('Copied ' + filename + ' to ' + self.archive_dir) + warc_filename_mapping[os.path.basename(extracted_warc_file)] = warc_filename + shutil.copy2(os.path.join(temp_dir, extracted_warc_file), warc_destination_path) + full_paths.append(warc_destination_path) - self._index_merge_warcs(full_paths, self.DEF_INDEX_FILE) + # rewrite filenames in wacz indices and merge them with collection index file + for cdx_file in cdx_files: + self._add_wacz_index(os.path.join(self.indexes_dir, self.DEF_INDEX_FILE), os.path.join(temp_dir, cdx_file), + warc_filename_mapping) - if duplicate_warcs: - logging.warning(f'Warcs {", ".join(duplicate_warcs)} weren\'t added because of duplicate names.') + # delete temporary files + shutil.rmtree(temp_dir) + + @staticmethod + def _add_wacz_index(collection_index_path, wacz_index_path, filename_mapping): + from pywb.warcserver.index.cdxobject import CDXObject + + # copy collection index to temporary directory + tempdir = mkdtemp() + collection_index_name = os.path.basename(collection_index_path) + collection_index_temp_path = os.path.join(tempdir, collection_index_name) + + if os.path.exists(collection_index_path): + shutil.copy2(collection_index_path, collection_index_temp_path) + + with open(collection_index_temp_path, 'a') as collection_index_temp_file: + if wacz_index_path.endswith('.gz'): + wacz_index_file = gzip.open(wacz_index_path, 'rb') + else: + wacz_index_file = open(wacz_index_path, 'rb') + collection_index_temp_file.write('\n') + for line in wacz_index_file.readlines(): + cdx_object = CDXObject(cdxline=line) + if cdx_object['filename'] in filename_mapping: + cdx_object['filename'] = filename_mapping[cdx_object['filename']] + collection_index_temp_file.write(cdx_object.to_cdxj()) + + wacz_index_file.close() + + # copy temporary index back to original location and delete temporary directory + shutil.move(collection_index_temp_path, collection_index_path) + shutil.rmtree(tempdir) def reindex(self): cdx_file = os.path.join(self.indexes_dir, self.DEF_INDEX_FILE) @@ -383,16 +487,17 @@ Create manage file based web archive collections listcmd = subparsers.add_parser('list', help=list_help) listcmd.set_defaults(func=do_list) - # Add Warcs + # Add Warcs or Waczs def do_add(r): m = CollectionsManager(r.coll_name) - m.add_warcs(r.files) + m.add_archives(r.files, r.uncompress_wacz) - addwarc_help = 'Copy ARCS/WARCS to collection directory and reindex' - addwarc = subparsers.add_parser('add', help=addwarc_help) - addwarc.add_argument('coll_name') - addwarc.add_argument('files', nargs='+') - addwarc.set_defaults(func=do_add) + add_archives_help = 'Copy ARCS/WARCS/WACZ to collection directory and reindex' + add_archives = subparsers.add_parser('add', help=add_archives_help) + add_archives.add_argument('--uncompress-wacz', dest='uncompress_wacz', action='store_true') + add_archives.add_argument('coll_name') + add_archives.add_argument('files', nargs='+') + add_archives.set_defaults(func=do_add) # Reindex All def do_reindex(r): diff --git a/sample_archive/cdxj/example.cdx.gz b/sample_archive/cdxj/example.cdx.gz new file mode 100644 index 0000000000000000000000000000000000000000..9746596354b1ae906d30eea0ef4eff41804d97cc GIT binary patch literal 194 zcmV;z06qU7iwFP!000006D`h33&JoEhT*%vqU0vjCY@HR-t^#J7mC}l#B_ngh#jdQ z{&yYi_2C`9k=D5CnVOqcZSH^o;R_Lg4-oX67kk__8#=5&ybU?8>)LB#Pg=v7>o8Lq zMG2}W@&b(}tT4q#MGQg^T*_QkNN`-lu-wMsbQ0{AFrFue{R~FQsTZy{`*XCSTB~c0 w1=T(b0uq~LmHOTH!!AKqexvk1-MrM1R%<1(v{$cA?~=V+A3rTrcisR100UxLlmGw# literal 0 HcmV?d00001 diff --git a/sample_archive/waczs/invalid_example_1.wacz b/sample_archive/waczs/invalid_example_1.wacz new file mode 100644 index 0000000000000000000000000000000000000000..f5dc1abf3a8fb83bddc7863224bb4816d6cd69e5 GIT binary patch literal 485 zcmWIWW@Zs#U|`^2xVbGVuxfAV@>(EoArK2N$S|ZNmLwJ=CTAz6r|M-D=jVlna56Ce z>Yg6|8;DCQxEUB(zA`c}fK4#%xy^ORK)|Jb>6^{h%vkdGE7|2tI?KA0_a?{H3H94o z{xIPwJ}1(Vx$U86!P^4~$7)^vI=F9mtsr=Q{gTe>AHx)qzSX}GP;=(~s9&8s?OyGy z!;;01xhtOVia*X+Yond2&*dNG5#(`LP2RH^=Sx`=9!A z)CLKIqu?%LzpkPdFqEj3Cs!01eg-cL*M~JI+rf09^G$DamITHGG7U>-adYO(k=W`^ z*MQD&&SLKK=bT#y^PH`5;|=SnY3YXc+x`Uel$Qf&>2Mj$Uel87x6728sbwlRrV!6` zOgd6Gzz^9}$xE*75k(7>fxFaIsqWC3^_v;ob2RI$n(m%=rQb{jXI%?Bqf+ZDX0DJv zalz!6lp!ZjMx8;*#?GU98{(W1k;7g%NMd(SrNIJL<5kcZJ2Rz;!|eo z&?Z@tC+FxLUNbqmQ}B|O_d7zzjP=4OFB4e=nQ7ZgOY`P$tfM!bBt369_2jbFV$52~ z#Gg3K^~PQGl-Za*E4HSYAGmhEA4ET38n(ryzEAomIsyX047lEM@;${83Ic+k%7sS< zb+)j!afge+QBZqFJNR`A2Rl2sg{zGNQuH}~?$6{9G1%)uqsG9%Q}b0nvNOBGk9J<= z%9kehmFVPRf=EciW|L#{pOqfUsmdbqcITFEQM&O?4*#Of!OT$Cmuks z|4N&uakF^i_s2kD1(|PBiv8v)-~G85u>
    q0%`8tUolxAM2-?rnA>GM=SPqLD`! zeFz|$KS!dB<6VVxuzg7CB{qwk+^@!_RthUD>;22@ncs$RZ%sGW8En2&6_u-%Ps9QQ z{c?@=LAF^{0M27--yhf@&hwnw6Y~jo4zDy3*h5T+@i+sc@@Sqi(-5dANV>|c z$8)Y%>%E?_Rd1zjb);|va|{%e$5VajjRNmd19;v)*X>o}4Kv%_4lIur=IuOzk%DFR=D!r7EWW%gp^;4Bny&c3huo)d6pewFC6%V$)xqO(hp(i!e9T+FE6ALs=- z4=736b=9so5H2%JoiMAoGRXBP>Le)2NqY-!{#QQTAP8NCM=YucVYQ&RoTee7r{*GERqau`3zp^7i${? zm*jq8wFG*DA=#Xm`Y7r}Oy=zfOT&H!{UV4z8$j!@_-}T$ZXnmDLO#pHx#Kp zustOi*aP^;y-*^fNZ=u(RR&!N9{}kKpmkFc=XIPj=qM7A%yVYk%`98Hw>GJ$H<4&m z+dYaUoH5~r@IhC9(2EDda1tqsJC~^_qu5vdJG5%6`jN}t{(;_7Z25a%S8rr0#}*>^ zCvS^rAqFL_o>DBz5oM>thp&luOiP5Ndt78cfFnaA!Pa>Y4m8{Dw6}OMFWF7I;EJ}WyZt=XKLET~uG1Ul89Pfs>%^?j#^V1?9MOQAX6lMbA*4 z55f38mT^iZIq*R}?-lu?;C>H-R!a)X+~|zgDf^I@CN{zJM%}yulJxK#-ItnP#6VyS z9DM!N(<`TP0E+$(Y3HU=0RsStpN~E4?ta62X4Pg~fW}S}RYY3WlFV&4qiG!LVIN~v zi(ixvt_{uRGxyXj4|)&F=eT16viIS51K$X(Q9b`KL!2$4O3D@7{aJo0!3Y&$T<2Jp zS3jrLa#H#w*6hn7a42oe7;Q?5-@jJRSIyV{3Ay->^ zrxvp)&clgzpA98BVQQ*DYSMB-bP1eiKo8BJc@vLMV@<6$Z`EG6#xn~#VKctUj*CZczed}0d zH-4wZXTf!tJvc=Apb?Dvks2jU%bf0VX>SUy>c$@l_Ohvf* z;_|lM!Y0r2*h~GMOJ29prBAj{79W!|95TDK%NxX(zZOzjgz9%FtOeg$?wj51lYf=D z9f9W3_VF5sX#9S1^!|BrtWbF4r>WhB$EHw?4b{@|`f-ej5LWK}%`bJ=N=~Ft|JVK?NM3rm|MkKFo79Cv`43;q;__-E>7L(w6Vh;9zA?G4&!!o<Q-86;-z{?Uh$*D z!=ACUL0_@vt`5|s>F_b7m1o_lN9UnOl3f}=C_v@hmuaEWI~JuN<;^`I!PCtw=u%~2 zx*d{!TUh43O>$}AH65_^qp&M)cUDL4`B?C4D{i*$S$n7*+S+s&GL$c_*&72#YzS_ zw-H7Zw~9<&D)>)t1eE9B3(R3+n;zg7RAo3%VF+!A zfhe^*SL4cz%UA;OJB|e>QP(gOo72c73aE{|UDTURK+WT^uyW5(BCPUKG}0#2^3{3c z{MA*3I5Dh5xFs7Pt(pG1tZe<9v$jF(x;gm z&>GU9U~c>vo+);WR++uOk5fAXLmyU7~i9$+4-0)p5N?}=7vvhg(zJ^J-iSTP3 zhWOW~^g-4ZtRy59DVG28B1e>Ug8QwN0~4YnDC1O$Wl zctE3#RJX@u28f8ILQXz28LR6A5JjMc!|I<3#6WKc2Ir zFgX**NhG2q22|P;+aryyOfRsBEv&``VY1VtMjsrhFK}TJ4w2+pFgCv|qXM>b2&dM5 zgmljNZxs>?+_yP_LSoAYfVbcmh&pp``TVaE7&1L@{9Sk z{T~7>3>IYLvV|Ygsa|!qS3DJ&%^|EJnoEe_pljAjqy8E$WX-O%8!=|U;vd`oA>!yP z!EX6MY>)D`2M1vO4HxZLh@Ot5%H+ z!f~zghPeu{#Con>Ph^DjU1KqNJEia3O(6|6r4iYYfKbTUi0WG;q3x~!8jRV zQ2&p3-f{kHW$v`npms;`2X#+Ak9sDiCI>ZPCWp^XeZy#XMr{?I6x4-(ev7qQKs)F- zr!no^%+z)(nQ1-)=xP!Y0g3-_>Wfe3FY(9c_kT&@??m`75C2Xif43(f@F&Fm!vmgH z{$>8}Oz%(gynuK-g#U;4{pE~|;NR-$PiOvw9#jMb|ER9NoB#X|@V^`TFLN>Ke{OSK UO%lM*H%Rb@CLZe#nx9Yq0fAk9TmS$7 literal 0 HcmV?d00001 diff --git a/tests/test_manager.py b/tests/test_manager.py new file mode 100644 index 00000000..285e64f8 --- /dev/null +++ b/tests/test_manager.py @@ -0,0 +1,97 @@ +import os + +import pytest + +from pywb.manager.manager import CollectionsManager + +VALID_WACZ_PATH = 'sample_archive/waczs/valid_example_1.wacz' +INVALID_WACZ_PATH = 'sample_archive/waczs/invalid_example_1.wacz' + +TEST_COLLECTION_NAME = 'test-col' + + +class TestManager: + def test_add_valid_wacz_uncompressed(self, tmp_path): + """Test if adding a valid wacz file to a collection succeeds""" + manager = self.get_test_collections_manager(tmp_path) + manager._add_wacz_uncompressed(VALID_WACZ_PATH) + assert 'valid_example_1-0.warc' in os.listdir(manager.archive_dir) + assert manager.DEF_INDEX_FILE in os.listdir(manager.indexes_dir) + with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f: + assert '"filename": "valid_example_1-0.warc"' in f.read() + + def test_add_invalid_wacz_uncompressed(self, tmp_path, caplog): + """Test if adding an invalid wacz file to a collection fails""" + manager = self.get_test_collections_manager(tmp_path) + manager._add_wacz_uncompressed(INVALID_WACZ_PATH) + assert 'invalid_example_1-0.warc' not in os.listdir(manager.archive_dir) + assert 'sample_archive/waczs/invalid_example_1.wacz does not contain any warc files.' in caplog.text + + index_path = os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE) + if os.path.exists(index_path): + with open(index_path, 'r') as f: + assert '"filename": "invalid_example_1-0.warc"' not in f.read() + + def test_add_valid_archives_uncompressed_wacz(self, tmp_path): + manager = self.get_test_collections_manager(tmp_path) + archives = ['sample_archive/warcs/example.arc', 'sample_archive/warcs/example.arc.gz', + 'sample_archive/warcs/example.warc', 'sample_archive/warcs/example.warc.gz', + 'sample_archive/waczs/valid_example_1.wacz'] + manager.add_archives(archives, uncompress_wacz=True) + + with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f: + index_text = f.read() + + for archive in archives: + archive = os.path.basename(archive) + + if archive.endswith('wacz'): + archive = 'valid_example_1-0.warc' + + assert archive in os.listdir(manager.archive_dir) + assert archive in index_text + + def test_add_valid_archives_dont_uncompress_wacz(self, tmp_path): + manager = self.get_test_collections_manager(tmp_path) + archives = ['sample_archive/warcs/example.arc', 'sample_archive/warcs/example.arc.gz', + 'sample_archive/warcs/example.warc', 'sample_archive/warcs/example.warc.gz', + 'sample_archive/waczs/valid_example_1.wacz'] + + with pytest.raises(NotImplementedError): + manager.add_archives(archives, uncompress_wacz=False) + + def test_add_invalid_archives_uncompress_wacz(self, tmp_path, caplog): + manager = self.get_test_collections_manager(tmp_path) + manager.add_archives(['sample_archive/warcs/example.warc', 'sample_archive/text_content/sample.html'], + uncompress_wacz=True) + assert 'sample.html' not in os.listdir(manager.archive_dir) + assert 'example.warc' in os.listdir(manager.archive_dir) + assert "Invalid archives weren't added: sample_archive/text_content/sample.html" in caplog.messages + + def test_merge_wacz_index(self, tmp_path): + manager = self.get_test_collections_manager(tmp_path) + manager._add_wacz_index(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), + 'sample_archive/cdxj/example.cdxj', + {'example.warc.gz': 'rewritten.warc.gz'}) + with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f: + index_content = f.read() + + assert 'example.warc.gz' not in index_content + assert 'rewritten.warc.gz' in index_content + + def test_merge_wacz_index_gzip(self, tmp_path): + manager = self.get_test_collections_manager(tmp_path) + manager._add_wacz_index(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), + 'sample_archive/cdxj/example.cdx.gz', + {'example-collection.warc': 'rewritten.warc'}) + with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f: + index_content = f.read() + + assert 'example-collection.warc' not in index_content + assert 'rewritten.warc' in index_content + + @staticmethod + def get_test_collections_manager(collections_path): + manager = CollectionsManager(TEST_COLLECTION_NAME, colls_dir=collections_path, must_exist=False) + manager.add_collection() + return manager