From 60f33412ff971c72f8c92e1c00cc4ca16e61e222 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Wed, 25 Feb 2015 13:19:20 -0800
Subject: [PATCH] collections manager: add new collections manager, first pass #74

add cli 'wb-manager' tool
very preliminary, needs testing still
---
 pywb/manager/__init__.py |   0
 pywb/manager/manager.py  | 139 +++++++++++++++++++++++++++++++++++++++
 setup.py                 |   1 +
 3 files changed, 140 insertions(+)
 create mode 100644 pywb/manager/__init__.py
 create mode 100644 pywb/manager/manager.py

diff --git a/pywb/manager/__init__.py b/pywb/manager/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/pywb/manager/manager.py b/pywb/manager/manager.py
new file mode 100644
index 00000000..9b7de7b9
--- /dev/null
+++ b/pywb/manager/manager.py
@@ -0,0 +1,139 @@
+import os
+import shutil
+import sys
+import logging
+
+from pywb.utils.loaders import load_yaml_config
+from pywb.warc.cdxindexer import main as cdxindexer_main
+from argparse import ArgumentParser, RawTextHelpFormatter
+
+
+#=============================================================================
+class CollectionsManager(object):
+    """ This utility is designed to
+simplify the creation and management of web archive collections
+
+It may be used via cmdline to setup and maintain the
+directory structure expected by pywb
+    """
+    def __init__(self, coll_name, root_dir='collections'):
+        """Compute the directory paths for collection coll_name.
+
+        Sub-directory names come from the 'paths' section of
+        pywb/default_config.yaml; no directories are created here.
+        """
+        self.root_dir = root_dir
+        self.default_config = load_yaml_config('pywb/default_config.yaml')
+        self.coll_name = coll_name
+
+        self.coll_dir = os.path.join(self.root_dir, coll_name)
+
+        self.warc_dir = self._get_dir('archive_paths')
+        self.cdx_dir = self._get_dir('index_paths')
+        self.static_dir = self._get_dir('static_path')
+        self.templates_dir = self._get_dir('templates_dir')
+
+    def _get_dir(self, name):
+        """Return the collection sub-directory configured under name."""
+        return os.path.join(self.coll_dir,
+                            self.default_config['paths'][name])
+
+    def _create_dir(self, dirname):
+        """Create dirname if missing; OSError is logged, not raised."""
+        if os.path.isdir(dirname):
+            return
+
+        try:
+            os.mkdir(dirname)
+        except OSError as exc:
+            # Stay best-effort, but do not report a failed mkdir
+            # as 'Created Dir'
+            logging.warning('Could not create Dir: ' + dirname +
+                            ' (' + str(exc) + ')')
+            return
+
+        logging.info('Created Dir: ' + dirname)
+
+    def add_collection(self):
+        """Create the collection directory and its sub-directories.
+
+        os.makedirs raises OSError if the collection dir already exists.
+        """
+        os.makedirs(self.coll_dir)
+        logging.info('Created directory: ' + self.coll_dir)
+
+        self._create_dir(self.warc_dir)
+        self._create_dir(self.cdx_dir)
+        self._create_dir(self.static_dir)
+        self._create_dir(self.templates_dir)
+
+    def add_warcs(self, warcs):
+        """Copy each file in warcs into the warc dir, then reindex.
+
+        Raises Exception if the warc dir does not exist.
+        """
+        if not os.path.isdir(self.warc_dir):
+            raise Exception('Directory ' + self.warc_dir + ' does not exist')
+
+        if not warcs:
+            print('No WARCs specified')
+            return
+
+        for filename in warcs:
+            shutil.copy2(filename, self.warc_dir)
+            logging.info('Copied ' + filename + ' to ' + self.warc_dir)
+
+        self.reindex()
+
+    def reindex(self):
+        """Run the cdx indexer over the warc dir, writing index.cdx."""
+        cdx_file = os.path.join(self.cdx_dir, 'index.cdx')
+        logging.info('Indexing ' + self.warc_dir + ' to ' + cdx_file)
+        cdxindexer_main(['-p', '-s', '-r', cdx_file, self.warc_dir])
+
+
+def main():
+    """Entry point for the 'wb-manager' command line tool."""
+    description = """
+Create and manage file-based web archive collections
+"""
+
+    epilog = """
+Some examples:
+
+* Create new collection 'my_coll'
+{0} --init my_coll
+
+* Add warc mywarc1.warc.gz to my_coll (The warc will be copied to the collection directory)
+{0} --addwarc my_coll mywarc1.warc.gz
+
+""".format(os.path.basename(sys.argv[0]))
+
+    logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s',
+                        level=logging.DEBUG)
+
+    parser = ArgumentParser(description=description,
+                            epilog=epilog,
+                            formatter_class=RawTextHelpFormatter)
+
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument('--init', action='store_true')
+    group.add_argument('--addwarc', action='store_true')
+    group.add_argument('--reindex', action='store_true')
+
+    parser.add_argument('name')
+    parser.add_argument('files', nargs='*')
+
+    r = parser.parse_args()
+
+    m = CollectionsManager(r.name)
+    if r.init:
+        m.add_collection()
+    elif r.addwarc:
+        m.add_warcs(r.files)
+    elif r.reindex:
+        m.reindex()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/setup.py b/setup.py
index 7ff067dc..23f4c23f 100755
--- a/setup.py
+++ b/setup.py
@@ -89,6 +89,7 @@ setup(
         cdx-indexer = pywb.warc.cdxindexer:main
         live-rewrite-server = pywb.apps.live_rewrite_server:main
         proxy-cert-auth = pywb.framework.certauth:main
+        wb-manager = pywb.manager.manager:main
         """,
     zip_safe=True,
     classifiers=[