1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

collections manager: add new collections manager, first pass #74

add cli 'wb-manager' tool
very preliminary, needs testing still
This commit is contained in:
Ilya Kreymer 2015-02-25 13:19:20 -08:00
parent 69613a0e25
commit 60f33412ff
3 changed files with 115 additions and 0 deletions

0
pywb/manager/__init__.py Normal file
View File

114
pywb/manager/manager.py Normal file
View File

@ -0,0 +1,114 @@
import os
import shutil
import sys
import logging
from pywb.utils.loaders import load_yaml_config
from pywb.warc.cdxindexer import main as cdxindexer_main
from argparse import ArgumentParser, RawTextHelpFormatter
#=============================================================================
class CollectionsManager(object):
    """Create and maintain the directory layout of a web archive collection.

    May be used from the command line to set up and maintain the directory
    structure expected by pywb. A collection lives in
    ``<root_dir>/<coll_name>``; the names of its sub-directories (WARC
    archives, CDX indexes, static files, templates) are read from the
    ``paths`` section of ``pywb/default_config.yaml``.
    """

    def __init__(self, coll_name, root_dir='collections'):
        """Compute the paths for a collection; nothing is created on disk.

        :param coll_name: name of the collection (used as directory name)
        :param root_dir: directory under which all collections are stored
        """
        self.root_dir = root_dir
        self.default_config = load_yaml_config('pywb/default_config.yaml')
        self.coll_name = coll_name

        self.coll_dir = os.path.join(self.root_dir, coll_name)

        self.warc_dir = self._get_dir('archive_paths')
        self.cdx_dir = self._get_dir('index_paths')
        self.static_dir = self._get_dir('static_path')
        self.templates_dir = self._get_dir('templates_dir')

    def _get_dir(self, name):
        """Return the collection sub-directory configured under ``paths[name]``."""
        return os.path.join(self.coll_dir,
                            self.default_config['paths'][name])

    def _create_dir(self, dirname):
        """Create ``dirname`` on a best-effort basis.

        A failure to create the directory is logged rather than raised, so
        that setting up the remaining directories can continue.
        """
        try:
            os.mkdir(dirname)
        except OSError as exc:
            # Only report success when mkdir actually succeeded.
            if os.path.isdir(dirname):
                logging.info('Dir already exists: %s', dirname)
            else:
                logging.warning('Could not create dir %s: %s', dirname, exc)
            return

        logging.info('Created Dir: %s', dirname)

    def add_collection(self):
        """Create the collection directory and all of its sub-directories.

        :raises OSError: if the collection directory cannot be created
            (``os.makedirs`` fails, e.g. because it already exists)
        """
        os.makedirs(self.coll_dir)
        logging.info('Created directory: %s', self.coll_dir)

        self._create_dir(self.warc_dir)
        self._create_dir(self.cdx_dir)
        self._create_dir(self.static_dir)
        self._create_dir(self.templates_dir)

    def add_warcs(self, warcs):
        """Copy the given WARC files into the collection, then reindex it.

        :param warcs: iterable of paths of WARC files to copy
        :raises Exception: if the collection's archive directory is missing
        """
        if not os.path.isdir(self.warc_dir):
            raise Exception('Directory ' + self.warc_dir + ' does not exist')

        if not warcs:
            print('No WARCs specified')
            return

        for filename in warcs:
            shutil.copy2(filename, self.warc_dir)
            logging.info('Copied %s to %s', filename, self.warc_dir)

        # Rebuild the index once, after all files have been copied.
        self.reindex()

    def reindex(self):
        """Run the cdx indexer over the archive directory.

        The output is written to ``index.cdx`` in the collection's index
        directory.
        """
        cdx_file = os.path.join(self.cdx_dir, 'index.cdx')
        logging.info('Indexing %s to %s', self.warc_dir, cdx_file)
        # NOTE(review): the flag meanings are defined by
        # pywb.warc.cdxindexer -- confirm there before changing them.
        cdxindexer_main(['-p', '-s', '-r', cdx_file, self.warc_dir])
def main():
    """Command-line entry point for managing file-based collections.

    Parses ``sys.argv`` and runs exactly one action on the named collection:
    ``--init`` creates it, ``--addwarc`` copies the listed WARC files into
    it and reindexes, ``--reindex`` rebuilds its index. If no action is
    given, the help text is printed.
    """
    description = """
Create and manage file-based web archive collections
"""

    # The examples must match the flags defined on the parser below.
    epilog = """
Some examples:

* Create new collection 'my_coll'
{0} --init my_coll

* Add warc mywarc1.warc.gz to my_coll (The warc will be copied to the collection directory)
{0} --addwarc my_coll mywarc1.warc.gz

* Rebuild the index of my_coll
{0} --reindex my_coll
""".format(os.path.basename(sys.argv[0]))

    logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s',
                        level=logging.DEBUG)

    parser = ArgumentParser(description=description,
                            epilog=epilog,
                            formatter_class=RawTextHelpFormatter)

    # At most one action may be selected per invocation.
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--init', action='store_true',
                       help='create a new collection')
    group.add_argument('--addwarc', action='store_true',
                       help='copy WARC files into the collection and reindex')
    group.add_argument('--reindex', action='store_true',
                       help='rebuild the index of the collection')

    parser.add_argument('name', help='name of the collection')
    parser.add_argument('files', nargs='*',
                        help='WARC files to add (used with --addwarc)')

    r = parser.parse_args()

    m = CollectionsManager(r.name)

    if r.init:
        m.add_collection()
    elif r.addwarc:
        m.add_warcs(r.files)
    elif r.reindex:
        m.reindex()
    else:
        # No action selected: show usage instead of silently doing nothing.
        parser.print_help()
# Run the command-line tool when this module is executed as a script.
if __name__ == "__main__":
    main()

View File

@ -89,6 +89,7 @@ setup(
cdx-indexer = pywb.warc.cdxindexer:main
live-rewrite-server = pywb.apps.live_rewrite_server:main
proxy-cert-auth = pywb.framework.certauth:main
wb-manager = pywb.manager.manager:main
""",
zip_safe=True,
classifiers=[