import os
import shutil
import sys
import logging

from pywb.utils.loaders import load_yaml_config
from pywb.utils.timeutils import timestamp20_now
from pywb.warc.cdxindexer import main as cdxindexer_main
from pywb.webapp.pywb_init import DEFAULT_CONFIG

from distutils.util import strtobool
from pkg_resources import resource_string

from argparse import ArgumentParser, RawTextHelpFormatter
import heapq
import yaml


#=============================================================================
# to allow testing by mocking get_input
def get_input(msg):  #pragma: no cover
    return raw_input(msg)


#=============================================================================
class CollectionsManager(object):
    """This utility simplifies the creation and management of web archive
    collections.

    It may be used via the command line to set up and maintain the
    directory structure expected by pywb.
    """
    def __init__(self, coll_name, root_dir='collections', must_exist=True):
        self.root_dir = root_dir
        self.default_config = load_yaml_config('pywb/default_config.yaml')
        self.coll_name = coll_name

        self.coll_dir = os.path.join(self.root_dir, coll_name)

        self.warc_dir = self._get_dir('archive_paths')
        self.cdx_dir = self._get_dir('index_paths')
        self.static_dir = self._get_dir('static_path')
        self.templates_dir = self._get_dir('templates_dir')
        if must_exist:
            self._assert_coll_exists()

    def list_colls(self):
        print('Collections:')
        for d in os.listdir(self.root_dir):
            if os.path.isdir(os.path.join(self.root_dir, d)):
                print('- ' + d)

    def _get_root_dir(self, name):
        return os.path.join(os.getcwd(),
                            self.default_config['paths'][name])

    def _get_dir(self, name):
        return os.path.join(self.coll_dir,
                            self.default_config['paths'][name])

    def _create_dir(self, dirname):
        if not os.path.isdir(dirname):
            os.mkdir(dirname)

        logging.info('Created Dir: ' + dirname)

    def add_collection(self):
        os.makedirs(self.coll_dir)
        logging.info('Created directory: ' + self.coll_dir)

        self._create_dir(self.warc_dir)
        self._create_dir(self.cdx_dir)
        self._create_dir(self.static_dir)
        self._create_dir(self.templates_dir)

        self._create_dir(self._get_root_dir('static_path'))
        self._create_dir(self._get_root_dir('templates_dir'))

    def _assert_coll_exists(self):
        if not os.path.isdir(self.coll_dir):
            raise IOError('Collection {0} does not exist'.
                          format(self.coll_name))

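    # Copy the given WARC/ARC files into the collection's archive directory
    # and merge just the newly added files into the CDX index.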
    def add_warcs(self, warcs):
        if not os.path.isdir(self.warc_dir):
            raise IOError('Directory {0} does not exist'.
                          format(self.warc_dir))

        full_paths = []
        for filename in warcs:
            shutil.copy2(filename, self.warc_dir)
            # the file is copied into warc_dir under its base name,
            # so index the copied path rather than the original argument
            full_paths.append(os.path.join(self.warc_dir,
                                           os.path.basename(filename)))
            logging.info('Copied ' + filename + ' to ' + self.warc_dir)

        self._index_merge_warcs(full_paths)

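    # Rebuild the collection's CDX index from scratch by running the cdx
    # indexer over the entire archive directory; also used as a fallback by
    # _index_merge_warcs() when no index exists yet.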
    def reindex(self):
        cdx_file = os.path.join(self.cdx_dir, 'index.cdx')
        logging.info('Indexing ' + self.warc_dir + ' to ' + cdx_file)
        cdxindexer_main(['-p', '-s', '-r', cdx_file, self.warc_dir])

    def index_merge(self, filelist):
        wrongdir = 'Skipping {0}, must be in {1} archive directory'
        notfound = 'Skipping {0}, file not found'

        filtered_warcs = []

        # Check that warcs are actually in warcs dir
        abs_warc_dir = os.path.abspath(self.warc_dir)

        for f in filelist:
            abs_filepath = os.path.abspath(f)
            prefix = os.path.commonprefix([abs_warc_dir, abs_filepath])

            if prefix != abs_warc_dir:
                raise IOError(wrongdir.format(abs_filepath, abs_warc_dir))
            elif not os.path.isfile(abs_filepath):
                raise IOError(notfound.format(f))
            else:
                filtered_warcs.append(abs_filepath.split(prefix)[1])

        self._index_merge_warcs(filtered_warcs)

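    # Incremental index update: index only the new WARCs into a temporary
    # CDX file, merge it with the existing (sorted) index via heapq.merge(),
    # skip duplicate lines, then swap the merged result in place of the old
    # index.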
    def _index_merge_warcs(self, new_warcs):
        cdx_file = os.path.join(self.cdx_dir, 'index.cdx')

        # no existing file, just reindex all
        if not os.path.isfile(cdx_file):
            return self.reindex()

        temp_file = cdx_file + '.tmp.' + timestamp20_now()
        args = ['-p', '-s', '-r', temp_file]
        args.extend(new_warcs)
        cdxindexer_main(args)

        merged_file = temp_file + '.merged'

        last_line = None

        with open(cdx_file) as orig_index:
            with open(temp_file) as new_index:
                with open(merged_file, 'w+b') as merged:
                    for line in heapq.merge(orig_index, new_index):
                        if last_line != line:
                            merged.write(line)
                            last_line = line

        os.rename(merged_file, cdx_file)
        os.remove(temp_file)

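    # Store collection metadata as simple key=value pairs in
    # <coll_dir>/metadata.yaml, preserving existing keys and overwriting any
    # that are set again.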
    def set_metadata(self, namevalue_pairs):
        metadata_yaml = os.path.join(self.coll_dir, 'metadata.yaml')
        metadata = None
        if os.path.isfile(metadata_yaml):
            with open(metadata_yaml) as fh:
                metadata = yaml.safe_load(fh)

        if not metadata:
            metadata = {}

        msg = 'Metadata params must be in the form "name=value"'
        for pair in namevalue_pairs:
            v = pair.split('=', 1)
            if len(v) != 2:
                raise ValueError(msg)

            print('Set {0}={1}'.format(v[0], v[1]))
            metadata[v[0]] = v[1]

        with open(metadata_yaml, 'w+b') as fh:
            fh.write(yaml.dump(metadata, default_flow_style=False))

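    # Build two name -> template-file maps from DEFAULT_CONFIG:
    # 'template_files' are per-collection templates, while
    # 'shared_template_files' are shared by all collections.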
    def _load_templates_map(self):
        defaults = load_yaml_config(DEFAULT_CONFIG)

        # Coll Templates
        templates = defaults['paths']['template_files']

        for name, _ in templates.iteritems():
            templates[name] = defaults[name]

        # Shared Templates
        shared_templates = defaults['paths']['shared_template_files']

        for name, _ in shared_templates.iteritems():
            shared_templates[name] = defaults[name]

        return templates, shared_templates

    def list_templates(self):
        templates, shared_templates = self._load_templates_map()

        print('Shared Templates')
        for n, v in shared_templates.iteritems():
            print('- {0}: (pywb/{1})'.format(n, v))

        print('')

        print('Collection Templates')
        for n, v in templates.iteritems():
            print('- {0}: (pywb/{1})'.format(n, v))

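    # Prompt before overwriting or deleting an existing file; any answer that
    # strtobool() does not accept as a "yes" aborts by raising IOError.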
    def _confirm_overwrite(self, full_path, msg):
        if not os.path.isfile(full_path):
            return True

        res = get_input(msg)
        try:
            res = strtobool(res)
        except ValueError:
            res = False

        if not res:
            raise IOError('Skipping, {0} already exists'.format(full_path))

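    # Resolve a template name to (full_path, package-relative filename):
    # per-collection templates resolve into the collection's templates dir,
    # shared templates resolve relative to the current working directory.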
    def _get_template_path(self, template_name, verb):
        templates, shared_templates = self._load_templates_map()

        try:
            filename = templates[template_name]
            if not self.coll_name:
                msg = ('To {1} a "{0}" template, you must specify ' +
                       'a collection name: template <coll> --{1} {0}')
                raise IOError(msg.format(template_name, verb))

            full_path = os.path.join(self.templates_dir,
                                     os.path.basename(filename))

        except KeyError:
            try:
                filename = shared_templates[template_name]
                full_path = os.path.join(os.getcwd(), filename)

            except KeyError:
                msg = 'template name must be one of {0} or {1}'
                msg = msg.format(templates.keys(), shared_templates.keys())
                raise KeyError(msg)

        return full_path, filename

    def add_template(self, template_name, force=False):
        full_path, filename = self._get_template_path(template_name, 'add')

        msg = ('Template file "{0}" ({1}) already exists. ' +
               'Overwrite with default template? (y/n) ')
        msg = msg.format(full_path, template_name)

        if not force:
            self._confirm_overwrite(full_path, msg)

        data = resource_string('pywb', filename)
        with open(full_path, 'w+b') as fh:
            fh.write(data)

        full_path = os.path.abspath(full_path)
        print('Copied default template "{0}" to "{1}"'.format(filename, full_path))

    def remove_template(self, template_name, force=False):
        full_path, filename = self._get_template_path(template_name, 'remove')

        if not os.path.isfile(full_path):
            msg = 'Template "{0}" does not exist.'
            raise IOError(msg.format(full_path))

        msg = 'Delete template file "{0}" ({1})? (y/n) '
        msg = msg.format(full_path, template_name)

        if not force:
            self._confirm_overwrite(full_path, msg)

        os.remove(full_path)
        print('Removed template file "{0}"'.format(full_path))

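# Example command-line usage (subcommands are defined in main() below).
# The console-script name 'wb-manager' is an assumption and may differ
# depending on how pywb is installed; this module can also be run directly
# via its __main__ guard:
#
#   wb-manager init my-coll
#   wb-manager add my-coll path/to/example.warc.gz
#   wb-manager metadata my-coll --set title="My Collection"
#   wb-manager template my-coll --list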

#=============================================================================
def main(args=None):
    description = """
Create and manage file-based web archive collections
"""
    #format(os.path.basename(sys.argv[0]))

    logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s',
                        level=logging.DEBUG)

    parser = ArgumentParser(description=description,
                            #epilog=epilog,
                            formatter_class=RawTextHelpFormatter)

    subparsers = parser.add_subparsers(dest='type')

    # Init Coll
    def do_init(r):
        m = CollectionsManager(r.coll_name, must_exist=False)
        m.add_collection()

    init_help = 'Init new collection, create all collection directories'
    init = subparsers.add_parser('init', help=init_help)
    init.add_argument('coll_name')
    init.set_defaults(func=do_init)

    # List Colls
    def do_list(r):
        m = CollectionsManager('', must_exist=False)
        m.list_colls()

    list_help = 'List Collections'
    listcmd = subparsers.add_parser('list', help=list_help)
    listcmd.set_defaults(func=do_list)

    # Add Warcs
    def do_add(r):
        m = CollectionsManager(r.coll_name)
        m.add_warcs(r.files)

    addwarc_help = 'Copy ARC/WARC files to the collection archive directory and reindex'
    addwarc = subparsers.add_parser('add', help=addwarc_help)
    addwarc.add_argument('coll_name')
    addwarc.add_argument('files', nargs='+')
    addwarc.set_defaults(func=do_add)

    # Reindex All
    def do_reindex(r):
        m = CollectionsManager(r.coll_name)
        m.reindex()

    reindex_help = 'Re-Index entire collection'
    reindex = subparsers.add_parser('reindex', help=reindex_help)
    reindex.add_argument('coll_name')
    reindex.set_defaults(func=do_reindex)

    # Index warcs
    def do_index(r):
        m = CollectionsManager(r.coll_name)
        m.index_merge(r.files)

    indexwarcs_help = 'Index specified ARC/WARC files in the collection'
    indexwarcs = subparsers.add_parser('index', help=indexwarcs_help)
    indexwarcs.add_argument('coll_name')
    indexwarcs.add_argument('files', nargs='+')
    indexwarcs.set_defaults(func=do_index)

    # Set metadata
    def do_metadata(r):
        m = CollectionsManager(r.coll_name)
        m.set_metadata(r.set)

    metadata_help = 'Set Metadata'
    metadata = subparsers.add_parser('metadata', help=metadata_help)
    metadata.add_argument('coll_name')
    metadata.add_argument('--set', nargs='+')
    metadata.set_defaults(func=do_metadata)

    # Add, remove or list default templates
    def do_add_template(r):
        m = CollectionsManager(r.coll_name)
        if r.add:
            m.add_template(r.add, r.force)
        elif r.remove:
            m.remove_template(r.remove, r.force)
        elif r.list:
            m.list_templates()

    template_help = 'Add, remove or list default html templates for customization'
    template = subparsers.add_parser('template', help=template_help)
    template.add_argument('coll_name', nargs='?', default='')
    template.add_argument('-f', '--force', action='store_true')
    template.add_argument('--add')
    template.add_argument('--remove')
    template.add_argument('--list', action='store_true')
    template.set_defaults(func=do_add_template)

    r = parser.parse_args(args=args)
    r.func(r)


#=============================================================================
# special wrapper for cli to avoid printing stack trace
def main_wrap_exc():  #pragma: no cover
    try:
        main()
    except Exception as e:
        print('Error: ' + str(e))
        sys.exit(2)


if __name__ == "__main__":
    main_wrap_exc()