mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
manager: support autoindexing! (#91) `wb-manager autoindex` will use the watchdog library to detect creation of, or updates
to, any warc/arc in the specified collection (or across all collections) and update the autoindex cdx. cdx indexing: add --dir-root option to specify a custom relative root dir for filenames used in the cdx
This commit is contained in:
parent
cc068f8ee8
commit
733642551d
44
pywb/manager/autoindex.py
Normal file
44
pywb/manager/autoindex.py
Normal file
@ -0,0 +1,44 @@
|
|||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from watchdog.observers import Observer
|
||||||
|
from watchdog.events import RegexMatchingEventHandler
|
||||||
|
|
||||||
|
|
||||||
|
#=============================================================================
|
||||||
|
# Paths that should trigger indexing: anything ending in .arc or .warc,
# optionally followed by .gz. Written as a raw string so that the regex
# escapes are not parsed as (invalid) string escape sequences.
EXT_REGEX = r'.*\.w?arc(\.gz)?$'

# Polled by CDXAutoIndexer.do_watch(); setting this to False ends the
# watch loop after the current sleep interval.
keep_running = True
|
||||||
|
|
||||||
|
#=============================================================================
|
||||||
|
class CDXAutoIndexer(RegexMatchingEventHandler):
    """Watchdog event handler that reports changed ARC/WARC files.

    Only file (not directory) events whose path matches EXT_REGEX are
    handled; each created or modified file is passed to ``updater``.
    """

    def __init__(self, updater, path):
        """
        :param updater: callable invoked with the path of every created or
                        modified file that matches EXT_REGEX
        :param path: directory that do_watch() observes recursively
        """
        super(CDXAutoIndexer, self).__init__(regexes=[EXT_REGEX],
                                             ignore_directories=True)
        self.updater = updater
        self.cdx_path = path

    def on_created(self, event):
        """Pass the path of a newly created matching file to the updater."""
        self.updater(event.src_path)

    def on_modified(self, event):
        """Pass the path of a modified matching file to the updater."""
        self.updater(event.src_path)

    def do_watch(self, sleep_time=1):
        """Watch ``self.cdx_path`` recursively until told to stop.

        Blocks, sleeping ``sleep_time`` seconds between checks, until the
        module-level ``keep_running`` flag becomes false or the user
        interrupts with Ctrl-C.

        :param sleep_time: seconds to sleep between checks of keep_running
        """
        observer = Observer()
        observer.schedule(self, self.cdx_path, recursive=True)
        observer.start()

        try:
            while keep_running:
                time.sleep(sleep_time)
        except KeyboardInterrupt:  # pragma: no cover
            # Ctrl-C is the normal way to end an interactive watch
            pass
        finally:
            # Always shut the observer down, including when the loop ends
            # because keep_running was cleared; otherwise its thread keeps
            # watching (and indexing) after do_watch() has returned.
            observer.stop()
            observer.join()
|
||||||
|
|
||||||
|
|
||||||
|
#=============================================================================
|
||||||
|
if __name__ == "__main__":
    # Manual smoke test: watch the directory given as the first argument
    # (default: current directory) and print each matching archive path.
    def p(x):
        print(x)

    # The class defined in this module is CDXAutoIndexer, which takes the
    # callback and the path up front and is started with do_watch().
    w = CDXAutoIndexer(p, sys.argv[1] if len(sys.argv) > 1 else '.')
    w.do_watch()
|
@ -2,19 +2,18 @@ import os
|
|||||||
import shutil
|
import shutil
|
||||||
import sys
|
import sys
|
||||||
import logging
|
import logging
|
||||||
|
import heapq
|
||||||
from pywb.utils.loaders import load_yaml_config
|
import yaml
|
||||||
from pywb.utils.timeutils import timestamp20_now
|
|
||||||
from pywb.warc.cdxindexer import main as cdxindexer_main
|
|
||||||
from pywb.webapp.pywb_init import DEFAULT_CONFIG
|
|
||||||
from migrate import MigrateCDX
|
|
||||||
|
|
||||||
from distutils.util import strtobool
|
from distutils.util import strtobool
|
||||||
from pkg_resources import resource_string
|
from pkg_resources import resource_string
|
||||||
|
|
||||||
from argparse import ArgumentParser, RawTextHelpFormatter
|
from argparse import ArgumentParser, RawTextHelpFormatter
|
||||||
import heapq
|
|
||||||
import yaml
|
from pywb.utils.loaders import load_yaml_config
|
||||||
|
from pywb.utils.timeutils import timestamp20_now
|
||||||
|
|
||||||
|
from pywb import DEFAULT_CONFIG
|
||||||
|
|
||||||
|
|
||||||
#=============================================================================
|
#=============================================================================
|
||||||
@ -32,25 +31,32 @@ It may be used via cmdline to setup and maintain the
|
|||||||
directory structure expected by pywb
|
directory structure expected by pywb
|
||||||
"""
|
"""
|
||||||
DEF_INDEX_FILE = 'index.cdxj'
|
DEF_INDEX_FILE = 'index.cdxj'
|
||||||
|
AUTO_INDEX_FILE = 'autoindex.cdxj'
|
||||||
|
|
||||||
def __init__(self, coll_name, root_dir='collections', must_exist=True):
|
def __init__(self, coll_name, colls_dir='collections', must_exist=True):
|
||||||
self.root_dir = root_dir
|
self.default_config = load_yaml_config(DEFAULT_CONFIG)
|
||||||
self.default_config = load_yaml_config('pywb/default_config.yaml')
|
|
||||||
self.coll_name = coll_name
|
|
||||||
|
|
||||||
self.coll_dir = os.path.join(self.root_dir, coll_name)
|
self.colls_dir = colls_dir
|
||||||
|
|
||||||
|
self._set_coll_dirs(coll_name)
|
||||||
|
|
||||||
self.warc_dir = self._get_dir('archive_paths')
|
|
||||||
self.cdx_dir = self._get_dir('index_paths')
|
|
||||||
self.static_dir = self._get_dir('static_path')
|
|
||||||
self.templates_dir = self._get_dir('templates_dir')
|
|
||||||
if must_exist:
|
if must_exist:
|
||||||
self._assert_coll_exists()
|
self._assert_coll_exists()
|
||||||
|
|
||||||
|
def _set_coll_dirs(self, coll_name):
    """Make ``coll_name`` the current collection and derive its directories.

    Sets ``coll_name`` and ``curr_coll_dir`` (``colls_dir`` joined with the
    collection name), then resolves the archive, indexes, static and
    templates directories through ``_get_dir``.
    """
    self.coll_name = coll_name
    self.curr_coll_dir = os.path.join(self.colls_dir, coll_name)

    # (attribute to set, key passed to _get_dir); curr_coll_dir must already
    # be assigned because _get_dir builds on it
    dir_attrs = (('archive_dir', 'archive_paths'),
                 ('indexes_dir', 'index_paths'),
                 ('static_dir', 'static_path'),
                 ('templates_dir', 'templates_dir'))

    for attr_name, config_key in dir_attrs:
        setattr(self, attr_name, self._get_dir(config_key))
|
||||||
|
|
||||||
def list_colls(self):
|
def list_colls(self):
|
||||||
print('Collections:')
|
print('Collections:')
|
||||||
for d in os.listdir(self.root_dir):
|
for d in os.listdir(self.colls_dir):
|
||||||
if os.path.isdir(os.path.join(self.root_dir, d)):
|
if os.path.isdir(os.path.join(self.colls_dir, d)):
|
||||||
print('- ' + d)
|
print('- ' + d)
|
||||||
|
|
||||||
def _get_root_dir(self, name):
|
def _get_root_dir(self, name):
|
||||||
@ -58,7 +64,7 @@ directory structure expected by pywb
|
|||||||
self.default_config['paths'][name])
|
self.default_config['paths'][name])
|
||||||
|
|
||||||
def _get_dir(self, name):
|
def _get_dir(self, name):
|
||||||
return os.path.join(self.coll_dir,
|
return os.path.join(self.curr_coll_dir,
|
||||||
self.default_config['paths'][name])
|
self.default_config['paths'][name])
|
||||||
|
|
||||||
def _create_dir(self, dirname):
|
def _create_dir(self, dirname):
|
||||||
@ -68,11 +74,11 @@ directory structure expected by pywb
|
|||||||
logging.info('Created Dir: ' + dirname)
|
logging.info('Created Dir: ' + dirname)
|
||||||
|
|
||||||
def add_collection(self):
|
def add_collection(self):
|
||||||
os.makedirs(self.coll_dir)
|
os.makedirs(self.curr_coll_dir)
|
||||||
logging.info('Created directory: ' + self.coll_dir)
|
logging.info('Created directory: ' + self.curr_coll_dir)
|
||||||
|
|
||||||
self._create_dir(self.warc_dir)
|
self._create_dir(self.archive_dir)
|
||||||
self._create_dir(self.cdx_dir)
|
self._create_dir(self.indexes_dir)
|
||||||
self._create_dir(self.static_dir)
|
self._create_dir(self.static_dir)
|
||||||
self._create_dir(self.templates_dir)
|
self._create_dir(self.templates_dir)
|
||||||
|
|
||||||
@ -80,65 +86,71 @@ directory structure expected by pywb
|
|||||||
self._create_dir(self._get_root_dir('templates_dir'))
|
self._create_dir(self._get_root_dir('templates_dir'))
|
||||||
|
|
||||||
def _assert_coll_exists(self):
|
def _assert_coll_exists(self):
|
||||||
if not os.path.isdir(self.coll_dir):
|
if not os.path.isdir(self.curr_coll_dir):
|
||||||
raise IOError('Collection {0} does not exist'.
|
raise IOError('Collection {0} does not exist'.
|
||||||
format(self.coll_name))
|
format(self.coll_name))
|
||||||
|
|
||||||
def add_warcs(self, warcs):
|
def add_warcs(self, warcs):
|
||||||
if not os.path.isdir(self.warc_dir):
|
if not os.path.isdir(self.archive_dir):
|
||||||
raise IOError('Directory {0} does not exist'.
|
raise IOError('Directory {0} does not exist'.
|
||||||
format(self.warc_dir))
|
format(self.archive_dir))
|
||||||
|
|
||||||
full_paths = []
|
full_paths = []
|
||||||
for filename in warcs:
|
for filename in warcs:
|
||||||
shutil.copy2(filename, self.warc_dir)
|
shutil.copy2(filename, self.archive_dir)
|
||||||
full_paths.append(os.path.join(self.warc_dir, filename))
|
full_paths.append(os.path.join(self.archive_dir, filename))
|
||||||
logging.info('Copied ' + filename + ' to ' + self.warc_dir)
|
logging.info('Copied ' + filename + ' to ' + self.archive_dir)
|
||||||
|
|
||||||
self._index_merge_warcs(full_paths)
|
self._index_merge_warcs(full_paths, self.DEF_INDEX_FILE)
|
||||||
|
|
||||||
def reindex(self):
|
def reindex(self):
|
||||||
cdx_file = os.path.join(self.cdx_dir, self.DEF_INDEX_FILE)
|
cdx_file = os.path.join(self.indexes_dir, self.DEF_INDEX_FILE)
|
||||||
logging.info('Indexing ' + self.warc_dir + ' to ' + cdx_file)
|
logging.info('Indexing ' + self.archive_dir + ' to ' + cdx_file)
|
||||||
self._cdx_index(cdx_file, [self.warc_dir])
|
self._cdx_index(cdx_file, [self.archive_dir])
|
||||||
|
|
||||||
def _cdx_index(self, out, input_):
|
def _cdx_index(self, out, input_, rel_root=None):
|
||||||
def_args = ['-p', '-j', '-s', '-r']
|
from pywb.warc.cdxindexer import write_multi_cdx_index
|
||||||
def_args.append(out)
|
|
||||||
def_args.extend(input_)
|
|
||||||
cdxindexer_main(def_args)
|
|
||||||
|
|
||||||
def index_merge(self, filelist):
|
options = dict(append_post=True,
|
||||||
|
cdxj=True,
|
||||||
|
sort=True,
|
||||||
|
recurse=True,
|
||||||
|
rel_root=rel_root)
|
||||||
|
|
||||||
|
write_multi_cdx_index(out, input_, **options)
|
||||||
|
|
||||||
|
def index_merge(self, filelist, index_file):
|
||||||
wrongdir = 'Skipping {0}, must be in {1} archive directory'
|
wrongdir = 'Skipping {0}, must be in {1} archive directory'
|
||||||
notfound = 'Skipping {0}, file not found'
|
notfound = 'Skipping {0}, file not found'
|
||||||
|
|
||||||
filtered_warcs = []
|
filtered_warcs = []
|
||||||
|
|
||||||
# Check that warcs are actually in warcs dir
|
# Check that warcs are actually in archive dir
|
||||||
abs_warc_dir = os.path.abspath(self.warc_dir)
|
abs_archive_dir = os.path.abspath(self.archive_dir)
|
||||||
|
|
||||||
for f in filelist:
|
for f in filelist:
|
||||||
abs_filepath = os.path.abspath(f)
|
abs_filepath = os.path.abspath(f)
|
||||||
prefix = os.path.commonprefix([abs_warc_dir, abs_filepath])
|
prefix = os.path.commonprefix([abs_archive_dir, abs_filepath])
|
||||||
|
|
||||||
if prefix != abs_warc_dir:
|
if prefix != abs_archive_dir:
|
||||||
raise IOError(wrongdir.format(abs_filepath, abs_warc_dir))
|
raise IOError(wrongdir.format(abs_filepath, abs_archive_dir))
|
||||||
elif not os.path.isfile(abs_filepath):
|
elif not os.path.isfile(abs_filepath):
|
||||||
raise IOError(notfound.format(f))
|
raise IOError(notfound.format(f))
|
||||||
else:
|
else:
|
||||||
filtered_warcs.append(abs_filepath.split(prefix)[1])
|
filtered_warcs.append(abs_filepath)
|
||||||
|
|
||||||
self._index_merge_warcs(filtered_warcs)
|
self._index_merge_warcs(filtered_warcs, index_file, abs_archive_dir)
|
||||||
|
|
||||||
def _index_merge_warcs(self, new_warcs):
|
def _index_merge_warcs(self, new_warcs, index_file, rel_root=None):
|
||||||
cdx_file = os.path.join(self.cdx_dir, self.DEF_INDEX_FILE)
|
cdx_file = os.path.join(self.indexes_dir, index_file)
|
||||||
|
|
||||||
# no existing file, just reindex all
|
|
||||||
if not os.path.isfile(cdx_file):
|
|
||||||
return self.reindex()
|
|
||||||
|
|
||||||
temp_file = cdx_file + '.tmp.' + timestamp20_now()
|
temp_file = cdx_file + '.tmp.' + timestamp20_now()
|
||||||
self._cdx_index(temp_file, new_warcs)
|
self._cdx_index(temp_file, new_warcs, rel_root)
|
||||||
|
|
||||||
|
# no existing file, so just make it the new file
|
||||||
|
if not os.path.isfile(cdx_file):
|
||||||
|
shutil.move(temp_file, cdx_file)
|
||||||
|
return
|
||||||
|
|
||||||
merged_file = temp_file + '.merged'
|
merged_file = temp_file + '.merged'
|
||||||
|
|
||||||
@ -157,7 +169,7 @@ directory structure expected by pywb
|
|||||||
os.remove(temp_file)
|
os.remove(temp_file)
|
||||||
|
|
||||||
def set_metadata(self, namevalue_pairs):
|
def set_metadata(self, namevalue_pairs):
|
||||||
metadata_yaml = os.path.join(self.coll_dir, 'metadata.yaml')
|
metadata_yaml = os.path.join(self.curr_coll_dir, 'metadata.yaml')
|
||||||
metadata = None
|
metadata = None
|
||||||
if os.path.isfile(metadata_yaml):
|
if os.path.isfile(metadata_yaml):
|
||||||
with open(metadata_yaml) as fh:
|
with open(metadata_yaml) as fh:
|
||||||
@ -280,6 +292,8 @@ directory structure expected by pywb
|
|||||||
print('Removed template file "{0}"'.format(full_path))
|
print('Removed template file "{0}"'.format(full_path))
|
||||||
|
|
||||||
def migrate_cdxj(self, path, force=False):
|
def migrate_cdxj(self, path, force=False):
|
||||||
|
from migrate import MigrateCDX
|
||||||
|
|
||||||
migrate = MigrateCDX(path)
|
migrate = MigrateCDX(path)
|
||||||
count = migrate.count_cdx()
|
count = migrate.count_cdx()
|
||||||
if count == 0:
|
if count == 0:
|
||||||
@ -299,6 +313,30 @@ directory structure expected by pywb
|
|||||||
|
|
||||||
migrate.convert_to_cdxj()
|
migrate.convert_to_cdxj()
|
||||||
|
|
||||||
|
def autoindex(self):
    """Watch archive files and merge changes into AUTO_INDEX_FILE.

    If this manager has a collection name, only that collection's archive
    directory is watched. Otherwise the whole collections directory is
    watched and the collection is worked out from each changed file's path.
    Blocks until the watcher's do_watch() returns.
    """
    from autoindex import CDXAutoIndexer

    if self.coll_name:
        any_coll = False
        path = self.archive_dir
    else:
        path = self.colls_dir
        any_coll = True

    def do_index(warc):
        if any_coll:
            # The first path component below colls_dir is the collection.
            # Split on os.path.sep, the same separator used to strip the
            # prefix, rather than a hard-coded '/'.
            rel_path = warc.split(self.colls_dir + os.path.sep)[-1]
            coll_name = rel_path.split(os.path.sep)[0]
            if coll_name != self.coll_name:
                # switch the manager over to the collection being updated
                self._set_coll_dirs(coll_name)

        print('Auto-Indexing: ' + warc)
        self.index_merge([warc], self.AUTO_INDEX_FILE)
        print('Done.. Waiting for file updates')

    indexer = CDXAutoIndexer(do_index, path)
    indexer.do_watch()
|
||||||
|
|
||||||
|
|
||||||
#=============================================================================
|
#=============================================================================
|
||||||
def main(args=None):
|
def main(args=None):
|
||||||
@ -360,7 +398,7 @@ Create manage file based web archive collections
|
|||||||
# Index warcs
|
# Index warcs
|
||||||
def do_index(r):
|
def do_index(r):
|
||||||
m = CollectionsManager(r.coll_name)
|
m = CollectionsManager(r.coll_name)
|
||||||
m.index_merge(r.files)
|
m.index_merge(r.files, m.DEF_INDEX_FILE)
|
||||||
|
|
||||||
indexwarcs_help = 'Index specified ARC/WARC files in the collection'
|
indexwarcs_help = 'Index specified ARC/WARC files in the collection'
|
||||||
indexwarcs = subparsers.add_parser('index', help=indexwarcs_help)
|
indexwarcs = subparsers.add_parser('index', help=indexwarcs_help)
|
||||||
@ -390,7 +428,7 @@ Create manage file based web archive collections
|
|||||||
m.list_templates()
|
m.list_templates()
|
||||||
|
|
||||||
template_help = 'Add default html template for customization'
|
template_help = 'Add default html template for customization'
|
||||||
template = subparsers.add_parser('template')
|
template = subparsers.add_parser('template', help=template_help)
|
||||||
template.add_argument('coll_name', nargs='?', default='')
|
template.add_argument('coll_name', nargs='?', default='')
|
||||||
template.add_argument('-f', '--force', action='store_true')
|
template.add_argument('-f', '--force', action='store_true')
|
||||||
template.add_argument('--add')
|
template.add_argument('--add')
|
||||||
@ -398,14 +436,26 @@ Create manage file based web archive collections
|
|||||||
template.add_argument('--list', action='store_true')
|
template.add_argument('--list', action='store_true')
|
||||||
template.set_defaults(func=do_add_template)
|
template.set_defaults(func=do_add_template)
|
||||||
|
|
||||||
|
# Migrate CDX
|
||||||
def do_migrate(r):
|
def do_migrate(r):
|
||||||
m = CollectionsManager('', must_exist=False)
|
m = CollectionsManager('', must_exist=False)
|
||||||
m.migrate_cdxj(r.path, r.force)
|
m.migrate_cdxj(r.path, r.force)
|
||||||
|
|
||||||
template = subparsers.add_parser('migrate')
|
migrate_help = 'Convert any existing archive indexes to new json format'
|
||||||
template.add_argument('path', default='./', nargs='?')
|
migrate = subparsers.add_parser('migrate', help=migrate_help)
|
||||||
template.add_argument('-f', '--force', action='store_true')
|
migrate.add_argument('path', default='./', nargs='?')
|
||||||
template.set_defaults(func=do_migrate)
|
migrate.add_argument('-f', '--force', action='store_true')
|
||||||
|
migrate.set_defaults(func=do_migrate)
|
||||||
|
|
||||||
|
# Auto Index
|
||||||
|
def do_autoindex(r):
|
||||||
|
m = CollectionsManager(r.coll_name, must_exist=False)
|
||||||
|
m.autoindex()
|
||||||
|
|
||||||
|
autoindex_help = 'Automatically index any change archive files'
|
||||||
|
autoindex = subparsers.add_parser('autoindex', help=autoindex_help)
|
||||||
|
autoindex.add_argument('coll_name', nargs='?', default='')
|
||||||
|
autoindex.set_defaults(func=do_autoindex)
|
||||||
|
|
||||||
r = parser.parse_args(args=args)
|
r = parser.parse_args(args=args)
|
||||||
r.func(r)
|
r.func(r)
|
||||||
@ -416,6 +466,7 @@ def main_wrap_exc(): #pragma: no cover
|
|||||||
try:
|
try:
|
||||||
main()
|
main()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
raise
|
||||||
print('Error: ' + str(e))
|
print('Error: ' + str(e))
|
||||||
sys.exit(2)
|
sys.exit(2)
|
||||||
|
|
||||||
|
@ -144,22 +144,32 @@ ALLOWED_EXT = ('.arc', '.arc.gz', '.warc', '.warc.gz')
|
|||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
def iter_file_or_dir(inputs, recursive=True, rel_root=None):
    """Yield ``(full_path, filename)`` for each archive named by ``inputs``.

    :param inputs: iterable of file or directory paths
    :param recursive: if true, walk directory inputs recursively; otherwise
                      list only their immediate entries
    :param rel_root: if set, yielded filenames are relative to this
                     directory
    :returns: generator of ``(full_path, filename)`` tuples

    Non-directory inputs are yielded as given, without an extension check;
    their filename is the basename unless ``rel_root`` is set. Entries found
    inside directory inputs are yielded only if they end with one of
    ALLOWED_EXT. In the recursive case the filename is relative to
    ``rel_root`` if set, else to the directory input itself, and always
    uses '/' as separator.
    """
    for input_ in inputs:
        if not os.path.isdir(input_):
            if not rel_root:
                filename = os.path.basename(input_)
            else:
                filename = os.path.relpath(input_, rel_root)

            yield input_, filename

        elif not recursive:
            for filename in os.listdir(input_):
                if filename.endswith(ALLOWED_EXT):
                    full_path = os.path.join(input_, filename)
                    if rel_root:
                        filename = os.path.relpath(full_path, rel_root)
                    yield full_path, filename

        else:
            # Pick the base per input. Assigning to rel_root here would make
            # the first directory the base for every later input as well.
            base_dir = rel_root or input_

            for root, dirs, files in os.walk(input_):
                for filename in files:
                    if filename.endswith(ALLOWED_EXT):
                        full_path = os.path.join(root, filename)
                        rel_path = os.path.relpath(full_path, base_dir)
                        rel_path = rel_path.replace(os.path.sep, '/')
                        yield full_path, rel_path
|
||||||
|
|
||||||
@ -181,10 +191,10 @@ def cdx_filename(filename):
|
|||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
def get_cdx_writer_cls(options):
|
def get_cdx_writer_cls(options):
|
||||||
writer_cls = options.get('writer_cls')
|
|
||||||
if options.get('minimal'):
|
if options.get('minimal'):
|
||||||
options['cdxj'] = True
|
options['cdxj'] = True
|
||||||
|
|
||||||
|
writer_cls = options.get('writer_cls')
|
||||||
if writer_cls:
|
if writer_cls:
|
||||||
if not options.get('writer_add_mixin'):
|
if not options.get('writer_add_mixin'):
|
||||||
return writer_cls
|
return writer_cls
|
||||||
@ -209,10 +219,13 @@ def get_cdx_writer_cls(options):
|
|||||||
#=================================================================
|
#=================================================================
|
||||||
def write_multi_cdx_index(output, inputs, **options):
|
def write_multi_cdx_index(output, inputs, **options):
|
||||||
recurse = options.get('recurse', False)
|
recurse = options.get('recurse', False)
|
||||||
|
rel_root = options.get('rel_root')
|
||||||
|
|
||||||
# write one cdx per dir
|
# write one cdx per dir
|
||||||
if output != '-' and os.path.isdir(output):
|
if output != '-' and os.path.isdir(output):
|
||||||
for fullpath, filename in iter_file_or_dir(inputs, recurse):
|
for fullpath, filename in iter_file_or_dir(inputs,
|
||||||
|
recurse,
|
||||||
|
rel_root):
|
||||||
outpath = cdx_filename(filename)
|
outpath = cdx_filename(filename)
|
||||||
outpath = os.path.join(output, outpath)
|
outpath = os.path.join(output, outpath)
|
||||||
|
|
||||||
@ -234,7 +247,9 @@ def write_multi_cdx_index(output, inputs, **options):
|
|||||||
record_iter = DefaultRecordIter(**options)
|
record_iter = DefaultRecordIter(**options)
|
||||||
|
|
||||||
with writer_cls(outfile) as writer:
|
with writer_cls(outfile) as writer:
|
||||||
for fullpath, filename in iter_file_or_dir(inputs, recurse):
|
for fullpath, filename in iter_file_or_dir(inputs,
|
||||||
|
recurse,
|
||||||
|
rel_root):
|
||||||
with open(fullpath, 'rb') as infile:
|
with open(fullpath, 'rb') as infile:
|
||||||
entry_iter = record_iter(infile)
|
entry_iter = record_iter(infile)
|
||||||
|
|
||||||
@ -282,7 +297,7 @@ Some examples:
|
|||||||
""".format(os.path.basename(sys.argv[0]))
|
""".format(os.path.basename(sys.argv[0]))
|
||||||
|
|
||||||
sort_help = """
|
sort_help = """
|
||||||
sort the output to each file before writing to create a total ordering
|
Sort the output to each file before writing to create a total ordering
|
||||||
"""
|
"""
|
||||||
|
|
||||||
unsurt_help = """
|
unsurt_help = """
|
||||||
@ -296,8 +311,8 @@ Use older 9-field cdx format, default is 11-cdx field
|
|||||||
"""
|
"""
|
||||||
minimal_json_help = """
|
minimal_json_help = """
|
||||||
CDX JSON output, but with minimal fields only, available w/o parsing
|
CDX JSON output, but with minimal fields only, available w/o parsing
|
||||||
http record. The fields are:
|
http record. The fields are: canonicalized url, timestamp,
|
||||||
canonicalized url, timestamp, original url, digest, archive offset, archive length
|
original url, digest, archive offset, archive length
|
||||||
and archive filename. mimetype is included to indicate warc/revisit only.
|
and archive filename. mimetype is included to indicate warc/revisit only.
|
||||||
|
|
||||||
This option skips record parsing and will not work with
|
This option skips record parsing and will not work with
|
||||||
@ -305,30 +320,42 @@ POST append (-p) option
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
json_help = """
|
json_help = """
|
||||||
Output CDX JSON format per line, with url timestamp first, followed by json dict
|
Output CDX JSON format per line, with url timestamp first,
|
||||||
for all other fields:
|
followed by a json dict for all other fields:
|
||||||
url timestamp { ... }
|
url timestamp { ... }
|
||||||
"""
|
"""
|
||||||
|
|
||||||
output_help = """output file or directory.
|
output_help = """
|
||||||
|
Output file or directory.
|
||||||
- If directory, each input file is written to a seperate output file
|
- If directory, each input file is written to a seperate output file
|
||||||
with a .cdx extension
|
with a .cdx extension
|
||||||
- If output is '-', output is written to stdout
|
- If output is '-', output is written to stdout
|
||||||
"""
|
"""
|
||||||
|
|
||||||
input_help = """input file or directory
|
input_help = """
|
||||||
|
Input file or directory.
|
||||||
- If directory, all archive files from that directory are read
|
- If directory, all archive files from that directory are read
|
||||||
"""
|
"""
|
||||||
|
|
||||||
allrecords_help = """include all records.
|
allrecords_help = """
|
||||||
|
Include All records.
|
||||||
currently includes the 'request' records in addition to all
|
currently includes the 'request' records in addition to all
|
||||||
response records"""
|
response records
|
||||||
|
"""
|
||||||
|
|
||||||
post_append_help = """for POST requests, append
|
post_append_help = """
|
||||||
form query to url key. (Only applies to form url encoded posts)"""
|
For POST requests, append form query to url key.
|
||||||
|
(Only applies to form url encoded posts)
|
||||||
|
"""
|
||||||
|
|
||||||
recurse_dirs_help = """recurse through all subdirectories
|
recurse_dirs_help = """
|
||||||
if input is a directory"""
|
Recurse through all subdirectories if the input is a directory
|
||||||
|
"""
|
||||||
|
|
||||||
|
dir_root_help = """
|
||||||
|
Make CDX filenames relative to specified root directory,
|
||||||
|
instead of current working directory
|
||||||
|
"""
|
||||||
|
|
||||||
parser = ArgumentParser(description=description,
|
parser = ArgumentParser(description=description,
|
||||||
epilog=epilog,
|
epilog=epilog,
|
||||||
@ -350,6 +377,9 @@ if input is a directory"""
|
|||||||
action='store_true',
|
action='store_true',
|
||||||
help=recurse_dirs_help)
|
help=recurse_dirs_help)
|
||||||
|
|
||||||
|
parser.add_argument('-d', '--dir-root',
|
||||||
|
help=dir_root_help)
|
||||||
|
|
||||||
parser.add_argument('-u', '--unsurt',
|
parser.add_argument('-u', '--unsurt',
|
||||||
action='store_true',
|
action='store_true',
|
||||||
help=unsurt_help)
|
help=unsurt_help)
|
||||||
@ -378,6 +408,7 @@ if input is a directory"""
|
|||||||
include_all=cmd.allrecords,
|
include_all=cmd.allrecords,
|
||||||
append_post=cmd.postappend,
|
append_post=cmd.postappend,
|
||||||
recurse=cmd.recurse,
|
recurse=cmd.recurse,
|
||||||
|
rel_root=cmd.dir_root,
|
||||||
cdx09=cmd.cdx09,
|
cdx09=cmd.cdx09,
|
||||||
cdxj=cmd.cdxj,
|
cdxj=cmd.cdxj,
|
||||||
minimal=cmd.minimal_cdxj)
|
minimal=cmd.minimal_cdxj)
|
||||||
|
@ -162,6 +162,18 @@ com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 20
|
|||||||
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
|
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
|
||||||
Total: 4
|
Total: 4
|
||||||
|
|
||||||
|
# test custom root dir for cdx filenames, single warc
|
||||||
|
>>> cli_lines(['--dir-root', get_test_dir() + 'other/', TEST_WARC_DIR + 'example.warc.gz'])
|
||||||
|
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 ../warcs/example.warc.gz
|
||||||
|
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 ../warcs/example.warc.gz
|
||||||
|
Total: 4
|
||||||
|
|
||||||
|
# test custom root dir for cdx filenames, dir input
|
||||||
|
>>> cli_lines(['--sort', '--dir-root', get_test_dir() + 'other/', TEST_WARC_DIR])
|
||||||
|
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 ../warcs/example-url-agnostic-revisit.warc.gz
|
||||||
|
org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 ../warcs/example-url-agnostic-orig.warc.gz
|
||||||
|
Total: 206
|
||||||
|
|
||||||
# test writing to temp dir, also use unicode filename
|
# test writing to temp dir, also use unicode filename
|
||||||
>>> cli_lines_with_dir(unicode(TEST_WARC_DIR + 'example.warc.gz'))
|
>>> cli_lines_with_dir(unicode(TEST_WARC_DIR + 'example.warc.gz'))
|
||||||
example.cdx
|
example.cdx
|
||||||
|
@ -5,6 +5,9 @@ import sys
|
|||||||
|
|
||||||
import webtest
|
import webtest
|
||||||
|
|
||||||
|
import time
|
||||||
|
import threading
|
||||||
|
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
|
|
||||||
from pywb.webapp.pywb_init import create_wb_router
|
from pywb.webapp.pywb_init import create_wb_router
|
||||||
@ -22,7 +25,9 @@ from mock import patch
|
|||||||
#=============================================================================
|
#=============================================================================
|
||||||
ARCHIVE_DIR = 'archive'
|
ARCHIVE_DIR = 'archive'
|
||||||
INDEX_DIR = 'indexes'
|
INDEX_DIR = 'indexes'
|
||||||
|
|
||||||
INDEX_FILE = 'index.cdxj'
|
INDEX_FILE = 'index.cdxj'
|
||||||
|
AUTOINDEX_FILE = 'autoindex.cdxj'
|
||||||
|
|
||||||
|
|
||||||
#=============================================================================
|
#=============================================================================
|
||||||
@ -432,6 +437,56 @@ class TestManagedColls(object):
|
|||||||
# Nothing else to migrate
|
# Nothing else to migrate
|
||||||
main(['migrate', migrate_dir])
|
main(['migrate', migrate_dir])
|
||||||
|
|
||||||
|
def test_auto_index(self):
    """Run `autoindex` across all collections, then for one collection.

    A background thread copies two sample archives into the 'auto'
    collection (one of them into a sub-directory) while the watcher runs,
    then clears the watcher's keep_running flag so main() returns.
    """
    main(['init', 'auto'])

    coll_dir = os.path.join(self.root_dir, 'collections', 'auto')
    warc_dir = os.path.join(coll_dir, ARCHIVE_DIR)
    warc_sub_dir = os.path.join(warc_dir, 'sub')
    os.makedirs(warc_sub_dir)

    def set_keep_running(flag):
        import pywb.manager.autoindex
        pywb.manager.autoindex.keep_running = flag

    def copy_then_stop():
        try:
            time.sleep(1)
            shutil.copy(self._get_sample_warc('example.warc.gz'),
                        warc_dir)
            shutil.copy(self._get_sample_warc('example-extra.warc'),
                        warc_sub_dir)
            time.sleep(1)
        finally:
            # always release the watcher so main() can return
            set_keep_running(False)

    def start_copier():
        copier = threading.Thread(target=copy_then_stop)
        copier.daemon = True
        copier.start()

    # First pass: no collection given, so all collections are watched
    start_copier()
    main(['autoindex'])

    index_file = os.path.join(coll_dir, INDEX_DIR, AUTOINDEX_FILE)
    assert os.path.isfile(index_file)

    with open(index_file) as fh:
        index = fh.read()

    assert '"example.warc.gz' in index
    assert '"sub/example-extra.warc' in index, index

    mtime = os.path.getmtime(index_file)

    # Second pass: re-arm the watcher and watch only the 'auto' collection
    set_keep_running(True)

    os.remove(index_file)

    start_copier()
    main(['autoindex', 'auto'])

    # assert file was updated
    assert os.path.getmtime(index_file) > mtime
|
||||||
|
|
||||||
def test_err_template_remove(self):
|
def test_err_template_remove(self):
|
||||||
""" Test various error conditions for templates:
|
""" Test various error conditions for templates:
|
||||||
invalid template name, no collection for collection template
|
invalid template name, no collection for collection template
|
||||||
|
Loading…
x
Reference in New Issue
Block a user