From 30454abb6b0fc26819bebc80496706bf5e5e2672 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 15 Mar 2015 21:20:00 -0700 Subject: [PATCH] metadata: add support for user-defined per-collection metadata! #78 metadata stored in wbrequest.user_metadata and available to all templates collections manager: refactor to use subparsers, add list collections and set metadata commands update tests for new commands index template: use user metadata title for collections listing search template: display all metadata and title, if available --- pywb/framework/archivalrouter.py | 6 +- pywb/framework/wbrequestresponse.py | 5 +- pywb/manager/manager.py | 148 ++++++++++++++++++++-------- pywb/ui/index.html | 23 ++--- pywb/ui/search.html | 15 ++- pywb/webapp/pywb_init.py | 40 +++++--- pywb/webapp/views.py | 6 ++ setup.py | 3 +- tests/test_auto_colls.py | 87 ++++++++++++---- 9 files changed, 235 insertions(+), 98 deletions(-) diff --git a/pywb/framework/archivalrouter.py b/pywb/framework/archivalrouter.py index 801636ad..43365abe 100644 --- a/pywb/framework/archivalrouter.py +++ b/pywb/framework/archivalrouter.py @@ -66,7 +66,8 @@ class ArchivalRouter(object): wburl_class=route.handler.get_wburl_type(), urlrewriter_class=self.urlrewriter_class, cookie_scope=route.cookie_scope, - rewrite_opts=route.rewrite_opts) + rewrite_opts=route.rewrite_opts, + user_metadata=route.user_metadata) # Allow for applying of additional filters route.apply_filters(wbrequest, matcher) @@ -100,12 +101,15 @@ class Route(object): self.regex = re.compile(regex + lookahead) else: self.regex = re.compile('') + self.handler = handler self.request_class = request_class + # collection id from regex group (default 0) self.coll_group = coll_group self.cookie_scope = config.get('cookie_scope') self.rewrite_opts = config.get('rewrite_opts', {}) + self.user_metadata = config.get('metadata', {}) self._custom_init(config) def is_handling(self, request_uri): diff --git a/pywb/framework/wbrequestresponse.py b/pywb/framework/wbrequestresponse.py index 95a07e66..e5c9fbaa 100644 --- a/pywb/framework/wbrequestresponse.py +++ b/pywb/framework/wbrequestresponse.py @@ -40,7 +40,9 @@ class WbRequest(object): urlrewriter_class=None, is_proxy=False, cookie_scope=None, - rewrite_opts={}): + rewrite_opts={}, + user_metadata={}, + ): self.env = env @@ -96,6 +98,7 @@ class WbRequest(object): self.query_filter = [] self.custom_params = {} + self.user_metadata = user_metadata # PERF env['X_PERF'] = {} diff --git a/pywb/manager/manager.py b/pywb/manager/manager.py index 12bde6c3..d51bf953 100644 --- a/pywb/manager/manager.py +++ b/pywb/manager/manager.py @@ -9,6 +9,7 @@ from pywb.warc.cdxindexer import main as cdxindexer_main from argparse import ArgumentParser, RawTextHelpFormatter import heapq +import yaml #============================================================================= @@ -19,7 +20,7 @@ simplify the creation and management of web archive collections It may be used via cmdline to setup and maintain the directory structure expected by pywb """ - def __init__(self, coll_name, root_dir='collections'): + def __init__(self, coll_name, root_dir='collections', must_exist=True): self.root_dir = root_dir self.default_config = load_yaml_config('pywb/default_config.yaml') self.coll_name = coll_name @@ -30,6 +31,14 @@ directory structure expected by pywb self.cdx_dir = self._get_dir('index_paths') self.static_dir = self._get_dir('static_path') self.templates_dir = self._get_dir('templates_dir') + if must_exist: + self._assert_coll_exists() + + def list_colls(self): + print('Collections:') + for d in os.listdir(self.root_dir): + if os.path.isdir(os.path.join(self.root_dir, d)): + print('- ' + d) def _get_dir(self, name): return os.path.join(self.coll_dir, @@ -50,18 +59,15 @@ directory structure expected by pywb self._create_dir(self.static_dir) self._create_dir(self.templates_dir) + def _assert_coll_exists(self): + if not os.path.isdir(self.coll_dir): + raise IOError('Collection {0} does not exist'. + format(self.coll_name)) + def add_warcs(self, warcs): if not os.path.isdir(self.warc_dir): - if not os.path.isdir(self.coll_dir): - raise IOError('Collection {0} does not exist'. - format(self.coll_name)) - else: - raise IOError('Directory {0} does not exist'. - format(self.warc_dir)) - - if not warcs: - logging.info('No WARCs specified') - return + raise IOError('Directory {0} does not exist'. + format(self.warc_dir)) full_paths = [] for filename in warcs: @@ -99,9 +105,6 @@ directory structure expected by pywb self._index_merge_warcs(filtered_warcs) def _index_merge_warcs(self, new_warcs): - if not new_warcs: - return - cdx_file = os.path.join(self.cdx_dir, 'index.cdx') # no existing file, just reindex all @@ -128,50 +131,109 @@ directory structure expected by pywb os.rename(merged_file, cdx_file) os.remove(temp_file) + def set_metadata(self, namevalue_pairs): + metadata_yaml = os.path.join(self.coll_dir, 'metadata.yaml') + metadata = None + if os.path.isfile(metadata_yaml): + with open(metadata_yaml) as fh: + metadata = yaml.safe_load(fh) + if not metadata: + metadata = {} + + msg = 'Metadata params must be in the form "name=value"' + for pair in namevalue_pairs: + v = pair.split('=', 1) + if len(v) != 2: + raise ValueError(msg) + + metadata[v[0]] = v[1] + + with open(metadata_yaml, 'w+b') as fh: + fh.write(yaml.dump(metadata, default_flow_style=False)) + + +#============================================================================= def main(args=None): description = """ Create manage file based web archive collections """ - - epilog = """ -Some examples: - -* Create new collection 'my_coll' -{0} create my_coll - -* Add warc mywarc1.warc.gz to my_coll (The warc will be copied to the collecton directory) -{0} add my_coll mywarc1.warc.gz - -""".format(os.path.basename(sys.argv[0])) + #format(os.path.basename(sys.argv[0])) logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s', level=logging.DEBUG) parser = ArgumentParser(description=description, - epilog=epilog, + #epilog=epilog, formatter_class=RawTextHelpFormatter) - group = parser.add_mutually_exclusive_group() - group.add_argument('--init', action='store_true') - group.add_argument('--addwarc', action='store_true') - group.add_argument('--reindex', action='store_true') - group.add_argument('--index-warcs', action='store_true') + subparsers = parser.add_subparsers(dest='type') - parser.add_argument('name') - parser.add_argument('files', nargs='*') + # Init Coll + def do_init(r): + m = CollectionsManager(r.coll_name, must_exist=False) + m.add_collection() + + init_help = 'Init new collection, create all collection directories' + init = subparsers.add_parser('init', help=init_help) + init.add_argument('coll_name') + init.set_defaults(func=do_init) + + # List Colls + def do_list(r): + m = CollectionsManager('', must_exist=False) + m.list_colls() + + list_help = 'List Collections' + listcmd = subparsers.add_parser('list', help=list_help) + listcmd.set_defaults(func=do_list) + + # Add Warcs + def do_add(r): + m = CollectionsManager(r.coll_name) + m.add_warcs(r.files) + + addwarc_help = 'Copy ARCS/WARCS to collection directory and reindex' + addwarc = subparsers.add_parser('add', help=addwarc_help) + addwarc.add_argument('coll_name') + addwarc.add_argument('files', nargs='+') + addwarc.set_defaults(func=do_add) + + + # Reindex All + def do_reindex(r): + m = CollectionsManager(r.coll_name) + m.reindex() + + reindex_help = 'Re-Index entire collection' + reindex = subparsers.add_parser('reindex', help=reindex_help) + reindex.add_argument('coll_name') + reindex.set_defaults(func=do_reindex) + + # Index warcs + def do_index(r): + m = CollectionsManager(r.coll_name) + m.index_merge(r.files) + + indexwarcs_help = 'Index specified ARC/WARC files in the collection' + indexwarcs = subparsers.add_parser('index', help=indexwarcs_help) + indexwarcs.add_argument('coll_name') + indexwarcs.add_argument('files', nargs='+') + indexwarcs.set_defaults(func=do_index) + + # Set metadata + def do_metadata(r): + m = CollectionsManager(r.coll_name) + m.set_metadata(r.set) + + metadata_help = 'Set Metadata' + metadata = subparsers.add_parser('metadata', help=metadata_help) + metadata.add_argument('coll_name') + metadata.add_argument('--set', nargs='+') + metadata.set_defaults(func=do_metadata) r = parser.parse_args(args=args) - - m = CollectionsManager(r.name) - if r.init: - m.add_collection() - elif r.addwarc: - m.add_warcs(r.files) - elif r.index_warcs: - m.index_merge(r.files) - elif r.reindex: - m.reindex() + r.func(r) if __name__ == "__main__": diff --git a/pywb/ui/index.html b/pywb/ui/index.html index 3a8ff0c9..af9db3cb 100644 --- a/pywb/ui/index.html +++ b/pywb/ui/index.html @@ -1,22 +1,17 @@ -

pywb Sample Home Page

+

pywb Wayback Machine

-The following archive collections are available: +This archive contains the following collections: - -Other endpoints in this deployment: - - diff --git a/pywb/ui/search.html b/pywb/ui/search.html index 2e1e5b36..94804d39 100644 --- a/pywb/ui/search.html +++ b/pywb/ui/search.html @@ -1,6 +1,17 @@ -

pywb Search Page

-Search Archived Content: +

{{ wbrequest.user_metadata.title if wbrequest.user_metadata.title else wbrequest.coll }} Search Page

+ +
+ +{% for key, val in wbrequest.user_metadata.iteritems() %} + +{% endfor %} +
{{ key }}:{{ val }}
+
+ +

+Search this collection by url:

+

diff --git a/pywb/webapp/pywb_init.py b/pywb/webapp/pywb_init.py index eef703d0..1603e805 100644 --- a/pywb/webapp/pywb_init.py +++ b/pywb/webapp/pywb_init.py @@ -143,13 +143,13 @@ class DirectoryCollsLoader(object): if not os.path.isdir(full): continue - coll = self.load_dir(full, name) - if coll: - colls[name] = coll + coll_config = self.load_coll_dir(full, name) + if coll_config: + colls[name] = coll_config return colls - def _add_if_exists(self, coll, root_dir, dir_key, required=False): + def _add_dir_if_exists(self, coll, root_dir, dir_key, required=False): if dir_key in coll: # already set return False @@ -168,18 +168,26 @@ class DirectoryCollsLoader(object): else: return False - def load_dir(self, root_dir, name): - config_file = os.path.join(root_dir, 'config.yaml') - if os.path.isfile(config_file): - coll = load_yaml_config(config_file) + def load_yaml_file(self, root_dir, filename): + filename = os.path.join(root_dir, filename) + if os.path.isfile(filename): + return load_yaml_config(filename) else: - coll = {} + return {} - self._add_if_exists(coll, root_dir, 'index_paths', True) - self._add_if_exists(coll, root_dir, 'archive_paths', True) + def load_coll_dir(self, root_dir, name): + # Load config.yaml + coll_config = self.load_yaml_file(root_dir, 'config.yaml') - if self._add_if_exists(coll, root_dir, 'static_path', False): - self.static_routes['static/' + name] = coll['static_path'] + # Load metadata.yaml + metadata = self.load_yaml_file(root_dir, 'metadata.yaml') + coll_config['metadata'] = metadata + + self._add_dir_if_exists(coll_config, root_dir, 'index_paths', True) + self._add_dir_if_exists(coll_config, root_dir, 'archive_paths', True) + + if self._add_dir_if_exists(coll_config, root_dir, 'static_path', False): + self.static_routes['static/' + name] = coll_config['static_path'] # Add templates templates_dir = self.config.get('paths').get('templates_dir') @@ -187,15 +195,15 @@ class DirectoryCollsLoader(object): template_dir = os.path.join(root_dir, templates_dir) if template_dir: for tname, tfile in self.config.get('paths')['template_files'].iteritems(): - if tname in coll: + if tname in coll_config: # Already set continue full = os.path.join(template_dir, tfile) if os.path.isfile(full): - coll[tname] = full + coll_config[tname] = full - return coll + return coll_config #================================================================= diff --git a/pywb/webapp/views.py b/pywb/webapp/views.py index 036977a1..44dcb052 100644 --- a/pywb/webapp/views.py +++ b/pywb/webapp/views.py @@ -5,6 +5,7 @@ from pywb.framework.memento import make_timemap, LINK_FORMAT import urlparse import urllib import logging +import json from os import path from itertools import imap @@ -59,6 +60,11 @@ def is_wb_handler(obj): return obj.handler.__class__.__name__ == "WBHandler" +@template_filter() +def jsonify(obj): + return json.dumps(obj) + + #================================================================= class J2TemplateView(object): env_globals = {'static_path': 'static/default', diff --git a/setup.py b/setup.py index 1eeb424d..52714325 100755 --- a/setup.py +++ b/setup.py @@ -42,7 +42,7 @@ setup( long_description=long_description, license='GPL', packages=find_packages(), - #include_package_data=True, + zip_safe=True, provides=[ 'pywb', 'pywb.utils', @@ -92,7 +92,6 @@ setup( proxy-cert-auth = pywb.framework.certauth:main wb-manager = pywb.manager.manager:main """, - zip_safe=True, classifiers=[ 'Development Status :: 4 - Beta', 'Environment :: Web Environment', diff --git a/tests/test_auto_colls.py b/tests/test_auto_colls.py index 01fb78ec..bcadf064 100644 --- a/tests/test_auto_colls.py +++ b/tests/test_auto_colls.py @@ -1,9 +1,12 @@ import os import tempfile import shutil +import sys import webtest +from io import BytesIO + from pywb.webapp.pywb_init import create_wb_router from pywb.manager.manager import main @@ -53,7 +56,7 @@ class TestManagedColls(object): def test_create_first_coll(self): """ Test first collection creation, with all required dirs """ - main(['--init', 'test']) + main(['init', 'test']) colls = os.path.join(self.root_dir, 'collections') assert os.path.isdir(colls) @@ -68,7 +71,7 @@ class TestManagedColls(object): """ warc1 = os.path.join(get_test_dir(), 'warcs', 'example.warc.gz') - main(['--addwarc', 'test', warc1]) + main(['add', 'test', warc1]) self._create_app() resp = self.testapp.get('/test/20140103030321/http://example.com?example=1') @@ -79,9 +82,9 @@ class TestManagedColls(object): """ warc1 = os.path.join(get_test_dir(), 'warcs', 'example.warc.gz') - main(['--init', 'foo']) + main(['init', 'foo']) - main(['--addwarc', 'foo', warc1]) + main(['add', 'foo', warc1]) self._create_app() resp = self.testapp.get('/foo/20140103030321/http://example.com?example=1') @@ -93,17 +96,14 @@ class TestManagedColls(object): warc1 = os.path.join(get_test_dir(), 'warcs', 'iana.warc.gz') warc2 = os.path.join(get_test_dir(), 'warcs', 'example-extra.warc') - main(['--addwarc', 'test', warc1, warc2]) + main(['add', 'test', warc1, warc2]) # Spurrious file in collections with open(os.path.join(self.root_dir, 'collections', 'blah'), 'w+b') as fh: fh.write('foo\n') with raises(IOError): - main(['--addwarc', 'test', 'non-existent-file.warc.gz']) - - # check adding no warc -- no op - main(['--addwarc', 'test']) + main(['add', 'test', 'non-existent-file.warc.gz']) # check new cdx self._create_app() @@ -116,7 +116,7 @@ class TestManagedColls(object): Ensure CDX is relative to root archive dir, test replay """ - main(['--init', 'nested']) + main(['init', 'nested']) nested_root = os.path.join(self.root_dir, 'collections', 'nested', 'warcs') nested_a = os.path.join(nested_root, 'A') @@ -131,7 +131,7 @@ class TestManagedColls(object): shutil.copy2(warc1, nested_a) shutil.copy2(warc2, nested_b) - main(['--index-warcs', + main(['index', 'nested', os.path.join(nested_a, 'iana.warc.gz'), os.path.join(nested_b, 'example.warc.gz') @@ -162,7 +162,7 @@ class TestManagedColls(object): shutil.copy(orig, bak) - main(['--reindex', 'test']) + main(['reindex', 'test']) with open(orig) as orig_fh: merged_cdx = orig_fh.read() @@ -187,6 +187,39 @@ class TestManagedColls(object): assert resp.content_type == 'application/javascript' assert '/* Some JS File */' in resp.body + def test_add_title_metadata_index_page(self): + """ Test adding title metadata to a collection, test + retrieval on default index page + """ + main(['metadata', 'foo', '--set', 'title=Collection Title']) + + self._create_app() + resp = self.testapp.get('/') + assert resp.status_int == 200 + assert resp.content_type == 'text/html' + assert '(Collection Title)' in resp.body + + def test_other_metadata_search_page(self): + main(['metadata', 'foo', '--set', + 'desc=Some Description Text', + 'other=custom value']) + + with raises(ValueError): + main(['metadata', 'foo', '--set', 'name_only']) + + self._create_app() + resp = self.testapp.get('/foo/') + assert resp.status_int == 200 + assert resp.content_type == 'text/html' + + assert 'Collection Title' in resp.body + + assert 'desc' in resp.body + assert 'Some Description Text' in resp.body + + assert 'other' in resp.body + assert 'custom value' in resp.body + def test_custom_template_search(self): """ Test manually added custom search template search.html """ @@ -219,7 +252,6 @@ class TestManagedColls(object): assert resp.content_type == 'text/html' assert 'config.yaml overriden search page' in resp.body - def test_no_templates(self): """ Test removing templates dir, using default template again """ @@ -232,28 +264,45 @@ class TestManagedColls(object): assert resp.content_type == 'text/html' assert 'pywb custom search page' not in resp.body + def test_list_colls(self): + """ Test collection listing, printed to stdout + """ + orig_stdout = sys.stdout + buff = BytesIO() + sys.stdout = buff + main(['list']) + sys.stdout = orig_stdout + + output = buff.getvalue().splitlines() + assert len(output) == 4 + assert 'Collections' in output[0] + assert 'foo' in output[1] + assert 'nested' in output[2] + assert 'test' in output[3] + def test_err_no_such_coll(self): """ Test error adding warc to non-existant collection """ warc1 = os.path.join(get_test_dir(), 'warcs', 'example.warc.gz') with raises(IOError): - main(['--addwarc', 'bar', warc1]) + main(['add', 'bar', warc1]) def test_err_wrong_warcs(self): warc1 = os.path.join(get_test_dir(), 'warcs', 'example.warc.gz') invalid_warc = os.path.join(self.root_dir, 'collections', 'test', 'warcs', 'invalid.warc.gz') - # Empty - main(['--index-warcs', 'test']) + # Empty warc list, argparse calls exit + with raises(SystemExit): + main(['index', 'test']) # Wrong paths not in collection with raises(IOError): - main(['--index-warcs', 'test', warc1]) + main(['index', 'test', warc1]) # Non-existent with raises(IOError): - main(['--index-warcs', 'test', invalid_warc]) + main(['index', 'test', invalid_warc]) def test_err_missing_dirs(self): """ Test various errors with missing warcs dir, @@ -266,7 +315,7 @@ class TestManagedColls(object): shutil.rmtree(warcs_path) with raises(IOError): - main(['--addwarc', 'foo', 'somewarc']) + main(['add', 'foo', 'somewarc']) # No CDX cdx_path = os.path.join(colls, 'foo', 'cdx')