1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

metadata: add support for user-defined per-collection metadata! #78

metadata stored in wbrequest.user_metadata and available to all templates

collections manager: refactor to use subparsers, add list collections and set metadata commands
update tests for new commands
index template: use user metadata title for collections listing
search template: display all metadata and title, if available
This commit is contained in:
Ilya Kreymer 2015-03-15 21:20:00 -07:00
parent b417b47835
commit 30454abb6b
9 changed files with 235 additions and 98 deletions

View File

@ -66,7 +66,8 @@ class ArchivalRouter(object):
wburl_class=route.handler.get_wburl_type(),
urlrewriter_class=self.urlrewriter_class,
cookie_scope=route.cookie_scope,
rewrite_opts=route.rewrite_opts)
rewrite_opts=route.rewrite_opts,
user_metadata=route.user_metadata)
# Allow for applying of additional filters
route.apply_filters(wbrequest, matcher)
@ -100,12 +101,15 @@ class Route(object):
self.regex = re.compile(regex + lookahead)
else:
self.regex = re.compile('')
self.handler = handler
self.request_class = request_class
# collection id from regex group (default 0)
self.coll_group = coll_group
self.cookie_scope = config.get('cookie_scope')
self.rewrite_opts = config.get('rewrite_opts', {})
self.user_metadata = config.get('metadata', {})
self._custom_init(config)
def is_handling(self, request_uri):

View File

@ -40,7 +40,9 @@ class WbRequest(object):
urlrewriter_class=None,
is_proxy=False,
cookie_scope=None,
rewrite_opts={}):
rewrite_opts={},
user_metadata={},
):
self.env = env
@ -96,6 +98,7 @@ class WbRequest(object):
self.query_filter = []
self.custom_params = {}
self.user_metadata = user_metadata
# PERF
env['X_PERF'] = {}

View File

@ -9,6 +9,7 @@ from pywb.warc.cdxindexer import main as cdxindexer_main
from argparse import ArgumentParser, RawTextHelpFormatter
import heapq
import yaml
#=============================================================================
@ -19,7 +20,7 @@ simplify the creation and management of web archive collections
It may be used via cmdline to setup and maintain the
directory structure expected by pywb
"""
def __init__(self, coll_name, root_dir='collections'):
def __init__(self, coll_name, root_dir='collections', must_exist=True):
self.root_dir = root_dir
self.default_config = load_yaml_config('pywb/default_config.yaml')
self.coll_name = coll_name
@ -30,6 +31,14 @@ directory structure expected by pywb
self.cdx_dir = self._get_dir('index_paths')
self.static_dir = self._get_dir('static_path')
self.templates_dir = self._get_dir('templates_dir')
if must_exist:
self._assert_coll_exists()
def list_colls(self):
    """Print the name of every collection directory under ``self.root_dir``.

    Only immediate sub-directories are listed; plain files found in the
    root directory are ignored. Names are printed in sorted order because
    ``os.listdir`` returns entries in arbitrary order, which would make
    the listing differ from one filesystem to another.
    """
    print('Collections:')
    for name in sorted(os.listdir(self.root_dir)):
        if os.path.isdir(os.path.join(self.root_dir, name)):
            print('- ' + name)
def _get_dir(self, name):
return os.path.join(self.coll_dir,
@ -50,18 +59,15 @@ directory structure expected by pywb
self._create_dir(self.static_dir)
self._create_dir(self.templates_dir)
def _assert_coll_exists(self):
    """Raise ``IOError`` unless ``self.coll_dir`` is an existing directory."""
    if os.path.isdir(self.coll_dir):
        return

    message = 'Collection {0} does not exist'.format(self.coll_name)
    raise IOError(message)
def add_warcs(self, warcs):
if not os.path.isdir(self.warc_dir):
if not os.path.isdir(self.coll_dir):
raise IOError('Collection {0} does not exist'.
format(self.coll_name))
else:
raise IOError('Directory {0} does not exist'.
format(self.warc_dir))
if not warcs:
logging.info('No WARCs specified')
return
raise IOError('Directory {0} does not exist'.
format(self.warc_dir))
full_paths = []
for filename in warcs:
@ -99,9 +105,6 @@ directory structure expected by pywb
self._index_merge_warcs(filtered_warcs)
def _index_merge_warcs(self, new_warcs):
if not new_warcs:
return
cdx_file = os.path.join(self.cdx_dir, 'index.cdx')
# no existing file, just reindex all
@ -128,50 +131,109 @@ directory structure expected by pywb
os.rename(merged_file, cdx_file)
os.remove(temp_file)
def set_metadata(self, namevalue_pairs):
    """Merge ``name=value`` pairs into the collection's ``metadata.yaml``.

    Entries already stored in ``<coll_dir>/metadata.yaml`` are kept;
    an entry whose name is given again is overwritten.

    :param namevalue_pairs: iterable of ``"name=value"`` strings. Only the
        first ``=`` separates the name from the value. ``None`` or an
        empty sequence (e.g. the ``metadata`` command invoked without
        ``--set``) is a no-op and leaves the file untouched.
    :raises ValueError: if any pair contains no ``=``. Every pair is
        validated before the metadata file is read or written.
    """
    msg = 'Metadata params must be in the form "name=value"'

    # Validate everything up front so a bad pair never causes partial work
    updates = {}
    for pair in namevalue_pairs or ():
        name, sep, value = pair.partition('=')
        if not sep:
            raise ValueError(msg)
        updates[name] = value

    if not updates:
        return

    metadata_yaml = os.path.join(self.coll_dir, 'metadata.yaml')
    metadata = None
    if os.path.isfile(metadata_yaml):
        with open(metadata_yaml) as fh:
            metadata = yaml.safe_load(fh)

    # A missing or empty file yields no mapping: start from scratch
    if not metadata:
        metadata = {}

    metadata.update(updates)

    # Text mode: yaml.dump() returns text here, which a binary-mode
    # file object rejects on Python 3
    with open(metadata_yaml, 'w') as fh:
        fh.write(yaml.dump(metadata, default_flow_style=False))
#=============================================================================
def main(args=None):
description = """
Create manage file based web archive collections
"""
epilog = """
Some examples:
* Create new collection 'my_coll'
{0} create my_coll
* Add warc mywarc1.warc.gz to my_coll (The warc will be copied to the collecton directory)
{0} add my_coll mywarc1.warc.gz
""".format(os.path.basename(sys.argv[0]))
#format(os.path.basename(sys.argv[0]))
logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s',
level=logging.DEBUG)
parser = ArgumentParser(description=description,
epilog=epilog,
#epilog=epilog,
formatter_class=RawTextHelpFormatter)
group = parser.add_mutually_exclusive_group()
group.add_argument('--init', action='store_true')
group.add_argument('--addwarc', action='store_true')
group.add_argument('--reindex', action='store_true')
group.add_argument('--index-warcs', action='store_true')
subparsers = parser.add_subparsers(dest='type')
parser.add_argument('name')
parser.add_argument('files', nargs='*')
# Init Coll
def do_init(r):
m = CollectionsManager(r.coll_name, must_exist=False)
m.add_collection()
init_help = 'Init new collection, create all collection directories'
init = subparsers.add_parser('init', help=init_help)
init.add_argument('coll_name')
init.set_defaults(func=do_init)
# List Colls
def do_list(r):
m = CollectionsManager('', must_exist=False)
m.list_colls()
list_help = 'List Collections'
listcmd = subparsers.add_parser('list', help=list_help)
listcmd.set_defaults(func=do_list)
# Add Warcs
def do_add(r):
m = CollectionsManager(r.coll_name)
m.add_warcs(r.files)
addwarc_help = 'Copy ARCS/WARCS to collection directory and reindex'
addwarc = subparsers.add_parser('add', help=addwarc_help)
addwarc.add_argument('coll_name')
addwarc.add_argument('files', nargs='+')
addwarc.set_defaults(func=do_add)
# Reindex All
def do_reindex(r):
m = CollectionsManager(r.coll_name)
m.reindex()
reindex_help = 'Re-Index entire collection'
reindex = subparsers.add_parser('reindex', help=reindex_help)
reindex.add_argument('coll_name')
reindex.set_defaults(func=do_reindex)
# Index warcs
def do_index(r):
m = CollectionsManager(r.coll_name)
m.index_merge(r.files)
indexwarcs_help = 'Index specified ARC/WARC files in the collection'
indexwarcs = subparsers.add_parser('index', help=indexwarcs_help)
indexwarcs.add_argument('coll_name')
indexwarcs.add_argument('files', nargs='+')
indexwarcs.set_defaults(func=do_index)
# Set metadata
def do_metadata(r):
m = CollectionsManager(r.coll_name)
m.set_metadata(r.set)
metadata_help = 'Set Metadata'
metadata = subparsers.add_parser('metadata', help=metadata_help)
metadata.add_argument('coll_name')
metadata.add_argument('--set', nargs='+')
metadata.set_defaults(func=do_metadata)
r = parser.parse_args(args=args)
m = CollectionsManager(r.name)
if r.init:
m.add_collection()
elif r.addwarc:
m.add_warcs(r.files)
elif r.index_warcs:
m.index_merge(r.files)
elif r.reindex:
m.reindex()
r.func(r)
if __name__ == "__main__":

View File

@ -1,22 +1,17 @@
<h2>pywb Sample Home Page</h2>
<h2>pywb Wayback Machine</h2>
The following archive collections are available:
This archive contains the following collections:
<ul>
{% for route in routes %}
{% if route | is_wb_handler %}
<li><a href="{{ '/' + route.path }}">{{ '/' + route.path }}</a>: {{ route | string }}</li>
{% endif %}
{% endfor %}
</ul>
Other endpoints in this deployment:
<ul>
{% for route in routes %}
{% if not route | is_wb_handler %}
<li><b>{{ '/' + route.path }}</b> - {{ route | string }}</li>
{% endif %}
<li>
<a href="{{ '/' + route.path }}">{{ '/' + route.path }}</a>
{% if route.user_metadata.title is defined %}
({{ route.user_metadata.title }})
{% endif %}
</li>
{% endif %}
{% endfor %}
</ul>

View File

@ -1,6 +1,17 @@
<h2>pywb Search Page</h2>
Search Archived Content:
<h2>{{ wbrequest.user_metadata.title if wbrequest.user_metadata.title else wbrequest.coll }} Search Page</h2>
<div>
<table style="text-align: left">
{% for key, val in wbrequest.user_metadata.iteritems() %}
<tr><th>{{ key }}:</th><td>{{ val }}</td>
{% endfor %}
</table>
</div>
<p>
Search this collection by url:
<form onsubmit="url = document.getElementById('search').value; if (url != '') { document.location.href = '{{ wbrequest.wb_prefix }}' + '*/' + url; } return false;">
<input id="search" name="search" placeholder="Enter url to search"/>
<button type="submit">Search</button>
</form>
</p>

View File

@ -143,13 +143,13 @@ class DirectoryCollsLoader(object):
if not os.path.isdir(full):
continue
coll = self.load_dir(full, name)
if coll:
colls[name] = coll
coll_config = self.load_coll_dir(full, name)
if coll_config:
colls[name] = coll_config
return colls
def _add_if_exists(self, coll, root_dir, dir_key, required=False):
def _add_dir_if_exists(self, coll, root_dir, dir_key, required=False):
if dir_key in coll:
# already set
return False
@ -168,18 +168,26 @@ class DirectoryCollsLoader(object):
else:
return False
def load_dir(self, root_dir, name):
config_file = os.path.join(root_dir, 'config.yaml')
if os.path.isfile(config_file):
coll = load_yaml_config(config_file)
def load_yaml_file(self, root_dir, filename):
filename = os.path.join(root_dir, filename)
if os.path.isfile(filename):
return load_yaml_config(filename)
else:
coll = {}
return {}
self._add_if_exists(coll, root_dir, 'index_paths', True)
self._add_if_exists(coll, root_dir, 'archive_paths', True)
def load_coll_dir(self, root_dir, name):
# Load config.yaml
coll_config = self.load_yaml_file(root_dir, 'config.yaml')
if self._add_if_exists(coll, root_dir, 'static_path', False):
self.static_routes['static/' + name] = coll['static_path']
# Load metadata.yaml
metadata = self.load_yaml_file(root_dir, 'metadata.yaml')
coll_config['metadata'] = metadata
self._add_dir_if_exists(coll_config, root_dir, 'index_paths', True)
self._add_dir_if_exists(coll_config, root_dir, 'archive_paths', True)
if self._add_dir_if_exists(coll_config, root_dir, 'static_path', False):
self.static_routes['static/' + name] = coll_config['static_path']
# Add templates
templates_dir = self.config.get('paths').get('templates_dir')
@ -187,15 +195,15 @@ class DirectoryCollsLoader(object):
template_dir = os.path.join(root_dir, templates_dir)
if template_dir:
for tname, tfile in self.config.get('paths')['template_files'].iteritems():
if tname in coll:
if tname in coll_config:
# Already set
continue
full = os.path.join(template_dir, tfile)
if os.path.isfile(full):
coll[tname] = full
coll_config[tname] = full
return coll
return coll_config
#=================================================================

View File

@ -5,6 +5,7 @@ from pywb.framework.memento import make_timemap, LINK_FORMAT
import urlparse
import urllib
import logging
import json
from os import path
from itertools import imap
@ -59,6 +60,11 @@ def is_wb_handler(obj):
return obj.handler.__class__.__name__ == "WBHandler"
@template_filter()
def jsonify(obj):
    """Template filter: return ``obj`` serialized as a JSON string."""
    serialized = json.dumps(obj)
    return serialized
#=================================================================
class J2TemplateView(object):
env_globals = {'static_path': 'static/default',

View File

@ -42,7 +42,7 @@ setup(
long_description=long_description,
license='GPL',
packages=find_packages(),
#include_package_data=True,
zip_safe=True,
provides=[
'pywb',
'pywb.utils',
@ -92,7 +92,6 @@ setup(
proxy-cert-auth = pywb.framework.certauth:main
wb-manager = pywb.manager.manager:main
""",
zip_safe=True,
classifiers=[
'Development Status :: 4 - Beta',
'Environment :: Web Environment',

View File

@ -1,9 +1,12 @@
import os
import tempfile
import shutil
import sys
import webtest
from io import BytesIO
from pywb.webapp.pywb_init import create_wb_router
from pywb.manager.manager import main
@ -53,7 +56,7 @@ class TestManagedColls(object):
def test_create_first_coll(self):
""" Test first collection creation, with all required dirs
"""
main(['--init', 'test'])
main(['init', 'test'])
colls = os.path.join(self.root_dir, 'collections')
assert os.path.isdir(colls)
@ -68,7 +71,7 @@ class TestManagedColls(object):
"""
warc1 = os.path.join(get_test_dir(), 'warcs', 'example.warc.gz')
main(['--addwarc', 'test', warc1])
main(['add', 'test', warc1])
self._create_app()
resp = self.testapp.get('/test/20140103030321/http://example.com?example=1')
@ -79,9 +82,9 @@ class TestManagedColls(object):
"""
warc1 = os.path.join(get_test_dir(), 'warcs', 'example.warc.gz')
main(['--init', 'foo'])
main(['init', 'foo'])
main(['--addwarc', 'foo', warc1])
main(['add', 'foo', warc1])
self._create_app()
resp = self.testapp.get('/foo/20140103030321/http://example.com?example=1')
@ -93,17 +96,14 @@ class TestManagedColls(object):
warc1 = os.path.join(get_test_dir(), 'warcs', 'iana.warc.gz')
warc2 = os.path.join(get_test_dir(), 'warcs', 'example-extra.warc')
main(['--addwarc', 'test', warc1, warc2])
main(['add', 'test', warc1, warc2])
# Spurious file in collections
with open(os.path.join(self.root_dir, 'collections', 'blah'), 'w+b') as fh:
fh.write('foo\n')
with raises(IOError):
main(['--addwarc', 'test', 'non-existent-file.warc.gz'])
# check adding no warc -- no op
main(['--addwarc', 'test'])
main(['add', 'test', 'non-existent-file.warc.gz'])
# check new cdx
self._create_app()
@ -116,7 +116,7 @@ class TestManagedColls(object):
Ensure CDX is relative to root archive dir, test replay
"""
main(['--init', 'nested'])
main(['init', 'nested'])
nested_root = os.path.join(self.root_dir, 'collections', 'nested', 'warcs')
nested_a = os.path.join(nested_root, 'A')
@ -131,7 +131,7 @@ class TestManagedColls(object):
shutil.copy2(warc1, nested_a)
shutil.copy2(warc2, nested_b)
main(['--index-warcs',
main(['index',
'nested',
os.path.join(nested_a, 'iana.warc.gz'),
os.path.join(nested_b, 'example.warc.gz')
@ -162,7 +162,7 @@ class TestManagedColls(object):
shutil.copy(orig, bak)
main(['--reindex', 'test'])
main(['reindex', 'test'])
with open(orig) as orig_fh:
merged_cdx = orig_fh.read()
@ -187,6 +187,39 @@ class TestManagedColls(object):
assert resp.content_type == 'application/javascript'
assert '/* Some JS File */' in resp.body
def test_add_title_metadata_index_page(self):
    """ Set a title on the 'foo' collection through the metadata command,
    then check that it appears, in parentheses, on the default index page
    """
    main(['metadata', 'foo', '--set', 'title=Collection Title'])

    self._create_app()
    index_page = self.testapp.get('/')

    assert index_page.status_int == 200
    assert index_page.content_type == 'text/html'
    assert '(Collection Title)' in index_page.body
def test_other_metadata_search_page(self):
    """ Set extra metadata fields on the 'foo' collection, check that a
    param without '=' is rejected with ValueError, then check that the
    '/foo/' page contains the metadata names and values
    """
    main(['metadata', 'foo', '--set',
          'desc=Some Description Text',
          'other=custom value'])

    with raises(ValueError):
        main(['metadata', 'foo', '--set', 'name_only'])

    self._create_app()
    search_page = self.testapp.get('/foo/')

    assert search_page.status_int == 200
    assert search_page.content_type == 'text/html'

    # NOTE(review): 'Collection Title' is not set in this test; presumably
    # it is left over from an earlier test in this class -- confirm ordering
    for expected in ('Collection Title',
                     'desc',
                     'Some Description Text',
                     'other',
                     'custom value'):
        assert expected in search_page.body
def test_custom_template_search(self):
""" Test manually added custom search template search.html
"""
@ -219,7 +252,6 @@ class TestManagedColls(object):
assert resp.content_type == 'text/html'
assert 'config.yaml overriden search page' in resp.body
def test_no_templates(self):
""" Test removing templates dir, using default template again
"""
@ -232,28 +264,45 @@ class TestManagedColls(object):
assert resp.content_type == 'text/html'
assert 'pywb custom search page' not in resp.body
def test_list_colls(self):
    """ Test collection listing, printed to stdout
    """
    orig_stdout = sys.stdout
    buff = BytesIO()
    sys.stdout = buff
    try:
        main(['list'])
    finally:
        # Always restore stdout, even when main() raises, so a failure
        # here cannot leave stdout redirected for the tests that follow
        sys.stdout = orig_stdout

    output = buff.getvalue().splitlines()
    assert len(output) == 4
    assert 'Collections' in output[0]
    assert 'foo' in output[1]
    assert 'nested' in output[2]
    assert 'test' in output[3]
def test_err_no_such_coll(self):
""" Test error adding warc to non-existant collection
"""
warc1 = os.path.join(get_test_dir(), 'warcs', 'example.warc.gz')
with raises(IOError):
main(['--addwarc', 'bar', warc1])
main(['add', 'bar', warc1])
def test_err_wrong_warcs(self):
warc1 = os.path.join(get_test_dir(), 'warcs', 'example.warc.gz')
invalid_warc = os.path.join(self.root_dir, 'collections', 'test', 'warcs', 'invalid.warc.gz')
# Empty
main(['--index-warcs', 'test'])
# Empty warc list, argparse calls exit
with raises(SystemExit):
main(['index', 'test'])
# Wrong paths not in collection
with raises(IOError):
main(['--index-warcs', 'test', warc1])
main(['index', 'test', warc1])
# Non-existent
with raises(IOError):
main(['--index-warcs', 'test', invalid_warc])
main(['index', 'test', invalid_warc])
def test_err_missing_dirs(self):
""" Test various errors with missing warcs dir,
@ -266,7 +315,7 @@ class TestManagedColls(object):
shutil.rmtree(warcs_path)
with raises(IOError):
main(['--addwarc', 'foo', 'somewarc'])
main(['add', 'foo', 'somewarc'])
# No CDX
cdx_path = os.path.join(colls, 'foo', 'cdx')