1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

metadata: add support for user-defined per-collection metadata! #78

metadata stored in wbrequest.user_metadata and available to all templates

collections manager: refactor to use subparsers, add list collections and set metadata commands
update tests for new commands
index template: use user metadata title for collections listing
search template: display all metadata and title, if available
This commit is contained in:
Ilya Kreymer 2015-03-15 21:20:00 -07:00
parent b417b47835
commit 30454abb6b
9 changed files with 235 additions and 98 deletions

View File

@ -66,7 +66,8 @@ class ArchivalRouter(object):
wburl_class=route.handler.get_wburl_type(), wburl_class=route.handler.get_wburl_type(),
urlrewriter_class=self.urlrewriter_class, urlrewriter_class=self.urlrewriter_class,
cookie_scope=route.cookie_scope, cookie_scope=route.cookie_scope,
rewrite_opts=route.rewrite_opts) rewrite_opts=route.rewrite_opts,
user_metadata=route.user_metadata)
# Allow for applying of additional filters # Allow for applying of additional filters
route.apply_filters(wbrequest, matcher) route.apply_filters(wbrequest, matcher)
@ -100,12 +101,15 @@ class Route(object):
self.regex = re.compile(regex + lookahead) self.regex = re.compile(regex + lookahead)
else: else:
self.regex = re.compile('') self.regex = re.compile('')
self.handler = handler self.handler = handler
self.request_class = request_class self.request_class = request_class
# collection id from regex group (default 0) # collection id from regex group (default 0)
self.coll_group = coll_group self.coll_group = coll_group
self.cookie_scope = config.get('cookie_scope') self.cookie_scope = config.get('cookie_scope')
self.rewrite_opts = config.get('rewrite_opts', {}) self.rewrite_opts = config.get('rewrite_opts', {})
self.user_metadata = config.get('metadata', {})
self._custom_init(config) self._custom_init(config)
def is_handling(self, request_uri): def is_handling(self, request_uri):

View File

@ -40,7 +40,9 @@ class WbRequest(object):
urlrewriter_class=None, urlrewriter_class=None,
is_proxy=False, is_proxy=False,
cookie_scope=None, cookie_scope=None,
rewrite_opts={}): rewrite_opts={},
user_metadata={},
):
self.env = env self.env = env
@ -96,6 +98,7 @@ class WbRequest(object):
self.query_filter = [] self.query_filter = []
self.custom_params = {} self.custom_params = {}
self.user_metadata = user_metadata
# PERF # PERF
env['X_PERF'] = {} env['X_PERF'] = {}

View File

@ -9,6 +9,7 @@ from pywb.warc.cdxindexer import main as cdxindexer_main
from argparse import ArgumentParser, RawTextHelpFormatter from argparse import ArgumentParser, RawTextHelpFormatter
import heapq import heapq
import yaml
#============================================================================= #=============================================================================
@ -19,7 +20,7 @@ simplify the creation and management of web archive collections
It may be used via cmdline to setup and maintain the It may be used via cmdline to setup and maintain the
directory structure expected by pywb directory structure expected by pywb
""" """
def __init__(self, coll_name, root_dir='collections'): def __init__(self, coll_name, root_dir='collections', must_exist=True):
self.root_dir = root_dir self.root_dir = root_dir
self.default_config = load_yaml_config('pywb/default_config.yaml') self.default_config = load_yaml_config('pywb/default_config.yaml')
self.coll_name = coll_name self.coll_name = coll_name
@ -30,6 +31,14 @@ directory structure expected by pywb
self.cdx_dir = self._get_dir('index_paths') self.cdx_dir = self._get_dir('index_paths')
self.static_dir = self._get_dir('static_path') self.static_dir = self._get_dir('static_path')
self.templates_dir = self._get_dir('templates_dir') self.templates_dir = self._get_dir('templates_dir')
if must_exist:
self._assert_coll_exists()
def list_colls(self):
    """Print the name of every collection directory under ``self.root_dir``.

    A 'Collections:' header is printed first, followed by one '- <name>'
    line per sub-directory. Plain files in the root dir are skipped.
    Entries are sorted, since ``os.listdir`` returns them in arbitrary
    order and callers may read the output positionally.
    """
    print('Collections:')
    for entry in sorted(os.listdir(self.root_dir)):
        if os.path.isdir(os.path.join(self.root_dir, entry)):
            print('- ' + entry)
def _get_dir(self, name): def _get_dir(self, name):
return os.path.join(self.coll_dir, return os.path.join(self.coll_dir,
@ -50,18 +59,15 @@ directory structure expected by pywb
self._create_dir(self.static_dir) self._create_dir(self.static_dir)
self._create_dir(self.templates_dir) self._create_dir(self.templates_dir)
def _assert_coll_exists(self):
    """Raise IOError unless ``self.coll_dir`` is an existing directory."""
    if os.path.isdir(self.coll_dir):
        return

    message = 'Collection {0} does not exist'.format(self.coll_name)
    raise IOError(message)
def add_warcs(self, warcs): def add_warcs(self, warcs):
if not os.path.isdir(self.warc_dir): if not os.path.isdir(self.warc_dir):
if not os.path.isdir(self.coll_dir): raise IOError('Directory {0} does not exist'.
raise IOError('Collection {0} does not exist'. format(self.warc_dir))
format(self.coll_name))
else:
raise IOError('Directory {0} does not exist'.
format(self.warc_dir))
if not warcs:
logging.info('No WARCs specified')
return
full_paths = [] full_paths = []
for filename in warcs: for filename in warcs:
@ -99,9 +105,6 @@ directory structure expected by pywb
self._index_merge_warcs(filtered_warcs) self._index_merge_warcs(filtered_warcs)
def _index_merge_warcs(self, new_warcs): def _index_merge_warcs(self, new_warcs):
if not new_warcs:
return
cdx_file = os.path.join(self.cdx_dir, 'index.cdx') cdx_file = os.path.join(self.cdx_dir, 'index.cdx')
# no existing file, just reindex all # no existing file, just reindex all
@ -128,50 +131,109 @@ directory structure expected by pywb
os.rename(merged_file, cdx_file) os.rename(merged_file, cdx_file)
os.remove(temp_file) os.remove(temp_file)
def set_metadata(self, namevalue_pairs):
    """Merge ``name=value`` pairs into the collection's metadata.yaml.

    Existing entries in ``<coll_dir>/metadata.yaml`` are kept; a pair
    whose name is already present overwrites that entry. The file is
    only written after every pair has been parsed, so a malformed pair
    leaves it untouched. If no pairs are given, nothing is written.

    :param namevalue_pairs: iterable of 'name=value' strings, or None
    :raises ValueError: if a pair does not contain '='
    """
    if not namevalue_pairs:
        # argparse yields None when --set is omitted; nothing to do
        return

    metadata_yaml = os.path.join(self.coll_dir, 'metadata.yaml')
    metadata = None
    if os.path.isfile(metadata_yaml):
        with open(metadata_yaml) as fh:
            metadata = yaml.safe_load(fh)

    # An empty or missing file loads as None
    if not metadata:
        metadata = {}

    msg = 'Metadata params must be in the form "name=value"'
    for pair in namevalue_pairs:
        # Split on the first '=' only, so values may themselves contain '='
        name, sep, value = pair.partition('=')
        if not sep:
            raise ValueError(msg)
        metadata[name] = value

    # Dump straight into a text-mode handle: writing the str returned by
    # yaml.dump() to a 'w+b' handle fails on Python 3. safe_dump emits
    # only standard tags, so the file stays readable by safe_load above.
    with open(metadata_yaml, 'w') as fh:
        yaml.safe_dump(metadata, fh, default_flow_style=False)
#=============================================================================
def main(args=None): def main(args=None):
description = """ description = """
Create manage file based web archive collections Create manage file based web archive collections
""" """
#format(os.path.basename(sys.argv[0]))
epilog = """
Some examples:
* Create new collection 'my_coll'
{0} create my_coll
* Add warc mywarc1.warc.gz to my_coll (The warc will be copied to the collecton directory)
{0} add my_coll mywarc1.warc.gz
""".format(os.path.basename(sys.argv[0]))
logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s', logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s',
level=logging.DEBUG) level=logging.DEBUG)
parser = ArgumentParser(description=description, parser = ArgumentParser(description=description,
epilog=epilog, #epilog=epilog,
formatter_class=RawTextHelpFormatter) formatter_class=RawTextHelpFormatter)
group = parser.add_mutually_exclusive_group() subparsers = parser.add_subparsers(dest='type')
group.add_argument('--init', action='store_true')
group.add_argument('--addwarc', action='store_true')
group.add_argument('--reindex', action='store_true')
group.add_argument('--index-warcs', action='store_true')
parser.add_argument('name') # Init Coll
parser.add_argument('files', nargs='*') def do_init(r):
m = CollectionsManager(r.coll_name, must_exist=False)
m.add_collection()
init_help = 'Init new collection, create all collection directories'
init = subparsers.add_parser('init', help=init_help)
init.add_argument('coll_name')
init.set_defaults(func=do_init)
# List Colls
def do_list(r):
m = CollectionsManager('', must_exist=False)
m.list_colls()
list_help = 'List Collections'
listcmd = subparsers.add_parser('list', help=list_help)
listcmd.set_defaults(func=do_list)
# Add Warcs
def do_add(r):
m = CollectionsManager(r.coll_name)
m.add_warcs(r.files)
addwarc_help = 'Copy ARCS/WARCS to collection directory and reindex'
addwarc = subparsers.add_parser('add', help=addwarc_help)
addwarc.add_argument('coll_name')
addwarc.add_argument('files', nargs='+')
addwarc.set_defaults(func=do_add)
# Reindex All
def do_reindex(r):
m = CollectionsManager(r.coll_name)
m.reindex()
reindex_help = 'Re-Index entire collection'
reindex = subparsers.add_parser('reindex', help=reindex_help)
reindex.add_argument('coll_name')
reindex.set_defaults(func=do_reindex)
# Index warcs
def do_index(r):
m = CollectionsManager(r.coll_name)
m.index_merge(r.files)
indexwarcs_help = 'Index specified ARC/WARC files in the collection'
indexwarcs = subparsers.add_parser('index', help=indexwarcs_help)
indexwarcs.add_argument('coll_name')
indexwarcs.add_argument('files', nargs='+')
indexwarcs.set_defaults(func=do_index)
# Set metadata
def do_metadata(r):
m = CollectionsManager(r.coll_name)
m.set_metadata(r.set)
metadata_help = 'Set Metadata'
metadata = subparsers.add_parser('metadata', help=metadata_help)
metadata.add_argument('coll_name')
metadata.add_argument('--set', nargs='+')
metadata.set_defaults(func=do_metadata)
r = parser.parse_args(args=args) r = parser.parse_args(args=args)
r.func(r)
m = CollectionsManager(r.name)
if r.init:
m.add_collection()
elif r.addwarc:
m.add_warcs(r.files)
elif r.index_warcs:
m.index_merge(r.files)
elif r.reindex:
m.reindex()
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -1,22 +1,17 @@
<h2>pywb Sample Home Page</h2> <h2>pywb Wayback Machine</h2>
The following archive collections are available: This archive contains the following collections:
<ul> <ul>
{% for route in routes %} {% for route in routes %}
{% if route | is_wb_handler %} {% if route | is_wb_handler %}
<li><a href="{{ '/' + route.path }}">{{ '/' + route.path }}</a>: {{ route | string }}</li> <li>
{% endif %} <a href="{{ '/' + route.path }}">{{ '/' + route.path }}</a>
{% endfor %} {% if route.user_metadata.title is defined %}
</ul> ({{ route.user_metadata.title }})
{% endif %}
Other endpoints in this deployment: </li>
{% endif %}
<ul>
{% for route in routes %}
{% if not route | is_wb_handler %}
<li><b>{{ '/' + route.path }}</b> - {{ route | string }}</li>
{% endif %}
{% endfor %} {% endfor %}
</ul> </ul>

View File

@ -1,6 +1,17 @@
<h2>pywb Search Page</h2> <h2>{{ wbrequest.user_metadata.title if wbrequest.user_metadata.title else wbrequest.coll }} Search Page</h2>
Search Archived Content:
<div>
<table style="text-align: left">
{% for key, val in wbrequest.user_metadata.iteritems() %}
<tr><th>{{ key }}:</th><td>{{ val }}</td>
{% endfor %}
</table>
</div>
<p>
Search this collection by url:
<form onsubmit="url = document.getElementById('search').value; if (url != '') { document.location.href = '{{ wbrequest.wb_prefix }}' + '*/' + url; } return false;"> <form onsubmit="url = document.getElementById('search').value; if (url != '') { document.location.href = '{{ wbrequest.wb_prefix }}' + '*/' + url; } return false;">
<input id="search" name="search" placeholder="Enter url to search"/> <input id="search" name="search" placeholder="Enter url to search"/>
<button type="submit">Search</button> <button type="submit">Search</button>
</form> </form>
</p>

View File

@ -143,13 +143,13 @@ class DirectoryCollsLoader(object):
if not os.path.isdir(full): if not os.path.isdir(full):
continue continue
coll = self.load_dir(full, name) coll_config = self.load_coll_dir(full, name)
if coll: if coll_config:
colls[name] = coll colls[name] = coll_config
return colls return colls
def _add_if_exists(self, coll, root_dir, dir_key, required=False): def _add_dir_if_exists(self, coll, root_dir, dir_key, required=False):
if dir_key in coll: if dir_key in coll:
# already set # already set
return False return False
@ -168,18 +168,26 @@ class DirectoryCollsLoader(object):
else: else:
return False return False
def load_dir(self, root_dir, name): def load_yaml_file(self, root_dir, filename):
config_file = os.path.join(root_dir, 'config.yaml') filename = os.path.join(root_dir, filename)
if os.path.isfile(config_file): if os.path.isfile(filename):
coll = load_yaml_config(config_file) return load_yaml_config(filename)
else: else:
coll = {} return {}
self._add_if_exists(coll, root_dir, 'index_paths', True) def load_coll_dir(self, root_dir, name):
self._add_if_exists(coll, root_dir, 'archive_paths', True) # Load config.yaml
coll_config = self.load_yaml_file(root_dir, 'config.yaml')
if self._add_if_exists(coll, root_dir, 'static_path', False): # Load metadata.yaml
self.static_routes['static/' + name] = coll['static_path'] metadata = self.load_yaml_file(root_dir, 'metadata.yaml')
coll_config['metadata'] = metadata
self._add_dir_if_exists(coll_config, root_dir, 'index_paths', True)
self._add_dir_if_exists(coll_config, root_dir, 'archive_paths', True)
if self._add_dir_if_exists(coll_config, root_dir, 'static_path', False):
self.static_routes['static/' + name] = coll_config['static_path']
# Add templates # Add templates
templates_dir = self.config.get('paths').get('templates_dir') templates_dir = self.config.get('paths').get('templates_dir')
@ -187,15 +195,15 @@ class DirectoryCollsLoader(object):
template_dir = os.path.join(root_dir, templates_dir) template_dir = os.path.join(root_dir, templates_dir)
if template_dir: if template_dir:
for tname, tfile in self.config.get('paths')['template_files'].iteritems(): for tname, tfile in self.config.get('paths')['template_files'].iteritems():
if tname in coll: if tname in coll_config:
# Already set # Already set
continue continue
full = os.path.join(template_dir, tfile) full = os.path.join(template_dir, tfile)
if os.path.isfile(full): if os.path.isfile(full):
coll[tname] = full coll_config[tname] = full
return coll return coll_config
#================================================================= #=================================================================

View File

@ -5,6 +5,7 @@ from pywb.framework.memento import make_timemap, LINK_FORMAT
import urlparse import urlparse
import urllib import urllib
import logging import logging
import json
from os import path from os import path
from itertools import imap from itertools import imap
@ -59,6 +60,11 @@ def is_wb_handler(obj):
return obj.handler.__class__.__name__ == "WBHandler" return obj.handler.__class__.__name__ == "WBHandler"
@template_filter()
def jsonify(obj):
    """Template filter: return ``obj`` serialized as a JSON string."""
    serialized = json.dumps(obj)
    return serialized
#================================================================= #=================================================================
class J2TemplateView(object): class J2TemplateView(object):
env_globals = {'static_path': 'static/default', env_globals = {'static_path': 'static/default',

View File

@ -42,7 +42,7 @@ setup(
long_description=long_description, long_description=long_description,
license='GPL', license='GPL',
packages=find_packages(), packages=find_packages(),
#include_package_data=True, zip_safe=True,
provides=[ provides=[
'pywb', 'pywb',
'pywb.utils', 'pywb.utils',
@ -92,7 +92,6 @@ setup(
proxy-cert-auth = pywb.framework.certauth:main proxy-cert-auth = pywb.framework.certauth:main
wb-manager = pywb.manager.manager:main wb-manager = pywb.manager.manager:main
""", """,
zip_safe=True,
classifiers=[ classifiers=[
'Development Status :: 4 - Beta', 'Development Status :: 4 - Beta',
'Environment :: Web Environment', 'Environment :: Web Environment',

View File

@ -1,9 +1,12 @@
import os import os
import tempfile import tempfile
import shutil import shutil
import sys
import webtest import webtest
from io import BytesIO
from pywb.webapp.pywb_init import create_wb_router from pywb.webapp.pywb_init import create_wb_router
from pywb.manager.manager import main from pywb.manager.manager import main
@ -53,7 +56,7 @@ class TestManagedColls(object):
def test_create_first_coll(self): def test_create_first_coll(self):
""" Test first collection creation, with all required dirs """ Test first collection creation, with all required dirs
""" """
main(['--init', 'test']) main(['init', 'test'])
colls = os.path.join(self.root_dir, 'collections') colls = os.path.join(self.root_dir, 'collections')
assert os.path.isdir(colls) assert os.path.isdir(colls)
@ -68,7 +71,7 @@ class TestManagedColls(object):
""" """
warc1 = os.path.join(get_test_dir(), 'warcs', 'example.warc.gz') warc1 = os.path.join(get_test_dir(), 'warcs', 'example.warc.gz')
main(['--addwarc', 'test', warc1]) main(['add', 'test', warc1])
self._create_app() self._create_app()
resp = self.testapp.get('/test/20140103030321/http://example.com?example=1') resp = self.testapp.get('/test/20140103030321/http://example.com?example=1')
@ -79,9 +82,9 @@ class TestManagedColls(object):
""" """
warc1 = os.path.join(get_test_dir(), 'warcs', 'example.warc.gz') warc1 = os.path.join(get_test_dir(), 'warcs', 'example.warc.gz')
main(['--init', 'foo']) main(['init', 'foo'])
main(['--addwarc', 'foo', warc1]) main(['add', 'foo', warc1])
self._create_app() self._create_app()
resp = self.testapp.get('/foo/20140103030321/http://example.com?example=1') resp = self.testapp.get('/foo/20140103030321/http://example.com?example=1')
@ -93,17 +96,14 @@ class TestManagedColls(object):
warc1 = os.path.join(get_test_dir(), 'warcs', 'iana.warc.gz') warc1 = os.path.join(get_test_dir(), 'warcs', 'iana.warc.gz')
warc2 = os.path.join(get_test_dir(), 'warcs', 'example-extra.warc') warc2 = os.path.join(get_test_dir(), 'warcs', 'example-extra.warc')
main(['--addwarc', 'test', warc1, warc2]) main(['add', 'test', warc1, warc2])
# Spurrious file in collections # Spurrious file in collections
with open(os.path.join(self.root_dir, 'collections', 'blah'), 'w+b') as fh: with open(os.path.join(self.root_dir, 'collections', 'blah'), 'w+b') as fh:
fh.write('foo\n') fh.write('foo\n')
with raises(IOError): with raises(IOError):
main(['--addwarc', 'test', 'non-existent-file.warc.gz']) main(['add', 'test', 'non-existent-file.warc.gz'])
# check adding no warc -- no op
main(['--addwarc', 'test'])
# check new cdx # check new cdx
self._create_app() self._create_app()
@ -116,7 +116,7 @@ class TestManagedColls(object):
Ensure CDX is relative to root archive dir, test replay Ensure CDX is relative to root archive dir, test replay
""" """
main(['--init', 'nested']) main(['init', 'nested'])
nested_root = os.path.join(self.root_dir, 'collections', 'nested', 'warcs') nested_root = os.path.join(self.root_dir, 'collections', 'nested', 'warcs')
nested_a = os.path.join(nested_root, 'A') nested_a = os.path.join(nested_root, 'A')
@ -131,7 +131,7 @@ class TestManagedColls(object):
shutil.copy2(warc1, nested_a) shutil.copy2(warc1, nested_a)
shutil.copy2(warc2, nested_b) shutil.copy2(warc2, nested_b)
main(['--index-warcs', main(['index',
'nested', 'nested',
os.path.join(nested_a, 'iana.warc.gz'), os.path.join(nested_a, 'iana.warc.gz'),
os.path.join(nested_b, 'example.warc.gz') os.path.join(nested_b, 'example.warc.gz')
@ -162,7 +162,7 @@ class TestManagedColls(object):
shutil.copy(orig, bak) shutil.copy(orig, bak)
main(['--reindex', 'test']) main(['reindex', 'test'])
with open(orig) as orig_fh: with open(orig) as orig_fh:
merged_cdx = orig_fh.read() merged_cdx = orig_fh.read()
@ -187,6 +187,39 @@ class TestManagedColls(object):
assert resp.content_type == 'application/javascript' assert resp.content_type == 'application/javascript'
assert '/* Some JS File */' in resp.body assert '/* Some JS File */' in resp.body
def test_add_title_metadata_index_page(self):
    """ Set a 'title' metadata entry on collection 'foo' and check
    that it is rendered, in parentheses, on the page served at '/'
    """
    set_title_args = ['metadata', 'foo', '--set', 'title=Collection Title']
    main(set_title_args)

    self._create_app()
    index_resp = self.testapp.get('/')

    assert index_resp.status_int == 200
    assert index_resp.content_type == 'text/html'
    assert '(Collection Title)' in index_resp.body
def test_other_metadata_search_page(self):
    """ Set several metadata entries on collection 'foo' in one call,
    check that a pair without '=' is rejected, and check that the
    entries appear on the page served at '/foo/'
    """
    set_args = ['metadata', 'foo', '--set',
                'desc=Some Description Text',
                'other=custom value']
    main(set_args)

    # A bare name with no '=value' part must be refused
    with raises(ValueError):
        main(['metadata', 'foo', '--set', 'name_only'])

    self._create_app()
    search_resp = self.testapp.get('/foo/')
    assert search_resp.status_int == 200
    assert search_resp.content_type == 'text/html'

    page = search_resp.body
    # NOTE(review): the title is not set in this test; presumably it is
    # left over from an earlier 'metadata' call -- verify test ordering
    assert 'Collection Title' in page

    assert 'desc' in page
    assert 'Some Description Text' in page

    assert 'other' in page
    assert 'custom value' in page
def test_custom_template_search(self): def test_custom_template_search(self):
""" Test manually added custom search template search.html """ Test manually added custom search template search.html
""" """
@ -219,7 +252,6 @@ class TestManagedColls(object):
assert resp.content_type == 'text/html' assert resp.content_type == 'text/html'
assert 'config.yaml overriden search page' in resp.body assert 'config.yaml overriden search page' in resp.body
def test_no_templates(self): def test_no_templates(self):
""" Test removing templates dir, using default template again """ Test removing templates dir, using default template again
""" """
@ -232,28 +264,45 @@ class TestManagedColls(object):
assert resp.content_type == 'text/html' assert resp.content_type == 'text/html'
assert 'pywb custom search page' not in resp.body assert 'pywb custom search page' not in resp.body
def test_list_colls(self):
    """ Test collection listing, printed to stdout
    """
    orig_stdout = sys.stdout
    buff = BytesIO()
    sys.stdout = buff
    try:
        main(['list'])
    finally:
        # Always restore stdout, otherwise a failure in main() would
        # swallow the output of every later test
        sys.stdout = orig_stdout

    output = buff.getvalue().splitlines()
    assert len(output) == 4
    assert 'Collections' in output[0]

    # Directory listing order is not guaranteed, so compare the
    # collection lines order-independently
    listed = sorted(output[1:])
    assert 'foo' in listed[0]
    assert 'nested' in listed[1]
    assert 'test' in listed[2]
def test_err_no_such_coll(self): def test_err_no_such_coll(self):
""" Test error adding warc to non-existant collection """ Test error adding warc to non-existant collection
""" """
warc1 = os.path.join(get_test_dir(), 'warcs', 'example.warc.gz') warc1 = os.path.join(get_test_dir(), 'warcs', 'example.warc.gz')
with raises(IOError): with raises(IOError):
main(['--addwarc', 'bar', warc1]) main(['add', 'bar', warc1])
def test_err_wrong_warcs(self): def test_err_wrong_warcs(self):
warc1 = os.path.join(get_test_dir(), 'warcs', 'example.warc.gz') warc1 = os.path.join(get_test_dir(), 'warcs', 'example.warc.gz')
invalid_warc = os.path.join(self.root_dir, 'collections', 'test', 'warcs', 'invalid.warc.gz') invalid_warc = os.path.join(self.root_dir, 'collections', 'test', 'warcs', 'invalid.warc.gz')
# Empty # Empty warc list, argparse calls exit
main(['--index-warcs', 'test']) with raises(SystemExit):
main(['index', 'test'])
# Wrong paths not in collection # Wrong paths not in collection
with raises(IOError): with raises(IOError):
main(['--index-warcs', 'test', warc1]) main(['index', 'test', warc1])
# Non-existent # Non-existent
with raises(IOError): with raises(IOError):
main(['--index-warcs', 'test', invalid_warc]) main(['index', 'test', invalid_warc])
def test_err_missing_dirs(self): def test_err_missing_dirs(self):
""" Test various errors with missing warcs dir, """ Test various errors with missing warcs dir,
@ -266,7 +315,7 @@ class TestManagedColls(object):
shutil.rmtree(warcs_path) shutil.rmtree(warcs_path)
with raises(IOError): with raises(IOError):
main(['--addwarc', 'foo', 'somewarc']) main(['add', 'foo', 'somewarc'])
# No CDX # No CDX
cdx_path = os.path.join(colls, 'foo', 'cdx') cdx_path = os.path.join(colls, 'foo', 'cdx')