mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
wb-manager acl command: support manipulating sorted access-list .aclj files via command-line (ukwa/ukwa-pywb#7)
- support as target an auto-collection, where the acl file is added automatically in ./collections/<coll>/acl/access-rules.aclj, or specifying an .aclj file explicitly for more custom configs - support adding urls and surts: determine if the url is already a surt, otherwise canonicalize it. acl commands include: - acl add <target_file_or_coll> <url_or_surt> <access> -- add (or replace) rule for url/surt with access level <access> - acl remove <target_file_or_coll> <url_or_surt> -- remove url/surt from target - acl list <target_file_or_coll> -- list all rules for target - acl validate <target_file_or_coll> -- ensure sort order is correct, otherwise fix and save - acl match <target_file_or_coll> <url> -- find matching rule, if any, in target for specified url, or print no match/default rule - acl importtxt <target_file_or_coll> <filename> -- bulk import of 'excludes.txt' style rules, one url per line, and add to target
This commit is contained in:
parent
a3f81dcc0f
commit
bfa3aa7264
253
pywb/manager/aclmanager.py
Normal file
253
pywb/manager/aclmanager.py
Normal file
@ -0,0 +1,253 @@
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import re
|
||||
|
||||
from argparse import ArgumentParser, RawTextHelpFormatter
|
||||
from collections import OrderedDict
|
||||
|
||||
from pywb.manager.manager import CollectionsManager
|
||||
from pywb.warcserver.index.cdxobject import CDXObject
|
||||
from pywb.utils.canonicalize import canonicalize
|
||||
|
||||
from pywb.warcserver.access_checker import AccessChecker
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class ACLManager(CollectionsManager):
    """Command-line manager for access control list (.aclj) rule files.

    Rules are kept in ``self.rules`` as ``CDXObject`` entries and are expected
    to be in descending (reverse-sorted) order, which is what ``validate()``
    checks for and restores.

    Constructing an instance immediately runs the requested sub-command
    (``r.acl_func``) after loading the target file.
    """

    # Pattern used by to_key() to decide whether the input is already a SURT
    SURT_RX = re.compile('([^:.]+[,)])+')

    # Access levels accepted by validate_access()
    VALID_ACCESS = ('allow', 'block', 'exclude')

    # File name used inside a collection's acl directory
    DEFAULT_FILE = 'access-rules.aclj'

    def __init__(self, r):
        """Resolve the target ACL file, load it and dispatch the command.

        :param r: parsed command-line arguments; ``coll_name``, ``op`` and
            ``acl_func`` are read here, other attributes are read by the
            individual command methods.
        """
        self.rules = []

        # Only pass a real collection name to the base class; anything else
        # is treated as a path to an .aclj file further down.
        coll_name = r.coll_name
        if not self.is_valid_auto_coll(r.coll_name):
            coll_name = ''

        self.target = r.coll_name

        super(ACLManager, self).__init__(coll_name, must_exist=False)

        # if auto collection, use default file in
        # ./collections/<coll>/acl/<DEFAULT_FILE>
        if os.path.isdir(self.curr_coll_dir):
            self.acl_file = os.path.join(self.acl_dir, self.DEFAULT_FILE)

        # else, treat the 'r.coll_name' param as the filename
        else:
            self.acl_file = r.coll_name

        if r.op == 'add':
            # 'add' may create a brand new file, so a missing file is fine
            self.load_acl(False)
        else:
            if not self.load_acl(True):
                return

            # if 'validate', the command itself is validation
            if r.op != 'validate':
                self.validate()

        r.acl_func(self, r)

    def is_valid_auto_coll(self, coll_name):
        """Return True if ``coll_name`` names an existing collection directory.

        The name must match ``self.COLL_RX`` and a directory of that name must
        exist under ``self.COLLS_DIR`` (both presumably provided by the base
        class -- they are not defined in this class).
        """
        if not self.COLL_RX.match(coll_name):
            return False

        return os.path.isdir(os.path.join(self.COLLS_DIR, coll_name))

    def load_acl(self, must_exist=True):
        """Read ``self.acl_file`` and append one rule per line to ``self.rules``.

        :param must_exist: when True, an ``IOError`` while opening/reading the
            file is reported on stdout; when False it is silently ignored.
        :return: True if the file was read completely, False on any error.
        """
        try:
            with open(self.acl_file, 'rb') as fh:
                for line in fh:
                    # Lines read from a file always include their newline, so
                    # a plain truthiness test never skips anything; strip
                    # first so blank lines are really ignored.
                    if line.strip():
                        self.rules.append(CDXObject(line))

            return True

        except IOError as io:
            if must_exist:
                print('Error Occured: ' + str(io))

            return False

        except Exception as e:
            print('Error Occured: ' + str(e))
            return False

    def save_acl(self, r=None):
        """Write all rules in ``self.rules`` to ``self.acl_file``.

        The parent directory is created first if needed. Errors while writing
        are printed rather than raised.

        :param r: unused; accepted so the method can be used as a command
            callback.
        """
        acl_dir = os.path.dirname(self.acl_file)

        # A bare file name has no directory component; nothing to create.
        if acl_dir:
            try:
                os.makedirs(acl_dir)
            except OSError:
                # os.makedirs raises OSError (not IOError on Python 2) when
                # the directory already exists. If it could not be created
                # for another reason, open() below reports the failure.
                pass

        try:
            with open(self.acl_file, 'wb') as fh:
                for acl in self.rules:
                    fh.write(acl.to_cdxj().encode('utf-8'))

        except Exception as e:
            print('Error Saving ACL Rules: ' + str(e))

    def to_key(self, url_or_surt):
        """Return the rule key for ``url_or_surt``.

        If 'url_or_surt' already looks like a SURT (matches ``SURT_RX``), it
        is used as is; otherwise it is passed through ``canonicalize``.
        """
        if self.SURT_RX.search(url_or_surt):
            return url_or_surt

        return canonicalize(url_or_surt)

    def validate_access(self, access):
        """Return True if ``access`` is one of ``VALID_ACCESS``.

        Prints the list of valid values and returns False otherwise.
        """
        if access in self.VALID_ACCESS:
            return True

        print('Valid access values are: ' + ', '.join(self.VALID_ACCESS))
        return False

    def add_rule(self, r):
        """Command callback for 'add': add/replace a rule from ``r.url`` and ``r.access``."""
        return self._add_rule(r.url, r.access)

    def _add_rule(self, url, access):
        """Insert a rule for ``url`` at its sorted position, or replace an
        existing rule with the same urlkey and timestamp, then save.

        Nothing is changed if ``access`` is not a valid access value.
        """
        if not self.validate_access(access):
            return

        acl = CDXObject()
        acl['urlkey'] = self.to_key(url)
        acl['timestamp'] = '-'
        acl['access'] = access
        acl['url'] = url

        replace = False
        # Default: append at the end if no earlier position is found.
        pos = len(self.rules)

        for idx, rule in enumerate(self.rules):
            same_key = acl['urlkey'] == rule['urlkey']
            if same_key and acl['timestamp'] == rule['timestamp']:
                replace = True
                pos = idx
                break

            # Rules are kept in descending order: insert before the first
            # rule that the new one is greater than.
            if acl > rule:
                pos = idx
                break

        if replace:
            print('Existing Rule Found, Replacing:')
            self.print_rule(self.rules[pos])
            print('with:')
            self.print_rule(acl)
            self.rules[pos] = acl
        else:
            print('Added new Rule:')
            self.print_rule(acl)
            self.rules.insert(pos, acl)

        self.save_acl()

    def validate_save(self, r=None):
        """Command callback for 'validate': re-sort if needed and save when
        the order had to be fixed.

        :param r: unused.
        """
        if self.validate(True):
            self.save_acl()

    def validate(self, log=False):
        """Check that the rules are in descending order, re-sorting if not.

        :param log: when True, print whether the rules were in order.
        :return: True if the rules were out of order and have been re-sorted,
            False if they were already in order.
        """
        last_rule = None
        out_of_order = False

        for rule in self.rules:
            # A rule greater than its predecessor breaks descending order.
            if last_rule and rule > last_rule:
                out_of_order = True
                break

            last_rule = rule

        if out_of_order:
            if log:
                print('Rules out of order, resorting')

            self.rules.sort(reverse=True)
            return True

        if log:
            print('Rules in order')

        return False

    def remove_rule(self, r):
        """Command callback for 'remove': delete the first rule whose urlkey
        matches ``r.url`` and save, or report that none was found.
        """
        urlkey = self.to_key(r.url)

        for idx, rule in enumerate(self.rules):
            # Only the urlkey is compared; the timestamp is not considered.
            if urlkey == rule['urlkey']:
                acl = self.rules.pop(idx)
                print('Removed Rule:')
                self.print_rule(acl)
                self.save_acl()
                return

        print('Rule to remove not found!')

    def list_rules(self, r):
        """Command callback for 'list': print every loaded rule.

        :param r: unused.
        """
        print('Rules for {0} from {1}:'.format(self.target, self.acl_file))
        print('')

        for rule in self.rules:
            sys.stdout.write(rule.to_cdxj())

        print('')

    def find_match(self, r):
        """Command callback for 'match': print the rule that applies to
        ``r.url`` according to ``AccessChecker``, or a note that only the
        default rule applies.
        """
        access_checker = AccessChecker(self.acl_file, '<default>')
        rule = access_checker.find_access_rule(r.url)

        print('Matched rule:')
        print('')

        # The default rule is identified by its empty urlkey.
        if rule['urlkey'] == '':
            print(' <No Match, Using Default Rule>')
            print('')
        else:
            self.print_rule(rule)

    def add_excludes(self, r):
        """
        Import old-style excludes, in url-per-line format

        Each non-blank line of ``r.filename`` is added (or replaces an
        existing rule) with access level ``r.access``. Any error aborts the
        import and is printed.
        """
        if not self.validate_access(r.access):
            return

        try:
            with open(r.filename, 'rb') as fh:
                count = 0
                for url in fh:
                    url = url.decode('utf-8').strip()

                    # Blank lines (e.g. a trailing newline at the end of the
                    # file) are not urls: skip them instead of creating a
                    # rule for an empty string or counting them.
                    if not url:
                        continue

                    self._add_rule(url, r.access)
                    count += 1

            print('Added or replaced {0} rules from '.format(count) + r.filename)

        except Exception as e:
            print('Error Importing: ' + str(e))

    def print_rule(self, rule):
        """Print a single rule in cdxj form, prefixed with a space."""
        print(' ' + rule.to_cdxj())

    @classmethod
    def init_parser(cls, parser):
        """Register the acl sub-commands and their positional arguments on
        ``parser``.

        Each sub-command stores the method to run in ``acl_func`` and its own
        name in ``op``.
        """
        subparsers = parser.add_subparsers(dest='op')
        subparsers.required = True

        # '**kwargs' instead of a keyword-only 'func=None' parameter:
        # keyword-only arguments after '*args' are a syntax error on Python 2.
        def command(name, *args, **kwargs):
            op = subparsers.add_parser(name)
            for arg in args:
                if arg == 'default_access':
                    # optional trailing argument
                    op.add_argument(arg, nargs='?', default='allow')
                else:
                    op.add_argument(arg)

            op.set_defaults(acl_func=kwargs.get('func'))

        command('add', 'coll_name', 'url', 'access', func=cls.add_rule)
        command('remove', 'coll_name', 'url', func=cls.remove_rule)
        command('list', 'coll_name', func=cls.list_rules)
        command('validate', 'coll_name', func=cls.validate_save)
        command('match', 'coll_name', 'url', 'default_access', func=cls.find_match)
        command('importtxt', 'coll_name', 'filename', 'access', func=cls.add_excludes)
|
||||
|
@ -19,6 +19,7 @@ from pywb import DEFAULT_CONFIG
|
||||
|
||||
from six.moves import input
|
||||
|
||||
|
||||
#=============================================================================
|
||||
# to allow testing by mocking get_input
|
||||
|
||||
@ -66,6 +67,8 @@ directory structure expected by pywb
|
||||
self.static_dir = self._get_dir('static_path')
|
||||
self.templates_dir = self._get_dir('templates_dir')
|
||||
|
||||
self.acl_dir = self._get_dir('acl_paths')
|
||||
|
||||
def list_colls(self):
|
||||
print('Collections:')
|
||||
if not os.path.isdir(self.colls_dir):
|
||||
@ -427,6 +430,16 @@ Create manage file based web archive collections
|
||||
migrate.add_argument('-f', '--force', action='store_true')
|
||||
migrate.set_defaults(func=do_migrate)
|
||||
|
||||
# ACL
|
||||
from pywb.manager.aclmanager import ACLManager
|
||||
def do_acl(r):
    """Run the 'acl' sub-command for the parsed arguments ``r``.

    Constructing the ACLManager is what executes the command, so the
    instance itself does not need to be kept.
    """
    ACLManager(r)
|
||||
|
||||
acl_help = 'Configure Access Control Lists (ACL) for a collection'
|
||||
acl = subparsers.add_parser('acl', help=acl_help)
|
||||
ACLManager.init_parser(acl)
|
||||
acl.set_defaults(func=do_acl)
|
||||
|
||||
# Parse
|
||||
r = parser.parse_args(args=args)
|
||||
r.func(r)
|
||||
|
@ -1,6 +1,7 @@
|
||||
from pywb.warcserver.index.indexsource import FileIndexSource
|
||||
from pywb.warcserver.index.aggregator import DirectoryIndexSource, CacheDirectoryMixin
|
||||
from pywb.warcserver.index.aggregator import SimpleAggregator
|
||||
from pywb.warcserver.index.cdxobject import CDXObject
|
||||
|
||||
from pywb.utils.binsearch import search
|
||||
from pywb.utils.merge import merge
|
||||
@ -49,7 +50,11 @@ class AccessChecker(object):
|
||||
else:
|
||||
self.aggregator = access_source
|
||||
|
||||
self.default_rule = {'urlkey': '', 'access': default_access}
|
||||
self.default_rule = CDXObject()
|
||||
self.default_rule['urlkey'] = ''
|
||||
self.default_rule['timestamp'] = '-'
|
||||
self.default_rule['access'] = default_access
|
||||
self.default_rule['default'] = 'true'
|
||||
|
||||
def create_access_aggregator(self, source_files):
|
||||
sources = {}
|
||||
|
@ -121,7 +121,7 @@ class CDXObject(OrderedDict):
|
||||
if fields[-1].startswith(b'{'):
|
||||
self[URLKEY] = to_native_str(fields[0], 'utf-8')
|
||||
self[TIMESTAMP] = to_native_str(fields[1], 'utf-8')
|
||||
json_fields = json_decode(to_native_str(fields[-1], 'utf-8'))
|
||||
json_fields = self.json_decode(to_native_str(fields[-1], 'utf-8'))
|
||||
for n, v in six.iteritems(json_fields):
|
||||
n = to_native_str(n, 'utf-8')
|
||||
n = self.CDX_ALT_FIELDS.get(n, n)
|
||||
@ -246,6 +246,10 @@ class CDXObject(OrderedDict):
|
||||
res = (self._cached_json <= other._cached_json)
|
||||
return res
|
||||
|
||||
@classmethod
def json_decode(cls, string):
    """Decode ``string`` as JSON, building objects as ``OrderedDict``.

    Delegates to the module-level ``json_decode`` function (defined outside
    this view), passing ``object_pairs_hook=OrderedDict``.
    """
    decoded = json_decode(string, object_pairs_hook=OrderedDict)
    return decoded
|
||||
|
||||
|
||||
#=================================================================
|
||||
class IDXObject(OrderedDict):
|
||||
|
Loading…
x
Reference in New Issue
Block a user