mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
wb-manager acl command: support manipulating sorted access-list .aclj files via command-line (ukwa/ukwa-pywb#7)
- Support as target an auto-collection, where the acl file is added automatically in ./collections/<coll>/acl/access-rules.aclj, or specify an .aclj file explicitly for more custom configs. - Support adding urls and surts: determine if the url is already a surt, otherwise canonicalize it. acl commands include: - acl add <target_file_or_coll> <url_or_surt> <access> -- add (or replace) rule for url/surt with access level <access> - acl remove <target_file_or_coll> <url_or_surt> -- remove url/surt from target - acl list <target_file_or_coll> -- list all rules for target - acl validate <target_file_or_coll> -- ensure sort order is correct, otherwise fix and save - acl match <target_file_or_coll> <url> -- find matching rule, if any, in target for specified url, or print no match/default rule - acl importtxt <target_file_or_coll> <filename> -- bulk import of 'excludes.txt' style rules, one url-per-line, and add to target
This commit is contained in:
parent
a3f81dcc0f
commit
bfa3aa7264
253
pywb/manager/aclmanager.py
Normal file
253
pywb/manager/aclmanager.py
Normal file
@ -0,0 +1,253 @@
|
|||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
|
||||||
|
from argparse import ArgumentParser, RawTextHelpFormatter
|
||||||
|
from collections import OrderedDict
|
||||||
|
|
||||||
|
from pywb.manager.manager import CollectionsManager
|
||||||
|
from pywb.warcserver.index.cdxobject import CDXObject
|
||||||
|
from pywb.utils.canonicalize import canonicalize
|
||||||
|
|
||||||
|
from pywb.warcserver.access_checker import AccessChecker
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
class ACLManager(CollectionsManager):
    """Command-line manager for sorted access-list (.aclj) files.

    The target given on the command line is either the name of an existing
    auto-collection, in which case the rules live in DEFAULT_FILE inside the
    collection's acl directory, or an explicit path to an .aclj file.

    Rules are kept in ``self.rules`` as CDXObject entries in *descending*
    order (see validate(), which re-sorts with ``reverse=True``).

    Note that constructing an instance loads the rules and immediately runs
    the sub-command selected by the parsed arguments.
    """

    # Heuristic used by to_key() to decide whether the input already is a SURT
    SURT_RX = re.compile('([^:.]+[,)])+')

    # Access levels accepted by 'add' and 'importtxt'
    VALID_ACCESS = ('allow', 'block', 'exclude')

    # File name used inside a collection's acl directory
    DEFAULT_FILE = 'access-rules.aclj'

    def __init__(self, r):
        """Load the target acl file and execute the requested operation.

        :param r: parsed argparse namespace; reads ``coll_name`` (collection
            name or file path), ``op`` (sub-command name) and ``acl_func``
            (the handler set up by init_parser()).
        """
        self.rules = []

        # Only pass a real auto-collection name to the base class; anything
        # else is treated as a file path further below.
        coll_name = r.coll_name
        if not self.is_valid_auto_coll(r.coll_name):
            coll_name = ''

        self.target = r.coll_name

        super(ACLManager, self).__init__(coll_name, must_exist=False)

        # if auto collection, use default file in ./collections/<coll>/acl/<DEFAULT_FILE>
        if os.path.isdir(self.curr_coll_dir):
            self.acl_file = os.path.join(self.acl_dir, self.DEFAULT_FILE)

        # else, treat the 'r.coll_name' param as the filename
        else:
            self.acl_file = r.coll_name

        if r.op == 'add':
            # 'add' may create the file, so a missing file is not an error
            self.load_acl(False)

        else:
            if not self.load_acl(True):
                return

        # if 'validate', the command itself is validation
        if r.op != 'validate':
            self.validate()

        r.acl_func(self, r)

    def is_valid_auto_coll(self, coll_name):
        """Return True if 'coll_name' is a well-formed name of an existing
        collection directory under COLLS_DIR, False otherwise.

        COLL_RX and COLLS_DIR are inherited (not defined in this class).
        """
        if not self.COLL_RX.match(coll_name):
            return False

        if not os.path.isdir(os.path.join(self.COLLS_DIR, coll_name)):
            return False

        return True

    def load_acl(self, must_exist=True):
        """Read all rules from self.acl_file into self.rules.

        :param must_exist: if True, report an unreadable/missing file;
            if False, such a failure is silent.
        :return: True if the file was read, a false value otherwise.
        """
        try:
            with open(self.acl_file, 'rb') as fh:
                for line in fh:
                    # Lines read from a file keep their newline, so a plain
                    # truthiness test never skips blank lines; strip first so
                    # empty lines do not become empty rules.
                    if line.strip():
                        self.rules.append(CDXObject(line))

            return True

        except IOError as io:
            if must_exist:
                print('Error Occured: ' + str(io))

            return False

        except Exception as e:
            print('Error Occured: ' + str(e))
            return False

    def save_acl(self, r=None):
        """Write self.rules to self.acl_file, creating its directory if needed.

        :param r: unused; accepted so the method can serve as a command handler.
        """
        try:
            os.makedirs(os.path.dirname(self.acl_file))
        except OSError:
            # os.makedirs raises OSError (not IOError on Python 2) when the
            # directory already exists or the path has no directory part.
            # Any real problem is reported by the open() below.
            pass

        try:
            with open(self.acl_file, 'wb') as fh:
                for acl in self.rules:
                    fh.write(acl.to_cdxj().encode('utf-8'))

        except Exception as e:
            print('Error Saving ACL Rules: ' + str(e))

    def to_key(self, url_or_surt):
        """ If 'url_or_surt' already a SURT, use as is,
        otherwise return its canonicalized form.
        """
        if self.SURT_RX.search(url_or_surt):
            return url_or_surt
        else:
            return canonicalize(url_or_surt)

    def validate_access(self, access):
        """Return True if 'access' is one of VALID_ACCESS; otherwise print
        the accepted values and return False.
        """
        if access not in self.VALID_ACCESS:
            print('Valid access values are: ' + ', '.join(self.VALID_ACCESS))
            return False

        return True

    def add_rule(self, r):
        """Handler for 'acl add': add or replace the rule for r.url with r.access."""
        return self._add_rule(r.url, r.access)

    def _add_rule(self, url, access):
        """Insert a rule for 'url' at its sorted position, or replace an
        existing rule with the same urlkey and timestamp, then save.

        Does nothing (beyond printing the valid values) if 'access' is invalid.
        """
        if not self.validate_access(access):
            return

        acl = CDXObject()
        acl['urlkey'] = self.to_key(url)
        acl['timestamp'] = '-'
        acl['access'] = access
        acl['url'] = url

        # Default: append at the end if no earlier position is found
        index = len(self.rules)
        replace = False

        for i, rule in enumerate(self.rules):
            if acl['urlkey'] == rule['urlkey'] and acl['timestamp'] == rule['timestamp']:
                replace = True
                index = i
                break

            # Rules are in descending order: insert before the first
            # rule that sorts lower than the new one.
            if acl > rule:
                index = i
                break

        if replace:
            print('Existing Rule Found, Replacing:')
            self.print_rule(self.rules[index])
            print('with:')
            self.print_rule(acl)
            self.rules[index] = acl
        else:
            print('Added new Rule:')
            self.print_rule(acl)
            self.rules.insert(index, acl)

        self.save_acl()

    def validate_save(self, r=None):
        """Handler for 'acl validate': check the sort order, and if the rules
        had to be re-sorted, write them back to the file.
        """
        if self.validate(True):
            self.save_acl()

    def validate(self, log=False):
        """Ensure self.rules is in descending order, re-sorting if it is not.

        :param log: if True, print whether the rules were in order.
        :return: True if the rules were out of order and have been re-sorted,
            False if they were already in order.
        """
        last_rule = None
        out_of_order = False
        for rule in self.rules:
            # A rule greater than its predecessor breaks descending order
            if last_rule and rule > last_rule:
                out_of_order = True
                break

            last_rule = rule

        if out_of_order:
            if log:
                print('Rules out of order, resorting')
            self.rules.sort(reverse=True)
            return True
        else:
            if log:
                print('Rules in order')

            return False

    def remove_rule(self, r):
        """Handler for 'acl remove': delete the first rule whose urlkey
        matches r.url (timestamp is not compared) and save.
        """
        urlkey = self.to_key(r.url)
        for i, rule in enumerate(self.rules):
            if urlkey == rule['urlkey']:
                acl = self.rules.pop(i)
                print('Removed Rule:')
                self.print_rule(acl)
                self.save_acl()
                return

        print('Rule to remove not found!')

    def list_rules(self, r):
        """Handler for 'acl list': print every loaded rule to stdout."""
        print('Rules for {0} from {1}:'.format(self.target, self.acl_file))
        print('')
        for rule in self.rules:
            sys.stdout.write(rule.to_cdxj())
        print('')

    def find_match(self, r):
        """Handler for 'acl match': print the rule that applies to r.url,
        or a notice that only the default rule applies.
        """
        # NOTE(review): the 'match' sub-command also parses 'default_access',
        # but it is not used here; a fixed '<default>' placeholder is passed.
        access_checker = AccessChecker(self.acl_file, '<default>')
        rule = access_checker.find_access_rule(r.url)

        print('Matched rule:')
        print('')
        # An empty urlkey is how the default rule is recognized here
        if rule['urlkey'] == '':
            print('    <No Match, Using Default Rule>')
            print('')
        else:
            self.print_rule(rule)

    def add_excludes(self, r):
        """
        Import old-style excludes, in url-per-line format

        Reads r.filename and adds (or replaces) a rule with access level
        r.access for every non-blank line.
        """
        if not self.validate_access(r.access):
            return

        try:
            with open(r.filename, 'rb') as fh:
                count = 0
                for line in fh:
                    url = line.decode('utf-8').strip()
                    # Skip blank lines instead of importing an empty url
                    if not url:
                        continue

                    self._add_rule(url, r.access)
                    count += 1

            print('Added or replaced {0} rules from '.format(count) + r.filename)

        except Exception as e:
            print('Error Importing: ' + str(e))

    def print_rule(self, rule):
        """Print a single rule, indented, in its cdxj form."""
        print('    ' + rule.to_cdxj())

    @classmethod
    def init_parser(cls, parser):
        """Register the acl sub-commands on 'parser'.

        Each sub-command stores its name in 'op' and its handler method in
        'acl_func'; both are read by __init__().
        """
        subparsers = parser.add_subparsers(dest='op')
        subparsers.required = True

        # **kwargs instead of a keyword-only 'func=None' parameter, which is
        # a syntax error on Python 2.
        def command(name, *args, **kwargs):
            op = subparsers.add_parser(name)
            for arg in args:
                if arg == 'default_access':
                    # optional positional, defaults to 'allow'
                    op.add_argument(arg, nargs='?', default='allow')
                else:
                    op.add_argument(arg)

            op.set_defaults(acl_func=kwargs.get('func'))

        command('add', 'coll_name', 'url', 'access', func=cls.add_rule)
        command('remove', 'coll_name', 'url', func=cls.remove_rule)
        command('list', 'coll_name', func=cls.list_rules)
        command('validate', 'coll_name', func=cls.validate_save)
        command('match', 'coll_name', 'url', 'default_access', func=cls.find_match)
        command('importtxt', 'coll_name', 'filename', 'access', func=cls.add_excludes)
|
||||||
|
|
@ -19,6 +19,7 @@ from pywb import DEFAULT_CONFIG
|
|||||||
|
|
||||||
from six.moves import input
|
from six.moves import input
|
||||||
|
|
||||||
|
|
||||||
#=============================================================================
|
#=============================================================================
|
||||||
# to allow testing by mocking get_input
|
# to allow testing by mocking get_input
|
||||||
|
|
||||||
@ -66,6 +67,8 @@ directory structure expected by pywb
|
|||||||
self.static_dir = self._get_dir('static_path')
|
self.static_dir = self._get_dir('static_path')
|
||||||
self.templates_dir = self._get_dir('templates_dir')
|
self.templates_dir = self._get_dir('templates_dir')
|
||||||
|
|
||||||
|
self.acl_dir = self._get_dir('acl_paths')
|
||||||
|
|
||||||
def list_colls(self):
|
def list_colls(self):
|
||||||
print('Collections:')
|
print('Collections:')
|
||||||
if not os.path.isdir(self.colls_dir):
|
if not os.path.isdir(self.colls_dir):
|
||||||
@ -427,6 +430,16 @@ Create manage file based web archive collections
|
|||||||
migrate.add_argument('-f', '--force', action='store_true')
|
migrate.add_argument('-f', '--force', action='store_true')
|
||||||
migrate.set_defaults(func=do_migrate)
|
migrate.set_defaults(func=do_migrate)
|
||||||
|
|
||||||
|
# ACL
|
||||||
|
from pywb.manager.aclmanager import ACLManager
|
||||||
|
def do_acl(r):
    # Entry point for the 'acl' sub-command: constructing ACLManager with the
    # parsed arguments is what carries out the requested operation, so the
    # instance itself does not need to be kept.
    ACLManager(r)
|
||||||
|
|
||||||
|
acl_help = 'Configure Access Control Lists (ACL) for a collection'
|
||||||
|
acl = subparsers.add_parser('acl', help=acl_help)
|
||||||
|
ACLManager.init_parser(acl)
|
||||||
|
acl.set_defaults(func=do_acl)
|
||||||
|
|
||||||
# Parse
|
# Parse
|
||||||
r = parser.parse_args(args=args)
|
r = parser.parse_args(args=args)
|
||||||
r.func(r)
|
r.func(r)
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
from pywb.warcserver.index.indexsource import FileIndexSource
|
from pywb.warcserver.index.indexsource import FileIndexSource
|
||||||
from pywb.warcserver.index.aggregator import DirectoryIndexSource, CacheDirectoryMixin
|
from pywb.warcserver.index.aggregator import DirectoryIndexSource, CacheDirectoryMixin
|
||||||
from pywb.warcserver.index.aggregator import SimpleAggregator
|
from pywb.warcserver.index.aggregator import SimpleAggregator
|
||||||
|
from pywb.warcserver.index.cdxobject import CDXObject
|
||||||
|
|
||||||
from pywb.utils.binsearch import search
|
from pywb.utils.binsearch import search
|
||||||
from pywb.utils.merge import merge
|
from pywb.utils.merge import merge
|
||||||
@ -49,7 +50,11 @@ class AccessChecker(object):
|
|||||||
else:
|
else:
|
||||||
self.aggregator = access_source
|
self.aggregator = access_source
|
||||||
|
|
||||||
self.default_rule = {'urlkey': '', 'access': default_access}
|
self.default_rule = CDXObject()
|
||||||
|
self.default_rule['urlkey'] = ''
|
||||||
|
self.default_rule['timestamp'] = '-'
|
||||||
|
self.default_rule['access'] = default_access
|
||||||
|
self.default_rule['default'] = 'true'
|
||||||
|
|
||||||
def create_access_aggregator(self, source_files):
|
def create_access_aggregator(self, source_files):
|
||||||
sources = {}
|
sources = {}
|
||||||
|
@ -121,7 +121,7 @@ class CDXObject(OrderedDict):
|
|||||||
if fields[-1].startswith(b'{'):
|
if fields[-1].startswith(b'{'):
|
||||||
self[URLKEY] = to_native_str(fields[0], 'utf-8')
|
self[URLKEY] = to_native_str(fields[0], 'utf-8')
|
||||||
self[TIMESTAMP] = to_native_str(fields[1], 'utf-8')
|
self[TIMESTAMP] = to_native_str(fields[1], 'utf-8')
|
||||||
json_fields = json_decode(to_native_str(fields[-1], 'utf-8'))
|
json_fields = self.json_decode(to_native_str(fields[-1], 'utf-8'))
|
||||||
for n, v in six.iteritems(json_fields):
|
for n, v in six.iteritems(json_fields):
|
||||||
n = to_native_str(n, 'utf-8')
|
n = to_native_str(n, 'utf-8')
|
||||||
n = self.CDX_ALT_FIELDS.get(n, n)
|
n = self.CDX_ALT_FIELDS.get(n, n)
|
||||||
@ -246,6 +246,10 @@ class CDXObject(OrderedDict):
|
|||||||
res = (self._cached_json <= other._cached_json)
|
res = (self._cached_json <= other._cached_json)
|
||||||
return res
|
return res
|
||||||
|
|
||||||
|
@classmethod
def json_decode(cls, string):
    """Decode a JSON string with the module-level json_decode, collecting
    object members into an OrderedDict so their order is preserved.
    """
    decoded = json_decode(string, object_pairs_hook=OrderedDict)
    return decoded
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class IDXObject(OrderedDict):
|
class IDXObject(OrderedDict):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user