1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

wb-manager acl command: support manipulating sorted access-list .aclj files via command-line (ukwa/ukwa-pywb#7)

- support an auto-collection as the target, in which case the acl file is added automatically at ./collections/<coll>/acl/access-rules.aclj,
or an explicitly specified .aclj file for more custom configs
- support adding urls and surts, determine if url is already a surt, otherwise canonicalize
acl commands include:
- acl add <target_file_or_coll> <url_or_surt> <access> -- add (or replace) rule for url/surt with access level <access>
- acl remove <target_file_or_coll> <url_or_surt> -- remove url/surt from target
- acl list <target_file_or_coll> -- list all rules for target
- acl validate <target_file_or_coll> -- ensure sort order is correct, otherwise fix and save
- acl match <target_file_or_coll> <url> -- find matching rule, if any, in target for specified url, or print no match/default rule
- acl importtxt <target_file_or_coll> <filename> -- bulk import of 'excludes.txt' style rules, one url-per-line and add to target
This commit is contained in:
Ilya Kreymer 2018-02-20 23:16:51 -08:00 committed by John Berlin
parent a3f81dcc0f
commit bfa3aa7264
No known key found for this signature in database
GPG Key ID: 6EF5E4B442011B02
4 changed files with 277 additions and 2 deletions

253
pywb/manager/aclmanager.py Normal file
View File

@ -0,0 +1,253 @@
import os
import sys
import json
import re
from argparse import ArgumentParser, RawTextHelpFormatter
from collections import OrderedDict
from pywb.manager.manager import CollectionsManager
from pywb.warcserver.index.cdxobject import CDXObject
from pywb.utils.canonicalize import canonicalize
from pywb.warcserver.access_checker import AccessChecker
# ============================================================================
class ACLManager(CollectionsManager):
    """Command-line manager for sorted access-control rule (.aclj) files.

    Constructing an instance runs one complete ``acl`` sub-command: the
    target rule file is resolved and loaded, and the handler that
    ``init_parser`` registered for the chosen sub-command is invoked.

    Rules are kept in ``self.rules`` in descending order (see ``validate``).
    """

    # Heuristic used by to_key() to decide whether a value is already a SURT:
    # one or more runs of non ':' / '.' characters ending in ',' or ')'.
    SURT_RX = re.compile('([^:.]+[,)])+')

    # Access levels accepted by 'add' and 'importtxt'.
    VALID_ACCESS = ('allow', 'block', 'exclude')

    # File name used inside a collection's acl directory.
    DEFAULT_FILE = 'access-rules.aclj'

    # Sub-commands that are allowed to create the target file.
    CREATE_OPS = ('add', 'importtxt')

    def __init__(self, r):
        """Resolve the target file, load its rules and run the sub-command.

        :param r: parsed command-line arguments; reads ``coll_name``, ``op``
            and ``acl_func`` (plus whatever the selected handler reads)
        """
        self.rules = []

        coll_name = r.coll_name
        if not self.is_valid_auto_coll(r.coll_name):
            coll_name = ''

        self.target = r.coll_name

        super(ACLManager, self).__init__(coll_name, must_exist=False)

        # if auto collection, use default file in
        # ./collections/<coll>/acl/<DEFAULT_FILE>
        if os.path.isdir(self.curr_coll_dir):
            self.acl_file = os.path.join(self.acl_dir, self.DEFAULT_FILE)

        # else, treat the 'r.coll_name' param as the filename
        else:
            self.acl_file = r.coll_name

        # Commands that add rules may start from a file that does not exist
        # yet. A file that does exist must always load cleanly: otherwise a
        # later save would overwrite it with an incomplete rule list.
        must_exist = (r.op not in self.CREATE_OPS or
                      os.path.exists(self.acl_file))

        if not self.load_acl(must_exist) and must_exist:
            return

        # if 'validate', the command itself is validation
        if r.op != 'validate':
            self.validate()

        r.acl_func(self, r)

    def is_valid_auto_coll(self, coll_name):
        """Return True if ``coll_name`` matches ``COLL_RX`` and names an
        existing directory under ``COLLS_DIR``.
        """
        if not self.COLL_RX.match(coll_name):
            return False

        return os.path.isdir(os.path.join(self.COLLS_DIR, coll_name))

    def load_acl(self, must_exist=True):
        """Load the rules from ``self.acl_file`` into ``self.rules``.

        Blank lines are skipped. ``self.rules`` is only replaced when the
        whole file was read and parsed successfully.

        :param bool must_exist: if False, an IOError while opening/reading
            the file is not reported (the file may simply not exist yet)
        :return: True if the file was loaded, False otherwise
        """
        rules = []

        try:
            with open(self.acl_file, 'rb') as fh:
                for line in fh:
                    # lines read in binary mode keep their newline, so a
                    # plain truth test would never skip a blank line
                    if line.strip():
                        rules.append(CDXObject(line))

        except IOError as io:
            if must_exist:
                print('Error Occured: ' + str(io))
            return False

        except Exception as e:
            print('Error Occured: ' + str(e))
            return False

        self.rules = rules
        return True

    def save_acl(self, r=None):
        """Write ``self.rules`` to ``self.acl_file``, creating the parent
        directory when needed. Failures are printed, not raised.

        :param r: unused; accepted so the method can serve as a handler
        """
        acl_dir = os.path.dirname(self.acl_file)

        # a bare file name has no directory part to create
        if acl_dir:
            try:
                os.makedirs(acl_dir)
            except OSError:
                # directory already exists, or could not be created; in the
                # latter case the open() below reports the failure
                pass

        try:
            with open(self.acl_file, 'wb') as fh:
                for acl in self.rules:
                    fh.write(acl.to_cdxj().encode('utf-8'))

        except Exception as e:
            print('Error Saving ACL Rules: ' + str(e))

    def to_key(self, url_or_surt):
        """Return the rule key for ``url_or_surt``.

        If 'url_or_surt' already looks like a SURT (``SURT_RX`` matches),
        it is used as is; otherwise it is canonicalized.
        """
        if self.SURT_RX.search(url_or_surt):
            return url_or_surt

        return canonicalize(url_or_surt)

    def validate_access(self, access):
        """Return True if ``access`` is one of ``VALID_ACCESS``; otherwise
        print the valid values and return False.
        """
        if access not in self.VALID_ACCESS:
            print('Valid access values are: ' + ', '.join(self.VALID_ACCESS))
            return False

        return True

    def add_rule(self, r):
        """Handler for 'add': add or replace the rule for ``r.url`` with
        access level ``r.access``.
        """
        return self._add_rule(r.url, r.access)

    def _add_rule(self, url, access):
        """Insert a rule for ``url`` at its sorted position, or replace the
        existing rule with the same urlkey and timestamp, then save.

        Does nothing (beyond printing the valid values) if ``access`` is
        not a valid access level.
        """
        if not self.validate_access(access):
            return

        acl = CDXObject()
        acl['urlkey'] = self.to_key(url)
        acl['timestamp'] = '-'
        acl['access'] = access
        acl['url'] = url

        # default: append after every existing rule
        index = len(self.rules)
        replace = False

        for i, rule in enumerate(self.rules):
            if (acl['urlkey'] == rule['urlkey'] and
                    acl['timestamp'] == rule['timestamp']):
                index = i
                replace = True
                break

            # rules are kept in descending order: insert before the first
            # rule that the new one sorts above
            if acl > rule:
                index = i
                break

        if replace:
            print('Existing Rule Found, Replacing:')
            self.print_rule(self.rules[index])
            print('with:')
            self.print_rule(acl)
            self.rules[index] = acl
        else:
            print('Added new Rule:')
            self.print_rule(acl)
            self.rules.insert(index, acl)

        self.save_acl()

    def validate_save(self, r=None):
        """Handler for 'validate': re-sort the rules if they are out of
        order and, only in that case, save the file.
        """
        if self.validate(True):
            self.save_acl()

    def validate(self, log=False):
        """Ensure ``self.rules`` is in descending order, re-sorting in
        place if it is not.

        :param bool log: print whether the rules were in order
        :return: True if the rules had to be re-sorted, False otherwise
        """
        last_rule = None
        out_of_order = False

        for rule in self.rules:
            if last_rule and rule > last_rule:
                out_of_order = True
                break

            last_rule = rule

        if out_of_order:
            if log:
                print('Rules out of order, resorting')

            self.rules.sort(reverse=True)
            return True

        if log:
            print('Rules in order')

        return False

    def remove_rule(self, r):
        """Handler for 'remove': delete the first rule whose urlkey matches
        ``r.url`` and save, or report that no such rule exists.
        """
        urlkey = self.to_key(r.url)

        for i, rule in enumerate(self.rules):
            if urlkey == rule['urlkey']:
                acl = self.rules.pop(i)
                print('Removed Rule:')
                self.print_rule(acl)
                self.save_acl()
                return

        print('Rule to remove not found!')

    def list_rules(self, r):
        """Handler for 'list': print every loaded rule."""
        print('Rules for {0} from {1}:'.format(self.target, self.acl_file))
        print('')

        for rule in self.rules:
            # no newline added here; presumably to_cdxj() output is already
            # newline-terminated (save_acl relies on the same) -- confirm
            sys.stdout.write(rule.to_cdxj())

        print('')

    def find_match(self, r):
        """Handler for 'match': print the rule that applies to ``r.url``,
        or a notice that only the default rule applies.
        """
        # NOTE(review): the 'default_access' argument parsed for 'match' is
        # not used here; a fixed '<default>' placeholder is passed instead.
        access_checker = AccessChecker(self.acl_file, '<default>')
        rule = access_checker.find_access_rule(r.url)

        print('Matched rule:')
        print('')

        # an empty urlkey is treated as "no specific rule matched"
        if rule['urlkey'] == '':
            print(' <No Match, Using Default Rule>')
            print('')
        else:
            self.print_rule(rule)

    def add_excludes(self, r):
        """
        Handler for 'importtxt': import old-style excludes, in
        url-per-line format, from ``r.filename`` and add each url with
        access level ``r.access``. Blank lines are ignored.
        """
        if not self.validate_access(r.access):
            return

        try:
            with open(r.filename, 'rb') as fh:
                count = 0
                for line in fh:
                    url = line.decode('utf-8').strip()
                    # skip blank lines instead of adding an empty url
                    if not url:
                        continue

                    self._add_rule(url, r.access)
                    count += 1

            print('Added or replaced {0} rules from '.format(count) + r.filename)

        except Exception as e:
            print('Error Importing: ' + str(e))

    def print_rule(self, rule):
        """Print a single rule in cdxj form, slightly indented."""
        print(' ' + rule.to_cdxj())

    @classmethod
    def init_parser(cls, parser):
        """Register the acl sub-commands and their arguments on ``parser``.

        Each sub-command stores its handler as the ``acl_func`` default, and
        the chosen sub-command name is stored as ``op``.
        """
        subparsers = parser.add_subparsers(dest='op')
        subparsers.required = True

        # 'func' is taken from **kwargs rather than declared keyword-only,
        # so the module also parses on Python 2
        def command(name, *args, **kwargs):
            op = subparsers.add_parser(name)
            for arg in args:
                if arg == 'default_access':
                    # optional positional, defaults to 'allow'
                    op.add_argument(arg, nargs='?', default='allow')
                else:
                    op.add_argument(arg)

            op.set_defaults(acl_func=kwargs.get('func'))

        command('add', 'coll_name', 'url', 'access', func=cls.add_rule)
        command('remove', 'coll_name', 'url', func=cls.remove_rule)
        command('list', 'coll_name', func=cls.list_rules)
        command('validate', 'coll_name', func=cls.validate_save)
        command('match', 'coll_name', 'url', 'default_access', func=cls.find_match)
        command('importtxt', 'coll_name', 'filename', 'access', func=cls.add_excludes)

View File

@ -19,6 +19,7 @@ from pywb import DEFAULT_CONFIG
from six.moves import input
#=============================================================================
# to allow testing by mocking get_input
@ -66,6 +67,8 @@ directory structure expected by pywb
self.static_dir = self._get_dir('static_path')
self.templates_dir = self._get_dir('templates_dir')
self.acl_dir = self._get_dir('acl_paths')
def list_colls(self):
print('Collections:')
if not os.path.isdir(self.colls_dir):
@ -427,6 +430,16 @@ Create manage file based web archive collections
migrate.add_argument('-f', '--force', action='store_true')
migrate.set_defaults(func=do_migrate)
# ACL
from pywb.manager.aclmanager import ACLManager
def do_acl(r):
acl = ACLManager(r)
acl_help = 'Configure Access Control Lists (ACL) for a collection'
acl = subparsers.add_parser('acl', help=acl_help)
ACLManager.init_parser(acl)
acl.set_defaults(func=do_acl)
# Parse
r = parser.parse_args(args=args)
r.func(r)

View File

@ -1,6 +1,7 @@
from pywb.warcserver.index.indexsource import FileIndexSource
from pywb.warcserver.index.aggregator import DirectoryIndexSource, CacheDirectoryMixin
from pywb.warcserver.index.aggregator import SimpleAggregator
from pywb.warcserver.index.cdxobject import CDXObject
from pywb.utils.binsearch import search
from pywb.utils.merge import merge
@ -49,7 +50,11 @@ class AccessChecker(object):
else:
self.aggregator = access_source
self.default_rule = {'urlkey': '', 'access': default_access}
self.default_rule = CDXObject()
self.default_rule['urlkey'] = ''
self.default_rule['timestamp'] = '-'
self.default_rule['access'] = default_access
self.default_rule['default'] = 'true'
def create_access_aggregator(self, source_files):
sources = {}

View File

@ -121,7 +121,7 @@ class CDXObject(OrderedDict):
if fields[-1].startswith(b'{'):
self[URLKEY] = to_native_str(fields[0], 'utf-8')
self[TIMESTAMP] = to_native_str(fields[1], 'utf-8')
json_fields = json_decode(to_native_str(fields[-1], 'utf-8'))
json_fields = self.json_decode(to_native_str(fields[-1], 'utf-8'))
for n, v in six.iteritems(json_fields):
n = to_native_str(n, 'utf-8')
n = self.CDX_ALT_FIELDS.get(n, n)
@ -246,6 +246,10 @@ class CDXObject(OrderedDict):
res = (self._cached_json <= other._cached_json)
return res
@classmethod
def json_decode(cls, string):
return json_decode(string, object_pairs_hook=OrderedDict)
#=================================================================
class IDXObject(OrderedDict):