From bfa3aa7264d01780d87c54e0db855b45d6cc06d2 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 20 Feb 2018 23:16:51 -0800 Subject: [PATCH] wb-manager acl command: support manipulating sorted access-list .aclj files via command-line (ukwa/ukwa-pywb#7) - support as target an auto-collection, where the acl file is added automatically in ./collections/<coll_name>/acl/access-rules.aclj, or specifying an .aclj file explicitly for more custom configs - support adding urls and surts, determine if url is already a surt, otherwise canonicalize acl commands include: - acl add -- add (or replace) rule for url/surt with access level - acl remove -- remove url/surt from target - acl list -- list all rules for target - acl validate -- ensure sort order is correct, otherwise fix and save - acl match -- find matching rule, if any, in target for specified url, or print no match/default rule - acl importtxt -- bulk import of 'excludes.txt' style rules, one url-per-line and add to target --- pywb/manager/aclmanager.py | 253 +++++++++++++++++++++++++++++ pywb/manager/manager.py | 13 ++ pywb/warcserver/access_checker.py | 7 +- pywb/warcserver/index/cdxobject.py | 6 +- 4 files changed, 277 insertions(+), 2 deletions(-) create mode 100644 pywb/manager/aclmanager.py diff --git a/pywb/manager/aclmanager.py b/pywb/manager/aclmanager.py new file mode 100644 index 00000000..7319af89 --- /dev/null +++ b/pywb/manager/aclmanager.py @@ -0,0 +1,253 @@ +import os +import sys +import json +import re + +from argparse import ArgumentParser, RawTextHelpFormatter +from collections import OrderedDict + +from pywb.manager.manager import CollectionsManager +from pywb.warcserver.index.cdxobject import CDXObject +from pywb.utils.canonicalize import canonicalize + +from pywb.warcserver.access_checker import AccessChecker + + +# ============================================================================ +class ACLManager(CollectionsManager): + SURT_RX = re.compile('([^:.]+[,)])+') + + VALID_ACCESS = ('allow', 'block', 
'exclude') + + DEFAULT_FILE = 'access-rules.aclj' + + def __init__(self, r): + self.rules = [] + + coll_name = r.coll_name + if not self.is_valid_auto_coll(r.coll_name): + coll_name = '' + + self.target = r.coll_name + + super(ACLManager, self).__init__(coll_name, must_exist=False) + + # if auto collection, use default file in ./collections//acl/ + if os.path.isdir(self.curr_coll_dir): + self.acl_file = os.path.join(self.acl_dir, self.DEFAULT_FILE) + + # else, treat the 'r.coll_name' param as the filename + else: + self.acl_file = r.coll_name + + if r.op == 'add': + self.load_acl(False) + + else: + if not self.load_acl(True): + return + + # if 'validate', the command itself is validation + if r.op != 'validate': + self.validate() + + r.acl_func(self, r) + + def is_valid_auto_coll(self, coll_name): + if not self.COLL_RX.match(coll_name): + return False + + if not os.path.isdir(os.path.join(self.COLLS_DIR, coll_name)): + return False + + return True + + def load_acl(self, must_exist=True): + try: + with open(self.acl_file, 'rb') as fh: + for line in fh: + if line: + self.rules.append(CDXObject(line)) + + return True + + except IOError as io: + if must_exist: + print('Error Occured: ' + str(io)) + return False + + except Exception as e: + print('Error Occured: ' + str(e)) + return False + + def save_acl(self, r=None): + try: + os.makedirs(os.path.dirname(self.acl_file)) + except IOError: + pass + + try: + with open(self.acl_file, 'wb') as fh: + for acl in self.rules: + fh.write(acl.to_cdxj().encode('utf-8')) + + except Exception as e: + print('Error Saving ACL Rules: ' + str(e)) + + def to_key(self, url_or_surt): + """ If 'url_or_surt' already a SURT, use as is + """ + if self.SURT_RX.search(url_or_surt): + return url_or_surt + else: + return canonicalize(url_or_surt) + + def validate_access(self, access): + if access not in self.VALID_ACCESS: + print('Valid access values are: ' + ', '.join(self.VALID_ACCESS)) + return False + + return True + + def add_rule(self, r): 
+ return self._add_rule(r.url, r.access) + + def _add_rule(self, url, access): + if not self.validate_access(access): + return + + acl = CDXObject() + acl['urlkey'] = self.to_key(url) + acl['timestamp'] = '-' + acl['access'] = access + acl['url'] = url + + i = 0 + replace = False + + for rule in self.rules: + if acl['urlkey'] == rule['urlkey'] and acl['timestamp'] == rule['timestamp']: + replace = True + break + + if acl > rule: + break + + i += 1 + + if replace: + print('Existing Rule Found, Replacing:') + self.print_rule(self.rules[i]) + print('with:') + self.print_rule(acl) + self.rules[i] = acl + else: + print('Added new Rule:') + self.print_rule(acl) + self.rules.insert(i, acl) + + self.save_acl() + + def validate_save(self, r=None): + if self.validate(True): + self.save_acl() + + def validate(self, log=False): + last_rule = None + out_of_order = False + for rule in self.rules: + if last_rule and rule > last_rule: + out_of_order = True + break + + last_rule = rule + + if out_of_order: + if log: + print('Rules out of order, resorting') + self.rules.sort(reverse=True) + return True + else: + if log: + print('Rules in order') + + return False + + def remove_rule(self, r): + i = 0 + urlkey = self.to_key(r.url) + for rule in self.rules: + if urlkey == rule['urlkey']:# and r.timestamp == rule['timestamp']: + acl = self.rules.pop(i) + print('Removed Rule:') + self.print_rule(acl) + self.save_acl() + return + + i += 1 + + print('Rule to remove not found!') + + def list_rules(self, r): + print('Rules for {0} from {1}:'.format(self.target, self.acl_file)) + print('') + for rule in self.rules: + sys.stdout.write(rule.to_cdxj()) + print('') + + def find_match(self, r): + access_checker = AccessChecker(self.acl_file, '') + rule = access_checker.find_access_rule(r.url) + + print('Matched rule:') + print('') + if rule['urlkey'] == '': + print(' ') + print('') + else: + self.print_rule(rule) + + def add_excludes(self, r): + """ + Import old-style excludes, in url-per-line 
format + """ + if not self.validate_access(r.access): + return + + try: + with open(r.filename, 'rb') as fh: + count = 0 + for url in fh: + url = url.decode('utf-8').strip() + self._add_rule(url, r.access) + count += 1 + + print('Added or replaced {0} rules from '.format(count) + r.filename) + + except Exception as e: + print('Error Importing: ' + str(e)) + + def print_rule(self, rule): + print(' ' + rule.to_cdxj()) + + @classmethod + def init_parser(cls, parser): + subparsers = parser.add_subparsers(dest='op') + subparsers.required = True + + def command(name, *args, func=None): + op = subparsers.add_parser(name) + for arg in args: + if arg == 'default_access': + op.add_argument(arg, nargs='?', default='allow') + else: + op.add_argument(arg) + op.set_defaults(acl_func=func) + + command('add', 'coll_name', 'url', 'access', func=cls.add_rule) + command('remove', 'coll_name', 'url', func=cls.remove_rule) + command('list', 'coll_name', func=cls.list_rules) + command('validate', 'coll_name', func=cls.validate_save) + command('match', 'coll_name', 'url', 'default_access', func=cls.find_match) + command('importtxt', 'coll_name', 'filename', 'access', func=cls.add_excludes) + diff --git a/pywb/manager/manager.py b/pywb/manager/manager.py index 04fe90f2..d86b12d7 100644 --- a/pywb/manager/manager.py +++ b/pywb/manager/manager.py @@ -19,6 +19,7 @@ from pywb import DEFAULT_CONFIG from six.moves import input + #============================================================================= # to allow testing by mocking get_input @@ -66,6 +67,8 @@ directory structure expected by pywb self.static_dir = self._get_dir('static_path') self.templates_dir = self._get_dir('templates_dir') + self.acl_dir = self._get_dir('acl_paths') + def list_colls(self): print('Collections:') if not os.path.isdir(self.colls_dir): @@ -427,6 +430,16 @@ Create manage file based web archive collections migrate.add_argument('-f', '--force', action='store_true') migrate.set_defaults(func=do_migrate) + # ACL 
+ from pywb.manager.aclmanager import ACLManager + def do_acl(r): + acl = ACLManager(r) + + acl_help = 'Configure Access Control Lists (ACL) for a collection' + acl = subparsers.add_parser('acl', help=acl_help) + ACLManager.init_parser(acl) + acl.set_defaults(func=do_acl) + # Parse r = parser.parse_args(args=args) r.func(r) diff --git a/pywb/warcserver/access_checker.py b/pywb/warcserver/access_checker.py index aaaf67b7..efe650cc 100644 --- a/pywb/warcserver/access_checker.py +++ b/pywb/warcserver/access_checker.py @@ -1,6 +1,7 @@ from pywb.warcserver.index.indexsource import FileIndexSource from pywb.warcserver.index.aggregator import DirectoryIndexSource, CacheDirectoryMixin from pywb.warcserver.index.aggregator import SimpleAggregator +from pywb.warcserver.index.cdxobject import CDXObject from pywb.utils.binsearch import search from pywb.utils.merge import merge @@ -49,7 +50,11 @@ class AccessChecker(object): else: self.aggregator = access_source - self.default_rule = {'urlkey': '', 'access': default_access} + self.default_rule = CDXObject() + self.default_rule['urlkey'] = '' + self.default_rule['timestamp'] = '-' + self.default_rule['access'] = default_access + self.default_rule['default'] = 'true' def create_access_aggregator(self, source_files): sources = {} diff --git a/pywb/warcserver/index/cdxobject.py b/pywb/warcserver/index/cdxobject.py index 8f050cb1..5729f879 100644 --- a/pywb/warcserver/index/cdxobject.py +++ b/pywb/warcserver/index/cdxobject.py @@ -121,7 +121,7 @@ class CDXObject(OrderedDict): if fields[-1].startswith(b'{'): self[URLKEY] = to_native_str(fields[0], 'utf-8') self[TIMESTAMP] = to_native_str(fields[1], 'utf-8') - json_fields = json_decode(to_native_str(fields[-1], 'utf-8')) + json_fields = self.json_decode(to_native_str(fields[-1], 'utf-8')) for n, v in six.iteritems(json_fields): n = to_native_str(n, 'utf-8') n = self.CDX_ALT_FIELDS.get(n, n) @@ -246,6 +246,10 @@ class CDXObject(OrderedDict): res = (self._cached_json <= 
other._cached_json) return res + @classmethod + def json_decode(cls, string): + return json_decode(string, object_pairs_hook=OrderedDict) + #================================================================= class IDXObject(OrderedDict):