wb-manager acl command: support manipulating sorted access-list .aclj files via command-line (ukwa/ukwa-pywb#7)

- support as target an auto-collection, where the acl file is added automatically in ./collections/<coll>/acl/access-rules.aclj, or specifying an .aclj explicitly for more custom configs - support adding urls and surts, determine if url is already a surt, otherwise canonicalize. acl commands include: - acl add <target_file_or_coll> <url_or_surt> <access> -- add (or replace) rule for url/surt with access level <access> - acl remove <target_file_or_coll> <url_or_surt> -- remove url/surt from target - acl list <target_file_or_coll> -- list all rules for target - acl validate <target_file_or_coll> -- ensure sort order is correct, otherwise fix and save - acl match <target_file_or_coll> <url> -- find matching rule, if any, in target for specified url, or print no match/default rule - acl importtxt <target_file_or_coll> <filename> -- bulk import of 'excludes.txt' style rules, one url-per-line and add to target
2025-03-15 00:03:28 +01:00 · 2018-02-20 23:16:51 -08:00 · 2018-02-20 23:16:51 -08:00 · bfa3aa7264
commit bfa3aa7264
parent a3f81dcc0f
4 changed files with 277 additions and 2 deletions
--- a/pywb/manager/aclmanager.py
+++ b/pywb/manager/aclmanager.py
@ -0,0 +1,253 @@
+import os
+import sys
+import json
+import re
+
+from argparse import ArgumentParser, RawTextHelpFormatter
+from collections import OrderedDict
+
+from pywb.manager.manager import CollectionsManager
+from pywb.warcserver.index.cdxobject import CDXObject
+from pywb.utils.canonicalize import canonicalize
+
+from pywb.warcserver.access_checker import AccessChecker
+
+
+# ============================================================================
+class ACLManager(CollectionsManager):
+    """ Manage the rules of a sorted access-list (.aclj) file from the command-line
+
+    Rules are kept in 'self.rules' as CDXObject entries; the sub-commands
+    registered in init_parser() add, remove, list, validate, match and
+    bulk-import rules.
+    """
+    # Used by to_key() to detect input that is already a SURT: one or more
+    # runs of characters other than ':' and '.', each ending in ',' or ')'
+    SURT_RX = re.compile('([^:.]+[,)])+')
+
+    # Access levels accepted by validate_access()
+    VALID_ACCESS = ('allow', 'block', 'exclude')
+
+    # File name used inside a collection's acl directory
+    DEFAULT_FILE = 'access-rules.aclj'
+
+    def __init__(self, r):
+        """ Resolve the target .aclj file, load its rules and run the command
+
+        :param r: parsed command-line arguments; 'coll_name', 'op' and
+        'acl_func' are read here
+        """
+        self.rules = []
+        self.target = r.coll_name
+
+        # only an existing auto-collection is passed on as the collection name
+        auto_coll = r.coll_name if self.is_valid_auto_coll(r.coll_name) else ''
+
+        super(ACLManager, self).__init__(auto_coll, must_exist=False)
+
+        # without a collection dir, the 'r.coll_name' param is the filename,
+        # otherwise use ./collections/<coll>/acl/<DEFAULT_FILE>
+        if not os.path.isdir(self.curr_coll_dir):
+            self.acl_file = r.coll_name
+        else:
+            self.acl_file = os.path.join(self.acl_dir, self.DEFAULT_FILE)
+
+        # 'add' may start from a missing file, all other commands need one
+        file_required = r.op != 'add'
+        loaded = self.load_acl(file_required)
+        if file_required and not loaded:
+            return
+
+        # for 'validate', the command itself performs the validation
+        if r.op != 'validate':
+            self.validate()
+
+        r.acl_func(self, r)
+
+    def is_valid_auto_coll(self, coll_name):
+        """ Return True if 'coll_name' matches COLL_RX and is an existing
+        directory under COLLS_DIR, False otherwise
+        """
+        if self.COLL_RX.match(coll_name):
+            coll_dir = os.path.join(self.COLLS_DIR, coll_name)
+            return os.path.isdir(coll_dir)
+
+        return False
+
+    def load_acl(self, must_exist=True):
+        """ Load the rules from 'self.acl_file' and append them to 'self.rules'
+
+        Blank lines are skipped. 'self.rules' is only changed if the whole
+        file was read and parsed successfully.
+
+        :param bool must_exist: if True, report a file that can not be opened,
+        if False, fail silently in that case
+        :return: True if the rules were loaded, False on any error
+        """
+        loaded = []
+
+        try:
+            with open(self.acl_file, 'rb') as fh:
+                for line in fh:
+                    # lines read from a file keep their line ending, so a
+                    # plain truth test would never filter out a blank line
+                    if line.strip():
+                        loaded.append(CDXObject(line))
+
+        except IOError as io:
+            if must_exist:
+                print('Error Occured: ' + str(io))
+            return False
+
+        except Exception as e:
+            print('Error Occured: ' + str(e))
+            return False
+
+        # commit only after a complete read, so a failure part-way through
+        # does not leave a truncated rule list behind
+        self.rules.extend(loaded)
+        return True
+
+    def save_acl(self, r=None):
+        """ Write all rules to 'self.acl_file', creating its directory if needed
+
+        Errors are printed, not raised.
+
+        :param r: unused
+        """
+        parent_dir = os.path.dirname(self.acl_file)
+
+        # a bare filename has no directory part, and os.makedirs('') fails
+        if parent_dir and not os.path.isdir(parent_dir):
+            try:
+                os.makedirs(parent_dir)
+            # os.makedirs() raises OSError, which is an alias of IOError
+            # only on Python 3
+            except OSError:
+                # best effort: a real problem is reported by open() below
+                pass
+
+        try:
+            with open(self.acl_file, 'wb') as fh:
+                for acl in self.rules:
+                    fh.write(acl.to_cdxj().encode('utf-8'))
+
+        except Exception as e:
+            print('Error Saving ACL Rules: ' + str(e))
+
+    def to_key(self, url_or_surt):
+        """ Return the rule key for 'url_or_surt': input in which SURT_RX
+        is found is used as is, anything else is canonicalized
+        """
+        is_surt = self.SURT_RX.search(url_or_surt) is not None
+        return url_or_surt if is_surt else canonicalize(url_or_surt)
+
+    def validate_access(self, access):
+        """ Check 'access' against VALID_ACCESS, printing the valid values
+        and returning False if it is not one of them, True otherwise
+        """
+        is_valid = access in self.VALID_ACCESS
+        if not is_valid:
+            print('Valid access values are: ' + ', '.join(self.VALID_ACCESS))
+
+        return is_valid
+
+    def add_rule(self, r):
+        """ 'add' command: add (or replace) the rule for 'r.url' with
+        access level 'r.access'
+        """
+        url, access = r.url, r.access
+        return self._add_rule(url, access)
+
+    def _add_rule(self, url, access):
+        """ Add a rule for 'url' with the given access level and save the rules
+
+        A rule with the same urlkey and timestamp is replaced, otherwise the
+        new rule is inserted in front of the first rule it compares greater
+        than, or appended if there is none. Nothing is changed if 'access'
+        is not a valid access level.
+        """
+        if not self.validate_access(access):
+            return
+
+        new_rule = CDXObject()
+        new_rule['urlkey'] = self.to_key(url)
+        new_rule['timestamp'] = '-'
+        new_rule['access'] = access
+        new_rule['url'] = url
+
+        # defaults to the end of the list, used if no rule stops the scan
+        pos = len(self.rules)
+        existing = False
+
+        for idx, rule in enumerate(self.rules):
+            same_key = new_rule['urlkey'] == rule['urlkey']
+            if same_key and new_rule['timestamp'] == rule['timestamp']:
+                existing = True
+                pos = idx
+                break
+
+            if new_rule > rule:
+                pos = idx
+                break
+
+        if not existing:
+            print('Added new Rule:')
+            self.print_rule(new_rule)
+            self.rules.insert(pos, new_rule)
+        else:
+            print('Existing Rule Found, Replacing:')
+            self.print_rule(self.rules[pos])
+            print('with:')
+            self.print_rule(new_rule)
+            self.rules[pos] = new_rule
+
+        self.save_acl()
+
+    def validate_save(self, r=None):
+        """ 'validate' command: check the sort order and save the rules
+        only if they had to be resorted
+        """
+        resorted = self.validate(True)
+        if resorted:
+            self.save_acl()
+
+    def validate(self, log=False):
+        """ Ensure the rules are in descending order, resorting them if not
+
+        :param bool log: print the outcome if True
+        :return: True if the rules were resorted, False if already in order
+        """
+        # compare each rule with its predecessor; a falsy predecessor is
+        # not compared against
+        neighbours = zip(self.rules, self.rules[1:])
+        in_order = not any(prev and curr > prev for prev, curr in neighbours)
+
+        if in_order:
+            if log:
+                print('Rules in order')
+
+            return False
+
+        if log:
+            print('Rules out of order, resorting')
+        self.rules.sort(reverse=True)
+        return True
+
+    def remove_rule(self, r):
+        """ 'remove' command: remove the first rule whose urlkey matches
+        'r.url' and save, or report that no such rule was found
+        """
+        urlkey = self.to_key(r.url)
+
+        # only the urlkey is compared, the timestamp is not taken into account
+        for idx, rule in enumerate(self.rules):
+            if urlkey != rule['urlkey']:
+                continue
+
+            removed = self.rules.pop(idx)
+            print('Removed Rule:')
+            self.print_rule(removed)
+            self.save_acl()
+            return
+
+        print('Rule to remove not found!')
+
+    def list_rules(self, r):
+        """ 'list' command: print every rule of the target in cdxj form
+        """
+        header = 'Rules for {0} from {1}:'.format(self.target, self.acl_file)
+        print(header)
+        print('')
+
+        write = sys.stdout.write
+        for rule in self.rules:
+            write(rule.to_cdxj())
+
+        print('')
+
+    def find_match(self, r):
+        """ 'match' command: print the rule matching 'r.url' in the target
+        file, or a notice that no rule matched and the default is used
+        """
+        checker = AccessChecker(self.acl_file, '<default>')
+        matched = checker.find_access_rule(r.url)
+
+        print('Matched rule:')
+        print('')
+
+        # a rule with an empty urlkey is reported as 'no match'
+        if matched['urlkey'] != '':
+            self.print_rule(matched)
+            return
+
+        print('    <No Match, Using Default Rule>')
+        print('')
+
+    def add_excludes(self, r):
+        """
+        Import old-style excludes, in url-per-line format
+
+        Each non-blank line of 'r.filename' is added (or replaced) as a rule
+        with access level 'r.access'. Nothing is imported if 'r.access' is
+        not a valid access level. Errors are printed, not raised.
+        """
+        if not self.validate_access(r.access):
+            return
+
+        try:
+            with open(r.filename, 'rb') as fh:
+                count = 0
+                for url in fh:
+                    url = url.decode('utf-8').strip()
+
+                    # skip blank lines instead of adding a rule for an
+                    # empty url
+                    if not url:
+                        continue
+
+                    self._add_rule(url, r.access)
+                    count += 1
+
+            print('Added or replaced {0} rules from '.format(count) + r.filename)
+
+        except Exception as e:
+            print('Error Importing: ' + str(e))
+
+    def print_rule(self, rule):
+        """ Print the cdxj form of 'rule', indented by four spaces
+        """
+        indent = ' ' * 4
+        print(indent + rule.to_cdxj())
+
+    @classmethod
+    def init_parser(cls, parser):
+        """ Register the acl sub-commands and their positional arguments
+
+        The chosen sub-command name is stored as 'op' and the method
+        implementing it as 'acl_func' on the parsed arguments.
+
+        :param parser: the argparse parser of the 'acl' command
+        """
+        subparsers = parser.add_subparsers(dest='op')
+        subparsers.required = True
+
+        # 'func' is read from **kwargs: a keyword-only parameter following
+        # *args is Python 3 only syntax
+        def command(name, *args, **kwargs):
+            op = subparsers.add_parser(name)
+            for arg in args:
+                # 'default_access' is the only optional positional argument
+                if arg == 'default_access':
+                    op.add_argument(arg, nargs='?', default='allow')
+                else:
+                    op.add_argument(arg)
+            op.set_defaults(acl_func=kwargs.get('func'))
+
+        command('add', 'coll_name', 'url', 'access', func=cls.add_rule)
+        command('remove', 'coll_name', 'url', func=cls.remove_rule)
+        command('list', 'coll_name', func=cls.list_rules)
+        command('validate', 'coll_name', func=cls.validate_save)
+        command('match', 'coll_name', 'url', 'default_access', func=cls.find_match)
+        command('importtxt', 'coll_name', 'filename', 'access', func=cls.add_excludes)
+
--- a/pywb/manager/manager.py
+++ b/pywb/manager/manager.py
@ -19,6 +19,7 @@ from pywb import DEFAULT_CONFIG

 from six.moves import input

+
 #=============================================================================
 # to allow testing by mocking get_input

@ -66,6 +67,8 @@ directory structure expected by pywb
        self.static_dir = self._get_dir('static_path')
        self.templates_dir = self._get_dir('templates_dir')

+        self.acl_dir = self._get_dir('acl_paths')
+
    def list_colls(self):
        print('Collections:')
        if not os.path.isdir(self.colls_dir):
@ -427,6 +430,16 @@ Create manage file based web archive collections
    migrate.add_argument('-f', '--force', action='store_true')
    migrate.set_defaults(func=do_migrate)

+    # ACL
+    from pywb.manager.aclmanager import ACLManager
+    def do_acl(r):
+        # the ACLManager constructor loads the rules and runs the requested
+        # sub-command itself, so the instance does not need to be kept
+        ACLManager(r)
+
+    acl_help = 'Configure Access Control Lists (ACL) for a collection'
+    acl = subparsers.add_parser('acl', help=acl_help)
+    ACLManager.init_parser(acl)
+    acl.set_defaults(func=do_acl)
+
    # Parse
    r = parser.parse_args(args=args)
    r.func(r)
--- a/pywb/warcserver/access_checker.py
+++ b/pywb/warcserver/access_checker.py
@ -1,6 +1,7 @@
 from pywb.warcserver.index.indexsource import FileIndexSource
 from pywb.warcserver.index.aggregator import DirectoryIndexSource, CacheDirectoryMixin
 from pywb.warcserver.index.aggregator import SimpleAggregator
+from pywb.warcserver.index.cdxobject import CDXObject

 from pywb.utils.binsearch import search
 from pywb.utils.merge import merge
@ -49,7 +50,11 @@ class AccessChecker(object):
        else:
            self.aggregator = access_source

-        self.default_rule = {'urlkey': '', 'access': default_access}
+        self.default_rule = CDXObject()
+        self.default_rule['urlkey'] = ''
+        self.default_rule['timestamp'] = '-'
+        self.default_rule['access'] = default_access
+        self.default_rule['default'] = 'true'

    def create_access_aggregator(self, source_files):
        sources = {}
--- a/pywb/warcserver/index/cdxobject.py
+++ b/pywb/warcserver/index/cdxobject.py
@ -121,7 +121,7 @@ class CDXObject(OrderedDict):
        if fields[-1].startswith(b'{'):
            self[URLKEY] = to_native_str(fields[0], 'utf-8')
            self[TIMESTAMP] = to_native_str(fields[1], 'utf-8')
-            json_fields = json_decode(to_native_str(fields[-1], 'utf-8'))
+            json_fields = self.json_decode(to_native_str(fields[-1], 'utf-8'))
            for n, v in six.iteritems(json_fields):
                n = to_native_str(n, 'utf-8')
                n = self.CDX_ALT_FIELDS.get(n, n)
@ -246,6 +246,10 @@ class CDXObject(OrderedDict):
        res = (self._cached_json <= other._cached_json)
        return res

+    @classmethod
+    def json_decode(cls, string):
+        """ Decode the JSON text 'string' via the module's json_decode,
+        passing OrderedDict as 'object_pairs_hook'
+
+        NOTE(review): presumably json_decode is json.loads-compatible, in
+        which case JSON objects come back as OrderedDict -- confirm
+        """
+        return json_decode(string, object_pairs_hook=OrderedDict)
+

 #=================================================================
 class IDXObject(OrderedDict):