2018-02-20 23:16:51 -08:00
|
|
|
import os
|
|
|
|
import re
|
2019-04-03 18:02:25 -04:00
|
|
|
import sys
|
2018-02-20 23:16:51 -08:00
|
|
|
|
|
|
|
from pywb.manager.manager import CollectionsManager
|
|
|
|
from pywb.utils.canonicalize import canonicalize
|
|
|
|
from pywb.warcserver.access_checker import AccessChecker
|
2019-04-03 18:02:25 -04:00
|
|
|
from pywb.warcserver.index.cdxobject import CDXObject
|
2018-02-20 23:16:51 -08:00
|
|
|
|
|
|
|
|
|
|
|
# ============================================================================
|
|
|
|
class ACLManager(CollectionsManager):
    """Command-line manager for an access control list (ACL) file.

    Rules are held in ``self.rules`` as ``CDXObject`` entries and are kept
    in descending order (see ``validate`` and ``_add_rule``). Each sub-command
    registered in ``init_parser`` maps to one of the methods below through
    the ``acl_func`` parser default.
    """

    # Matches values that already look like a SURT (e.g. ``com,example)/``)
    SURT_RX = re.compile('([^:.]+[,)])+')

    # Access values accepted for a rule
    VALID_ACCESS = ('allow', 'block', 'exclude', 'allow_ignore_embargo')

    # File name used inside a collection's acl directory
    DEFAULT_FILE = 'access-rules.aclj'

    def __init__(self, r):
        """Set up the manager for the collection or file named on the command line.

        :param argparse.Namespace r: Parsed result from ArgumentParser
        :rtype: None
        """
        self.rules = []

        # Only hand a real auto-collection name to the base class; anything
        # else is treated as a plain file path and resolved in process().
        coll_name = r.coll_name
        if not self.is_valid_auto_coll(r.coll_name):
            coll_name = ''

        self.target = r.coll_name

        super(ACLManager, self).__init__(coll_name, must_exist=False)

        self.acl_file = None

    def process(self, r):
        """Resolve the ACL file, load it as required by the operation and
        dispatch to the handler selected by the argument parser.

        :param argparse.Namespace r: Parsed result from ArgumentParser
        :rtype: None
        """
        # if target exists as a file, use that
        if os.path.isfile(self.target):
            self.acl_file = self.target

        # otherwise, if auto collection, use default file in
        # ./collections/<coll>/acl/<DEFAULT_FILE>
        elif os.path.isdir(self.curr_coll_dir):
            self.acl_file = os.path.join(self.acl_dir, self.DEFAULT_FILE)

        # else, assume filename (may not exist yet)
        else:
            self.acl_file = self.target

        # for add/import, file doesn't have to exist
        if r.op in ('add', 'importtxt'):
            self.load_acl(False)

        # for other ops (except matching), ensure entire file loads
        # successfully, log errors
        elif r.op != 'match':
            if not self.load_acl(True):
                sys.exit(2)
                return

        # if 'validate', the command itself is validation
        if r.op != 'validate':
            self.validate()

        r.acl_func(self, r)

    def is_valid_auto_coll(self, coll_name):
        """Returns T/F indicating if the supplied collection name
        is a valid collection

        :param coll_name: The collection name to check
        :return: T/F indicating a valid collection
        :rtype: bool
        """
        if not self.COLL_RX.match(coll_name):
            return False

        return os.path.isdir(os.path.join(self.COLLS_DIR, coll_name))

    def load_acl(self, must_exist=True):
        """Loads the access control list, appending each rule to ``self.rules``

        :param bool must_exist: Does the acl file have to exist; when False
            an IOError while opening/reading the file is not reported
        :return: T/F indicating load success
        :rtype: bool
        """
        try:
            with open(self.acl_file, 'rb') as fh:
                for line in fh:
                    # lines read in binary mode keep their newline, so strip
                    # before testing to actually skip blank lines
                    if line.strip():
                        self.rules.append(CDXObject(line))

            return True

        except IOError as io:
            if must_exist:
                print('Error Occurred: ' + str(io))

            return False

        except Exception as e:
            print('Error Occurred: ' + str(e))
            return False

    def save_acl(self, r=None):
        """Save the contents of the rules as cdxj entries to
        the access control list file

        Errors while writing are printed, not raised.

        :param argparse.Namespace|None r: Not used
        :rtype: None
        """
        try:
            os.makedirs(os.path.dirname(self.acl_file))
        except OSError:
            # best effort: the directory may already exist; a real problem
            # is reported when the file is opened below
            pass

        try:
            with open(self.acl_file, 'wb') as fh:
                for acl in self.rules:
                    fh.write(acl.to_cdxj().encode('utf-8'))

        except Exception as e:
            print('Error Saving ACL Rules: ' + str(e))

    def to_key(self, url_or_surt, exact_match=False):
        """ If 'url_or_surt' already a SURT, use as is
        If exact match, add the exact match suffix

        :param str url_or_surt: The url or surt to be converted to an acl key
        :param bool exact_match: Should the exact match suffix be added to key
        :rtype: str
        """
        if self.SURT_RX.search(url_or_surt):
            result = url_or_surt
        else:
            result = canonicalize(url_or_surt)

        if exact_match:
            result += AccessChecker.EXACT_SUFFIX

        return result

    def validate_access(self, access):
        """Returns true if the supplied access value is valid
        otherwise terminates the process

        :param str access: The access value to be validated
        :return: True if valid
        :rtype: bool
        """
        if access not in self.VALID_ACCESS:
            print('Valid access values are: ' + ', '.join(self.VALID_ACCESS))
            sys.exit(1)

        return True

    def add_rule(self, r):
        """Adds a rule to the ACL manager

        :param argparse.Namespace r: The argparse namespace representing the rule to be added
        :rtype: None
        """
        return self._add_rule(r.url, r.access, r.exact_match, r.user)

    def _add_rule(self, url, access, exact_match=False, user=None):
        """Adds a rule to the acl file, replacing an existing rule that has
        the same urlkey, timestamp and user

        :param str url: The URL for the rule
        :param str access: The access value for the rule
        :param bool exact_match: Is the rule to be added an exact match
        :param str|None user: Optional user the rule applies to
        :rtype: None
        """
        if not self.validate_access(access):
            return

        acl = CDXObject()
        acl['urlkey'] = self.to_key(url, exact_match)
        acl['timestamp'] = '-'
        acl['access'] = access
        acl['url'] = url
        if user:
            acl['user'] = user

        # Default to appending; otherwise stop at the rule being replaced or
        # at the first rule the new one sorts above (rules are descending).
        index = len(self.rules)
        replace = False

        for pos, rule in enumerate(self.rules):
            same_rule = (acl['urlkey'] == rule['urlkey'] and
                         acl['timestamp'] == rule['timestamp'] and
                         acl.get('user') == rule.get('user'))
            if same_rule:
                replace = True
                index = pos
                break

            if acl > rule:
                index = pos
                break

        if replace:
            print('Existing Rule Found, Replacing:')
            self.print_rule(self.rules[index])
            print('with:')
            self.print_rule(acl)
            self.rules[index] = acl
        else:
            print('Added new Rule:')
            self.print_rule(acl)
            self.rules.insert(index, acl)

        self.save_acl()

    def validate_save(self, r=None, log=False):
        """Validates the acl rules and saves the file

        :param argparse.Namespace|None r: Not used
        :param bool log: Should a report be printed to stdout
        :rtype: None
        """
        self.validate(log=log, correct=True)

    def validate(self, log=False, correct=False):
        """Checks that the acl rules are in descending order, optionally
        re-sorting and saving them when they are not

        :param bool log: Should the results of validating be logged to stdout
        :param bool correct: Should invalid results be corrected and saved
        :rtype: None
        """
        last_rule = None
        out_of_order = False
        for rule in self.rules:
            # a rule greater than its predecessor breaks the descending order
            if last_rule and rule > last_rule:
                out_of_order = True
                break

            last_rule = rule

        if out_of_order:
            if log:
                print('Rules out of order, resorting')

            if correct:
                self.rules.sort(reverse=True)
                self.save_acl()

        elif log:
            print('Rules in order')

    def remove_rule(self, r):
        """Removes the first rule matching the url key and user from the acl file

        :param argparse.Namespace r: Parsed result from ArgumentParser
        :rtype: None
        """
        urlkey = self.to_key(r.url, r.exact_match)

        for index, rule in enumerate(self.rules):
            if urlkey == rule['urlkey'] and r.user == rule.get('user'):
                removed = self.rules.pop(index)
                print('Removed Rule:')
                self.print_rule(removed)
                self.save_acl()
                return

        print('Rule to remove not found!')

    def list_rules(self, r):
        """Print the acl rules to the stdout

        :param argparse.Namespace|None r: Not used
        :rtype: None
        """
        print('Rules for {0} from {1}:'.format(self.target, self.acl_file))
        print('')
        for rule in self.rules:
            sys.stdout.write(rule.to_cdxj())

        print('')

    def find_match(self, r):
        """Finds and prints the acl rule matching the supplied url

        :param argparse.Namespace r: Parsed result from ArgumentParser
        :rtype: None
        """
        access_checker = AccessChecker(self.acl_file, '<default>')
        rule = access_checker.find_access_rule(r.url, acl_user=r.user)

        print('Matched rule:')
        print('')
        # an empty urlkey is treated as "no explicit rule matched"
        if rule['urlkey'] == '':
            print(' <No Match, Using Default Rule>')
            print('')
        else:
            self.print_rule(rule)

    def add_excludes(self, r):
        """
        Import old-style excludes, in url-per-line format

        Blank lines in the input file are skipped. Any error while importing
        is printed and terminates the process with exit status 1.

        :param argparse.Namespace r: Parsed result from ArgumentParser
        :rtype: None
        """
        if not self.validate_access(r.access):
            return

        try:
            with open(r.filename, 'rb') as fh:
                count = 0
                for url in fh:
                    url = url.decode('utf-8').strip()
                    # ignore empty lines instead of adding a rule for ''
                    if not url:
                        continue

                    self._add_rule(url, r.access)
                    count += 1

            print('Added or replaced {0} rules from '.format(count) + r.filename)

        except Exception as e:
            print('Error Importing: ' + str(e))
            sys.exit(1)

    def print_rule(self, rule):
        """Prints the supplied rule to the std out

        :param CDXObject rule: The rule to be printed
        :rtype: None
        """
        print(' ' + rule.to_cdxj())

    @classmethod
    def init_parser(cls, parser):
        """Initializes an argument parser for acl commands

        :param argparse.ArgumentParser parser: The parser to be initialized
        :rtype: None
        """
        subparsers = parser.add_subparsers(dest='op')
        subparsers.required = True

        def command(name, *args, **kwargs):
            """Register sub-command ``name`` with positional ``args``.

            Keyword options: ``func`` (required handler), ``user_opt`` and
            ``exact_opt`` (add the -u/--user and -e/--exact-match flags).
            """
            op = subparsers.add_parser(name)
            for arg in args:
                if arg == 'default_access':
                    # optional positional, falls back to 'allow'
                    op.add_argument(arg, nargs='?', default='allow')
                else:
                    op.add_argument(arg)

            if kwargs.get('user_opt'):
                op.add_argument('-u', '--user')

            if kwargs.get('exact_opt'):
                op.add_argument('-e', '--exact-match', action='store_true', default=False)

            op.set_defaults(acl_func=kwargs['func'])

        command('add', 'coll_name', 'url', 'access', func=cls.add_rule, exact_opt=True, user_opt=True)
        command('remove', 'coll_name', 'url', func=cls.remove_rule, exact_opt=True, user_opt=True)
        command('list', 'coll_name', func=cls.list_rules)
        command('validate', 'coll_name', func=cls.validate_save)
        command('match', 'coll_name', 'url', 'default_access', func=cls.find_match, user_opt=True)
        command('importtxt', 'coll_name', 'filename', 'access', func=cls.add_excludes)
|
|
|
|
|