1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-21 19:12:10 +01:00
pywb/pywb/warcserver/access_checker.py
Ilya Kreymer f07d35709a
Access Control Improvements: Embargo + ACL User Support (#642)
* embargo: add support for per-collection date range embargo with embargo options of 'before', 'after', 'newer' and 'older'
'before' and 'after' accept a timestamp
'newer' and 'older' options configured with a dictionary consisting of any combo of 'years', 'months', 'days'
add basic test for each embargo option

* acl/embargo work:
- support acl access value 'allow_ignore_embargo' for overriding embargo
- support 'user' in acl setting, matched with value of 'X-Pywb-ACL-User' header
- support passing through 'X-Pywb-ACL-User' setting to warcserver
- aclmanager: support -u/--user param for adding, removing and matching rules
- tests: add test for 'allow_ignore_embargo', user-specific acl rule matching

* docs: add docs for new embargo system!

* docs: add info on how to configure ACL header with short examples to usage page.
sample-deploy: add examples of configuring X-pywb-ACL-user header based on IP for nginx and apache sample deployments

* docs: fix access control page header, text tweaks

* bump version to 2.6.0b0
2021-05-18 20:09:18 -07:00

346 lines
12 KiB
Python

from pywb.warcserver.index.indexsource import FileIndexSource
from pywb.warcserver.index.aggregator import DirectoryIndexSource, CacheDirectoryMixin
from pywb.warcserver.index.aggregator import SimpleAggregator
from pywb.warcserver.index.cdxobject import CDXObject
from pywb.utils.binsearch import search
from pywb.utils.merge import merge
from warcio.timeutils import timestamp_to_datetime
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import os
# ============================================================================
class FileAccessIndexSource(FileIndexSource):
    """File-backed index source used for access control list (ACL) files"""

    @staticmethod
    def rev_cmp(a, b):
        """Compares two values in the reverse of the order produced by
        the removed builtin cmp

        :param a: The first value to be compared
        :param b: The second value to be compared
        :return: 1 if a sorts before b, -1 if a sorts after b, otherwise 0
        :rtype: int
        """
        is_less = a < b
        is_greater = a > b
        return is_less - is_greater

    def _do_iter(self, fh, params):
        """Searches the supplied access control list file handle for the
        key found in params and yields every line the search produces

        :param TextIO fh: The file handle to an access control list
        :param dict params: The query params; 'key' is required and
            'exact_match_suffix', when set, is appended to the key
        :return: A generator yielding the lines returned by the search
        """
        search_key = params['key']
        suffix = params.get('exact_match_suffix')
        if suffix:
            search_key += suffix

        matches = search(fh, search_key, prev_size=1,
                         compare_func=self.rev_cmp)
        for entry in matches:
            yield entry
# ============================================================================
class ReverseMergeMixin(object):
    """A mixin that provides reversed merge functionality"""

    def _merge(self, iter_list):
        """Combines the supplied iterators into one iterator by merging
        them in reverse order

        :param iter_list: The list of iterators to be merged
        :return: An iterator that yields the results of the reverse merge
        """
        merged = merge(*iter_list, reverse=True)
        return merged
# ============================================================================
class AccessRulesAggregator(ReverseMergeMixin, SimpleAggregator):
    """A SimpleAggregator for access control rules whose sources are
    combined using the reverse merge of ReverseMergeMixin"""
# ============================================================================
class DirectoryAccessSource(ReverseMergeMixin, DirectoryIndexSource):
    """A directory index source for access control lists; files ending
    in '.aclj' are loaded as FileAccessIndexSource"""

    INDEX_SOURCES = [('.aclj', FileAccessIndexSource)]  # type: list[tuple]
# ============================================================================
class CacheDirectoryAccessSource(CacheDirectoryMixin, DirectoryAccessSource):
    """A DirectoryAccessSource combined with CacheDirectoryMixin"""
# ============================================================================
class AccessChecker(object):
    """Determines the access setting for cdx records by combining access
    control list (ACL) rules, an optional embargo and a default rule"""

    EXACT_SUFFIX = '###'  # type: str
    EXACT_SUFFIX_B = b'###'  # type: bytes

    # rules in the ACL file are followed by a white space (U+0020):
    # for searching we need a match suffix which sorts/compares after
    # (resp. before because we use the rev_cmp function). Simply add
    # another '#' (U+0023 > U+0020)
    EXACT_SUFFIX_SEARCH_B = b'####'  # type: bytes

    def __init__(self, access_source, default_access='allow', embargo=None):
        """Initialize a new AccessChecker

        :param str|list[str]|AccessRulesAggregator access_source: An access source
        :param str default_access: The default access action (allow)
        :param dict embargo: A dict specifying optional embargo setting
        """
        if isinstance(access_source, str):
            self.aggregator = self.create_access_aggregator([access_source])
        elif isinstance(access_source, list):
            self.aggregator = self.create_access_aggregator(access_source)
        else:
            # anything else is used as the aggregator itself
            self.aggregator = access_source

        # rule returned by find_access_rule() when no ACL rule matches
        self.default_rule = CDXObject()
        self.default_rule['urlkey'] = ''
        self.default_rule['timestamp'] = '-'
        self.default_rule['access'] = default_access
        self.default_rule['default'] = 'true'

        self.embargo = self.parse_embargo(embargo)

    @staticmethod
    def _to_relativedelta(value):
        """Builds a relativedelta from a relative embargo setting

        :param dict value: Any combination of 'years', 'months', 'weeks'
            and 'days'; missing units default to 0
        :return: The equivalent relativedelta
        :rtype: relativedelta
        """
        return relativedelta(years=value.get('years', 0),
                             months=value.get('months', 0),
                             weeks=value.get('weeks', 0),
                             days=value.get('days', 0))

    def parse_embargo(self, embargo):
        """Converts the raw embargo setting into the form used by
        check_embargo(): 'before' / 'after' timestamps become datetimes,
        'newer' / 'older' dicts become relativedeltas and all other
        entries (such as 'access') are kept as they are

        :param dict|None embargo: The embargo setting, if any
        :return: The parsed embargo setting or None if there is none
        :rtype: dict|None
        """
        if not embargo:
            return None

        # parse into a copy: converting in place would leave datetimes and
        # relativedeltas in the caller's config, which can not be parsed
        # again if the same config is used for another AccessChecker
        parsed = dict(embargo)

        for name in ('before', 'after'):
            value = parsed.get(name)
            if value:
                parsed[name] = timestamp_to_datetime(str(value))

        for name in ('older', 'newer'):
            value = parsed.get(name)
            if value:
                parsed[name] = self._to_relativedelta(value)

        return parsed

    def check_embargo(self, url, ts):
        """Checks the supplied capture timestamp against the embargo

        Only the first configured option is evaluated, in the order
        'before', 'after', 'newer', 'older'.

        :param str url: The URL of the capture (not used)
        :param str ts: The timestamp of the capture
        :return: The embargo access value ('exclude' unless the embargo
            sets 'access') if the capture is embargoed, otherwise None
        :rtype: str|None
        """
        if not self.embargo:
            return None

        dt = timestamp_to_datetime(ts)
        access = self.embargo.get('access', 'exclude')

        # embargo before
        before = self.embargo.get('before')
        if before:
            return access if dt < before else None

        # embargo after
        after = self.embargo.get('after')
        if after:
            return access if dt > after else None

        # embargo if newer than
        newer = self.embargo.get('newer')
        if newer:
            actual = datetime.utcnow() - newer
            return access if actual < dt else None

        # embargo if older than
        older = self.embargo.get('older')
        if older:
            actual = datetime.utcnow() - older
            return access if actual > dt else None

        return None

    def create_access_aggregator(self, source_files):
        """Creates a new AccessRulesAggregator using the supplied list
        of access control file names

        :param list[str] source_files: The list of access control file names
        :return: The created AccessRulesAggregator
        :rtype: AccessRulesAggregator
        """
        sources = {filename: self.create_access_source(filename)
                   for filename in source_files}
        return AccessRulesAggregator(sources)

    def create_access_source(self, filename):
        """Creates a new access source for the supplied filename.

        If the filename is for a directory an CacheDirectoryAccessSource
        instance is returned otherwise an FileAccessIndexSource instance

        :param str filename: The name of an file/directory
        :return: An instance of CacheDirectoryAccessSource or FileAccessIndexSource
            depending on if the supplied filename is for a directory or file
        :rtype: CacheDirectoryAccessSource|FileAccessIndexSource
        :raises Exception: Indicates an invalid access source was supplied
        """
        if os.path.isdir(filename):
            return CacheDirectoryAccessSource(filename)
        elif os.path.isfile(filename):
            return FileAccessIndexSource(filename)
        else:
            raise Exception('Invalid Access Source: ' + filename)

    def find_access_rule(self, url, ts=None, urlkey=None, collection=None, acl_user=None):
        """Attempts to find the access control rule for the
        supplied URL otherwise returns the default rule

        A rule whose 'user' equals acl_user is returned as soon as it is
        seen; a rule without a user is remembered and returned once the
        rules for its key are exhausted.

        :param str url: The URL for the rule to be found
        :param str|None ts: A timestamp (not used)
        :param str|None urlkey: The access control url key
        :param str|None collection: The collection, if any
        :param str|None acl_user: The access control user, if any
        :return: The access control rule for the supplied URL
            if one exists otherwise the default rule
        :rtype: CDXObject
        """
        params = {'url': url,
                  'urlkey': urlkey,
                  'nosource': 'true',
                  'exact_match_suffix': self.EXACT_SUFFIX_SEARCH_B
                  }

        if collection:
            params['param.coll'] = collection

        acl_iter, errs = self.aggregator(params)
        if errs:
            print(errs)

        # 'key' is read back from params after the aggregator call
        key = params['key']
        key_exact = key + self.EXACT_SUFFIX_B

        tld = key.split(b',')[0]

        last_obj = None
        last_key = None

        for acl in acl_iter:
            # skip empty/invalid lines
            if not acl:
                continue

            acl_key = acl.split(b' ')[0]

            # moved on to a different key: the remembered rule is the match
            if acl_key != last_key and last_obj:
                return last_obj

            # exact rule ('<key>###') or prefix rule for the searched key
            if key_exact == acl_key or key.startswith(acl_key):
                acl_obj = CDXObject(acl)
                user = acl_obj.get('user')
                if user == acl_user:
                    return acl_obj
                elif not user:
                    last_key = acl_key
                    last_obj = acl_obj

            # if acl key already less than first tld,
            # no match can be found
            if acl_key < tld:
                break

        return last_obj if last_obj else self.default_rule

    def __call__(self, res, acl_user):
        """Wraps the cdx iter in the supplied tuple returning a
        the wrapped cdx iter and the other members of the supplied
        tuple in same order

        :param tuple res: The result tuple
        :param str acl_user: The user associated with this request (optional)
        :return: An tuple
        """
        cdx_iter, errs = res
        return self.wrap_iter(cdx_iter, acl_user), errs

    def wrap_iter(self, cdx_iter, acl_user):
        """Wraps the supplied cdx iter and yields cdx objects
        that contain the access control results for the cdx object
        being yielded

        Objects without a url are passed through unchanged and objects
        whose resulting access is 'exclude' are dropped.

        :param cdx_iter: The cdx object iterator to be wrapped
        :param str acl_user: The user associated with this request (optional)
        :return: The wrapped cdx object iterator
        """
        # one-entry cache of the most recent rule lookup
        cached_key = None
        cached_rule = None

        for cdx in cdx_iter:
            url = cdx.get('url')
            timestamp = cdx.get('timestamp')

            # if no url, possible idx or other object, don't apply any checks and pass through
            if not url:
                yield cdx
                continue

            access = None
            if self.aggregator:
                # TODO: optimization until date range support is included
                urlkey = cdx.get('urlkey')
                coll = cdx.get('source-coll')

                # key on every input of find_access_rule() except the
                # (unused) timestamp, so a cached rule is never applied
                # to a different collection or urlkey
                lookup_key = (url, urlkey, coll, acl_user)
                if lookup_key != cached_key:
                    cached_rule = self.find_access_rule(url, timestamp,
                                                        urlkey, coll,
                                                        acl_user)
                    # store before the record can be dropped below, so
                    # runs of excluded captures reuse the lookup as well
                    cached_key = lookup_key

                access = cached_rule.get('access', 'exclude')

            # the embargo can only tighten access; it is skipped for
            # rules that override it and for already excluded captures
            if access != 'allow_ignore_embargo' and access != 'exclude':
                embargo_access = self.check_embargo(url, timestamp)
                if embargo_access and embargo_access != 'allow':
                    access = embargo_access

            if access == 'exclude':
                continue

            if not access:
                access = self.default_rule['access']

            if access == 'allow_ignore_embargo':
                access = 'allow'

            cdx['access'] = access
            yield cdx