From f07d35709ad96cc08b536fcd4ff6410ab4a885d9 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 18 May 2021 20:09:18 -0700 Subject: [PATCH] Access Control Improvements: Embargo + ACL User Support (#642) * embargo: add support for per-collection date range embargo with embargo options of 'before', 'after', 'newer' and 'older' 'before' and 'after' accept a timestamp 'newer' and 'older' options configured with a dictionary consisting of any combo of 'years', 'months', 'days' add basic test for each embargo option * acl/embargo work: - support acl access value 'allow_ignore_embargo' for overriding embargo - support 'user' in acl setting, matched with value of 'X-Pywb-ACL-User' header - support passing through 'X-Pywb-ACL-User' setting to warcserver - aclmanager: support -u/--user param for adding, removing and matching rules - tests: add test for 'allow_ignore_embargo', user-specific acl rule matching * docs: add docs for new embargo system! * docs: add info on how to configure ACL header with short examples to usage page. 
sample-deploy: add examples of configuring X-pywb-ACL-user header based on IP for nginx and apache sample deployments * docs: fix access control page header, text tweaks * bump version to 2.6.0b0 --- docs/manual/access-control.rst | 147 ++++++++++++++++++++++++++++-- docs/manual/cdxserver_api.rst | 2 +- docs/manual/usage.rst | 44 +++++++++ pywb/apps/rewriterapp.py | 2 + pywb/manager/aclmanager.py | 23 +++-- pywb/rewrite/rewriteinputreq.py | 6 ++ pywb/version.py | 2 +- pywb/warcserver/access_checker.py | 140 +++++++++++++++++++++++++--- pywb/warcserver/handlers.py | 4 +- pywb/warcserver/warcserver.py | 6 +- requirements.txt | 1 + sample-deploy/pywb-apache.conf | 9 ++ sample-deploy/pywb-nginx.conf | 17 ++++ sample_archive/access/pywb.aclj | 5 + tests/config_test_access.yaml | 40 ++++++++ tests/test_acl.py | 7 ++ tests/test_acl_manager.py | 58 ++++++++++++ tests/test_embargo.py | 56 ++++++++++++ 18 files changed, 531 insertions(+), 38 deletions(-) create mode 100644 tests/test_embargo.py diff --git a/docs/manual/access-control.rst b/docs/manual/access-control.rst index 79e922a9..a9fd89b8 100644 --- a/docs/manual/access-control.rst +++ b/docs/manual/access-control.rst @@ -1,15 +1,87 @@ .. _access-control: -Access Control System ---------------------- +Embargo and Access Control +-------------------------- -The access controls system allows for a flexible configuration of rules to allow, -block or exclude access to individual urls by longest-prefix match. +The embargo system allows for date-based rules to block access to captures based on their capture dates. + +The access controls system provides additional URL-based rules to allow, block or exclude access to specific URL prefixes or exact URLs. + +The embargo and access control rules are configured per collection. + +Embargo Settings +================ + +The embargo system allows restricting access to all URLs within a collection based on the timestamp of each URL. 
+Access to these resources is 'embargoed' until the date range is adjusted or the time interval passes. + +The embargo can be used to disallow access to captures based on following criteria: +- Captures before an exact date +- Captures after an exact date +- Captures newer than a time interval +- Captures older than a time interval + +Embargo Before/After Exact Date +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +To block access to all captures before or after a specific date, use the ``before`` or ``after`` embargo blocks +with a specific timestamp. + +For example, the following blocks access to all URLs captured before 2020-12-26 in the collection ``embargo-before``:: + + embargo-before: + index_paths: ... + archive_paths: ... + embargo: + before: '20201226' + + +The following blocks access to all URLs captured on or after 2020-12-26 in collection ``embargo-after``:: + + embargo-after: + index_paths: ... + archive_paths: ... + embargo: + after: '20201226' + +Embargo By Time Interval +^^^^^^^^^^^^^^^^^^^^^^^^ + +The embargo can also be set for a relative time interval, consisting of years, months, weeks and/or days. + + +For example, the following blocks access to all URLs newer than 1 year:: + + embargo-newer: + ... + embargo: + newer: + years: 1 + + + +The following blocks access to all URLs older than 1 year, 2 months, 3 weeks and 4 days:: + + embargo-older: + ... + embargo: + older: + years: 1 + months: 2 + weeks: 3 + days: 4 + + +Any combination of years, months, weeks and days can be used (as long as at least one is provided) for the ``newer`` or ``older`` embargo settings. + + +Access Control Settings +======================= Access Control Files (.aclj) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Access controls are set in one or more access control JSON files (.aclj), sorted in reverse alphabetical order. +URL-based access controls are set in one or more access control JSON files (.aclj), sorted in reverse alphabetical order. 
To determine the best match, a binary search is used (similar to CDXJ) lookup and then the best match is found forward. An .aclj file may look as follows:: @@ -22,6 +94,8 @@ An .aclj file may look as follows:: Each JSON entry contains an ``access`` field and the original ``url`` field that was used to convert to the SURT (if any). +The JSON entry may also contain a ``user`` field, as explained below. + The prefix consists of a SURT key and a ``-`` (currently reserved for a timestamp/date range field to be added later) Given these rules, a user would: @@ -30,19 +104,55 @@ Given these rules, a user would: * would receive a 404 not found error when viewing ``http://httpbin.org/anything`` (exclude) -Access Types: allow, block, exclude -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Access Types: allow, block, exclude, allow_ignore_embargo +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The available access types are as follows: - ``exclude`` - when matched, results are excluded from the index, as if they do not exist. User will receive a 404. - ``block`` - when matched, results are not excluded from the index, marked with ``access: block``, but access to the actual is blocked. User will see a 451 -- ``allow`` - full access to the index and the resource. +- ``allow`` - full access to the index and the resource, but may be overridden by embargo +- ``allow_ignore_embargo`` - full access to the index and resource, overriding any embargo settings The difference between ``exclude`` and ``block`` is that when blocked, the user can be notified that access is blocked, while with exclude, no trace of the resource is presented to the user. -The use of ``allow`` is useful to provide access to more specific resources within a broader block/exclude rule. +The use of ``allow`` is useful to provide access to more specific resources within a broader block/exclude rule, while ``allow_ignore_embargo`` +can be used to override any embargo settings. 
+ +If both are present, the embargo restrictions are checked first and take precedence, unless the ``allow_ignore_embargo`` option is used +to override the embargo. + + +User-Based Access Controls +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The access control rules can further be customized by specifying different permissions for different 'users'. Since pywb does not have a user system, +a special header, ``X-Pywb-ACL-User`` can be used to indicate a specific user. + +This setting is designed to allow a more privileged user to access additional settings or override an embargo. + +For example, the following access control settings restrict access to ``https://example.com/restricted/`` by default, but allow access for the ``staff`` user:: + + com,example)/restricted - {"access": "allow", "user": "staff"} + com,example)/restricted - {"access": "block"} + + +Combined with the embargo settings, this can also be used to override the embargo for internal organizational users, while keeping the embargo for general access:: + + com,example)/restricted - {"access": "allow_ignore_embargo", "user": "staff"} + com,example)/restricted - {"access": "allow"} + +To make this work, pywb must be running behind an Apache or Nginx system that is configured to set ``X-Pywb-ACL-User: staff`` based on certain settings. + +For example, this header may be set based on IP range, or based on password authentication. + +Further examples of how to set this header will be provided in the deployments section. + +**Note: Do not use the user-based rules without configuring proper authentication on an Apache or Nginx frontend to set or remove this header, otherwise the 'X-Pywb-ACL-User' can easily be faked.** + +See the :ref:`config-acl-header` section in Usage for examples on how to configure this header. + Access Error Messages ^^^^^^^^^^^^^^^^^^^^^ @@ -73,6 +183,11 @@ The URL supplied can be a URL or a SURT prefix. 
If a SURT is supplied, it is use wb-manager acl add com, allow +A specific user for user-based rules can also be specified, for example to add ``allow_ignore_embargo`` for user ``staff`` only, run:: + + wb-manager acl add http://httpbin.org/anything/something allow_ignore_embargo -u staff + + By default, access control rules apply to a prefix of a given URL or SURT. To have the rule apply only to the exact match, use:: @@ -136,6 +251,20 @@ set merge-sorted to find the best match (very similar to the CDXJ index lookup). Note: It might make sense to separate ``allows.aclj`` and ``blocks.aclj`` into individual files for organizational reasons, but there is no specific need to keep more than one access control files. +Finally, ACLJ and embargo settings combined for the same collection might look as follows:: + + collections: + test: + ... + embargo: + newer: + days: 366 + + acl_paths: + - ./path/to/allows.aclj + - ./path/to/blocks.aclj + + Default Access ^^^^^^^^^^^^^^ diff --git a/docs/manual/cdxserver_api.rst b/docs/manual/cdxserver_api.rst index 54ea7332..66b5108e 100644 --- a/docs/manual/cdxserver_api.rst +++ b/docs/manual/cdxserver_api.rst @@ -182,7 +182,7 @@ the following modifiers: ``fields`` -^^^^^^ +^^^^^^^^^^ The ``fields`` param can be used to specify which fields to include in the output. The standard available fields are usually: ``urlkey``, diff --git a/docs/manual/usage.rst b/docs/manual/usage.rst index 7ee73df2..d322ad20 100644 --- a/docs/manual/usage.rst +++ b/docs/manual/usage.rst @@ -293,6 +293,50 @@ Then, in your config, simply include: The configuration assumes uwsgi is started with ``uwsgi uwsgi.ini`` +.. _config-acl-header: + +Configuring Access Control Header +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The :ref:`access-control` system allows users to be granted different access settings based on the value of an ACL header, ``X-Pywb-ACL-User``. 
+ +The header can be set via Nginx or Apache to grant custom access privileges based on IP address, password, or other combination of rules. + +For example, to set the value of the header to ``staff`` if the IP of the request is from designated local IP ranges (127.0.0.1, 192.168.1.0/24), the following settings can be added to the configs: + +For Nginx:: + + geo $acl_user { + # ensure user is set to empty by default + default ""; + + # optional: add IP ranges to allow privileged access + 127.0.0.1 "staff"; + 192.168.0.0/24 "staff"; + } + + ... + location /wayback/ { + ... + uwsgi_param HTTP_X_PYWB_ACL_USER $acl_user; + } + + +For Apache:: + + + RequestHeader set X-Pywb-ACL-User staff + + # ensure header is cleared if no match + + RequestHeader set X-Pywb-ACL-User "" + + +} + + + + Running on Subdirectory Path ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/pywb/apps/rewriterapp.py b/pywb/apps/rewriterapp.py index 66b45607..9df559cd 100644 --- a/pywb/apps/rewriterapp.py +++ b/pywb/apps/rewriterapp.py @@ -704,6 +704,8 @@ class RewriterApp(object): headers = {'Content-Length': str(len(req_data)), 'Content-Type': 'application/request'} + headers.update(inputreq.warcserver_headers) + if skip_record: headers['Recorder-Skip'] = '1' diff --git a/pywb/manager/aclmanager.py b/pywb/manager/aclmanager.py index d4ce218b..3305b912 100644 --- a/pywb/manager/aclmanager.py +++ b/pywb/manager/aclmanager.py @@ -12,7 +12,7 @@ from pywb.warcserver.index.cdxobject import CDXObject class ACLManager(CollectionsManager): SURT_RX = re.compile('([^:.]+[,)])+') - VALID_ACCESS = ('allow', 'block', 'exclude') + VALID_ACCESS = ('allow', 'block', 'exclude', 'allow_ignore_embargo') DEFAULT_FILE = 'access-rules.aclj' @@ -167,9 +167,9 @@ class ACLManager(CollectionsManager): :param argparse.Namespace r: The argparse namespace representing the rule to be added :rtype: None """ - return self._add_rule(r.url, r.access, r.exact_match) + return self._add_rule(r.url, r.access, r.exact_match, r.user) - def 
_add_rule(self, url, access, exact_match=False): + def _add_rule(self, url, access, exact_match=False, user=None): """Adds an rule to the acl file :param str url: The URL for the rule @@ -185,12 +185,14 @@ class ACLManager(CollectionsManager): acl['timestamp'] = '-' acl['access'] = access acl['url'] = url + if user: + acl['user'] = user i = 0 replace = False for rule in self.rules: - if acl['urlkey'] == rule['urlkey'] and acl['timestamp'] == rule['timestamp']: + if acl['urlkey'] == rule['urlkey'] and acl['timestamp'] == rule['timestamp'] and acl.get('user') == rule.get('user'): replace = True break @@ -255,7 +257,7 @@ class ACLManager(CollectionsManager): i = 0 urlkey = self.to_key(r.url, r.exact_match) for rule in self.rules: - if urlkey == rule['urlkey']: + if urlkey == rule['urlkey'] and r.user == rule.get('user'): acl = self.rules.pop(i) print('Removed Rule:') self.print_rule(acl) @@ -285,7 +287,7 @@ class ACLManager(CollectionsManager): :rtype: None """ access_checker = AccessChecker(self.acl_file, '') - rule = access_checker.find_access_rule(r.url) + rule = access_checker.find_access_rule(r.url, acl_user=r.user) print('Matched rule:') print('') @@ -344,15 +346,18 @@ class ACLManager(CollectionsManager): else: op.add_argument(arg) + if kwargs.get('user_opt'): + op.add_argument('-u', '--user') + if kwargs.get('exact_opt'): op.add_argument('-e', '--exact-match', action='store_true', default=False) op.set_defaults(acl_func=kwargs['func']) - command('add', 'coll_name', 'url', 'access', func=cls.add_rule, exact_opt=True) - command('remove', 'coll_name', 'url', func=cls.remove_rule, exact_opt=True) + command('add', 'coll_name', 'url', 'access', func=cls.add_rule, exact_opt=True, user_opt=True) + command('remove', 'coll_name', 'url', func=cls.remove_rule, exact_opt=True, user_opt=True) command('list', 'coll_name', func=cls.list_rules) command('validate', 'coll_name', func=cls.validate_save) - command('match', 'coll_name', 'url', 'default_access', func=cls.find_match) 
+ command('match', 'coll_name', 'url', 'default_access', func=cls.find_match, user_opt=True) command('importtxt', 'coll_name', 'filename', 'access', func=cls.add_excludes) diff --git a/pywb/rewrite/rewriteinputreq.py b/pywb/rewrite/rewriteinputreq.py index 11d12d92..6eab1ce0 100644 --- a/pywb/rewrite/rewriteinputreq.py +++ b/pywb/rewrite/rewriteinputreq.py @@ -26,6 +26,7 @@ class RewriteInputRequest(DirectWSGIInputRequest): self.url = url self.rewriter = rewriter self.extra_cookie = None + self.warcserver_headers = {} is_proxy = ('wsgiprox.proxy_host' in env) @@ -82,6 +83,11 @@ class RewriteInputRequest(DirectWSGIInputRequest): elif name in ('HTTP_IF_MODIFIED_SINCE', 'HTTP_IF_UNMODIFIED_SINCE'): continue + elif name == 'HTTP_X_PYWB_ACL_USER': + name = name[5:].title().replace('_', '-') + self.warcserver_headers[name] = value + continue + elif name == 'HTTP_X_FORWARDED_PROTO': name = 'X-Forwarded-Proto' if self.splits: diff --git a/pywb/version.py b/pywb/version.py index ff40f4b4..844b041d 100644 --- a/pywb/version.py +++ b/pywb/version.py @@ -1,4 +1,4 @@ -__version__ = '2.6.0.dev0' +__version__ = '2.6.0b0' if __name__ == '__main__': print(__version__) diff --git a/pywb/warcserver/access_checker.py b/pywb/warcserver/access_checker.py index bcf2a4e0..46cd7acd 100644 --- a/pywb/warcserver/access_checker.py +++ b/pywb/warcserver/access_checker.py @@ -6,6 +6,9 @@ from pywb.warcserver.index.cdxobject import CDXObject from pywb.utils.binsearch import search from pywb.utils.merge import merge +from warcio.timeutils import timestamp_to_datetime +from datetime import datetime, timedelta +from dateutil.relativedelta import relativedelta import os @@ -84,11 +87,12 @@ class AccessChecker(object): # another '#' (U+0023 > U+0020) EXACT_SUFFIX_SEARCH_B = b'####' # type: bytes - def __init__(self, access_source, default_access='allow'): + def __init__(self, access_source, default_access='allow', embargo=None): """Initialize a new AccessChecker :param 
str|list[str]|AccessRulesAggregator access_source: An access source :param str default_access: The default access action (allow) + :param dict embargo: A dict specifying optional embargo setting """ if isinstance(access_source, str): self.aggregator = self.create_access_aggregator([access_source]) @@ -103,6 +107,72 @@ class AccessChecker(object): self.default_rule['access'] = default_access self.default_rule['default'] = 'true' + self.embargo = self.parse_embargo(embargo) + + def parse_embargo(self, embargo): + if not embargo: + return None + + value = embargo.get('before') + if value: + embargo['before'] = timestamp_to_datetime(str(value)) + + value = embargo.get('after') + if value: + embargo['after'] = timestamp_to_datetime(str(value)) + + value = embargo.get('older') + if value: + delta = relativedelta( + years=value.get('years', 0), + months=value.get('months', 0), + weeks=value.get('weeks', 0), + days=value.get('days', 0)) + + embargo['older'] = delta + + value = embargo.get('newer') + if value: + delta = relativedelta( + years=value.get('years', 0), + months=value.get('months', 0), + weeks=value.get('weeks', 0), + days=value.get('days', 0)) + + embargo['newer'] = delta + + return embargo + + def check_embargo(self, url, ts): + if not self.embargo: + return None + + dt = timestamp_to_datetime(ts) + access = self.embargo.get('access', 'exclude') + + # embargo before + before = self.embargo.get('before') + if before: + print(dt, before) + return access if dt < before else None + + # embargo after + after = self.embargo.get('after') + if after: + return access if dt > after else None + + # embargo if newser than + newer = self.embargo.get('newer') + if newer: + actual = datetime.utcnow() - newer + return access if actual < dt else None + + # embargo if older than + older = self.embargo.get('older') + if older: + actual = datetime.utcnow() - older + return access if actual > dt else None + def create_access_aggregator(self, source_files): """Creates a new 
AccessRulesAggregator using the supplied list of access control file names @@ -139,13 +209,15 @@ class AccessChecker(object): else: raise Exception('Invalid Access Source: ' + filename) - def find_access_rule(self, url, ts=None, urlkey=None, collection=None): + def find_access_rule(self, url, ts=None, urlkey=None, collection=None, acl_user=None): """Attempts to find the access control rule for the supplied URL otherwise returns the default rule :param str url: The URL for the rule to be found :param str|None ts: A timestamp (not used) :param str|None urlkey: The access control url key + :param str|None collection: The collection, if any + :param str|None acl_user: The access control user, if any :return: The access control rule for the supplied URL if one exists otherwise the default rule :rtype: CDXObject @@ -167,6 +239,9 @@ class AccessChecker(object): tld = key.split(b',')[0] + last_obj = None + last_key = None + for acl in acl_iter: # skip empty/invalid lines @@ -174,62 +249,97 @@ class AccessChecker(object): continue acl_key = acl.split(b' ')[0] + acl_obj = None + + if acl_key != last_key and last_obj: + return last_obj if key_exact == acl_key: - return CDXObject(acl) + acl_obj = CDXObject(acl) if key.startswith(acl_key): - return CDXObject(acl) + acl_obj = CDXObject(acl) + + if acl_obj: + user = acl_obj.get('user') + if user == acl_user: + return acl_obj + elif not user: + last_key = acl_key + last_obj = acl_obj # if acl key already less than first tld, # no match can be found if acl_key < tld: break - return self.default_rule + return last_obj if last_obj else self.default_rule - def __call__(self, res): + def __call__(self, res, acl_user): """Wraps the cdx iter in the supplied tuple returning a the wrapped cdx iter and the other members of the supplied tuple in same order :param tuple res: The result tuple + :param str acl_user: The user associated with this request (optional) :return: An tuple """ cdx_iter, errs = res - return self.wrap_iter(cdx_iter), 
errs + return self.wrap_iter(cdx_iter, acl_user), errs - def wrap_iter(self, cdx_iter): + def wrap_iter(self, cdx_iter, acl_user): """Wraps the supplied cdx iter and yields cdx objects that contain the access control results for the cdx object being yielded :param cdx_iter: The cdx object iterator to be wrapped + :param str acl_user: The user associated with this request (optional) :return: The wrapped cdx object iterator """ last_rule = None last_url = None + last_user = None + rule = None for cdx in cdx_iter: url = cdx.get('url') + timestamp = cdx.get('timestamp') + # if no url, possible idx or other object, don't apply any checks and pass through if not url: yield cdx continue - # TODO: optimization until date range support is included - if url == last_url: - rule = last_rule - else: - rule = self.find_access_rule(url, cdx.get('timestamp'), cdx.get('urlkey'), - cdx.get('source-coll')) + access = None + if self.aggregator: + # TODO: optimization until date range support is included + if url == last_url and acl_user == last_user: + rule = last_rule + else: + rule = self.find_access_rule(url, timestamp, + cdx.get('urlkey'), + cdx.get('source-coll'), + acl_user) + + access = rule.get('access', 'exclude') + + if access != 'allow_ignore_embargo' and access != 'exclude': + embargo_access = self.check_embargo(url, timestamp) + if embargo_access and embargo_access != 'allow': + access = embargo_access - access = rule.get('access', 'exclude') if access == 'exclude': continue + if not access: + access = self.default_rule['access'] + + if access == 'allow_ignore_embargo': + access = 'allow' + cdx['access'] = access yield cdx last_rule = rule last_url = url + last_user = acl_user diff --git a/pywb/warcserver/handlers.py b/pywb/warcserver/handlers.py index 70a2ffc3..0abd5466 100644 --- a/pywb/warcserver/handlers.py +++ b/pywb/warcserver/handlers.py @@ -66,8 +66,10 @@ class IndexHandler(object): cdx_iter = self.fuzzy(self.index_source, params) + acl_user = 
params['_input_req'].env.get("HTTP_X_PYWB_ACL_USER") + if self.access_checker: - cdx_iter = self.access_checker(cdx_iter) + cdx_iter = self.access_checker(cdx_iter, acl_user) return cdx_iter diff --git a/pywb/warcserver/warcserver.py b/pywb/warcserver/warcserver.py index a6728671..e4abc7fe 100644 --- a/pywb/warcserver/warcserver.py +++ b/pywb/warcserver/warcserver.py @@ -210,6 +210,7 @@ class WarcServer(BaseWarcServer): archive_paths = None acl_paths = None default_access = self.default_access + embargo = None elif isinstance(coll_config, dict): index = coll_config.get('index') if not index: @@ -217,6 +218,7 @@ class WarcServer(BaseWarcServer): archive_paths = coll_config.get('archive_paths') acl_paths = coll_config.get('acl_paths') default_access = coll_config.get('default_access', self.default_access) + embargo = coll_config.get('embargo') else: raise Exception('collection config must be string or dict') @@ -245,8 +247,8 @@ class WarcServer(BaseWarcServer): # ACCESS CONFIG access_checker = None - if acl_paths: - access_checker = AccessChecker(acl_paths, default_access) + if acl_paths or embargo: + access_checker = AccessChecker(acl_paths, default_access, embargo) return DefaultResourceHandler(agg, archive_paths, rules_file=self.rules_file, diff --git a/requirements.txt b/requirements.txt index 0c3e0af2..7f3d3401 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,3 +15,4 @@ wsgiprox>=1.5.1 fakeredis<1.0 tldextract babel +python-dateutil diff --git a/sample-deploy/pywb-apache.conf b/sample-deploy/pywb-apache.conf index 1fdfab51..96c5ff5b 100644 --- a/sample-deploy/pywb-apache.conf +++ b/sample-deploy/pywb-apache.conf @@ -14,4 +14,13 @@ # required: proxy pass to pywb ProxyPass /wayback uwsgi://pywb:8081/ + # optional: set custom header based on IP ranges + + RequestHeader set X-Pywb-ACL-User staff + + # ensure header is cleared if no match + + RequestHeader set X-Pywb-ACL-User "" + + diff --git a/sample-deploy/pywb-nginx.conf b/sample-deploy/pywb-nginx.conf 
index dd22ea69..23c55337 100644 --- a/sample-deploy/pywb-nginx.conf +++ b/sample-deploy/pywb-nginx.conf @@ -1,5 +1,18 @@ # nginx config for running under /wayback/ prefix + +# set acl_user, defaulting to empty (any public user) +geo $acl_user { + # ensure user is set to empty by default + default ""; + + # optional: add IP ranges to allow privileged access + 127.0.0.1 "staff"; + 192.168.0.0/24 "staff"; +} + + + server { listen 80; @@ -14,8 +27,12 @@ server { uwsgi_pass pywb:8081; + include uwsgi_params; uwsgi_param UWSGI_SCHEME $scheme; + + # pass acl_user (which should be empty by default) + uwsgi_param HTTP_X_PYWB_ACL_USER $acl_user; } } diff --git a/sample_archive/access/pywb.aclj b/sample_archive/access/pywb.aclj index 84b7e417..44382df3 100644 --- a/sample_archive/access/pywb.aclj +++ b/sample_archive/access/pywb.aclj @@ -1,7 +1,12 @@ org,iana)/exact/match/first/line/aclj### - {"access": "allow", "url": "https://www.iana.org/exact/match/first/line/aclj/"} org,iana)/about - {"access": "block"} +org,iana)/about - {"access": "allow", "user": "staff"} org,iana)/_css/2013.1/fonts/opensans-semibold.ttf - {"access": "allow"} org,iana)/_css - {"access": "exclude"} org,iana)/### - {"access": "allow"} org,iana)/ - {"access": "exclude"} org,example)/?example=1 - {"access": "block"} +com,example)/?example=2 - {"access": "allow_ignore_embargo"} +com,example)/?example=1 - {"access": "allow_ignore_embargo", "user": "staff2"} +com,example)/?example=1 - {"access": "allow", "user": "staff"} +com,example)/ - {"access": "allow"} diff --git a/tests/config_test_access.yaml b/tests/config_test_access.yaml index 49c4220c..8fb352f7 100644 --- a/tests/config_test_access.yaml +++ b/tests/config_test_access.yaml @@ -24,4 +24,44 @@ collections: default_access: block + pywb-embargo-before: + index_paths: ./sample_archive/cdx/ + archive_paths: ./sample_archive/warcs/ + embargo: + before: '2014012700' + + pywb-embargo-after: + index_paths: ./sample_archive/cdx/ + archive_paths: 
./sample_archive/warcs/ + embargo: + after: '2014012700' + + pywb-embargo-older: + index_paths: ./sample_archive/cdx/ + archive_paths: ./sample_archive/warcs/ + embargo: + older: + years: 1 + months: 6 + + pywb-embargo-newer: + index_paths: ./sample_archive/cdx/ + archive_paths: ./sample_archive/warcs/ + embargo: + newer: + years: 1 + months: 6 + + pywb-embargo-acl: + index_paths: ./sample_archive/cdx/ + archive_paths: ./sample_archive/warcs/ + embargo: + older: + years: 1 + + acl_paths: + - ./sample_archive/access/pywb.aclj + + + diff --git a/tests/test_acl.py b/tests/test_acl.py index 2554d2e5..5ed532d1 100644 --- a/tests/test_acl.py +++ b/tests/test_acl.py @@ -40,6 +40,13 @@ class TestACLApp(BaseConfigTest): assert 'Access Blocked' in resp.text + def test_allow_via_acl_header(self): + resp = self.query('http://www.iana.org/about/') + + assert len(resp.text.splitlines()) == 1 + + resp = self.testapp.get('/pywb/mp_/http://www.iana.org/about/', headers={"X-Pywb-Acl-User": "staff"}, status=200) + def test_allowed_more_specific(self): resp = self.query('http://www.iana.org/_css/2013.1/fonts/opensans-semibold.ttf') diff --git a/tests/test_acl_manager.py b/tests/test_acl_manager.py index 16f2239d..4e732be1 100644 --- a/tests/test_acl_manager.py +++ b/tests/test_acl_manager.py @@ -40,6 +40,16 @@ com,example)/ - {"access": "allow", "url": "http://example.com/"} assert fh.read() == """\ com,example, - {"access": "exclude", "url": "com,example,"} com,example)/ - {"access": "allow", "url": "http://example.com/"} +""" + + def test_acl_add_with_user(self): + wb_manager(['acl', 'add', self.acl_filename, 'http://example.com/', 'block', '-u', 'public']) + + with open(self.acl_filename, 'rt') as fh: + assert fh.read() == """\ +com,example, - {"access": "exclude", "url": "com,example,"} +com,example)/ - {"access": "block", "url": "http://example.com/", "user": "public"} +com,example)/ - {"access": "allow", "url": "http://example.com/"} """ def test_acl_list(self, capsys): @@ -51,6 
+61,7 @@ com,example)/ - {"access": "allow", "url": "http://example.com/"} Rules for %s from %s: com,example, - {"access": "exclude", "url": "com,example,"} +com,example)/ - {"access": "block", "url": "http://example.com/", "user": "public"} com,example)/ - {"access": "allow", "url": "http://example.com/"} """ % (self.acl_filename, self.acl_filename) @@ -71,6 +82,42 @@ Matched rule: com,example, - {"access": "exclude", "url": "com,example,"} +""" + + def test_acl_match_user(self, capsys): + wb_manager(['acl', 'match', self.acl_filename, 'http://example.com/foo', '-u', 'public']) + + out, err = capsys.readouterr() + + assert out == """\ +Matched rule: + + com,example)/ - {"access": "block", "url": "http://example.com/", "user": "public"} + +""" + + def test_acl_match_unknown_user(self, capsys): + wb_manager(['acl', 'match', self.acl_filename, 'http://example.com/foo', '-u', 'data']) + + out, err = capsys.readouterr() + + assert out == """\ +Matched rule: + + com,example)/ - {"access": "allow", "url": "http://example.com/"} + +""" + + def test_acl_match_default_user(self, capsys): + wb_manager(['acl', 'match', self.acl_filename, 'http://example.com/foo']) + + out, err = capsys.readouterr() + + assert out == """\ +Matched rule: + + com,example)/ - {"access": "allow", "url": "http://example.com/"} + """ def test_remove_acl(self): @@ -78,9 +125,20 @@ Matched rule: with open(self.acl_filename, 'rt') as fh: assert fh.read() == """\ +com,example)/ - {"access": "block", "url": "http://example.com/", "user": "public"} com,example)/ - {"access": "allow", "url": "http://example.com/"} """ + def test_remove_acl_user(self): + wb_manager(['acl', 'remove', self.acl_filename, 'com,example)/', '-u', 'public']) + + with open(self.acl_filename, 'rt') as fh: + assert fh.read() == """\ +com,example)/ - {"access": "allow", "url": "http://example.com/"} +""" + + + def test_acl_add_exact(self): wb_manager(['acl', 'add', '--exact-match', self.acl_filename, 'example.com', 'block']) diff 
--git a/tests/test_embargo.py b/tests/test_embargo.py new file mode 100644 index 00000000..4c1ab21e --- /dev/null +++ b/tests/test_embargo.py @@ -0,0 +1,56 @@ +from .base_config_test import BaseConfigTest, fmod + +import webtest +import os + +from six.moves.urllib.parse import urlencode + + +# ============================================================================ +class TestEmbargoApp(BaseConfigTest): + @classmethod + def setup_class(cls): + super(TestEmbargoApp, cls).setup_class('config_test_access.yaml') + + def test_embargo_before(self): + resp = self.testapp.get('/pywb-embargo-before/20140126201054mp_/http://www.iana.org/domains/reserved', status=404) + + resp = self.testapp.get('/pywb-embargo-before/20140127mp_/http://example.com/', status=200) + assert resp.headers['Content-Location'] == 'http://localhost:80/pywb-embargo-before/20140127171251mp_/http://example.com' + + def test_embargo_after(self): + resp = self.testapp.get('/pywb-embargo-after/20140126201054mp_/http://www.iana.org/domains/reserved', status=200) + + resp = self.testapp.get('/pywb-embargo-after/20140127mp_/http://example.com/', status=200) + assert resp.headers['Content-Location'] == 'http://localhost:80/pywb-embargo-after/20130729195151mp_/http://test@example.com/' + + def test_embargo_older(self): + resp = self.testapp.get('/pywb-embargo-older/20140126201054mp_/http://www.iana.org/domains/reserved', status=404) + + resp = self.testapp.get('/pywb-embargo-older/20140127mp_/http://example.com/', status=404) + + def test_embargo_newer(self): + resp = self.testapp.get('/pywb-embargo-newer/20140126201054mp_/http://www.iana.org/domains/reserved', status=200) + + resp = self.testapp.get('/pywb-embargo-newer/20140127mp_/http://example.com/', status=200) + assert resp.headers['Content-Location'] == 'http://localhost:80/pywb-embargo-newer/20140127171251mp_/http://example.com' + + def test_embargo_ignore_acl(self): + # embargoed + resp = 
self.testapp.get('/pywb-embargo-acl/20140126201054mp_/http://example.com/', status=404) + + # ignore embargo + resp = self.testapp.get('/pywb-embargo-acl/20140126201054mp_/http://example.com/?example=2', status=200) + + + def test_embargo_ignore_acl_with_header_only(self): + # ignore embargo with custom header only + headers = {"X-Pywb-ACL-User": "staff2"} + resp = self.testapp.get('/pywb-embargo-acl/20140126201054mp_/http://example.com/?example=1', status=200, headers=headers) + + resp = self.testapp.get('/pywb-embargo-acl/20140126201054mp_/http://example.com/?example=1', status=404) + + + + +