From 662fc747bf4d5e011fb3c200c0adae8b9e3bf1c6 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Tue, 27 Apr 2021 04:58:56 +0200 Subject: [PATCH] Fix ACL loading for auto collections (#620) * Pass collection name to ACL checker to load ACL lists for automatic collections * Typo: file suffix must be `.aclj` --- docs/manual/access-control.rst | 4 ++-- pywb/warcserver/access_checker.py | 7 +++++-- pywb/warcserver/warcserver.py | 4 +++- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/docs/manual/access-control.rst b/docs/manual/access-control.rst index 3293742c..79e922a9 100644 --- a/docs/manual/access-control.rst +++ b/docs/manual/access-control.rst @@ -9,7 +9,7 @@ block or exclude access to individual urls by longest-prefix match. Access Control Files (.aclj) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Access controls are set in one or more access control json files (.aclj), sorted in reverse alphabetical order. +Access controls are set in one or more access control JSON files (.aclj), sorted in reverse alphabetical order. To determine the best match, a binary search is used (similar to CDXJ) lookup and then the best match is found forward. An .aclj file may look as follows:: @@ -61,7 +61,7 @@ The .aclj files need not ever be added or edited manually. The pywb ``wb-manager`` utility has been extended to provide tools for adding, removing and checking access control rules. -The access rules are written to ``<collection>/acl/access-rules.acl`` for a given collection ``<collection>`` for automatic collections. +The access rules are written to ``<collection>/acl/access-rules.aclj`` for a given collection ``<collection>`` for automatic collections. 
For example, to add the first line to an ACL file ``access.aclj``, one could run:: diff --git a/pywb/warcserver/access_checker.py b/pywb/warcserver/access_checker.py index 9cd2790e..dcbb07cb 100644 --- a/pywb/warcserver/access_checker.py +++ b/pywb/warcserver/access_checker.py @@ -134,7 +134,7 @@ class AccessChecker(object): else: raise Exception('Invalid Access Source: ' + filename) - def find_access_rule(self, url, ts=None, urlkey=None): + def find_access_rule(self, url, ts=None, urlkey=None, collection=None): """Attempts to find the access control rule for the supplied URL otherwise returns the default rule @@ -150,6 +150,8 @@ class AccessChecker(object): 'nosource': 'true', 'exact_match_suffix': self.EXACT_SUFFIX_B } + if collection: + params['param.coll'] = collection acl_iter, errs = self.aggregator(params) if errs: @@ -214,7 +216,8 @@ class AccessChecker(object): if url == last_url: rule = last_rule else: - rule = self.find_access_rule(url, cdx.get('timestamp'), cdx.get('urlkey')) + rule = self.find_access_rule(url, cdx.get('timestamp'), cdx.get('urlkey'), + cdx.get('source-coll')) access = rule.get('access', 'exclude') if access == 'exclude': diff --git a/pywb/warcserver/warcserver.py b/pywb/warcserver/warcserver.py index c162437a..cfa782b2 100644 --- a/pywb/warcserver/warcserver.py +++ b/pywb/warcserver/warcserver.py @@ -142,7 +142,9 @@ class WarcServer(BaseWarcServer): base_dir=self.index_paths, config=self.config) - access_checker = AccessChecker(CacheDirectoryAccessSource(self.acl_paths), + access_checker = AccessChecker(CacheDirectoryAccessSource(base_prefix=self.root_dir, + base_dir=self.acl_paths, + config=self.config), self.default_access) if self.dedup_index_url: