From a3f81dcc0fecbf4ff572f144e99a83a7955fe1aa Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 18 Feb 2018 12:04:15 -0800 Subject: [PATCH] access system work for ukwa/ukwa-pywb#7 - 'acl_paths' config can accept a list of files or directories, a file or a directory string - tests_acl: test collection with acl list, single file, dir --- pywb/utils/merge.py | 2 ++ pywb/warcserver/access_checker.py | 45 ++++++++++++++++++++++--------- pywb/warcserver/warcserver.py | 4 ++- sample_archive/access/list2.aclj | 1 + tests/config_test_access.yaml | 20 +++++++++++++- tests/test_acl.py | 24 +++++++++++++++-- 6 files changed, 79 insertions(+), 17 deletions(-) diff --git a/pywb/utils/merge.py b/pywb/utils/merge.py index 78c18d3f..eaafc9e7 100644 --- a/pywb/utils/merge.py +++ b/pywb/utils/merge.py @@ -35,8 +35,10 @@ else: #pragma: no cover the input streams is already sorted (smallest to largest). >>> list(merge([1,3,5,7], [0,2,4,8], [5,10,15,20], [], [25])) [0, 1, 2, 3, 4, 5, 5, 7, 8, 10, 15, 20, 25] + If *key* is not None, applies a key function to each element to determine its sort order. + >>> list(merge(['dog', 'horse'], ['cat', 'fish', 'kangaroo'], key=len)) ['dog', 'cat', 'fish', 'horse', 'kangaroo'] ''' diff --git a/pywb/warcserver/access_checker.py b/pywb/warcserver/access_checker.py index 932ccb6b..aaaf67b7 100644 --- a/pywb/warcserver/access_checker.py +++ b/pywb/warcserver/access_checker.py @@ -1,5 +1,6 @@ from pywb.warcserver.index.indexsource import FileIndexSource from pywb.warcserver.index.aggregator import DirectoryIndexSource, CacheDirectoryMixin +from pywb.warcserver.index.aggregator import SimpleAggregator from pywb.utils.binsearch import search from pywb.utils.merge import merge @@ -18,13 +19,21 @@ class FileAccessIndexSource(FileIndexSource): # ============================================================================ -class DirectoryAccessSource(DirectoryIndexSource): - INDEX_SOURCES = [('.aclj', FileAccessIndexSource)] - +class ReverseMergeMixin(object): def _merge(self, iter_list): return merge(*(iter_list), reverse=True) +# ============================================================================ +class AccessRulesAggregator(ReverseMergeMixin, SimpleAggregator): + pass + + +# ============================================================================ +class DirectoryAccessSource(ReverseMergeMixin, DirectoryIndexSource): + INDEX_SOURCES = [('.aclj', FileAccessIndexSource)] + + # ============================================================================ class CacheDirectoryAccessSource(CacheDirectoryMixin, DirectoryAccessSource): pass @@ -32,15 +41,25 @@ class CacheDirectoryAccessSource(CacheDirectoryMixin, DirectoryAccessSource): # ============================================================================ class AccessChecker(object): - def __init__(self, access_source_file, default_access='allow'): - if isinstance(access_source_file, str): - self.aggregator = self.create_access_aggregator(access_source_file) + def __init__(self, access_source, default_access='allow'): + if isinstance(access_source, str): + self.aggregator = self.create_access_aggregator([access_source]) + elif isinstance(access_source, list): + self.aggregator = self.create_access_aggregator(access_source) else: - self.aggregator = access_source_file + self.aggregator = access_source self.default_rule = {'urlkey': '', 'access': default_access} - def create_access_aggregator(self, filename): + def create_access_aggregator(self, source_files): + sources = {} + for filename in source_files: + sources[filename] = self.create_access_source(filename) + + aggregator = AccessRulesAggregator(sources) + return aggregator + + def create_access_source(self, filename): if os.path.isdir(filename): return CacheDirectoryAccessSource(filename) @@ -52,18 +71,18 @@ class AccessChecker(object): def find_access_rule(self, url, ts=None, urlkey=None): params = {'url': url, 'urlkey': urlkey} - cdx_iter, errs = self.aggregator(params) + acl_iter, errs = self.aggregator(params) if errs: print(errs) key = params['key'].decode('utf-8') - for cdx in cdx_iter: - if 'urlkey' not in cdx: + for acl in acl_iter: + if 'urlkey' not in acl: continue - if key.startswith(cdx['urlkey']): - return cdx + if key.startswith(acl['urlkey']): + return acl return self.default_rule diff --git a/pywb/warcserver/warcserver.py b/pywb/warcserver/warcserver.py index 3c346a53..d9dde162 100644 --- a/pywb/warcserver/warcserver.py +++ b/pywb/warcserver/warcserver.py @@ -178,9 +178,9 @@ class WarcServer(BaseWarcServer): else: raise Exception('collection config must be string or dict') + # INDEX CONFIG if index: agg = init_index_agg({name: index}) - else: if not isinstance(coll_config, dict): raise Exception('collection config missing') @@ -196,9 +196,11 @@ class WarcServer(BaseWarcServer): timeout = int(coll_config.get('timeout', 0)) agg = init_index_agg(index_group, True, timeout) + # ARCHIVE CONFIG if not archive_paths: archive_paths = self.config.get('archive_paths') + # ACCESS CONFIG access_checker = None if acl_paths: access_checker = AccessChecker(acl_paths, default_access) diff --git a/sample_archive/access/list2.aclj b/sample_archive/access/list2.aclj index 249aa7bb..516cb3ab 100644 --- a/sample_archive/access/list2.aclj +++ b/sample_archive/access/list2.aclj @@ -1,2 +1,3 @@ +org,httpbin)/ - {"access": "allow"} com,example)/ - {"access": "allow"} bo,example)/ - {"access": "exclude"} diff --git a/tests/config_test_access.yaml b/tests/config_test_access.yaml index ed6e6d04..49c4220c 100644 --- a/tests/config_test_access.yaml +++ b/tests/config_test_access.yaml @@ -4,6 +4,24 @@ collections: pywb: index_paths: ./sample_archive/cdx/ archive_paths: ./sample_archive/warcs/ - acl_paths: ./sample_archive/access/ + acl_paths: ./sample_archive/access/pywb.aclj + default_access: block + pywb-acl-list: + index_paths: ./sample_archive/cdx/ + archive_paths: ./sample_archive/warcs/ + acl_paths: + - ./sample_archive/access/pywb.aclj + - ./sample_archive/access/list2.aclj + + default_access: block + + pywb-acl-dir: + index_paths: ./sample_archive/cdx/ + archive_paths: ./sample_archive/warcs/ + acl_paths: ./sample_archive/access/ + + default_access: block + + diff --git a/tests/test_acl.py b/tests/test_acl.py index 09d2d630..7c87c6d3 100644 --- a/tests/test_acl.py +++ b/tests/test_acl.py @@ -12,9 +12,10 @@ class TestACLApp(BaseConfigTest): def setup_class(cls): super(TestACLApp, cls).setup_class('config_test_access.yaml') - def query(self, url, is_error=False, **params): + def query(self, url, coll='pywb'): + params = {} params['url'] = url - return self.testapp.get('/pywb/cdx?' + urlencode(params, doseq=1), expect_errors=is_error) + return self.testapp.get('/{coll}/cdx?'.format(coll=coll) + urlencode(params, doseq=1)) def test_excluded_url(self): resp = self.query('http://www.iana.org/') @@ -52,4 +53,23 @@ class TestACLApp(BaseConfigTest): assert 'Access Blocked' in resp.text + def test_allowed_different_coll_acl_list(self): + resp = self.query('http://httpbin.org/anything/resource.json', coll='pywb-acl-list') + + assert len(resp.text.splitlines()) > 0 + + resp = self.testapp.get('/pywb-acl-list/mp_/http://httpbin.org/anything/resource.json') + + assert '"http://httpbin.org/anything/resource.json"' in resp.text + + def test_allowed_different_coll_acl_dir(self): + resp = self.query('http://httpbin.org/anything/resource.json', coll='pywb-acl-dir') + + assert len(resp.text.splitlines()) > 0 + + resp = self.testapp.get('/pywb-acl-dir/mp_/http://httpbin.org/anything/resource.json') + + assert '"http://httpbin.org/anything/resource.json"' in resp.text + +