1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

access system work for ukwa/ukwa-pywb#7

- 'acl_paths' config can accept a single file or directory path (as a string), or a list of files and/or directories
- tests_acl: test collections configured with an ACL list, a single ACL file, and an ACL directory
This commit is contained in:
Ilya Kreymer 2018-02-18 12:04:15 -08:00 committed by John Berlin
parent 77eefcdce6
commit a3f81dcc0f
No known key found for this signature in database
GPG Key ID: 6EF5E4B442011B02
6 changed files with 79 additions and 17 deletions

View File

@ -35,8 +35,10 @@ else: #pragma: no cover
the input streams is already sorted (smallest to largest). the input streams is already sorted (smallest to largest).
>>> list(merge([1,3,5,7], [0,2,4,8], [5,10,15,20], [], [25])) >>> list(merge([1,3,5,7], [0,2,4,8], [5,10,15,20], [], [25]))
[0, 1, 2, 3, 4, 5, 5, 7, 8, 10, 15, 20, 25] [0, 1, 2, 3, 4, 5, 5, 7, 8, 10, 15, 20, 25]
If *key* is not None, applies a key function to each element to determine If *key* is not None, applies a key function to each element to determine
its sort order. its sort order.
>>> list(merge(['dog', 'horse'], ['cat', 'fish', 'kangaroo'], key=len)) >>> list(merge(['dog', 'horse'], ['cat', 'fish', 'kangaroo'], key=len))
['dog', 'cat', 'fish', 'horse', 'kangaroo'] ['dog', 'cat', 'fish', 'horse', 'kangaroo']
''' '''

View File

@ -1,5 +1,6 @@
from pywb.warcserver.index.indexsource import FileIndexSource from pywb.warcserver.index.indexsource import FileIndexSource
from pywb.warcserver.index.aggregator import DirectoryIndexSource, CacheDirectoryMixin from pywb.warcserver.index.aggregator import DirectoryIndexSource, CacheDirectoryMixin
from pywb.warcserver.index.aggregator import SimpleAggregator
from pywb.utils.binsearch import search from pywb.utils.binsearch import search
from pywb.utils.merge import merge from pywb.utils.merge import merge
@ -18,13 +19,21 @@ class FileAccessIndexSource(FileIndexSource):
# ============================================================================ # ============================================================================
class DirectoryAccessSource(DirectoryIndexSource): class ReverseMergeMixin(object):
INDEX_SOURCES = [('.aclj', FileAccessIndexSource)]
def _merge(self, iter_list): def _merge(self, iter_list):
return merge(*(iter_list), reverse=True) return merge(*(iter_list), reverse=True)
# ============================================================================
class AccessRulesAggregator(ReverseMergeMixin, SimpleAggregator):
pass
# ============================================================================
class DirectoryAccessSource(ReverseMergeMixin, DirectoryIndexSource):
INDEX_SOURCES = [('.aclj', FileAccessIndexSource)]
# ============================================================================ # ============================================================================
class CacheDirectoryAccessSource(CacheDirectoryMixin, DirectoryAccessSource): class CacheDirectoryAccessSource(CacheDirectoryMixin, DirectoryAccessSource):
pass pass
@ -32,15 +41,25 @@ class CacheDirectoryAccessSource(CacheDirectoryMixin, DirectoryAccessSource):
# ============================================================================ # ============================================================================
class AccessChecker(object): class AccessChecker(object):
def __init__(self, access_source_file, default_access='allow'): def __init__(self, access_source, default_access='allow'):
if isinstance(access_source_file, str): if isinstance(access_source, str):
self.aggregator = self.create_access_aggregator(access_source_file) self.aggregator = self.create_access_aggregator([access_source])
elif isinstance(access_source, list):
self.aggregator = self.create_access_aggregator(access_source)
else: else:
self.aggregator = access_source_file self.aggregator = access_source
self.default_rule = {'urlkey': '', 'access': default_access} self.default_rule = {'urlkey': '', 'access': default_access}
def create_access_aggregator(self, filename): def create_access_aggregator(self, source_files):
sources = {}
for filename in source_files:
sources[filename] = self.create_access_source(filename)
aggregator = AccessRulesAggregator(sources)
return aggregator
def create_access_source(self, filename):
if os.path.isdir(filename): if os.path.isdir(filename):
return CacheDirectoryAccessSource(filename) return CacheDirectoryAccessSource(filename)
@ -52,18 +71,18 @@ class AccessChecker(object):
def find_access_rule(self, url, ts=None, urlkey=None): def find_access_rule(self, url, ts=None, urlkey=None):
params = {'url': url, 'urlkey': urlkey} params = {'url': url, 'urlkey': urlkey}
cdx_iter, errs = self.aggregator(params) acl_iter, errs = self.aggregator(params)
if errs: if errs:
print(errs) print(errs)
key = params['key'].decode('utf-8') key = params['key'].decode('utf-8')
for cdx in cdx_iter: for acl in acl_iter:
if 'urlkey' not in cdx: if 'urlkey' not in acl:
continue continue
if key.startswith(cdx['urlkey']): if key.startswith(acl['urlkey']):
return cdx return acl
return self.default_rule return self.default_rule

View File

@ -178,9 +178,9 @@ class WarcServer(BaseWarcServer):
else: else:
raise Exception('collection config must be string or dict') raise Exception('collection config must be string or dict')
# INDEX CONFIG
if index: if index:
agg = init_index_agg({name: index}) agg = init_index_agg({name: index})
else: else:
if not isinstance(coll_config, dict): if not isinstance(coll_config, dict):
raise Exception('collection config missing') raise Exception('collection config missing')
@ -196,9 +196,11 @@ class WarcServer(BaseWarcServer):
timeout = int(coll_config.get('timeout', 0)) timeout = int(coll_config.get('timeout', 0))
agg = init_index_agg(index_group, True, timeout) agg = init_index_agg(index_group, True, timeout)
# ARCHIVE CONFIG
if not archive_paths: if not archive_paths:
archive_paths = self.config.get('archive_paths') archive_paths = self.config.get('archive_paths')
# ACCESS CONFIG
access_checker = None access_checker = None
if acl_paths: if acl_paths:
access_checker = AccessChecker(acl_paths, default_access) access_checker = AccessChecker(acl_paths, default_access)

View File

@ -1,2 +1,3 @@
org,httpbin)/ - {"access": "allow"}
com,example)/ - {"access": "allow"} com,example)/ - {"access": "allow"}
bo,example)/ - {"access": "exclude"} bo,example)/ - {"access": "exclude"}

View File

@ -4,6 +4,24 @@ collections:
pywb: pywb:
index_paths: ./sample_archive/cdx/ index_paths: ./sample_archive/cdx/
archive_paths: ./sample_archive/warcs/ archive_paths: ./sample_archive/warcs/
acl_paths: ./sample_archive/access/ acl_paths: ./sample_archive/access/pywb.aclj
default_access: block default_access: block
pywb-acl-list:
index_paths: ./sample_archive/cdx/
archive_paths: ./sample_archive/warcs/
acl_paths:
- ./sample_archive/access/pywb.aclj
- ./sample_archive/access/list2.aclj
default_access: block
pywb-acl-dir:
index_paths: ./sample_archive/cdx/
archive_paths: ./sample_archive/warcs/
acl_paths: ./sample_archive/access/
default_access: block

View File

@ -12,9 +12,10 @@ class TestACLApp(BaseConfigTest):
def setup_class(cls): def setup_class(cls):
super(TestACLApp, cls).setup_class('config_test_access.yaml') super(TestACLApp, cls).setup_class('config_test_access.yaml')
def query(self, url, is_error=False, **params): def query(self, url, coll='pywb'):
params = {}
params['url'] = url params['url'] = url
return self.testapp.get('/pywb/cdx?' + urlencode(params, doseq=1), expect_errors=is_error) return self.testapp.get('/{coll}/cdx?'.format(coll=coll) + urlencode(params, doseq=1))
def test_excluded_url(self): def test_excluded_url(self):
resp = self.query('http://www.iana.org/') resp = self.query('http://www.iana.org/')
@ -52,4 +53,23 @@ class TestACLApp(BaseConfigTest):
assert 'Access Blocked' in resp.text assert 'Access Blocked' in resp.text
def test_allowed_different_coll_acl_list(self):
resp = self.query('http://httpbin.org/anything/resource.json', coll='pywb-acl-list')
assert len(resp.text.splitlines()) > 0
resp = self.testapp.get('/pywb-acl-list/mp_/http://httpbin.org/anything/resource.json')
assert '"http://httpbin.org/anything/resource.json"' in resp.text
def test_allowed_different_coll_acl_dir(self):
resp = self.query('http://httpbin.org/anything/resource.json', coll='pywb-acl-dir')
assert len(resp.text.splitlines()) > 0
resp = self.testapp.get('/pywb-acl-dir/mp_/http://httpbin.org/anything/resource.json')
assert '"http://httpbin.org/anything/resource.json"' in resp.text