mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
access system work for ukwa/ukwa-pywb#7
- 'acl_paths' config can accept a list of files or directories, or a single file or directory path as a string
- tests_acl: test collections configured with an ACL list, a single ACL file, and an ACL directory
This commit is contained in:
parent
77eefcdce6
commit
a3f81dcc0f
@ -35,8 +35,10 @@ else: #pragma: no cover
|
|||||||
the input streams is already sorted (smallest to largest).
|
the input streams is already sorted (smallest to largest).
|
||||||
>>> list(merge([1,3,5,7], [0,2,4,8], [5,10,15,20], [], [25]))
|
>>> list(merge([1,3,5,7], [0,2,4,8], [5,10,15,20], [], [25]))
|
||||||
[0, 1, 2, 3, 4, 5, 5, 7, 8, 10, 15, 20, 25]
|
[0, 1, 2, 3, 4, 5, 5, 7, 8, 10, 15, 20, 25]
|
||||||
|
|
||||||
If *key* is not None, applies a key function to each element to determine
|
If *key* is not None, applies a key function to each element to determine
|
||||||
its sort order.
|
its sort order.
|
||||||
|
|
||||||
>>> list(merge(['dog', 'horse'], ['cat', 'fish', 'kangaroo'], key=len))
|
>>> list(merge(['dog', 'horse'], ['cat', 'fish', 'kangaroo'], key=len))
|
||||||
['dog', 'cat', 'fish', 'horse', 'kangaroo']
|
['dog', 'cat', 'fish', 'horse', 'kangaroo']
|
||||||
'''
|
'''
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
from pywb.warcserver.index.indexsource import FileIndexSource
|
from pywb.warcserver.index.indexsource import FileIndexSource
|
||||||
from pywb.warcserver.index.aggregator import DirectoryIndexSource, CacheDirectoryMixin
|
from pywb.warcserver.index.aggregator import DirectoryIndexSource, CacheDirectoryMixin
|
||||||
|
from pywb.warcserver.index.aggregator import SimpleAggregator
|
||||||
|
|
||||||
from pywb.utils.binsearch import search
|
from pywb.utils.binsearch import search
|
||||||
from pywb.utils.merge import merge
|
from pywb.utils.merge import merge
|
||||||
@ -18,13 +19,21 @@ class FileAccessIndexSource(FileIndexSource):
|
|||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
class DirectoryAccessSource(DirectoryIndexSource):
|
class ReverseMergeMixin(object):
|
||||||
INDEX_SOURCES = [('.aclj', FileAccessIndexSource)]
|
|
||||||
|
|
||||||
def _merge(self, iter_list):
|
def _merge(self, iter_list):
|
||||||
return merge(*(iter_list), reverse=True)
|
return merge(*(iter_list), reverse=True)
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
class AccessRulesAggregator(ReverseMergeMixin, SimpleAggregator):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
class DirectoryAccessSource(ReverseMergeMixin, DirectoryIndexSource):
|
||||||
|
INDEX_SOURCES = [('.aclj', FileAccessIndexSource)]
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
class CacheDirectoryAccessSource(CacheDirectoryMixin, DirectoryAccessSource):
|
class CacheDirectoryAccessSource(CacheDirectoryMixin, DirectoryAccessSource):
|
||||||
pass
|
pass
|
||||||
@ -32,15 +41,25 @@ class CacheDirectoryAccessSource(CacheDirectoryMixin, DirectoryAccessSource):
|
|||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
class AccessChecker(object):
|
class AccessChecker(object):
|
||||||
def __init__(self, access_source_file, default_access='allow'):
|
def __init__(self, access_source, default_access='allow'):
|
||||||
if isinstance(access_source_file, str):
|
if isinstance(access_source, str):
|
||||||
self.aggregator = self.create_access_aggregator(access_source_file)
|
self.aggregator = self.create_access_aggregator([access_source])
|
||||||
|
elif isinstance(access_source, list):
|
||||||
|
self.aggregator = self.create_access_aggregator(access_source)
|
||||||
else:
|
else:
|
||||||
self.aggregator = access_source_file
|
self.aggregator = access_source
|
||||||
|
|
||||||
self.default_rule = {'urlkey': '', 'access': default_access}
|
self.default_rule = {'urlkey': '', 'access': default_access}
|
||||||
|
|
||||||
def create_access_aggregator(self, filename):
|
def create_access_aggregator(self, source_files):
|
||||||
|
sources = {}
|
||||||
|
for filename in source_files:
|
||||||
|
sources[filename] = self.create_access_source(filename)
|
||||||
|
|
||||||
|
aggregator = AccessRulesAggregator(sources)
|
||||||
|
return aggregator
|
||||||
|
|
||||||
|
def create_access_source(self, filename):
|
||||||
if os.path.isdir(filename):
|
if os.path.isdir(filename):
|
||||||
return CacheDirectoryAccessSource(filename)
|
return CacheDirectoryAccessSource(filename)
|
||||||
|
|
||||||
@ -52,18 +71,18 @@ class AccessChecker(object):
|
|||||||
|
|
||||||
def find_access_rule(self, url, ts=None, urlkey=None):
|
def find_access_rule(self, url, ts=None, urlkey=None):
|
||||||
params = {'url': url, 'urlkey': urlkey}
|
params = {'url': url, 'urlkey': urlkey}
|
||||||
cdx_iter, errs = self.aggregator(params)
|
acl_iter, errs = self.aggregator(params)
|
||||||
if errs:
|
if errs:
|
||||||
print(errs)
|
print(errs)
|
||||||
|
|
||||||
key = params['key'].decode('utf-8')
|
key = params['key'].decode('utf-8')
|
||||||
|
|
||||||
for cdx in cdx_iter:
|
for acl in acl_iter:
|
||||||
if 'urlkey' not in cdx:
|
if 'urlkey' not in acl:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if key.startswith(cdx['urlkey']):
|
if key.startswith(acl['urlkey']):
|
||||||
return cdx
|
return acl
|
||||||
|
|
||||||
return self.default_rule
|
return self.default_rule
|
||||||
|
|
||||||
|
@ -178,9 +178,9 @@ class WarcServer(BaseWarcServer):
|
|||||||
else:
|
else:
|
||||||
raise Exception('collection config must be string or dict')
|
raise Exception('collection config must be string or dict')
|
||||||
|
|
||||||
|
# INDEX CONFIG
|
||||||
if index:
|
if index:
|
||||||
agg = init_index_agg({name: index})
|
agg = init_index_agg({name: index})
|
||||||
|
|
||||||
else:
|
else:
|
||||||
if not isinstance(coll_config, dict):
|
if not isinstance(coll_config, dict):
|
||||||
raise Exception('collection config missing')
|
raise Exception('collection config missing')
|
||||||
@ -196,9 +196,11 @@ class WarcServer(BaseWarcServer):
|
|||||||
timeout = int(coll_config.get('timeout', 0))
|
timeout = int(coll_config.get('timeout', 0))
|
||||||
agg = init_index_agg(index_group, True, timeout)
|
agg = init_index_agg(index_group, True, timeout)
|
||||||
|
|
||||||
|
# ARCHIVE CONFIG
|
||||||
if not archive_paths:
|
if not archive_paths:
|
||||||
archive_paths = self.config.get('archive_paths')
|
archive_paths = self.config.get('archive_paths')
|
||||||
|
|
||||||
|
# ACCESS CONFIG
|
||||||
access_checker = None
|
access_checker = None
|
||||||
if acl_paths:
|
if acl_paths:
|
||||||
access_checker = AccessChecker(acl_paths, default_access)
|
access_checker = AccessChecker(acl_paths, default_access)
|
||||||
|
@ -1,2 +1,3 @@
|
|||||||
|
org,httpbin)/ - {"access": "allow"}
|
||||||
com,example)/ - {"access": "allow"}
|
com,example)/ - {"access": "allow"}
|
||||||
bo,example)/ - {"access": "exclude"}
|
bo,example)/ - {"access": "exclude"}
|
||||||
|
@ -4,6 +4,24 @@ collections:
|
|||||||
pywb:
|
pywb:
|
||||||
index_paths: ./sample_archive/cdx/
|
index_paths: ./sample_archive/cdx/
|
||||||
archive_paths: ./sample_archive/warcs/
|
archive_paths: ./sample_archive/warcs/
|
||||||
acl_paths: ./sample_archive/access/
|
acl_paths: ./sample_archive/access/pywb.aclj
|
||||||
|
|
||||||
default_access: block
|
default_access: block
|
||||||
|
|
||||||
|
pywb-acl-list:
|
||||||
|
index_paths: ./sample_archive/cdx/
|
||||||
|
archive_paths: ./sample_archive/warcs/
|
||||||
|
acl_paths:
|
||||||
|
- ./sample_archive/access/pywb.aclj
|
||||||
|
- ./sample_archive/access/list2.aclj
|
||||||
|
|
||||||
|
default_access: block
|
||||||
|
|
||||||
|
pywb-acl-dir:
|
||||||
|
index_paths: ./sample_archive/cdx/
|
||||||
|
archive_paths: ./sample_archive/warcs/
|
||||||
|
acl_paths: ./sample_archive/access/
|
||||||
|
|
||||||
|
default_access: block
|
||||||
|
|
||||||
|
|
||||||
|
@ -12,9 +12,10 @@ class TestACLApp(BaseConfigTest):
|
|||||||
def setup_class(cls):
|
def setup_class(cls):
|
||||||
super(TestACLApp, cls).setup_class('config_test_access.yaml')
|
super(TestACLApp, cls).setup_class('config_test_access.yaml')
|
||||||
|
|
||||||
def query(self, url, is_error=False, **params):
|
def query(self, url, coll='pywb'):
|
||||||
|
params = {}
|
||||||
params['url'] = url
|
params['url'] = url
|
||||||
return self.testapp.get('/pywb/cdx?' + urlencode(params, doseq=1), expect_errors=is_error)
|
return self.testapp.get('/{coll}/cdx?'.format(coll=coll) + urlencode(params, doseq=1))
|
||||||
|
|
||||||
def test_excluded_url(self):
|
def test_excluded_url(self):
|
||||||
resp = self.query('http://www.iana.org/')
|
resp = self.query('http://www.iana.org/')
|
||||||
@ -52,4 +53,23 @@ class TestACLApp(BaseConfigTest):
|
|||||||
|
|
||||||
assert 'Access Blocked' in resp.text
|
assert 'Access Blocked' in resp.text
|
||||||
|
|
||||||
|
def test_allowed_different_coll_acl_list(self):
|
||||||
|
resp = self.query('http://httpbin.org/anything/resource.json', coll='pywb-acl-list')
|
||||||
|
|
||||||
|
assert len(resp.text.splitlines()) > 0
|
||||||
|
|
||||||
|
resp = self.testapp.get('/pywb-acl-list/mp_/http://httpbin.org/anything/resource.json')
|
||||||
|
|
||||||
|
assert '"http://httpbin.org/anything/resource.json"' in resp.text
|
||||||
|
|
||||||
|
def test_allowed_different_coll_acl_dir(self):
|
||||||
|
resp = self.query('http://httpbin.org/anything/resource.json', coll='pywb-acl-dir')
|
||||||
|
|
||||||
|
assert len(resp.text.splitlines()) > 0
|
||||||
|
|
||||||
|
resp = self.testapp.get('/pywb-acl-dir/mp_/http://httpbin.org/anything/resource.json')
|
||||||
|
|
||||||
|
assert '"http://httpbin.org/anything/resource.json"' in resp.text
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user