1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

access system work for ukwa/ukwa-pywb#7

- 'acl_paths' config accepts either a single file or directory path (string), or a list of file and/or directory paths
- tests_acl: test collection with acl list, single file, dir
This commit is contained in:
Ilya Kreymer 2018-02-18 12:04:15 -08:00 committed by John Berlin
parent 77eefcdce6
commit a3f81dcc0f
No known key found for this signature in database
GPG Key ID: 6EF5E4B442011B02
6 changed files with 79 additions and 17 deletions

View File

@ -35,8 +35,10 @@ else: #pragma: no cover
the input streams is already sorted (smallest to largest).
>>> list(merge([1,3,5,7], [0,2,4,8], [5,10,15,20], [], [25]))
[0, 1, 2, 3, 4, 5, 5, 7, 8, 10, 15, 20, 25]
If *key* is not None, applies a key function to each element to determine
its sort order.
>>> list(merge(['dog', 'horse'], ['cat', 'fish', 'kangaroo'], key=len))
['dog', 'cat', 'fish', 'horse', 'kangaroo']
'''

View File

@ -1,5 +1,6 @@
from pywb.warcserver.index.indexsource import FileIndexSource
from pywb.warcserver.index.aggregator import DirectoryIndexSource, CacheDirectoryMixin
from pywb.warcserver.index.aggregator import SimpleAggregator
from pywb.utils.binsearch import search
from pywb.utils.merge import merge
@ -18,13 +19,21 @@ class FileAccessIndexSource(FileIndexSource):
# ============================================================================
class DirectoryAccessSource(DirectoryIndexSource):
    """Directory index source restricted to ``.aclj`` access-rule files.

    NOTE(review): this is the pre-change form of the class; a later
    definition with the same name in this listing adds ReverseMergeMixin
    to the bases and would rebind the name.
    """

    # Maps the '.aclj' extension to the source class used to load it.
    INDEX_SOURCES = [('.aclj', FileAccessIndexSource)]
class ReverseMergeMixin(object):
    """Mixin whose ``_merge`` combines iterables with ``reverse=True``."""

    def _merge(self, iter_list):
        """Merge every iterable in *iter_list* into a single iterator.

        Delegates to ``merge`` with ``reverse=True``.
        NOTE(review): presumably each input must already be ordered from
        largest to smallest for the merged output to be ordered -- confirm
        against the ``merge`` implementation in use.
        """
        merged = merge(*iter_list, reverse=True)
        return merged
# ============================================================================
class AccessRulesAggregator(ReverseMergeMixin, SimpleAggregator):
    """SimpleAggregator variant that takes ``_merge`` from ReverseMergeMixin.

    Adds no behavior of its own; it only combines the two base classes.
    """
# ============================================================================
class DirectoryAccessSource(ReverseMergeMixin, DirectoryIndexSource):
    """Directory index source for ``.aclj`` files, merged via ReverseMergeMixin."""

    # Maps the '.aclj' extension to the source class used to load it.
    INDEX_SOURCES = [('.aclj', FileAccessIndexSource)]
# ============================================================================
class CacheDirectoryAccessSource(CacheDirectoryMixin, DirectoryAccessSource):
    """DirectoryAccessSource combined with CacheDirectoryMixin.

    Adds no behavior of its own; it only combines the two base classes.
    """
@ -32,15 +41,25 @@ class CacheDirectoryAccessSource(CacheDirectoryMixin, DirectoryAccessSource):
# ============================================================================
class AccessChecker(object):
def __init__(self, access_source, default_access='allow'):
    """Set up the rule aggregator and the fallback rule.

    :param access_source: one of
        - a ``str``: wrapped in a one-element list and passed to
          ``create_access_aggregator``;
        - a ``list``: passed to ``create_access_aggregator`` as-is;
        - anything else: stored unchanged as ``self.aggregator``
          (presumably an already-built aggregator -- confirm with callers).
    :param default_access: value stored under ``'access'`` in the default
        rule.
    """
    if isinstance(access_source, str):
        # Normalize the single-path form to the list form.
        access_source = [access_source]

    if isinstance(access_source, list):
        self.aggregator = self.create_access_aggregator(access_source)
    else:
        self.aggregator = access_source

    # Fallback rule; its empty 'urlkey' is a prefix of every key.
    self.default_rule = {'urlkey': '', 'access': default_access}
def create_access_aggregator(self, source_files):
    """Build an AccessRulesAggregator over the given paths.

    :param source_files: iterable of path strings; each one is turned into
        a source via ``self.create_access_source`` and keyed by the path
        itself, so a repeated path yields a single entry.
    :return: an ``AccessRulesAggregator`` wrapping the resulting mapping.
    """
    sources = {
        filename: self.create_access_source(filename)
        for filename in source_files
    }
    return AccessRulesAggregator(sources)
def create_access_source(self, filename):
if os.path.isdir(filename):
return CacheDirectoryAccessSource(filename)
@ -52,18 +71,18 @@ class AccessChecker(object):
def find_access_rule(self, url, ts=None, urlkey=None):
    """Return the first access rule whose ``urlkey`` prefixes the lookup key.

    :param url: url to look up; passed to the aggregator as ``params['url']``.
    :param ts: accepted but not used by this method.
    :param urlkey: optional value passed to the aggregator as
        ``params['urlkey']``.
    :return: the first matching rule yielded by the aggregator, or
        ``self.default_rule`` when none matches.
    """
    params = {'url': url, 'urlkey': urlkey}

    acl_iter, errs = self.aggregator(params)
    if errs:
        print(errs)

    # 'key' is not set above, so it must be added to params by the
    # aggregator call; it is decoded here as UTF-8 bytes.
    key = params['key'].decode('utf-8')

    for acl in acl_iter:
        # Entries without a 'urlkey' cannot be matched and are skipped.
        if 'urlkey' in acl and key.startswith(acl['urlkey']):
            return acl

    return self.default_rule

View File

@ -178,9 +178,9 @@ class WarcServer(BaseWarcServer):
else:
raise Exception('collection config must be string or dict')
# INDEX CONFIG
if index:
agg = init_index_agg({name: index})
else:
if not isinstance(coll_config, dict):
raise Exception('collection config missing')
@ -196,9 +196,11 @@ class WarcServer(BaseWarcServer):
timeout = int(coll_config.get('timeout', 0))
agg = init_index_agg(index_group, True, timeout)
# ARCHIVE CONFIG
if not archive_paths:
archive_paths = self.config.get('archive_paths')
# ACCESS CONFIG
access_checker = None
if acl_paths:
access_checker = AccessChecker(acl_paths, default_access)

View File

@ -1,2 +1,3 @@
org,httpbin)/ - {"access": "allow"}
com,example)/ - {"access": "allow"}
bo,example)/ - {"access": "exclude"}

View File

@ -4,6 +4,24 @@ collections:
pywb:
index_paths: ./sample_archive/cdx/
archive_paths: ./sample_archive/warcs/
acl_paths: ./sample_archive/access/
acl_paths: ./sample_archive/access/pywb.aclj
default_access: block
pywb-acl-list:
index_paths: ./sample_archive/cdx/
archive_paths: ./sample_archive/warcs/
acl_paths:
- ./sample_archive/access/pywb.aclj
- ./sample_archive/access/list2.aclj
default_access: block
pywb-acl-dir:
index_paths: ./sample_archive/cdx/
archive_paths: ./sample_archive/warcs/
acl_paths: ./sample_archive/access/
default_access: block

View File

@ -12,9 +12,10 @@ class TestACLApp(BaseConfigTest):
def setup_class(cls):
super(TestACLApp, cls).setup_class('config_test_access.yaml')
def query(self, url, coll='pywb'):
    """GET the cdx endpoint of collection *coll* for *url*.

    :param url: value sent as the ``url`` query parameter.
    :param coll: collection name used as the first path segment.
    :return: whatever ``self.testapp.get`` returns for the built path.
    """
    params = {'url': url}
    return self.testapp.get('/{coll}/cdx?'.format(coll=coll) + urlencode(params, doseq=1))
def test_excluded_url(self):
resp = self.query('http://www.iana.org/')
@ -52,4 +53,23 @@ class TestACLApp(BaseConfigTest):
assert 'Access Blocked' in resp.text
def test_allowed_different_coll_acl_list(self):
    """The url is served by the 'pywb-acl-list' collection.

    Checks both the cdx query and the ``mp_`` replay response.
    """
    target = 'http://httpbin.org/anything/resource.json'

    # The cdx query must return at least one line.
    cdx_resp = self.query(target, coll='pywb-acl-list')
    assert len(cdx_resp.text.splitlines()) > 0

    # The replayed body must contain the quoted url.
    replay_resp = self.testapp.get('/pywb-acl-list/mp_/http://httpbin.org/anything/resource.json')
    assert '"http://httpbin.org/anything/resource.json"' in replay_resp.text
def test_allowed_different_coll_acl_dir(self):
    """The url is served by the 'pywb-acl-dir' collection.

    Checks both the cdx query and the ``mp_`` replay response.
    """
    target = 'http://httpbin.org/anything/resource.json'

    # The cdx query must return at least one line.
    cdx_resp = self.query(target, coll='pywb-acl-dir')
    assert len(cdx_resp.text.splitlines()) > 0

    # The replayed body must contain the quoted url.
    replay_resp = self.testapp.get('/pywb-acl-dir/mp_/http://httpbin.org/anything/resource.json')
    assert '"http://httpbin.org/anything/resource.json"' in replay_resp.text