1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

acl optimization: fixes ukwa/ukwa-pywb#39

- don't parse JSON on every aclj line; defer parsing until the key prefix matches, resulting in a speed boost!
- convert the aclj line to a dict (via CDXObject) only when a match is found (disable aggregator source tracking)
This commit is contained in:
Ilya Kreymer 2019-02-14 18:55:33 -08:00 committed by John Berlin
parent ce0ed610bd
commit 9b2ae35b93
No known key found for this signature in database
GPG Key ID: 6EF5E4B442011B02
3 changed files with 22 additions and 18 deletions

View File

@ -15,8 +15,9 @@ class FileAccessIndexSource(FileIndexSource):
def rev_cmp(a, b):
return (a < b) - (a > b)
def _get_gen(self, fh, params):
return search(fh, params['key'], prev_size=1, compare_func=self.rev_cmp)
def _do_iter(self, fh, params):
for line in search(fh, params['key'], prev_size=1, compare_func=self.rev_cmp):
yield line
# ============================================================================
@ -75,26 +76,28 @@ class AccessChecker(object):
raise Exception('Invalid Access Source: ' + filename)
def find_access_rule(self, url, ts=None, urlkey=None):
params = {'url': url, 'urlkey': urlkey}
params = {'url': url, 'urlkey': urlkey, 'nosource': 'true'}
acl_iter, errs = self.aggregator(params)
if errs:
print(errs)
key = params['key'].decode('utf-8')
key = params['key']
tld = key.split(',')[0]
tld = key.split(b',')[0]
for acl in acl_iter:
# skip empty/invalid lines
if 'urlkey' not in acl:
if not acl:
continue
if key.startswith(acl['urlkey']):
return acl
acl_key = acl.split(b' ')[0]
if key.startswith(acl_key):
return CDXObject(acl)
# if acl key already less than first tld,
# no match can be found
if acl['urlkey'] < tld:
if acl_key < tld:
break
return self.default_rule

View File

@ -76,20 +76,21 @@ class FileIndexSource(BaseIndexSource):
except IOError:
raise NotFoundException(filename)
def _get_gen(self, fh, params):
return iter_range(fh, params['key'], params['end_key'])
def load_index(self, params):
filename = res_template(self.filename_template, params)
fh = self._do_open(filename)
def do_load(fh):
def do_iter():
with fh:
for line in self._get_gen(fh, params):
yield CDXObject(line)
for obj in self._do_iter(fh, params):
yield obj
return do_load(fh)
return do_iter()
def _do_iter(self, fh, params):
for line in iter_range(fh, params['key'], params['end_key']):
yield CDXObject(line)
def __repr__(self):
return '{0}(file://{1})'.format(self.__class__.__name__,

View File

@ -69,9 +69,9 @@ com,example)/ - {"access": "allow", "url": "http://example.com/"}
assert out == """\
Matched rule:
com,example, - {"access": "exclude", "url": "com,example,", "source": "%s", "source-coll": "%s"}
com,example, - {"access": "exclude", "url": "com,example,"}
""" % (self.acl_filename, self.acl_filename)
"""
def test_remove_acl(self):
wb_manager(['acl', 'remove', self.acl_filename, 'com,example,'])