mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
acl optimization: addresses ukwa/ukwa-pywb#38
- stop checking acl rules linearly if acl key < tld
- use existing rule for same url (at least until date-range checking)
This commit is contained in:
parent
60ad1739b7
commit
0c08b9b5d5
@ -76,21 +76,27 @@ class AccessChecker(object):
|
|||||||
|
|
||||||
def find_access_rule(self, url, ts=None, urlkey=None):
|
def find_access_rule(self, url, ts=None, urlkey=None):
|
||||||
params = {'url': url, 'urlkey': urlkey}
|
params = {'url': url, 'urlkey': urlkey}
|
||||||
print("Getting acl_iter...')
|
|
||||||
acl_iter, errs = self.aggregator(params)
|
acl_iter, errs = self.aggregator(params)
|
||||||
if errs:
|
if errs:
|
||||||
print(errs)
|
print(errs)
|
||||||
|
|
||||||
key = params['key'].decode('utf-8')
|
key = params['key'].decode('utf-8')
|
||||||
|
|
||||||
print("Iterating acl_iter...')
|
tld = key.split(',')[0]
|
||||||
|
|
||||||
for acl in acl_iter:
|
for acl in acl_iter:
|
||||||
|
# skip empty/invalid lines
|
||||||
if 'urlkey' not in acl:
|
if 'urlkey' not in acl:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if key.startswith(acl['urlkey']):
|
if key.startswith(acl['urlkey']):
|
||||||
return acl
|
return acl
|
||||||
|
|
||||||
|
# if acl key already less than first tld,
|
||||||
|
# no match can be found
|
||||||
|
if acl['urlkey'] < tld:
|
||||||
|
break
|
||||||
|
|
||||||
return self.default_rule
|
return self.default_rule
|
||||||
|
|
||||||
def __call__(self, res):
|
def __call__(self, res):
|
||||||
@ -102,21 +108,24 @@ class AccessChecker(object):
|
|||||||
last_url = None
|
last_url = None
|
||||||
|
|
||||||
for cdx in cdx_iter:
|
for cdx in cdx_iter:
|
||||||
print("Looking at",cdx)
|
|
||||||
url = cdx.get('url')
|
url = cdx.get('url')
|
||||||
print(url)
|
|
||||||
# if no url, possible idx or other object, don't apply any checks and pass through
|
# if no url, possible idx or other object, don't apply any checks and pass through
|
||||||
if not url:
|
if not url:
|
||||||
yield cdx
|
yield cdx
|
||||||
continue
|
continue
|
||||||
|
|
||||||
rule = self.find_access_rule(url, cdx.get('timestamp'), cdx.get('urlkey'))
|
# TODO: optimization until date range support is included
|
||||||
print(rule)
|
if url == last_url:
|
||||||
|
rule = last_rule
|
||||||
|
else:
|
||||||
|
rule = self.find_access_rule(url, cdx.get('timestamp'), cdx.get('urlkey'))
|
||||||
|
|
||||||
access = rule.get('access', 'exclude')
|
access = rule.get('access', 'exclude')
|
||||||
print(access)
|
|
||||||
if access == 'exclude':
|
if access == 'exclude':
|
||||||
continue
|
continue
|
||||||
|
|
||||||
print("Yielding...")
|
|
||||||
cdx['access'] = access
|
cdx['access'] = access
|
||||||
yield cdx
|
yield cdx
|
||||||
|
|
||||||
|
last_rule = rule
|
||||||
|
last_url = url
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
org,iana)/about - {"access": "block"}
|
org,iana)/about - {"access": "block"}
|
||||||
org,iana)/_css/2013.1/fonts/opensans-semibold.ttf - {"access": "allow"}
|
org,iana)/_css/2013.1/fonts/opensans-semibold.ttf - {"access": "allow"}
|
||||||
org,iana)/_css - {"access": "exclude"}
|
org,iana)/_css - {"access": "exclude"}
|
||||||
org,example)/?example=1 - {"access": "block"}
|
|
||||||
org,iana)/ - {"access": "exclude"}
|
org,iana)/ - {"access": "exclude"}
|
||||||
|
org,example)/?example=1 - {"access": "block"}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user