mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
cdxops: make sure cdx_reverse and cdx_sort_closest are lazy (implemented as generators)
perms: allow_url_lookup() only takes key param for simplicity
This commit is contained in:
parent
5a28bc6992
commit
331976748e
@ -18,11 +18,11 @@ def cdx_load(sources, query, perms_checker=None, process=True):
|
|||||||
|
|
||||||
:param sources: iterable for text CDX sources.
|
:param sources: iterable for text CDX sources.
|
||||||
:param perms_checker: access check filter object implementing
|
:param perms_checker: access check filter object implementing
|
||||||
allow_url_lookup(key, url), allow_capture(cdxobj) and
|
allow_url_lookup(key), allow_capture(cdxobj) and
|
||||||
filter_fields(cdxobj) methods.
|
filter_fields(cdxobj) methods.
|
||||||
:param process: bool, perform processing sorting/filtering/grouping ops
|
:param process: bool, perform processing sorting/filtering/grouping ops
|
||||||
"""
|
"""
|
||||||
cdx_iter = load_cdx_streams(sources, query)
|
cdx_iter = create_merged_cdx_gen(sources, query)
|
||||||
cdx_iter = make_obj_iter(cdx_iter, query)
|
cdx_iter = make_obj_iter(cdx_iter, query)
|
||||||
|
|
||||||
if process and not query.secondary_index_only:
|
if process and not query.secondary_index_only:
|
||||||
@ -52,7 +52,7 @@ def restrict_cdx(cdx_iter, query, perms_checker):
|
|||||||
:param query: request parameters (CDXQuery)
|
:param query: request parameters (CDXQuery)
|
||||||
:param perms_checker: object implementing permission checker
|
:param perms_checker: object implementing permission checker
|
||||||
"""
|
"""
|
||||||
if not perms_checker.allow_url_lookup(query.key, query.url):
|
if not perms_checker.allow_url_lookup(query.key):
|
||||||
if query.is_exact:
|
if query.is_exact:
|
||||||
raise AccessException('Excluded')
|
raise AccessException('Excluded')
|
||||||
|
|
||||||
@ -96,8 +96,11 @@ def process_cdx(cdx_iter, query):
|
|||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
# load and source merge cdx streams
|
def create_merged_cdx_gen(sources, query):
|
||||||
def load_cdx_streams(sources, query):
|
"""
|
||||||
|
create a generator which loads and merges cdx streams
|
||||||
|
ensures cdxs are lazy loaded
|
||||||
|
"""
|
||||||
# Optimize: no need to merge if just one input
|
# Optimize: no need to merge if just one input
|
||||||
if len(sources) == 1:
|
if len(sources) == 1:
|
||||||
cdx_iter = sources[0].load_cdx(query)
|
cdx_iter = sources[0].load_cdx(query)
|
||||||
@ -138,17 +141,18 @@ def cdx_reverse(cdx_iter, limit):
|
|||||||
for cdx in cdx_iter:
|
for cdx in cdx_iter:
|
||||||
last = cdx
|
last = cdx
|
||||||
|
|
||||||
return [last] if last else []
|
yield last
|
||||||
|
|
||||||
reverse_cdxs = deque(maxlen=limit)
|
reverse_cdxs = deque(maxlen=limit)
|
||||||
|
|
||||||
for cdx in cdx_iter:
|
for cdx in cdx_iter:
|
||||||
reverse_cdxs.appendleft(cdx)
|
reverse_cdxs.appendleft(cdx)
|
||||||
|
|
||||||
return reverse_cdxs
|
for cdx in reverse_cdxs:
|
||||||
|
yield cdx
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
# filter cdx by regex if each filter is field:regex form,
|
# filter cdx by regex if each filter is field:regex form,
|
||||||
# apply filter to cdx[field]
|
# apply filter to cdx[field]
|
||||||
def cdx_filter(cdx_iter, filter_strings):
|
def cdx_filter(cdx_iter, filter_strings):
|
||||||
@ -252,7 +256,8 @@ def cdx_sort_closest(closest, cdx_iter, limit=10):
|
|||||||
if len(closest_cdx) > limit:
|
if len(closest_cdx) > limit:
|
||||||
closest_cdx.pop()
|
closest_cdx.pop()
|
||||||
|
|
||||||
return itertools.imap(lambda x: x[1], closest_cdx)
|
for cdx in itertools.imap(lambda x: x[1], closest_cdx):
|
||||||
|
yield cdx
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
|
@ -5,7 +5,7 @@ class AllowAllPerms(object):
|
|||||||
"""
|
"""
|
||||||
Sample Perm Checker which allows all
|
Sample Perm Checker which allows all
|
||||||
"""
|
"""
|
||||||
def allow_url_lookup(self, urlkey, url):
|
def allow_url_lookup(self, urlkey):
|
||||||
"""
|
"""
|
||||||
Return true/false if url or urlkey (canonicalized url)
|
Return true/false if url or urlkey (canonicalized url)
|
||||||
should be allowed
|
should be allowed
|
||||||
|
@ -6,7 +6,7 @@ from pywb.cdx.cdxobject import AccessException
|
|||||||
from pytest import raises
|
from pytest import raises
|
||||||
|
|
||||||
class BlockAllPerms(AllowAllPerms):
|
class BlockAllPerms(AllowAllPerms):
|
||||||
def allow_url_lookup(self, urlkey, url):
|
def allow_url_lookup(self, urlkey):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
@ -34,7 +34,7 @@ class TestExclusionPerms(AllowAllPerms):
|
|||||||
# sample_archive has captures for this URLKEY
|
# sample_archive has captures for this URLKEY
|
||||||
URLKEY_EXCLUDED = 'org,iana)/_img/bookmark_icon.ico'
|
URLKEY_EXCLUDED = 'org,iana)/_img/bookmark_icon.ico'
|
||||||
|
|
||||||
def allow_url_lookup(self, urlkey, url):
|
def allow_url_lookup(self, urlkey):
|
||||||
"""
|
"""
|
||||||
Return true/false if url or urlkey (canonicalized url)
|
Return true/false if url or urlkey (canonicalized url)
|
||||||
should be allowed
|
should be allowed
|
||||||
@ -42,4 +42,4 @@ class TestExclusionPerms(AllowAllPerms):
|
|||||||
if urlkey == self.URLKEY_EXCLUDED:
|
if urlkey == self.URLKEY_EXCLUDED:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
return super(TestExclusionPerms, self).allow_url_lookup(urlkey, url)
|
return super(TestExclusionPerms, self).allow_url_lookup(urlkey)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user