1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

cdxops: make sure cdx_reverse and cdx_sort_closest are lazy (convert them to generators)

perms: allow_url_lookup() now takes only the key param (the url param is dropped) for simplicity
This commit is contained in:
Ilya Kreymer 2014-03-03 12:16:07 -08:00
parent 5a28bc6992
commit 331976748e
4 changed files with 18 additions and 13 deletions

View File

@ -18,11 +18,11 @@ def cdx_load(sources, query, perms_checker=None, process=True):
:param sources: iterable for text CDX sources. :param sources: iterable for text CDX sources.
:param perms_checker: access check filter object implementing :param perms_checker: access check filter object implementing
allow_url_lookup(key, url), allow_capture(cdxobj) and allow_url_lookup(key), allow_capture(cdxobj) and
filter_fields(cdxobj) methods. filter_fields(cdxobj) methods.
:param process: bool, perform processing sorting/filtering/grouping ops :param process: bool, perform processing sorting/filtering/grouping ops
""" """
cdx_iter = load_cdx_streams(sources, query) cdx_iter = create_merged_cdx_gen(sources, query)
cdx_iter = make_obj_iter(cdx_iter, query) cdx_iter = make_obj_iter(cdx_iter, query)
if process and not query.secondary_index_only: if process and not query.secondary_index_only:
@ -52,7 +52,7 @@ def restrict_cdx(cdx_iter, query, perms_checker):
:param query: request parameters (CDXQuery) :param query: request parameters (CDXQuery)
:param perms_checker: object implementing permission checker :param perms_checker: object implementing permission checker
""" """
if not perms_checker.allow_url_lookup(query.key, query.url): if not perms_checker.allow_url_lookup(query.key):
if query.is_exact: if query.is_exact:
raise AccessException('Excluded') raise AccessException('Excluded')
@ -96,8 +96,11 @@ def process_cdx(cdx_iter, query):
#================================================================= #=================================================================
# load and source merge cdx streams def create_merged_cdx_gen(sources, query):
def load_cdx_streams(sources, query): """
create a generator which loads and merges cdx streams
ensures cdxs are lazy loaded
"""
# Optimize: no need to merge if just one input # Optimize: no need to merge if just one input
if len(sources) == 1: if len(sources) == 1:
cdx_iter = sources[0].load_cdx(query) cdx_iter = sources[0].load_cdx(query)
@ -138,17 +141,18 @@ def cdx_reverse(cdx_iter, limit):
for cdx in cdx_iter: for cdx in cdx_iter:
last = cdx last = cdx
return [last] if last else [] yield last
reverse_cdxs = deque(maxlen=limit) reverse_cdxs = deque(maxlen=limit)
for cdx in cdx_iter: for cdx in cdx_iter:
reverse_cdxs.appendleft(cdx) reverse_cdxs.appendleft(cdx)
return reverse_cdxs for cdx in reverse_cdxs:
yield cdx
#================================================================= #=================================================================
# filter cdx by regex if each filter is field:regex form, # filter cdx by regex if each filter is field:regex form,
# apply filter to cdx[field] # apply filter to cdx[field]
def cdx_filter(cdx_iter, filter_strings): def cdx_filter(cdx_iter, filter_strings):
@ -252,7 +256,8 @@ def cdx_sort_closest(closest, cdx_iter, limit=10):
if len(closest_cdx) > limit: if len(closest_cdx) > limit:
closest_cdx.pop() closest_cdx.pop()
return itertools.imap(lambda x: x[1], closest_cdx) for cdx in itertools.imap(lambda x: x[1], closest_cdx):
yield cdx
#================================================================= #=================================================================

View File

@ -5,7 +5,7 @@ class AllowAllPerms(object):
""" """
Sample Perm Checker which allows all Sample Perm Checker which allows all
""" """
def allow_url_lookup(self, urlkey, url): def allow_url_lookup(self, urlkey):
""" """
Return true/false if url or urlkey (canonicalized url) Return true/false if url or urlkey (canonicalized url)
should be allowed should be allowed

View File

@ -6,7 +6,7 @@ from pywb.cdx.cdxobject import AccessException
from pytest import raises from pytest import raises
class BlockAllPerms(AllowAllPerms): class BlockAllPerms(AllowAllPerms):
def allow_url_lookup(self, urlkey, url): def allow_url_lookup(self, urlkey):
return False return False

View File

@ -34,7 +34,7 @@ class TestExclusionPerms(AllowAllPerms):
# sample_archive has captures for this URLKEY # sample_archive has captures for this URLKEY
URLKEY_EXCLUDED = 'org,iana)/_img/bookmark_icon.ico' URLKEY_EXCLUDED = 'org,iana)/_img/bookmark_icon.ico'
def allow_url_lookup(self, urlkey, url): def allow_url_lookup(self, urlkey):
""" """
Return true/false if url or urlkey (canonicalized url) Return true/false if url or urlkey (canonicalized url)
should be allowed should be allowed
@ -42,4 +42,4 @@ class TestExclusionPerms(AllowAllPerms):
if urlkey == self.URLKEY_EXCLUDED: if urlkey == self.URLKEY_EXCLUDED:
return False return False
return super(TestExclusionPerms, self).allow_url_lookup(urlkey, url) return super(TestExclusionPerms, self).allow_url_lookup(urlkey)