diff --git a/pywb/cdx/cdxops.py b/pywb/cdx/cdxops.py index 6963b28c..819214c4 100644 --- a/pywb/cdx/cdxops.py +++ b/pywb/cdx/cdxops.py @@ -18,11 +18,11 @@ def cdx_load(sources, query, perms_checker=None, process=True): :param sources: iterable for text CDX sources. :param perms_checker: access check filter object implementing - allow_url_lookup(key, url), allow_capture(cdxobj) and + allow_url_lookup(key), allow_capture(cdxobj) and filter_fields(cdxobj) methods. :param process: bool, perform processing sorting/filtering/grouping ops """ - cdx_iter = load_cdx_streams(sources, query) + cdx_iter = create_merged_cdx_gen(sources, query) cdx_iter = make_obj_iter(cdx_iter, query) if process and not query.secondary_index_only: @@ -52,7 +52,7 @@ def restrict_cdx(cdx_iter, query, perms_checker): :param query: request parameters (CDXQuery) :param perms_checker: object implementing permission checker """ - if not perms_checker.allow_url_lookup(query.key, query.url): + if not perms_checker.allow_url_lookup(query.key): if query.is_exact: raise AccessException('Excluded') @@ -96,8 +96,11 @@ def process_cdx(cdx_iter, query): #================================================================= -# load and source merge cdx streams -def load_cdx_streams(sources, query): +def create_merged_cdx_gen(sources, query): + """ + create a generator which loads and merges cdx streams + ensures cdxs are lazy loaded + """ # Optimize: no need to merge if just one input if len(sources) == 1: cdx_iter = sources[0].load_cdx(query) @@ -138,17 +141,18 @@ def cdx_reverse(cdx_iter, limit): for cdx in cdx_iter: last = cdx - return [last] if last else [] + yield last reverse_cdxs = deque(maxlen=limit) for cdx in cdx_iter: reverse_cdxs.appendleft(cdx) - return reverse_cdxs + for cdx in reverse_cdxs: + yield cdx - #================================================================= +#================================================================= # filter cdx by regex if each filter is field:regex form, # apply filter to cdx[field] def cdx_filter(cdx_iter, filter_strings): @@ -252,7 +256,8 @@ def cdx_sort_closest(closest, cdx_iter, limit=10): if len(closest_cdx) > limit: closest_cdx.pop() - return itertools.imap(lambda x: x[1], closest_cdx) + for cdx in itertools.imap(lambda x: x[1], closest_cdx): + yield cdx #================================================================= diff --git a/pywb/cdx/perms.py b/pywb/cdx/perms.py index ad6ea00d..10a7b7dd 100644 --- a/pywb/cdx/perms.py +++ b/pywb/cdx/perms.py @@ -5,7 +5,7 @@ class AllowAllPerms(object): """ Sample Perm Checker which allows all """ - def allow_url_lookup(self, urlkey, url): + def allow_url_lookup(self, urlkey): """ Return true/false if url or urlkey (canonicalized url) should be allowed diff --git a/pywb/cdx/test/test_perms.py b/pywb/cdx/test/test_perms.py index eb5a30ac..e500ac93 100644 --- a/pywb/cdx/test/test_perms.py +++ b/pywb/cdx/test/test_perms.py @@ -6,7 +6,7 @@ from pywb.cdx.cdxobject import AccessException from pytest import raises class BlockAllPerms(AllowAllPerms): - def allow_url_lookup(self, urlkey, url): + def allow_url_lookup(self, urlkey): return False diff --git a/tests/fixture.py b/tests/fixture.py index ff7c4307..ef895a37 100644 --- a/tests/fixture.py +++ b/tests/fixture.py @@ -34,7 +34,7 @@ class TestExclusionPerms(AllowAllPerms): # sample_archive has captures for this URLKEY URLKEY_EXCLUDED = 'org,iana)/_img/bookmark_icon.ico' - def allow_url_lookup(self, urlkey, url): + def allow_url_lookup(self, urlkey): """ Return true/false if url or urlkey (canonicalized url) should be allowed @@ -42,4 +42,4 @@ class TestExclusionPerms(AllowAllPerms): if urlkey == self.URLKEY_EXCLUDED: return False - return super(TestExclusionPerms, self).allow_url_lookup(urlkey, url) + return super(TestExclusionPerms, self).allow_url_lookup(urlkey)