From be284859be99f7403073f6df2557ca62e79178bc Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 19 Feb 2014 17:52:13 -0800 Subject: [PATCH] sample perms addition to cdx ops --- pywb/cdx/cdxops.py | 112 +++++++++++++++++++++++++++++++-------------- 1 file changed, 78 insertions(+), 34 deletions(-) diff --git a/pywb/cdx/cdxops.py b/pywb/cdx/cdxops.py index 4af38cfd..99798e9b 100644 --- a/pywb/cdx/cdxops.py +++ b/pywb/cdx/cdxops.py @@ -10,44 +10,25 @@ from collections import deque #================================================================= -def cdx_text_out(cdx, fields): - if not fields: - return str(cdx) - else: - return ' '.join(map(lambda x: cdx[x], fields.split(','))) +class AllowAllPerms: + """ + Sample Perm Checker which allows all + """ + def allow_url(self, url): + return True + + def allow_url_timestamp(self, url, timestamp): + return True + + def filter_fields(self, cdx): + return cdx #================================================================= -def cdx_load(sources, params): - cdx_iter = load_cdx_streams(sources, params) +def cdx_load(source, params, perms_checker = AllowAllPerms()): - cdx_iter = make_cdx_iter(cdx_iter) - - if not params.get('proxy_all'): - resolve_revisits = params.get('resolve_revisits', False) - if resolve_revisits: - cdx_iter = cdx_resolve_revisits(cdx_iter) - - filters = params.get('filter', None) - if filters: - cdx_iter = cdx_filter(cdx_iter, filters) - - collapse_time = params.get('collapse_time', None) - if collapse_time: - cdx_iter = cdx_collapse_time_status(cdx_iter, collapse_time) - - limit = int(params.get('limit', 1000000)) - - reverse = params.get('reverse', False) - if reverse: - cdx_iter = cdx_reverse(cdx_iter, limit) - - closest_to = params.get('closest', None) - if closest_to: - cdx_iter = cdx_sort_closest(closest_to, cdx_iter, limit) - - if limit: - cdx_iter = cdx_limit(cdx_iter, limit) + #cdx_iter = cdx_load_all(source, params) + cdx_iter = cdx_load_with_perms(source, params, perms_checker) # output raw cdx objects if params.get('output') == 'raw': @@ -60,6 +41,69 @@ def cdx_load(sources, params): return write_cdx(params.get('fields')) +#================================================================= +def cdx_load_with_perms(source, params, perms_checker): + if not perms_checker.allow_url(params['url']): + if params.get('matchType', 'exact') == 'exact': + yield + + cdx_iter = cdx_load_all(source, params) + + for cdx in cdx_iter: + if not perms_checker.allow_url_timestamp(cdx['original'], + cdx['timestamp']): + continue + + cdx = perms_checker.filter_fields(cdx) + + yield cdx + + +#================================================================= +def cdx_text_out(cdx, fields): + if not fields: + return str(cdx) + else: + return ' '.join(map(lambda x: cdx[x], fields.split(','))) + + +#================================================================= +def cdx_load_all(sources, params): + cdx_iter = load_cdx_streams(sources, params) + + cdx_iter = make_cdx_iter(cdx_iter) + + if params.get('proxy_all'): + return cdx_iter + + resolve_revisits = params.get('resolve_revisits', False) + if resolve_revisits: + cdx_iter = cdx_resolve_revisits(cdx_iter) + + filters = params.get('filter', None) + if filters: + cdx_iter = cdx_filter(cdx_iter, filters) + + collapse_time = params.get('collapse_time', None) + if collapse_time: + cdx_iter = cdx_collapse_time_status(cdx_iter, collapse_time) + + limit = int(params.get('limit', 1000000)) + + reverse = params.get('reverse', False) + if reverse: + cdx_iter = cdx_reverse(cdx_iter, limit) + + closest_to = params.get('closest', None) + if closest_to: + cdx_iter = cdx_sort_closest(closest_to, cdx_iter, limit) + + if limit: + cdx_iter = cdx_limit(cdx_iter, limit) + + return cdx_iter + + #================================================================= # load and source merge cdx streams def load_cdx_streams(sources, params):