1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

sample perms addition to cdx ops

This commit is contained in:
Ilya Kreymer 2014-02-19 17:52:13 -08:00
parent 531464902f
commit be284859be

View File

@ -10,44 +10,25 @@ from collections import deque
#=================================================================
def cdx_text_out(cdx, fields):
if not fields:
return str(cdx)
else:
return ' '.join(map(lambda x: cdx[x], fields.split(',')))
class AllowAllPerms:
"""
Sample Perm Checker which allows all
"""
def allow_url(self, url):
return True
def allow_url_timestamp(self, url, timestamp):
return True
def filter_fields(self, cdx):
return cdx
#=================================================================
def cdx_load(sources, params):
cdx_iter = load_cdx_streams(sources, params)
def cdx_load(source, params, perms_checker = AllowAllPerms()):
cdx_iter = make_cdx_iter(cdx_iter)
if not params.get('proxy_all'):
resolve_revisits = params.get('resolve_revisits', False)
if resolve_revisits:
cdx_iter = cdx_resolve_revisits(cdx_iter)
filters = params.get('filter', None)
if filters:
cdx_iter = cdx_filter(cdx_iter, filters)
collapse_time = params.get('collapse_time', None)
if collapse_time:
cdx_iter = cdx_collapse_time_status(cdx_iter, collapse_time)
limit = int(params.get('limit', 1000000))
reverse = params.get('reverse', False)
if reverse:
cdx_iter = cdx_reverse(cdx_iter, limit)
closest_to = params.get('closest', None)
if closest_to:
cdx_iter = cdx_sort_closest(closest_to, cdx_iter, limit)
if limit:
cdx_iter = cdx_limit(cdx_iter, limit)
#cdx_iter = cdx_load_all(source, params)
cdx_iter = cdx_load_with_perms(source, params, perms_checker)
# output raw cdx objects
if params.get('output') == 'raw':
@ -60,6 +41,69 @@ def cdx_load(sources, params):
return write_cdx(params.get('fields'))
#=================================================================
def cdx_load_with_perms(source, params, perms_checker):
if not perms_checker.allow_url(params['url']):
if params.get('matchType', 'exact') == 'exact':
yield
cdx_iter = cdx_load_all(source, params)
for cdx in cdx_iter:
if not perms_checker.allow_url_timestamp(cdx['original'],
cdx['timestamp']):
continue
cdx = perms_checker.filter_fields(cdx)
yield cdx
#=================================================================
def cdx_text_out(cdx, fields):
if not fields:
return str(cdx)
else:
return ' '.join(map(lambda x: cdx[x], fields.split(',')))
#=================================================================
def cdx_load_all(sources, params):
cdx_iter = load_cdx_streams(sources, params)
cdx_iter = make_cdx_iter(cdx_iter)
if params.get('proxy_all'):
return cdx_iter
resolve_revisits = params.get('resolve_revisits', False)
if resolve_revisits:
cdx_iter = cdx_resolve_revisits(cdx_iter)
filters = params.get('filter', None)
if filters:
cdx_iter = cdx_filter(cdx_iter, filters)
collapse_time = params.get('collapse_time', None)
if collapse_time:
cdx_iter = cdx_collapse_time_status(cdx_iter, collapse_time)
limit = int(params.get('limit', 1000000))
reverse = params.get('reverse', False)
if reverse:
cdx_iter = cdx_reverse(cdx_iter, limit)
closest_to = params.get('closest', None)
if closest_to:
cdx_iter = cdx_sort_closest(closest_to, cdx_iter, limit)
if limit:
cdx_iter = cdx_limit(cdx_iter, limit)
return cdx_iter
#=================================================================
# load and source merge cdx streams
def load_cdx_streams(sources, params):