From be284859be99f7403073f6df2557ca62e79178bc Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 19 Feb 2014 17:52:13 -0800 Subject: [PATCH 1/2] sample perms addition to cdx ops --- pywb/cdx/cdxops.py | 112 +++++++++++++++++++++++++++++++-------------- 1 file changed, 78 insertions(+), 34 deletions(-) diff --git a/pywb/cdx/cdxops.py b/pywb/cdx/cdxops.py index 4af38cfd..99798e9b 100644 --- a/pywb/cdx/cdxops.py +++ b/pywb/cdx/cdxops.py @@ -10,44 +10,25 @@ from collections import deque #================================================================= -def cdx_text_out(cdx, fields): - if not fields: - return str(cdx) - else: - return ' '.join(map(lambda x: cdx[x], fields.split(','))) +class AllowAllPerms: + """ + Sample Perm Checker which allows all + """ + def allow_url(self, url): + return True + + def allow_url_timestamp(self, url, timestamp): + return True + + def filter_fields(self, cdx): + return cdx #================================================================= -def cdx_load(sources, params): - cdx_iter = load_cdx_streams(sources, params) +def cdx_load(source, params, perms_checker = AllowAllPerms()): - cdx_iter = make_cdx_iter(cdx_iter) - - if not params.get('proxy_all'): - resolve_revisits = params.get('resolve_revisits', False) - if resolve_revisits: - cdx_iter = cdx_resolve_revisits(cdx_iter) - - filters = params.get('filter', None) - if filters: - cdx_iter = cdx_filter(cdx_iter, filters) - - collapse_time = params.get('collapse_time', None) - if collapse_time: - cdx_iter = cdx_collapse_time_status(cdx_iter, collapse_time) - - limit = int(params.get('limit', 1000000)) - - reverse = params.get('reverse', False) - if reverse: - cdx_iter = cdx_reverse(cdx_iter, limit) - - closest_to = params.get('closest', None) - if closest_to: - cdx_iter = cdx_sort_closest(closest_to, cdx_iter, limit) - - if limit: - cdx_iter = cdx_limit(cdx_iter, limit) + #cdx_iter = cdx_load_all(source, params) + cdx_iter = cdx_load_with_perms(source, params, perms_checker) # output raw cdx objects if params.get('output') == 'raw': @@ -60,6 +41,69 @@ def cdx_load(sources, params): return write_cdx(params.get('fields')) +#================================================================= +def cdx_load_with_perms(source, params, perms_checker): + if not perms_checker.allow_url(params['url']): + if params.get('matchType', 'exact') == 'exact': + yield + + cdx_iter = cdx_load_all(source, params) + + for cdx in cdx_iter: + if not perms_checker.allow_url_timestamp(cdx['original'], + cdx['timestamp']): + continue + + cdx = perms_checker.filter_fields(cdx) + + yield cdx + + +#================================================================= +def cdx_text_out(cdx, fields): + if not fields: + return str(cdx) + else: + return ' '.join(map(lambda x: cdx[x], fields.split(','))) + + +#================================================================= +def cdx_load_all(sources, params): + cdx_iter = load_cdx_streams(sources, params) + + cdx_iter = make_cdx_iter(cdx_iter) + + if params.get('proxy_all'): + return cdx_iter + + resolve_revisits = params.get('resolve_revisits', False) + if resolve_revisits: + cdx_iter = cdx_resolve_revisits(cdx_iter) + + filters = params.get('filter', None) + if filters: + cdx_iter = cdx_filter(cdx_iter, filters) + + collapse_time = params.get('collapse_time', None) + if collapse_time: + cdx_iter = cdx_collapse_time_status(cdx_iter, collapse_time) + + limit = int(params.get('limit', 1000000)) + + reverse = params.get('reverse', False) + if reverse: + cdx_iter = cdx_reverse(cdx_iter, limit) + + closest_to = params.get('closest', None) + if closest_to: + cdx_iter = cdx_sort_closest(closest_to, cdx_iter, limit) + + if limit: + cdx_iter = cdx_limit(cdx_iter, limit) + + return cdx_iter + + #================================================================= # load and source merge cdx streams def load_cdx_streams(sources, params): From ff428ed43e668f7a127c5dcc70293e89755f576f Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 19 Feb 2014 20:20:31 -0800 Subject: [PATCH 2/2] exclusions: add AllAllowPerms and refactor exclusions interface add TestExclusionPerms and a sample exclusion integration test refactor cdx server init params into **kwargs convert all cdx params to use camelCase --- config.yaml | 4 +++ pywb/cdx/cdxops.py | 45 ++++++++++------------------ pywb/cdx/cdxserver.py | 53 +++++++++++---------------------- pywb/cdx/cdxsource.py | 6 ++-- pywb/cdx/perms.py | 30 +++++++++++++++++++ pywb/cdx/test/cdxserver_test.py | 30 +++++++++---------- pywb/indexreader.py | 2 +- pywb/pywb_init.py | 10 +++---- test_config.yaml | 5 +++- tests/test_integration.py | 50 +++++++++++++++++++++++++++---- 10 files changed, 138 insertions(+), 97 deletions(-) create mode 100644 pywb/cdx/perms.py diff --git a/config.yaml b/config.yaml index 8891f756..67f5dc71 100644 --- a/config.yaml +++ b/config.yaml @@ -97,3 +97,7 @@ enable_cdx_api: true # custom rules for domain specific matching # set to false to disable #domain_specific_rules: rules.yaml + +# Permissions checker +#perms_checker: !!python/object/new:pywb.cdx.perms.AllowAllPerms [] + diff --git a/pywb/cdx/cdxops.py b/pywb/cdx/cdxops.py index 99798e9b..0dbd97eb 100644 --- a/pywb/cdx/cdxops.py +++ b/pywb/cdx/cdxops.py @@ -1,4 +1,4 @@ -from cdxobject import CDXObject +from cdxobject import CDXObject, AccessException from pywb.utils.timeutils import timestamp_to_sec import bisect @@ -10,25 +10,11 @@ from collections import deque #================================================================= -class AllowAllPerms: - """ - Sample Perm Checker which allows all - """ - def allow_url(self, url): - return True - - def allow_url_timestamp(self, url, timestamp): - return True - - def filter_fields(self, cdx): - return cdx - - -#================================================================= -def cdx_load(source, params, perms_checker = AllowAllPerms()): - - #cdx_iter = cdx_load_all(source, params) - cdx_iter = cdx_load_with_perms(source, params, perms_checker) +def cdx_load(sources, params, perms_checker=None): + if perms_checker: + cdx_iter = cdx_load_with_perms(sources, params, perms_checker) + else: + cdx_iter = cdx_load_and_filter(sources, params) # output raw cdx objects if params.get('output') == 'raw': @@ -42,16 +28,15 @@ def cdx_load(source, params, perms_checker = AllowAllPerms()): #================================================================= -def cdx_load_with_perms(source, params, perms_checker): - if not perms_checker.allow_url(params['url']): +def cdx_load_with_perms(sources, params, perms_checker): + if not perms_checker.allow_url_lookup(params['key'], params['url']): if params.get('matchType', 'exact') == 'exact': - yield + raise AccessException('Excluded') - cdx_iter = cdx_load_all(source, params) + cdx_iter = cdx_load_and_filter(sources, params) for cdx in cdx_iter: - if not perms_checker.allow_url_timestamp(cdx['original'], - cdx['timestamp']): + if not perms_checker.allow_capture(cdx): continue cdx = perms_checker.filter_fields(cdx) @@ -68,7 +53,7 @@ def cdx_text_out(cdx, fields): #================================================================= -def cdx_load_all(sources, params): +def cdx_load_and_filter(sources, params): cdx_iter = load_cdx_streams(sources, params) cdx_iter = make_cdx_iter(cdx_iter) @@ -76,7 +61,7 @@ def cdx_load_all(sources, params): if params.get('proxy_all'): return cdx_iter - resolve_revisits = params.get('resolve_revisits', False) + resolve_revisits = params.get('resolveRevisits', False) if resolve_revisits: cdx_iter = cdx_resolve_revisits(cdx_iter) @@ -84,13 +69,13 @@ def cdx_load_all(sources, params): if filters: cdx_iter = cdx_filter(cdx_iter, filters) - collapse_time = params.get('collapse_time', None) + collapse_time = params.get('collapseTime', None) if collapse_time: cdx_iter = cdx_collapse_time_status(cdx_iter, collapse_time) limit = int(params.get('limit', 1000000)) - reverse = params.get('reverse', False) + reverse = params.get('reverse', False) or params.get('sort') == 'reverse' if reverse: cdx_iter = cdx_reverse(cdx_iter, limit) diff --git a/pywb/cdx/cdxserver.py b/pywb/cdx/cdxserver.py index 17d16314..69f19d21 100644 --- a/pywb/cdx/cdxserver.py +++ b/pywb/cdx/cdxserver.py @@ -13,9 +13,10 @@ import urlparse #================================================================= class BaseCDXServer(object): - def __init__(self, url_canon=None, fuzzy_query=None): - self.url_canon = url_canon if url_canon else UrlCanonicalizer() - self.fuzzy_query = fuzzy_query + def __init__(self, **kwargs): + self.url_canon = kwargs.get('url_canon', UrlCanonicalizer()) + self.fuzzy_query = kwargs.get('fuzzy_query') + self.perms_checker = kwargs.get('perms_checker') def _check_cdx_iter(self, cdx_iter, params): """ Check cdx iter semantics @@ -31,13 +32,13 @@ class BaseCDXServer(object): url = params['url'] - if self.fuzzy_query and params.get('allow_fuzzy'): + if self.fuzzy_query and params.get('allowFuzzy'): if not 'key' in params: params['key'] = self.url_canon(url) params = self.fuzzy_query(params) if params: - params['allow_fuzzy'] = False + params['allowFuzzy'] = False return self.load_cdx(**params) msg = 'No Captures found for: ' + url @@ -63,8 +64,8 @@ class CDXServer(BaseCDXServer): responds to queries and dispatches to the cdx ops for processing """ - def __init__(self, paths, url_canon=None, fuzzy_query=None): - super(CDXServer, self).__init__(url_canon, fuzzy_query) + def __init__(self, paths, **kwargs): + super(CDXServer, self).__init__(**kwargs) self.sources = create_cdx_sources(paths) def load_cdx(self, **params): @@ -78,9 +79,7 @@ class CDXServer(BaseCDXServer): params['key'] = self.url_canon(url) - convert_old_style_params(params) - - cdx_iter = cdx_load(self.sources, params) + cdx_iter = cdx_load(self.sources, params, self.perms_checker) return self._check_cdx_iter(cdx_iter, params) @@ -95,8 +94,8 @@ class RemoteCDXServer(BaseCDXServer): It simply proxies the query params to the remote source and performs no local processing/filtering """ - def __init__(self, source, url_canon=None, fuzzy_query=None): - super(RemoteCDXServer, self).__init__(url_canon, fuzzy_query) + def __init__(self, source, **kwargs): + super(RemoteCDXServer, self).__init__(**kwargs) if isinstance(source, RemoteCDXSource): self.source = source @@ -124,9 +123,11 @@ def create_cdx_server(config, ds_rules_file=None): if hasattr(config, 'get'): paths = config.get('index_paths') surt_ordered = config.get('surt_ordered', True) + perms_checker = config.get('perms_checker') else: paths = config surt_ordered = True + perms_checker = None logging.debug('CDX Surt-Ordered? ' + str(surt_ordered)) @@ -145,7 +146,10 @@ def create_cdx_server(config, ds_rules_file=None): else: server_cls = CDXServer - return server_cls(paths, url_canon=canon, fuzzy_query=fuzzy) + return server_cls(paths, + url_canon=canon, + fuzzy_query=fuzzy, + perms_checker=perms_checker) #================================================================= @@ -198,29 +202,6 @@ def create_cdx_source(filename): # return RedisCDXSource(filename) -#================================================================= -def convert_old_style_params(params): - """ - Convert old-style CDX Server param semantics - """ - param = params.get('collapseTime') - if param: - params['collapse_time'] = param - - param = params.get('matchType') - if param: - params['match_type'] = param - - param = params.get('resolveRevisits') - if param: - params['resolve_revisits'] = param - - if params.get('sort') == 'reverse': - params['reverse'] = True - - return params - - #================================================================= def extract_params_from_wsgi_env(env): """ utility function to extract params from the query diff --git a/pywb/cdx/cdxsource.py b/pywb/cdx/cdxsource.py index addd60f7..a8c92be5 100644 --- a/pywb/cdx/cdxsource.py +++ b/pywb/cdx/cdxsource.py @@ -25,7 +25,7 @@ class CDXFile(CDXSource): def load_cdx(self, params): source = SeekableTextFileReader(self.filename) - match_type = params.get('match_type') + match_type = params.get('matchType') if match_type == 'prefix': iter_func = iter_prefix @@ -56,12 +56,12 @@ class RemoteCDXSource(CDXSource): def load_cdx(self, proxy_params): if self.proxy_all: params = proxy_params - params['proxy_all'] = True + params['proxyAll'] = True else: # Only send url and matchType params to remote params = {} params['url'] = proxy_params['url'] - match_type = proxy_params.get('match_type') + match_type = proxy_params.get('matchType') if match_type: proxy_params['matchType'] = match_type diff --git a/pywb/cdx/perms.py b/pywb/cdx/perms.py new file mode 100644 index 00000000..a7b90eb4 --- /dev/null +++ b/pywb/cdx/perms.py @@ -0,0 +1,30 @@ + + +#================================================================= +class AllowAllPerms: + """ + Sample Perm Checker which allows all + """ + def allow_url_lookup(self, urlkey, url): + """ + Return true/false if url or urlkey (canonicalized url) + should be allowed + """ + return True + + def allow_capture(self, cdx): + """ + Return true/false is specified capture (cdx) should be + allowed + """ + return True + + def filter_fields(self, cdx): + """ + Filter out any forbidden cdx fields from cdx dictionary + """ + return cdx + + +#================================================================= +#TODO: other types of perm handlers diff --git a/pywb/cdx/test/cdxserver_test.py b/pywb/cdx/test/cdxserver_test.py index fc96acb2..2d023729 100644 --- a/pywb/cdx/test/cdxserver_test.py +++ b/pywb/cdx/test/cdxserver_test.py @@ -15,22 +15,22 @@ org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org # Reverse CDX Stream ->>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', reverse = True, resolve_revisits = True, limit = 3) +>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', reverse = True, resolveRevisits = True, limit = 3) org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201308 https://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 783712 iana.warc.gz 117166 198285 iana.warc.gz org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201249 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 771773 iana.warc.gz 117166 198285 iana.warc.gz org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 757988 iana.warc.gz 117166 198285 iana.warc.gz ->>> cdx_ops_test('http://iana.org/_js/2013.1/jquery.js', reverse = True, resolve_revisits = True, limit = 1) +>>> cdx_ops_test('http://iana.org/_js/2013.1/jquery.js', reverse = True, resolveRevisits = True, limit = 1) org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jquery.js application/x-javascript 200 AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 543 778507 iana.warc.gz 33449 7311 iana.warc.gz # No matching results ->>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolve_revisits = True, limit = 2) +>>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolveRevisits = True, limit = 2) Traceback (most recent call last): CaptureNotFoundException: No Captures found for: http://iana.org/dont_have_this # Filter cdx (default: regex) ->>> cdx_ops_test(url = 'http://iana.org/domains', match_type = 'prefix', filter = ['mimetype:text/html']) +>>> cdx_ops_test(url = 'http://iana.org/domains', matchType = 'prefix', filter = ['mimetype:text/html']) org,iana)/domains 20140126200825 http://www.iana.org/domains text/html 200 7UPSCLNWNZP33LGW6OJGSF2Y4CDG4ES7 - - 2912 610534 iana.warc.gz org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz @@ -45,24 +45,24 @@ org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/s org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz # Filter exact ->>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], match_type = 'prefix', filter = '=urlkey:com,example)/?example=1') +>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '=urlkey:com,example)/?example=1') com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz # Filter exact invert ->>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], match_type = 'prefix', filter = '!=urlkey:com,example)/?example=1') +>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '!=urlkey:com,example)/?example=1') com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz # Collapse by timestamp # unresolved revisits, different statuscode results in an extra repeat ->>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapse_time = 11) +>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapseTime = 11) org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz org,iana)/_css/2013.1/screen.css 20140126200653 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 533 328367 iana.warc.gz org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz # resolved revisits ->>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapse_time = '11', resolve_revisits = True) +>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapseTime = '11', resolveRevisits = True) org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz - - - org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz 8754 41238 iana.warc.gz @@ -80,38 +80,38 @@ org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/ 20140126200654 20140126200625 ->>> cdx_ops_test(closest = '20140126201306', url = 'http://iana.org/dnssec', resolve_revisits = True, sources = [test_cdx_dir + 'dupes.cdx', test_cdx_dir + 'iana.cdx']) +>>> cdx_ops_test(closest = '20140126201306', url = 'http://iana.org/dnssec', resolveRevisits = True, sources = [test_cdx_dir + 'dupes.cdx', test_cdx_dir + 'iana.cdx']) org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - - org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - - ->>> cdx_ops_test(closest = '20140126201307', url = 'http://iana.org/dnssec', resolve_revisits = True) +>>> cdx_ops_test(closest = '20140126201307', url = 'http://iana.org/dnssec', resolveRevisits = True) org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - - org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - - # equal dist prefer earlier ->>> cdx_ops_test(closest = '20140126200700', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2) +>>> cdx_ops_test(closest = '20140126200700', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolveRevisits = True, limit = 2) org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz 117166 198285 iana.warc.gz org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz 117166 198285 iana.warc.gz ->>> cdx_ops_test(closest = '20140126200659', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2, fields = 'timestamp') +>>> cdx_ops_test(closest = '20140126200659', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolveRevisits = True, limit = 2, fields = 'timestamp') 20140126200654 20140126200706 ->>> cdx_ops_test(closest = '20140126200701', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2, fields = 'timestamp') +>>> cdx_ops_test(closest = '20140126200701', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolveRevisits = True, limit = 2, fields = 'timestamp') 20140126200706 20140126200654 # Resolve Revisits ->>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/inconsolata.otf', resolve_revisits = True) +>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/inconsolata.otf', resolveRevisits = True) org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 34054 620049 iana.warc.gz - - - org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200912 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 546 667073 iana.warc.gz 34054 620049 iana.warc.gz org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200930 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 534 697255 iana.warc.gz 34054 620049 iana.warc.gz org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 547 714833 iana.warc.gz 34054 620049 iana.warc.gz org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201249 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 551 768625 iana.warc.gz 34054 620049 iana.warc.gz ->>> cdx_ops_test('http://iana.org/domains/root/db', resolve_revisits = True) +>>> cdx_ops_test('http://iana.org/domains/root/db', resolveRevisits = True) org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz - - - org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz - - - diff --git a/pywb/indexreader.py b/pywb/indexreader.py index 493c1bbd..b55de029 100644 --- a/pywb/indexreader.py +++ b/pywb/indexreader.py @@ -28,7 +28,7 @@ class IndexReader(object): if wbrequest.custom_params: params.update(wbrequest.custom_params) - params['allow_fuzzy'] = True + params['allowFuzzy'] = True cdxlines = self.load_cdx(url=wburl.url, output='raw', **params) diff --git a/pywb/pywb_init.py b/pywb/pywb_init.py index c4b40ee2..be4bdded 100644 --- a/pywb/pywb_init.py +++ b/pywb/pywb_init.py @@ -53,14 +53,12 @@ def pywb_config_manual(passed_config = {}): for name, value in collections.iteritems(): if isinstance(value, str): - route_config = config - cdx_config = value - else: - route_config = DictChain(value, config) - cdx_config = route_config + value = {'index_paths': value} + + route_config = DictChain(value, config) ds_rules = route_config.get('domain_specific_rules', None) - cdx_server = IndexReader(cdx_config, ds_rules) + cdx_server = IndexReader(route_config, ds_rules) wb_handler = config_utils.create_wb_handler( cdx_server = cdx_server, diff --git a/test_config.yaml b/test_config.yaml index 04dfee37..8421aead 100644 --- a/test_config.yaml +++ b/test_config.yaml @@ -92,7 +92,10 @@ enable_cdx_api: true # optional reporter callback func # if set, called with request and cdx object -reporter_func: pywb.run-tests.print_reporter +reporter: !!python/object/new:tests.test_integration.PrintReporter [] # custom rules for domain specific matching #domain_specific_rules: rules.yaml + +#perms_checker: !!python/object/new:pywb.cdx.perms.AllowAllPerms [] +perms_checker: !!python/object/new:tests.test_integration.TestExclusionPerms [] diff --git a/tests/test_integration.py b/tests/test_integration.py index 59b4fc36..ec7fd6bd 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -119,6 +119,12 @@ class TestWb: assert resp.content_type == 'text/css' + def test_excluded_content(self): + resp = self.testapp.get('/pywb/http://www.iana.org/_img/bookmark_icon.ico', status = 403) + assert resp.status_int == 403 + assert 'Excluded' in resp.body + + def test_static_content(self): resp = self.testapp.get('/static/test/route/wb.css') assert resp.status_int == 200 @@ -149,7 +155,7 @@ class TestWb: def test_cdx_server_advanced(self): # combine collapsing, reversing and revisit resolving - resp = self.testapp.get('/pywb-cdx?url=http://www.iana.org/_css/2013.1/print.css&collapse_time=11&resolve_revisits=true&reverse=true') + resp = self.testapp.get('/pywb-cdx?url=http://www.iana.org/_css/2013.1/print.css&collapseTime=11&resolveRevisits=true&reverse=true') # convert back to CDXObject cdxs = map(CDXObject, resp.body.rstrip().split('\n')) @@ -169,8 +175,42 @@ class TestWb: assert resp.status_int == 400 assert 'Invalid Url: http://?abc' in resp.body +#================================================================= # Reporter callback for replay view -def print_reporter(wbrequest, cdx, response): - print wbrequest - print cdx - pass +class PrintReporter: + def __call__(self, wbrequest, cdx, response): + print wbrequest + print cdx + pass + +#================================================================= +class TestExclusionPerms: + """ + Sample Perm Checker which allows all + """ + def allow_url_lookup(self, urlkey, url): + """ + Return true/false if url or urlkey (canonicalized url) + should be allowed + """ + print urlkey + if urlkey == 'org,iana)/_img/bookmark_icon.ico': + return False + + return True + + def allow_capture(self, cdx): + """ + Return true/false is specified capture (cdx) should be + allowed + """ + return True + + def filter_fields(self, cdx): + """ + Filter out any forbidden cdx fields from cdx dictionary + """ + return cdx + + +