mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
exclusions: add AllowAllPerms and refactor exclusions interface
add TestExclusionPerms and a sample exclusion integration test; refactor cdx server init params into **kwargs; convert all cdx params to use camelCase
This commit is contained in:
parent
be284859be
commit
ff428ed43e
@ -97,3 +97,7 @@ enable_cdx_api: true
|
|||||||
# custom rules for domain specific matching
|
# custom rules for domain specific matching
|
||||||
# set to false to disable
|
# set to false to disable
|
||||||
#domain_specific_rules: rules.yaml
|
#domain_specific_rules: rules.yaml
|
||||||
|
|
||||||
|
# Permissions checker
|
||||||
|
#perms_checker: !!python/object/new:pywb.cdx.perms.AllowAllPerms []
|
||||||
|
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
from cdxobject import CDXObject
|
from cdxobject import CDXObject, AccessException
|
||||||
from pywb.utils.timeutils import timestamp_to_sec
|
from pywb.utils.timeutils import timestamp_to_sec
|
||||||
|
|
||||||
import bisect
|
import bisect
|
||||||
@ -10,25 +10,11 @@ from collections import deque
|
|||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class AllowAllPerms:
|
def cdx_load(sources, params, perms_checker=None):
|
||||||
"""
|
if perms_checker:
|
||||||
Sample Perm Checker which allows all
|
cdx_iter = cdx_load_with_perms(sources, params, perms_checker)
|
||||||
"""
|
else:
|
||||||
def allow_url(self, url):
|
cdx_iter = cdx_load_and_filter(sources, params)
|
||||||
return True
|
|
||||||
|
|
||||||
def allow_url_timestamp(self, url, timestamp):
|
|
||||||
return True
|
|
||||||
|
|
||||||
def filter_fields(self, cdx):
|
|
||||||
return cdx
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
def cdx_load(source, params, perms_checker = AllowAllPerms()):
|
|
||||||
|
|
||||||
#cdx_iter = cdx_load_all(source, params)
|
|
||||||
cdx_iter = cdx_load_with_perms(source, params, perms_checker)
|
|
||||||
|
|
||||||
# output raw cdx objects
|
# output raw cdx objects
|
||||||
if params.get('output') == 'raw':
|
if params.get('output') == 'raw':
|
||||||
@ -42,16 +28,15 @@ def cdx_load(source, params, perms_checker = AllowAllPerms()):
|
|||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
def cdx_load_with_perms(source, params, perms_checker):
|
def cdx_load_with_perms(sources, params, perms_checker):
|
||||||
if not perms_checker.allow_url(params['url']):
|
if not perms_checker.allow_url_lookup(params['key'], params['url']):
|
||||||
if params.get('matchType', 'exact') == 'exact':
|
if params.get('matchType', 'exact') == 'exact':
|
||||||
yield
|
raise AccessException('Excluded')
|
||||||
|
|
||||||
cdx_iter = cdx_load_all(source, params)
|
cdx_iter = cdx_load_and_filter(sources, params)
|
||||||
|
|
||||||
for cdx in cdx_iter:
|
for cdx in cdx_iter:
|
||||||
if not perms_checker.allow_url_timestamp(cdx['original'],
|
if not perms_checker.allow_capture(cdx):
|
||||||
cdx['timestamp']):
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
cdx = perms_checker.filter_fields(cdx)
|
cdx = perms_checker.filter_fields(cdx)
|
||||||
@ -68,7 +53,7 @@ def cdx_text_out(cdx, fields):
|
|||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
def cdx_load_all(sources, params):
|
def cdx_load_and_filter(sources, params):
|
||||||
cdx_iter = load_cdx_streams(sources, params)
|
cdx_iter = load_cdx_streams(sources, params)
|
||||||
|
|
||||||
cdx_iter = make_cdx_iter(cdx_iter)
|
cdx_iter = make_cdx_iter(cdx_iter)
|
||||||
@ -76,7 +61,7 @@ def cdx_load_all(sources, params):
|
|||||||
if params.get('proxy_all'):
|
if params.get('proxy_all'):
|
||||||
return cdx_iter
|
return cdx_iter
|
||||||
|
|
||||||
resolve_revisits = params.get('resolve_revisits', False)
|
resolve_revisits = params.get('resolveRevisits', False)
|
||||||
if resolve_revisits:
|
if resolve_revisits:
|
||||||
cdx_iter = cdx_resolve_revisits(cdx_iter)
|
cdx_iter = cdx_resolve_revisits(cdx_iter)
|
||||||
|
|
||||||
@ -84,13 +69,13 @@ def cdx_load_all(sources, params):
|
|||||||
if filters:
|
if filters:
|
||||||
cdx_iter = cdx_filter(cdx_iter, filters)
|
cdx_iter = cdx_filter(cdx_iter, filters)
|
||||||
|
|
||||||
collapse_time = params.get('collapse_time', None)
|
collapse_time = params.get('collapseTime', None)
|
||||||
if collapse_time:
|
if collapse_time:
|
||||||
cdx_iter = cdx_collapse_time_status(cdx_iter, collapse_time)
|
cdx_iter = cdx_collapse_time_status(cdx_iter, collapse_time)
|
||||||
|
|
||||||
limit = int(params.get('limit', 1000000))
|
limit = int(params.get('limit', 1000000))
|
||||||
|
|
||||||
reverse = params.get('reverse', False)
|
reverse = params.get('reverse', False) or params.get('sort') == 'reverse'
|
||||||
if reverse:
|
if reverse:
|
||||||
cdx_iter = cdx_reverse(cdx_iter, limit)
|
cdx_iter = cdx_reverse(cdx_iter, limit)
|
||||||
|
|
||||||
|
@ -13,9 +13,10 @@ import urlparse
|
|||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class BaseCDXServer(object):
|
class BaseCDXServer(object):
|
||||||
def __init__(self, url_canon=None, fuzzy_query=None):
|
def __init__(self, **kwargs):
|
||||||
self.url_canon = url_canon if url_canon else UrlCanonicalizer()
|
self.url_canon = kwargs.get('url_canon', UrlCanonicalizer())
|
||||||
self.fuzzy_query = fuzzy_query
|
self.fuzzy_query = kwargs.get('fuzzy_query')
|
||||||
|
self.perms_checker = kwargs.get('perms_checker')
|
||||||
|
|
||||||
def _check_cdx_iter(self, cdx_iter, params):
|
def _check_cdx_iter(self, cdx_iter, params):
|
||||||
""" Check cdx iter semantics
|
""" Check cdx iter semantics
|
||||||
@ -31,13 +32,13 @@ class BaseCDXServer(object):
|
|||||||
|
|
||||||
url = params['url']
|
url = params['url']
|
||||||
|
|
||||||
if self.fuzzy_query and params.get('allow_fuzzy'):
|
if self.fuzzy_query and params.get('allowFuzzy'):
|
||||||
if not 'key' in params:
|
if not 'key' in params:
|
||||||
params['key'] = self.url_canon(url)
|
params['key'] = self.url_canon(url)
|
||||||
|
|
||||||
params = self.fuzzy_query(params)
|
params = self.fuzzy_query(params)
|
||||||
if params:
|
if params:
|
||||||
params['allow_fuzzy'] = False
|
params['allowFuzzy'] = False
|
||||||
return self.load_cdx(**params)
|
return self.load_cdx(**params)
|
||||||
|
|
||||||
msg = 'No Captures found for: ' + url
|
msg = 'No Captures found for: ' + url
|
||||||
@ -63,8 +64,8 @@ class CDXServer(BaseCDXServer):
|
|||||||
responds to queries and dispatches to the cdx ops for processing
|
responds to queries and dispatches to the cdx ops for processing
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, paths, url_canon=None, fuzzy_query=None):
|
def __init__(self, paths, **kwargs):
|
||||||
super(CDXServer, self).__init__(url_canon, fuzzy_query)
|
super(CDXServer, self).__init__(**kwargs)
|
||||||
self.sources = create_cdx_sources(paths)
|
self.sources = create_cdx_sources(paths)
|
||||||
|
|
||||||
def load_cdx(self, **params):
|
def load_cdx(self, **params):
|
||||||
@ -78,9 +79,7 @@ class CDXServer(BaseCDXServer):
|
|||||||
|
|
||||||
params['key'] = self.url_canon(url)
|
params['key'] = self.url_canon(url)
|
||||||
|
|
||||||
convert_old_style_params(params)
|
cdx_iter = cdx_load(self.sources, params, self.perms_checker)
|
||||||
|
|
||||||
cdx_iter = cdx_load(self.sources, params)
|
|
||||||
|
|
||||||
return self._check_cdx_iter(cdx_iter, params)
|
return self._check_cdx_iter(cdx_iter, params)
|
||||||
|
|
||||||
@ -95,8 +94,8 @@ class RemoteCDXServer(BaseCDXServer):
|
|||||||
It simply proxies the query params to the remote source
|
It simply proxies the query params to the remote source
|
||||||
and performs no local processing/filtering
|
and performs no local processing/filtering
|
||||||
"""
|
"""
|
||||||
def __init__(self, source, url_canon=None, fuzzy_query=None):
|
def __init__(self, source, **kwargs):
|
||||||
super(RemoteCDXServer, self).__init__(url_canon, fuzzy_query)
|
super(RemoteCDXServer, self).__init__(**kwargs)
|
||||||
|
|
||||||
if isinstance(source, RemoteCDXSource):
|
if isinstance(source, RemoteCDXSource):
|
||||||
self.source = source
|
self.source = source
|
||||||
@ -124,9 +123,11 @@ def create_cdx_server(config, ds_rules_file=None):
|
|||||||
if hasattr(config, 'get'):
|
if hasattr(config, 'get'):
|
||||||
paths = config.get('index_paths')
|
paths = config.get('index_paths')
|
||||||
surt_ordered = config.get('surt_ordered', True)
|
surt_ordered = config.get('surt_ordered', True)
|
||||||
|
perms_checker = config.get('perms_checker')
|
||||||
else:
|
else:
|
||||||
paths = config
|
paths = config
|
||||||
surt_ordered = True
|
surt_ordered = True
|
||||||
|
perms_checker = None
|
||||||
|
|
||||||
logging.debug('CDX Surt-Ordered? ' + str(surt_ordered))
|
logging.debug('CDX Surt-Ordered? ' + str(surt_ordered))
|
||||||
|
|
||||||
@ -145,7 +146,10 @@ def create_cdx_server(config, ds_rules_file=None):
|
|||||||
else:
|
else:
|
||||||
server_cls = CDXServer
|
server_cls = CDXServer
|
||||||
|
|
||||||
return server_cls(paths, url_canon=canon, fuzzy_query=fuzzy)
|
return server_cls(paths,
|
||||||
|
url_canon=canon,
|
||||||
|
fuzzy_query=fuzzy,
|
||||||
|
perms_checker=perms_checker)
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
@ -198,29 +202,6 @@ def create_cdx_source(filename):
|
|||||||
# return RedisCDXSource(filename)
|
# return RedisCDXSource(filename)
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
def convert_old_style_params(params):
|
|
||||||
"""
|
|
||||||
Convert old-style CDX Server param semantics
|
|
||||||
"""
|
|
||||||
param = params.get('collapseTime')
|
|
||||||
if param:
|
|
||||||
params['collapse_time'] = param
|
|
||||||
|
|
||||||
param = params.get('matchType')
|
|
||||||
if param:
|
|
||||||
params['match_type'] = param
|
|
||||||
|
|
||||||
param = params.get('resolveRevisits')
|
|
||||||
if param:
|
|
||||||
params['resolve_revisits'] = param
|
|
||||||
|
|
||||||
if params.get('sort') == 'reverse':
|
|
||||||
params['reverse'] = True
|
|
||||||
|
|
||||||
return params
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
def extract_params_from_wsgi_env(env):
|
def extract_params_from_wsgi_env(env):
|
||||||
""" utility function to extract params from the query
|
""" utility function to extract params from the query
|
||||||
|
@ -25,7 +25,7 @@ class CDXFile(CDXSource):
|
|||||||
def load_cdx(self, params):
|
def load_cdx(self, params):
|
||||||
source = SeekableTextFileReader(self.filename)
|
source = SeekableTextFileReader(self.filename)
|
||||||
|
|
||||||
match_type = params.get('match_type')
|
match_type = params.get('matchType')
|
||||||
|
|
||||||
if match_type == 'prefix':
|
if match_type == 'prefix':
|
||||||
iter_func = iter_prefix
|
iter_func = iter_prefix
|
||||||
@ -56,12 +56,12 @@ class RemoteCDXSource(CDXSource):
|
|||||||
def load_cdx(self, proxy_params):
|
def load_cdx(self, proxy_params):
|
||||||
if self.proxy_all:
|
if self.proxy_all:
|
||||||
params = proxy_params
|
params = proxy_params
|
||||||
params['proxy_all'] = True
|
params['proxyAll'] = True
|
||||||
else:
|
else:
|
||||||
# Only send url and matchType params to remote
|
# Only send url and matchType params to remote
|
||||||
params = {}
|
params = {}
|
||||||
params['url'] = proxy_params['url']
|
params['url'] = proxy_params['url']
|
||||||
match_type = proxy_params.get('match_type')
|
match_type = proxy_params.get('matchType')
|
||||||
|
|
||||||
if match_type:
|
if match_type:
|
||||||
proxy_params['matchType'] = match_type
|
proxy_params['matchType'] = match_type
|
||||||
|
30
pywb/cdx/perms.py
Normal file
30
pywb/cdx/perms.py
Normal file
@ -0,0 +1,30 @@
|
|||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
class AllowAllPerms(object):
    """
    Sample permissions checker which allows everything.

    It implements the three hooks a perms checker is expected to
    provide (url lookup check, per-capture check, field filtering)
    and never excludes or alters anything.
    """
    def allow_url_lookup(self, urlkey, url):
        """
        Return True/False if a lookup of the url or urlkey
        (canonicalized url) should be allowed.

        This implementation always returns True.
        """
        return True

    def allow_capture(self, cdx):
        """
        Return True/False if the specified capture (cdx) should be
        allowed.

        This implementation always returns True.
        """
        return True

    def filter_fields(self, cdx):
        """
        Filter out any forbidden cdx fields from the cdx dictionary.

        This implementation returns the same cdx object, unmodified.
        """
        return cdx
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
#TODO: other types of perm handlers
|
@ -15,22 +15,22 @@ org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org
|
|||||||
|
|
||||||
|
|
||||||
# Reverse CDX Stream
|
# Reverse CDX Stream
|
||||||
>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', reverse = True, resolve_revisits = True, limit = 3)
|
>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', reverse = True, resolveRevisits = True, limit = 3)
|
||||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201308 https://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 783712 iana.warc.gz 117166 198285 iana.warc.gz
|
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201308 https://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 783712 iana.warc.gz 117166 198285 iana.warc.gz
|
||||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201249 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 771773 iana.warc.gz 117166 198285 iana.warc.gz
|
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201249 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 771773 iana.warc.gz 117166 198285 iana.warc.gz
|
||||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 757988 iana.warc.gz 117166 198285 iana.warc.gz
|
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 757988 iana.warc.gz 117166 198285 iana.warc.gz
|
||||||
|
|
||||||
>>> cdx_ops_test('http://iana.org/_js/2013.1/jquery.js', reverse = True, resolve_revisits = True, limit = 1)
|
>>> cdx_ops_test('http://iana.org/_js/2013.1/jquery.js', reverse = True, resolveRevisits = True, limit = 1)
|
||||||
org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jquery.js application/x-javascript 200 AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 543 778507 iana.warc.gz 33449 7311 iana.warc.gz
|
org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jquery.js application/x-javascript 200 AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 543 778507 iana.warc.gz 33449 7311 iana.warc.gz
|
||||||
|
|
||||||
# No matching results
|
# No matching results
|
||||||
>>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolve_revisits = True, limit = 2)
|
>>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolveRevisits = True, limit = 2)
|
||||||
Traceback (most recent call last):
|
Traceback (most recent call last):
|
||||||
CaptureNotFoundException: No Captures found for: http://iana.org/dont_have_this
|
CaptureNotFoundException: No Captures found for: http://iana.org/dont_have_this
|
||||||
|
|
||||||
|
|
||||||
# Filter cdx (default: regex)
|
# Filter cdx (default: regex)
|
||||||
>>> cdx_ops_test(url = 'http://iana.org/domains', match_type = 'prefix', filter = ['mimetype:text/html'])
|
>>> cdx_ops_test(url = 'http://iana.org/domains', matchType = 'prefix', filter = ['mimetype:text/html'])
|
||||||
org,iana)/domains 20140126200825 http://www.iana.org/domains text/html 200 7UPSCLNWNZP33LGW6OJGSF2Y4CDG4ES7 - - 2912 610534 iana.warc.gz
|
org,iana)/domains 20140126200825 http://www.iana.org/domains text/html 200 7UPSCLNWNZP33LGW6OJGSF2Y4CDG4ES7 - - 2912 610534 iana.warc.gz
|
||||||
org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz
|
org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz
|
||||||
org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz
|
org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz
|
||||||
@ -45,24 +45,24 @@ org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/s
|
|||||||
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
|
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
|
||||||
|
|
||||||
# Filter exact
|
# Filter exact
|
||||||
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], match_type = 'prefix', filter = '=urlkey:com,example)/?example=1')
|
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '=urlkey:com,example)/?example=1')
|
||||||
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
|
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
|
||||||
com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz
|
com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz
|
||||||
|
|
||||||
# Filter exact invert
|
# Filter exact invert
|
||||||
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], match_type = 'prefix', filter = '!=urlkey:com,example)/?example=1')
|
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '!=urlkey:com,example)/?example=1')
|
||||||
com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
|
com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
|
||||||
com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz
|
com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz
|
||||||
|
|
||||||
# Collapse by timestamp
|
# Collapse by timestamp
|
||||||
# unresolved revisits, different statuscode results in an extra repeat
|
# unresolved revisits, different statuscode results in an extra repeat
|
||||||
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapse_time = 11)
|
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapseTime = 11)
|
||||||
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
|
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
|
||||||
org,iana)/_css/2013.1/screen.css 20140126200653 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 533 328367 iana.warc.gz
|
org,iana)/_css/2013.1/screen.css 20140126200653 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 533 328367 iana.warc.gz
|
||||||
org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz
|
org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz
|
||||||
|
|
||||||
# resolved revisits
|
# resolved revisits
|
||||||
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapse_time = '11', resolve_revisits = True)
|
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapseTime = '11', resolveRevisits = True)
|
||||||
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz - - -
|
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz - - -
|
||||||
org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz 8754 41238 iana.warc.gz
|
org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz 8754 41238 iana.warc.gz
|
||||||
|
|
||||||
@ -80,38 +80,38 @@ org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/
|
|||||||
20140126200654
|
20140126200654
|
||||||
20140126200625
|
20140126200625
|
||||||
|
|
||||||
>>> cdx_ops_test(closest = '20140126201306', url = 'http://iana.org/dnssec', resolve_revisits = True, sources = [test_cdx_dir + 'dupes.cdx', test_cdx_dir + 'iana.cdx'])
|
>>> cdx_ops_test(closest = '20140126201306', url = 'http://iana.org/dnssec', resolveRevisits = True, sources = [test_cdx_dir + 'dupes.cdx', test_cdx_dir + 'iana.cdx'])
|
||||||
org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - -
|
org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - -
|
||||||
org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - -
|
org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - -
|
||||||
|
|
||||||
|
|
||||||
>>> cdx_ops_test(closest = '20140126201307', url = 'http://iana.org/dnssec', resolve_revisits = True)
|
>>> cdx_ops_test(closest = '20140126201307', url = 'http://iana.org/dnssec', resolveRevisits = True)
|
||||||
org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - -
|
org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - -
|
||||||
org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - -
|
org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - -
|
||||||
|
|
||||||
# equal dist prefer earlier
|
# equal dist prefer earlier
|
||||||
>>> cdx_ops_test(closest = '20140126200700', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2)
|
>>> cdx_ops_test(closest = '20140126200700', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolveRevisits = True, limit = 2)
|
||||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz 117166 198285 iana.warc.gz
|
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz 117166 198285 iana.warc.gz
|
||||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz 117166 198285 iana.warc.gz
|
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz 117166 198285 iana.warc.gz
|
||||||
|
|
||||||
>>> cdx_ops_test(closest = '20140126200659', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2, fields = 'timestamp')
|
>>> cdx_ops_test(closest = '20140126200659', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolveRevisits = True, limit = 2, fields = 'timestamp')
|
||||||
20140126200654
|
20140126200654
|
||||||
20140126200706
|
20140126200706
|
||||||
|
|
||||||
>>> cdx_ops_test(closest = '20140126200701', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2, fields = 'timestamp')
|
>>> cdx_ops_test(closest = '20140126200701', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolveRevisits = True, limit = 2, fields = 'timestamp')
|
||||||
20140126200706
|
20140126200706
|
||||||
20140126200654
|
20140126200654
|
||||||
|
|
||||||
|
|
||||||
# Resolve Revisits
|
# Resolve Revisits
|
||||||
>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/inconsolata.otf', resolve_revisits = True)
|
>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/inconsolata.otf', resolveRevisits = True)
|
||||||
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 34054 620049 iana.warc.gz - - -
|
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 34054 620049 iana.warc.gz - - -
|
||||||
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200912 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 546 667073 iana.warc.gz 34054 620049 iana.warc.gz
|
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200912 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 546 667073 iana.warc.gz 34054 620049 iana.warc.gz
|
||||||
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200930 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 534 697255 iana.warc.gz 34054 620049 iana.warc.gz
|
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200930 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 534 697255 iana.warc.gz 34054 620049 iana.warc.gz
|
||||||
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 547 714833 iana.warc.gz 34054 620049 iana.warc.gz
|
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 547 714833 iana.warc.gz 34054 620049 iana.warc.gz
|
||||||
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201249 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 551 768625 iana.warc.gz 34054 620049 iana.warc.gz
|
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201249 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 551 768625 iana.warc.gz 34054 620049 iana.warc.gz
|
||||||
|
|
||||||
>>> cdx_ops_test('http://iana.org/domains/root/db', resolve_revisits = True)
|
>>> cdx_ops_test('http://iana.org/domains/root/db', resolveRevisits = True)
|
||||||
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz - - -
|
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz - - -
|
||||||
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz - - -
|
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz - - -
|
||||||
|
|
||||||
|
@ -28,7 +28,7 @@ class IndexReader(object):
|
|||||||
if wbrequest.custom_params:
|
if wbrequest.custom_params:
|
||||||
params.update(wbrequest.custom_params)
|
params.update(wbrequest.custom_params)
|
||||||
|
|
||||||
params['allow_fuzzy'] = True
|
params['allowFuzzy'] = True
|
||||||
|
|
||||||
cdxlines = self.load_cdx(url=wburl.url, output='raw', **params)
|
cdxlines = self.load_cdx(url=wburl.url, output='raw', **params)
|
||||||
|
|
||||||
|
@ -53,14 +53,12 @@ def pywb_config_manual(passed_config = {}):
|
|||||||
|
|
||||||
for name, value in collections.iteritems():
|
for name, value in collections.iteritems():
|
||||||
if isinstance(value, str):
|
if isinstance(value, str):
|
||||||
route_config = config
|
value = {'index_paths': value}
|
||||||
cdx_config = value
|
|
||||||
else:
|
route_config = DictChain(value, config)
|
||||||
route_config = DictChain(value, config)
|
|
||||||
cdx_config = route_config
|
|
||||||
|
|
||||||
ds_rules = route_config.get('domain_specific_rules', None)
|
ds_rules = route_config.get('domain_specific_rules', None)
|
||||||
cdx_server = IndexReader(cdx_config, ds_rules)
|
cdx_server = IndexReader(route_config, ds_rules)
|
||||||
|
|
||||||
wb_handler = config_utils.create_wb_handler(
|
wb_handler = config_utils.create_wb_handler(
|
||||||
cdx_server = cdx_server,
|
cdx_server = cdx_server,
|
||||||
|
@ -92,7 +92,10 @@ enable_cdx_api: true
|
|||||||
|
|
||||||
# optional reporter callback func
|
# optional reporter callback func
|
||||||
# if set, called with request and cdx object
|
# if set, called with request and cdx object
|
||||||
reporter_func: pywb.run-tests.print_reporter
|
reporter: !!python/object/new:tests.test_integration.PrintReporter []
|
||||||
|
|
||||||
# custom rules for domain specific matching
|
# custom rules for domain specific matching
|
||||||
#domain_specific_rules: rules.yaml
|
#domain_specific_rules: rules.yaml
|
||||||
|
|
||||||
|
#perms_checker: !!python/object/new:pywb.cdx.perms.AllowAllPerms []
|
||||||
|
perms_checker: !!python/object/new:tests.test_integration.TestExclusionPerms []
|
||||||
|
@ -119,6 +119,12 @@ class TestWb:
|
|||||||
assert resp.content_type == 'text/css'
|
assert resp.content_type == 'text/css'
|
||||||
|
|
||||||
|
|
||||||
|
def test_excluded_content(self):
|
||||||
|
resp = self.testapp.get('/pywb/http://www.iana.org/_img/bookmark_icon.ico', status = 403)
|
||||||
|
assert resp.status_int == 403
|
||||||
|
assert 'Excluded' in resp.body
|
||||||
|
|
||||||
|
|
||||||
def test_static_content(self):
|
def test_static_content(self):
|
||||||
resp = self.testapp.get('/static/test/route/wb.css')
|
resp = self.testapp.get('/static/test/route/wb.css')
|
||||||
assert resp.status_int == 200
|
assert resp.status_int == 200
|
||||||
@ -149,7 +155,7 @@ class TestWb:
|
|||||||
|
|
||||||
def test_cdx_server_advanced(self):
|
def test_cdx_server_advanced(self):
|
||||||
# combine collapsing, reversing and revisit resolving
|
# combine collapsing, reversing and revisit resolving
|
||||||
resp = self.testapp.get('/pywb-cdx?url=http://www.iana.org/_css/2013.1/print.css&collapse_time=11&resolve_revisits=true&reverse=true')
|
resp = self.testapp.get('/pywb-cdx?url=http://www.iana.org/_css/2013.1/print.css&collapseTime=11&resolveRevisits=true&reverse=true')
|
||||||
|
|
||||||
# convert back to CDXObject
|
# convert back to CDXObject
|
||||||
cdxs = map(CDXObject, resp.body.rstrip().split('\n'))
|
cdxs = map(CDXObject, resp.body.rstrip().split('\n'))
|
||||||
@ -169,8 +175,42 @@ class TestWb:
|
|||||||
assert resp.status_int == 400
|
assert resp.status_int == 400
|
||||||
assert 'Invalid Url: http://?abc' in resp.body
|
assert 'Invalid Url: http://?abc' in resp.body
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
# Reporter callback for replay view
|
# Reporter callback for replay view
|
||||||
def print_reporter(wbrequest, cdx, response):
|
class PrintReporter:
|
||||||
print wbrequest
|
def __call__(self, wbrequest, cdx, response):
|
||||||
print cdx
|
print wbrequest
|
||||||
pass
|
print cdx
|
||||||
|
pass
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
class TestExclusionPerms(object):
    """
    Sample permissions checker used by the integration tests.

    It excludes lookups of exactly one canonicalized url
    (EXCLUDED_URLKEY) and allows everything else, so the tests can
    exercise the exclusion path.
    """

    # the single canonicalized url (urlkey) whose lookup is denied
    EXCLUDED_URLKEY = 'org,iana)/_img/bookmark_icon.ico'

    def allow_url_lookup(self, urlkey, url):
        """
        Return True/False if a lookup of the url or urlkey
        (canonicalized url) should be allowed.

        Only the urlkey is checked: returns False when it equals
        EXCLUDED_URLKEY, True otherwise.
        """
        return urlkey != self.EXCLUDED_URLKEY

    def allow_capture(self, cdx):
        """
        Return True/False if the specified capture (cdx) should be
        allowed.

        This implementation always returns True.
        """
        return True

    def filter_fields(self, cdx):
        """
        Filter out any forbidden cdx fields from the cdx dictionary.

        This implementation returns the same cdx object, unmodified.
        """
        return cdx
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user