mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
Merge remote-tracking branch 'origin/perms-work' into cdx-server
This commit is contained in:
commit
0b768ce11a
@ -97,3 +97,7 @@ enable_cdx_api: true
|
||||
# custom rules for domain specific matching
|
||||
# set to false to disable
|
||||
#domain_specific_rules: rules.yaml
|
||||
|
||||
# Permissions checker
|
||||
#perms_checker: !!python/object/new:pywb.cdx.perms.AllowAllPerms []
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
from cdxobject import CDXObject
|
||||
from cdxobject import CDXObject, AccessException
|
||||
from pywb.utils.timeutils import timestamp_to_sec
|
||||
|
||||
import bisect
|
||||
@ -10,44 +10,11 @@ from collections import deque
|
||||
|
||||
|
||||
#=================================================================
|
||||
def cdx_text_out(cdx, fields):
|
||||
if not fields:
|
||||
return str(cdx)
|
||||
def cdx_load(sources, params, perms_checker=None):
|
||||
if perms_checker:
|
||||
cdx_iter = cdx_load_with_perms(sources, params, perms_checker)
|
||||
else:
|
||||
return ' '.join(map(lambda x: cdx[x], fields.split(',')))
|
||||
|
||||
|
||||
#=================================================================
|
||||
def cdx_load(sources, params):
|
||||
cdx_iter = load_cdx_streams(sources, params)
|
||||
|
||||
cdx_iter = make_cdx_iter(cdx_iter)
|
||||
|
||||
if not params.get('proxy_all'):
|
||||
resolve_revisits = params.get('resolve_revisits', False)
|
||||
if resolve_revisits:
|
||||
cdx_iter = cdx_resolve_revisits(cdx_iter)
|
||||
|
||||
filters = params.get('filter', None)
|
||||
if filters:
|
||||
cdx_iter = cdx_filter(cdx_iter, filters)
|
||||
|
||||
collapse_time = params.get('collapse_time', None)
|
||||
if collapse_time:
|
||||
cdx_iter = cdx_collapse_time_status(cdx_iter, collapse_time)
|
||||
|
||||
limit = int(params.get('limit', 1000000))
|
||||
|
||||
reverse = params.get('reverse', False)
|
||||
if reverse:
|
||||
cdx_iter = cdx_reverse(cdx_iter, limit)
|
||||
|
||||
closest_to = params.get('closest', None)
|
||||
if closest_to:
|
||||
cdx_iter = cdx_sort_closest(closest_to, cdx_iter, limit)
|
||||
|
||||
if limit:
|
||||
cdx_iter = cdx_limit(cdx_iter, limit)
|
||||
cdx_iter = cdx_load_and_filter(sources, params)
|
||||
|
||||
# output raw cdx objects
|
||||
if params.get('output') == 'raw':
|
||||
@ -60,6 +27,68 @@ def cdx_load(sources, params):
|
||||
return write_cdx(params.get('fields'))
|
||||
|
||||
|
||||
#=================================================================
|
||||
def cdx_load_with_perms(sources, params, perms_checker):
    """
    Generator which loads and filters cdx records, consulting
    ``perms_checker`` before and during iteration.

    sources       -- cdx sources, passed through to cdx_load_and_filter
    params        -- query params; 'key' and 'url' are required,
                     'matchType' defaults to 'exact'
    perms_checker -- object providing allow_url_lookup(urlkey, url),
                     allow_capture(cdx) and filter_fields(cdx)

    Raises AccessException('Excluded') if the url lookup is not allowed
    and the query is an exact match. As this is a generator, the check
    runs when iteration begins, not when the function is called.
    """
    lookup_allowed = perms_checker.allow_url_lookup(params['key'],
                                                    params['url'])

    # a disallowed url only aborts exact-match queries; other match
    # types continue and rely on the per-capture check below
    if not lookup_allowed and params.get('matchType', 'exact') == 'exact':
        raise AccessException('Excluded')

    for capture in cdx_load_and_filter(sources, params):
        if perms_checker.allow_capture(capture):
            # the checker may return a modified cdx with fields filtered
            yield perms_checker.filter_fields(capture)
|
||||
|
||||
|
||||
#=================================================================
|
||||
def cdx_text_out(cdx, fields):
    """
    Format a single cdx record as a line of text.

    cdx    -- cdx record, indexable by field name
    fields -- comma-separated list of field names to output, or an
              empty/None value to output the full record via str(cdx)

    Returns the selected field values joined by a single space.
    Whitespace around each field name is ignored, so
    'timestamp, url' selects the same fields as 'timestamp,url'.
    Raises KeyError if a requested field is not present in the record.
    """
    if not fields:
        return str(cdx)

    names = (name.strip() for name in fields.split(','))
    return ' '.join(cdx[name] for name in names)
|
||||
|
||||
|
||||
#=================================================================
|
||||
def cdx_load_and_filter(sources, params):
    """
    Load the cdx stream for ``sources`` and apply the query operations
    requested in ``params``, returning the resulting cdx iterator.

    Recognized params (all optional), applied in this order:
      proxyAll        -- if set, return the loaded stream with no
                         local processing at all
      resolveRevisits -- resolve revisit records
      filter          -- field filters
      collapseTime    -- collapse by timestamp/status
      limit           -- max number of results (default 1000000);
                         a value of 0 disables the final limit step
      reverse / sort  -- reverse order if 'reverse' is true or
                         'sort' == 'reverse'
      closest         -- sort by proximity to the given timestamp
    """
    cdx_iter = load_cdx_streams(sources, params)
    cdx_iter = make_cdx_iter(cdx_iter)

    # RemoteCDXSource.load_cdx sets 'proxyAll' when the remote server
    # handles the whole query. The older 'proxy_all' spelling is still
    # honoured so callers that set it keep working.
    if params.get('proxyAll') or params.get('proxy_all'):
        return cdx_iter

    if params.get('resolveRevisits', False):
        cdx_iter = cdx_resolve_revisits(cdx_iter)

    filters = params.get('filter', None)
    if filters:
        cdx_iter = cdx_filter(cdx_iter, filters)

    collapse_time = params.get('collapseTime', None)
    if collapse_time:
        cdx_iter = cdx_collapse_time_status(cdx_iter, collapse_time)

    limit = int(params.get('limit', 1000000))

    # both the boolean 'reverse' param and sort=reverse are accepted
    reverse = params.get('reverse', False) or params.get('sort') == 'reverse'
    if reverse:
        cdx_iter = cdx_reverse(cdx_iter, limit)

    closest_to = params.get('closest', None)
    if closest_to:
        cdx_iter = cdx_sort_closest(closest_to, cdx_iter, limit)

    if limit:
        cdx_iter = cdx_limit(cdx_iter, limit)

    return cdx_iter
|
||||
|
||||
|
||||
#=================================================================
|
||||
# load and source merge cdx streams
|
||||
def load_cdx_streams(sources, params):
|
||||
|
@ -13,9 +13,10 @@ import urlparse
|
||||
|
||||
#=================================================================
|
||||
class BaseCDXServer(object):
|
||||
def __init__(self, url_canon=None, fuzzy_query=None):
|
||||
self.url_canon = url_canon if url_canon else UrlCanonicalizer()
|
||||
self.fuzzy_query = fuzzy_query
|
||||
def __init__(self, **kwargs):
|
||||
self.url_canon = kwargs.get('url_canon', UrlCanonicalizer())
|
||||
self.fuzzy_query = kwargs.get('fuzzy_query')
|
||||
self.perms_checker = kwargs.get('perms_checker')
|
||||
|
||||
def _check_cdx_iter(self, cdx_iter, params):
|
||||
""" Check cdx iter semantics
|
||||
@ -31,13 +32,13 @@ class BaseCDXServer(object):
|
||||
|
||||
url = params['url']
|
||||
|
||||
if self.fuzzy_query and params.get('allow_fuzzy'):
|
||||
if self.fuzzy_query and params.get('allowFuzzy'):
|
||||
if not 'key' in params:
|
||||
params['key'] = self.url_canon(url)
|
||||
|
||||
params = self.fuzzy_query(params)
|
||||
if params:
|
||||
params['allow_fuzzy'] = False
|
||||
params['allowFuzzy'] = False
|
||||
return self.load_cdx(**params)
|
||||
|
||||
msg = 'No Captures found for: ' + url
|
||||
@ -63,8 +64,8 @@ class CDXServer(BaseCDXServer):
|
||||
responds to queries and dispatches to the cdx ops for processing
|
||||
"""
|
||||
|
||||
def __init__(self, paths, url_canon=None, fuzzy_query=None):
|
||||
super(CDXServer, self).__init__(url_canon, fuzzy_query)
|
||||
def __init__(self, paths, **kwargs):
|
||||
super(CDXServer, self).__init__(**kwargs)
|
||||
self.sources = create_cdx_sources(paths)
|
||||
|
||||
def load_cdx(self, **params):
|
||||
@ -78,9 +79,7 @@ class CDXServer(BaseCDXServer):
|
||||
|
||||
params['key'] = self.url_canon(url)
|
||||
|
||||
convert_old_style_params(params)
|
||||
|
||||
cdx_iter = cdx_load(self.sources, params)
|
||||
cdx_iter = cdx_load(self.sources, params, self.perms_checker)
|
||||
|
||||
return self._check_cdx_iter(cdx_iter, params)
|
||||
|
||||
@ -95,8 +94,8 @@ class RemoteCDXServer(BaseCDXServer):
|
||||
It simply proxies the query params to the remote source
|
||||
and performs no local processing/filtering
|
||||
"""
|
||||
def __init__(self, source, url_canon=None, fuzzy_query=None):
|
||||
super(RemoteCDXServer, self).__init__(url_canon, fuzzy_query)
|
||||
def __init__(self, source, **kwargs):
|
||||
super(RemoteCDXServer, self).__init__(**kwargs)
|
||||
|
||||
if isinstance(source, RemoteCDXSource):
|
||||
self.source = source
|
||||
@ -124,9 +123,11 @@ def create_cdx_server(config, ds_rules_file=None):
|
||||
if hasattr(config, 'get'):
|
||||
paths = config.get('index_paths')
|
||||
surt_ordered = config.get('surt_ordered', True)
|
||||
perms_checker = config.get('perms_checker')
|
||||
else:
|
||||
paths = config
|
||||
surt_ordered = True
|
||||
perms_checker = None
|
||||
|
||||
logging.debug('CDX Surt-Ordered? ' + str(surt_ordered))
|
||||
|
||||
@ -145,7 +146,10 @@ def create_cdx_server(config, ds_rules_file=None):
|
||||
else:
|
||||
server_cls = CDXServer
|
||||
|
||||
return server_cls(paths, url_canon=canon, fuzzy_query=fuzzy)
|
||||
return server_cls(paths,
|
||||
url_canon=canon,
|
||||
fuzzy_query=fuzzy,
|
||||
perms_checker=perms_checker)
|
||||
|
||||
|
||||
#=================================================================
|
||||
@ -198,29 +202,6 @@ def create_cdx_source(filename):
|
||||
# return RedisCDXSource(filename)
|
||||
|
||||
|
||||
#=================================================================
|
||||
def convert_old_style_params(params):
|
||||
"""
|
||||
Convert old-style CDX Server param semantics
|
||||
"""
|
||||
param = params.get('collapseTime')
|
||||
if param:
|
||||
params['collapse_time'] = param
|
||||
|
||||
param = params.get('matchType')
|
||||
if param:
|
||||
params['match_type'] = param
|
||||
|
||||
param = params.get('resolveRevisits')
|
||||
if param:
|
||||
params['resolve_revisits'] = param
|
||||
|
||||
if params.get('sort') == 'reverse':
|
||||
params['reverse'] = True
|
||||
|
||||
return params
|
||||
|
||||
|
||||
#=================================================================
|
||||
def extract_params_from_wsgi_env(env):
|
||||
""" utility function to extract params from the query
|
||||
|
@ -25,7 +25,7 @@ class CDXFile(CDXSource):
|
||||
def load_cdx(self, params):
|
||||
source = SeekableTextFileReader(self.filename)
|
||||
|
||||
match_type = params.get('match_type')
|
||||
match_type = params.get('matchType')
|
||||
|
||||
if match_type == 'prefix':
|
||||
iter_func = iter_prefix
|
||||
@ -56,12 +56,12 @@ class RemoteCDXSource(CDXSource):
|
||||
def load_cdx(self, proxy_params):
|
||||
if self.proxy_all:
|
||||
params = proxy_params
|
||||
params['proxy_all'] = True
|
||||
params['proxyAll'] = True
|
||||
else:
|
||||
# Only send url and matchType params to remote
|
||||
params = {}
|
||||
params['url'] = proxy_params['url']
|
||||
match_type = proxy_params.get('match_type')
|
||||
match_type = proxy_params.get('matchType')
|
||||
|
||||
if match_type:
|
||||
proxy_params['matchType'] = match_type
|
||||
|
30
pywb/cdx/perms.py
Normal file
30
pywb/cdx/perms.py
Normal file
@ -0,0 +1,30 @@
|
||||
|
||||
|
||||
#=================================================================
|
||||
class AllowAllPerms(object):
    """
    Sample Perm Checker which allows all
    """
    def allow_url_lookup(self, urlkey, url):
        """
        Return true/false if url or urlkey (canonicalized url)
        should be allowed
        """
        return True

    def allow_capture(self, cdx):
        """
        Return true/false if specified capture (cdx) should be
        allowed
        """
        return True

    def filter_fields(self, cdx):
        """
        Filter out any forbidden cdx fields from cdx dictionary

        This sample checker forbids nothing and returns the cdx
        unchanged
        """
        return cdx
|
||||
|
||||
|
||||
#=================================================================
|
||||
#TODO: other types of perm handlers
|
@ -15,22 +15,22 @@ org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org
|
||||
|
||||
|
||||
# Reverse CDX Stream
|
||||
>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', reverse = True, resolve_revisits = True, limit = 3)
|
||||
>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', reverse = True, resolveRevisits = True, limit = 3)
|
||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201308 https://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 783712 iana.warc.gz 117166 198285 iana.warc.gz
|
||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201249 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 771773 iana.warc.gz 117166 198285 iana.warc.gz
|
||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 757988 iana.warc.gz 117166 198285 iana.warc.gz
|
||||
|
||||
>>> cdx_ops_test('http://iana.org/_js/2013.1/jquery.js', reverse = True, resolve_revisits = True, limit = 1)
|
||||
>>> cdx_ops_test('http://iana.org/_js/2013.1/jquery.js', reverse = True, resolveRevisits = True, limit = 1)
|
||||
org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jquery.js application/x-javascript 200 AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 543 778507 iana.warc.gz 33449 7311 iana.warc.gz
|
||||
|
||||
# No matching results
|
||||
>>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolve_revisits = True, limit = 2)
|
||||
>>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolveRevisits = True, limit = 2)
|
||||
Traceback (most recent call last):
|
||||
CaptureNotFoundException: No Captures found for: http://iana.org/dont_have_this
|
||||
|
||||
|
||||
# Filter cdx (default: regex)
|
||||
>>> cdx_ops_test(url = 'http://iana.org/domains', match_type = 'prefix', filter = ['mimetype:text/html'])
|
||||
>>> cdx_ops_test(url = 'http://iana.org/domains', matchType = 'prefix', filter = ['mimetype:text/html'])
|
||||
org,iana)/domains 20140126200825 http://www.iana.org/domains text/html 200 7UPSCLNWNZP33LGW6OJGSF2Y4CDG4ES7 - - 2912 610534 iana.warc.gz
|
||||
org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz
|
||||
org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz
|
||||
@ -45,24 +45,24 @@ org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/s
|
||||
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
|
||||
|
||||
# Filter exact
|
||||
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], match_type = 'prefix', filter = '=urlkey:com,example)/?example=1')
|
||||
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '=urlkey:com,example)/?example=1')
|
||||
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
|
||||
com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz
|
||||
|
||||
# Filter exact invert
|
||||
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], match_type = 'prefix', filter = '!=urlkey:com,example)/?example=1')
|
||||
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '!=urlkey:com,example)/?example=1')
|
||||
com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
|
||||
com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz
|
||||
|
||||
# Collapse by timestamp
|
||||
# unresolved revisits, different statuscode results in an extra repeat
|
||||
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapse_time = 11)
|
||||
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapseTime = 11)
|
||||
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
|
||||
org,iana)/_css/2013.1/screen.css 20140126200653 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 533 328367 iana.warc.gz
|
||||
org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz
|
||||
|
||||
# resolved revisits
|
||||
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapse_time = '11', resolve_revisits = True)
|
||||
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapseTime = '11', resolveRevisits = True)
|
||||
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz - - -
|
||||
org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz 8754 41238 iana.warc.gz
|
||||
|
||||
@ -80,38 +80,38 @@ org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/
|
||||
20140126200654
|
||||
20140126200625
|
||||
|
||||
>>> cdx_ops_test(closest = '20140126201306', url = 'http://iana.org/dnssec', resolve_revisits = True, sources = [test_cdx_dir + 'dupes.cdx', test_cdx_dir + 'iana.cdx'])
|
||||
>>> cdx_ops_test(closest = '20140126201306', url = 'http://iana.org/dnssec', resolveRevisits = True, sources = [test_cdx_dir + 'dupes.cdx', test_cdx_dir + 'iana.cdx'])
|
||||
org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - -
|
||||
org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - -
|
||||
|
||||
|
||||
>>> cdx_ops_test(closest = '20140126201307', url = 'http://iana.org/dnssec', resolve_revisits = True)
|
||||
>>> cdx_ops_test(closest = '20140126201307', url = 'http://iana.org/dnssec', resolveRevisits = True)
|
||||
org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - -
|
||||
org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - -
|
||||
|
||||
# equal dist prefer earlier
|
||||
>>> cdx_ops_test(closest = '20140126200700', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2)
|
||||
>>> cdx_ops_test(closest = '20140126200700', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolveRevisits = True, limit = 2)
|
||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz 117166 198285 iana.warc.gz
|
||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz 117166 198285 iana.warc.gz
|
||||
|
||||
>>> cdx_ops_test(closest = '20140126200659', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2, fields = 'timestamp')
|
||||
>>> cdx_ops_test(closest = '20140126200659', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolveRevisits = True, limit = 2, fields = 'timestamp')
|
||||
20140126200654
|
||||
20140126200706
|
||||
|
||||
>>> cdx_ops_test(closest = '20140126200701', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2, fields = 'timestamp')
|
||||
>>> cdx_ops_test(closest = '20140126200701', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolveRevisits = True, limit = 2, fields = 'timestamp')
|
||||
20140126200706
|
||||
20140126200654
|
||||
|
||||
|
||||
# Resolve Revisits
|
||||
>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/inconsolata.otf', resolve_revisits = True)
|
||||
>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/inconsolata.otf', resolveRevisits = True)
|
||||
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 34054 620049 iana.warc.gz - - -
|
||||
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200912 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 546 667073 iana.warc.gz 34054 620049 iana.warc.gz
|
||||
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200930 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 534 697255 iana.warc.gz 34054 620049 iana.warc.gz
|
||||
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 547 714833 iana.warc.gz 34054 620049 iana.warc.gz
|
||||
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201249 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 551 768625 iana.warc.gz 34054 620049 iana.warc.gz
|
||||
|
||||
>>> cdx_ops_test('http://iana.org/domains/root/db', resolve_revisits = True)
|
||||
>>> cdx_ops_test('http://iana.org/domains/root/db', resolveRevisits = True)
|
||||
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz - - -
|
||||
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz - - -
|
||||
|
||||
|
@ -28,7 +28,7 @@ class IndexReader(object):
|
||||
if wbrequest.custom_params:
|
||||
params.update(wbrequest.custom_params)
|
||||
|
||||
params['allow_fuzzy'] = True
|
||||
params['allowFuzzy'] = True
|
||||
|
||||
cdxlines = self.load_cdx(url=wburl.url, output='raw', **params)
|
||||
|
||||
|
@ -53,14 +53,12 @@ def pywb_config_manual(passed_config = {}):
|
||||
|
||||
for name, value in collections.iteritems():
|
||||
if isinstance(value, str):
|
||||
route_config = config
|
||||
cdx_config = value
|
||||
else:
|
||||
route_config = DictChain(value, config)
|
||||
cdx_config = route_config
|
||||
value = {'index_paths': value}
|
||||
|
||||
route_config = DictChain(value, config)
|
||||
|
||||
ds_rules = route_config.get('domain_specific_rules', None)
|
||||
cdx_server = IndexReader(cdx_config, ds_rules)
|
||||
cdx_server = IndexReader(route_config, ds_rules)
|
||||
|
||||
wb_handler = config_utils.create_wb_handler(
|
||||
cdx_server = cdx_server,
|
||||
|
@ -92,7 +92,10 @@ enable_cdx_api: true
|
||||
|
||||
# optional reporter callback func
|
||||
# if set, called with request and cdx object
|
||||
reporter_func: pywb.run-tests.print_reporter
|
||||
reporter: !!python/object/new:tests.test_integration.PrintReporter []
|
||||
|
||||
# custom rules for domain specific matching
|
||||
#domain_specific_rules: rules.yaml
|
||||
|
||||
#perms_checker: !!python/object/new:pywb.cdx.perms.AllowAllPerms []
|
||||
perms_checker: !!python/object/new:tests.test_integration.TestExclusionPerms []
|
||||
|
@ -119,6 +119,12 @@ class TestWb:
|
||||
assert resp.content_type == 'text/css'
|
||||
|
||||
|
||||
def test_excluded_content(self):
|
||||
resp = self.testapp.get('/pywb/http://www.iana.org/_img/bookmark_icon.ico', status = 403)
|
||||
assert resp.status_int == 403
|
||||
assert 'Excluded' in resp.body
|
||||
|
||||
|
||||
def test_static_content(self):
|
||||
resp = self.testapp.get('/static/test/route/wb.css')
|
||||
assert resp.status_int == 200
|
||||
@ -149,7 +155,7 @@ class TestWb:
|
||||
|
||||
def test_cdx_server_advanced(self):
|
||||
# combine collapsing, reversing and revisit resolving
|
||||
resp = self.testapp.get('/pywb-cdx?url=http://www.iana.org/_css/2013.1/print.css&collapse_time=11&resolve_revisits=true&reverse=true')
|
||||
resp = self.testapp.get('/pywb-cdx?url=http://www.iana.org/_css/2013.1/print.css&collapseTime=11&resolveRevisits=true&reverse=true')
|
||||
|
||||
# convert back to CDXObject
|
||||
cdxs = map(CDXObject, resp.body.rstrip().split('\n'))
|
||||
@ -169,8 +175,42 @@ class TestWb:
|
||||
assert resp.status_int == 400
|
||||
assert 'Invalid Url: http://?abc' in resp.body
|
||||
|
||||
#=================================================================
|
||||
# Reporter callback for replay view
|
||||
def print_reporter(wbrequest, cdx, response):
|
||||
print wbrequest
|
||||
print cdx
|
||||
pass
|
||||
class PrintReporter(object):
    """
    Reporter callback for replay view which prints the request and
    the cdx it is called with
    """
    def __call__(self, wbrequest, cdx, response):
        # single-argument print(...) gives the same output whether
        # print is a statement or a function; response is not printed
        print(wbrequest)
        print(cdx)
|
||||
|
||||
#=================================================================
|
||||
class TestExclusionPerms(object):
    """
    Sample Perm Checker which excludes lookups of a single urlkey,
    'org,iana)/_img/bookmark_icon.ico', and allows everything else
    """
    def allow_url_lookup(self, urlkey, url):
        """
        Return true/false if url or urlkey (canonicalized url)
        should be allowed

        Only the urlkey is checked here; url is not used
        """
        print(urlkey)
        if urlkey == 'org,iana)/_img/bookmark_icon.ico':
            return False

        return True

    def allow_capture(self, cdx):
        """
        Return true/false if specified capture (cdx) should be
        allowed
        """
        return True

    def filter_fields(self, cdx):
        """
        Filter out any forbidden cdx fields from cdx dictionary

        No fields are forbidden here, the cdx is returned unchanged
        """
        return cdx
|
||||
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user