1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

exclusions: add AllowAllPerms and refactor exclusions interface

add TestExclusionPerms and a sample exclusion integration test
refactor cdx server init params into **kwargs
convert all cdx params to use camelCase
This commit is contained in:
Ilya Kreymer 2014-02-19 20:20:31 -08:00
parent be284859be
commit ff428ed43e
10 changed files with 138 additions and 97 deletions

View File

@ -97,3 +97,7 @@ enable_cdx_api: true
# custom rules for domain specific matching
# set to false to disable
#domain_specific_rules: rules.yaml
# Permissions checker
#perms_checker: !!python/object/new:pywb.cdx.perms.AllowAllPerms []

View File

@ -1,4 +1,4 @@
from cdxobject import CDXObject
from cdxobject import CDXObject, AccessException
from pywb.utils.timeutils import timestamp_to_sec
import bisect
@ -10,25 +10,11 @@ from collections import deque
#=================================================================
class AllowAllPerms:
"""
Sample Perm Checker which allows all
"""
def allow_url(self, url):
return True
def allow_url_timestamp(self, url, timestamp):
return True
def filter_fields(self, cdx):
return cdx
#=================================================================
def cdx_load(source, params, perms_checker = AllowAllPerms()):
#cdx_iter = cdx_load_all(source, params)
cdx_iter = cdx_load_with_perms(source, params, perms_checker)
def cdx_load(sources, params, perms_checker=None):
if perms_checker:
cdx_iter = cdx_load_with_perms(sources, params, perms_checker)
else:
cdx_iter = cdx_load_and_filter(sources, params)
# output raw cdx objects
if params.get('output') == 'raw':
@ -42,16 +28,15 @@ def cdx_load(source, params, perms_checker = AllowAllPerms()):
#=================================================================
def cdx_load_with_perms(source, params, perms_checker):
if not perms_checker.allow_url(params['url']):
def cdx_load_with_perms(sources, params, perms_checker):
if not perms_checker.allow_url_lookup(params['key'], params['url']):
if params.get('matchType', 'exact') == 'exact':
yield
raise AccessException('Excluded')
cdx_iter = cdx_load_all(source, params)
cdx_iter = cdx_load_and_filter(sources, params)
for cdx in cdx_iter:
if not perms_checker.allow_url_timestamp(cdx['original'],
cdx['timestamp']):
if not perms_checker.allow_capture(cdx):
continue
cdx = perms_checker.filter_fields(cdx)
@ -68,7 +53,7 @@ def cdx_text_out(cdx, fields):
#=================================================================
def cdx_load_all(sources, params):
def cdx_load_and_filter(sources, params):
cdx_iter = load_cdx_streams(sources, params)
cdx_iter = make_cdx_iter(cdx_iter)
@ -76,7 +61,7 @@ def cdx_load_all(sources, params):
if params.get('proxy_all'):
return cdx_iter
resolve_revisits = params.get('resolve_revisits', False)
resolve_revisits = params.get('resolveRevisits', False)
if resolve_revisits:
cdx_iter = cdx_resolve_revisits(cdx_iter)
@ -84,13 +69,13 @@ def cdx_load_all(sources, params):
if filters:
cdx_iter = cdx_filter(cdx_iter, filters)
collapse_time = params.get('collapse_time', None)
collapse_time = params.get('collapseTime', None)
if collapse_time:
cdx_iter = cdx_collapse_time_status(cdx_iter, collapse_time)
limit = int(params.get('limit', 1000000))
reverse = params.get('reverse', False)
reverse = params.get('reverse', False) or params.get('sort') == 'reverse'
if reverse:
cdx_iter = cdx_reverse(cdx_iter, limit)

View File

@ -13,9 +13,10 @@ import urlparse
#=================================================================
class BaseCDXServer(object):
def __init__(self, url_canon=None, fuzzy_query=None):
self.url_canon = url_canon if url_canon else UrlCanonicalizer()
self.fuzzy_query = fuzzy_query
def __init__(self, **kwargs):
self.url_canon = kwargs.get('url_canon', UrlCanonicalizer())
self.fuzzy_query = kwargs.get('fuzzy_query')
self.perms_checker = kwargs.get('perms_checker')
def _check_cdx_iter(self, cdx_iter, params):
""" Check cdx iter semantics
@ -31,13 +32,13 @@ class BaseCDXServer(object):
url = params['url']
if self.fuzzy_query and params.get('allow_fuzzy'):
if self.fuzzy_query and params.get('allowFuzzy'):
if not 'key' in params:
params['key'] = self.url_canon(url)
params = self.fuzzy_query(params)
if params:
params['allow_fuzzy'] = False
params['allowFuzzy'] = False
return self.load_cdx(**params)
msg = 'No Captures found for: ' + url
@ -63,8 +64,8 @@ class CDXServer(BaseCDXServer):
responds to queries and dispatches to the cdx ops for processing
"""
def __init__(self, paths, url_canon=None, fuzzy_query=None):
super(CDXServer, self).__init__(url_canon, fuzzy_query)
def __init__(self, paths, **kwargs):
super(CDXServer, self).__init__(**kwargs)
self.sources = create_cdx_sources(paths)
def load_cdx(self, **params):
@ -78,9 +79,7 @@ class CDXServer(BaseCDXServer):
params['key'] = self.url_canon(url)
convert_old_style_params(params)
cdx_iter = cdx_load(self.sources, params)
cdx_iter = cdx_load(self.sources, params, self.perms_checker)
return self._check_cdx_iter(cdx_iter, params)
@ -95,8 +94,8 @@ class RemoteCDXServer(BaseCDXServer):
It simply proxies the query params to the remote source
and performs no local processing/filtering
"""
def __init__(self, source, url_canon=None, fuzzy_query=None):
super(RemoteCDXServer, self).__init__(url_canon, fuzzy_query)
def __init__(self, source, **kwargs):
super(RemoteCDXServer, self).__init__(**kwargs)
if isinstance(source, RemoteCDXSource):
self.source = source
@ -124,9 +123,11 @@ def create_cdx_server(config, ds_rules_file=None):
if hasattr(config, 'get'):
paths = config.get('index_paths')
surt_ordered = config.get('surt_ordered', True)
perms_checker = config.get('perms_checker')
else:
paths = config
surt_ordered = True
perms_checker = None
logging.debug('CDX Surt-Ordered? ' + str(surt_ordered))
@ -145,7 +146,10 @@ def create_cdx_server(config, ds_rules_file=None):
else:
server_cls = CDXServer
return server_cls(paths, url_canon=canon, fuzzy_query=fuzzy)
return server_cls(paths,
url_canon=canon,
fuzzy_query=fuzzy,
perms_checker=perms_checker)
#=================================================================
@ -198,29 +202,6 @@ def create_cdx_source(filename):
# return RedisCDXSource(filename)
#=================================================================
def convert_old_style_params(params):
"""
Convert old-style CDX Server param semantics
"""
param = params.get('collapseTime')
if param:
params['collapse_time'] = param
param = params.get('matchType')
if param:
params['match_type'] = param
param = params.get('resolveRevisits')
if param:
params['resolve_revisits'] = param
if params.get('sort') == 'reverse':
params['reverse'] = True
return params
#=================================================================
def extract_params_from_wsgi_env(env):
""" utility function to extract params from the query

View File

@ -25,7 +25,7 @@ class CDXFile(CDXSource):
def load_cdx(self, params):
source = SeekableTextFileReader(self.filename)
match_type = params.get('match_type')
match_type = params.get('matchType')
if match_type == 'prefix':
iter_func = iter_prefix
@ -56,12 +56,12 @@ class RemoteCDXSource(CDXSource):
def load_cdx(self, proxy_params):
if self.proxy_all:
params = proxy_params
params['proxy_all'] = True
params['proxyAll'] = True
else:
# Only send url and matchType params to remote
params = {}
params['url'] = proxy_params['url']
match_type = proxy_params.get('match_type')
match_type = proxy_params.get('matchType')
if match_type:
proxy_params['matchType'] = match_type

30
pywb/cdx/perms.py Normal file
View File

@ -0,0 +1,30 @@
#=================================================================
class AllowAllPerms:
    """
    Sample Perm Checker which allows all

    Every check succeeds and cdx entries are passed through unchanged.
    """
    def allow_url_lookup(self, urlkey, url):
        """
        Return true/false if url or urlkey (canonicalized url)
        should be allowed

        This checker always returns True.
        """
        return True

    def allow_capture(self, cdx):
        """
        Return true/false if specified capture (cdx) should be
        allowed

        This checker always returns True.
        """
        return True

    def filter_fields(self, cdx):
        """
        Filter out any forbidden cdx fields from cdx dictionary

        This checker removes nothing and returns the same cdx object.
        """
        return cdx
#=================================================================
#TODO: other types of perm handlers

View File

@ -15,22 +15,22 @@ org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org
# Reverse CDX Stream
>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', reverse = True, resolve_revisits = True, limit = 3)
>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', reverse = True, resolveRevisits = True, limit = 3)
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201308 https://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 783712 iana.warc.gz 117166 198285 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201249 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 771773 iana.warc.gz 117166 198285 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 551 757988 iana.warc.gz 117166 198285 iana.warc.gz
>>> cdx_ops_test('http://iana.org/_js/2013.1/jquery.js', reverse = True, resolve_revisits = True, limit = 1)
>>> cdx_ops_test('http://iana.org/_js/2013.1/jquery.js', reverse = True, resolveRevisits = True, limit = 1)
org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jquery.js application/x-javascript 200 AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 543 778507 iana.warc.gz 33449 7311 iana.warc.gz
# No matching results
>>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolve_revisits = True, limit = 2)
>>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolveRevisits = True, limit = 2)
Traceback (most recent call last):
CaptureNotFoundException: No Captures found for: http://iana.org/dont_have_this
# Filter cdx (default: regex)
>>> cdx_ops_test(url = 'http://iana.org/domains', match_type = 'prefix', filter = ['mimetype:text/html'])
>>> cdx_ops_test(url = 'http://iana.org/domains', matchType = 'prefix', filter = ['mimetype:text/html'])
org,iana)/domains 20140126200825 http://www.iana.org/domains text/html 200 7UPSCLNWNZP33LGW6OJGSF2Y4CDG4ES7 - - 2912 610534 iana.warc.gz
org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz
org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz
@ -45,24 +45,24 @@ org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/s
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
# Filter exact
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], match_type = 'prefix', filter = '=urlkey:com,example)/?example=1')
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '=urlkey:com,example)/?example=1')
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz
# Filter exact invert
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], match_type = 'prefix', filter = '!=urlkey:com,example)/?example=1')
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '!=urlkey:com,example)/?example=1')
com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz
# Collapse by timestamp
# unresolved revisits, different statuscode results in an extra repeat
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapse_time = 11)
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapseTime = 11)
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
org,iana)/_css/2013.1/screen.css 20140126200653 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 533 328367 iana.warc.gz
org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz
# resolved revisits
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapse_time = '11', resolve_revisits = True)
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapseTime = '11', resolveRevisits = True)
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz - - -
org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz 8754 41238 iana.warc.gz
@ -80,38 +80,38 @@ org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/
20140126200654
20140126200625
>>> cdx_ops_test(closest = '20140126201306', url = 'http://iana.org/dnssec', resolve_revisits = True, sources = [test_cdx_dir + 'dupes.cdx', test_cdx_dir + 'iana.cdx'])
>>> cdx_ops_test(closest = '20140126201306', url = 'http://iana.org/dnssec', resolveRevisits = True, sources = [test_cdx_dir + 'dupes.cdx', test_cdx_dir + 'iana.cdx'])
org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - -
org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - -
>>> cdx_ops_test(closest = '20140126201307', url = 'http://iana.org/dnssec', resolve_revisits = True)
>>> cdx_ops_test(closest = '20140126201307', url = 'http://iana.org/dnssec', resolveRevisits = True)
org,iana)/dnssec 20140126201307 https://www.iana.org/dnssec text/html 200 PHLRSX73EV3WSZRFXMWDO6BRKTVUSASI - - 2278 773766 iana.warc.gz - - -
org,iana)/dnssec 20140126201306 http://www.iana.org/dnssec text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 442 772827 iana.warc.gz - - -
# equal dist prefer earlier
>>> cdx_ops_test(closest = '20140126200700', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2)
>>> cdx_ops_test(closest = '20140126200700', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolveRevisits = True, limit = 2)
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200654 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 548 482544 iana.warc.gz 117166 198285 iana.warc.gz
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200706 http://www.iana.org/_css/2013.1/fonts/OpenSans-Bold.ttf application/octet-stream 200 YFUR5ALIWJMWV6FAAFRLVRQNXZQF5HRW - - 552 495230 iana.warc.gz 117166 198285 iana.warc.gz
>>> cdx_ops_test(closest = '20140126200659', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2, fields = 'timestamp')
>>> cdx_ops_test(closest = '20140126200659', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolveRevisits = True, limit = 2, fields = 'timestamp')
20140126200654
20140126200706
>>> cdx_ops_test(closest = '20140126200701', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolve_revisits = True, limit = 2, fields = 'timestamp')
>>> cdx_ops_test(closest = '20140126200701', url = 'http://iana.org/_css/2013.1/fonts/opensans-bold.ttf', resolveRevisits = True, limit = 2, fields = 'timestamp')
20140126200706
20140126200654
# Resolve Revisits
>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/inconsolata.otf', resolve_revisits = True)
>>> cdx_ops_test('http://iana.org/_css/2013.1/fonts/inconsolata.otf', resolveRevisits = True)
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 34054 620049 iana.warc.gz - - -
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200912 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 546 667073 iana.warc.gz 34054 620049 iana.warc.gz
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200930 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 534 697255 iana.warc.gz 34054 620049 iana.warc.gz
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 547 714833 iana.warc.gz 34054 620049 iana.warc.gz
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201249 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 551 768625 iana.warc.gz 34054 620049 iana.warc.gz
>>> cdx_ops_test('http://iana.org/domains/root/db', resolve_revisits = True)
>>> cdx_ops_test('http://iana.org/domains/root/db', resolveRevisits = True)
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz - - -
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz - - -

View File

@ -28,7 +28,7 @@ class IndexReader(object):
if wbrequest.custom_params:
params.update(wbrequest.custom_params)
params['allow_fuzzy'] = True
params['allowFuzzy'] = True
cdxlines = self.load_cdx(url=wburl.url, output='raw', **params)

View File

@ -53,14 +53,12 @@ def pywb_config_manual(passed_config = {}):
for name, value in collections.iteritems():
if isinstance(value, str):
route_config = config
cdx_config = value
else:
value = {'index_paths': value}
route_config = DictChain(value, config)
cdx_config = route_config
ds_rules = route_config.get('domain_specific_rules', None)
cdx_server = IndexReader(cdx_config, ds_rules)
cdx_server = IndexReader(route_config, ds_rules)
wb_handler = config_utils.create_wb_handler(
cdx_server = cdx_server,

View File

@ -92,7 +92,10 @@ enable_cdx_api: true
# optional reporter callback func
# if set, called with request and cdx object
reporter_func: pywb.run-tests.print_reporter
reporter: !!python/object/new:tests.test_integration.PrintReporter []
# custom rules for domain specific matching
#domain_specific_rules: rules.yaml
#perms_checker: !!python/object/new:pywb.cdx.perms.AllowAllPerms []
perms_checker: !!python/object/new:tests.test_integration.TestExclusionPerms []

View File

@ -119,6 +119,12 @@ class TestWb:
assert resp.content_type == 'text/css'
def test_excluded_content(self):
resp = self.testapp.get('/pywb/http://www.iana.org/_img/bookmark_icon.ico', status = 403)
assert resp.status_int == 403
assert 'Excluded' in resp.body
def test_static_content(self):
resp = self.testapp.get('/static/test/route/wb.css')
assert resp.status_int == 200
@ -149,7 +155,7 @@ class TestWb:
def test_cdx_server_advanced(self):
# combine collapsing, reversing and revisit resolving
resp = self.testapp.get('/pywb-cdx?url=http://www.iana.org/_css/2013.1/print.css&collapse_time=11&resolve_revisits=true&reverse=true')
resp = self.testapp.get('/pywb-cdx?url=http://www.iana.org/_css/2013.1/print.css&collapseTime=11&resolveRevisits=true&reverse=true')
# convert back to CDXObject
cdxs = map(CDXObject, resp.body.rstrip().split('\n'))
@ -169,8 +175,42 @@ class TestWb:
assert resp.status_int == 400
assert 'Invalid Url: http://?abc' in resp.body
#=================================================================
# Reporter callback for replay view
def print_reporter(wbrequest, cdx, response):
class PrintReporter:
    """
    Reporter callable which prints the request and the cdx entry
    it is called with; the response argument is accepted but unused.
    """
    def __call__(self, wbrequest, cdx, response):
        # The parenthesized single-argument form produces the same output
        # whether print is a statement or a function.
        print(wbrequest)
        print(cdx)
#=================================================================
class TestExclusionPerms:
    """
    Sample Perm Checker for tests which excludes a single url
    (matched by its canonicalized urlkey) and allows everything else
    """
    def allow_url_lookup(self, urlkey, url):
        """
        Return False if urlkey (canonicalized url) is the one
        excluded entry, True for any other lookup

        Only urlkey is compared; url is not inspected.
        """
        return urlkey != 'org,iana)/_img/bookmark_icon.ico'

    def allow_capture(self, cdx):
        """
        Return true/false if specified capture (cdx) should be
        allowed

        This checker allows every capture.
        """
        return True

    def filter_fields(self, cdx):
        """
        Filter out any forbidden cdx fields from cdx dictionary

        This checker removes nothing and returns the same cdx object.
        """
        return cdx