mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
cdx: clean up filters, add '~' modifier for contains
rules: fix regex to be lazy, not greedy; turn off unneeded custom canonicalizer (need tests for custom canon); clean up fuzzy match query; fix package_data in setup.py
This commit is contained in:
parent
453ab678ed
commit
22f1f78fca
@ -38,8 +38,8 @@ def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered):
|
||||
if rules:
|
||||
fuzzy = FuzzyQuery(rules)
|
||||
|
||||
logging.debug('CANON: ' + str(canon))
|
||||
logging.debug('FUZZY: ' + str(fuzzy))
|
||||
logging.debug('CustomCanonilizer? ' + str(bool(canon)))
|
||||
logging.debug('FuzzyMatcher? ' + str(bool(canon)))
|
||||
return (canon, fuzzy)
|
||||
|
||||
|
||||
@ -73,6 +73,8 @@ class FuzzyQuery:
|
||||
|
||||
urlkey = params['key']
|
||||
url = params['url']
|
||||
filter_ = params.get('filter', [])
|
||||
output = params.get('output')
|
||||
|
||||
for rule in self.rules.iter_matching(urlkey):
|
||||
m = rule.regex.search(urlkey)
|
||||
@ -82,7 +84,7 @@ class FuzzyQuery:
|
||||
matched_rule = rule
|
||||
|
||||
if len(m.groups()) == 1:
|
||||
params['filter'] = '=urlkey:' + m.group(1)
|
||||
filter_.append('~urlkey:' + m.group(1))
|
||||
|
||||
break
|
||||
|
||||
@ -91,10 +93,13 @@ class FuzzyQuery:
|
||||
|
||||
inx = url.find('?')
|
||||
if inx > 0:
|
||||
params['url'] = url[:inx + 1]
|
||||
url = url[:inx + 1]
|
||||
|
||||
params = {'url': url,
|
||||
'matchType': 'prefix',
|
||||
'filter': filter_,
|
||||
'output': output}
|
||||
|
||||
params['matchType'] = 'prefix'
|
||||
params['key'] = None
|
||||
return params
|
||||
|
||||
|
||||
|
@ -157,9 +157,15 @@ def cdx_filter(cdx_iter, filter_strings):
|
||||
if self.invert:
|
||||
string = string[1:]
|
||||
|
||||
self.exact = string.startswith('=')
|
||||
if self.exact:
|
||||
# exact match
|
||||
if string.startswith('='):
|
||||
string = string[1:]
|
||||
self.compare_func = self.exact
|
||||
elif string.startswith('~'):
|
||||
string = string[1:]
|
||||
self.compare_func = self.contains
|
||||
else:
|
||||
self.compare_func = self.regex
|
||||
|
||||
parts = string.split(':', 1)
|
||||
# no field set, apply filter to entire cdx
|
||||
@ -170,19 +176,28 @@ def cdx_filter(cdx_iter, filter_strings):
|
||||
self.field = parts[0]
|
||||
string = parts[1]
|
||||
|
||||
if self.exact:
|
||||
self.exact_str = string
|
||||
else:
|
||||
# make regex if regex mode
|
||||
if self.compare_func == self.regex:
|
||||
self.regex = re.compile(string)
|
||||
else:
|
||||
self.filter_str = string
|
||||
|
||||
def __call__(self, cdx):
|
||||
val = cdx[self.field] if self.field else str(cdx)
|
||||
if self.exact:
|
||||
matched = (self.exact_str == val)
|
||||
else:
|
||||
matched = self.regex.match(val) is not None
|
||||
|
||||
matched = self.compare_func(val)
|
||||
|
||||
return matched ^ self.invert
|
||||
|
||||
def exact(self, val):
|
||||
return (self.filter_str == val)
|
||||
|
||||
def contains(self, val):
|
||||
return (self.filter_str in val)
|
||||
|
||||
def regex(self, val):
|
||||
return self.regex.match(val) is not None
|
||||
|
||||
filters = map(Filter, filter_strings)
|
||||
|
||||
for cdx in cdx_iter:
|
||||
|
@ -50,14 +50,14 @@ class BaseCDXServer(object):
|
||||
|
||||
url = params['url']
|
||||
|
||||
if self.fuzzy_query and params.get('allowFuzzy'):
|
||||
if not 'key' in params:
|
||||
params['key'] = self.url_canon(url)
|
||||
# check if fuzzy is allowed and ensure that it's an
|
||||
# exact match
|
||||
if (self.fuzzy_query and params.get('allowFuzzy') and
|
||||
params.get('matchType', 'exact') == 'exact'):
|
||||
|
||||
params = self.fuzzy_query(params)
|
||||
if params:
|
||||
params['allowFuzzy'] = False
|
||||
return self.load_cdx(**params)
|
||||
fuzzy_params = self.fuzzy_query(params)
|
||||
if fuzzy_params:
|
||||
return self.load_cdx(**fuzzy_params)
|
||||
|
||||
msg = 'No Captures found for: ' + url
|
||||
raise CaptureNotFoundException(msg)
|
||||
@ -95,7 +95,6 @@ class CDXServer(BaseCDXServer):
|
||||
msg = 'A url= param must be specified to query the cdx server'
|
||||
raise CDXException(msg)
|
||||
|
||||
#params['key'] = self.url_canon(url)
|
||||
match_type = params.get('matchType', 'exact')
|
||||
|
||||
key, end_key = calc_search_range(url=url,
|
||||
|
@ -54,6 +54,16 @@ com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit
|
||||
com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
|
||||
com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz
|
||||
|
||||
# Filter contains
|
||||
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '~urlkey:example=1')
|
||||
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
|
||||
com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz
|
||||
|
||||
# Filter contains invert
|
||||
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '!~urlkey:example=1')
|
||||
com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
|
||||
com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz
|
||||
|
||||
# Collapse by timestamp
|
||||
# unresolved revisits, different statuscode results in an extra repeat
|
||||
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapseTime = 11)
|
||||
|
@ -12,11 +12,12 @@ rules:
|
||||
#=================================================================
|
||||
- url_prefix: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet'
|
||||
|
||||
fuzzy_lookup: 'com,facebook\)/.*[?&]data=(.*(?:[&]|query_type[^,]+))'
|
||||
fuzzy_lookup: 'com,facebook\)/.*[?&]data=(.*?(?:[&]|query_type[^,]+))'
|
||||
|
||||
canonicalize:
|
||||
match: 'com,facebook\)/.*[?&]data=([^&]+).*'
|
||||
replace: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet?data=\1'
|
||||
# not actually needed, fuzzy match is used instead here
|
||||
# canonicalize:
|
||||
# match: 'com,facebook\)/.*[?&]data=([^&]+).*'
|
||||
# replace: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet?data=\1'
|
||||
|
||||
|
||||
- url_prefix: 'com,facebook)/'
|
||||
|
2
setup.py
2
setup.py
@ -13,7 +13,7 @@ setuptools.setup(name='pywb',
|
||||
license='GPL',
|
||||
packages=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'],
|
||||
provides=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'],
|
||||
package_data={'pywb': ['ui/*', 'static/*'], 'pywb': ['*.yaml']},
|
||||
package_data={'pywb': ['ui/*', 'static/*', '*.yaml']},
|
||||
data_files = [('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')),
|
||||
('sample_archive/warcs/', glob.glob('sample_archive/warcs/*')),
|
||||
('sample_archive/text_content/', glob.glob('sample_archive/text_content/*'))],
|
||||
|
Loading…
x
Reference in New Issue
Block a user