1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

cdx: clean up filters, add '~' modifier for contains

rules: fix regex to be lazy, not greedy; turn off unneeded custom
canonicalizer (need tests for custom canon)
clean up fuzzy match query
fix package_data in setup.py
This commit is contained in:
Ilya Kreymer 2014-02-27 18:22:10 +00:00
parent 453ab678ed
commit 22f1f78fca
6 changed files with 58 additions and 28 deletions

View File

@ -38,8 +38,8 @@ def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered):
if rules:
fuzzy = FuzzyQuery(rules)
logging.debug('CANON: ' + str(canon))
logging.debug('FUZZY: ' + str(fuzzy))
logging.debug('CustomCanonilizer? ' + str(bool(canon)))
logging.debug('FuzzyMatcher? ' + str(bool(canon)))
return (canon, fuzzy)
@ -73,6 +73,8 @@ class FuzzyQuery:
urlkey = params['key']
url = params['url']
filter_ = params.get('filter', [])
output = params.get('output')
for rule in self.rules.iter_matching(urlkey):
m = rule.regex.search(urlkey)
@ -82,7 +84,7 @@ class FuzzyQuery:
matched_rule = rule
if len(m.groups()) == 1:
params['filter'] = '=urlkey:' + m.group(1)
filter_.append('~urlkey:' + m.group(1))
break
@ -91,10 +93,13 @@ class FuzzyQuery:
inx = url.find('?')
if inx > 0:
params['url'] = url[:inx + 1]
url = url[:inx + 1]
params = {'url': url,
'matchType': 'prefix',
'filter': filter_,
'output': output}
params['matchType'] = 'prefix'
params['key'] = None
return params

View File

@ -157,9 +157,15 @@ def cdx_filter(cdx_iter, filter_strings):
if self.invert:
string = string[1:]
self.exact = string.startswith('=')
if self.exact:
# exact match
if string.startswith('='):
string = string[1:]
self.compare_func = self.exact
elif string.startswith('~'):
string = string[1:]
self.compare_func = self.contains
else:
self.compare_func = self.regex
parts = string.split(':', 1)
# no field set, apply filter to entire cdx
@ -170,19 +176,28 @@ def cdx_filter(cdx_iter, filter_strings):
self.field = parts[0]
string = parts[1]
if self.exact:
self.exact_str = string
else:
# make regex if regex mode
if self.compare_func == self.regex:
self.regex = re.compile(string)
else:
self.filter_str = string
def __call__(self, cdx):
val = cdx[self.field] if self.field else str(cdx)
if self.exact:
matched = (self.exact_str == val)
else:
matched = self.regex.match(val) is not None
matched = self.compare_func(val)
return matched ^ self.invert
# Comparison selected for filters that start with '=': true only when the
# value is equal to the stored filter string.
def exact(self, val):
return (self.filter_str == val)
# Comparison selected for filters that start with '~': true when the stored
# filter string occurs anywhere inside the value (substring test).
def contains(self, val):
return (self.filter_str in val)
# Default comparison when no '=' or '~' modifier is given: true when the
# compiled pattern matches the value. match() only anchors at the start of
# the value, so the pattern need not consume the whole string.
# NOTE(review): the setup code assigns the compiled pattern to self.regex,
# which shadows this method on the instance; the bound method captured in
# compare_func beforehand still works -- confirm this naming is intentional.
def regex(self, val):
return self.regex.match(val) is not None
filters = map(Filter, filter_strings)
for cdx in cdx_iter:

View File

@ -50,14 +50,14 @@ class BaseCDXServer(object):
url = params['url']
if self.fuzzy_query and params.get('allowFuzzy'):
if not 'key' in params:
params['key'] = self.url_canon(url)
# check if fuzzy is allowed and ensure that its an
# exact match
if (self.fuzzy_query and params.get('allowFuzzy') and
params.get('matchType', 'exact') == 'exact'):
params = self.fuzzy_query(params)
if params:
params['allowFuzzy'] = False
return self.load_cdx(**params)
fuzzy_params = self.fuzzy_query(params)
if fuzzy_params:
return self.load_cdx(**fuzzy_params)
msg = 'No Captures found for: ' + url
raise CaptureNotFoundException(msg)
@ -95,7 +95,6 @@ class CDXServer(BaseCDXServer):
msg = 'A url= param must be specified to query the cdx server'
raise CDXException(msg)
#params['key'] = self.url_canon(url)
match_type = params.get('matchType', 'exact')
key, end_key = calc_search_range(url=url,

View File

@ -54,6 +54,16 @@ com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit
com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz
# Filter contains
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '~urlkey:example=1')
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz
# Filter contains invert
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '!~urlkey:example=1')
com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz
# Collapse by timestamp
# unresolved revisits, different statuscode results in an extra repeat
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapseTime = 11)

View File

@ -12,11 +12,12 @@ rules:
#=================================================================
- url_prefix: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet'
fuzzy_lookup: 'com,facebook\)/.*[?&]data=(.*(?:[&]|query_type[^,]+))'
fuzzy_lookup: 'com,facebook\)/.*[?&]data=(.*?(?:[&]|query_type[^,]+))'
canonicalize:
match: 'com,facebook\)/.*[?&]data=([^&]+).*'
replace: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet?data=\1'
# not actually needed, fuzzy match is used instead here
# canonicalize:
# match: 'com,facebook\)/.*[?&]data=([^&]+).*'
# replace: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet?data=\1'
- url_prefix: 'com,facebook)/'

View File

@ -13,7 +13,7 @@ setuptools.setup(name='pywb',
license='GPL',
packages=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'],
provides=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'],
package_data={'pywb': ['ui/*', 'static/*'], 'pywb': ['*.yaml']},
package_data={'pywb': ['ui/*', 'static/*', '*.yaml']},
data_files = [('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')),
('sample_archive/warcs/', glob.glob('sample_archive/warcs/*')),
('sample_archive/text_content/', glob.glob('sample_archive/text_content/*'))],