1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

cdx: clean up filters, add '~' modifier for contains

rules: make the fuzzy_lookup regex lazy instead of greedy, and disable the
unneeded custom canonicalizer (tests for custom canonicalization are still needed)
fuzzy: clean up the fuzzy match query
setup.py: fix the package_data entry
This commit is contained in:
Ilya Kreymer 2014-02-27 18:22:10 +00:00
parent 453ab678ed
commit 22f1f78fca
6 changed files with 58 additions and 28 deletions

View File

@ -38,8 +38,8 @@ def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered):
if rules: if rules:
fuzzy = FuzzyQuery(rules) fuzzy = FuzzyQuery(rules)
logging.debug('CANON: ' + str(canon)) logging.debug('CustomCanonilizer? ' + str(bool(canon)))
logging.debug('FUZZY: ' + str(fuzzy)) logging.debug('FuzzyMatcher? ' + str(bool(canon)))
return (canon, fuzzy) return (canon, fuzzy)
@ -73,6 +73,8 @@ class FuzzyQuery:
urlkey = params['key'] urlkey = params['key']
url = params['url'] url = params['url']
filter_ = params.get('filter', [])
output = params.get('output')
for rule in self.rules.iter_matching(urlkey): for rule in self.rules.iter_matching(urlkey):
m = rule.regex.search(urlkey) m = rule.regex.search(urlkey)
@ -82,7 +84,7 @@ class FuzzyQuery:
matched_rule = rule matched_rule = rule
if len(m.groups()) == 1: if len(m.groups()) == 1:
params['filter'] = '=urlkey:' + m.group(1) filter_.append('~urlkey:' + m.group(1))
break break
@ -91,10 +93,13 @@ class FuzzyQuery:
inx = url.find('?') inx = url.find('?')
if inx > 0: if inx > 0:
params['url'] = url[:inx + 1] url = url[:inx + 1]
params = {'url': url,
'matchType': 'prefix',
'filter': filter_,
'output': output}
params['matchType'] = 'prefix'
params['key'] = None
return params return params

View File

@ -157,9 +157,15 @@ def cdx_filter(cdx_iter, filter_strings):
if self.invert: if self.invert:
string = string[1:] string = string[1:]
self.exact = string.startswith('=') # exact match
if self.exact: if string.startswith('='):
string = string[1:] string = string[1:]
self.compare_func = self.exact
elif string.startswith('~'):
string = string[1:]
self.compare_func = self.contains
else:
self.compare_func = self.regex
parts = string.split(':', 1) parts = string.split(':', 1)
# no field set, apply filter to entire cdx # no field set, apply filter to entire cdx
@ -170,19 +176,28 @@ def cdx_filter(cdx_iter, filter_strings):
self.field = parts[0] self.field = parts[0]
string = parts[1] string = parts[1]
if self.exact: # make regex if regex mode
self.exact_str = string if self.compare_func == self.regex:
else:
self.regex = re.compile(string) self.regex = re.compile(string)
else:
self.filter_str = string
def __call__(self, cdx): def __call__(self, cdx):
val = cdx[self.field] if self.field else str(cdx) val = cdx[self.field] if self.field else str(cdx)
if self.exact:
matched = (self.exact_str == val) matched = self.compare_func(val)
else:
matched = self.regex.match(val) is not None
return matched ^ self.invert return matched ^ self.invert
def exact(self, val):
return (self.filter_str == val)
def contains(self, val):
return (self.filter_str in val)
def regex(self, val):
return self.regex.match(val) is not None
filters = map(Filter, filter_strings) filters = map(Filter, filter_strings)
for cdx in cdx_iter: for cdx in cdx_iter:

View File

@ -50,14 +50,14 @@ class BaseCDXServer(object):
url = params['url'] url = params['url']
if self.fuzzy_query and params.get('allowFuzzy'): # check if fuzzy is allowed and ensure that its an
if not 'key' in params: # exact match
params['key'] = self.url_canon(url) if (self.fuzzy_query and params.get('allowFuzzy') and
params.get('matchType', 'exact') == 'exact'):
params = self.fuzzy_query(params) fuzzy_params = self.fuzzy_query(params)
if params: if fuzzy_params:
params['allowFuzzy'] = False return self.load_cdx(**fuzzy_params)
return self.load_cdx(**params)
msg = 'No Captures found for: ' + url msg = 'No Captures found for: ' + url
raise CaptureNotFoundException(msg) raise CaptureNotFoundException(msg)
@ -95,7 +95,6 @@ class CDXServer(BaseCDXServer):
msg = 'A url= param must be specified to query the cdx server' msg = 'A url= param must be specified to query the cdx server'
raise CDXException(msg) raise CDXException(msg)
#params['key'] = self.url_canon(url)
match_type = params.get('matchType', 'exact') match_type = params.get('matchType', 'exact')
key, end_key = calc_search_range(url=url, key, end_key = calc_search_range(url=url,

View File

@ -54,6 +54,16 @@ com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit
com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz
# Filter contains
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '~urlkey:example=1')
com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz
# Filter contains invert
>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '!~urlkey:example=1')
com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz
# Collapse by timestamp # Collapse by timestamp
# unresolved revisits, different statuscode results in an extra repeat # unresolved revisits, different statuscode results in an extra repeat
>>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapseTime = 11) >>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapseTime = 11)

View File

@ -12,11 +12,12 @@ rules:
#================================================================= #=================================================================
- url_prefix: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet' - url_prefix: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet'
fuzzy_lookup: 'com,facebook\)/.*[?&]data=(.*(?:[&]|query_type[^,]+))' fuzzy_lookup: 'com,facebook\)/.*[?&]data=(.*?(?:[&]|query_type[^,]+))'
canonicalize: # not actually needed, fuzzy match is used instead here
match: 'com,facebook\)/.*[?&]data=([^&]+).*' # canonicalize:
replace: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet?data=\1' # match: 'com,facebook\)/.*[?&]data=([^&]+).*'
# replace: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet?data=\1'
- url_prefix: 'com,facebook)/' - url_prefix: 'com,facebook)/'

View File

@ -13,7 +13,7 @@ setuptools.setup(name='pywb',
license='GPL', license='GPL',
packages=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'], packages=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'],
provides=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'], provides=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'],
package_data={'pywb': ['ui/*', 'static/*'], 'pywb': ['*.yaml']}, package_data={'pywb': ['ui/*', 'static/*', '*.yaml']},
data_files = [('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')), data_files = [('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')),
('sample_archive/warcs/', glob.glob('sample_archive/warcs/*')), ('sample_archive/warcs/', glob.glob('sample_archive/warcs/*')),
('sample_archive/text_content/', glob.glob('sample_archive/text_content/*'))], ('sample_archive/text_content/', glob.glob('sample_archive/text_content/*'))],