diff --git a/pywb/cdx/cdxdomainspecific.py b/pywb/cdx/cdxdomainspecific.py index 006dd88d..54654b5e 100644 --- a/pywb/cdx/cdxdomainspecific.py +++ b/pywb/cdx/cdxdomainspecific.py @@ -38,8 +38,8 @@ def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered): if rules: fuzzy = FuzzyQuery(rules) - logging.debug('CANON: ' + str(canon)) - logging.debug('FUZZY: ' + str(fuzzy)) + logging.debug('CustomCanonilizer? ' + str(bool(canon))) + logging.debug('FuzzyMatcher? ' + str(bool(fuzzy))) return (canon, fuzzy) @@ -73,6 +73,8 @@ class FuzzyQuery: urlkey = params['key'] url = params['url'] + filter_ = params.get('filter', []) + output = params.get('output') for rule in self.rules.iter_matching(urlkey): m = rule.regex.search(urlkey) @@ -82,7 +84,7 @@ class FuzzyQuery: matched_rule = rule if len(m.groups()) == 1: - params['filter'] = '=urlkey:' + m.group(1) + filter_.append('~urlkey:' + m.group(1)) break @@ -91,10 +93,13 @@ class FuzzyQuery: inx = url.find('?') if inx > 0: - params['url'] = url[:inx + 1] + url = url[:inx + 1] + + params = {'url': url, + 'matchType': 'prefix', + 'filter': filter_, + 'output': output} - params['matchType'] = 'prefix' - params['key'] = None return params diff --git a/pywb/cdx/cdxops.py b/pywb/cdx/cdxops.py index 247f3d18..1a90d7ca 100644 --- a/pywb/cdx/cdxops.py +++ b/pywb/cdx/cdxops.py @@ -157,9 +157,15 @@ def cdx_filter(cdx_iter, filter_strings): if self.invert: string = string[1:] - self.exact = string.startswith('=') - if self.exact: + # exact match + if string.startswith('='): string = string[1:] + self.compare_func = self.exact + elif string.startswith('~'): + string = string[1:] + self.compare_func = self.contains + else: + self.compare_func = self.regex parts = string.split(':', 1) # no field set, apply filter to entire cdx @@ -170,19 +176,28 @@ def cdx_filter(cdx_iter, filter_strings): self.field = parts[0] string = parts[1] - if self.exact: - self.exact_str = string - else: + # make regex if regex mode + if self.compare_func == 
self.regex: self.regex = re.compile(string) + else: + self.filter_str = string def __call__(self, cdx): val = cdx[self.field] if self.field else str(cdx) - if self.exact: - matched = (self.exact_str == val) - else: - matched = self.regex.match(val) is not None + + matched = self.compare_func(val) + return matched ^ self.invert + def exact(self, val): + return (self.filter_str == val) + + def contains(self, val): + return (self.filter_str in val) + + def regex(self, val): + return self.regex.match(val) is not None + filters = map(Filter, filter_strings) for cdx in cdx_iter: diff --git a/pywb/cdx/cdxserver.py b/pywb/cdx/cdxserver.py index 7f548ec4..8eff842c 100644 --- a/pywb/cdx/cdxserver.py +++ b/pywb/cdx/cdxserver.py @@ -50,14 +50,14 @@ class BaseCDXServer(object): url = params['url'] - if self.fuzzy_query and params.get('allowFuzzy'): - if not 'key' in params: - params['key'] = self.url_canon(url) + # check if fuzzy is allowed and ensure that it's an + # exact match + if (self.fuzzy_query and params.get('allowFuzzy') and + params.get('matchType', 'exact') == 'exact'): - params = self.fuzzy_query(params) - if params: - params['allowFuzzy'] = False - return self.load_cdx(**params) + fuzzy_params = self.fuzzy_query(params) + if fuzzy_params: + return self.load_cdx(**fuzzy_params) msg = 'No Captures found for: ' + url raise CaptureNotFoundException(msg) @@ -95,7 +95,6 @@ class CDXServer(BaseCDXServer): msg = 'A url= param must be specified to query the cdx server' raise CDXException(msg) - #params['key'] = self.url_canon(url) match_type = params.get('matchType', 'exact') key, end_key = calc_search_range(url=url, diff --git a/pywb/cdx/test/cdxserver_test.py b/pywb/cdx/test/cdxserver_test.py index 0e799ce9..384d7187 100644 --- a/pywb/cdx/test/cdxserver_test.py +++ b/pywb/cdx/test/cdxserver_test.py @@ -54,6 +54,16 @@ com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit com,example)/ 20140127171200 http://example.com text/html 200 
B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz +# Filter contains +>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '~urlkey:example=1') +com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz +com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz + +# Filter contains invert +>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '!~urlkey:example=1') +com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz +com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz + # Collapse by timestamp # unresolved revisits, different statuscode results in an extra repeat >>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapseTime = 11) diff --git a/pywb/rules.yaml b/pywb/rules.yaml index 5cf29154..8927d2f1 100644 --- a/pywb/rules.yaml +++ b/pywb/rules.yaml @@ -12,11 +12,12 @@ rules: #================================================================= - url_prefix: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet' - fuzzy_lookup: 'com,facebook\)/.*[?&]data=(.*(?:[&]|query_type[^,]+))' + fuzzy_lookup: 'com,facebook\)/.*[?&]data=(.*?(?:[&]|query_type[^,]+))' - canonicalize: - match: 'com,facebook\)/.*[?&]data=([^&]+).*' - replace: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet?data=\1' +# not actually needed, fuzzy match is used instead here +# canonicalize: +# match: 'com,facebook\)/.*[?&]data=([^&]+).*' +# replace: 
'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet?data=\1' - url_prefix: 'com,facebook)/' diff --git a/setup.py b/setup.py index 0750fe55..94c1bca7 100755 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ setuptools.setup(name='pywb', license='GPL', packages=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'], provides=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'], - package_data={'pywb': ['ui/*', 'static/*'], 'pywb': ['*.yaml']}, + package_data={'pywb': ['ui/*', 'static/*', '*.yaml']}, data_files = [('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')), ('sample_archive/warcs/', glob.glob('sample_archive/warcs/*')), ('sample_archive/text_content/', glob.glob('sample_archive/text_content/*'))],