cdx: clean up filters, add '~' modifier for contains

rules: fix regex to be lazy, not greedy; turn off unneeded custom canonicalizer (need tests for custom canon); clean up fuzzy match query; fix package_data in setup.py
2025-03-24 06:59:52 +01:00 · 2014-02-27 18:22:10 +00:00 · 2014-02-27 18:22:10 +00:00 · 22f1f78fca
commit 22f1f78fca
parent 453ab678ed
6 changed files with 58 additions and 28 deletions
--- a/pywb/cdx/cdxdomainspecific.py
+++ b/pywb/cdx/cdxdomainspecific.py
@ -38,8 +38,8 @@ def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered):
    if rules:
        fuzzy = FuzzyQuery(rules)
-    logging.debug('CANON: ' + str(canon))
+    logging.debug('CustomCanonilizer? ' + str(bool(canon)))
-    logging.debug('FUZZY: ' + str(fuzzy))
+    logging.debug('FuzzyMatcher? ' + str(bool(canon)))
    return (canon, fuzzy)
@ -73,6 +73,8 @@ class FuzzyQuery:
        urlkey = params['key']
        url = params['url']
        filter_ = params.get('filter', [])
        output = params.get('output')
        for rule in self.rules.iter_matching(urlkey):
            m = rule.regex.search(urlkey)
@ -82,7 +84,7 @@ class FuzzyQuery:
            matched_rule = rule
            if len(m.groups()) == 1:
-                params['filter'] = '=urlkey:' + m.group(1)
+                filter_.append('~urlkey:' + m.group(1))
            break
@ -91,10 +93,13 @@ class FuzzyQuery:
        inx = url.find('?')
        if inx > 0:
-            params['url'] = url[:inx + 1]
+            url = url[:inx + 1]
        params = {'url': url,
                  'matchType': 'prefix',
                  'filter': filter_,
                  'output': output}
        params['matchType'] = 'prefix'
        params['key'] = None
        return params
--- a/pywb/cdx/cdxops.py
+++ b/pywb/cdx/cdxops.py
@ -157,9 +157,15 @@ def cdx_filter(cdx_iter, filter_strings):
            if self.invert:
                string = string[1:]
-            self.exact = string.startswith('=')
+            # exact match
-            if self.exact:
+            if string.startswith('='):
                string = string[1:]
                self.compare_func = self.exact
            elif string.startswith('~'):
                string = string[1:]
                self.compare_func = self.contains
            else:
                self.compare_func = self.regex
            parts = string.split(':', 1)
            # no field set, apply filter to entire cdx
@ -170,19 +176,28 @@ def cdx_filter(cdx_iter, filter_strings):
                self.field = parts[0]
                string = parts[1]
-            if self.exact:
+            # make regex if regex mode
-                self.exact_str = string
+            if self.compare_func == self.regex:
            else:
                self.regex = re.compile(string)
            else:
                self.filter_str = string
        def __call__(self, cdx):
            val = cdx[self.field] if self.field else str(cdx)
-            if self.exact:
+
-                matched = (self.exact_str == val)
+            matched = self.compare_func(val)
-            else:
+
                matched = self.regex.match(val) is not None
            return matched ^ self.invert
        def exact(self, val):
            return (self.filter_str == val)
        def contains(self, val):
            return (self.filter_str in val)
        def regex(self, val):
            return self.regex.match(val) is not None
    filters = map(Filter, filter_strings)
    for cdx in cdx_iter:
--- a/pywb/cdx/cdxserver.py
+++ b/pywb/cdx/cdxserver.py
@ -50,14 +50,14 @@ class BaseCDXServer(object):
        url = params['url']
-        if self.fuzzy_query and params.get('allowFuzzy'):
+        # check if fuzzy is allowed and ensure that its an
-            if not 'key' in params:
+        # exact match
-                params['key'] = self.url_canon(url)
+        if (self.fuzzy_query and params.get('allowFuzzy') and
            params.get('matchType', 'exact') == 'exact'):
-            params = self.fuzzy_query(params)
+            fuzzy_params = self.fuzzy_query(params)
-            if params:
+            if fuzzy_params:
-                params['allowFuzzy'] = False
+                return self.load_cdx(**fuzzy_params)
                return self.load_cdx(**params)
        msg = 'No Captures found for: ' + url
        raise CaptureNotFoundException(msg)
@ -95,7 +95,6 @@ class CDXServer(BaseCDXServer):
                msg = 'A url= param must be specified to query the cdx server'
                raise CDXException(msg)
            #params['key'] = self.url_canon(url)
            match_type = params.get('matchType', 'exact')
            key, end_key = calc_search_range(url=url,
--- a/pywb/cdx/test/cdxserver_test.py
+++ b/pywb/cdx/test/cdxserver_test.py
@ -54,6 +54,16 @@ com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit
 com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
 com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz
 # Filter contains
 >>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '~urlkey:example=1')
 com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
 com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz
 # Filter contains invert
 >>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '!~urlkey:example=1')
 com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
 com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz
 # Collapse by timestamp
 # unresolved revisits, different statuscode results in an extra repeat
 >>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapseTime = 11)
--- a/pywb/rules.yaml
+++ b/pywb/rules.yaml
@ -12,11 +12,12 @@ rules:
    #=================================================================
    - url_prefix: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet'
-      fuzzy_lookup: 'com,facebook\)/.*[?&]data=(.*(?:[&]|query_type[^,]+))'
+      fuzzy_lookup: 'com,facebook\)/.*[?&]data=(.*?(?:[&]|query_type[^,]+))'
-      canonicalize:
+# not actually needed, fuzzy match is used instead here
-        match: 'com,facebook\)/.*[?&]data=([^&]+).*'
+#      canonicalize:
-        replace: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet?data=\1'
+#        match: 'com,facebook\)/.*[?&]data=([^&]+).*'
 #        replace: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet?data=\1'
    - url_prefix: 'com,facebook)/'
--- a/setup.py
+++ b/setup.py
@ -13,7 +13,7 @@ setuptools.setup(name='pywb',
        license='GPL',
        packages=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'],
        provides=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'],
-        package_data={'pywb': ['ui/*', 'static/*'], 'pywb': ['*.yaml']},
+        package_data={'pywb': ['ui/*', 'static/*', '*.yaml']},
        data_files = [('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')),
                      ('sample_archive/warcs/', glob.glob('sample_archive/warcs/*')),
                      ('sample_archive/text_content/', glob.glob('sample_archive/text_content/*'))],