cdx: clean up filters, add '~' modifier for contains

rules: fix regex to be lazy, not greedy; turn off unneeded custom canonicalizer (need tests for custom canon); clean up fuzzy match query; fix data package in setup.py
2025-03-15 00:03:28 +01:00 · 2014-02-27 18:22:10 +00:00 · 2014-02-27 18:22:10 +00:00 · 22f1f78fca
commit 22f1f78fca
parent 453ab678ed
6 changed files with 58 additions and 28 deletions
--- a/pywb/cdx/cdxdomainspecific.py
+++ b/pywb/cdx/cdxdomainspecific.py
@ -38,8 +38,8 @@ def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered):
    if rules:
        fuzzy = FuzzyQuery(rules)

-    logging.debug('CANON: ' + str(canon))
-    logging.debug('FUZZY: ' + str(fuzzy))
+    logging.debug('CustomCanonilizer? ' + str(bool(canon)))
+    logging.debug('FuzzyMatcher? ' + str(bool(canon)))
    return (canon, fuzzy)


@ -73,6 +73,8 @@ class FuzzyQuery:

        urlkey = params['key']
        url = params['url']
+        filter_ = params.get('filter', [])
+        output = params.get('output')

        for rule in self.rules.iter_matching(urlkey):
            m = rule.regex.search(urlkey)
@ -82,7 +84,7 @@ class FuzzyQuery:
            matched_rule = rule

            if len(m.groups()) == 1:
-                params['filter'] = '=urlkey:' + m.group(1)
+                filter_.append('~urlkey:' + m.group(1))

            break

@ -91,10 +93,13 @@ class FuzzyQuery:

        inx = url.find('?')
        if inx > 0:
-            params['url'] = url[:inx + 1]
+            url = url[:inx + 1]
+
+        params = {'url': url,
+                  'matchType': 'prefix',
+                  'filter': filter_,
+                  'output': output}

-        params['matchType'] = 'prefix'
-        params['key'] = None
        return params


--- a/pywb/cdx/cdxops.py
+++ b/pywb/cdx/cdxops.py
@ -157,9 +157,15 @@ def cdx_filter(cdx_iter, filter_strings):
            if self.invert:
                string = string[1:]

-            self.exact = string.startswith('=')
-            if self.exact:
+            # exact match
+            if string.startswith('='):
                string = string[1:]
+                self.compare_func = self.exact
+            elif string.startswith('~'):
+                string = string[1:]
+                self.compare_func = self.contains
+            else:
+                self.compare_func = self.regex

            parts = string.split(':', 1)
            # no field set, apply filter to entire cdx
@ -170,19 +176,28 @@ def cdx_filter(cdx_iter, filter_strings):
                self.field = parts[0]
                string = parts[1]

-            if self.exact:
-                self.exact_str = string
-            else:
+            # make regex if regex mode
+            if self.compare_func == self.regex:
                self.regex = re.compile(string)
+            else:
+                self.filter_str = string

        def __call__(self, cdx):
            val = cdx[self.field] if self.field else str(cdx)
-            if self.exact:
-                matched = (self.exact_str == val)
-            else:
-                matched = self.regex.match(val) is not None
+
+            matched = self.compare_func(val)
+
            return matched ^ self.invert

+        def exact(self, val):
+            return (self.filter_str == val)
+
+        def contains(self, val):
+            return (self.filter_str in val)
+
+        def regex(self, val):
+            return self.regex.match(val) is not None
+
    filters = map(Filter, filter_strings)

    for cdx in cdx_iter:
--- a/pywb/cdx/cdxserver.py
+++ b/pywb/cdx/cdxserver.py
@ -50,14 +50,14 @@ class BaseCDXServer(object):

        url = params['url']

-        if self.fuzzy_query and params.get('allowFuzzy'):
-            if not 'key' in params:
-                params['key'] = self.url_canon(url)
+        # check if fuzzy is allowed and ensure that its an
+        # exact match
+        if (self.fuzzy_query and params.get('allowFuzzy') and
+            params.get('matchType', 'exact') == 'exact'):

-            params = self.fuzzy_query(params)
-            if params:
-                params['allowFuzzy'] = False
-                return self.load_cdx(**params)
+            fuzzy_params = self.fuzzy_query(params)
+            if fuzzy_params:
+                return self.load_cdx(**fuzzy_params)

        msg = 'No Captures found for: ' + url
        raise CaptureNotFoundException(msg)
@ -95,7 +95,6 @@ class CDXServer(BaseCDXServer):
                msg = 'A url= param must be specified to query the cdx server'
                raise CDXException(msg)

-            #params['key'] = self.url_canon(url)
            match_type = params.get('matchType', 'exact')

            key, end_key = calc_search_range(url=url,
--- a/pywb/cdx/test/cdxserver_test.py
+++ b/pywb/cdx/test/cdxserver_test.py
@ -54,6 +54,16 @@ com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit
 com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
 com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz

+# Filter contains
+>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '~urlkey:example=1')
+com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1043 333 example.warc.gz
+com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz
+
+# Filter contains invert
+>>> cdx_ops_test(url = 'http://example.com', sources = [test_cdx_dir], matchType = 'prefix', filter = '!~urlkey:example=1')
+com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
+com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz
+
 # Collapse by timestamp
 # unresolved revisits, different statuscode results in an extra repeat
 >>> cdx_ops_test(url = 'http://iana.org/_css/2013.1/screen.css', collapseTime = 11)
--- a/pywb/rules.yaml
+++ b/pywb/rules.yaml
@ -12,11 +12,12 @@ rules:
    #=================================================================
    - url_prefix: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet'

-      fuzzy_lookup: 'com,facebook\)/.*[?&]data=(.*(?:[&]|query_type[^,]+))'
+      fuzzy_lookup: 'com,facebook\)/.*[?&]data=(.*?(?:[&]|query_type[^,]+))'

-      canonicalize:
-        match: 'com,facebook\)/.*[?&]data=([^&]+).*'
-        replace: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet?data=\1'
+# not actually needed, fuzzy match is used instead here
+#      canonicalize:
+#        match: 'com,facebook\)/.*[?&]data=([^&]+).*'
+#        replace: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet?data=\1'


    - url_prefix: 'com,facebook)/'
--- a/setup.py
+++ b/setup.py
@ -13,7 +13,7 @@ setuptools.setup(name='pywb',
        license='GPL',
        packages=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'],
        provides=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'],
-        package_data={'pywb': ['ui/*', 'static/*'], 'pywb': ['*.yaml']},
+        package_data={'pywb': ['ui/*', 'static/*', '*.yaml']},
        data_files = [('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')),
                      ('sample_archive/warcs/', glob.glob('sample_archive/warcs/*')),
                      ('sample_archive/text_content/', glob.glob('sample_archive/text_content/*'))],