feat: regex substitution on surt rules match (#780)

Substitution functionality already exists at a global level for matched rules, but this causes issues when rule sets conflict in their desired outcomes. This change enables setting regex substitution at the rule level to avoid these conflicts.
2025-03-15 00:03:28 +01:00 · 2023-02-01 02:48:19 +00:00 · 2023-02-01 02:48:19 +00:00 · 59d9beac05
commit 59d9beac05
parent 0758e81b62
3 changed files with 31 additions and 10 deletions
--- a/pywb/rules.yaml
+++ b/pywb/rules.yaml
@ -110,7 +110,7 @@ rules:
      fuzzy_lookup:
        match: '("(?:cursor|cursorindex)":["\d\w]+)'
-        find_all: true
+        re_type: findall
    - url_prefix: 'com,facebook)/ajax/pagelet/generic.php/profiletimeline'
      fuzzy_lookup: 'com,facebook\)/.*[?&](__adt=[^&]+).*[&]data=(?:.*?(?:[&]|(profile_id|pagelet_token)[^,]+))'
@ -175,7 +175,7 @@ rules:
      fuzzy_lookup:
        match: '("q[\d]+":|after:\\"[^"]+)'
-        find_all: true
+        re_type: findall
    - url_prefix: 'com,facebook)/pages_reaction_units/more'
@ -538,6 +538,12 @@ rules:
      rewrite:
        js_rewrite_location: urls
    - url_prefix: 'com,example)/matched'
      fuzzy_lookup:
        re_type: sub
        match: 'matched'
        replace: 'replaced'          
    # all domain rules -- fallback to this dataset
    #=================================================================
    # Applies to all urls -- should be last
--- a/pywb/warcserver/index/fuzzymatcher.py
+++ b/pywb/warcserver/index/fuzzymatcher.py
@ -15,7 +15,7 @@ from collections import namedtuple
 # ============================================================================
 FuzzyRule = namedtuple('FuzzyRule',
                       'url_prefix, regex, replace_after, filter_str, ' +
-                       'match_type, find_all')
+                       'match_type, re_type')
 # ============================================================================
@ -23,6 +23,7 @@ class FuzzyMatcher(object):
    DEFAULT_FILTER = ['urlkey:{0}']
    DEFAULT_MATCH_TYPE = 'prefix'
    DEFAULT_REPLACE_AFTER = '?'
    DEFAULT_RE_TYPE = 'search'
    FUZZY_SKIP_PARAMS = ('alt_url', 'reverse', 'closest', 'end_key',
                         'url', 'matchType', 'filter')
@ -58,16 +59,16 @@ class FuzzyMatcher(object):
            replace_after = self.DEFAULT_REPLACE_AFTER
            filter_str = self.DEFAULT_FILTER
            match_type = self.DEFAULT_MATCH_TYPE
-            find_all = False
+            re_type = self.DEFAULT_RE_TYPE
        else:
            regex = self.make_regex(config.get('match'))
            replace_after = config.get('replace', self.DEFAULT_REPLACE_AFTER)
            filter_str = config.get('filter', self.DEFAULT_FILTER)
            match_type = config.get('type', self.DEFAULT_MATCH_TYPE)
-            find_all = config.get('find_all', False)
+            re_type = config.get('re_type', self.DEFAULT_RE_TYPE)
-        return FuzzyRule(url_prefix, regex, replace_after, filter_str, match_type, find_all)
+        return FuzzyRule(url_prefix, regex, replace_after, filter_str, match_type, re_type)
    def get_fuzzy_match(self, urlkey, url, params):
        filters = set()
@ -78,9 +79,12 @@ class FuzzyMatcher(object):
                continue
            groups = None
-            if rule.find_all:
+            if rule.re_type == 'findall':
                groups = rule.regex.findall(urlkey)
-            else:
+            if rule.re_type == 'sub':
                matched_rule = rule
                break
            elif rule.re_type == 'search':
                m = rule.regex.search(urlkey)
                groups = m and m.groups()
@ -102,7 +106,7 @@ class FuzzyMatcher(object):
        no_filters = (not filters or filters == {'urlkey:'}) and (matched_rule.replace_after == '?')
        inx = url.find(matched_rule.replace_after)
-        if inx > 0:
+        if inx > 0 and matched_rule.re_type != 'sub':
            length = inx + len(matched_rule.replace_after)
            # don't include trailing '?' for default filter
            if no_filters:
@ -111,13 +115,17 @@ class FuzzyMatcher(object):
                if url[length - 1] == '/':
                    length -= 1
            url = url[:length]
-        elif not no_filters:
+        elif not no_filters and matched_rule.re_type != 'sub':
            url += matched_rule.replace_after[0]
        if matched_rule.match_type == 'domain':
            host = urlsplit(url).netloc
            url = host.split('.', 1)[1]
        if matched_rule.re_type == 'sub':
            filters = {'urlkey:'}
            url = re.sub(rule.regex, rule.replace_after, url)            
        fuzzy_params = {'url': url,
                        'matchType': matched_rule.match_type,
                        'filter': filters,
--- a/pywb/warcserver/index/test/test_fuzzymatcher.py
+++ b/pywb/warcserver/index/test/test_fuzzymatcher.py
@ -234,3 +234,10 @@ class TestFuzzy(object):
        params = self.get_params(url, actual_url, mime='application/x-shockwave-flash')
        cdx_iter, errs = self.fuzzy(self.source, params)
        assert list(cdx_iter) == []
    def test_fuzzy_sub_replacement(self):
        url = 'https://example.com/matched'
        actual_url = 'https://example.com/replaced'
        params = self.get_params(url, actual_url)
        cdx_iter, errs = self.fuzzy(self.source, params)
        assert list(cdx_iter) == self.get_expected(actual_url)