From 59d9beac057143d906d3b778088168225c6fc7e0 Mon Sep 17 00:00:00 2001
From: Mark Johnson <30983976+mijho@users.noreply.github.com>
Date: Wed, 1 Feb 2023 02:48:19 +0000
Subject: [PATCH] feat: regex substitution on surt rules match (#780)

Substitution functionality already exists at a global level for matched
rules, but this causes issues when rule sets conflict in the desired
outcome. This change enables setting regex substitution at the rule
level to avoid these conflicts.
---
 pywb/rules.yaml                               | 10 ++++++--
 pywb/warcserver/index/fuzzymatcher.py         | 24 ++++++++++++-------
 .../index/test/test_fuzzymatcher.py           |  7 ++++++
 3 files changed, 31 insertions(+), 10 deletions(-)

diff --git a/pywb/rules.yaml b/pywb/rules.yaml
index afdb8f20..b3bc0720 100644
--- a/pywb/rules.yaml
+++ b/pywb/rules.yaml
@@ -110,7 +110,7 @@ rules:
 
       fuzzy_lookup:
         match: '("(?:cursor|cursorindex)":["\d\w]+)'
-        find_all: true
+        re_type: findall
 
     - url_prefix: 'com,facebook)/ajax/pagelet/generic.php/profiletimeline'
       fuzzy_lookup: 'com,facebook\)/.*[?&](__adt=[^&]+).*[&]data=(?:.*?(?:[&]|(profile_id|pagelet_token)[^,]+))'
@@ -175,7 +175,7 @@ rules:
 
       fuzzy_lookup:
         match: '("q[\d]+":|after:\\"[^"]+)'
-        find_all: true
+        re_type: findall
 
     - url_prefix: 'com,facebook)/pages_reaction_units/more'
 
@@ -538,6 +538,12 @@ rules:
       rewrite:
         js_rewrite_location: urls
 
+    - url_prefix: 'com,example)/matched'
+      fuzzy_lookup:
+        re_type: sub
+        match: 'matched'
+        replace: 'replaced'          
+
     # all domain rules -- fallback to this dataset
     #=================================================================
     # Applies to all urls -- should be last
diff --git a/pywb/warcserver/index/fuzzymatcher.py b/pywb/warcserver/index/fuzzymatcher.py
index dac755d9..786d1b60 100644
--- a/pywb/warcserver/index/fuzzymatcher.py
+++ b/pywb/warcserver/index/fuzzymatcher.py
@@ -15,7 +15,7 @@ from collections import namedtuple
 # ============================================================================
 FuzzyRule = namedtuple('FuzzyRule',
                        'url_prefix, regex, replace_after, filter_str, ' +
-                       'match_type, find_all')
+                       'match_type, re_type')
 
 
 # ============================================================================
@@ -23,6 +23,7 @@ class FuzzyMatcher(object):
     DEFAULT_FILTER = ['urlkey:{0}']
     DEFAULT_MATCH_TYPE = 'prefix'
     DEFAULT_REPLACE_AFTER = '?'
+    DEFAULT_RE_TYPE = 'search'
 
     FUZZY_SKIP_PARAMS = ('alt_url', 'reverse', 'closest', 'end_key',
                          'url', 'matchType', 'filter')
@@ -58,16 +59,16 @@ class FuzzyMatcher(object):
             replace_after = self.DEFAULT_REPLACE_AFTER
             filter_str = self.DEFAULT_FILTER
             match_type = self.DEFAULT_MATCH_TYPE
-            find_all = False
+            re_type = self.DEFAULT_RE_TYPE
 
         else:
             regex = self.make_regex(config.get('match'))
             replace_after = config.get('replace', self.DEFAULT_REPLACE_AFTER)
             filter_str = config.get('filter', self.DEFAULT_FILTER)
             match_type = config.get('type', self.DEFAULT_MATCH_TYPE)
-            find_all = config.get('find_all', False)
+            re_type = config.get('re_type', self.DEFAULT_RE_TYPE)
 
-        return FuzzyRule(url_prefix, regex, replace_after, filter_str, match_type, find_all)
+        return FuzzyRule(url_prefix, regex, replace_after, filter_str, match_type, re_type)
 
     def get_fuzzy_match(self, urlkey, url, params):
         filters = set()
@@ -78,9 +79,12 @@ class FuzzyMatcher(object):
                 continue
 
             groups = None
-            if rule.find_all:
+            if rule.re_type == 'findall':
                 groups = rule.regex.findall(urlkey)
-            else:
+            if rule.re_type == 'sub':
+                matched_rule = rule
+                break
+            elif rule.re_type == 'search':
                 m = rule.regex.search(urlkey)
                 groups = m and m.groups()
 
@@ -102,7 +106,7 @@ class FuzzyMatcher(object):
         no_filters = (not filters or filters == {'urlkey:'}) and (matched_rule.replace_after == '?')
 
         inx = url.find(matched_rule.replace_after)
-        if inx > 0:
+        if inx > 0 and matched_rule.re_type != 'sub':
             length = inx + len(matched_rule.replace_after)
             # don't include trailing '?' for default filter
             if no_filters:
@@ -111,13 +115,17 @@ class FuzzyMatcher(object):
                 if url[length - 1] == '/':
                     length -= 1
             url = url[:length]
-        elif not no_filters:
+        elif not no_filters and matched_rule.re_type != 'sub':
             url += matched_rule.replace_after[0]
 
         if matched_rule.match_type == 'domain':
             host = urlsplit(url).netloc
             url = host.split('.', 1)[1]
 
+        if matched_rule.re_type == 'sub':
+            filters = {'urlkey:'}
+            url = re.sub(rule.regex, rule.replace_after, url)            
+
         fuzzy_params = {'url': url,
                         'matchType': matched_rule.match_type,
                         'filter': filters,
diff --git a/pywb/warcserver/index/test/test_fuzzymatcher.py b/pywb/warcserver/index/test/test_fuzzymatcher.py
index 13b17976..a5b14d5b 100644
--- a/pywb/warcserver/index/test/test_fuzzymatcher.py
+++ b/pywb/warcserver/index/test/test_fuzzymatcher.py
@@ -234,3 +234,10 @@ class TestFuzzy(object):
         params = self.get_params(url, actual_url, mime='application/x-shockwave-flash')
         cdx_iter, errs = self.fuzzy(self.source, params)
         assert list(cdx_iter) == []
+
+    def test_fuzzy_sub_replacement(self):
+        url = 'https://example.com/matched'
+        actual_url = 'https://example.com/replaced'
+        params = self.get_params(url, actual_url)
+        cdx_iter, errs = self.fuzzy(self.source, params)
+        assert list(cdx_iter) == self.get_expected(actual_url)