From 59d9beac057143d906d3b778088168225c6fc7e0 Mon Sep 17 00:00:00 2001 From: Mark Johnson <30983976+mijho@users.noreply.github.com> Date: Wed, 1 Feb 2023 02:48:19 +0000 Subject: [PATCH] feat: regex substitution on surt rules match (#780) substitution functionality already exists on a global level for matched rules but this causes issues when rule sets conflict in the desired outcome. This change enables setting regex substitution at the rule level to avoid these conflicts. --- pywb/rules.yaml | 10 ++++++-- pywb/warcserver/index/fuzzymatcher.py | 24 ++++++++++++------- .../index/test/test_fuzzymatcher.py | 7 ++++++ 3 files changed, 31 insertions(+), 10 deletions(-) diff --git a/pywb/rules.yaml b/pywb/rules.yaml index afdb8f20..b3bc0720 100644 --- a/pywb/rules.yaml +++ b/pywb/rules.yaml @@ -110,7 +110,7 @@ rules: fuzzy_lookup: match: '("(?:cursor|cursorindex)":["\d\w]+)' - find_all: true + re_type: findall - url_prefix: 'com,facebook)/ajax/pagelet/generic.php/profiletimeline' fuzzy_lookup: 'com,facebook\)/.*[?&](__adt=[^&]+).*[&]data=(?:.*?(?:[&]|(profile_id|pagelet_token)[^,]+))' @@ -175,7 +175,7 @@ rules: fuzzy_lookup: match: '("q[\d]+":|after:\\"[^"]+)' - find_all: true + re_type: findall - url_prefix: 'com,facebook)/pages_reaction_units/more' @@ -538,6 +538,12 @@ rules: rewrite: js_rewrite_location: urls + - url_prefix: 'com,example)/matched' + fuzzy_lookup: + re_type: sub + match: 'matched' + replace: 'replaced' + # all domain rules -- fallback to this dataset #================================================================= # Applies to all urls -- should be last diff --git a/pywb/warcserver/index/fuzzymatcher.py b/pywb/warcserver/index/fuzzymatcher.py index dac755d9..786d1b60 100644 --- a/pywb/warcserver/index/fuzzymatcher.py +++ b/pywb/warcserver/index/fuzzymatcher.py @@ -15,7 +15,7 @@ from collections import namedtuple # ============================================================================ FuzzyRule = namedtuple('FuzzyRule', 'url_prefix, regex, 
replace_after, filter_str, ' + - 'match_type, find_all') + 'match_type, re_type') # ============================================================================ @@ -23,6 +23,7 @@ class FuzzyMatcher(object): DEFAULT_FILTER = ['urlkey:{0}'] DEFAULT_MATCH_TYPE = 'prefix' DEFAULT_REPLACE_AFTER = '?' + DEFAULT_RE_TYPE = 'search' FUZZY_SKIP_PARAMS = ('alt_url', 'reverse', 'closest', 'end_key', 'url', 'matchType', 'filter') @@ -58,16 +59,16 @@ class FuzzyMatcher(object): replace_after = self.DEFAULT_REPLACE_AFTER filter_str = self.DEFAULT_FILTER match_type = self.DEFAULT_MATCH_TYPE - find_all = False + re_type = self.DEFAULT_RE_TYPE else: regex = self.make_regex(config.get('match')) replace_after = config.get('replace', self.DEFAULT_REPLACE_AFTER) filter_str = config.get('filter', self.DEFAULT_FILTER) match_type = config.get('type', self.DEFAULT_MATCH_TYPE) - find_all = config.get('find_all', False) + re_type = config.get('re_type', self.DEFAULT_RE_TYPE) - return FuzzyRule(url_prefix, regex, replace_after, filter_str, match_type, find_all) + return FuzzyRule(url_prefix, regex, replace_after, filter_str, match_type, re_type) def get_fuzzy_match(self, urlkey, url, params): filters = set() @@ -78,9 +79,12 @@ class FuzzyMatcher(object): continue groups = None - if rule.find_all: + if rule.re_type == 'findall': groups = rule.regex.findall(urlkey) - else: + if rule.re_type == 'sub': + matched_rule = rule + break + elif rule.re_type == 'search': m = rule.regex.search(urlkey) groups = m and m.groups() @@ -102,7 +106,7 @@ class FuzzyMatcher(object): no_filters = (not filters or filters == {'urlkey:'}) and (matched_rule.replace_after == '?') inx = url.find(matched_rule.replace_after) - if inx > 0: + if inx > 0 and matched_rule.re_type != 'sub': length = inx + len(matched_rule.replace_after) # don't include trailing '?' 
for default filter if no_filters: @@ -111,13 +115,17 @@ class FuzzyMatcher(object): if url[length - 1] == '/': length -= 1 url = url[:length] - elif not no_filters: + elif not no_filters and matched_rule.re_type != 'sub': url += matched_rule.replace_after[0] if matched_rule.match_type == 'domain': host = urlsplit(url).netloc url = host.split('.', 1)[1] + if matched_rule.re_type == 'sub': + filters = {'urlkey:'} + url = re.sub(rule.regex, rule.replace_after, url) + fuzzy_params = {'url': url, 'matchType': matched_rule.match_type, 'filter': filters, diff --git a/pywb/warcserver/index/test/test_fuzzymatcher.py b/pywb/warcserver/index/test/test_fuzzymatcher.py index 13b17976..a5b14d5b 100644 --- a/pywb/warcserver/index/test/test_fuzzymatcher.py +++ b/pywb/warcserver/index/test/test_fuzzymatcher.py @@ -234,3 +234,10 @@ class TestFuzzy(object): params = self.get_params(url, actual_url, mime='application/x-shockwave-flash') cdx_iter, errs = self.fuzzy(self.source, params) assert list(cdx_iter) == [] + + def test_fuzzy_sub_replacement(self): + url = 'https://example.com/matched' + actual_url = 'https://example.com/replaced' + params = self.get_params(url, actual_url) + cdx_iter, errs = self.fuzzy(self.source, params) + assert list(cdx_iter) == self.get_expected(actual_url)