diff --git a/pywb/rules.yaml b/pywb/rules.yaml index afdb8f20..b3bc0720 100644 --- a/pywb/rules.yaml +++ b/pywb/rules.yaml @@ -110,7 +110,7 @@ rules: fuzzy_lookup: match: '("(?:cursor|cursorindex)":["\d\w]+)' - find_all: true + re_type: findall - url_prefix: 'com,facebook)/ajax/pagelet/generic.php/profiletimeline' fuzzy_lookup: 'com,facebook\)/.*[?&](__adt=[^&]+).*[&]data=(?:.*?(?:[&]|(profile_id|pagelet_token)[^,]+))' @@ -175,7 +175,7 @@ rules: fuzzy_lookup: match: '("q[\d]+":|after:\\"[^"]+)' - find_all: true + re_type: findall - url_prefix: 'com,facebook)/pages_reaction_units/more' @@ -538,6 +538,12 @@ rules: rewrite: js_rewrite_location: urls + - url_prefix: 'com,example)/matched' + fuzzy_lookup: + re_type: sub + match: 'matched' + replace: 'replaced' + # all domain rules -- fallback to this dataset #================================================================= # Applies to all urls -- should be last diff --git a/pywb/warcserver/index/fuzzymatcher.py b/pywb/warcserver/index/fuzzymatcher.py index dac755d9..786d1b60 100644 --- a/pywb/warcserver/index/fuzzymatcher.py +++ b/pywb/warcserver/index/fuzzymatcher.py @@ -15,7 +15,7 @@ from collections import namedtuple # ============================================================================ FuzzyRule = namedtuple('FuzzyRule', 'url_prefix, regex, replace_after, filter_str, ' + - 'match_type, find_all') + 'match_type, re_type') # ============================================================================ @@ -23,6 +23,7 @@ class FuzzyMatcher(object): DEFAULT_FILTER = ['urlkey:{0}'] DEFAULT_MATCH_TYPE = 'prefix' DEFAULT_REPLACE_AFTER = '?' + DEFAULT_RE_TYPE = 'search' FUZZY_SKIP_PARAMS = ('alt_url', 'reverse', 'closest', 'end_key', 'url', 'matchType', 'filter') @@ -58,16 +59,16 @@ class FuzzyMatcher(object): replace_after = self.DEFAULT_REPLACE_AFTER filter_str = self.DEFAULT_FILTER match_type = self.DEFAULT_MATCH_TYPE - find_all = False + re_type = self.DEFAULT_RE_TYPE else: regex = self.make_regex(config.get('match')) replace_after = config.get('replace', self.DEFAULT_REPLACE_AFTER) filter_str = config.get('filter', self.DEFAULT_FILTER) match_type = config.get('type', self.DEFAULT_MATCH_TYPE) - find_all = config.get('find_all', False) + re_type = config.get('re_type', self.DEFAULT_RE_TYPE) - return FuzzyRule(url_prefix, regex, replace_after, filter_str, match_type, find_all) + return FuzzyRule(url_prefix, regex, replace_after, filter_str, match_type, re_type) def get_fuzzy_match(self, urlkey, url, params): filters = set() @@ -78,9 +79,12 @@ class FuzzyMatcher(object): continue groups = None - if rule.find_all: + if rule.re_type == 'findall': groups = rule.regex.findall(urlkey) - else: + if rule.re_type == 'sub': + matched_rule = rule + break + elif rule.re_type == 'search': m = rule.regex.search(urlkey) groups = m and m.groups() @@ -102,7 +106,7 @@ class FuzzyMatcher(object): no_filters = (not filters or filters == {'urlkey:'}) and (matched_rule.replace_after == '?') inx = url.find(matched_rule.replace_after) - if inx > 0: + if inx > 0 and matched_rule.re_type != 'sub': length = inx + len(matched_rule.replace_after) # don't include trailing '?' for default filter if no_filters: @@ -111,13 +115,17 @@ class FuzzyMatcher(object): if url[length - 1] == '/': length -= 1 url = url[:length] - elif not no_filters: + elif not no_filters and matched_rule.re_type != 'sub': url += matched_rule.replace_after[0] if matched_rule.match_type == 'domain': host = urlsplit(url).netloc url = host.split('.', 1)[1] + if matched_rule.re_type == 'sub': + filters = {'urlkey:'} + url = re.sub(rule.regex, rule.replace_after, url) + fuzzy_params = {'url': url, 'matchType': matched_rule.match_type, 'filter': filters, diff --git a/pywb/warcserver/index/test/test_fuzzymatcher.py b/pywb/warcserver/index/test/test_fuzzymatcher.py index 13b17976..a5b14d5b 100644 --- a/pywb/warcserver/index/test/test_fuzzymatcher.py +++ b/pywb/warcserver/index/test/test_fuzzymatcher.py @@ -234,3 +234,10 @@ class TestFuzzy(object): params = self.get_params(url, actual_url, mime='application/x-shockwave-flash') cdx_iter, errs = self.fuzzy(self.source, params) assert list(cdx_iter) == [] + + def test_fuzzy_sub_replacement(self): + url = 'https://example.com/matched' + actual_url = 'https://example.com/replaced' + params = self.get_params(url, actual_url) + cdx_iter, errs = self.fuzzy(self.source, params) + assert list(cdx_iter) == self.get_expected(actual_url)