mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
feat: regex substitution on surt rules match (#780)
Substitution functionality already exists at a global level for matched rules, but this causes issues when rule sets conflict in their desired outcome. This change enables setting regex substitution at the rule level to avoid these conflicts.
This commit is contained in:
parent
0758e81b62
commit
59d9beac05
@ -110,7 +110,7 @@ rules:
|
|||||||
|
|
||||||
fuzzy_lookup:
|
fuzzy_lookup:
|
||||||
match: '("(?:cursor|cursorindex)":["\d\w]+)'
|
match: '("(?:cursor|cursorindex)":["\d\w]+)'
|
||||||
find_all: true
|
re_type: findall
|
||||||
|
|
||||||
- url_prefix: 'com,facebook)/ajax/pagelet/generic.php/profiletimeline'
|
- url_prefix: 'com,facebook)/ajax/pagelet/generic.php/profiletimeline'
|
||||||
fuzzy_lookup: 'com,facebook\)/.*[?&](__adt=[^&]+).*[&]data=(?:.*?(?:[&]|(profile_id|pagelet_token)[^,]+))'
|
fuzzy_lookup: 'com,facebook\)/.*[?&](__adt=[^&]+).*[&]data=(?:.*?(?:[&]|(profile_id|pagelet_token)[^,]+))'
|
||||||
@ -175,7 +175,7 @@ rules:
|
|||||||
|
|
||||||
fuzzy_lookup:
|
fuzzy_lookup:
|
||||||
match: '("q[\d]+":|after:\\"[^"]+)'
|
match: '("q[\d]+":|after:\\"[^"]+)'
|
||||||
find_all: true
|
re_type: findall
|
||||||
|
|
||||||
- url_prefix: 'com,facebook)/pages_reaction_units/more'
|
- url_prefix: 'com,facebook)/pages_reaction_units/more'
|
||||||
|
|
||||||
@ -538,6 +538,12 @@ rules:
|
|||||||
rewrite:
|
rewrite:
|
||||||
js_rewrite_location: urls
|
js_rewrite_location: urls
|
||||||
|
|
||||||
|
- url_prefix: 'com,example)/matched'
|
||||||
|
fuzzy_lookup:
|
||||||
|
re_type: sub
|
||||||
|
match: 'matched'
|
||||||
|
replace: 'replaced'
|
||||||
|
|
||||||
# all domain rules -- fallback to this dataset
|
# all domain rules -- fallback to this dataset
|
||||||
#=================================================================
|
#=================================================================
|
||||||
# Applies to all urls -- should be last
|
# Applies to all urls -- should be last
|
||||||
|
@ -15,7 +15,7 @@ from collections import namedtuple
|
|||||||
# ============================================================================
|
# ============================================================================
|
||||||
FuzzyRule = namedtuple('FuzzyRule',
|
FuzzyRule = namedtuple('FuzzyRule',
|
||||||
'url_prefix, regex, replace_after, filter_str, ' +
|
'url_prefix, regex, replace_after, filter_str, ' +
|
||||||
'match_type, find_all')
|
'match_type, re_type')
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
@ -23,6 +23,7 @@ class FuzzyMatcher(object):
|
|||||||
DEFAULT_FILTER = ['urlkey:{0}']
|
DEFAULT_FILTER = ['urlkey:{0}']
|
||||||
DEFAULT_MATCH_TYPE = 'prefix'
|
DEFAULT_MATCH_TYPE = 'prefix'
|
||||||
DEFAULT_REPLACE_AFTER = '?'
|
DEFAULT_REPLACE_AFTER = '?'
|
||||||
|
DEFAULT_RE_TYPE = 'search'
|
||||||
|
|
||||||
FUZZY_SKIP_PARAMS = ('alt_url', 'reverse', 'closest', 'end_key',
|
FUZZY_SKIP_PARAMS = ('alt_url', 'reverse', 'closest', 'end_key',
|
||||||
'url', 'matchType', 'filter')
|
'url', 'matchType', 'filter')
|
||||||
@ -58,16 +59,16 @@ class FuzzyMatcher(object):
|
|||||||
replace_after = self.DEFAULT_REPLACE_AFTER
|
replace_after = self.DEFAULT_REPLACE_AFTER
|
||||||
filter_str = self.DEFAULT_FILTER
|
filter_str = self.DEFAULT_FILTER
|
||||||
match_type = self.DEFAULT_MATCH_TYPE
|
match_type = self.DEFAULT_MATCH_TYPE
|
||||||
find_all = False
|
re_type = self.DEFAULT_RE_TYPE
|
||||||
|
|
||||||
else:
|
else:
|
||||||
regex = self.make_regex(config.get('match'))
|
regex = self.make_regex(config.get('match'))
|
||||||
replace_after = config.get('replace', self.DEFAULT_REPLACE_AFTER)
|
replace_after = config.get('replace', self.DEFAULT_REPLACE_AFTER)
|
||||||
filter_str = config.get('filter', self.DEFAULT_FILTER)
|
filter_str = config.get('filter', self.DEFAULT_FILTER)
|
||||||
match_type = config.get('type', self.DEFAULT_MATCH_TYPE)
|
match_type = config.get('type', self.DEFAULT_MATCH_TYPE)
|
||||||
find_all = config.get('find_all', False)
|
re_type = config.get('re_type', self.DEFAULT_RE_TYPE)
|
||||||
|
|
||||||
return FuzzyRule(url_prefix, regex, replace_after, filter_str, match_type, find_all)
|
return FuzzyRule(url_prefix, regex, replace_after, filter_str, match_type, re_type)
|
||||||
|
|
||||||
def get_fuzzy_match(self, urlkey, url, params):
|
def get_fuzzy_match(self, urlkey, url, params):
|
||||||
filters = set()
|
filters = set()
|
||||||
@ -78,9 +79,12 @@ class FuzzyMatcher(object):
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
groups = None
|
groups = None
|
||||||
if rule.find_all:
|
if rule.re_type == 'findall':
|
||||||
groups = rule.regex.findall(urlkey)
|
groups = rule.regex.findall(urlkey)
|
||||||
else:
|
if rule.re_type == 'sub':
|
||||||
|
matched_rule = rule
|
||||||
|
break
|
||||||
|
elif rule.re_type == 'search':
|
||||||
m = rule.regex.search(urlkey)
|
m = rule.regex.search(urlkey)
|
||||||
groups = m and m.groups()
|
groups = m and m.groups()
|
||||||
|
|
||||||
@ -102,7 +106,7 @@ class FuzzyMatcher(object):
|
|||||||
no_filters = (not filters or filters == {'urlkey:'}) and (matched_rule.replace_after == '?')
|
no_filters = (not filters or filters == {'urlkey:'}) and (matched_rule.replace_after == '?')
|
||||||
|
|
||||||
inx = url.find(matched_rule.replace_after)
|
inx = url.find(matched_rule.replace_after)
|
||||||
if inx > 0:
|
if inx > 0 and matched_rule.re_type != 'sub':
|
||||||
length = inx + len(matched_rule.replace_after)
|
length = inx + len(matched_rule.replace_after)
|
||||||
# don't include trailing '?' for default filter
|
# don't include trailing '?' for default filter
|
||||||
if no_filters:
|
if no_filters:
|
||||||
@ -111,13 +115,17 @@ class FuzzyMatcher(object):
|
|||||||
if url[length - 1] == '/':
|
if url[length - 1] == '/':
|
||||||
length -= 1
|
length -= 1
|
||||||
url = url[:length]
|
url = url[:length]
|
||||||
elif not no_filters:
|
elif not no_filters and matched_rule.re_type != 'sub':
|
||||||
url += matched_rule.replace_after[0]
|
url += matched_rule.replace_after[0]
|
||||||
|
|
||||||
if matched_rule.match_type == 'domain':
|
if matched_rule.match_type == 'domain':
|
||||||
host = urlsplit(url).netloc
|
host = urlsplit(url).netloc
|
||||||
url = host.split('.', 1)[1]
|
url = host.split('.', 1)[1]
|
||||||
|
|
||||||
|
if matched_rule.re_type == 'sub':
|
||||||
|
filters = {'urlkey:'}
|
||||||
|
url = re.sub(rule.regex, rule.replace_after, url)
|
||||||
|
|
||||||
fuzzy_params = {'url': url,
|
fuzzy_params = {'url': url,
|
||||||
'matchType': matched_rule.match_type,
|
'matchType': matched_rule.match_type,
|
||||||
'filter': filters,
|
'filter': filters,
|
||||||
|
@ -234,3 +234,10 @@ class TestFuzzy(object):
|
|||||||
params = self.get_params(url, actual_url, mime='application/x-shockwave-flash')
|
params = self.get_params(url, actual_url, mime='application/x-shockwave-flash')
|
||||||
cdx_iter, errs = self.fuzzy(self.source, params)
|
cdx_iter, errs = self.fuzzy(self.source, params)
|
||||||
assert list(cdx_iter) == []
|
assert list(cdx_iter) == []
|
||||||
|
|
||||||
|
def test_fuzzy_sub_replacement(self):
|
||||||
|
url = 'https://example.com/matched'
|
||||||
|
actual_url = 'https://example.com/replaced'
|
||||||
|
params = self.get_params(url, actual_url)
|
||||||
|
cdx_iter, errs = self.fuzzy(self.source, params)
|
||||||
|
assert list(cdx_iter) == self.get_expected(actual_url)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user