1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

fuzzy match rules: to simplify custom fuzzy match use cases, add support for specifying the fuzzy match query params as a list
This commit is contained in:
Ilya Kreymer 2014-09-21 14:46:10 -07:00
parent 7ac98fbfe2
commit ec27ccfbb6
2 changed files with 44 additions and 10 deletions

View File

@ -125,12 +125,12 @@ class CDXDomainSpecificRule(BaseRule):
def __init__(self, name, config):
super(CDXDomainSpecificRule, self).__init__(name, config)
if isinstance(config, basestring):
self.regex = re.compile(config)
if not isinstance(config, dict):
self.regex = self.make_regex(config)
self.replace = None
self.filter = self.DEFAULT_FILTER
else:
self.regex = re.compile(config.get('match'))
self.regex = self.make_regex(config.get('match'))
self.replace = config.get('replace')
self.filter = config.get('filter', self.DEFAULT_FILTER)
@ -147,6 +147,35 @@ class CDXDomainSpecificRule(BaseRule):
if self.replace:
self.replace = unsurt(self.replace)
if __name__ == "__main__":
    # When executed directly as a script, run the doctests embedded in
    # this module's docstrings.
    import doctest
    doctest.testmod()
@staticmethod
def make_regex(config):
if isinstance(config, list):
string = CDXDomainSpecificRule.make_query_match_regex(config)
# assumes string
else:
string = config
return re.compile(string)
@staticmethod
def make_query_match_regex(params_list):
r"""
>>> CDXDomainSpecificRule.make_query_match_regex(['param1', 'id', 'abc'])
'(abc=[^&]+).*(id=[^&]+).*(param1=[^&]+)'
>>> CDXDomainSpecificRule.make_query_match_regex(['id[0]', 'abc()'])
'(abc\\(\\)=[^&]+).*(id\\[0\\]=[^&]+)'
"""
params_list.sort()
def conv(value):
return '({}=[^&]+)'.format(re.escape(value))
params_list = map(conv, params_list)
final_str = '.*'.join(params_list)
return final_str
if __name__ == "__main__":
    # When executed directly as a script, run the doctests embedded in
    # this module's docstrings.
    import doctest
    doctest.testmod()

View File

@ -15,12 +15,14 @@ rules:
fuzzy_lookup: 'com,facebook\)/.*[?&]data=(.*?(?:[&]|query_type[^,]+))'
- url_prefix: 'com,facebook)/ajax/ufi/'
fuzzy_lookup: '(ft_ent_identifier=[^&]+).*(lsd=[^&]+)'
fuzzy_lookup: ['ft_ent_identifier', 'lsd']
#fuzzy_lookup: '(ft_ent_identifier=[^&]+).*(lsd=[^&]+)'
- url_prefix: 'com,facebook)/ajax/chat/hovercard/sidebar.php'
fuzzy_lookup: '(ids\[0\]=[^&]+)'
fuzzy_lookup: ['ids[0]']
#fuzzy_lookup: '(ids\[0\]=[^&]+)'
- url_prefix: 'com,facebook)/ajax/'
@ -28,7 +30,8 @@ rules:
- url_prefix: 'com,facebook)/login.php'
fuzzy_lookup: '(email=[^&]+).*(lgnrnd=[^&]+).*(lsd=[^&]+)'
#fuzzy_lookup: '(email=[^&]+).*(lgnrnd=[^&]+).*(lsd=[^&]+)'
fuzzy_lookup: ['email', 'lgnrnd', 'lsd']
# not actually needed, fuzzy match is used instead here
# canonicalize:
@ -88,6 +91,8 @@ rules:
match: '(example,example,test\)/.*?)[?].*?(id=value).*'
replace: '\1?\2'
fuzzy_lookup: ['param1', 'id']
rewrite:
js_rewrite_location: False