1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

rules and fuzzy match fix:

- rules: fix filter rule left over from the regex '~' switch ('~urlkey:{0}' -> 'urlkey:{0}'), add test
- fuzzymatch filters: use a set instead of a list to avoid duplicate filters
This commit is contained in:
Ilya Kreymer 2017-10-21 14:39:11 -07:00
parent 30be6f2e4c
commit 9d681d1a8a
3 changed files with 18 additions and 6 deletions

View File

@@ -303,7 +303,7 @@ rules:
#- mime
filter:
- '~urlkey:{0}'
- 'urlkey:{0}'
- '!mimetype:text/plain'
type: 'domain'

View File

@@ -64,7 +64,7 @@ class FuzzyMatcher(object):
return FuzzyRule(url_prefix, regex, replace_after, filter_str, match_type)
def get_fuzzy_match(self, urlkey, params):
filters = []
filters = set()
matched_rule = None
for rule in self.rules:
@@ -78,7 +78,7 @@ class FuzzyMatcher(object):
matched_rule = rule
for g in m.groups():
for f in matched_rule.filter_str:
filters.append(f.format(g))
filters.add(f.format(g))
break

View File

@@ -36,7 +36,7 @@ class TestFuzzy(object):
return params
def get_expected(self, url, mime='text/html', filters=None):
filters = filters or ['urlkey:']
filters = filters or {'urlkey:'}
exp = [{'filter': filters,
'is_fuzzy': True,
'urlkey': canonicalize(url),
@@ -102,12 +102,23 @@ class TestFuzzy(object):
cdx_iter, errs = self.fuzzy(self.source, params)
assert list(cdx_iter) == self.get_expected(actual_url)
def test_fuzzy_custom_rule(self):
def test_fuzzy_custom_rule_yt(self):
url = 'http://youtube.com/get_video_info?a=b&html5=true&___abc=123&video_id=ABCD&id=1234'
actual_url = 'http://youtube.com/get_video_info?a=d&html5=true&___abc=125&video_id=ABCD&id=1234'
params = self.get_params(url, actual_url)
cdx_iter, errs = self.fuzzy(self.source, params)
filters = ['urlkey:html5=true', 'urlkey:video_id=abcd']
filters = {'urlkey:html5=true', 'urlkey:video_id=abcd'}
assert list(cdx_iter) == self.get_expected(url=actual_url, filters=filters)
def test_fuzzy_custom_rule_yt_2(self):
url = 'https://r1---sn-xyz.googlevideo.com/videoplayback?id=ABCDEFG&itag=22&food=abc'
actual_url = 'https://r1---sn-abcdefg.googlevideo.com/videoplayback?id=ABCDEFG&itag=22&foo=abc&_1=2'
params = self.get_params(url, actual_url)
cdx_iter, errs = self.fuzzy(self.source, params)
filters = {'urlkey:id=abcdefg',
'urlkey:itag=22',
'!mimetype:text/plain'}
assert list(cdx_iter) == self.get_expected(url=actual_url, filters=filters)
def test_no_fuzzy_custom_rule_video_id_diff(self):
@@ -159,3 +170,4 @@ class TestFuzzy(object):
cdx_iter, errs = self.fuzzy(self.source, params)
assert list(cdx_iter) == []