mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
rules and fuzzy match fix:
- rules: fix rule from regex '~' switch, add test
- fuzzymatch filters: use set instead of list to avoid duplicates
This commit is contained in:
parent
30be6f2e4c
commit
9d681d1a8a
@ -303,7 +303,7 @@ rules:
|
|||||||
#- mime
|
#- mime
|
||||||
|
|
||||||
filter:
|
filter:
|
||||||
- '~urlkey:{0}'
|
- 'urlkey:{0}'
|
||||||
- '!mimetype:text/plain'
|
- '!mimetype:text/plain'
|
||||||
|
|
||||||
type: 'domain'
|
type: 'domain'
|
||||||
|
@ -64,7 +64,7 @@ class FuzzyMatcher(object):
|
|||||||
return FuzzyRule(url_prefix, regex, replace_after, filter_str, match_type)
|
return FuzzyRule(url_prefix, regex, replace_after, filter_str, match_type)
|
||||||
|
|
||||||
def get_fuzzy_match(self, urlkey, params):
|
def get_fuzzy_match(self, urlkey, params):
|
||||||
filters = []
|
filters = set()
|
||||||
matched_rule = None
|
matched_rule = None
|
||||||
|
|
||||||
for rule in self.rules:
|
for rule in self.rules:
|
||||||
@ -78,7 +78,7 @@ class FuzzyMatcher(object):
|
|||||||
matched_rule = rule
|
matched_rule = rule
|
||||||
for g in m.groups():
|
for g in m.groups():
|
||||||
for f in matched_rule.filter_str:
|
for f in matched_rule.filter_str:
|
||||||
filters.append(f.format(g))
|
filters.add(f.format(g))
|
||||||
|
|
||||||
break
|
break
|
||||||
|
|
||||||
|
@ -36,7 +36,7 @@ class TestFuzzy(object):
|
|||||||
return params
|
return params
|
||||||
|
|
||||||
def get_expected(self, url, mime='text/html', filters=None):
|
def get_expected(self, url, mime='text/html', filters=None):
|
||||||
filters = filters or ['urlkey:']
|
filters = filters or {'urlkey:'}
|
||||||
exp = [{'filter': filters,
|
exp = [{'filter': filters,
|
||||||
'is_fuzzy': True,
|
'is_fuzzy': True,
|
||||||
'urlkey': canonicalize(url),
|
'urlkey': canonicalize(url),
|
||||||
@ -102,12 +102,23 @@ class TestFuzzy(object):
|
|||||||
cdx_iter, errs = self.fuzzy(self.source, params)
|
cdx_iter, errs = self.fuzzy(self.source, params)
|
||||||
assert list(cdx_iter) == self.get_expected(actual_url)
|
assert list(cdx_iter) == self.get_expected(actual_url)
|
||||||
|
|
||||||
def test_fuzzy_custom_rule(self):
|
def test_fuzzy_custom_rule_yt(self):
|
||||||
url = 'http://youtube.com/get_video_info?a=b&html5=true&___abc=123&video_id=ABCD&id=1234'
|
url = 'http://youtube.com/get_video_info?a=b&html5=true&___abc=123&video_id=ABCD&id=1234'
|
||||||
actual_url = 'http://youtube.com/get_video_info?a=d&html5=true&___abc=125&video_id=ABCD&id=1234'
|
actual_url = 'http://youtube.com/get_video_info?a=d&html5=true&___abc=125&video_id=ABCD&id=1234'
|
||||||
params = self.get_params(url, actual_url)
|
params = self.get_params(url, actual_url)
|
||||||
cdx_iter, errs = self.fuzzy(self.source, params)
|
cdx_iter, errs = self.fuzzy(self.source, params)
|
||||||
filters = ['urlkey:html5=true', 'urlkey:video_id=abcd']
|
filters = {'urlkey:html5=true', 'urlkey:video_id=abcd'}
|
||||||
|
assert list(cdx_iter) == self.get_expected(url=actual_url, filters=filters)
|
||||||
|
|
||||||
|
def test_fuzzy_custom_rule_yt_2(self):
|
||||||
|
url = 'https://r1---sn-xyz.googlevideo.com/videoplayback?id=ABCDEFG&itag=22&food=abc'
|
||||||
|
actual_url = 'https://r1---sn-abcdefg.googlevideo.com/videoplayback?id=ABCDEFG&itag=22&foo=abc&_1=2'
|
||||||
|
params = self.get_params(url, actual_url)
|
||||||
|
cdx_iter, errs = self.fuzzy(self.source, params)
|
||||||
|
filters = {'urlkey:id=abcdefg',
|
||||||
|
'urlkey:itag=22',
|
||||||
|
'!mimetype:text/plain'}
|
||||||
|
|
||||||
assert list(cdx_iter) == self.get_expected(url=actual_url, filters=filters)
|
assert list(cdx_iter) == self.get_expected(url=actual_url, filters=filters)
|
||||||
|
|
||||||
def test_no_fuzzy_custom_rule_video_id_diff(self):
|
def test_no_fuzzy_custom_rule_video_id_diff(self):
|
||||||
@ -159,3 +170,4 @@ class TestFuzzy(object):
|
|||||||
cdx_iter, errs = self.fuzzy(self.source, params)
|
cdx_iter, errs = self.fuzzy(self.source, params)
|
||||||
assert list(cdx_iter) == []
|
assert list(cdx_iter) == []
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user