From 9d681d1a8a0c6e65d2992259cb3b16fdc9e4d93f Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 21 Oct 2017 14:39:11 -0700 Subject: [PATCH] rules and fuzzy match fix: - rules: fix rule from regex '~' switch, add test - fuzzymatch filters: use set instead of list to avoid dupes --- pywb/rules.yaml | 2 +- pywb/warcserver/index/fuzzymatcher.py | 4 ++-- .../warcserver/index/test/test_fuzzymatcher.py | 18 +++++++++++++++--- 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/pywb/rules.yaml b/pywb/rules.yaml index b24beb90..4ade4f3c 100644 --- a/pywb/rules.yaml +++ b/pywb/rules.yaml @@ -303,7 +303,7 @@ rules: #- mime filter: - - '~urlkey:{0}' + - 'urlkey:{0}' - '!mimetype:text/plain' type: 'domain' diff --git a/pywb/warcserver/index/fuzzymatcher.py b/pywb/warcserver/index/fuzzymatcher.py index eca0c490..47289401 100644 --- a/pywb/warcserver/index/fuzzymatcher.py +++ b/pywb/warcserver/index/fuzzymatcher.py @@ -64,7 +64,7 @@ class FuzzyMatcher(object): return FuzzyRule(url_prefix, regex, replace_after, filter_str, match_type) def get_fuzzy_match(self, urlkey, params): - filters = [] + filters = set() matched_rule = None for rule in self.rules: @@ -78,7 +78,7 @@ class FuzzyMatcher(object): matched_rule = rule for g in m.groups(): for f in matched_rule.filter_str: - filters.append(f.format(g)) + filters.add(f.format(g)) break diff --git a/pywb/warcserver/index/test/test_fuzzymatcher.py b/pywb/warcserver/index/test/test_fuzzymatcher.py index eea00c43..85319172 100644 --- a/pywb/warcserver/index/test/test_fuzzymatcher.py +++ b/pywb/warcserver/index/test/test_fuzzymatcher.py @@ -36,7 +36,7 @@ class TestFuzzy(object): return params def get_expected(self, url, mime='text/html', filters=None): - filters = filters or ['urlkey:'] + filters = filters or {'urlkey:'} exp = [{'filter': filters, 'is_fuzzy': True, 'urlkey': canonicalize(url), @@ -102,12 +102,23 @@ class TestFuzzy(object): cdx_iter, errs = self.fuzzy(self.source, params) assert list(cdx_iter) == self.get_expected(actual_url) - def test_fuzzy_custom_rule(self): + def test_fuzzy_custom_rule_yt(self): url = 'http://youtube.com/get_video_info?a=b&html5=true&___abc=123&video_id=ABCD&id=1234' actual_url = 'http://youtube.com/get_video_info?a=d&html5=true&___abc=125&video_id=ABCD&id=1234' params = self.get_params(url, actual_url) cdx_iter, errs = self.fuzzy(self.source, params) - filters = ['urlkey:html5=true', 'urlkey:video_id=abcd'] + filters = {'urlkey:html5=true', 'urlkey:video_id=abcd'} + assert list(cdx_iter) == self.get_expected(url=actual_url, filters=filters) + + def test_fuzzy_custom_rule_yt_2(self): + url = 'https://r1---sn-xyz.googlevideo.com/videoplayback?id=ABCDEFG&itag=22&food=abc' + actual_url = 'https://r1---sn-abcdefg.googlevideo.com/videoplayback?id=ABCDEFG&itag=22&foo=abc&_1=2' + params = self.get_params(url, actual_url) + cdx_iter, errs = self.fuzzy(self.source, params) + filters = {'urlkey:id=abcdefg', + 'urlkey:itag=22', + '!mimetype:text/plain'} + assert list(cdx_iter) == self.get_expected(url=actual_url, filters=filters) def test_no_fuzzy_custom_rule_video_id_diff(self): @@ -159,3 +170,4 @@ class TestFuzzy(object): cdx_iter, errs = self.fuzzy(self.source, params) assert list(cdx_iter) == [] +