diff --git a/pywb/warcserver/index/fuzzymatcher.py b/pywb/warcserver/index/fuzzymatcher.py index 23c16e8f..b758b3ec 100644 --- a/pywb/warcserver/index/fuzzymatcher.py +++ b/pywb/warcserver/index/fuzzymatcher.py @@ -186,15 +186,23 @@ class FuzzyMatcher(object): yield cdx def match_general_fuzzy_query(self, url, urlkey, cdx, rx_cache): - # check ext - ext = self.get_ext(url) - if ext and ext not in self.default_filters['not_exts']: - return True + check_query = False + url_no_query, ext = self.get_ext(url) - # check mime - mime = cdx.get('mime') - if mime and mime in self.default_filters['mimes']: - return True + # check ext + if ext and ext not in self.default_filters['not_exts']: + check_query = True + + else: + # check mime + mime = cdx.get('mime') + if mime and mime in self.default_filters['mimes']: + check_query = True + + # if check_query, ensure matched url starts with original prefix, only differs by query + if check_query: + if cdx['url'] == url_no_query or cdx['url'].startswith(url_no_query + '?'): + return True match_urlkey = cdx['urlkey'] @@ -215,5 +223,6 @@ class FuzzyMatcher(object): def get_ext(self, url): # check last path segment # if contains '.', likely a file, so fuzzy match! - last_path = url.split('?', 1)[0].rsplit('/', 1)[-1] - return os.path.splitext(last_path)[1][1:] + url_no_query = url.split('?', 1)[0] + last_path = url_no_query.rsplit('/', 1)[-1] + return url_no_query, os.path.splitext(last_path)[1][1:] diff --git a/pywb/warcserver/index/test/test_fuzzymatcher.py b/pywb/warcserver/index/test/test_fuzzymatcher.py index b309ae6e..a9b1cd72 100644 --- a/pywb/warcserver/index/test/test_fuzzymatcher.py +++ b/pywb/warcserver/index/test/test_fuzzymatcher.py @@ -133,6 +133,34 @@ class TestFuzzy(object): assert list(cdx_iter) == self.get_expected(url=actual_url, filters=filters) + def test_fuzzy_bar_baz_with_ext(self): + url = 'http://example.com/foo/bar.png?abc' + actual_url = 'http://example.com/foo/bar.png' + params = self.get_params(url, actual_url) + cdx_iter, errs = self.fuzzy(self.source, params) + assert list(cdx_iter) == self.get_expected(url=actual_url) + + def test_fuzzy_bar_baz_with_ext_2(self): + url = 'http://example.com/foo/bar.png?abc' + actual_url = 'http://example.com/foo/bar.png?def' + params = self.get_params(url, actual_url) + cdx_iter, errs = self.fuzzy(self.source, params) + assert list(cdx_iter) == self.get_expected(url=actual_url) + + def test_fuzzy_bar_baz_with_ext_3(self): + url = 'http://example.com/foo/bar.png' + actual_url = 'http://example.com/foo/bar.png?xyz' + params = self.get_params(url, actual_url) + cdx_iter, errs = self.fuzzy(self.source, params) + assert list(cdx_iter) == self.get_expected(url=actual_url) + + def test_no_fuzzy_bar_baz_with_ext(self): + url = 'http://example.com/foo/bar.png?abc' + actual_url = 'http://example.com/foo/bar' + params = self.get_params(url, actual_url) + cdx_iter, errs = self.fuzzy(self.source, params) + assert list(cdx_iter) == [] + def test_no_fuzzy_disabled(self): url = 'http://example.com/?_=123' actual_url = 'http://example.com/' @@ -190,4 +218,16 @@ class TestFuzzy(object): cdx_iter, errs = self.fuzzy(self.source, params) assert list(cdx_iter) == [] + def test_no_fuzzy_bar_baz(self): + url = 'http://example.com/foo/bar' + actual_url = 'http://example.com/foo/bas' + params = self.get_params(url, actual_url) + cdx_iter, errs = self.fuzzy(self.source, params) + assert list(cdx_iter) == [] + def test_fuzzy_no_deep_path_mime_match(self): + url = 'http://www.website.co.br/~dinosaurs/t' + actual_url = 'http://www.website.co.br/~dinosaurs/t/path2/deep-down/what.swf' + params = self.get_params(url, actual_url, mime='application/x-shockwave-flash') + cdx_iter, errs = self.fuzzy(self.source, params) + assert list(cdx_iter) == []