From 41f227d8aeb8654eb5156d6fb8ef7a1abec7438c Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 9 Nov 2017 20:45:15 -0800 Subject: [PATCH] fuzzymatch fix: when fuzzy matching a prefix with a trailing '/' using the default rule, e.g. 'path/?_123', remove the trailing slash so that 'path' is matched instead of 'path/', consistent with the canonicalizer's behavior of removing trailing slashes tests: add test to verify fuzzy matching with trailing slash before query --- pywb/warcserver/index/fuzzymatcher.py | 4 ++++ tests/test_integration.py | 7 +++++++ 2 files changed, 11 insertions(+) diff --git a/pywb/warcserver/index/fuzzymatcher.py b/pywb/warcserver/index/fuzzymatcher.py index d103cbc1..46ed03e5 100644 --- a/pywb/warcserver/index/fuzzymatcher.py +++ b/pywb/warcserver/index/fuzzymatcher.py @@ -102,8 +102,12 @@ class FuzzyMatcher(object): inx = url.find(matched_rule.replace_after) if inx > 0: length = inx + len(matched_rule.replace_after) + # don't include trailing '?' for default filter if no_filters: length -= 1 + # don't include trailing '/' if match '/?' + if url[length - 1] == '/': + length -= 1 url = url[:length] elif not no_filters: url += matched_rule.replace_after[0] diff --git a/tests/test_integration.py b/tests/test_integration.py index 6cf70045..ead35c43 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -55,6 +55,13 @@ class TestWbIntegration(BaseConfigTest): # 17 Captures + header assert len(resp.html.find_all('tr')) == 18 + def test_calendar_query_fuzzy_match_add_slash(self): + # fuzzy match removing _= according to standard rules.yaml + resp = self.testapp.get('/pywb/*/http://www.iana.org/_css/2013.1/screen.css/?_=3141592653') + self._assert_basic_html(resp) + # 17 Captures + header + assert len(resp.html.find_all('tr')) == 18 + def test_calendar_not_found(self): # query with no results resp = self.testapp.get('/pywb/*/http://not-exist.example.com')