1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Less aggressive fuzzy matching on mime type. (#362)

* When a mime type match is made, also match on the extension, so that prefix matching is less aggressive.

* fuzzy matching: further restrict fuzzy matching on a mime or ext match by ensuring the matched result differs from the requested url only by its query
This commit is contained in:
John Berlin 2018-08-07 15:03:57 -04:00 committed by Ilya Kreymer
parent 5476d75294
commit 5f938e6879
2 changed files with 59 additions and 10 deletions

View File

@ -186,15 +186,23 @@ class FuzzyMatcher(object):
yield cdx
def match_general_fuzzy_query(self, url, urlkey, cdx, rx_cache):
# check ext
ext = self.get_ext(url)
if ext and ext not in self.default_filters['not_exts']:
return True
check_query = False
url_no_query, ext = self.get_ext(url)
# check mime
mime = cdx.get('mime')
if mime and mime in self.default_filters['mimes']:
return True
# check ext
if ext and ext not in self.default_filters['not_exts']:
check_query = True
else:
# check mime
mime = cdx.get('mime')
if mime and mime in self.default_filters['mimes']:
check_query = True
# if check_query, ensure matched url starts with original prefix, only differs by query
if check_query:
if cdx['url'] == url_no_query or cdx['url'].startswith(url_no_query + '?'):
return True
match_urlkey = cdx['urlkey']
@ -215,5 +223,6 @@ class FuzzyMatcher(object):
def get_ext(self, url):
# check last path segment
# if contains '.', likely a file, so fuzzy match!
# NOTE(review): this view appears to have lost the diff's +/- markers and
# indentation, so both sides of the change are shown back to back.
# The next two lines look like the removed version: strip the query string,
# take the last path segment, and return only its extension (without the dot).
last_path = url.split('?', 1)[0].rsplit('/', 1)[-1]
return os.path.splitext(last_path)[1][1:]
# The remaining lines look like the added version: the same extension logic,
# but the url with its query string stripped is kept in url_no_query and
# returned together with the extension as a (url_no_query, ext) pair.
url_no_query = url.split('?', 1)[0]
last_path = url_no_query.rsplit('/', 1)[-1]
return url_no_query, os.path.splitext(last_path)[1][1:]

View File

@ -133,6 +133,34 @@ class TestFuzzy(object):
assert list(cdx_iter) == self.get_expected(url=actual_url, filters=filters)
def test_fuzzy_bar_baz_with_ext(self):
    """Expect the lookup for ``bar.png?abc`` to yield the expected result for ``bar.png``."""
    requested = 'http://example.com/foo/bar.png?abc'
    captured = 'http://example.com/foo/bar.png'

    query_params = self.get_params(requested, captured)
    results, _errs = self.fuzzy(self.source, query_params)

    expected = self.get_expected(url=captured)
    assert list(results) == expected
def test_fuzzy_bar_baz_with_ext_2(self):
    """Expect the lookup for ``bar.png?abc`` to yield the expected result for ``bar.png?def``."""
    requested = 'http://example.com/foo/bar.png?abc'
    captured = 'http://example.com/foo/bar.png?def'

    query_params = self.get_params(requested, captured)
    results, _errs = self.fuzzy(self.source, query_params)

    expected = self.get_expected(url=captured)
    assert list(results) == expected
def test_fuzzy_bar_baz_with_ext_3(self):
    """Expect the lookup for ``bar.png`` (no query) to yield the expected result for ``bar.png?xyz``."""
    requested = 'http://example.com/foo/bar.png'
    captured = 'http://example.com/foo/bar.png?xyz'

    query_params = self.get_params(requested, captured)
    results, _errs = self.fuzzy(self.source, query_params)

    expected = self.get_expected(url=captured)
    assert list(results) == expected
def test_no_fuzzy_bar_baz_with_ext(self):
    """Expect no results when ``bar.png?abc`` is requested and the other url is ``bar`` (no extension)."""
    requested = 'http://example.com/foo/bar.png?abc'
    captured = 'http://example.com/foo/bar'

    query_params = self.get_params(requested, captured)
    results, _errs = self.fuzzy(self.source, query_params)

    matches = list(results)
    assert matches == []
def test_no_fuzzy_disabled(self):
url = 'http://example.com/?_=123'
actual_url = 'http://example.com/'
@ -190,4 +218,16 @@ class TestFuzzy(object):
cdx_iter, errs = self.fuzzy(self.source, params)
assert list(cdx_iter) == []
def test_no_fuzzy_bar_baz(self):
    """Expect no results when ``/foo/bar`` is requested and the other url is ``/foo/bas``."""
    requested = 'http://example.com/foo/bar'
    captured = 'http://example.com/foo/bas'

    query_params = self.get_params(requested, captured)
    results, _errs = self.fuzzy(self.source, query_params)

    matches = list(results)
    assert matches == []
def test_fuzzy_no_deep_path_mime_match(self):
    """Expect no results for a short path prefix when the other url is a deeper ``.swf`` path with a flash mime."""
    requested = 'http://www.website.co.br/~dinosaurs/t'
    captured = 'http://www.website.co.br/~dinosaurs/t/path2/deep-down/what.swf'

    query_params = self.get_params(
        requested,
        captured,
        mime='application/x-shockwave-flash',
    )
    results, _errs = self.fuzzy(self.source, query_params)

    matches = list(results)
    assert matches == []