mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
Less aggressive fuzzy matching on mime type. (#362)
* When a mime type match is made, also match on the extension, in order to be less aggressive when matching prefix matches. * fuzzy matching: further restrict fuzzy matching on a mime or ext match by ensuring that the matched result differs only by its query
This commit is contained in:
parent
5476d75294
commit
5f938e6879
@ -186,15 +186,23 @@ class FuzzyMatcher(object):
|
||||
yield cdx
|
||||
|
||||
def match_general_fuzzy_query(self, url, urlkey, cdx, rx_cache):
|
||||
# check ext
|
||||
ext = self.get_ext(url)
|
||||
if ext and ext not in self.default_filters['not_exts']:
|
||||
return True
|
||||
check_query = False
|
||||
url_no_query, ext = self.get_ext(url)
|
||||
|
||||
# check mime
|
||||
mime = cdx.get('mime')
|
||||
if mime and mime in self.default_filters['mimes']:
|
||||
return True
|
||||
# check ext
|
||||
if ext and ext not in self.default_filters['not_exts']:
|
||||
check_query = True
|
||||
|
||||
else:
|
||||
# check mime
|
||||
mime = cdx.get('mime')
|
||||
if mime and mime in self.default_filters['mimes']:
|
||||
check_query = True
|
||||
|
||||
# if check_query, ensure matched url starts with original prefix, only differs by query
|
||||
if check_query:
|
||||
if cdx['url'] == url_no_query or cdx['url'].startswith(url_no_query + '?'):
|
||||
return True
|
||||
|
||||
match_urlkey = cdx['urlkey']
|
||||
|
||||
@ -215,5 +223,6 @@ class FuzzyMatcher(object):
|
||||
def get_ext(self, url):
|
||||
# check last path segment
|
||||
# if contains '.', likely a file, so fuzzy match!
|
||||
last_path = url.split('?', 1)[0].rsplit('/', 1)[-1]
|
||||
return os.path.splitext(last_path)[1][1:]
|
||||
url_no_query = url.split('?', 1)[0]
|
||||
last_path = url_no_query.rsplit('/', 1)[-1]
|
||||
return url_no_query, os.path.splitext(last_path)[1][1:]
|
||||
|
@ -133,6 +133,34 @@ class TestFuzzy(object):
|
||||
|
||||
assert list(cdx_iter) == self.get_expected(url=actual_url, filters=filters)
|
||||
|
||||
def test_fuzzy_bar_baz_with_ext(self):
|
||||
url = 'http://example.com/foo/bar.png?abc'
|
||||
actual_url = 'http://example.com/foo/bar.png'
|
||||
params = self.get_params(url, actual_url)
|
||||
cdx_iter, errs = self.fuzzy(self.source, params)
|
||||
assert list(cdx_iter) == self.get_expected(url=actual_url)
|
||||
|
||||
def test_fuzzy_bar_baz_with_ext_2(self):
|
||||
url = 'http://example.com/foo/bar.png?abc'
|
||||
actual_url = 'http://example.com/foo/bar.png?def'
|
||||
params = self.get_params(url, actual_url)
|
||||
cdx_iter, errs = self.fuzzy(self.source, params)
|
||||
assert list(cdx_iter) == self.get_expected(url=actual_url)
|
||||
|
||||
def test_fuzzy_bar_baz_with_ext_3(self):
|
||||
url = 'http://example.com/foo/bar.png'
|
||||
actual_url = 'http://example.com/foo/bar.png?xyz'
|
||||
params = self.get_params(url, actual_url)
|
||||
cdx_iter, errs = self.fuzzy(self.source, params)
|
||||
assert list(cdx_iter) == self.get_expected(url=actual_url)
|
||||
|
||||
def test_no_fuzzy_bar_baz_with_ext(self):
|
||||
url = 'http://example.com/foo/bar.png?abc'
|
||||
actual_url = 'http://example.com/foo/bar'
|
||||
params = self.get_params(url, actual_url)
|
||||
cdx_iter, errs = self.fuzzy(self.source, params)
|
||||
assert list(cdx_iter) == []
|
||||
|
||||
def test_no_fuzzy_disabled(self):
|
||||
url = 'http://example.com/?_=123'
|
||||
actual_url = 'http://example.com/'
|
||||
@ -190,4 +218,16 @@ class TestFuzzy(object):
|
||||
cdx_iter, errs = self.fuzzy(self.source, params)
|
||||
assert list(cdx_iter) == []
|
||||
|
||||
def test_no_fuzzy_bar_baz(self):
|
||||
url = 'http://example.com/foo/bar'
|
||||
actual_url = 'http://example.com/foo/bas'
|
||||
params = self.get_params(url, actual_url)
|
||||
cdx_iter, errs = self.fuzzy(self.source, params)
|
||||
assert list(cdx_iter) == []
|
||||
|
||||
def test_fuzzy_no_deep_path_mime_match(self):
|
||||
url = 'http://www.website.co.br/~dinosaurs/t'
|
||||
actual_url = 'http://www.website.co.br/~dinosaurs/t/path2/deep-down/what.swf'
|
||||
params = self.get_params(url, actual_url, mime='application/x-shockwave-flash')
|
||||
cdx_iter, errs = self.fuzzy(self.source, params)
|
||||
assert list(cdx_iter) == []
|
||||
|
Loading…
x
Reference in New Issue
Block a user