mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
Less aggressive fuzzy matching on mime type. (#362)
* When a mime type match is made, also match on the extension, so that prefix matching is less aggressive.
* Fuzzy matching: further restrict fuzzy matches made on mime type or extension by ensuring the matched result differs from the requested URL only by its query.
This commit is contained in:
parent
5476d75294
commit
5f938e6879
@ -186,15 +186,23 @@ class FuzzyMatcher(object):
|
|||||||
yield cdx
|
yield cdx
|
||||||
|
|
||||||
def match_general_fuzzy_query(self, url, urlkey, cdx, rx_cache):
|
def match_general_fuzzy_query(self, url, urlkey, cdx, rx_cache):
|
||||||
# check ext
|
check_query = False
|
||||||
ext = self.get_ext(url)
|
url_no_query, ext = self.get_ext(url)
|
||||||
if ext and ext not in self.default_filters['not_exts']:
|
|
||||||
return True
|
|
||||||
|
|
||||||
# check mime
|
# check ext
|
||||||
mime = cdx.get('mime')
|
if ext and ext not in self.default_filters['not_exts']:
|
||||||
if mime and mime in self.default_filters['mimes']:
|
check_query = True
|
||||||
return True
|
|
||||||
|
else:
|
||||||
|
# check mime
|
||||||
|
mime = cdx.get('mime')
|
||||||
|
if mime and mime in self.default_filters['mimes']:
|
||||||
|
check_query = True
|
||||||
|
|
||||||
|
# if check_query, ensure matched url starts with original prefix, only differs by query
|
||||||
|
if check_query:
|
||||||
|
if cdx['url'] == url_no_query or cdx['url'].startswith(url_no_query + '?'):
|
||||||
|
return True
|
||||||
|
|
||||||
match_urlkey = cdx['urlkey']
|
match_urlkey = cdx['urlkey']
|
||||||
|
|
||||||
@ -215,5 +223,6 @@ class FuzzyMatcher(object):
|
|||||||
def get_ext(self, url):
|
def get_ext(self, url):
|
||||||
# check last path segment
|
# check last path segment
|
||||||
# if contains '.', likely a file, so fuzzy match!
|
# if contains '.', likely a file, so fuzzy match!
|
||||||
last_path = url.split('?', 1)[0].rsplit('/', 1)[-1]
|
url_no_query = url.split('?', 1)[0]
|
||||||
return os.path.splitext(last_path)[1][1:]
|
last_path = url_no_query.rsplit('/', 1)[-1]
|
||||||
|
return url_no_query, os.path.splitext(last_path)[1][1:]
|
||||||
|
@ -133,6 +133,34 @@ class TestFuzzy(object):
|
|||||||
|
|
||||||
assert list(cdx_iter) == self.get_expected(url=actual_url, filters=filters)
|
assert list(cdx_iter) == self.get_expected(url=actual_url, filters=filters)
|
||||||
|
|
||||||
|
def test_fuzzy_bar_baz_with_ext(self):
|
||||||
|
url = 'http://example.com/foo/bar.png?abc'
|
||||||
|
actual_url = 'http://example.com/foo/bar.png'
|
||||||
|
params = self.get_params(url, actual_url)
|
||||||
|
cdx_iter, errs = self.fuzzy(self.source, params)
|
||||||
|
assert list(cdx_iter) == self.get_expected(url=actual_url)
|
||||||
|
|
||||||
|
def test_fuzzy_bar_baz_with_ext_2(self):
|
||||||
|
url = 'http://example.com/foo/bar.png?abc'
|
||||||
|
actual_url = 'http://example.com/foo/bar.png?def'
|
||||||
|
params = self.get_params(url, actual_url)
|
||||||
|
cdx_iter, errs = self.fuzzy(self.source, params)
|
||||||
|
assert list(cdx_iter) == self.get_expected(url=actual_url)
|
||||||
|
|
||||||
|
def test_fuzzy_bar_baz_with_ext_3(self):
|
||||||
|
url = 'http://example.com/foo/bar.png'
|
||||||
|
actual_url = 'http://example.com/foo/bar.png?xyz'
|
||||||
|
params = self.get_params(url, actual_url)
|
||||||
|
cdx_iter, errs = self.fuzzy(self.source, params)
|
||||||
|
assert list(cdx_iter) == self.get_expected(url=actual_url)
|
||||||
|
|
||||||
|
def test_no_fuzzy_bar_baz_with_ext(self):
|
||||||
|
url = 'http://example.com/foo/bar.png?abc'
|
||||||
|
actual_url = 'http://example.com/foo/bar'
|
||||||
|
params = self.get_params(url, actual_url)
|
||||||
|
cdx_iter, errs = self.fuzzy(self.source, params)
|
||||||
|
assert list(cdx_iter) == []
|
||||||
|
|
||||||
def test_no_fuzzy_disabled(self):
|
def test_no_fuzzy_disabled(self):
|
||||||
url = 'http://example.com/?_=123'
|
url = 'http://example.com/?_=123'
|
||||||
actual_url = 'http://example.com/'
|
actual_url = 'http://example.com/'
|
||||||
@ -190,4 +218,16 @@ class TestFuzzy(object):
|
|||||||
cdx_iter, errs = self.fuzzy(self.source, params)
|
cdx_iter, errs = self.fuzzy(self.source, params)
|
||||||
assert list(cdx_iter) == []
|
assert list(cdx_iter) == []
|
||||||
|
|
||||||
|
def test_no_fuzzy_bar_baz(self):
|
||||||
|
url = 'http://example.com/foo/bar'
|
||||||
|
actual_url = 'http://example.com/foo/bas'
|
||||||
|
params = self.get_params(url, actual_url)
|
||||||
|
cdx_iter, errs = self.fuzzy(self.source, params)
|
||||||
|
assert list(cdx_iter) == []
|
||||||
|
|
||||||
|
def test_fuzzy_no_deep_path_mime_match(self):
|
||||||
|
url = 'http://www.website.co.br/~dinosaurs/t'
|
||||||
|
actual_url = 'http://www.website.co.br/~dinosaurs/t/path2/deep-down/what.swf'
|
||||||
|
params = self.get_params(url, actual_url, mime='application/x-shockwave-flash')
|
||||||
|
cdx_iter, errs = self.fuzzy(self.source, params)
|
||||||
|
assert list(cdx_iter) == []
|
||||||
|
Loading…
x
Reference in New Issue
Block a user