1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 08:04:49 +01:00

fuzzy matcher: fix 'not_ext' check for fuzzy matching

tests: add fuzzymatcher tests!
This commit is contained in:
Ilya Kreymer 2017-06-14 20:02:27 +01:00
parent 7dae125888
commit 837d011f56
2 changed files with 94 additions and 4 deletions

View File

@ -86,9 +86,9 @@ class FuzzyMatcher(object):
continue
matched_rule = rule
groups = m.groups()
for f in matched_rule.filter_str:
filters.append(f.format(*groups))
for g in m.groups():
for f in matched_rule.filter_str:
filters.append(f.format(g))
break
@ -190,4 +190,4 @@ class FuzzyMatcher(object):
# check last path segment
# if it contains a '.', it is likely a file, so fuzzy match!
last_path = url.split('?', 1)[0].rsplit('/', 1)[-1]
return os.path.splitext(last_path)[1]
return os.path.splitext(last_path)[1][1:]

View File

@ -0,0 +1,90 @@
from pywb.warcserver.index.fuzzymatcher import FuzzyMatcher
from pywb.utils.canonicalize import canonicalize
class EchoParamsSource(object):
    """Stub index source that echoes selected query params back as a record.

    Calling the instance returns a ``(record_iterator, errors)`` pair, with
    ``errors`` always ``None``.
    """

    # params copied from the incoming query into the echoed record
    ECHOED_FIELDS = ('key', 'mime', 'filter')

    def __call__(self, params):
        """Return one echoed record, or nothing when ``matchType`` is unset.

        :param params: dict-like query params; read only via ``.get()``.
        :returns: tuple of (iterator over zero or one dict, ``None``).
        """
        if params.get('matchType'):
            echoed = {name: params.get(name) for name in self.ECHOED_FIELDS}
            return iter([echoed]), None

        # no matchType means an exact-match query: return nothing so the
        # caller is forced to fall back to fuzzy matching
        return iter([]), None
class TestFuzzy(object):
    """Tests that run ``FuzzyMatcher`` against an ``EchoParamsSource``.

    Each test builds query params for a URL, invokes the matcher with the
    echoing source, and compares the yielded records against either an
    empty list or the record built by ``get_expected``.
    """

    @classmethod
    def setup_class(cls):
        """Create the shared echo source and the rules-backed matcher."""
        cls.source = EchoParamsSource()
        cls.fuzzy = FuzzyMatcher('pkg://pywb/rules.yaml')

    def get_params(self, url, mime='text/html'):
        """Build the query params dict (url, canonicalized key, mime)."""
        return {
            'url': url,
            'key': canonicalize(url),
            'mime': mime,
        }

    def get_expected(self, url, mime='text/html', filters=None):
        """Build the single-record list a fuzzy match is expected to yield.

        ``filters`` falls back to ``['~urlkey:']`` when not given (or empty).
        """
        if not filters:
            filters = ['~urlkey:']

        record = dict(filter=filters,
                      is_fuzzy=True,
                      key=canonicalize(url),
                      mime=mime)
        return [record]

    def _fuzzy_results(self, params):
        """Invoke the matcher with the echo source; return records as a list."""
        cdx_iter, _errs = self.fuzzy(self.source, params)
        return list(cdx_iter)

    def test_no_fuzzy(self):
        """A plain URL with no query yields no records."""
        results = self._fuzzy_results(self.get_params('http://example.com/'))
        assert results == []

    def test_fuzzy_1(self):
        """A root URL with a ``_=123`` query yields the default record."""
        url = 'http://example.com/?_=123'
        results = self._fuzzy_results(self.get_params(url))
        assert results == self.get_expected(url)

    def test_fuzzy_2(self):
        """A ``.html`` path with a query yields the default record."""
        url = 'http://example.com/somefile.html?a=b'
        results = self._fuzzy_results(self.get_params(url))
        assert results == self.get_expected(url)

    def test_fuzzy_php_cache(self):
        """A ``.php`` path with a ``_=123`` query yields the default record."""
        url = 'http://example.com/somefile.php?_=123'
        results = self._fuzzy_results(self.get_params(url))
        assert results == self.get_expected(url)

    def test_fuzzy_swf(self):
        """A ``.php`` path with a flash mime yields a record with that mime."""
        url = 'http://example.com/somefile.php?a=b'
        mime = 'application/x-shockwave-flash'
        results = self._fuzzy_results(self.get_params(url, mime))
        assert results == self.get_expected(url, mime)

    def test_fuzzy_custom_rule(self):
        """The youtube ``get_video_info`` URL yields the two listed filters."""
        url = 'http://youtube.com/get_video_info?a=b&html5=true&___abc=123&video_id=ABCD&id=1234'
        results = self._fuzzy_results(self.get_params(url))
        filters = ['~urlkey:html5=true', '~urlkey:video_id=abcd']
        assert results == self.get_expected(url=url, filters=filters)

    def test_no_fuzzy_ext_restrict(self):
        """The same ``.php?a=b`` URL with the default mime yields nothing."""
        url = 'http://example.com/somefile.php?a=b'
        results = self._fuzzy_results(self.get_params(url))
        assert results == []