1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

fuzzymatcher: don't modify original params, instead create new fuzzy_params for fuzzy query

This commit is contained in:
Ilya Kreymer 2017-05-25 13:06:24 -07:00
parent 685804919a
commit f0fdc50574

View File

@ -4,6 +4,7 @@ from pywb.utils.loaders import load_yaml_config
import re import re
import os import os
from six import iterkeys
from six.moves.urllib.parse import urlsplit from six.moves.urllib.parse import urlsplit
from collections import namedtuple from collections import namedtuple
@ -20,7 +21,8 @@ class FuzzyMatcher(object):
DEFAULT_MATCH_TYPE = 'prefix' DEFAULT_MATCH_TYPE = 'prefix'
DEFAULT_REPLACE_AFTER = '?' DEFAULT_REPLACE_AFTER = '?'
REMOVE_PARAMS = ['alt_url', 'reverse', 'closest', 'end_key'] FUZZY_SKIP_PARAMS = ('alt_url', 'reverse', 'closest', 'end_key',
'url', 'matchType', 'filter')
def __init__(self, filename): def __init__(self, filename):
config = load_yaml_config(filename) config = load_yaml_config(filename)
@ -103,14 +105,15 @@ class FuzzyMatcher(object):
host = urlsplit(url).netloc host = urlsplit(url).netloc
url = host.split('.', 1)[1] url = host.split('.', 1)[1]
params.update({'url': url, fuzzy_params = {'url': url,
'matchType': matched_rule.match_type, 'matchType': matched_rule.match_type,
'filter': filters}) 'filter': filters}
for param in self.REMOVE_PARAMS: for key in iterkeys(params):
params.pop(param, '') if key not in self.FUZZY_SKIP_PARAMS:
fuzzy_params[key] = params[key]
return matched_rule return matched_rule, fuzzy_params
def make_regex(self, config): def make_regex(self, config):
if isinstance(config, list): if isinstance(config, list):
@ -148,11 +151,13 @@ class FuzzyMatcher(object):
url = params['url'] url = params['url']
rule = self.get_fuzzy_match(params) res = self.get_fuzzy_match(params)
if not rule: if not res:
return return
new_iter, errs = index_source(params) rule, fuzzy_params = res
new_iter, errs = index_source(fuzzy_params)
for cdx in new_iter: for cdx in new_iter:
if self.allow_fuzzy_result(rule, url, cdx): if self.allow_fuzzy_result(rule, url, cdx):