mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
Fuzzy Rewrite Improvements (#263)
Rules system:
- 'mixin' class for adding a custom rewrite mixin, initialized with optional 'mixin_params'
- 'force_type' to always force the rewriting text type on a rule match (e.g. if application/octet-stream)
- fuzzy rewrite: 'find_all' mode for matching via regex.findall() instead of search()
- load_function moved to generic load_py_name
- new rules for fb!
- JSReplaceFuzzy mixin to replace content based on query (or POST) regex match

Tests:
- tests JSReplaceFuzzy rewriting

Query:
- append '?' for fuzzy matching if filters are set
- cdx['is_fuzzy'] set to '1' instead of True

Client-side rewrite:
- add window.Request object rewrite
- improved rewrite of wb server + path, avoid double-slash
- fetch() rewrite: proxy_to_obj()
- proxy_to_obj() null check
- WombatLocation prop change: skip if prop is the same
This commit is contained in:
parent
520ee35081
commit
bcbc00a89b
@ -12,7 +12,7 @@ import json
|
|||||||
|
|
||||||
from pywb.utils.io import StreamIter, BUFF_SIZE
|
from pywb.utils.io import StreamIter, BUFF_SIZE
|
||||||
|
|
||||||
from pywb.utils.loaders import load_yaml_config
|
from pywb.utils.loaders import load_yaml_config, load_py_name
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
@ -55,6 +55,10 @@ class BaseContentRewriter(object):
|
|||||||
parse_rules_func = self.init_js_regex(regexs)
|
parse_rules_func = self.init_js_regex(regexs)
|
||||||
rule['js_regex_func'] = parse_rules_func
|
rule['js_regex_func'] = parse_rules_func
|
||||||
|
|
||||||
|
mixin = rule.get('mixin')
|
||||||
|
if mixin:
|
||||||
|
rule['mixin'] = load_py_name(mixin)
|
||||||
|
|
||||||
return rule
|
return rule
|
||||||
|
|
||||||
def get_rule(self, cdx):
|
def get_rule(self, cdx):
|
||||||
@ -73,6 +77,11 @@ class BaseContentRewriter(object):
|
|||||||
rw_type = rule.get(text_type, text_type)
|
rw_type = rule.get(text_type, text_type)
|
||||||
rw_class = self.get_rewriter(rw_type, rwinfo)
|
rw_class = self.get_rewriter(rw_type, rwinfo)
|
||||||
|
|
||||||
|
mixin = rule.get('mixin')
|
||||||
|
if mixin:
|
||||||
|
mixin_params = rule.get('mixin_params', {})
|
||||||
|
rw_class = type('custom_js_rewriter', (mixin, rw_class), mixin_params)
|
||||||
|
|
||||||
return rw_type, rw_class
|
return rw_type, rw_class
|
||||||
|
|
||||||
def create_rewriter(self, text_type, rule, rwinfo, cdx, head_insert_func=None):
|
def create_rewriter(self, text_type, rule, rwinfo, cdx, head_insert_func=None):
|
||||||
@ -159,8 +168,15 @@ class BaseContentRewriter(object):
|
|||||||
rwinfo = RewriteInfo(record, self, url_rewriter, cookie_rewriter)
|
rwinfo = RewriteInfo(record, self, url_rewriter, cookie_rewriter)
|
||||||
content_rewriter = None
|
content_rewriter = None
|
||||||
|
|
||||||
|
url_rewriter.rewrite_opts['cdx'] = cdx
|
||||||
|
|
||||||
|
rule = self.get_rule(cdx)
|
||||||
|
|
||||||
|
force_type = rule.get('force_type')
|
||||||
|
if force_type:
|
||||||
|
rwinfo.text_type = force_type
|
||||||
|
|
||||||
if rwinfo.should_rw_content():
|
if rwinfo.should_rw_content():
|
||||||
rule = self.get_rule(cdx)
|
|
||||||
content_rewriter = self.create_rewriter(rwinfo.text_type, rule, rwinfo, cdx, head_insert_func)
|
content_rewriter = self.create_rewriter(rwinfo.text_type, rule, rwinfo, cdx, head_insert_func)
|
||||||
|
|
||||||
gen = None
|
gen = None
|
||||||
|
@ -1,14 +1,7 @@
|
|||||||
import re
|
import re
|
||||||
from pywb.rewrite.content_rewriter import StreamingRewriter
|
from pywb.rewrite.content_rewriter import StreamingRewriter
|
||||||
|
from pywb.utils.loaders import load_py_name
|
||||||
|
from six.moves.urllib.parse import unquote
|
||||||
# =================================================================
|
|
||||||
def load_function(string):
|
|
||||||
import importlib
|
|
||||||
|
|
||||||
string = string.split(':', 1)
|
|
||||||
mod = importlib.import_module(string[0])
|
|
||||||
return getattr(mod, string[1])
|
|
||||||
|
|
||||||
|
|
||||||
# =================================================================
|
# =================================================================
|
||||||
@ -101,7 +94,7 @@ class RegexRewriter(StreamingRewriter):
|
|||||||
if 'rewrite' in obj:
|
if 'rewrite' in obj:
|
||||||
replace = RegexRewriter.archival_rewrite(rewriter)
|
replace = RegexRewriter.archival_rewrite(rewriter)
|
||||||
elif 'function' in obj:
|
elif 'function' in obj:
|
||||||
replace = load_function(obj['function'])
|
replace = load_py_name(obj['function'])
|
||||||
else:
|
else:
|
||||||
replace = RegexRewriter.format(obj.get('replace', '{0}'))
|
replace = RegexRewriter.format(obj.get('replace', '{0}'))
|
||||||
group = obj.get('group', 0)
|
group = obj.get('group', 0)
|
||||||
@ -259,6 +252,33 @@ class JSWombatProxyRewriter(JSWombatProxyRewriterMixin, RegexRewriter):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
# =================================================================
|
||||||
|
class JSReplaceFuzzy(object):
    """Rewriter mixin that patches a fuzzy-matched response to fit the request.

    The mixin is meant to be combined with a rewriter class that provides
    ``rewrite()`` and a ``url_rewriter`` attribute. It expects an ``rx``
    attribute (a regex pattern with at least one capture group) to be present
    on the resulting class.

    When the cdx entry is flagged ``is_fuzzy``, ``rx`` is searched in both the
    (unquoted) cdx url and the (unquoted) requested url; every occurrence of
    the value captured from the cdx url is then replaced in the content with
    the value captured from the requested url.
    """

    # Compiled form of ``rx``; compiled in __init__ when not already provided.
    rx_obj = None

    def __init__(self, *args, **kwargs):
        super(JSReplaceFuzzy, self).__init__(*args, **kwargs)
        if not self.rx_obj:
            self.rx_obj = re.compile(self.rx)

    def rewrite(self, string):
        """Run the regular rewrite, then swap the fuzzy-matched value.

        :param string: content to rewrite
        :return: the rewritten content; identical to the parent rewriter's
            output when the cdx is missing, not fuzzy, or ``rx`` does not
            capture a usable value in both urls
        """
        string = super(JSReplaceFuzzy, self).rewrite(string)

        # No cdx attached to the url rewriter: nothing to compare against.
        cdx = self.url_rewriter.rewrite_opts.get('cdx')
        if not cdx or not cdx.get('is_fuzzy'):
            return string

        expected = unquote(cdx['url'])
        actual = unquote(self.url_rewriter.wburl.url)

        exp_m = self.rx_obj.search(expected)
        act_m = self.rx_obj.search(actual)
        if not (exp_m and act_m):
            return string

        old_value = exp_m.group(1)
        new_value = act_m.group(1)

        # An empty 'old' value would make str.replace() splice the new value
        # between every character, and an unmatched optional group yields
        # None, which str.replace() rejects; skip the substitution in both
        # cases.
        if not old_value or new_value is None:
            return string

        return string.replace(old_value, new_value)
|
||||||
|
|
||||||
|
|
||||||
# =================================================================
|
# =================================================================
|
||||||
# Set 'default' JSRewriter
|
# Set 'default' JSRewriter
|
||||||
JSRewriter = JSLinkAndLocationRewriter
|
JSRewriter = JSLinkAndLocationRewriter
|
||||||
|
@ -50,17 +50,20 @@ class TestContentRewriter(object):
|
|||||||
warc_headers_dict=warc_headers)
|
warc_headers_dict=warc_headers)
|
||||||
|
|
||||||
def rewrite_record(self, headers, content, ts, url='http://example.com/',
|
def rewrite_record(self, headers, content, ts, url='http://example.com/',
|
||||||
prefix='http://localhost:8080/prefix/', warc_headers=None):
|
prefix='http://localhost:8080/prefix/', warc_headers=None,
|
||||||
|
request_url=None):
|
||||||
|
|
||||||
record = self._create_response_record(url, headers, content, warc_headers)
|
record = self._create_response_record(url, headers, content, warc_headers)
|
||||||
|
|
||||||
wburl = WbUrl(ts + '/' + url)
|
wburl = WbUrl(ts + '/' + (request_url or url))
|
||||||
url_rewriter = UrlRewriter(wburl, prefix)
|
url_rewriter = UrlRewriter(wburl, prefix)
|
||||||
|
|
||||||
cdx = CDXObject()
|
cdx = CDXObject()
|
||||||
cdx['url'] = url
|
cdx['url'] = url
|
||||||
cdx['timestamp'] = ts
|
cdx['timestamp'] = ts
|
||||||
cdx['urlkey'] = canonicalize(url)
|
cdx['urlkey'] = canonicalize(url)
|
||||||
|
if request_url != url:
|
||||||
|
cdx['is_fuzzy'] = '1'
|
||||||
|
|
||||||
return self.content_rewriter(record, url_rewriter, None, cdx=cdx)
|
return self.content_rewriter(record, url_rewriter, None, cdx=cdx)
|
||||||
|
|
||||||
@ -254,6 +257,21 @@ class TestContentRewriter(object):
|
|||||||
|
|
||||||
assert b''.join(gen).decode('utf-8') == content
|
assert b''.join(gen).decode('utf-8') == content
|
||||||
|
|
||||||
|
def test_custom_fuzzy_replace(self):
    """A fuzzy-matched response gets the ssid of the requested url.

    The record is stored under a url whose query carries ssid 1234 and is
    requested with ssid 5678; the rewritten body must carry 5678 while the
    Content-Type header is left as application/octet-stream.
    """
    archived_url = 'http://facebook.com/ajax/pagelet/generic.php/photoviewerinitpagelet?data="ssid":1234'
    requested_url = 'http://facebook.com/ajax/pagelet/generic.php/photoviewerinitpagelet?data="ssid":5678'

    resp_headers, body_iter, _ = self.rewrite_record(
        {'Content-Type': 'application/octet-stream'},
        '{"ssid":"1234"}',
        ts='201701mp_',
        url=archived_url,
        request_url=requested_url)

    assert resp_headers.headers == [('Content-Type', 'application/octet-stream')]

    rewritten = b''.join(body_iter).decode('utf-8')
    assert rewritten == '{"ssid":"5678"}'
|
||||||
|
|
||||||
def test_hls_default_max(self):
|
def test_hls_default_max(self):
|
||||||
headers = {'Content-Type': 'application/vnd.apple.mpegurl'}
|
headers = {'Content-Type': 'application/vnd.apple.mpegurl'}
|
||||||
with open(os.path.join(get_test_dir(), 'text_content', 'sample_hls.m3u8'), 'rt') as fh:
|
with open(os.path.join(get_test_dir(), 'text_content', 'sample_hls.m3u8'), 'rt') as fh:
|
||||||
|
@ -64,11 +64,38 @@ rules:
|
|||||||
|
|
||||||
# facebook rules
|
# facebook rules
|
||||||
#=================================================================
|
#=================================================================
|
||||||
|
- url_prefix: 'com,facebook)/ajax/pagelet/generic.php/photoviewerinitpagelet'
|
||||||
|
|
||||||
|
rewrite:
|
||||||
|
mixin: 'pywb.rewrite.regex_rewriters:JSReplaceFuzzy'
|
||||||
|
mixin_params:
|
||||||
|
rx: '"ssid":([\d]+)'
|
||||||
|
|
||||||
|
force_type: 'json'
|
||||||
|
|
||||||
|
fuzzy_lookup: 'com,facebook\)/.*[?&]data=(.*?(?:[&]|(?:query_type|fbid)[^,]+))'
|
||||||
|
|
||||||
- url_prefix: 'com,facebook)/ajax/pagelet/generic.php/'
|
- url_prefix: 'com,facebook)/ajax/pagelet/generic.php/'
|
||||||
|
|
||||||
#fuzzy_lookup: 'com,facebook\)/.*[?&]data=(.*?(?:[&]|query_type[^,]+))'
|
#fuzzy_lookup: 'com,facebook\)/.*[?&]data=(.*?(?:[&]|query_type[^,]+))'
|
||||||
fuzzy_lookup: 'com,facebook\)/.*[?&]data=(.*?(?:[&]|(?:query_type|fbid)[^,]+))'
|
fuzzy_lookup: 'com,facebook\)/.*[?&]data=(.*?(?:[&]|(?:query_type|fbid)[^,]+))'
|
||||||
|
|
||||||
|
- url_prefix: 'com,facebook)/ajax/ufi/reply_fetch.php'
|
||||||
|
|
||||||
|
fuzzy_lookup:
|
||||||
|
- 'ft_ent_identifier'
|
||||||
|
- 'parent_comment_ids[0]'
|
||||||
|
- lsd
|
||||||
|
|
||||||
|
- url_prefix: 'com,facebook)/ajax/ufi/comment_fetch.php'
|
||||||
|
|
||||||
|
fuzzy_lookup:
|
||||||
|
- 'source'
|
||||||
|
- 'offset'
|
||||||
|
- 'length'
|
||||||
|
- 'ft_ent_identifier'
|
||||||
|
- 'feed_context'
|
||||||
|
|
||||||
- url_prefix: 'com,facebook)/ajax/ufi/'
|
- url_prefix: 'com,facebook)/ajax/ufi/'
|
||||||
|
|
||||||
fuzzy_lookup:
|
fuzzy_lookup:
|
||||||
@ -97,7 +124,16 @@ rules:
|
|||||||
|
|
||||||
fuzzy_lookup: '([?&][^_]\w+=[^&]+)+'
|
fuzzy_lookup: '([?&][^_]\w+=[^&]+)+'
|
||||||
|
|
||||||
|
- url_prefix: 'com,facebook)/api/graphqlbatch'
|
||||||
|
|
||||||
|
fuzzy_lookup:
|
||||||
|
match: '("q[\d]+":|after:\\"[^"]+)'
|
||||||
|
find_all: true
|
||||||
|
|
||||||
- url_prefix: 'com,facebook)/'
|
- url_prefix: 'com,facebook)/'
|
||||||
|
|
||||||
|
fuzzy_lookup: '([?&][^_]\w+=[^&]+)+'
|
||||||
|
|
||||||
rewrite:
|
rewrite:
|
||||||
js_regexs:
|
js_regexs:
|
||||||
- match: 'Bootloader\.configurePage.*?;'
|
- match: 'Bootloader\.configurePage.*?;'
|
||||||
|
@ -279,7 +279,10 @@ var _WBWombat = function($wbwindow, wbinfo) {
|
|||||||
} else {
|
} else {
|
||||||
url = "";
|
url = "";
|
||||||
}
|
}
|
||||||
url += "/" + path;
|
if (path && path[0] != "/") {
|
||||||
|
url += "/";
|
||||||
|
}
|
||||||
|
url += path;
|
||||||
}
|
}
|
||||||
|
|
||||||
return url;
|
return url;
|
||||||
@ -516,6 +519,10 @@ var _WBWombat = function($wbwindow, wbinfo) {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (this["_" + prop] == value) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
this["_" + prop] = value;
|
this["_" + prop] = value;
|
||||||
|
|
||||||
if (!this._parser) {
|
if (!this._parser) {
|
||||||
@ -873,10 +880,44 @@ var _WBWombat = function($wbwindow, wbinfo) {
|
|||||||
init_opts = init_opts || {};
|
init_opts = init_opts || {};
|
||||||
init_opts["credentials"] = "include";
|
init_opts["credentials"] = "include";
|
||||||
|
|
||||||
return orig_fetch.call(this, input, init_opts);
|
return orig_fetch.call(proxy_to_obj(this), input, init_opts);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
//============================================
|
||||||
|
// Replace $wbwindow.Request with a wrapper constructor that rewrites the
// request url via rewrite_url() and forces credentials to "include" before
// delegating to the original Request constructor. Does nothing when the
// window has no Request constructor.
function init_request_override()
{
    var orig_request = $wbwindow.Request;

    if (!orig_request) {
        return;
    }

    $wbwindow.Request = (function (Request) {
        return function(input, init_opts) {
            if (typeof(input) === "string") {
                input = rewrite_url(input);
            } else if (input && typeof(input) === "object" && input.url) {
                // 'input &&' guards against null, whose typeof is "object"
                var new_url = rewrite_url(input.url);

                if (new_url != input.url) {
                    // Request.url is read-only, so assigning to it does not
                    // change the request; build a new Request for the
                    // rewritten url, using the original as its init options.
                    input = new Request(new_url, input);
                }
            }

            init_opts = init_opts || {};
            init_opts["credentials"] = "include";

            return new Request(input, init_opts);
        }

    })($wbwindow.Request);

    // keep 'instanceof Request' and prototype lookups working for the wrapper
    $wbwindow.Request.prototype = orig_request.prototype;
}
|
||||||
|
|
||||||
//============================================
|
//============================================
|
||||||
function override_prop_extract(proto, prop, cond) {
|
function override_prop_extract(proto, prop, cond) {
|
||||||
var orig_getter = get_orig_getter(proto, prop);
|
var orig_getter = get_orig_getter(proto, prop);
|
||||||
@ -2767,7 +2808,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
|
|||||||
//============================================
|
//============================================
|
||||||
function proxy_to_obj(source) {
|
function proxy_to_obj(source) {
|
||||||
try {
|
try {
|
||||||
return source.__WBProxyRealObj__ || source;
|
return (source && source.__WBProxyRealObj__) || source;
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
return source;
|
return source;
|
||||||
}
|
}
|
||||||
@ -2997,6 +3038,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
|
|||||||
|
|
||||||
// Fetch
|
// Fetch
|
||||||
init_fetch_rewrite();
|
init_fetch_rewrite();
|
||||||
|
init_request_override();
|
||||||
|
|
||||||
// Worker override (experimental)
|
// Worker override (experimental)
|
||||||
init_web_worker_override();
|
init_web_worker_override();
|
||||||
|
@ -108,6 +108,16 @@ def calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
|
|||||||
>>> calc_search_range('http://example.com/path/file.html', 'prefix')
|
>>> calc_search_range('http://example.com/path/file.html', 'prefix')
|
||||||
('com,example)/path/file.html', 'com,example)/path/file.htmm')
|
('com,example)/path/file.html', 'com,example)/path/file.htmm')
|
||||||
|
|
||||||
|
# slash and ?
|
||||||
|
>>> calc_search_range('http://example.com/path/', 'prefix')
|
||||||
|
('com,example)/path/', 'com,example)/path0')
|
||||||
|
|
||||||
|
>>> calc_search_range('http://example.com/path?', 'prefix')
|
||||||
|
('com,example)/path?', 'com,example)/path@')
|
||||||
|
|
||||||
|
>>> calc_search_range('http://example.com/path/?', 'prefix')
|
||||||
|
('com,example)/path?', 'com,example)/path@')
|
||||||
|
|
||||||
>>> calc_search_range('http://example.com/path/file.html', 'host')
|
>>> calc_search_range('http://example.com/path/file.html', 'host')
|
||||||
('com,example)/', 'com,example*')
|
('com,example)/', 'com,example*')
|
||||||
|
|
||||||
@ -158,6 +168,9 @@ def calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
|
|||||||
if url.endswith('/') and not start_key.endswith('/'):
|
if url.endswith('/') and not start_key.endswith('/'):
|
||||||
start_key += '/'
|
start_key += '/'
|
||||||
|
|
||||||
|
if url.endswith('?') and not start_key.endswith('?'):
|
||||||
|
start_key += '?'
|
||||||
|
|
||||||
end_key = inc_last_char(start_key)
|
end_key = inc_last_char(start_key)
|
||||||
|
|
||||||
elif match_type == 'host':
|
elif match_type == 'host':
|
||||||
|
@ -30,6 +30,15 @@ except ImportError: #pragma: no cover
|
|||||||
s3_avail = False
|
s3_avail = False
|
||||||
|
|
||||||
|
|
||||||
|
# =================================================================
|
||||||
|
def load_py_name(string):
    """Resolve a ``'package.module:attribute'`` name to the object it names.

    The text before the first ``:`` is imported as a module and the text
    after it is looked up as an attribute of that module.

    :param string: name in ``module:attribute`` form
    :return: the attribute found on the imported module
    """
    from importlib import import_module

    pieces = string.split(':', 1)
    module = import_module(pieces[0])
    return getattr(module, pieces[1])
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
def is_http(filename):
|
def is_http(filename):
|
||||||
return filename.startswith(('http://', 'https://'))
|
return filename.startswith(('http://', 'https://'))
|
||||||
|
@ -13,7 +13,7 @@ from collections import namedtuple
|
|||||||
# ============================================================================
|
# ============================================================================
|
||||||
FuzzyRule = namedtuple('FuzzyRule',
|
FuzzyRule = namedtuple('FuzzyRule',
|
||||||
'url_prefix, regex, replace_after, filter_str, ' +
|
'url_prefix, regex, replace_after, filter_str, ' +
|
||||||
'match_type')
|
'match_type, find_all')
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
@ -54,14 +54,16 @@ class FuzzyMatcher(object):
|
|||||||
replace_after = self.DEFAULT_REPLACE_AFTER
|
replace_after = self.DEFAULT_REPLACE_AFTER
|
||||||
filter_str = self.DEFAULT_FILTER
|
filter_str = self.DEFAULT_FILTER
|
||||||
match_type = self.DEFAULT_MATCH_TYPE
|
match_type = self.DEFAULT_MATCH_TYPE
|
||||||
|
find_all = False
|
||||||
|
|
||||||
else:
|
else:
|
||||||
regex = self.make_regex(config.get('match'))
|
regex = self.make_regex(config.get('match'))
|
||||||
replace_after = config.get('replace', self.DEFAULT_REPLACE_AFTER)
|
replace_after = config.get('replace', self.DEFAULT_REPLACE_AFTER)
|
||||||
filter_str = config.get('filter', self.DEFAULT_FILTER)
|
filter_str = config.get('filter', self.DEFAULT_FILTER)
|
||||||
match_type = config.get('type', self.DEFAULT_MATCH_TYPE)
|
match_type = config.get('type', self.DEFAULT_MATCH_TYPE)
|
||||||
|
find_all = config.get('find_all', False)
|
||||||
|
|
||||||
return FuzzyRule(url_prefix, regex, replace_after, filter_str, match_type)
|
return FuzzyRule(url_prefix, regex, replace_after, filter_str, match_type, find_all)
|
||||||
|
|
||||||
def get_fuzzy_match(self, urlkey, params):
|
def get_fuzzy_match(self, urlkey, params):
|
||||||
filters = set()
|
filters = set()
|
||||||
@ -71,12 +73,18 @@ class FuzzyMatcher(object):
|
|||||||
if not any((urlkey.startswith(prefix) for prefix in rule.url_prefix)):
|
if not any((urlkey.startswith(prefix) for prefix in rule.url_prefix)):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
m = rule.regex.search(urlkey)
|
groups = None
|
||||||
if not m:
|
if rule.find_all:
|
||||||
|
groups = rule.regex.findall(urlkey)
|
||||||
|
else:
|
||||||
|
m = rule.regex.search(urlkey)
|
||||||
|
groups = m and m.groups()
|
||||||
|
|
||||||
|
if not groups:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
matched_rule = rule
|
matched_rule = rule
|
||||||
for g in m.groups():
|
for g in groups:
|
||||||
for f in matched_rule.filter_str:
|
for f in matched_rule.filter_str:
|
||||||
filters.add(f.format(g))
|
filters.add(f.format(g))
|
||||||
|
|
||||||
@ -87,9 +95,18 @@ class FuzzyMatcher(object):
|
|||||||
|
|
||||||
url = params['url']
|
url = params['url']
|
||||||
|
|
||||||
|
# support matching w/o query if no additional filters
|
||||||
|
# don't include trailing '?' if no filters and replace_after '?'
|
||||||
|
no_filters = (filters == {'urlkey:'}) and (matched_rule.replace_after == '?')
|
||||||
|
|
||||||
inx = url.find(matched_rule.replace_after)
|
inx = url.find(matched_rule.replace_after)
|
||||||
if inx > 0:
|
if inx > 0:
|
||||||
url = url[:inx + len(matched_rule.replace_after)]
|
length = inx + len(matched_rule.replace_after)
|
||||||
|
if no_filters:
|
||||||
|
length -= 1
|
||||||
|
url = url[:length]
|
||||||
|
elif not no_filters:
|
||||||
|
url += matched_rule.replace_after[0]
|
||||||
|
|
||||||
if matched_rule.match_type == 'domain':
|
if matched_rule.match_type == 'domain':
|
||||||
host = urlsplit(url).netloc
|
host = urlsplit(url).netloc
|
||||||
@ -98,7 +115,7 @@ class FuzzyMatcher(object):
|
|||||||
fuzzy_params = {'url': url,
|
fuzzy_params = {'url': url,
|
||||||
'matchType': matched_rule.match_type,
|
'matchType': matched_rule.match_type,
|
||||||
'filter': filters,
|
'filter': filters,
|
||||||
'is_fuzzy': True}
|
'is_fuzzy': '1'}
|
||||||
|
|
||||||
for key in iterkeys(params):
|
for key in iterkeys(params):
|
||||||
if key not in self.FUZZY_SKIP_PARAMS:
|
if key not in self.FUZZY_SKIP_PARAMS:
|
||||||
@ -157,7 +174,7 @@ class FuzzyMatcher(object):
|
|||||||
|
|
||||||
for cdx in new_iter:
|
for cdx in new_iter:
|
||||||
if is_custom or self.match_general_fuzzy_query(url, urlkey, cdx, rx_cache):
|
if is_custom or self.match_general_fuzzy_query(url, urlkey, cdx, rx_cache):
|
||||||
cdx['is_fuzzy'] = True
|
cdx['is_fuzzy'] = '1'
|
||||||
yield cdx
|
yield cdx
|
||||||
|
|
||||||
def match_general_fuzzy_query(self, url, urlkey, cdx, rx_cache):
|
def match_general_fuzzy_query(self, url, urlkey, cdx, rx_cache):
|
||||||
|
@ -38,7 +38,7 @@ class TestFuzzy(object):
|
|||||||
def get_expected(self, url, mime='text/html', filters=None):
|
def get_expected(self, url, mime='text/html', filters=None):
|
||||||
filters = filters or {'urlkey:'}
|
filters = filters or {'urlkey:'}
|
||||||
exp = [{'filter': filters,
|
exp = [{'filter': filters,
|
||||||
'is_fuzzy': True,
|
'is_fuzzy': '1',
|
||||||
'urlkey': canonicalize(url),
|
'urlkey': canonicalize(url),
|
||||||
'source': 'source',
|
'source': 'source',
|
||||||
'source-coll': 'source',
|
'source-coll': 'source',
|
||||||
|
Loading…
x
Reference in New Issue
Block a user