mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
Fuzzy Rules Improvements (#231)
* separate default rules config for query matching: 'not_exts', 'mimes', and new 'url_normalize'
- regexes in 'url_normalize' are applied on each cdx entry to see if there's a match with the requested url
- jsonp: allow for a '/* */' comment prefix in jsonp (experimental)
- fuzzy rule: add rule for '\w+=jquery[\d]+' collapsing, supports any callback name
- fuzzy rule: add rule for more generic 'cache busting' params, 'bust' in name, possible timestamp in value (experimental)
- fuzzy rule: add ga utm_* rule & tests
- tests: improve fuzzy matcher tests to use indexing system, test all new rules
- tests: add jsonp_rewriter tests
- config: use_js_obj_proxy=true in default config.yaml, setting added to each collection's metadata
This commit is contained in:
parent
d0dafb268d
commit
1360723f95
@ -2,6 +2,7 @@
|
|||||||
# ========================================
|
# ========================================
|
||||||
#
|
#
|
||||||
# Settings for each collection
|
# Settings for each collection
|
||||||
|
use_js_obj_proxy: true
|
||||||
|
|
||||||
|
|
||||||
collections:
|
collections:
|
||||||
|
@ -100,6 +100,9 @@ class FrontEndApp(object):
|
|||||||
def get_metadata(self, coll):
|
def get_metadata(self, coll):
|
||||||
metadata = {'coll': coll}
|
metadata = {'coll': coll}
|
||||||
|
|
||||||
|
if self.warcserver.config.get('use_js_obj_proxy'):
|
||||||
|
metadata['use_js_obj_proxy'] = True
|
||||||
|
|
||||||
if coll in self.warcserver.list_fixed_routes():
|
if coll in self.warcserver.list_fixed_routes():
|
||||||
metadata.update(self.warcserver.get_coll_config(coll))
|
metadata.update(self.warcserver.get_coll_config(coll))
|
||||||
metadata['type'] = 'replay-fixed'
|
metadata['type'] = 'replay-fixed'
|
||||||
|
@ -4,7 +4,7 @@ from pywb.rewrite.content_rewriter import StreamingRewriter
|
|||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
class JSONPRewriter(StreamingRewriter):
|
class JSONPRewriter(StreamingRewriter):
|
||||||
JSONP = re.compile(r'^(\w+)\(\{')
|
JSONP = re.compile(r'^(?:\s*\/\*(?:.*)\*\/)*\s*(\w+)\(\{')
|
||||||
CALLBACK = re.compile(r'[?].*callback=([^&]+)')
|
CALLBACK = re.compile(r'[?].*callback=([^&]+)')
|
||||||
|
|
||||||
def rewrite(self, string):
|
def rewrite(self, string):
|
||||||
|
44
pywb/rewrite/test/test_jsonp_rewriter.py
Normal file
44
pywb/rewrite/test/test_jsonp_rewriter.py
Normal file
@ -0,0 +1,44 @@
|
|||||||
|
from pywb.rewrite.jsonp_rewriter import JSONPRewriter
|
||||||
|
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||||
|
|
||||||
|
class TestJSONPRewriter(object):
|
||||||
|
@classmethod
|
||||||
|
def setup_class(cls):
|
||||||
|
urlrewriter = UrlRewriter('20161226/http://example.com/?callback=jQuery_ABC', '/web/', 'https://localhost/web/')
|
||||||
|
cls.rewriter = JSONPRewriter(urlrewriter)
|
||||||
|
|
||||||
|
urlrewriter = UrlRewriter('20161226/http://example.com/', '/web/', 'https://localhost/web/')
|
||||||
|
cls.rewriter_no_cb = JSONPRewriter(urlrewriter)
|
||||||
|
|
||||||
|
def test_jsonp_rewrite_1(self):
|
||||||
|
string = 'jQuery_1234({"foo": "bar", "some": "data"})'
|
||||||
|
expect = 'jQuery_ABC({"foo": "bar", "some": "data"})'
|
||||||
|
assert self.rewriter.rewrite(string) == expect
|
||||||
|
|
||||||
|
def test_jsonp_rewrite_2(self):
|
||||||
|
string = ' /**/ jQuery_1234({"foo": "bar", "some": "data"})'
|
||||||
|
expect = 'jQuery_ABC({"foo": "bar", "some": "data"})'
|
||||||
|
assert self.rewriter.rewrite(string) == expect
|
||||||
|
|
||||||
|
def test_jsonp_rewrite_3(self):
|
||||||
|
string = ' /* some comment */ jQuery_1234({"foo": "bar", "some": "data"})'
|
||||||
|
expect = 'jQuery_ABC({"foo": "bar", "some": "data"})'
|
||||||
|
assert self.rewriter.rewrite(string) == expect
|
||||||
|
|
||||||
|
def test_no_jsonp_rewrite_1(self):
|
||||||
|
string = ' /* comment jQuery_1234({"foo": "bar", "some": "data"})'
|
||||||
|
assert self.rewriter.rewrite(string) == string
|
||||||
|
|
||||||
|
def test_no_jsonp_rewrite_2(self):
|
||||||
|
string = 'function jQuery_1234({"foo": "bar", "some": "data"})'
|
||||||
|
assert self.rewriter.rewrite(string) == string
|
||||||
|
|
||||||
|
def test_no_jsonp_rewrite_3(self):
|
||||||
|
string = 'var foo = ({"foo": "bar", "some": "data"})'
|
||||||
|
assert self.rewriter.rewrite(string) == string
|
||||||
|
|
||||||
|
def test_no_jsonp_rewrite_no_callback_1(self):
|
||||||
|
string = 'jQuery_1234({"foo": "bar", "some": "data"})'
|
||||||
|
assert self.rewriter_no_cb.rewrite(string) == string
|
||||||
|
|
||||||
|
|
@ -1,5 +1,42 @@
|
|||||||
rules:
|
# Default Filters
|
||||||
|
default_filters:
|
||||||
|
# exts that should *not* be treated as files (ignore all query args)
|
||||||
|
not_exts:
|
||||||
|
- asp
|
||||||
|
- aspx
|
||||||
|
- jsp
|
||||||
|
- php
|
||||||
|
- pl
|
||||||
|
- exe
|
||||||
|
- dll
|
||||||
|
|
||||||
|
# ignore query args for the following mime types
|
||||||
|
mimes:
|
||||||
|
- 'application/dash+xml'
|
||||||
|
- 'application/x-shockwave-flash'
|
||||||
|
|
||||||
|
# apply following url normalization rules
|
||||||
|
# on both match url and request url
|
||||||
|
# to find a match (not limited to query argument removal)
|
||||||
|
url_normalize:
|
||||||
|
# remove known cache busting args
|
||||||
|
- match: '[?&](_|cb|uncache)=([\d]+)(?=&|$)'
|
||||||
|
replace: ''
|
||||||
|
|
||||||
|
# GA cache busting params
|
||||||
|
- match: '[?&]utm_[^=]+=[^&]+(?=&|$)'
|
||||||
|
replace: ''
|
||||||
|
|
||||||
|
# remove jquery callback dynamic timestamp
|
||||||
|
- match: '[?&]((?:\w+)=jquery)[\d]+_[\d]+'
|
||||||
|
replace: '\1'
|
||||||
|
|
||||||
|
# remove more generic cache-busting params:
|
||||||
|
# name contains 'bust', value appears to be a timestamp
|
||||||
|
- match: '[?&](\w*bust\w*=1[\d]{12,15})(?=&|$)'
|
||||||
|
replace: ''
|
||||||
|
|
||||||
|
rules:
|
||||||
|
|
||||||
# twitter rules
|
# twitter rules
|
||||||
#=================================================================
|
#=================================================================
|
||||||
@ -348,24 +385,4 @@ rules:
|
|||||||
- url_prefix: ''
|
- url_prefix: ''
|
||||||
fuzzy_lookup:
|
fuzzy_lookup:
|
||||||
match: '()'
|
match: '()'
|
||||||
match_filters:
|
|
||||||
- not_ext:
|
|
||||||
- asp
|
|
||||||
- aspx
|
|
||||||
- jsp
|
|
||||||
- php
|
|
||||||
- pl
|
|
||||||
- exe
|
|
||||||
- dll
|
|
||||||
|
|
||||||
match: '()'
|
|
||||||
|
|
||||||
- mime: 'application/dash+xml'
|
|
||||||
match: '()'
|
|
||||||
|
|
||||||
- mime: 'application/x-shockwave-flash'
|
|
||||||
match: '()'
|
|
||||||
|
|
||||||
- mime: '*'
|
|
||||||
match: '(.*)[&?](?:_|cb|uncache)=[\d]+[&]?'
|
|
||||||
|
|
||||||
|
@ -12,7 +12,7 @@ from collections import namedtuple
|
|||||||
# ============================================================================
|
# ============================================================================
|
||||||
FuzzyRule = namedtuple('FuzzyRule',
|
FuzzyRule = namedtuple('FuzzyRule',
|
||||||
'url_prefix, regex, replace_after, filter_str, ' +
|
'url_prefix, regex, replace_after, filter_str, ' +
|
||||||
'match_type, match_filters')
|
'match_type')
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
@ -32,6 +32,10 @@ class FuzzyMatcher(object):
|
|||||||
if rule:
|
if rule:
|
||||||
self.rules.append(rule)
|
self.rules.append(rule)
|
||||||
|
|
||||||
|
self.default_filters = config.get('default_filters')
|
||||||
|
|
||||||
|
self.url_normalize_rx = [(re.compile(rule['match']), rule['replace']) for rule in self.default_filters['url_normalize']]
|
||||||
|
|
||||||
def parse_fuzzy_rule(self, rule):
|
def parse_fuzzy_rule(self, rule):
|
||||||
""" Parse rules using all the different supported forms
|
""" Parse rules using all the different supported forms
|
||||||
"""
|
"""
|
||||||
@ -48,32 +52,16 @@ class FuzzyMatcher(object):
|
|||||||
replace_after = self.DEFAULT_REPLACE_AFTER
|
replace_after = self.DEFAULT_REPLACE_AFTER
|
||||||
filter_str = self.DEFAULT_FILTER
|
filter_str = self.DEFAULT_FILTER
|
||||||
match_type = self.DEFAULT_MATCH_TYPE
|
match_type = self.DEFAULT_MATCH_TYPE
|
||||||
match_filters = None
|
|
||||||
|
|
||||||
else:
|
else:
|
||||||
regex = self.make_regex(config.get('match'))
|
regex = self.make_regex(config.get('match'))
|
||||||
replace_after = config.get('replace', self.DEFAULT_REPLACE_AFTER)
|
replace_after = config.get('replace', self.DEFAULT_REPLACE_AFTER)
|
||||||
filter_str = config.get('filter', self.DEFAULT_FILTER)
|
filter_str = config.get('filter', self.DEFAULT_FILTER)
|
||||||
match_type = config.get('type', self.DEFAULT_MATCH_TYPE)
|
match_type = config.get('type', self.DEFAULT_MATCH_TYPE)
|
||||||
match_filters = self._init_match_filters(config.get('match_filters'))
|
|
||||||
|
|
||||||
return FuzzyRule(url_prefix, regex, replace_after, filter_str,
|
return FuzzyRule(url_prefix, regex, replace_after, filter_str, match_type)
|
||||||
match_type, match_filters)
|
|
||||||
|
|
||||||
def _init_match_filters(self, filter_config):
|
|
||||||
if not filter_config:
|
|
||||||
return
|
|
||||||
|
|
||||||
filters = []
|
|
||||||
for filter_ in filter_config:
|
|
||||||
filter_['match'] = re.compile(filter_['match'])
|
|
||||||
filters.append(filter_)
|
|
||||||
|
|
||||||
return filters
|
|
||||||
|
|
||||||
def get_fuzzy_match(self, params):
|
|
||||||
urlkey = to_native_str(params['key'], 'utf-8')
|
|
||||||
|
|
||||||
|
def get_fuzzy_match(self, urlkey, params):
|
||||||
filters = []
|
filters = []
|
||||||
matched_rule = None
|
matched_rule = None
|
||||||
|
|
||||||
@ -151,8 +139,9 @@ class FuzzyMatcher(object):
|
|||||||
return
|
return
|
||||||
|
|
||||||
url = params['url']
|
url = params['url']
|
||||||
|
urlkey = to_native_str(params['key'], 'utf-8')
|
||||||
|
|
||||||
res = self.get_fuzzy_match(params)
|
res = self.get_fuzzy_match(urlkey, params)
|
||||||
if not res:
|
if not res:
|
||||||
return
|
return
|
||||||
|
|
||||||
@ -160,30 +149,39 @@ class FuzzyMatcher(object):
|
|||||||
|
|
||||||
new_iter, errs = index_source(fuzzy_params)
|
new_iter, errs = index_source(fuzzy_params)
|
||||||
|
|
||||||
|
is_custom = (rule.url_prefix != [''])
|
||||||
|
|
||||||
|
rx_cache = {}
|
||||||
|
|
||||||
for cdx in new_iter:
|
for cdx in new_iter:
|
||||||
if self.allow_fuzzy_result(rule, url, cdx):
|
if is_custom or self.match_general_fuzzy_query(url, urlkey, cdx, rx_cache):
|
||||||
cdx['is_fuzzy'] = True
|
cdx['is_fuzzy'] = True
|
||||||
yield cdx
|
yield cdx
|
||||||
|
|
||||||
def allow_fuzzy_result(self, rule, url, cdx):
|
def match_general_fuzzy_query(self, url, urlkey, cdx, rx_cache):
|
||||||
if not rule.match_filters:
|
# check ext
|
||||||
|
ext = self.get_ext(url)
|
||||||
|
if ext and ext not in self.default_filters['not_exts']:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
# check mime
|
||||||
mime = cdx.get('mime')
|
mime = cdx.get('mime')
|
||||||
if not mime:
|
if mime and mime in self.default_filters['mimes']:
|
||||||
return False
|
return True
|
||||||
|
|
||||||
for match_filter in rule.match_filters:
|
match_urlkey = cdx['urlkey']
|
||||||
not_ext = match_filter.get('not_ext')
|
|
||||||
if not_ext:
|
|
||||||
ext = self.get_ext(url)
|
|
||||||
if not ext or ext in not_ext:
|
|
||||||
continue
|
|
||||||
|
|
||||||
elif match_filter.get('mime', '--') not in (mime, '*'):
|
for normalize_rx in self.url_normalize_rx:
|
||||||
continue
|
match_urlkey = re.sub(normalize_rx[0], normalize_rx[1], match_urlkey)
|
||||||
|
curr_urlkey = rx_cache.get(normalize_rx[0])
|
||||||
|
|
||||||
return match_filter['match'].search(url)
|
if not curr_urlkey:
|
||||||
|
curr_urlkey = re.sub(normalize_rx[0], normalize_rx[1], urlkey)
|
||||||
|
rx_cache[normalize_rx[0]] = curr_urlkey
|
||||||
|
urlkey = curr_urlkey
|
||||||
|
|
||||||
|
if curr_urlkey == match_urlkey:
|
||||||
|
return True
|
||||||
|
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
@ -1,31 +1,36 @@
|
|||||||
from pywb.warcserver.index.fuzzymatcher import FuzzyMatcher
|
from pywb.warcserver.index.fuzzymatcher import FuzzyMatcher
|
||||||
from pywb.utils.canonicalize import canonicalize
|
from pywb.utils.canonicalize import canonicalize
|
||||||
|
|
||||||
class EchoParamsSource(object):
|
from pywb.warcserver.index.aggregator import SimpleAggregator
|
||||||
def __call__(self, params):
|
from pywb.warcserver.index.indexsource import BaseIndexSource
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
class EchoParamsSource(BaseIndexSource):
|
||||||
|
def load_index(self, params):
|
||||||
# return nothing for exact match to force fuzzy
|
# return nothing for exact match to force fuzzy
|
||||||
if not params.get('matchType'):
|
if params.get('matchType', 'exact') == 'exact':
|
||||||
return iter([]), None
|
return iter([])
|
||||||
|
|
||||||
obj = {'key': params.get('key'),
|
cdx = {'urlkey': canonicalize(params.get('cdx_url')),
|
||||||
'mime': params.get('mime'),
|
'mime': params.get('mime'),
|
||||||
'filter': params.get('filter')
|
'filter': params.get('filter'),
|
||||||
|
'url': params.get('cdx_url'),
|
||||||
}
|
}
|
||||||
return iter([obj]), None
|
|
||||||
|
return iter([cdx])
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
class TestFuzzy(object):
|
class TestFuzzy(object):
|
||||||
@classmethod
|
@classmethod
|
||||||
def setup_class(cls):
|
def setup_class(cls):
|
||||||
cls.source = EchoParamsSource()
|
cls.source = SimpleAggregator({'source': EchoParamsSource()})
|
||||||
cls.fuzzy = FuzzyMatcher('pkg://pywb/rules.yaml')
|
cls.fuzzy = FuzzyMatcher('pkg://pywb/rules.yaml')
|
||||||
|
|
||||||
def get_params(self, url, mime='text/html'):
|
def get_params(self, url, actual_url, mime='text/html'):
|
||||||
params = {'url': url,
|
params = {'url': url,
|
||||||
|
'cdx_url': actual_url,
|
||||||
'key': canonicalize(url),
|
'key': canonicalize(url),
|
||||||
'mime': mime}
|
'mime': mime}
|
||||||
return params
|
return params
|
||||||
@ -34,57 +39,123 @@ class TestFuzzy(object):
|
|||||||
filters = filters or ['~urlkey:']
|
filters = filters or ['~urlkey:']
|
||||||
exp = [{'filter': filters,
|
exp = [{'filter': filters,
|
||||||
'is_fuzzy': True,
|
'is_fuzzy': True,
|
||||||
'key': canonicalize(url),
|
'urlkey': canonicalize(url),
|
||||||
|
'source': 'source',
|
||||||
|
'url': url,
|
||||||
'mime': mime}]
|
'mime': mime}]
|
||||||
|
|
||||||
return exp
|
return exp
|
||||||
|
|
||||||
def test_no_fuzzy(self):
|
def test_no_fuzzy(self):
|
||||||
params = self.get_params('http://example.com/')
|
params = self.get_params('http://example.com/', 'http://example.com/foo')
|
||||||
cdx_iter, errs = self.fuzzy(self.source, params)
|
cdx_iter, errs = self.fuzzy(self.source, params)
|
||||||
assert list(cdx_iter) == []
|
assert list(cdx_iter) == []
|
||||||
|
|
||||||
def test_fuzzy_1(self):
|
def test_fuzzy_no_ext_ts(self):
|
||||||
url = 'http://example.com/?_=123'
|
url = 'http://example.com/?_=123'
|
||||||
params = self.get_params(url)
|
actual_url = 'http://example.com/'
|
||||||
|
params = self.get_params(url, actual_url)
|
||||||
cdx_iter, errs = self.fuzzy(self.source, params)
|
cdx_iter, errs = self.fuzzy(self.source, params)
|
||||||
assert list(cdx_iter) == self.get_expected(url)
|
assert list(cdx_iter) == self.get_expected(actual_url)
|
||||||
|
|
||||||
def test_fuzzy_2(self):
|
def test_fuzzy_allowed_ext(self):
|
||||||
url = 'http://example.com/somefile.html?a=b'
|
url = 'http://example.com/somefile.html?a=b'
|
||||||
params = self.get_params(url)
|
actual_url = 'http://example.com/somefile.html'
|
||||||
|
params = self.get_params(url, actual_url)
|
||||||
cdx_iter, errs = self.fuzzy(self.source, params)
|
cdx_iter, errs = self.fuzzy(self.source, params)
|
||||||
assert list(cdx_iter) == self.get_expected(url)
|
assert list(cdx_iter) == self.get_expected(actual_url)
|
||||||
|
|
||||||
def test_fuzzy_php_cache(self):
|
def test_fuzzy_php_ts(self):
|
||||||
url = 'http://example.com/somefile.php?_=123'
|
url = 'http://example.com/somefile.php?_=123'
|
||||||
params = self.get_params(url)
|
actual_url = 'http://example.com/somefile.php'
|
||||||
|
params = self.get_params(url, actual_url)
|
||||||
cdx_iter, errs = self.fuzzy(self.source, params)
|
cdx_iter, errs = self.fuzzy(self.source, params)
|
||||||
assert list(cdx_iter) == self.get_expected(url)
|
assert list(cdx_iter) == self.get_expected(actual_url)
|
||||||
|
|
||||||
def test_fuzzy_swf(self):
|
def test_fuzzy_mime_swf(self):
|
||||||
url = 'http://example.com/somefile.php?a=b'
|
url = 'http://example.com/somefile.php?a=b'
|
||||||
|
actual_url = 'http://example.com/somefile.php'
|
||||||
mime = 'application/x-shockwave-flash'
|
mime = 'application/x-shockwave-flash'
|
||||||
params = self.get_params(url, mime)
|
params = self.get_params(url, actual_url, mime)
|
||||||
cdx_iter, errs = self.fuzzy(self.source, params)
|
cdx_iter, errs = self.fuzzy(self.source, params)
|
||||||
assert list(cdx_iter) == self.get_expected(url, mime)
|
assert list(cdx_iter) == self.get_expected(actual_url, mime)
|
||||||
|
|
||||||
|
def test_fuzzy_ga_utm(self):
|
||||||
|
url = 'http://example.com/someresponse?_=1234&utm_A=123&id=xyz&utm_robot=blue&utm_foo=bar&A=B&utm_id=xyz'
|
||||||
|
actual_url = 'http://example.com/someresponse?utm_B=234&id=xyz&utm_bar=foo&utm_foo=bar&_=789&A=B'
|
||||||
|
params = self.get_params(url, actual_url)
|
||||||
|
cdx_iter, errs = self.fuzzy(self.source, params)
|
||||||
|
assert list(cdx_iter) == self.get_expected(actual_url)
|
||||||
|
|
||||||
|
def test_fuzzy_jquery(self):
|
||||||
|
url = 'http://example.com/someresponse?a=b&foocallbackname=jQuery123_456&foo=bar&_=12345&'
|
||||||
|
actual_url = 'http://example.com/someresponse?a=b&foocallbackname=jQuery789_000&foo=bar&_=789&'
|
||||||
|
params = self.get_params(url, actual_url)
|
||||||
|
cdx_iter, errs = self.fuzzy(self.source, params)
|
||||||
|
assert list(cdx_iter) == self.get_expected(actual_url)
|
||||||
|
|
||||||
|
def test_fuzzy_jquery_2(self):
|
||||||
|
# test removal of two adjacent params
|
||||||
|
url = 'http://example.com/someresponse?_=1234&callbackname=jQuery123_456&foo=bar'
|
||||||
|
actual_url = 'http://example.com/someresponse?_=123&callbackname=jQuery789_000&foo=bar'
|
||||||
|
params = self.get_params(url, actual_url)
|
||||||
|
cdx_iter, errs = self.fuzzy(self.source, params)
|
||||||
|
assert list(cdx_iter) == self.get_expected(actual_url)
|
||||||
|
|
||||||
def test_fuzzy_custom_rule(self):
|
def test_fuzzy_custom_rule(self):
|
||||||
url = 'http://youtube.com/get_video_info?a=b&html5=true&___abc=123&video_id=ABCD&id=1234'
|
url = 'http://youtube.com/get_video_info?a=b&html5=true&___abc=123&video_id=ABCD&id=1234'
|
||||||
params = self.get_params(url)
|
actual_url = 'http://youtube.com/get_video_info?a=d&html5=true&___abc=125&video_id=ABCD&id=1234'
|
||||||
|
params = self.get_params(url, actual_url)
|
||||||
cdx_iter, errs = self.fuzzy(self.source, params)
|
cdx_iter, errs = self.fuzzy(self.source, params)
|
||||||
filters = ['~urlkey:html5=true', '~urlkey:video_id=abcd']
|
filters = ['~urlkey:html5=true', '~urlkey:video_id=abcd']
|
||||||
assert list(cdx_iter) == self.get_expected(url=url, filters=filters)
|
assert list(cdx_iter) == self.get_expected(url=actual_url, filters=filters)
|
||||||
|
|
||||||
def test_no_fuzzy_ext_restrict(self):
|
def test_no_fuzzy_custom_rule_video_id_diff(self):
|
||||||
url = 'http://example.com/somefile.php?a=b'
|
url = 'http://youtube.com/get_video_info?a=b&html=true&___abc=123&video_id=ABCD&id=1234'
|
||||||
params = self.get_params(url)
|
actual_url = 'http://youtube.com/get_video_info?a=d&html=true&___abc=125&video_id=ABCE&id=1234'
|
||||||
|
params = self.get_params(url, actual_url)
|
||||||
cdx_iter, errs = self.fuzzy(self.source, params)
|
cdx_iter, errs = self.fuzzy(self.source, params)
|
||||||
assert list(cdx_iter) == []
|
assert list(cdx_iter) == []
|
||||||
|
|
||||||
|
def test_no_fuzzy_custom_rule_arg_missing(self):
|
||||||
|
url = 'http://youtube.com/get_video_info?a=b&html5=&___abc=123&video_id=ABCD&id=1234'
|
||||||
|
actual_url = 'http://youtube.com/get_video_info?a=d&html5=&___abc=125&video_id=ABCD&id=1234'
|
||||||
|
params = self.get_params(url, actual_url)
|
||||||
|
cdx_iter, errs = self.fuzzy(self.source, params)
|
||||||
|
assert list(cdx_iter) == []
|
||||||
|
|
||||||
|
def test_no_fuzzy_ext_restrict(self):
|
||||||
|
url = 'http://example.com/somefile.php?a=b'
|
||||||
|
actual_url = 'http://example.com/'
|
||||||
|
params = self.get_params(url, actual_url)
|
||||||
|
cdx_iter, errs = self.fuzzy(self.source, params)
|
||||||
|
assert list(cdx_iter) == []
|
||||||
|
|
||||||
|
def test_no_fuzzy_ga_utm(self):
|
||||||
|
url = 'http://example.com/someresponse?_=1234&utm_A=123&id=xyz&utm_robot=blue&utm_foo=bar&A=B&utm_id=xyz'
|
||||||
|
actual_url = 'http://example.com/someresponse?utm_B=234&id=xyw&utm_bar=foo&utm_foo=bar&_=789&A=B'
|
||||||
|
params = self.get_params(url, actual_url)
|
||||||
|
cdx_iter, errs = self.fuzzy(self.source, params)
|
||||||
|
assert list(cdx_iter) == []
|
||||||
|
|
||||||
|
def test_no_fuzzy_jquery_1(self):
|
||||||
|
url = 'http://example.com/someresponse?a=b&foocallback=jQuer123_456&foo=bar&_=1234'
|
||||||
|
actual_url = 'http://example.com/someresponse?a=b&foocallback=jQuery789_000&foo=bar&_=123'
|
||||||
|
params = self.get_params(url, actual_url)
|
||||||
|
cdx_iter, errs = self.fuzzy(self.source, params)
|
||||||
|
assert list(cdx_iter) == []
|
||||||
|
|
||||||
|
def test_no_fuzzy_jquery_callback_arg_mismatch(self):
|
||||||
|
url = 'http://example.com/someresponse?a=b&foodcallback=jQuery123_456&foo=bar&_=1234'
|
||||||
|
actual_url = 'http://example.com/someresponse?a=b&foocallback=jQuery789_000&foo=bar&_=123'
|
||||||
|
params = self.get_params(url, actual_url)
|
||||||
|
cdx_iter, errs = self.fuzzy(self.source, params)
|
||||||
|
assert list(cdx_iter) == []
|
||||||
|
|
||||||
|
def test_no_fuzzy_jquery_other_arg_mismatch(self):
|
||||||
|
url = 'http://example.com/someresponse?a=b&foocallback=jQuery123_456&foo=bard&_=1234'
|
||||||
|
actual_url = 'http://example.com/someresponse?a=b&foocallback=jQuery789_000&foo=bar&_=123'
|
||||||
|
params = self.get_params(url, actual_url)
|
||||||
|
cdx_iter, errs = self.fuzzy(self.source, params)
|
||||||
|
assert list(cdx_iter) == []
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user