From 1360723f959ae4d31801d322269034d75ba6e9e0 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 21 Aug 2017 11:01:31 -0700 Subject: [PATCH] Fuzzy Rules Improvements (#231) * separate default rules config for query matching: 'not_exts', 'mimes', and new 'url_normalize' - regexes in 'url_normalize' applied on each cdx entry to see if there's a match with requested url - jsonp: allow for '/* */' comment prefix in jsonp (experimental) - fuzzy rule: add rule for '\w+=jquery[\d]+' collapsing, supports any callback name - fuzzy rule: add rule for more generic 'cache busting' params, 'bust' in name, possible timestamp in value (experimental) - fuzzy rule: add ga utm_* rule & tests tests: improve fuzzy matcher tests to use indexing system, test all new rules tests: add jsonp_rewriter tests config: use_js_obj_proxy=true in default config.yaml, setting added to each collection's metadata --- config.yaml | 1 + pywb/apps/frontendapp.py | 3 + pywb/rewrite/jsonp_rewriter.py | 2 +- pywb/rewrite/test/test_jsonp_rewriter.py | 44 ++++++ pywb/rules.yaml | 59 +++++--- pywb/warcserver/index/fuzzymatcher.py | 66 +++++---- .../index/test/test_fuzzymatcher.py | 135 +++++++++++++----- 7 files changed, 222 insertions(+), 88 deletions(-) create mode 100644 pywb/rewrite/test/test_jsonp_rewriter.py diff --git a/config.yaml b/config.yaml index 89e01314..752c8a12 100644 --- a/config.yaml +++ b/config.yaml @@ -2,6 +2,7 @@ # ======================================== # # Settings for each collection +use_js_obj_proxy: true collections: diff --git a/pywb/apps/frontendapp.py b/pywb/apps/frontendapp.py index 9336b823..c96eef49 100644 --- a/pywb/apps/frontendapp.py +++ b/pywb/apps/frontendapp.py @@ -100,6 +100,9 @@ class FrontEndApp(object): def get_metadata(self, coll): metadata = {'coll': coll} + if self.warcserver.config.get('use_js_obj_proxy'): + metadata['use_js_obj_proxy'] = True + if coll in self.warcserver.list_fixed_routes(): metadata.update(self.warcserver.get_coll_config(coll)) 
metadata['type'] = 'replay-fixed' diff --git a/pywb/rewrite/jsonp_rewriter.py b/pywb/rewrite/jsonp_rewriter.py index 6d3325b1..5a262714 100644 --- a/pywb/rewrite/jsonp_rewriter.py +++ b/pywb/rewrite/jsonp_rewriter.py @@ -4,7 +4,7 @@ from pywb.rewrite.content_rewriter import StreamingRewriter # ============================================================================ class JSONPRewriter(StreamingRewriter): - JSONP = re.compile(r'^(\w+)\(\{') + JSONP = re.compile(r'^(?:\s*\/\*(?:.*)\*\/)*\s*(\w+)\(\{') CALLBACK = re.compile(r'[?].*callback=([^&]+)') def rewrite(self, string): diff --git a/pywb/rewrite/test/test_jsonp_rewriter.py b/pywb/rewrite/test/test_jsonp_rewriter.py new file mode 100644 index 00000000..64df648b --- /dev/null +++ b/pywb/rewrite/test/test_jsonp_rewriter.py @@ -0,0 +1,44 @@ +from pywb.rewrite.jsonp_rewriter import JSONPRewriter +from pywb.rewrite.url_rewriter import UrlRewriter + +class TestJSONPRewriter(object): + @classmethod + def setup_class(cls): + urlrewriter = UrlRewriter('20161226/http://example.com/?callback=jQuery_ABC', '/web/', 'https://localhost/web/') + cls.rewriter = JSONPRewriter(urlrewriter) + + urlrewriter = UrlRewriter('20161226/http://example.com/', '/web/', 'https://localhost/web/') + cls.rewriter_no_cb = JSONPRewriter(urlrewriter) + + def test_jsonp_rewrite_1(self): + string = 'jQuery_1234({"foo": "bar", "some": "data"})' + expect = 'jQuery_ABC({"foo": "bar", "some": "data"})' + assert self.rewriter.rewrite(string) == expect + + def test_jsonp_rewrite_2(self): + string = ' /**/ jQuery_1234({"foo": "bar", "some": "data"})' + expect = 'jQuery_ABC({"foo": "bar", "some": "data"})' + assert self.rewriter.rewrite(string) == expect + + def test_jsonp_rewrite_3(self): + string = ' /* some comment */ jQuery_1234({"foo": "bar", "some": "data"})' + expect = 'jQuery_ABC({"foo": "bar", "some": "data"})' + assert self.rewriter.rewrite(string) == expect + + def test_no_jsonp_rewrite_1(self): + string = ' /* comment jQuery_1234({"foo": 
"bar", "some": "data"})' + assert self.rewriter.rewrite(string) == string + + def test_no_jsonp_rewrite_2(self): + string = 'function jQuery_1234({"foo": "bar", "some": "data"})' + assert self.rewriter.rewrite(string) == string + + def test_no_jsonp_rewrite_3(self): + string = 'var foo = ({"foo": "bar", "some": "data"})' + assert self.rewriter.rewrite(string) == string + + def test_no_jsonp_rewrite_no_callback_1(self): + string = 'jQuery_1234({"foo": "bar", "some": "data"})' + assert self.rewriter_no_cb.rewrite(string) == string + + diff --git a/pywb/rules.yaml b/pywb/rules.yaml index 96bf68ba..2266cb0d 100644 --- a/pywb/rules.yaml +++ b/pywb/rules.yaml @@ -1,5 +1,42 @@ -rules: +# Default Filters +default_filters: + # exts that should *not* be treated as files (ignore all query args) + not_exts: + - asp + - aspx + - jsp + - php + - pl + - exe + - dll + # ignore query args for the following mime types + mimes: + - 'application/dash+xml' + - 'application/x-shockwave-flash' + + # apply following url normalization rules + # on both match url and request url + # to find a match (not limited to query argument removal) + url_normalize: + # remove known cache busting args + - match: '[?&](_|cb|uncache)=([\d]+)(?=&|$)' + replace: '' + + # GA cache busting params + - match: '[?&]utm_[^=]+=[^&]+(?=&|$)' + replace: '' + + # remove jquery callback dynamic timestamp + - match: '[?&]((?:\w+)=jquery)[\d]+_[\d]+' + replace: '\1' + + # remove more generic cache-busting params: + # name contains 'bust', value appears to be a timestamp + - match: '[?&](\w*bust\w*=1[\d]{12,15})(?=&|$)' + replace: '' + +rules: # twitter rules #================================================================= @@ -348,24 +385,4 @@ rules: - url_prefix: '' fuzzy_lookup: match: '()' - match_filters: - - not_ext: - - asp - - aspx - - jsp - - php - - pl - - exe - - dll - - match: '()' - - - mime: 'application/dash+xml' - match: '()' - - - mime: 'application/x-shockwave-flash' - match: '()' - - - mime: '*' - 
match: '(.*)[&?](?:_|cb|uncache)=[\d]+[&]?' diff --git a/pywb/warcserver/index/fuzzymatcher.py b/pywb/warcserver/index/fuzzymatcher.py index 517bc288..8c0827bd 100644 --- a/pywb/warcserver/index/fuzzymatcher.py +++ b/pywb/warcserver/index/fuzzymatcher.py @@ -12,7 +12,7 @@ from collections import namedtuple # ============================================================================ FuzzyRule = namedtuple('FuzzyRule', 'url_prefix, regex, replace_after, filter_str, ' + - 'match_type, match_filters') + 'match_type') # ============================================================================ @@ -32,6 +32,10 @@ class FuzzyMatcher(object): if rule: self.rules.append(rule) + self.default_filters = config.get('default_filters') + + self.url_normalize_rx = [(re.compile(rule['match']), rule['replace']) for rule in self.default_filters['url_normalize']] + def parse_fuzzy_rule(self, rule): """ Parse rules using all the different supported forms """ @@ -48,32 +52,16 @@ class FuzzyMatcher(object): replace_after = self.DEFAULT_REPLACE_AFTER filter_str = self.DEFAULT_FILTER match_type = self.DEFAULT_MATCH_TYPE - match_filters = None else: regex = self.make_regex(config.get('match')) replace_after = config.get('replace', self.DEFAULT_REPLACE_AFTER) filter_str = config.get('filter', self.DEFAULT_FILTER) match_type = config.get('type', self.DEFAULT_MATCH_TYPE) - match_filters = self._init_match_filters(config.get('match_filters')) - return FuzzyRule(url_prefix, regex, replace_after, filter_str, - match_type, match_filters) - - def _init_match_filters(self, filter_config): - if not filter_config: - return - - filters = [] - for filter_ in filter_config: - filter_['match'] = re.compile(filter_['match']) - filters.append(filter_) - - return filters - - def get_fuzzy_match(self, params): - urlkey = to_native_str(params['key'], 'utf-8') + return FuzzyRule(url_prefix, regex, replace_after, filter_str, match_type) + def get_fuzzy_match(self, urlkey, params): filters = [] matched_rule = 
None @@ -151,8 +139,9 @@ class FuzzyMatcher(object): return url = params['url'] + urlkey = to_native_str(params['key'], 'utf-8') - res = self.get_fuzzy_match(params) + res = self.get_fuzzy_match(urlkey, params) if not res: return @@ -160,30 +149,39 @@ class FuzzyMatcher(object): new_iter, errs = index_source(fuzzy_params) + is_custom = (rule.url_prefix != ['']) + + rx_cache = {} + for cdx in new_iter: - if self.allow_fuzzy_result(rule, url, cdx): + if is_custom or self.match_general_fuzzy_query(url, urlkey, cdx, rx_cache): cdx['is_fuzzy'] = True yield cdx - def allow_fuzzy_result(self, rule, url, cdx): - if not rule.match_filters: + def match_general_fuzzy_query(self, url, urlkey, cdx, rx_cache): + # check ext + ext = self.get_ext(url) + if ext and ext not in self.default_filters['not_exts']: return True + # check mime mime = cdx.get('mime') - if not mime: - return False + if mime and mime in self.default_filters['mimes']: + return True - for match_filter in rule.match_filters: - not_ext = match_filter.get('not_ext') - if not_ext: - ext = self.get_ext(url) - if not ext or ext in not_ext: - continue + match_urlkey = cdx['urlkey'] - elif match_filter.get('mime', '--') not in (mime, '*'): - continue + for normalize_rx in self.url_normalize_rx: + match_urlkey = re.sub(normalize_rx[0], normalize_rx[1], match_urlkey) + curr_urlkey = rx_cache.get(normalize_rx[0]) - return match_filter['match'].search(url) + if not curr_urlkey: + curr_urlkey = re.sub(normalize_rx[0], normalize_rx[1], urlkey) + rx_cache[normalize_rx[0]] = curr_urlkey + urlkey = curr_urlkey + + if curr_urlkey == match_urlkey: + return True return False diff --git a/pywb/warcserver/index/test/test_fuzzymatcher.py b/pywb/warcserver/index/test/test_fuzzymatcher.py index 5be2ab18..66e3e167 100644 --- a/pywb/warcserver/index/test/test_fuzzymatcher.py +++ b/pywb/warcserver/index/test/test_fuzzymatcher.py @@ -1,31 +1,36 @@ from pywb.warcserver.index.fuzzymatcher import FuzzyMatcher from pywb.utils.canonicalize 
import canonicalize -class EchoParamsSource(object): - def __call__(self, params): +from pywb.warcserver.index.aggregator import SimpleAggregator +from pywb.warcserver.index.indexsource import BaseIndexSource + + +# ============================================================================ +class EchoParamsSource(BaseIndexSource): + def load_index(self, params): # return nothing for exact match to force fuzzy - if not params.get('matchType'): - return iter([]), None + if params.get('matchType', 'exact') == 'exact': + return iter([]) - obj = {'key': params.get('key'), + cdx = {'urlkey': canonicalize(params.get('cdx_url')), 'mime': params.get('mime'), - 'filter': params.get('filter') + 'filter': params.get('filter'), + 'url': params.get('cdx_url'), } - return iter([obj]), None - - - - + + return iter([cdx]) +# ============================================================================ class TestFuzzy(object): @classmethod def setup_class(cls): - cls.source = EchoParamsSource() + cls.source = SimpleAggregator({'source': EchoParamsSource()}) cls.fuzzy = FuzzyMatcher('pkg://pywb/rules.yaml') - def get_params(self, url, mime='text/html'): + def get_params(self, url, actual_url, mime='text/html'): params = {'url': url, + 'cdx_url': actual_url, 'key': canonicalize(url), 'mime': mime} return params @@ -34,57 +39,123 @@ class TestFuzzy(object): filters = filters or ['~urlkey:'] exp = [{'filter': filters, 'is_fuzzy': True, - 'key': canonicalize(url), + 'urlkey': canonicalize(url), + 'source': 'source', + 'url': url, 'mime': mime}] return exp def test_no_fuzzy(self): - params = self.get_params('http://example.com/') + params = self.get_params('http://example.com/', 'http://example.com/foo') cdx_iter, errs = self.fuzzy(self.source, params) assert list(cdx_iter) == [] - def test_fuzzy_1(self): + def test_fuzzy_no_ext_ts(self): url = 'http://example.com/?_=123' - params = self.get_params(url) + actual_url = 'http://example.com/' + params = self.get_params(url, actual_url) 
cdx_iter, errs = self.fuzzy(self.source, params) - assert list(cdx_iter) == self.get_expected(url) + assert list(cdx_iter) == self.get_expected(actual_url) - def test_fuzzy_2(self): + def test_fuzzy_allowed_ext(self): url = 'http://example.com/somefile.html?a=b' - params = self.get_params(url) + actual_url = 'http://example.com/somefile.html' + params = self.get_params(url, actual_url) cdx_iter, errs = self.fuzzy(self.source, params) - assert list(cdx_iter) == self.get_expected(url) + assert list(cdx_iter) == self.get_expected(actual_url) - def test_fuzzy_php_cache(self): + def test_fuzzy_php_ts(self): url = 'http://example.com/somefile.php?_=123' - params = self.get_params(url) + actual_url = 'http://example.com/somefile.php' + params = self.get_params(url, actual_url) cdx_iter, errs = self.fuzzy(self.source, params) - assert list(cdx_iter) == self.get_expected(url) + assert list(cdx_iter) == self.get_expected(actual_url) - def test_fuzzy_swf(self): + def test_fuzzy_mime_swf(self): url = 'http://example.com/somefile.php?a=b' + actual_url = 'http://example.com/somefile.php' mime = 'application/x-shockwave-flash' - params = self.get_params(url, mime) + params = self.get_params(url, actual_url, mime) cdx_iter, errs = self.fuzzy(self.source, params) - assert list(cdx_iter) == self.get_expected(url, mime) + assert list(cdx_iter) == self.get_expected(actual_url, mime) + + def test_fuzzy_ga_utm(self): + url = 'http://example.com/someresponse?_=1234&utm_A=123&id=xyz&utm_robot=blue&utm_foo=bar&A=B&utm_id=xyz' + actual_url = 'http://example.com/someresponse?utm_B=234&id=xyz&utm_bar=foo&utm_foo=bar&_=789&A=B' + params = self.get_params(url, actual_url) + cdx_iter, errs = self.fuzzy(self.source, params) + assert list(cdx_iter) == self.get_expected(actual_url) + + def test_fuzzy_jquery(self): + url = 'http://example.com/someresponse?a=b&foocallbackname=jQuery123_456&foo=bar&_=12345&' + actual_url = 
'http://example.com/someresponse?a=b&foocallbackname=jQuery789_000&foo=bar&_=789&' + params = self.get_params(url, actual_url) + cdx_iter, errs = self.fuzzy(self.source, params) + assert list(cdx_iter) == self.get_expected(actual_url) + + def test_fuzzy_jquery_2(self): + # test removal of two adjacent params + url = 'http://example.com/someresponse?_=1234&callbackname=jQuery123_456&foo=bar' + actual_url = 'http://example.com/someresponse?_=123&callbackname=jQuery789_000&foo=bar' + params = self.get_params(url, actual_url) + cdx_iter, errs = self.fuzzy(self.source, params) + assert list(cdx_iter) == self.get_expected(actual_url) def test_fuzzy_custom_rule(self): url = 'http://youtube.com/get_video_info?a=b&html5=true&___abc=123&video_id=ABCD&id=1234' - params = self.get_params(url) + actual_url = 'http://youtube.com/get_video_info?a=d&html5=true&___abc=125&video_id=ABCD&id=1234' + params = self.get_params(url, actual_url) cdx_iter, errs = self.fuzzy(self.source, params) filters = ['~urlkey:html5=true', '~urlkey:video_id=abcd'] - assert list(cdx_iter) == self.get_expected(url=url, filters=filters) + assert list(cdx_iter) == self.get_expected(url=actual_url, filters=filters) - def test_no_fuzzy_ext_restrict(self): - url = 'http://example.com/somefile.php?a=b' - params = self.get_params(url) + def test_no_fuzzy_custom_rule_video_id_diff(self): + url = 'http://youtube.com/get_video_info?a=b&html=true&___abc=123&video_id=ABCD&id=1234' + actual_url = 'http://youtube.com/get_video_info?a=d&html=true&___abc=125&video_id=ABCE&id=1234' + params = self.get_params(url, actual_url) cdx_iter, errs = self.fuzzy(self.source, params) assert list(cdx_iter) == [] + def test_no_fuzzy_custom_rule_arg_missing(self): + url = 'http://youtube.com/get_video_info?a=b&html5=&___abc=123&video_id=ABCD&id=1234' + actual_url = 'http://youtube.com/get_video_info?a=d&html5=&___abc=125&video_id=ABCD&id=1234' + params = self.get_params(url, actual_url) + cdx_iter, errs = self.fuzzy(self.source, 
params) + assert list(cdx_iter) == [] + def test_no_fuzzy_ext_restrict(self): + url = 'http://example.com/somefile.php?a=b' + actual_url = 'http://example.com/' + params = self.get_params(url, actual_url) + cdx_iter, errs = self.fuzzy(self.source, params) + assert list(cdx_iter) == [] + def test_no_fuzzy_ga_utm(self): + url = 'http://example.com/someresponse?_=1234&utm_A=123&id=xyz&utm_robot=blue&utm_foo=bar&A=B&utm_id=xyz' + actual_url = 'http://example.com/someresponse?utm_B=234&id=xyw&utm_bar=foo&utm_foo=bar&_=789&A=B' + params = self.get_params(url, actual_url) + cdx_iter, errs = self.fuzzy(self.source, params) + assert list(cdx_iter) == [] + def test_no_fuzzy_jquery_1(self): + url = 'http://example.com/someresponse?a=b&foocallback=jQuer123_456&foo=bar&_=1234' + actual_url = 'http://example.com/someresponse?a=b&foocallback=jQuery789_000&foo=bar&_=123' + params = self.get_params(url, actual_url) + cdx_iter, errs = self.fuzzy(self.source, params) + assert list(cdx_iter) == [] + def test_no_fuzzy_jquery_callback_arg_mismatch(self): + url = 'http://example.com/someresponse?a=b&foodcallback=jQuery123_456&foo=bar&_=1234' + actual_url = 'http://example.com/someresponse?a=b&foocallback=jQuery789_000&foo=bar&_=123' + params = self.get_params(url, actual_url) + cdx_iter, errs = self.fuzzy(self.source, params) + assert list(cdx_iter) == [] + def test_no_fuzzy_jquery_other_arg_mismatch(self): + url = 'http://example.com/someresponse?a=b&foocallback=jQuery123_456&foo=bard&_=1234' + actual_url = 'http://example.com/someresponse?a=b&foocallback=jQuery789_000&foo=bar&_=123' + params = self.get_params(url, actual_url) + cdx_iter, errs = self.fuzzy(self.source, params) + assert list(cdx_iter) == []