From bcbc00a89b87c9a5c00e297def6da44df8684db1 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 31 Oct 2017 20:35:29 -0700 Subject: [PATCH] Fuzzy Rewrite Improvements (#263) rules system: - 'mixin' class for adding custom rewrite mixin, initialized with optional 'mixin_params' - 'force_type' to always force rewriting text type for rule match (eg. if application/octet-stream) - fuzzy rewrite: 'find_all' mode for matching via regex.findall() instead of search() - load_function moved to generic load_py_name - new rules for fb! - JSReplaceFuzzy mixin to replace content based on query (or POST) regex match - tests: tests JSReplaceFuzzy rewriting query: - append '?' for fuzzy matching if filters are set - cdx['is_fuzzy'] set to '1' instead of True client-side: rewrite - add window.Request object rewrite - improved rewrite of wb server + path, avoid double-slash - fetch() rewrite proxy_to_obj() - proxy_to_obj() null check - WombatLocation prop change, skip if prop is the same --- pywb/rewrite/content_rewriter.py | 20 +++++++- pywb/rewrite/regex_rewriters.py | 40 ++++++++++++---- pywb/rewrite/test/test_content_rewriter.py | 22 ++++++++- pywb/rules.yaml | 36 ++++++++++++++ pywb/static/wombat.js | 48 +++++++++++++++++-- pywb/utils/canonicalize.py | 13 +++++ pywb/utils/loaders.py | 9 ++++ pywb/warcserver/index/fuzzymatcher.py | 33 +++++++++---- .../index/test/test_fuzzymatcher.py | 2 +- 9 files changed, 197 insertions(+), 26 deletions(-) diff --git a/pywb/rewrite/content_rewriter.py b/pywb/rewrite/content_rewriter.py index 7e03b5a8..b24645ec 100644 --- a/pywb/rewrite/content_rewriter.py +++ b/pywb/rewrite/content_rewriter.py @@ -12,7 +12,7 @@ import json from pywb.utils.io import StreamIter, BUFF_SIZE -from pywb.utils.loaders import load_yaml_config +from pywb.utils.loaders import load_yaml_config, load_py_name # ============================================================================ @@ -55,6 +55,10 @@ class BaseContentRewriter(object): parse_rules_func = self.init_js_regex(regexs) rule['js_regex_func'] = parse_rules_func + mixin = rule.get('mixin') + if mixin: + rule['mixin'] = load_py_name(mixin) + return rule def get_rule(self, cdx): @@ -73,6 +77,11 @@ class BaseContentRewriter(object): rw_type = rule.get(text_type, text_type) rw_class = self.get_rewriter(rw_type, rwinfo) + mixin = rule.get('mixin') + if mixin: + mixin_params = rule.get('mixin_params', {}) + rw_class = type('custom_js_rewriter', (mixin, rw_class), mixin_params) + return rw_type, rw_class def create_rewriter(self, text_type, rule, rwinfo, cdx, head_insert_func=None): @@ -159,8 +168,15 @@ class BaseContentRewriter(object): rwinfo = RewriteInfo(record, self, url_rewriter, cookie_rewriter) content_rewriter = None + url_rewriter.rewrite_opts['cdx'] = cdx + + rule = self.get_rule(cdx) + + force_type = rule.get('force_type') + if force_type: + rwinfo.text_type = force_type + if rwinfo.should_rw_content(): - rule = self.get_rule(cdx) content_rewriter = self.create_rewriter(rwinfo.text_type, rule, rwinfo, cdx, head_insert_func) gen = None diff --git a/pywb/rewrite/regex_rewriters.py b/pywb/rewrite/regex_rewriters.py index 1630c5cc..086a3ad7 100644 --- a/pywb/rewrite/regex_rewriters.py +++ b/pywb/rewrite/regex_rewriters.py @@ -1,14 +1,7 @@ import re from pywb.rewrite.content_rewriter import StreamingRewriter - - -# ================================================================= -def load_function(string): - import importlib - - string = string.split(':', 1) - mod = importlib.import_module(string[0]) - return getattr(mod, 
string[1]) +from pywb.utils.loaders import load_py_name +from six.moves.urllib.parse import unquote # ================================================================= @@ -101,7 +94,7 @@ class RegexRewriter(StreamingRewriter): if 'rewrite' in obj: replace = RegexRewriter.archival_rewrite(rewriter) elif 'function' in obj: - replace = load_function(obj['function']) + replace = load_py_name(obj['function']) else: replace = RegexRewriter.format(obj.get('replace', '{0}')) group = obj.get('group', 0) @@ -259,6 +252,33 @@ class JSWombatProxyRewriter(JSWombatProxyRewriterMixin, RegexRewriter): pass +# ================================================================= +class JSReplaceFuzzy(object): + rx_obj = None + + def __init__(self, *args, **kwargs): + super(JSReplaceFuzzy, self).__init__(*args, **kwargs) + if not self.rx_obj: + self.rx_obj = re.compile(self.rx) + + def rewrite(self, string): + string = super(JSReplaceFuzzy, self).rewrite(string) + cdx = self.url_rewriter.rewrite_opts['cdx'] + if cdx.get('is_fuzzy'): + expected = unquote(cdx['url']) + actual = unquote(self.url_rewriter.wburl.url) + + exp_m = self.rx_obj.search(expected) + act_m = self.rx_obj.search(actual) + + if exp_m and act_m: + result = string.replace(exp_m.group(1), act_m.group(1)) + if result != string: + string = result + + return string + + # ================================================================= # Set 'default' JSRewriter JSRewriter = JSLinkAndLocationRewriter diff --git a/pywb/rewrite/test/test_content_rewriter.py b/pywb/rewrite/test/test_content_rewriter.py index 56f9ccc5..ba83e166 100644 --- a/pywb/rewrite/test/test_content_rewriter.py +++ b/pywb/rewrite/test/test_content_rewriter.py @@ -50,17 +50,20 @@ class TestContentRewriter(object): warc_headers_dict=warc_headers) def rewrite_record(self, headers, content, ts, url='http://example.com/', - prefix='http://localhost:8080/prefix/', warc_headers=None): + prefix='http://localhost:8080/prefix/', warc_headers=None, + request_url=None): record = self._create_response_record(url, headers, content, warc_headers) - wburl = WbUrl(ts + '/' + url) + wburl = WbUrl(ts + '/' + (request_url or url)) url_rewriter = UrlRewriter(wburl, prefix) cdx = CDXObject() cdx['url'] = url cdx['timestamp'] = ts cdx['urlkey'] = canonicalize(url) + if request_url != url: + cdx['is_fuzzy'] = '1' return self.content_rewriter(record, url_rewriter, None, cdx=cdx) @@ -254,6 +257,21 @@ class TestContentRewriter(object): assert b''.join(gen).decode('utf-8') == content + def test_custom_fuzzy_replace(self): + headers = {'Content-Type': 'application/octet-stream'} + content = '{"ssid":"1234"}' + + actual_url = 'http://facebook.com/ajax/pagelet/generic.php/photoviewerinitpagelet?data="ssid":1234' + request_url = 'http://facebook.com/ajax/pagelet/generic.php/photoviewerinitpagelet?data="ssid":5678' + + headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701mp_', + url=actual_url, + request_url=request_url) + + assert headers.headers == [('Content-Type', 'application/octet-stream')] + + assert b''.join(gen).decode('utf-8') == '{"ssid":"5678"}' + def test_hls_default_max(self): headers = {'Content-Type': 'application/vnd.apple.mpegurl'} with open(os.path.join(get_test_dir(), 'text_content', 'sample_hls.m3u8'), 'rt') as fh: diff --git a/pywb/rules.yaml b/pywb/rules.yaml index 4ade4f3c..40e17442 100644 --- a/pywb/rules.yaml +++ b/pywb/rules.yaml @@ -64,11 +64,38 @@ rules: # facebook rules #================================================================= + - url_prefix: 
'com,facebook)/ajax/pagelet/generic.php/photoviewerinitpagelet' + + rewrite: + mixin: 'pywb.rewrite.regex_rewriters:JSReplaceFuzzy' + mixin_params: + rx: '"ssid":([\d]+)' + + force_type: 'json' + + fuzzy_lookup: 'com,facebook\)/.*[?&]data=(.*?(?:[&]|(?:query_type|fbid)[^,]+))' + - url_prefix: 'com,facebook)/ajax/pagelet/generic.php/' #fuzzy_lookup: 'com,facebook\)/.*[?&]data=(.*?(?:[&]|query_type[^,]+))' fuzzy_lookup: 'com,facebook\)/.*[?&]data=(.*?(?:[&]|(?:query_type|fbid)[^,]+))' + - url_prefix: 'com,facebook)/ajax/ufi/reply_fetch.php' + + fuzzy_lookup: + - 'ft_ent_identifier' + - 'parent_comment_ids[0]' + - lsd + + - url_prefix: 'com,facebook)/ajax/ufi/comment_fetch.php' + + fuzzy_lookup: + - 'source' + - 'offset' + - 'length' + - 'ft_ent_identifier' + - 'feed_context' + - url_prefix: 'com,facebook)/ajax/ufi/' fuzzy_lookup: @@ -97,7 +124,16 @@ rules: fuzzy_lookup: '([?&][^_]\w+=[^&]+)+' + - url_prefix: 'com,facebook)/api/graphqlbatch' + + fuzzy_lookup: + match: '("q[\d]+":|after:\\"[^"]+)' + find_all: true + - url_prefix: 'com,facebook)/' + + fuzzy_lookup: '([?&][^_]\w+=[^&]+)+' + rewrite: js_regexs: - match: 'Bootloader\.configurePage.*?;' diff --git a/pywb/static/wombat.js b/pywb/static/wombat.js index 5449c128..0642a006 100644 --- a/pywb/static/wombat.js +++ b/pywb/static/wombat.js @@ -279,7 +279,10 @@ var _WBWombat = function($wbwindow, wbinfo) { } else { url = ""; } - url += "/" + path; + if (path && path[0] != "/") { + url += "/"; + } + url += path; } return url; @@ -516,6 +519,10 @@ var _WBWombat = function($wbwindow, wbinfo) { return; } + if (this["_" + prop] == value) { + return; + } + this["_" + prop] = value; if (!this._parser) { @@ -873,10 +880,44 @@ var _WBWombat = function($wbwindow, wbinfo) { init_opts = init_opts || {}; init_opts["credentials"] = "include"; - return orig_fetch.call(this, input, init_opts); + return orig_fetch.call(proxy_to_obj(this), input, init_opts); } } + + //============================================ + function init_request_override() + { + var orig_request = $wbwindow.Request; + + if (!orig_request) { + return; + } + + $wbwindow.Request = (function (Request) { + return function(input, init_opts) { + if (typeof(input) === "string") { + input = rewrite_url(input); + } else if (typeof(input) === "object" && input.url) { + var new_url = rewrite_url(input.url); + + if (new_url != input.url) { + // input = new Request(new_url, input); + input.url = new_url; + } + } + + init_opts = init_opts || {}; + init_opts["credentials"] = "include"; + + return new Request(input, init_opts); + } + + })($wbwindow.Request); + + $wbwindow.Request.prototype = orig_request.prototype; + } + //============================================ function override_prop_extract(proto, prop, cond) { var orig_getter = get_orig_getter(proto, prop); @@ -2767,7 +2808,7 @@ var _WBWombat = function($wbwindow, wbinfo) { //============================================ function proxy_to_obj(source) { try { - return source.__WBProxyRealObj__ || source; + return (source && source.__WBProxyRealObj__) || source; } catch (e) { return source; } @@ -2997,6 +3038,7 @@ var _WBWombat = function($wbwindow, wbinfo) { // Fetch init_fetch_rewrite(); + init_request_override(); // Worker override (experimental) init_web_worker_override(); diff --git a/pywb/utils/canonicalize.py b/pywb/utils/canonicalize.py index 2eab5f32..d04a7802 100644 --- a/pywb/utils/canonicalize.py +++ b/pywb/utils/canonicalize.py @@ -108,6 +108,16 @@ def calc_search_range(url, match_type, surt_ordered=True, url_canon=None): >>> 
calc_search_range('http://example.com/path/file.html', 'prefix') ('com,example)/path/file.html', 'com,example)/path/file.htmm') + # slash and ? + >>> calc_search_range('http://example.com/path/', 'prefix') + ('com,example)/path/', 'com,example)/path0') + + >>> calc_search_range('http://example.com/path?', 'prefix') + ('com,example)/path?', 'com,example)/path@') + + >>> calc_search_range('http://example.com/path/?', 'prefix') + ('com,example)/path?', 'com,example)/path@') + >>> calc_search_range('http://example.com/path/file.html', 'host') ('com,example)/', 'com,example*') @@ -158,6 +168,9 @@ def calc_search_range(url, match_type, surt_ordered=True, url_canon=None): if url.endswith('/') and not start_key.endswith('/'): start_key += '/' + if url.endswith('?') and not start_key.endswith('?'): + start_key += '?' + end_key = inc_last_char(start_key) elif match_type == 'host': diff --git a/pywb/utils/loaders.py b/pywb/utils/loaders.py index dcda32d4..d03582c9 100644 --- a/pywb/utils/loaders.py +++ b/pywb/utils/loaders.py @@ -30,6 +30,15 @@ except ImportError: #pragma: no cover s3_avail = False +# ================================================================= +def load_py_name(string): + import importlib + + string = string.split(':', 1) + mod = importlib.import_module(string[0]) + return getattr(mod, string[1]) + + #================================================================= def is_http(filename): return filename.startswith(('http://', 'https://')) diff --git a/pywb/warcserver/index/fuzzymatcher.py b/pywb/warcserver/index/fuzzymatcher.py index 47289401..d103cbc1 100644 --- a/pywb/warcserver/index/fuzzymatcher.py +++ b/pywb/warcserver/index/fuzzymatcher.py @@ -13,7 +13,7 @@ from collections import namedtuple # ============================================================================ FuzzyRule = namedtuple('FuzzyRule', 'url_prefix, regex, replace_after, filter_str, ' + - 'match_type') + 'match_type, find_all') # ============================================================================ @@ -54,14 +54,16 @@ class FuzzyMatcher(object): replace_after = self.DEFAULT_REPLACE_AFTER filter_str = self.DEFAULT_FILTER match_type = self.DEFAULT_MATCH_TYPE + find_all = False else: regex = self.make_regex(config.get('match')) replace_after = config.get('replace', self.DEFAULT_REPLACE_AFTER) filter_str = config.get('filter', self.DEFAULT_FILTER) match_type = config.get('type', self.DEFAULT_MATCH_TYPE) + find_all = config.get('find_all', False) - return FuzzyRule(url_prefix, regex, replace_after, filter_str, match_type) + return FuzzyRule(url_prefix, regex, replace_after, filter_str, match_type, find_all) def get_fuzzy_match(self, urlkey, params): filters = set() @@ -71,12 +73,18 @@ class FuzzyMatcher(object): if not any((urlkey.startswith(prefix) for prefix in rule.url_prefix)): continue - m = rule.regex.search(urlkey) - if not m: + groups = None + if rule.find_all: + groups = rule.regex.findall(urlkey) + else: + m = rule.regex.search(urlkey) + groups = m and m.groups() + + if not groups: continue matched_rule = rule - for g in m.groups(): + for g in groups: for f in matched_rule.filter_str: filters.add(f.format(g)) @@ -87,9 +95,18 @@ class FuzzyMatcher(object): url = params['url'] + # support matching w/o query if no additional filters + # don't include trailing '?' if no filters and replace_after '?' 
+ no_filters = (filters == {'urlkey:'}) and (matched_rule.replace_after == '?') + inx = url.find(matched_rule.replace_after) if inx > 0: - url = url[:inx + len(matched_rule.replace_after)] + length = inx + len(matched_rule.replace_after) + if no_filters: + length -= 1 + url = url[:length] + elif not no_filters: + url += matched_rule.replace_after[0] if matched_rule.match_type == 'domain': host = urlsplit(url).netloc @@ -98,7 +115,7 @@ class FuzzyMatcher(object): fuzzy_params = {'url': url, 'matchType': matched_rule.match_type, 'filter': filters, - 'is_fuzzy': True} + 'is_fuzzy': '1'} for key in iterkeys(params): if key not in self.FUZZY_SKIP_PARAMS: @@ -157,7 +174,7 @@ class FuzzyMatcher(object): for cdx in new_iter: if is_custom or self.match_general_fuzzy_query(url, urlkey, cdx, rx_cache): - cdx['is_fuzzy'] = True + cdx['is_fuzzy'] = '1' yield cdx def match_general_fuzzy_query(self, url, urlkey, cdx, rx_cache): diff --git a/pywb/warcserver/index/test/test_fuzzymatcher.py b/pywb/warcserver/index/test/test_fuzzymatcher.py index 08f552e2..8bf16f90 100644 --- a/pywb/warcserver/index/test/test_fuzzymatcher.py +++ b/pywb/warcserver/index/test/test_fuzzymatcher.py @@ -38,7 +38,7 @@ class TestFuzzy(object): def get_expected(self, url, mime='text/html', filters=None): filters = filters or {'urlkey:'} exp = [{'filter': filters, - 'is_fuzzy': True, + 'is_fuzzy': '1', 'urlkey': canonicalize(url), 'source': 'source', 'source-coll': 'source',
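
The JSReplaceFuzzy mixin added in regex_rewriters.py swaps a value captured from the archived URL for the value captured from the requested URL whenever the response was served via a fuzzy match (cdx['is_fuzzy'] == '1'). The sketch below shows that substitution step in isolation, outside the pywb class hierarchy, using the 'ssid' regex from the new facebook rule and the values from test_custom_fuzzy_replace; the fuzzy_replace helper is illustrative only and not part of pywb.

import re
from urllib.parse import unquote


def fuzzy_replace(body, archived_url, requested_url, rx=r'"ssid":([\d]+)'):
    # Capture the value from the URL stored in the archive and from the URL
    # the client actually requested; if both are present, substitute the
    # archived value with the requested one in the response body.
    rx_obj = re.compile(rx)
    exp_m = rx_obj.search(unquote(archived_url))
    act_m = rx_obj.search(unquote(requested_url))
    if exp_m and act_m:
        return body.replace(exp_m.group(1), act_m.group(1))
    return body


# Values mirror test_custom_fuzzy_replace above: the archived body for
# ssid 1234 is rewritten to carry the ssid 5678 the client asked for.
body = '{"ssid":"1234"}'
archived = 'http://facebook.com/ajax/pagelet/generic.php/photoviewerinitpagelet?data="ssid":1234'
requested = 'http://facebook.com/ajax/pagelet/generic.php/photoviewerinitpagelet?data="ssid":5678'
assert fuzzy_replace(body, archived, requested) == '{"ssid":"5678"}'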