From bcbc00a89b87c9a5c00e297def6da44df8684db1 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 31 Oct 2017 20:35:29 -0700 Subject: [PATCH] Fuzzy Rewrite Improvements (#263) rules system: - 'mixin' class for adding custom rewrite mixin, initialized with optional 'mixin_params' - 'force_type' to always force rewriting text type for rule match (eg. if application/octet-stream) - fuzzy rewrite: 'find_all' mode for matching via regex.findall() instead of search() - load_function moved to generic load_py_name - new rules for fb! - JSReplaceFuzzy mixin to replace content based on query (or POST) regex match - tests: tests JSReplaceFuzzy rewriting query: - append '?' for fuzzy matching if filters are set - cdx['is_fuzzy'] set to '1' instead of True client-side: rewrite - add window.Request object rewrite - improved rewrite of wb server + path, avoid double-slash - fetch() rewrite proxy_to_obj() - proxy_to_obj() null check - WombatLocation prop change, skip if prop is the same --- pywb/rewrite/content_rewriter.py | 20 +++++++- pywb/rewrite/regex_rewriters.py | 40 ++++++++++++---- pywb/rewrite/test/test_content_rewriter.py | 22 ++++++++- pywb/rules.yaml | 36 ++++++++++++++ pywb/static/wombat.js | 48 +++++++++++++++++-- pywb/utils/canonicalize.py | 13 +++++ pywb/utils/loaders.py | 9 ++++ pywb/warcserver/index/fuzzymatcher.py | 33 +++++++++---- .../index/test/test_fuzzymatcher.py | 2 +- 9 files changed, 197 insertions(+), 26 deletions(-) diff --git a/pywb/rewrite/content_rewriter.py b/pywb/rewrite/content_rewriter.py index 7e03b5a8..b24645ec 100644 --- a/pywb/rewrite/content_rewriter.py +++ b/pywb/rewrite/content_rewriter.py @@ -12,7 +12,7 @@ import json from pywb.utils.io import StreamIter, BUFF_SIZE -from pywb.utils.loaders import load_yaml_config +from pywb.utils.loaders import load_yaml_config, load_py_name # ============================================================================ @@ -55,6 +55,10 @@ class BaseContentRewriter(object): parse_rules_func = self.init_js_regex(regexs) rule['js_regex_func'] = parse_rules_func + mixin = rule.get('mixin') + if mixin: + rule['mixin'] = load_py_name(mixin) + return rule def get_rule(self, cdx): @@ -73,6 +77,11 @@ class BaseContentRewriter(object): rw_type = rule.get(text_type, text_type) rw_class = self.get_rewriter(rw_type, rwinfo) + mixin = rule.get('mixin') + if mixin: + mixin_params = rule.get('mixin_params', {}) + rw_class = type('custom_js_rewriter', (mixin, rw_class), mixin_params) + return rw_type, rw_class def create_rewriter(self, text_type, rule, rwinfo, cdx, head_insert_func=None): @@ -159,8 +168,15 @@ class BaseContentRewriter(object): rwinfo = RewriteInfo(record, self, url_rewriter, cookie_rewriter) content_rewriter = None + url_rewriter.rewrite_opts['cdx'] = cdx + + rule = self.get_rule(cdx) + + force_type = rule.get('force_type') + if force_type: + rwinfo.text_type = force_type + if rwinfo.should_rw_content(): - rule = self.get_rule(cdx) content_rewriter = self.create_rewriter(rwinfo.text_type, rule, rwinfo, cdx, head_insert_func) gen = None diff --git a/pywb/rewrite/regex_rewriters.py b/pywb/rewrite/regex_rewriters.py index 1630c5cc..086a3ad7 100644 --- a/pywb/rewrite/regex_rewriters.py +++ b/pywb/rewrite/regex_rewriters.py @@ -1,14 +1,7 @@ import re from pywb.rewrite.content_rewriter import StreamingRewriter - - -# ================================================================= -def load_function(string): - import importlib - - string = string.split(':', 1) - mod = importlib.import_module(string[0]) - return getattr(mod, 
string[1]) +from pywb.utils.loaders import load_py_name +from six.moves.urllib.parse import unquote # ================================================================= @@ -101,7 +94,7 @@ class RegexRewriter(StreamingRewriter): if 'rewrite' in obj: replace = RegexRewriter.archival_rewrite(rewriter) elif 'function' in obj: - replace = load_function(obj['function']) + replace = load_py_name(obj['function']) else: replace = RegexRewriter.format(obj.get('replace', '{0}')) group = obj.get('group', 0) @@ -259,6 +252,33 @@ class JSWombatProxyRewriter(JSWombatProxyRewriterMixin, RegexRewriter): pass +# ================================================================= +class JSReplaceFuzzy(object): + rx_obj = None + + def __init__(self, *args, **kwargs): + super(JSReplaceFuzzy, self).__init__(*args, **kwargs) + if not self.rx_obj: + self.rx_obj = re.compile(self.rx) + + def rewrite(self, string): + string = super(JSReplaceFuzzy, self).rewrite(string) + cdx = self.url_rewriter.rewrite_opts['cdx'] + if cdx.get('is_fuzzy'): + expected = unquote(cdx['url']) + actual = unquote(self.url_rewriter.wburl.url) + + exp_m = self.rx_obj.search(expected) + act_m = self.rx_obj.search(actual) + + if exp_m and act_m: + result = string.replace(exp_m.group(1), act_m.group(1)) + if result != string: + string = result + + return string + + # ================================================================= # Set 'default' JSRewriter JSRewriter = JSLinkAndLocationRewriter diff --git a/pywb/rewrite/test/test_content_rewriter.py b/pywb/rewrite/test/test_content_rewriter.py index 56f9ccc5..ba83e166 100644 --- a/pywb/rewrite/test/test_content_rewriter.py +++ b/pywb/rewrite/test/test_content_rewriter.py @@ -50,17 +50,20 @@ class TestContentRewriter(object): warc_headers_dict=warc_headers) def rewrite_record(self, headers, content, ts, url='http://example.com/', - prefix='http://localhost:8080/prefix/', warc_headers=None): + prefix='http://localhost:8080/prefix/', warc_headers=None, + request_url=None): record = self._create_response_record(url, headers, content, warc_headers) - wburl = WbUrl(ts + '/' + url) + wburl = WbUrl(ts + '/' + (request_url or url)) url_rewriter = UrlRewriter(wburl, prefix) cdx = CDXObject() cdx['url'] = url cdx['timestamp'] = ts cdx['urlkey'] = canonicalize(url) + if request_url != url: + cdx['is_fuzzy'] = '1' return self.content_rewriter(record, url_rewriter, None, cdx=cdx) @@ -254,6 +257,21 @@ class TestContentRewriter(object): assert b''.join(gen).decode('utf-8') == content + def test_custom_fuzzy_replace(self): + headers = {'Content-Type': 'application/octet-stream'} + content = '{"ssid":"1234"}' + + actual_url = 'http://facebook.com/ajax/pagelet/generic.php/photoviewerinitpagelet?data="ssid":1234' + request_url = 'http://facebook.com/ajax/pagelet/generic.php/photoviewerinitpagelet?data="ssid":5678' + + headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701mp_', + url=actual_url, + request_url=request_url) + + assert headers.headers == [('Content-Type', 'application/octet-stream')] + + assert b''.join(gen).decode('utf-8') == '{"ssid":"5678"}' + def test_hls_default_max(self): headers = {'Content-Type': 'application/vnd.apple.mpegurl'} with open(os.path.join(get_test_dir(), 'text_content', 'sample_hls.m3u8'), 'rt') as fh: diff --git a/pywb/rules.yaml b/pywb/rules.yaml index 4ade4f3c..40e17442 100644 --- a/pywb/rules.yaml +++ b/pywb/rules.yaml @@ -64,11 +64,38 @@ rules: # facebook rules #================================================================= + - url_prefix: 
'com,facebook)/ajax/pagelet/generic.php/photoviewerinitpagelet' + + rewrite: + mixin: 'pywb.rewrite.regex_rewriters:JSReplaceFuzzy' + mixin_params: + rx: '"ssid":([\d]+)' + + force_type: 'json' + + fuzzy_lookup: 'com,facebook\)/.*[?&]data=(.*?(?:[&]|(?:query_type|fbid)[^,]+))' + - url_prefix: 'com,facebook)/ajax/pagelet/generic.php/' #fuzzy_lookup: 'com,facebook\)/.*[?&]data=(.*?(?:[&]|query_type[^,]+))' fuzzy_lookup: 'com,facebook\)/.*[?&]data=(.*?(?:[&]|(?:query_type|fbid)[^,]+))' + - url_prefix: 'com,facebook)/ajax/ufi/reply_fetch.php' + + fuzzy_lookup: + - 'ft_ent_identifier' + - 'parent_comment_ids[0]' + - lsd + + - url_prefix: 'com,facebook)/ajax/ufi/comment_fetch.php' + + fuzzy_lookup: + - 'source' + - 'offset' + - 'length' + - 'ft_ent_identifier' + - 'feed_context' + - url_prefix: 'com,facebook)/ajax/ufi/' fuzzy_lookup: @@ -97,7 +124,16 @@ rules: fuzzy_lookup: '([?&][^_]\w+=[^&]+)+' + - url_prefix: 'com,facebook)/api/graphqlbatch' + + fuzzy_lookup: + match: '("q[\d]+":|after:\\"[^"]+)' + find_all: true + - url_prefix: 'com,facebook)/' + + fuzzy_lookup: '([?&][^_]\w+=[^&]+)+' + rewrite: js_regexs: - match: 'Bootloader\.configurePage.*?;' diff --git a/pywb/static/wombat.js b/pywb/static/wombat.js index 5449c128..0642a006 100644 --- a/pywb/static/wombat.js +++ b/pywb/static/wombat.js @@ -279,7 +279,10 @@ var _WBWombat = function($wbwindow, wbinfo) { } else { url = ""; } - url += "/" + path; + if (path && path[0] != "/") { + url += "/"; + } + url += path; } return url; @@ -516,6 +519,10 @@ var _WBWombat = function($wbwindow, wbinfo) { return; } + if (this["_" + prop] == value) { + return; + } + this["_" + prop] = value; if (!this._parser) { @@ -873,10 +880,44 @@ var _WBWombat = function($wbwindow, wbinfo) { init_opts = init_opts || {}; init_opts["credentials"] = "include"; - return orig_fetch.call(this, input, init_opts); + return orig_fetch.call(proxy_to_obj(this), input, init_opts); } } + + //============================================ + function init_request_override() + { + var orig_request = $wbwindow.Request; + + if (!orig_request) { + return; + } + + $wbwindow.Request = (function (Request) { + return function(input, init_opts) { + if (typeof(input) === "string") { + input = rewrite_url(input); + } else if (typeof(input) === "object" && input.url) { + var new_url = rewrite_url(input.url); + + if (new_url != input.url) { + // input = new Request(new_url, input); + input.url = new_url; + } + } + + init_opts = init_opts || {}; + init_opts["credentials"] = "include"; + + return new Request(input, init_opts); + } + + })($wbwindow.Request); + + $wbwindow.Request.prototype = orig_request.prototype; + } + //============================================ function override_prop_extract(proto, prop, cond) { var orig_getter = get_orig_getter(proto, prop); @@ -2767,7 +2808,7 @@ var _WBWombat = function($wbwindow, wbinfo) { //============================================ function proxy_to_obj(source) { try { - return source.__WBProxyRealObj__ || source; + return (source && source.__WBProxyRealObj__) || source; } catch (e) { return source; } @@ -2997,6 +3038,7 @@ var _WBWombat = function($wbwindow, wbinfo) { // Fetch init_fetch_rewrite(); + init_request_override(); // Worker override (experimental) init_web_worker_override(); diff --git a/pywb/utils/canonicalize.py b/pywb/utils/canonicalize.py index 2eab5f32..d04a7802 100644 --- a/pywb/utils/canonicalize.py +++ b/pywb/utils/canonicalize.py @@ -108,6 +108,16 @@ def calc_search_range(url, match_type, surt_ordered=True, url_canon=None): >>> 
calc_search_range('http://example.com/path/file.html', 'prefix') ('com,example)/path/file.html', 'com,example)/path/file.htmm') + # slash and ? + >>> calc_search_range('http://example.com/path/', 'prefix') + ('com,example)/path/', 'com,example)/path0') + + >>> calc_search_range('http://example.com/path?', 'prefix') + ('com,example)/path?', 'com,example)/path@') + + >>> calc_search_range('http://example.com/path/?', 'prefix') + ('com,example)/path?', 'com,example)/path@') + >>> calc_search_range('http://example.com/path/file.html', 'host') ('com,example)/', 'com,example*') @@ -158,6 +168,9 @@ def calc_search_range(url, match_type, surt_ordered=True, url_canon=None): if url.endswith('/') and not start_key.endswith('/'): start_key += '/' + if url.endswith('?') and not start_key.endswith('?'): + start_key += '?' + end_key = inc_last_char(start_key) elif match_type == 'host': diff --git a/pywb/utils/loaders.py b/pywb/utils/loaders.py index dcda32d4..d03582c9 100644 --- a/pywb/utils/loaders.py +++ b/pywb/utils/loaders.py @@ -30,6 +30,15 @@ except ImportError: #pragma: no cover s3_avail = False +# ================================================================= +def load_py_name(string): + import importlib + + string = string.split(':', 1) + mod = importlib.import_module(string[0]) + return getattr(mod, string[1]) + + #================================================================= def is_http(filename): return filename.startswith(('http://', 'https://')) diff --git a/pywb/warcserver/index/fuzzymatcher.py b/pywb/warcserver/index/fuzzymatcher.py index 47289401..d103cbc1 100644 --- a/pywb/warcserver/index/fuzzymatcher.py +++ b/pywb/warcserver/index/fuzzymatcher.py @@ -13,7 +13,7 @@ from collections import namedtuple # ============================================================================ FuzzyRule = namedtuple('FuzzyRule', 'url_prefix, regex, replace_after, filter_str, ' + - 'match_type') + 'match_type, find_all') # ============================================================================ @@ -54,14 +54,16 @@ class FuzzyMatcher(object): replace_after = self.DEFAULT_REPLACE_AFTER filter_str = self.DEFAULT_FILTER match_type = self.DEFAULT_MATCH_TYPE + find_all = False else: regex = self.make_regex(config.get('match')) replace_after = config.get('replace', self.DEFAULT_REPLACE_AFTER) filter_str = config.get('filter', self.DEFAULT_FILTER) match_type = config.get('type', self.DEFAULT_MATCH_TYPE) + find_all = config.get('find_all', False) - return FuzzyRule(url_prefix, regex, replace_after, filter_str, match_type) + return FuzzyRule(url_prefix, regex, replace_after, filter_str, match_type, find_all) def get_fuzzy_match(self, urlkey, params): filters = set() @@ -71,12 +73,18 @@ class FuzzyMatcher(object): if not any((urlkey.startswith(prefix) for prefix in rule.url_prefix)): continue - m = rule.regex.search(urlkey) - if not m: + groups = None + if rule.find_all: + groups = rule.regex.findall(urlkey) + else: + m = rule.regex.search(urlkey) + groups = m and m.groups() + + if not groups: continue matched_rule = rule - for g in m.groups(): + for g in groups: for f in matched_rule.filter_str: filters.add(f.format(g)) @@ -87,9 +95,18 @@ class FuzzyMatcher(object): url = params['url'] + # support matching w/o query if no additional filters + # don't include trailing '?' if no filters and replace_after '?' 
+ no_filters = (filters == {'urlkey:'}) and (matched_rule.replace_after == '?') + inx = url.find(matched_rule.replace_after) if inx > 0: - url = url[:inx + len(matched_rule.replace_after)] + length = inx + len(matched_rule.replace_after) + if no_filters: + length -= 1 + url = url[:length] + elif not no_filters: + url += matched_rule.replace_after[0] if matched_rule.match_type == 'domain': host = urlsplit(url).netloc @@ -98,7 +115,7 @@ class FuzzyMatcher(object): fuzzy_params = {'url': url, 'matchType': matched_rule.match_type, 'filter': filters, - 'is_fuzzy': True} + 'is_fuzzy': '1'} for key in iterkeys(params): if key not in self.FUZZY_SKIP_PARAMS: @@ -157,7 +174,7 @@ class FuzzyMatcher(object): for cdx in new_iter: if is_custom or self.match_general_fuzzy_query(url, urlkey, cdx, rx_cache): - cdx['is_fuzzy'] = True + cdx['is_fuzzy'] = '1' yield cdx def match_general_fuzzy_query(self, url, urlkey, cdx, rx_cache): diff --git a/pywb/warcserver/index/test/test_fuzzymatcher.py b/pywb/warcserver/index/test/test_fuzzymatcher.py index 08f552e2..8bf16f90 100644 --- a/pywb/warcserver/index/test/test_fuzzymatcher.py +++ b/pywb/warcserver/index/test/test_fuzzymatcher.py @@ -38,7 +38,7 @@ class TestFuzzy(object): def get_expected(self, url, mime='text/html', filters=None): filters = filters or {'urlkey:'} exp = [{'filter': filters, - 'is_fuzzy': True, + 'is_fuzzy': '1', 'urlkey': canonicalize(url), 'source': 'source', 'source-coll': 'source',
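
The JSReplaceFuzzy mixin added in regex_rewriters.py swaps a value captured from the archived URL for the value captured from the requested URL whenever the response was served via a fuzzy match (cdx['is_fuzzy'] == '1'). The sketch below shows that substitution step in isolation, outside the pywb class hierarchy, using the 'ssid' regex from the new facebook rule and the values from test_custom_fuzzy_replace; the fuzzy_replace helper is illustrative only and not part of pywb.

import re
from urllib.parse import unquote


def fuzzy_replace(body, archived_url, requested_url, rx=r'"ssid":([\d]+)'):
    # Capture the value from the URL stored in the archive and from the URL
    # the client actually requested; if both are present, substitute the
    # archived value with the requested one in the response body.
    rx_obj = re.compile(rx)
    exp_m = rx_obj.search(unquote(archived_url))
    act_m = rx_obj.search(unquote(requested_url))
    if exp_m and act_m:
        return body.replace(exp_m.group(1), act_m.group(1))
    return body


# Values mirror test_custom_fuzzy_replace above: the archived body for
# ssid 1234 is rewritten to carry the ssid 5678 the client asked for.
body = '{"ssid":"1234"}'
archived = 'http://facebook.com/ajax/pagelet/generic.php/photoviewerinitpagelet?data="ssid":1234'
requested = 'http://facebook.com/ajax/pagelet/generic.php/photoviewerinitpagelet?data="ssid":5678'
assert fuzzy_replace(body, archived, requested) == '{"ssid":"5678"}'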