diff --git a/pywb/rewrite/content_rewriter.py b/pywb/rewrite/content_rewriter.py index b24645ec..3906026e 100644 --- a/pywb/rewrite/content_rewriter.py +++ b/pywb/rewrite/content_rewriter.py @@ -172,9 +172,8 @@ class BaseContentRewriter(object): rule = self.get_rule(cdx) - force_type = rule.get('force_type') - if force_type: - rwinfo.text_type = force_type + if rule.get('mixin') and not rwinfo.text_type: + rwinfo.text_type = rule.get('mixin_type', 'json') if rwinfo.should_rw_content(): content_rewriter = self.create_rewriter(rwinfo.text_type, rule, rwinfo, cdx, head_insert_func) diff --git a/pywb/rules.yaml b/pywb/rules.yaml index 40e17442..8697974a 100644 --- a/pywb/rules.yaml +++ b/pywb/rules.yaml @@ -71,14 +71,18 @@ rules: mixin_params: rx: '"ssid":([\d]+)' - force_type: 'json' - - fuzzy_lookup: 'com,facebook\)/.*[?&]data=(.*?(?:[&]|(?:query_type|fbid)[^,]+))' + fuzzy_lookup: 'com,facebook\)/.*[?&]data=(?:.*?(?:[&]|(query_type|fbid|v|cursor|data)[^,]+))' + + - url_prefix: 'com,facebook)/ajax/pagelet/generic.php/photoviewerpagelet' + + fuzzy_lookup: + match: '("(?:cursor|cursorindex)":["\d\w]+)' + find_all: true - url_prefix: 'com,facebook)/ajax/pagelet/generic.php/' #fuzzy_lookup: 'com,facebook\)/.*[?&]data=(.*?(?:[&]|query_type[^,]+))' - fuzzy_lookup: 'com,facebook\)/.*[?&]data=(.*?(?:[&]|(?:query_type|fbid)[^,]+))' + fuzzy_lookup: 'com,facebook\)/.*[?&]data=(?:.*?(?:[&]|(query_type|fbid|v|cursor|data)[^,]+))' - url_prefix: 'com,facebook)/ajax/ufi/reply_fetch.php' @@ -119,6 +123,13 @@ rules: fuzzy_lookup: - __user + - url_prefix: 'com,facebook)/ajax/photos/' + + fuzzy_lookup: + - __spin_r + - __spin_t + - __dyn + # fallback for all /ajax/ - url_prefix: 'com,facebook)/ajax/' diff --git a/pywb/warcserver/index/test/test_fuzzymatcher.py b/pywb/warcserver/index/test/test_fuzzymatcher.py index 8bf16f90..27a92088 100644 --- a/pywb/warcserver/index/test/test_fuzzymatcher.py +++ b/pywb/warcserver/index/test/test_fuzzymatcher.py @@ -122,6 +122,17 @@ class TestFuzzy(object): assert list(cdx_iter) == self.get_expected(url=actual_url, filters=filters) + def test_fuzzy_find_all_rule(self): + url = 'http://facebook.com/ajax/pagelet/generic.php/photoviewerpagelet?data={"cursor":"ABC","food":"bar","cursorindex":6,"A":12345,"B":"foo"}' + actual_url = 'http://facebook.com/ajax/pagelet/generic.php/photoviewerpagelet?data={"some":data","cursor":"ABC","foo":"bar","cursorindex":6}' + + params = self.get_params(url, actual_url) + cdx_iter, errs = self.fuzzy(self.source, params) + filters = {'urlkey:"cursor":"abc"', + 'urlkey:"cursorindex":6'} + + assert list(cdx_iter) == self.get_expected(url=actual_url, filters=filters) + def test_no_fuzzy_custom_rule_video_id_diff(self): url = 'http://youtube.com/get_video_info?a=b&html=true&___abc=123&video_id=ABCD&id=1234' actual_url = 'http://youtube.com/get_video_info?a=d&html=true&___abc=125&video_id=ABCE&id=1234'