From a138fca5e3bea04e083340082918b21f25d0ba80 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 29 May 2018 08:57:50 -0700 Subject: [PATCH] jsonp rewriter: expand jsonp matching: (#336) - treat as jsonp if url query contains 'callback=jsonp', - fuzzy match query containing 'callback=jsonp' - tests: add test for additional jsonp matching --- pywb/rewrite/content_rewriter.py | 9 ++++++++- pywb/rewrite/test/test_content_rewriter.py | 13 +++++++++++++ pywb/rules.yaml | 4 ++++ 3 files changed, 25 insertions(+), 1 deletion(-) diff --git a/pywb/rewrite/content_rewriter.py b/pywb/rewrite/content_rewriter.py index 295e11e4..387b6b09 100644 --- a/pywb/rewrite/content_rewriter.py +++ b/pywb/rewrite/content_rewriter.py @@ -313,6 +313,11 @@ class StreamingRewriter(object): class RewriteInfo(object): TAG_REGEX = re.compile(b'^\s*\<') + JSONP_CONTAINS = ['callback=jQuery', + 'callback=jsonp', + '.json?' + ] + def __init__(self, record, content_rewriter, url_rewriter, cookie_rewriter=None): self.record = record @@ -347,12 +352,14 @@ class RewriteInfo(object): orig_text_type = self.rewrite_types.get(mime) text_type = self._resolve_text_type(orig_text_type) + url = self.url_rewriter.wburl.url if text_type in ('guess-text', 'guess-bin'): text_type = None if text_type == 'js': - if 'callback=jQuery' in self.url_rewriter.wburl.url or '.json?' in self.url_rewriter.wburl.url: + # determine if url contains strings that indicate jsonp + if any(jsonp_string in url for jsonp_string in self.JSONP_CONTAINS): text_type = 'json' if (text_type and orig_text_type != text_type) or text_type == 'html': diff --git a/pywb/rewrite/test/test_content_rewriter.py b/pywb/rewrite/test/test_content_rewriter.py index f588a2e4..3a955d50 100644 --- a/pywb/rewrite/test/test_content_rewriter.py +++ b/pywb/rewrite/test/test_content_rewriter.py @@ -250,6 +250,19 @@ class TestContentRewriter(object): exp = 'jQuery_DEF({"foo": "bar"});' assert b''.join(gen).decode('utf-8') == exp + def test_rewrite_js_as_json_generic_jsonp(self): + headers = {'Content-Type': 'application/json'} + content = '/**/ jsonpCallbackABCDEF({"foo": "bar"});' + + headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701js_', + url='http://example.com/path/file?callback=jsonpCallback12345') + + # content-type unchanged + assert ('Content-Type', 'application/json') in headers.headers + + exp = 'jsonpCallback12345({"foo": "bar"});' + assert b''.join(gen).decode('utf-8') == exp + def test_rewrite_js_not_json(self): # callback not set headers = {} diff --git a/pywb/rules.yaml b/pywb/rules.yaml index 52121c53..f18222b1 100644 --- a/pywb/rules.yaml +++ b/pywb/rules.yaml @@ -34,6 +34,10 @@ default_filters: - match: '[?&]utm_[^=]+=[^&]+(?=&|$)' replace: '' + # additional callback=jsonpCallbackXYZ + - match: '[?&](callback=jsonp)[^&]+(?=&|$)' + replace: '\1' + # remove jquery callback dynamic timestamp - match: '[?&]((?:\w+)=jquery)[\d]+_[\d]+' replace: '\1'