From 827ba9b50f1ae904425b6232b995d3e0bba5ddfa Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 30 Jun 2016 12:26:18 -0400 Subject: [PATCH 01/32] cookies: add optional callback when setting cookie with domain (to experiment with server side handling of custom domain) --- pywb/static/wombat.js | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pywb/static/wombat.js b/pywb/static/wombat.js index af280f94..33301c46 100644 --- a/pywb/static/wombat.js +++ b/pywb/static/wombat.js @@ -47,6 +47,8 @@ var wombat_internal = function($wbwindow) { var wb_wombat_updating = false; + var cookie_domain_callback = undefined; + // custom options var wb_opts; @@ -1858,6 +1860,11 @@ var wombat_internal = function($wbwindow) { // rewrite domain cookie = cookie.replace(cookie_domain_regex, function(m, m1) { + + if (cookie_domain_callback) { + cookie_domain_callback(m1, cookie.split(";", 1)[0]); + } + // if no subdomain, eg. "localhost", just remove domain altogether if ($wbwindow.location.hostname.indexOf(".") >= 0 && !IP_RX.test($wbwindow.location.hostname)) { return "Domain=." + $wbwindow.location.hostname; @@ -2250,6 +2257,10 @@ var wombat_internal = function($wbwindow) { this.extract_orig = extract_orig; this.rewrite_url = rewrite_url; this.watch_elem = watch_elem; + + this.cookie_callback = function(callback) { + cookie_domain_callback = callback; + } } function init_top_frame($wbwindow) { From 0b57f4a3529a6de55c1925b80cfc145335aea6e8 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 1 Jul 2016 09:58:25 -0400 Subject: [PATCH 02/32] cookie notification: use postMessage() instead of callback to notify top frame of cookie setting with custom domain, #186 --- pywb/static/wombat.js | 39 ++++++++++++++++++--------------------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/pywb/static/wombat.js b/pywb/static/wombat.js index 33301c46..6214aefd 100644 --- a/pywb/static/wombat.js +++ b/pywb/static/wombat.js @@ -47,8 +47,6 @@ var wombat_internal = function($wbwindow) { var wb_wombat_updating = false; - var cookie_domain_callback = undefined; - // custom options var wb_opts; @@ -1847,6 +1845,24 @@ var wombat_internal = function($wbwindow) { cookie = cookie.replace(wb_abs_prefix, ''); cookie = cookie.replace(wb_rel_prefix, ''); + // rewrite domain + cookie = cookie.replace(cookie_domain_regex, function(m, m1) { + var message = {"domain": m1, + "cookie": cookie, + "wb_type": "cookie", + } + + // norify of cookie setting to allow server-side tracking + $wbwindow.__WB_top_frame.postMessage(message, "*"); + + // if no subdomain, eg. "localhost", just remove domain altogether + if ($wbwindow.location.hostname.indexOf(".") >= 0 && !IP_RX.test($wbwindow.location.hostname)) { + return "Domain=." + $wbwindow.location.hostname; + } else { + return ""; + } + }); + // rewrite path cookie = cookie.replace(cookie_path_regex, function(m, m1) { var rewritten = rewrite_url(m1); @@ -1858,21 +1874,6 @@ var wombat_internal = function($wbwindow) { return "Path=" + rewritten; }); - // rewrite domain - cookie = cookie.replace(cookie_domain_regex, function(m, m1) { - - if (cookie_domain_callback) { - cookie_domain_callback(m1, cookie.split(";", 1)[0]); - } - - // if no subdomain, eg. "localhost", just remove domain altogether - if ($wbwindow.location.hostname.indexOf(".") >= 0 && !IP_RX.test($wbwindow.location.hostname)) { - return "Domain=." + $wbwindow.location.hostname; - } else { - return ""; - } - }); - // rewrite secure, if needed if ($wbwindow.location.protocol != "https:") { cookie = cookie.replace("secure", ""); @@ -2257,10 +2258,6 @@ var wombat_internal = function($wbwindow) { this.extract_orig = extract_orig; this.rewrite_url = rewrite_url; this.watch_elem = watch_elem; - - this.cookie_callback = function(callback) { - cookie_domain_callback = callback; - } } function init_top_frame($wbwindow) { From b46cf8492f04d9596da57c838c7cbf635e0d759f Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 16 Jul 2016 12:48:26 -0400 Subject: [PATCH 03/32] bump version to 0.31.5 --- README.rst | 2 +- pywb/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index 010a6f3e..4e5fec37 100644 --- a/README.rst +++ b/README.rst @@ -1,4 +1,4 @@ -PyWb 0.31.0 +PyWb 0.31.5 =========== .. image:: https://travis-ci.org/ikreymer/pywb.svg?branch=master diff --git a/pywb/__init__.py b/pywb/__init__.py index 9f66d658..7e67dee2 100644 --- a/pywb/__init__.py +++ b/pywb/__init__.py @@ -1,4 +1,4 @@ -__version__ = '0.31.0' +__version__ = '0.31.5' DEFAULT_CONFIG = 'pywb/default_config.yaml' From 605ee22bec4ee5e90ddb6df5050968a0d5ba5a33 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 16 Jul 2016 12:55:24 -0400 Subject: [PATCH 04/32] html rewrite: rewrite href on any element, not just few designated ones, as client side rewriting does the same. avoids edge cases where href used on other tags (eg. a div) that results in incorrect rewriting, #187 --- pywb/rewrite/html_rewriter.py | 5 +++++ pywb/rewrite/test/test_html_rewriter.py | 3 +++ 2 files changed, 8 insertions(+) diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py index 90148c1f..7fc105d9 100644 --- a/pywb/rewrite/html_rewriter.py +++ b/pywb/rewrite/html_rewriter.py @@ -328,6 +328,11 @@ class HTMLRewriterMixin(object): elif (tag == 'base') and (attr_name == 'href') and attr_value: rw_mod = handler.get(attr_name) attr_value = self._rewrite_base(attr_value, rw_mod) + + elif attr_name == 'href': + rw_mod = self.defmod + attr_value = self._rewrite_url(attr_value, rw_mod) + else: # rewrite url using tag handler rw_mod = handler.get(attr_name) diff --git a/pywb/rewrite/test/test_html_rewriter.py b/pywb/rewrite/test/test_html_rewriter.py index afb1da93..6c84e021 100644 --- a/pywb/rewrite/test/test_html_rewriter.py +++ b/pywb/rewrite/test/test_html_rewriter.py @@ -56,6 +56,9 @@ r""" >>> parse('') +# href on other tags +>>> parse('
Text
') +
Text
# HTML Entities >>> parse('›   > ?') From 64a49b3e4deae8689d006a4d6f680f8642011dd9 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 16 Jul 2016 13:18:08 -0400 Subject: [PATCH 05/32] wombat: history change improvements (#188): - ensure back, go, forward also propagated to top frame - ensure pushState propagated as pushState and replaceState as replaceState to top frame - security: prevent pushState or replaceState from changing to different domain --- pywb/static/wb_frame.js | 24 ++++++++++++++++-- pywb/static/wombat.js | 55 +++++++++++++++++++++++++++++++++++------ 2 files changed, 70 insertions(+), 9 deletions(-) diff --git a/pywb/static/wb_frame.js b/pywb/static/wb_frame.js index 168b914f..ffc11576 100644 --- a/pywb/static/wb_frame.js +++ b/pywb/static/wb_frame.js @@ -54,7 +54,11 @@ function push_state(state) { var canon_url = make_url(state.url, state.request_ts, ""); if (window.location.href != canon_url) { - window.history.replaceState(state, "", canon_url); + if (state.wb_type != "pushState") { + window.history.replaceState(state, "", canon_url); + } else { + window.history.pushState(state, "", canon_url); + } } set_state(state); @@ -172,7 +176,8 @@ function init_pm() { // Check if iframe url change message if (typeof(event.data) == "object" && event.data["wb_type"]) { - update_wb_url(event.data); + handle_message(event.data); + } else { // Pass to parent window.parent.postMessage(event.data, "*"); @@ -187,6 +192,21 @@ function init_pm() { } +function handle_message(state) { + var type = state.wb_type; + + if (type == "load" || type == "pushState" || type == "replaceState") { + update_wb_url(state); + } else if (type == "go") { + window.history.go(state.param); + } else if (type == "back") { + window.history.back(); + } else if (type == "forward") { + window.history.forward(); + } +} + + function update_wb_url(state) { if (curr_state.url == state.url && curr_state.ts == state.ts) { return; diff --git a/pywb/static/wombat.js b/pywb/static/wombat.js index 6214aefd..019de1ca 100644 --- a/pywb/static/wombat.js +++ b/pywb/static/wombat.js @@ -710,6 +710,10 @@ var wombat_internal = function($wbwindow) { $wbwindow.history['_orig_' + func_name] = orig_func; function rewritten_func(state_obj, title, url) { + if (!starts_with(url, $wbwindow.WB_wombat_location.origin + "/")) { + throw new DOMException("Invalid history change: " + url); + } + url = rewrite_url(url); if (url == $wbwindow.location.href) { @@ -718,15 +722,9 @@ var wombat_internal = function($wbwindow) { orig_func.call(this, state_obj, title, url); - //if ($wbwindow.__WB_top_frame && $wbwindow != $wbwindow.__WB_top_frame && $wbwindow.__WB_top_frame.update_wb_url) { - // $wbwindow.__WB_top_frame.update_wb_url($wbwindow.WB_wombat_location.href, - // wb_info.timestamp, - // wb_info.request_ts, - // wb_info.is_live); - //} if ($wbwindow.__WB_top_frame && $wbwindow != $wbwindow.__WB_top_frame) { var message = { - "url": url, + "url": extract_orig(url), "ts": wb_info.timestamp, "request_ts": wb_info.request_ts, "is_live": wb_info.is_live, @@ -746,6 +744,45 @@ var wombat_internal = function($wbwindow) { return rewritten_func; } + //============================================ + function override_history_nav(func_name) { + if (!$wbwindow.history) { + return; + } + + // Only useful for framed replay + if (!$wbwindow.__WB_top_frame || $wbwindow == $wbwindow.__WB_top_frame) { + return; + } + + var orig_func = $wbwindow.history[func_name]; + + if (!orig_func) { + return; + } + + function rewritten_func() { + orig_func.apply(this, arguments); + + var message = { + "wb_type": func_name, + } + + if (func_name == "go") { + message["param"] = arguments[0]; + } + + $wbwindow.__WB_top_frame.postMessage(message, "*"); + } + + $wbwindow.history[func_name] = rewritten_func; + if ($wbwindow.History && $wbwindow.History.prototype) { + $wbwindow.History.prototype[func_name] = rewritten_func; + } + + return rewritten_func; + } + //============================================ function init_ajax_rewrite() { if (!$wbwindow.XMLHttpRequest || @@ -2182,6 +2219,10 @@ var wombat_internal = function($wbwindow) { override_history_func("pushState"); override_history_func("replaceState"); + override_history_nav("go"); + override_history_nav("back"); + override_history_nav("forward"); + // open init_open_override(); From 42a2fa02fe83cc4b31046b979ad484543aedee65 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 16 Jul 2016 13:32:46 -0400 Subject: [PATCH 06/32] wombat: history check fix: ensure check applies to absolute url #188 --- pywb/static/wombat.js | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/pywb/static/wombat.js b/pywb/static/wombat.js index 019de1ca..c4e66618 100644 --- a/pywb/static/wombat.js +++ b/pywb/static/wombat.js @@ -710,12 +710,14 @@ var wombat_internal = function($wbwindow) { $wbwindow.history['_orig_' + func_name] = orig_func; function rewritten_func(state_obj, title, url) { - if (!starts_with(url, $wbwindow.WB_wombat_location.origin + "/")) { - throw new DOMException("Invalid history change: " + url); - } - url = rewrite_url(url); + var abs_url = extract_orig(url); + + if (!starts_with(abs_url, $wbwindow.WB_wombat_location.origin + "/")) { + throw new DOMException("Invalid history change: " + abs_url); + } + if (url == $wbwindow.location.href) { return; } @@ -724,7 +726,7 @@ var wombat_internal = function($wbwindow) { if ($wbwindow.__WB_top_frame && $wbwindow != $wbwindow.__WB_top_frame) { var message = { - "url": extract_orig(url), + "url": abs_url, "ts": wb_info.timestamp, "request_ts": wb_info.request_ts, "is_live": wb_info.is_live, From 782f95fa9782a0ead8692cba7fabcc54adcc8311 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 24 Jul 2016 19:39:43 -0400 Subject: [PATCH 07/32] rules: rules for yt video info update --- pywb/rules.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/pywb/rules.yaml b/pywb/rules.yaml index 3b4e08cf..7b72d976 100644 --- a/pywb/rules.yaml +++ b/pywb/rules.yaml @@ -187,6 +187,7 @@ rules: fuzzy_lookup: - video_id - html5 + - cpn - url_prefix: 'com,youtube,s)/api/stats/qoe' From 6928d72f688a2e10d2029ed99e9264995fd7ea00 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 26 Jul 2016 18:12:32 -0400 Subject: [PATCH 08/32] rewrite css: handle rewriting with entities around url() css by leaving them in place, eg: url("http://example.com/") --- pywb/rewrite/regex_rewriters.py | 2 +- pywb/rewrite/test/test_html_rewriter.py | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/pywb/rewrite/regex_rewriters.py b/pywb/rewrite/regex_rewriters.py index 1ec58f7f..831b0837 100644 --- a/pywb/rewrite/regex_rewriters.py +++ b/pywb/rewrite/regex_rewriters.py @@ -209,7 +209,7 @@ class XMLRewriter(RegexRewriter): #================================================================= class CSSRewriter(RegexRewriter): - CSS_URL_REGEX = "url\\s*\\(\\s*[\\\\\"']*([^)'\"]+)[\\\\\"']*\\s*\\)" + CSS_URL_REGEX = "url\\s*\\(\\s*(?:[\\\\\"']|(?:&.{1,4};))*([^)'\"]+)(?:[\\\\\"']|(?:&.{1,4};))*\\s*\\)" CSS_IMPORT_NO_URL_REGEX = ("@import\\s+(?!url)\\(?\\s*['\"]?" + "(?!url[\\s\\(])([\w.:/\\\\-]+)") diff --git a/pywb/rewrite/test/test_html_rewriter.py b/pywb/rewrite/test/test_html_rewriter.py index 6c84e021..02e55c11 100644 --- a/pywb/rewrite/test/test_html_rewriter.py +++ b/pywb/rewrite/test/test_html_rewriter.py @@ -153,6 +153,18 @@ r""" >>> parse('') +>>> parse('') + + +>>> parse('') + + +>>> parse("") + + +#>>> parse('') + + # Style >>> parse('') From cd15dbfe4887fa7113231b2d1d9fb4a44bc65eda Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 27 Jul 2016 10:34:54 -0400 Subject: [PATCH 09/32] head_insert: add decodeURI() to prefix to ensure unicode prefix string --- pywb/templates/head_insert.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pywb/templates/head_insert.html b/pywb/templates/head_insert.html index f398cc01..9e838b61 100644 --- a/pywb/templates/head_insert.html +++ b/pywb/templates/head_insert.html @@ -4,7 +4,7 @@ wbinfo.url = "{{ cdx.url }}"; wbinfo.timestamp = "{{ cdx.timestamp }}"; wbinfo.request_ts = "{{ wbrequest.wb_url.timestamp }}"; - wbinfo.prefix = "{{ wbrequest.wb_prefix }}"; + wbinfo.prefix = decodeURI("{{ wbrequest.wb_prefix }}"); wbinfo.mod = "{{ wbrequest.wb_url.mod }}"; wbinfo.top_url = "{{ top_url }}"; wbinfo.is_framed = {{ "true" if wbrequest.options.is_framed else "false" }}; From c8c0cecda3e9e73bdd3b50a3eb120fd8321c64fb Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 27 Jul 2016 21:34:58 -0400 Subject: [PATCH 10/32] rewrite improvements: if content-type is text/plain but mod is js_ or cs_, treat as js or css (#31) header rewriter: ensure removed content-length and content-encoding are added back if no rewriting performed on response body --- pywb/rewrite/header_rewriter.py | 19 ++++++++++++------- pywb/rewrite/rewrite_amf.py | 7 +++---- pywb/rewrite/rewrite_content.py | 16 +++++++++++----- pywb/rewrite/test/test_header_rewriter.py | 4 +++- tests/test_live_proxy.py | 4 ++-- 5 files changed, 31 insertions(+), 19 deletions(-) diff --git a/pywb/rewrite/header_rewriter.py b/pywb/rewrite/header_rewriter.py index 610df546..ba2a6d03 100644 --- a/pywb/rewrite/header_rewriter.py +++ b/pywb/rewrite/header_rewriter.py @@ -17,6 +17,12 @@ class RewrittenStatusAndHeaders(object): def contains_removed_header(self, name, value): return self.removed_header_dict.get(name) == value + def readd_rewrite_removed(self): + for name in HeaderRewriter.PROXY_NO_REWRITE_HEADERS: + value = self.removed_header_dict.get(name) + if value is not None: + self.status_headers.headers.append((name, value)) + #================================================================= class HeaderRewriter(object): @@ -34,6 +40,8 @@ class HeaderRewriter(object): 'json': ['application/json'], 'xml': ['/xml', '+xml', '.xml', '.rss'], + + 'plain': ['text/plain'], } PROXY_HEADERS = ['content-type', 'content-disposition', 'content-range', @@ -41,12 +49,12 @@ class HeaderRewriter(object): URL_REWRITE_HEADERS = ['location', 'content-location', 'content-base'] - ENCODING_HEADERS = ['content-encoding'] + #ENCODING_HEADERS = ['content-encoding'] REMOVE_HEADERS = ['transfer-encoding', 'content-security-policy', 'strict-transport-security'] - PROXY_NO_REWRITE_HEADERS = ['content-length'] + PROXY_NO_REWRITE_HEADERS = ['content-length', 'content-encoding'] COOKIE_HEADERS = ['set-cookie', 'cookie'] @@ -141,9 +149,10 @@ class HeaderRewriter(object): elif urlrewriter and lowername in self.URL_REWRITE_HEADERS: new_headers.append((name, urlrewriter.rewrite(value))) - elif lowername in self.ENCODING_HEADERS: + elif lowername in self.PROXY_NO_REWRITE_HEADERS: if content_rewritten: removed_header_dict[lowername] = value + add_prefixed_header(name, value) else: add_header(name, value) @@ -151,10 +160,6 @@ class HeaderRewriter(object): removed_header_dict[lowername] = value add_prefixed_header(name, value) - elif (lowername in self.PROXY_NO_REWRITE_HEADERS and - not content_rewritten): - add_header(name, value) - elif (lowername in self.COOKIE_HEADERS and cookie_rewriter): cookie_list = cookie_rewriter.rewrite(value) diff --git a/pywb/rewrite/rewrite_amf.py b/pywb/rewrite/rewrite_amf.py index 07a73470..0cfd217f 100644 --- a/pywb/rewrite/rewrite_amf.py +++ b/pywb/rewrite/rewrite_amf.py @@ -6,13 +6,12 @@ from pywb.rewrite.rewrite_content import RewriteContent # ============================================================================ # Expiermental: not fully tested class RewriteContentAMF(RewriteContent): #pragma: no cover - def handle_custom_rewrite(self, text_type, status_headers, stream, env): - - if status_headers.get_header('Content-Type') == 'application/x-amf': + def handle_custom_rewrite(self, rewritten_headers, stream, mod, env): + if rewritten_headers.status_headers.get_header('Content-Type') == 'application/x-amf': stream = self.rewrite_amf(stream, env) return (super(RewriteContentAMF, self). - handle_custom_rewrite(text_type, status_headers, stream, env)) + handle_custom_rewrite(rewritten_headers, stream, mod, env)) def rewrite_amf(self, stream, env): try: diff --git a/pywb/rewrite/rewrite_content.py b/pywb/rewrite/rewrite_content.py index 677e20ae..6ae183a5 100644 --- a/pywb/rewrite/rewrite_content.py +++ b/pywb/rewrite/rewrite_content.py @@ -118,11 +118,9 @@ class RewriteContent(object): urlkey, cookie_rewriter) - status_headers = rewritten_headers.status_headers - - res = self.handle_custom_rewrite(rewritten_headers.text_type, - status_headers, + res = self.handle_custom_rewrite(rewritten_headers, stream, + wb_url.mod, env) if res: return res @@ -131,6 +129,7 @@ class RewriteContent(object): # ==================================================================== # special case -- need to ungzip the body + status_headers = rewritten_headers.status_headers text_type = rewritten_headers.text_type # see known js/css modifier specified, the context should run @@ -246,11 +245,18 @@ class RewriteContent(object): return (status_headers, gen, True) - def handle_custom_rewrite(self, text_type, status_headers, stream, env): + def handle_custom_rewrite(self, rewritten_headers, stream, mod, env): + text_type = rewritten_headers.text_type + status_headers = rewritten_headers.status_headers + # use rewritten headers, but no further rewriting needed if text_type is None: return (status_headers, self.stream_to_gen(stream), False) + if text_type == 'plain' and not mod in ('js_', 'cs_'): + rewritten_headers.readd_rewrite_removed() + return (status_headers, self.stream_to_gen(stream), False) + @staticmethod def _extract_html_charset(buff, status_headers): charset = None diff --git a/pywb/rewrite/test/test_header_rewriter.py b/pywb/rewrite/test/test_header_rewriter.py index 8e1f1a87..fc2146d7 100644 --- a/pywb/rewrite/test/test_header_rewriter.py +++ b/pywb/rewrite/test/test_header_rewriter.py @@ -6,7 +6,7 @@ HTTP Headers Rewriting # Text with charset >>> _test_headers([('Date', 'Fri, 03 Jan 2014 03:03:21 GMT'), ('Content-Length', '5'), ('Content-Type', 'text/html;charset=UTF-8')]) {'charset': 'utf-8', - 'removed_header_dict': {}, + 'removed_header_dict': {'content-length': '5'}, 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Date', 'Fri, 03 Jan 2014 03:03:21 GMT'), ('X-Archive-Orig-Content-Length', '5'), ('Content-Type', 'text/html;charset=UTF-8')]), @@ -24,9 +24,11 @@ HTTP Headers Rewriting >>> _test_headers([('Content-Length', '199999'), ('Content-Type', 'text/javascript'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')]) {'charset': None, 'removed_header_dict': {'content-encoding': 'gzip', + 'content-length': '199999', 'transfer-encoding': 'chunked'}, 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Content-Length', '199999'), ('Content-Type', 'text/javascript'), + ('X-Archive-Orig-Content-Encoding', 'gzip'), ('X-Archive-Orig-Transfer-Encoding', 'chunked')]), 'text_type': 'js'} diff --git a/tests/test_live_proxy.py b/tests/test_live_proxy.py index 6c48c5de..575c51a8 100644 --- a/tests/test_live_proxy.py +++ b/tests/test_live_proxy.py @@ -125,7 +125,7 @@ class TestProxyLiveRewriter: def test_echo_proxy_start_unbounded_remove_range(self): headers = [('Range', 'bytes=0-')] - resp = self.testapp.get('/rewrite/http://example.com/', headers=headers) + resp = self.testapp.get('/rewrite/http://httpbin.org/range/100', headers=headers) # actual response is with range assert resp.status_int == 206 @@ -138,7 +138,7 @@ class TestProxyLiveRewriter: assert self.requestlog[0] == resp.text assert resp.headers['x-archive-orig-x-proxy'] == 'test' - assert self.requestlog[0].startswith('GET http://example.com/ HTTP/1.1') + assert self.requestlog[0].startswith('GET http://httpbin.org/range/100 HTTP/1.1') assert 'range: ' not in self.requestlog[0] assert len(self.cache) == 0 From c3389987cdce1267771ab3f4409f5395d465c4d3 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 28 Jul 2016 10:06:10 -0400 Subject: [PATCH 11/32] frame timestamp extract: fix timestamp extracting timestamp for non-html resources for use with frame display (#189) --- pywb/static/wb_frame.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pywb/static/wb_frame.js b/pywb/static/wb_frame.js index ffc11576..47d389b8 100644 --- a/pywb/static/wb_frame.js +++ b/pywb/static/wb_frame.js @@ -19,7 +19,7 @@ This file is part of pywb, https://github.com/ikreymer/pywb var LIVE_COOKIE_REGEX = /pywb.timestamp=([\d]{1,14})/; -var TS_REGEX = /\/([\d]{1,14})\//; +var TS_REGEX = /\/([\d]{1,14})(?:\w+_)?\/(?:\w+[:])?\/\//; var curr_state = {}; From 66ca8d8b26e2f8b0ef82372ca75f3ff95139d5b4 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 31 Jul 2016 12:56:00 -0400 Subject: [PATCH 12/32] http block loader: raise exception for 4xx, 5xx responses tests: add tests for limitreader posting, fix charset for frame test --- pywb/utils/loaders.py | 1 + pywb/utils/test/test_loaders.py | 9 +++++++++ tests/test_live_rewriter.py | 2 +- 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/pywb/utils/loaders.py b/pywb/utils/loaders.py index 4c298334..3841134b 100644 --- a/pywb/utils/loaders.py +++ b/pywb/utils/loaders.py @@ -392,6 +392,7 @@ class HttpLoader(BaseLoader): self.session = requests.Session() r = self.session.get(url, headers=headers, stream=True) + r.raise_for_status() return r.raw diff --git a/pywb/utils/test/test_loaders.py b/pywb/utils/test/test_loaders.py index 5d71a711..4b755726 100644 --- a/pywb/utils/test/test_loaders.py +++ b/pywb/utils/test/test_loaders.py @@ -138,6 +138,7 @@ import pytest import six from six import StringIO from io import BytesIO +import requests from pywb.utils.loaders import BlockLoader, HMACCookieMaker, to_file_url from pywb.utils.loaders import LimitReader, extract_client_cookie, extract_post_query @@ -176,6 +177,14 @@ def test_s3_read_1(): assert reader.readline() == b'WARC/1.0\r\n' assert reader.readline() == b'WARC-Type: response\r\n' +def test_limit_post(): + reader = LimitReader(BytesIO(b'abcdefg'), 3) + r = requests.request(method='POST', + url='http://httpbin.org/post', + data=reader, + headers={'Content-Length': '3'}) + + assert '"abc"' in r.text # Error def test_err_no_such_file(): diff --git a/tests/test_live_rewriter.py b/tests/test_live_rewriter.py index fdc94415..0f4129f4 100644 --- a/tests/test_live_rewriter.py +++ b/tests/test_live_rewriter.py @@ -4,7 +4,6 @@ from pywb.framework.wsgi_wrappers import init_app import webtest import pywb.rewrite.rewrite_live - #================================================================= class MockYTDWrapper(object): def extract_info(self, url): @@ -47,6 +46,7 @@ class TestLiveRewriter: def test_live_live_frame(self): resp = self.testapp.get('/live/http://example.com/') assert resp.status_int == 200 + resp.charset = 'utf-8' assert ' + From 892ebaceadc88b5e077c970cb6934f2ea1a66c94 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 19 Aug 2016 23:44:15 -0400 Subject: [PATCH 19/32] cross-frame improvements: #191 - make hashchange functions use postMessage(), support setting top->replay and replay->top - special postMessage() option for sending message from top frame -> replay frame - fix history navigation, mimic top frame history same as replay frame as much as possible - remove iframe_loaded() callback, using postMessage() notifications only - include document title in 'load' message --- pywb/static/wb.js | 13 ++--- pywb/static/wb_frame.js | 115 +++++++++++++++------------------------- pywb/static/wombat.js | 64 +++++++++++++++++++--- 3 files changed, 102 insertions(+), 90 deletions(-) diff --git a/pywb/static/wb.js b/pywb/static/wb.js index 65d59d84..8157b90a 100644 --- a/pywb/static/wb.js +++ b/pywb/static/wb.js @@ -110,7 +110,7 @@ function remove_event(name, func, object) { } } -function notify_top() { +function notify_top(event) { if (!window.__WB_top_frame) { return; } @@ -123,19 +123,12 @@ function notify_top() { return; } - //if (window.__WB_top_frame.update_wb_url) { - // window.__WB_top_frame.update_wb_url(window.WB_wombat_location.href, - // wbinfo.timestamp, - // wbinfo.request_ts, - // wbinfo.is_live); - //} - var message = { "url": window.WB_wombat_location.href, "ts": wbinfo.timestamp, "request_ts": wbinfo.request_ts, "is_live": wbinfo.is_live, - "title": "", + "title": document ? document.title : "", "wb_type": "load", } @@ -152,7 +145,7 @@ this.load = function() { window._wb_js_inited = true; // Non-Framed Replay OR top frame for framed replay! - if (window.wbinfo && (!window.__WB_top_frame || window.__WB_top_frame == window)) { + if (window.wbinfo && !window.__WB_top_frame) { if (wbinfo.is_framed && wbinfo.mod != "bn_") { var hash = window.location.hash; diff --git a/pywb/static/wb_frame.js b/pywb/static/wb_frame.js index 47d389b8..9330dab0 100644 --- a/pywb/static/wb_frame.js +++ b/pywb/static/wb_frame.js @@ -21,10 +21,12 @@ var LIVE_COOKIE_REGEX = /pywb.timestamp=([\d]{1,14})/; var TS_REGEX = /\/([\d]{1,14})(?:\w+_)?\/(?:\w+[:])?\/\//; -var curr_state = {}; +//var curr_state = {}; var IFRAME_ID = "replay_iframe"; +var last_inner_hash = undefined; + function make_url(url, ts, mod) { if (ts || mod) { @@ -39,6 +41,7 @@ function make_url(url, ts, mod) } function push_state(state) { + /* var frame = document.getElementById(IFRAME_ID).contentWindow; if (frame.WB_wombat_location) { var curr_href = frame.WB_wombat_location.href; @@ -48,6 +51,7 @@ function push_state(state) { return; } } + */ state.outer_url = make_url(state.url, state.request_ts, wbinfo.frame_mod); state.inner_url = make_url(state.url, state.request_ts, wbinfo.replay_mod); @@ -67,8 +71,8 @@ function push_state(state) { function pop_state(state) { set_state(state); - var frame = document.getElementById(IFRAME_ID).contentWindow; - frame.src = state.inner_url; + //var frame = document.getElementById(IFRAME_ID); + //frame.src = state.inner_url; } function extract_ts(url) @@ -107,7 +111,7 @@ function set_state(state) { } } - curr_state = state; + //curr_state = state; } window.onpopstate = function(event) { @@ -127,43 +131,6 @@ function extract_ts_cookie(value) { } } -function iframe_loaded(event) { - var url; - var ts; - var request_ts; - var capture_str; - var is_live = false; - var iframe = document.getElementById(IFRAME_ID).contentWindow; - - if (iframe.WB_wombat_location) { - url = iframe.WB_wombat_location.href; - } else { - url = extract_replay_url(iframe.location.href); - } - - if (iframe.wbinfo) { - ts = iframe.wbinfo.timestamp; - request_ts = iframe.wbinfo.request_ts; - is_live = iframe.wbinfo.is_live; - } else { - ts = extract_ts_cookie(iframe.document.cookie); - if (ts) { - is_live = true; - } else { - ts = extract_ts(iframe.location.href); - } - request_ts = ts; - } - - var state = {} - state["url"] = url; - state["ts"] = ts; - state["request_ts"] = request_ts; - state["is_live"] = is_live - - update_wb_url(state); -} - function init_pm() { var frame = document.getElementById(IFRAME_ID).contentWindow; @@ -203,59 +170,56 @@ function handle_message(state) { window.history.back(); } else if (type == "forward") { window.history.forward(); + } else if (type == "hashchange") { + inner_hash_changed(state); } } function update_wb_url(state) { - if (curr_state.url == state.url && curr_state.ts == state.ts) { - return; - } + //if (curr_state && curr_state.url == state.url && curr_state.ts == state.ts) { + // return; + //} state['capture_str'] = _wb_js.ts_to_date(state.ts, true); push_state(state); } -// Load Banner -if (_wb_js) { - _wb_js.load(); +function inner_hash_changed(state) { + if (window.location.hash != state.hash) { + window.location.hash = state.hash; + } + last_inner_hash = state.hash; +} + +function outer_hash_changed(event) { + if (window.location.hash == last_inner_hash) { + return; + } + + var frame = document.getElementById(IFRAME_ID).contentWindow; + + var message = {"wb_type": "outer_hashchange", "hash": window.location.hash} + + frame.postMessage(message, "*", undefined, true); } function init_hash_connect() { - var frame = document.getElementById(IFRAME_ID).contentWindow; + var frame = document.getElementById(IFRAME_ID); if (window.location.hash) { var curr_url = wbinfo.capture_url + window.location.hash; - - frame.location.href = make_url(curr_url, wbinfo.request_ts, wbinfo.replay_mod); + + frame.src = make_url(curr_url, wbinfo.request_ts, wbinfo.replay_mod); + + last_inner_hash = window.location.hash; + //frame.location.href = make_url(curr_url, wbinfo.request_ts, wbinfo.replay_mod); //frame.location.hash = window.location.hash; } - - function outer_hash_changed() { - var the_frame = document.getElementById(IFRAME_ID).contentWindow; - - if (window.location.hash == the_frame.location.hash) { - return; - } - - the_frame.location.hash = window.location.hash; - //the_frame.location.href = make_url(curr_url, curr_state.request_ts, wbinfo.replay_mod); - } - - function inner_hash_changed() { - var the_frame = document.getElementById(IFRAME_ID).contentWindow; - - if (window.location.hash == the_frame.location.hash) { - return; - } - - window.location.hash = the_frame.location.hash; - } if ("onhashchange" in window) { window.addEventListener("hashchange", outer_hash_changed, false); - frame.addEventListener("hashchange", inner_hash_changed, false); } // Init Post Message connect @@ -264,3 +228,10 @@ function init_hash_connect() { document.addEventListener("DOMContentLoaded", init_hash_connect); +// Load Banner +if (_wb_js) { + _wb_js.load(); +} + + + diff --git a/pywb/static/wombat.js b/pywb/static/wombat.js index de0d16f2..340705c6 100644 --- a/pywb/static/wombat.js +++ b/pywb/static/wombat.js @@ -18,7 +18,7 @@ This file is part of pywb, https://github.com/ikreymer/pywb */ //============================================ -// Wombat JS-Rewriting Library v2.12 +// Wombat JS-Rewriting Library v2.14 //============================================ @@ -734,7 +734,7 @@ var wombat_internal = function($wbwindow) { "wb_type": func_name, } - $wbwindow.__WB_top_frame.postMessage(message, "*"); + $wbwindow.__WB_top_frame.postMessage(message, wb_info.top_host); } } @@ -774,7 +774,7 @@ var wombat_internal = function($wbwindow) { message["param"] = arguments[0]; } - $wbwindow.__WB_top_frame.postMessage(message, "*"); + $wbwindow.__WB_top_frame.postMessage(message, wb_info.top_host); } $wbwindow.history[func_name] = rewritten_func; @@ -1687,6 +1687,47 @@ var wombat_internal = function($wbwindow) { } } + //============================================ + function init_hash_change() + { + if (!$wbwindow.__WB_top_frame) { + return; + } + + function receive_hash_change(event) + { + if (!event.data || event.source != $wbwindow.__WB_top_frame) { + return; + } + + var message = event.data; + + if (!message.wb_type) { + return; + } + + if (message.wb_type == "outer_hashchange") { + if ($wbwindow.location.hash != message.hash) { + $wbwindow.location.hash = message.hash; + } + } + } + + function send_hash_change() { + var message = {"wb_type": "hashchange", + "hash": $wbwindow.location.hash + } + + if ($wbwindow.__WB_top_frame) { + $wbwindow.__WB_top_frame.postMessage(message, wb_info.top_host); + } + } + + $wbwindow.addEventListener("message", receive_hash_change); + + $wbwindow.addEventListener("hashchange", send_hash_change); + } + //============================================ function init_postmessage_override($wbwindow) { @@ -1698,7 +1739,7 @@ var wombat_internal = function($wbwindow) { $wbwindow.__orig_postMessage = orig; - var postmessage_rewritten = function(message, targetOrigin, transfer) { + var postmessage_rewritten = function(message, targetOrigin, transfer, from_top) { var from = undefined; var src_id = undefined; @@ -1738,7 +1779,9 @@ var wombat_internal = function($wbwindow) { var new_message = {"from": from, "to_host": to, "src_id": src_id, - "message": message}; + "message": message, + "from_top": from_top, + } if (targetOrigin != "*") { targetOrigin = this.location.origin; @@ -1776,7 +1819,9 @@ var wombat_internal = function($wbwindow) { var source = event.source; - if (event.data.src_id && win.__WB_win_id && win.__WB_win_id[event.data.src_id]) { + if (event.data.from_top) { + source = win.__WB_top_frame; + } else if (event.data.src_id && win.__WB_win_id && win.__WB_win_id[event.data.src_id]) { source = win.__WB_win_id[event.data.src_id]; } @@ -1892,7 +1937,7 @@ var wombat_internal = function($wbwindow) { } // norify of cookie setting to allow server-side tracking - $wbwindow.__WB_top_frame.postMessage(message, "*"); + $wbwindow.__WB_top_frame.postMessage(message, wb_info.top_host); // if no subdomain, eg. "localhost", just remove domain altogether if ($wbwindow.location.hostname.indexOf(".") >= 0 && !IP_RX.test($wbwindow.location.hostname)) { @@ -2173,6 +2218,8 @@ var wombat_internal = function($wbwindow) { wb_opts = wbinfo.wombat_opts; wb_replay_prefix = wbinfo.prefix; + wb_info.top_host = wb_info.top_host || "*"; + init_top_frame($wbwindow); init_wombat_top($wbwindow); @@ -2234,6 +2281,8 @@ var wombat_internal = function($wbwindow) { init_postmessage_override($wbwindow); } + init_hash_change(); + // write init_write_override(); @@ -2293,7 +2342,6 @@ var wombat_internal = function($wbwindow) { // Date init_date_override(wbinfo.wombat_sec); - // registerProtocolHandler override init_registerPH_override(); From 099a81b786b7758fd95d7f2367d30f85aa9b5efd Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 20 Aug 2016 00:03:21 -0400 Subject: [PATCH 20/32] wb_frame: add support for optional 'wbinfo.outer_prefix' which if set, is used for making the top frame url (#191) --- pywb/static/wb_frame.js | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/pywb/static/wb_frame.js b/pywb/static/wb_frame.js index 9330dab0..2f32f43b 100644 --- a/pywb/static/wb_frame.js +++ b/pywb/static/wb_frame.js @@ -27,16 +27,18 @@ var IFRAME_ID = "replay_iframe"; var last_inner_hash = undefined; -function make_url(url, ts, mod) +function make_url(url, ts, mod, prefix) { if (ts || mod) { mod += "/"; } + prefix = prefix || wbinfo.prefix; + if (ts) { - return wbinfo.prefix + ts + mod + url; + return prefix + ts + mod + url; } else { - return wbinfo.prefix + mod + url; + return prefix + mod + url; } } @@ -53,10 +55,11 @@ function push_state(state) { } */ - state.outer_url = make_url(state.url, state.request_ts, wbinfo.frame_mod); + state.outer_url = make_url(state.url, state.request_ts, wbinfo.frame_mod, wbinfo.outer_prefix); state.inner_url = make_url(state.url, state.request_ts, wbinfo.replay_mod); - var canon_url = make_url(state.url, state.request_ts, ""); + var canon_url = make_url(state.url, state.request_ts, "", wbinfo.outer_prefix); + if (window.location.href != canon_url) { if (state.wb_type != "pushState") { window.history.replaceState(state, "", canon_url); From 70a25b6d0fabbad665f2160984b7e26ec21231b7 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 20 Aug 2016 13:03:17 -0400 Subject: [PATCH 21/32] client rewrite: ensure window.open() windows have wombat inited. if they are set to about:blank, use parser from opener to ensure proper relative url resolving --- pywb/static/wombat.js | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/pywb/static/wombat.js b/pywb/static/wombat.js index 340705c6..ccf3004f 100644 --- a/pywb/static/wombat.js +++ b/pywb/static/wombat.js @@ -402,7 +402,17 @@ var wombat_internal = function($wbwindow) { function make_parser(href) { href = extract_orig(href); - var p = $wbwindow.document.createElement("a", true); + var baseWin; + + // special case: for newly opened blank windows, use the opener + // to create parser to have the proper baseURI + if ($wbwindow.location.href == "about:blank" && $wbwindow.opener) { + baseWin = $wbwindow.opener; + } else { + baseWin = $wbwindow; + } + + var p = baseWin.document.createElement("a", true); p.href = href; return p; } @@ -1888,7 +1898,9 @@ var wombat_internal = function($wbwindow) { var open_rewritten = function(strUrl, strWindowName, strWindowFeatures) { strUrl = rewrite_url(strUrl, false, ""); - return orig.call(this, strUrl, strWindowName, strWindowFeatures); + var res = orig.call(this, strUrl, strWindowName, strWindowFeatures); + init_new_window_wombat(res, strUrl); + return res; } $wbwindow.open = open_rewritten; @@ -2080,7 +2092,11 @@ var wombat_internal = function($wbwindow) { //var src = iframe.src; var src = wb_getAttribute.call(iframe, "src"); - + + init_new_window_wombat(win, src); + } + + function init_new_window_wombat(win, src) { if (!src || src == "" || src == "about:blank" || src.indexOf("javascript:") >= 0) { win._WBWombat = wombat_internal(win); win._wb_wombat = new win._WBWombat(wb_info); From 895a01933ca55f46a2294e5fcc68ac82a7ff0b21 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 2 Sep 2016 12:04:30 -0700 Subject: [PATCH 22/32] wb: allow multiple readystateevent changes, in case data changes (eg. title is available later) --- pywb/static/wb.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pywb/static/wb.js b/pywb/static/wb.js index 8157b90a..6b5693c1 100644 --- a/pywb/static/wb.js +++ b/pywb/static/wb.js @@ -134,7 +134,7 @@ function notify_top(event) { window.__WB_top_frame.postMessage(message, "*"); - remove_event("readystatechange", notify_top, document); + //remove_event("readystatechange", notify_top, document); } this.load = function() { From 1fe201c5288a8361223f465d9c88d23a84e7bdde Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 8 Sep 2016 10:06:47 -0700 Subject: [PATCH 23/32] rewrite: html: rewrite svg tag client: update textContent after rewrite_style() in rewrite_elem() --- pywb/rewrite/html_rewriter.py | 1 + pywb/static/wombat.js | 1 + 2 files changed, 2 insertions(+) diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py index 7fc105d9..63e19ac1 100644 --- a/pywb/rewrite/html_rewriter.py +++ b/pywb/rewrite/html_rewriter.py @@ -40,6 +40,7 @@ class HTMLRewriterMixin(object): 'embed': {'src': 'oe_'}, 'head': {'': defmod}, # for head rewriting 'iframe': {'src': 'if_'}, + 'image': {'src': 'im_', 'xlink:href': 'im_'}, 'img': {'src': 'im_', 'srcset': 'im_'}, 'ins': {'cite': defmod}, diff --git a/pywb/static/wombat.js b/pywb/static/wombat.js index ccf3004f..71ec3227 100644 --- a/pywb/static/wombat.js +++ b/pywb/static/wombat.js @@ -1218,6 +1218,7 @@ var wombat_internal = function($wbwindow) { if (elem.tagName == "STYLE") { var new_content = rewrite_style(elem.textContent); if (elem.textContent != new_content) { + elem.textContent = new_content; changed = true; } } else if (elem.tagName == "OBJECT") { From 6452c72b4f8732e31996258a5d6cfc72ad5ef243 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 8 Sep 2016 10:31:07 -0700 Subject: [PATCH 24/32] bump versions --- README.rst | 2 +- pywb/__init__.py | 2 +- pywb/static/wombat.js | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index 4e5fec37..60f5237d 100644 --- a/README.rst +++ b/README.rst @@ -1,4 +1,4 @@ -PyWb 0.31.5 +PyWb 0.32.0 =========== .. image:: https://travis-ci.org/ikreymer/pywb.svg?branch=master diff --git a/pywb/__init__.py b/pywb/__init__.py index 7e67dee2..7cdc246c 100644 --- a/pywb/__init__.py +++ b/pywb/__init__.py @@ -1,4 +1,4 @@ -__version__ = '0.31.5' +__version__ = '0.32.0' DEFAULT_CONFIG = 'pywb/default_config.yaml' diff --git a/pywb/static/wombat.js b/pywb/static/wombat.js index 71ec3227..882c996f 100644 --- a/pywb/static/wombat.js +++ b/pywb/static/wombat.js @@ -18,7 +18,7 @@ This file is part of pywb, https://github.com/ikreymer/pywb */ //============================================ -// Wombat JS-Rewriting Library v2.14 +// Wombat JS-Rewriting Library v2.15 //============================================ From f47ae0bb7edd96c0dd39ead3eb6148771007177c Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 8 Sep 2016 18:34:47 -0700 Subject: [PATCH 25/32] rewrite: for rewriting on* attr, add 'window.' before WB_wombat_ as window may not be in scope (if no '.' before WB_wombat) --- pywb/rewrite/html_rewriter.py | 16 +++++++++++----- pywb/rewrite/test/test_html_rewriter.py | 5 ++++- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py index 63e19ac1..21989a29 100644 --- a/pywb/rewrite/html_rewriter.py +++ b/pywb/rewrite/html_rewriter.py @@ -119,6 +119,8 @@ class HTMLRewriterMixin(object): META_REFRESH_REGEX = re.compile('^[\\d.]+\\s*;\\s*url\\s*=\\s*(.+?)\\s*$', re.IGNORECASE | re.MULTILINE) + ADD_WINDOW = re.compile('(?/**/" >>> parse('
') -
+
+ +>>> parse('
') +
>>> parse('') From 1a37d789ed871bf8583b28358c9c813b17d7399b Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 8 Sep 2016 18:59:52 -0700 Subject: [PATCH 26/32] cdx-api: when using cdx server api, return no captures 404 error in json format if output=json, plain text otherwise instead of as html #193 --- pywb/webapp/cdx_api_handler.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/pywb/webapp/cdx_api_handler.py b/pywb/webapp/cdx_api_handler.py index 1835647a..1ebd0018 100644 --- a/pywb/webapp/cdx_api_handler.py +++ b/pywb/webapp/cdx_api_handler.py @@ -1,11 +1,13 @@ from pywb.cdx.cdxserver import create_cdx_server +from pywb.utils.wbexception import NotFoundException from pywb.framework.basehandlers import BaseHandler from pywb.framework.wbrequestresponse import WbResponse from pywb.webapp.query_handler import QueryHandler from six.moves.urllib.parse import parse_qs +import json import six @@ -21,7 +23,18 @@ class CDXAPIHandler(BaseHandler): def __call__(self, wbrequest): params = self.extract_params_from_wsgi_env(wbrequest.env) - cdx_iter = self.index_handler.load_cdx(wbrequest, params) + try: + cdx_iter = self.index_handler.load_cdx(wbrequest, params) + except NotFoundException: + msg = 'No Captures found for: ' + params.get('url') + if params.get('output') == 'json': + msg = json.dumps(dict(error=msg)) + content_type='application/json' + else: + content_type='text/plain' + + return WbResponse.text_response(msg, content_type=content_type, + status='404 Not Found') return WbResponse.text_stream(cdx_iter, content_type='text/plain') From 70fdaae2b392ee7a992ebb6b065ed489d1006417 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 12 Sep 2016 20:07:14 -0700 Subject: [PATCH 27/32] rules: rewrite location string for periscope js --- pywb/rules.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pywb/rules.yaml b/pywb/rules.yaml index 7b72d976..c819d446 100644 --- a/pywb/rules.yaml +++ b/pywb/rules.yaml @@ -135,6 +135,15 @@ rules: fuzzy_lookup: 'com,google,plus\)/_/.*?.*(f.sid=[^&]+)' + # periscope + #================================================================= + + - url_prefix: 'tv,periscope,assets)/js/' + + rewrite: + js_regexs: + - match: '"location"' + replace: '"WB_wombat_location"' # vimeo rules #================================================================= From 1fb6e9b5fa006c4a795bff13b78f08bba411e7a5 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 14 Sep 2016 13:04:46 -0700 Subject: [PATCH 28/32] rewrite: url rewriter: don't rewrite relative urls, only those that start with scheme, / or contain ../ #195 update tests to reflect this new behavior --- pywb/rewrite/test/test_cookie_rewriter.py | 12 +++---- pywb/rewrite/test/test_header_rewriter.py | 2 +- pywb/rewrite/test/test_html_rewriter.py | 32 +++++++++---------- pywb/rewrite/test/test_regex_rewriters.py | 14 ++++---- pywb/rewrite/test/test_rewrite_live.py | 4 +-- pywb/rewrite/test/test_url_rewriter.py | 16 +++++----- pywb/rewrite/url_rewriter.py | 8 +++++ pywb/static/wombat.js | 2 +- sample_archive/text_content/sample.html | 2 +- .../text_content/sample_no_head.html | 2 +- 10 files changed, 51 insertions(+), 43 deletions(-) diff --git a/pywb/rewrite/test/test_cookie_rewriter.py b/pywb/rewrite/test/test_cookie_rewriter.py index fa799787..c6a7780d 100644 --- a/pywb/rewrite/test/test_cookie_rewriter.py +++ b/pywb/rewrite/test/test_cookie_rewriter.py @@ -22,10 +22,10 @@ True [('Set-Cookie', 'some=value; Path=/pywb/')] >>> rewrite_cookie('abc=def; Path=file.html; Expires=Wed, 13 Jan 2021 22:23:01 GMT', urlrewriter, 'coll') -[('Set-Cookie', 'abc=def; Path=/pywb/20131226101010/http://example.com/some/path/file.html')] +[('Set-Cookie', 'abc=def; Path=file.html')] # keep Max-Age ->>> rewrite_cookie('abc=def; Path=file.html; Max-Age=1500', urlrewriter2, 'coll') +>>> rewrite_cookie('abc=def; Path=/file.html; Max-Age=1500', urlrewriter2, 'coll') [('Set-Cookie', 'abc=def; Max-Age=1500; Path=/preview/em_/http://example.com/file.html')] # Cookie with invalid chars, not parsed @@ -92,14 +92,14 @@ def rewrite_cookie(cookie_str, rewriter=urlrewriter, scope='default'): @pytest.mark.skipif(sys.version_info < (2,7), reason='Unsupported') def test_with_expires(): # keep expires - res = rewrite_cookie('abc=def; Path=file.html; Expires=Wed, 13 Jan 2021 22:23:01 GMT', urlrewriter2, 'coll') + res = rewrite_cookie('abc=def; Path=/file.html; Expires=Wed, 13 Jan 2021 22:23:01 GMT', urlrewriter2, 'coll') assert len(res) == 1 assert res[0][1].lower() == 'abc=def; expires=wed, 13 jan 2021 22:23:01 gmt; path=/preview/em_/http://example.com/file.html' @pytest.mark.skipif(sys.version_info < (2,7), reason='Unsupported') def test_with_expires_utc_replace(): # keep expires, UTC->GMT - res = rewrite_cookie('abc=def; Path=file.html; Expires=Wed, 13 Jan 2021 22:23:01 UTC', urlrewriter2, 'coll') + res = rewrite_cookie('abc=def; Path=/file.html; Expires=Wed, 13 Jan 2021 22:23:01 UTC', urlrewriter2, 'coll') assert len(res) == 1 assert res[0][1].lower() == 'abc=def; expires=wed, 13 jan 2021 22:23:01 gmt; path=/preview/em_/http://example.com/file.html' @@ -113,14 +113,14 @@ def test_http_secure_flag(): @pytest.mark.skipif(sys.version_info < (2,7), reason='Unsupported') def test_secure_flag_remove(): # Secure Remove - res = rewrite_cookie('abc=def; Path=file.html; HttpOnly; Secure', urlrewriter2, 'coll') + res = rewrite_cookie('abc=def; Path=/file.html; HttpOnly; Secure', urlrewriter2, 'coll') assert len(res) == 1 assert res[0][1].lower() == 'abc=def; httponly; path=/preview/em_/http://example.com/file.html' @pytest.mark.skipif(sys.version_info < (2,7), reason='Unsupported') def test_secure_flag_keep(): # Secure Keep - res = rewrite_cookie('abc=def; Path=file.html; HttpOnly; Secure', urlrewriter3, 'coll') + res = rewrite_cookie('abc=def; Path=/file.html; HttpOnly; Secure', urlrewriter3, 'coll') assert res[0][1].lower() == 'abc=def; httponly; path=/preview/em_/http://example.com/file.html; secure' diff --git a/pywb/rewrite/test/test_header_rewriter.py b/pywb/rewrite/test/test_header_rewriter.py index fc2146d7..e58c6d84 100644 --- a/pywb/rewrite/test/test_header_rewriter.py +++ b/pywb/rewrite/test/test_header_rewriter.py @@ -78,7 +78,7 @@ def _test_head_data(headers, status='200 OK', rewriter=urlrewriter): def test_cookie_headers(): # cookie, host/origin rewriting res = _test_head_data([('Connection', 'close'), - ('Set-Cookie', 'foo=bar; Path=/; abc=def; Path=somefile.html'), + ('Set-Cookie', 'foo=bar; Path=/; abc=def; Path=/somefile.html'), ('Host', 'example.com'), ('Origin', 'https://example.com')]) diff --git a/pywb/rewrite/test/test_html_rewriter.py b/pywb/rewrite/test/test_html_rewriter.py index 04c42f73..d8087555 100644 --- a/pywb/rewrite/test/test_html_rewriter.py +++ b/pywb/rewrite/test/test_html_rewriter.py @@ -8,7 +8,7 @@ r""" #================================================================= >>> parse('
Text') -Text +Text >>> parse('
')
@@ -35,8 +35,8 @@ r""" >>> parse('', urlrewriter=full_path_urlrewriter) ->>> parse('') - +>>> parse('') + # ensure trailing slash added >>> parse('') @@ -47,7 +47,7 @@ r""" >>> parse('', urlrewriter=no_base_canon_rewriter) - + # Empty url >>> parse('') @@ -58,7 +58,7 @@ r""" # href on other tags >>> parse('
Text
') -
Text
+
Text
# HTML Entities >>> parse('›   > ?') @@ -148,10 +148,10 @@ r""" >>> parse('
') -
+
->>> parse('
') -
+>>> parse('
') +
>>> parse('') @@ -169,19 +169,19 @@ r""" # Style ->>> parse('') - +>>> parse('') + # Unterminated style tag, handle and auto-terminate >>> parse(' + # Head Insertion ->>> parse('Test', head_insert = '') -Test +>>> parse('Test', head_insert = '') +Test >>> parse('', head_insert = '') - + >>> parse('Test', head_insert = '') Test @@ -189,7 +189,7 @@ r""" >>> parse('
SomeTest
', head_insert = '/* Insert */') /* Insert */
SomeTest
->>> parse('
SomeTest
', head_insert = '') +>>> parse('
SomeTest
', head_insert = '')
SomeTest
>>> parse('Some Text without any tags ', head_insert = '') @@ -236,7 +236,7 @@ r""" # remove extra spaces >>> parse('
Text') -Text +Text >>> parse('Text') Text diff --git a/pywb/rewrite/test/test_regex_rewriters.py b/pywb/rewrite/test/test_regex_rewriters.py index 2f508cd8..2762b9d0 100644 --- a/pywb/rewrite/test/test_regex_rewriters.py +++ b/pywb/rewrite/test/test_regex_rewriters.py @@ -151,7 +151,7 @@ r""" 'background: url(" /web/20131010/http://domain.com/path.html x ")' >>> _test_css("background: url(file.jpeg)") -'background: url(/web/20131010/http://example.com/file.jpeg)' +'background: url(file.jpeg)' >>> _test_css("background:#abc url('/static/styles/../images/layout/logo.png')") "background:#abc url('/web/20131010/http://example.com/static/images/layout/logo.png')" @@ -163,18 +163,18 @@ r""" "background: url('')" >>> _test_css("background: url (\"weirdpath\')") -'background: url ("/web/20131010/http://example.com/weirdpath\')' +'background: url ("weirdpath\')' ->>> _test_css("@import url ('path.css')") +>>> _test_css("@import url ('/path.css')") "@import url ('/web/20131010/http://example.com/path.css')" >>> _test_css("@import url('path.css')") -"@import url('/web/20131010/http://example.com/path.css')" +"@import url('path.css')" >>> _test_css("@import ( 'path.css')") -"@import ( '/web/20131010/http://example.com/path.css')" +"@import ( 'path.css')" ->>> _test_css("@import \"path.css\"") +>>> _test_css("@import \"/path.css\"") '@import "/web/20131010/http://example.com/path.css"' >>> _test_css("@import ('../path.css\"") @@ -184,7 +184,7 @@ r""" '@import (\'/web/20131010/http://example.com/url.css"' >>> _test_css("@import (\"url.css\")") -'@import ("/web/20131010/http://example.com/url.css")' +'@import ("url.css")' >>> _test_css("@import url(/url.css)\n@import url(/anotherurl.css)\n @import url(/and_a_third.css)") '@import url(/web/20131010/http://example.com/url.css)\n@import url(/web/20131010/http://example.com/anotherurl.css)\n @import url(/web/20131010/http://example.com/and_a_third.css)' diff --git a/pywb/rewrite/test/test_rewrite_live.py b/pywb/rewrite/test/test_rewrite_live.py index d3ffc3d8..62fa3bf9 100644 --- a/pywb/rewrite/test/test_rewrite_live.py +++ b/pywb/rewrite/test/test_rewrite_live.py @@ -123,7 +123,7 @@ def test_local_no_head_banner_only(): assert 'window.location = "/other.html"' in buff # link NOT rewritten - assert '"another.html"' in buff + assert '"/some/path/another.html"' in buff def test_local_banner_only_no_rewrite(): status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html', @@ -138,7 +138,7 @@ def test_local_banner_only_no_rewrite(): assert 'window.location = "http:\/\/example.com/dynamic_page.html"' in buff, buff # link NOT rewritten - assert '"another.html"' in buff + assert '"/some/path/another.html"' in buff def test_local_2_link_only_rewrite(): status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html', diff --git a/pywb/rewrite/test/test_url_rewriter.py b/pywb/rewrite/test/test_url_rewriter.py index ac23051a..da243cd7 100644 --- a/pywb/rewrite/test/test_url_rewriter.py +++ b/pywb/rewrite/test/test_url_rewriter.py @@ -21,19 +21,19 @@ # UrlRewriter tests >>> do_rewrite('other.html', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/') -'/web/20131010/http://example.com/path/other.html' +'other.html' ->>> do_rewrite('file.js', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/', 'js_') +>>> do_rewrite('/path/file.js', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/', 'js_') '/web/20131010js_/http://example.com/path/file.js' ->>> do_rewrite('file.js', '20131010/http://example.com/', '/coll/') +>>> do_rewrite('/file.js', '20131010/http://example.com/', '/coll/') '/coll/20131010/http://example.com/file.js' ->>> do_rewrite('file.js', '20131010/http://example.com', '/coll/', 'js_') +>>> do_rewrite('/file.js', '20131010/http://example.com', '/coll/', 'js_') '/coll/20131010js_/http://example.com/file.js' >>> do_rewrite('file.js', '20131010/http://example.com', '/coll/', '') -'/coll/20131010/http://example.com/file.js' +'file.js' >>> do_rewrite('/other.html', '20130907*/http://example.com/path/page.html', 'http://localhost:8080/coll/') '/coll/20130907*/http://example.com/other.html' @@ -41,8 +41,8 @@ >>> do_rewrite('/other.html', '20130907*/http://example.com/path/page.html', '/coll/') '/coll/20130907*/http://example.com/other.html' ->>> do_rewrite('./other.html', '20130907*/http://example.com/path/page.html', '/coll/') -'/coll/20130907*/http://example.com/path/other.html' +>>> do_rewrite('other.html', '20130907*/http://example.com/path/page.html', '/coll/') +'other.html' >>> do_rewrite('../other.html', '20131112im_/http://example.com/path/page.html', '/coll/') '/coll/20131112im_/http://example.com/other.html' @@ -87,7 +87,7 @@ '2020/http://example.com/other.html' >>> do_rewrite('', '20131010010203/http://example.com/file.html', '/web/') -'/web/20131010010203/http://example.com/file.html' +'' >>> do_rewrite('#anchor', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/') '#anchor' diff --git a/pywb/rewrite/url_rewriter.py b/pywb/rewrite/url_rewriter.py index 4774bc03..d6fda47f 100644 --- a/pywb/rewrite/url_rewriter.py +++ b/pywb/rewrite/url_rewriter.py @@ -19,6 +19,9 @@ class UrlRewriter(object): REL_SCHEME = ('//', r'\/\/', r'\\/\\/') + PARENT_PATH = '../' + REL_PATH = '/' + def __init__(self, wburl, prefix='', full_prefix=None, rel_prefix=None, root_path=None, cookie_scope=None, rewrite_opts=None): self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl) @@ -60,6 +63,11 @@ class UrlRewriter(object): if url.startswith(self.REL_SCHEME): is_abs = True scheme_rel = True + elif (not is_abs and + not url.startswith(self.REL_PATH) and + self.PARENT_PATH not in url): + return url + # if prefix starts with a scheme #if self.prefix_scheme: # url = self.prefix_scheme + ':' + url diff --git a/pywb/static/wombat.js b/pywb/static/wombat.js index 882c996f..b4730072 100644 --- a/pywb/static/wombat.js +++ b/pywb/static/wombat.js @@ -109,7 +109,7 @@ var wombat_internal = function($wbwindow) { } //============================================ - var rewrite_url = rewrite_url_; + var rewrite_url = rewrite_url_debug; function rewrite_url_debug(url, use_rel, mod) { var rewritten = rewrite_url_(url, use_rel, mod); diff --git a/sample_archive/text_content/sample.html b/sample_archive/text_content/sample.html index f2ed6842..fc2d66d0 100644 --- a/sample_archive/text_content/sample.html +++ b/sample_archive/text_content/sample.html @@ -10,5 +10,5 @@ if (some_val) { } Test Content -Some Link +Some Link diff --git a/sample_archive/text_content/sample_no_head.html b/sample_archive/text_content/sample_no_head.html index ed4bc4f3..4242336f 100644 --- a/sample_archive/text_content/sample_no_head.html +++ b/sample_archive/text_content/sample_no_head.html @@ -5,4 +5,4 @@ if (some_val) { } Test Content -Some Link +Some Link From 5fede0fea31011b011b8fec44f6f1ea1ceceab6a Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 14 Sep 2016 13:39:10 -0700 Subject: [PATCH 29/32] wombat: turn off debugging (accidentally committed) --- pywb/static/wombat.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pywb/static/wombat.js b/pywb/static/wombat.js index b4730072..882c996f 100644 --- a/pywb/static/wombat.js +++ b/pywb/static/wombat.js @@ -109,7 +109,7 @@ var wombat_internal = function($wbwindow) { } //============================================ - var rewrite_url = rewrite_url_debug; + var rewrite_url = rewrite_url_; function rewrite_url_debug(url, use_rel, mod) { var rewritten = rewrite_url_(url, use_rel, mod); From cc65ce914def8ec3da9dd24c537603d4cc5d7dd7 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 14 Sep 2016 14:13:59 -0700 Subject: [PATCH 30/32] wombat improvements (2.16): - rewrite_elem() also rewrite 'poster' - extract_orig() don't add http:// if nothing extracted - new override: navigator.sendBeacon() if available --- pywb/static/wombat.js | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/pywb/static/wombat.js b/pywb/static/wombat.js index 882c996f..a0926f0d 100644 --- a/pywb/static/wombat.js +++ b/pywb/static/wombat.js @@ -18,7 +18,7 @@ This file is part of pywb, https://github.com/ikreymer/pywb */ //============================================ -// Wombat JS-Rewriting Library v2.15 +// Wombat JS-Rewriting Library v2.16 //============================================ @@ -314,6 +314,8 @@ var wombat_internal = function($wbwindow) { return ""; } + var orig_href = href; + // proxy mode: no extraction needed if (!wb_replay_prefix) { return href; @@ -348,7 +350,7 @@ var wombat_internal = function($wbwindow) { href = href.substr(4); } - if (!starts_with(href, VALID_PREFIXES)) { + if (href != orig_href && !starts_with(href, VALID_PREFIXES)) { href = HTTP_PREFIX + href; } } @@ -1231,6 +1233,7 @@ var wombat_internal = function($wbwindow) { changed = rewrite_attr(elem, "src"); changed = rewrite_attr(elem, "href") || changed; changed = rewrite_attr(elem, "style") || changed; + changed = rewrite_attr(elem, "poster") || changed; } if (elem.getAttribute) { @@ -2209,6 +2212,20 @@ var wombat_internal = function($wbwindow) { } } + //============================================ + function init_beacon_override() + { + if (!$wbwindow.navigator.sendBeacon) { + return; + } + + var orig_sendBeacon = $wbwindow.navigator.sendBeacon; + + $wbwindow.navigator.sendBeacon = function(url, data) { + return orig_sendBeacon.call(this, rewrite_url(url), data); + } + } + //============================================ function get_final_url(prefix, mod, url) { if (mod == undefined) { @@ -2362,6 +2379,9 @@ var wombat_internal = function($wbwindow) { // registerProtocolHandler override init_registerPH_override(); + //sendBeacon override + init_beacon_override(); + // expose functions this.extract_orig = extract_orig; this.rewrite_url = rewrite_url; From 0a76a56b913928ed1b386e8ac27ff5cd70944f5f Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 14 Sep 2016 15:44:20 -0700 Subject: [PATCH 31/32] wombat: edge case: correctly handle