diff --git a/pywb/apps/rewriterapp.py b/pywb/apps/rewriterapp.py index 68690b1a..d6d303af 100644 --- a/pywb/apps/rewriterapp.py +++ b/pywb/apps/rewriterapp.py @@ -14,7 +14,7 @@ from pywb.utils.loaders import extract_client_cookie from pywb.utils.io import BUFF_SIZE from pywb.utils.memento import MementoUtils -from warcio.timeutils import http_date_to_timestamp, timestamp_to_http_date +from warcio.timeutils import http_date_to_timestamp, timestamp_to_http_date, iso_date_to_timestamp from warcio.bufferedreaders import BufferedReader from warcio.recordloader import ArcWarcRecordLoader @@ -24,7 +24,7 @@ from pywb.apps.wbrequestresponse import WbResponse from pywb.rewrite.rewriteinputreq import RewriteInputRequest from pywb.rewrite.templateview import JinjaEnv, HeadInsertView, TopFrameView, BaseInsertView - +import re from io import BytesIO from copy import copy @@ -144,6 +144,7 @@ class RewriterApp(object): full_prefix = host_prefix + rel_prefix is_proxy = ('wsgiprox.proxy_host' in environ) + is_ajax = self.is_ajax(environ) response = self.handle_custom_response(environ, wb_url, full_prefix, host_prefix, @@ -223,6 +224,13 @@ class RewriterApp(object): res = self.cookie_tracker.get_cookie_headers(wb_url.url, urlrewriter, cookie_key) inputreq.extra_cookie, setcookie_headers = res + # TWITTER TEST + if is_ajax: + print('AJAX') + m = re.match('https://twitter[.]com/[^/]+/status/([^/]+)', wb_url.url) + if m: + wb_url.url += '?conversation' + r = self._do_req(inputreq, wb_url, kwargs, skip) if r.status_code >= 400: @@ -293,6 +301,22 @@ class RewriterApp(object): return resp + if record.rec_type == 'metadata' and record.rec_headers.get('WARC-Profile') == 'history': + history_state = record.content_stream().read().decode('utf-8') + orig_wb_url = WbUrl(record.rec_headers.get('WARC-Refers-To-Target-URI')) + orig_wb_url.timestamp = iso_date_to_timestamp(record.rec_headers.get('WARC-Refers-To-Date')) + print(orig_wb_url.url, orig_wb_url.timestamp) + new_r = self._do_req(inputreq, 
orig_wb_url, kwargs, skip) + + stream = BufferedReader(new_r.raw, block_size=BUFF_SIZE) + record = self.loader.parse_record_stream(stream, + ensure_http_headers=True) + + print(record.rec_headers) + + else: + history_state = 'undefined' + self._add_custom_params(cdx, r.headers, kwargs) if readd_range and record.http_headers.get_statuscode() == '200': @@ -318,7 +342,8 @@ class RewriterApp(object): top_url, environ, framed_replay, - config=self.config)) + config=self.config, + history_state=history_state)) cookie_rewriter = None if self.cookie_tracker: diff --git a/pywb/rules.yaml b/pywb/rules.yaml index 545cb7fe..8e0b3cae 100644 --- a/pywb/rules.yaml +++ b/pywb/rules.yaml @@ -60,6 +60,10 @@ rules: - url_prefix: 'com,twitter)/i/videos/tweet' fuzzy_lookup: '()' + + - url_prefix: 'com,twitter)/' + + fuzzy_lookup: '.*(conversation)?.*' # facebook rules diff --git a/pywb/static/wombat.js b/pywb/static/wombat.js index ec5f22d5..edfd876b 100644 --- a/pywb/static/wombat.js +++ b/pywb/static/wombat.js @@ -796,7 +796,7 @@ var _WBWombat = function($wbwindow, wbinfo) { } //============================================ - function send_history_update(url, title) { + function send_history_update(state, type, url, title) { if ($wbwindow.__WB_top_frame && $wbwindow == $wbwindow.__WB_replay_top) { var message = { "url": url, @@ -805,6 +805,8 @@ var _WBWombat = function($wbwindow, wbinfo) { "is_live": wb_info.is_live, "title": title, "wb_type": "replace-url", + "state": state, + "change_type": type, } $wbwindow.__WB_top_frame.postMessage(message, wb_info.top_host); @@ -817,7 +819,7 @@ var _WBWombat = function($wbwindow, wbinfo) { override_history_func("replaceState"); $wbwindow.addEventListener("popstate", function(event) { - send_history_update($wbwindow.WB_wombat_location.href, $wbwindow.document.title); + send_history_update(event.state, "popState", $wbwindow.WB_wombat_location.href, $wbwindow.document.title); }); } @@ -847,7 +849,7 @@ var _WBWombat = function($wbwindow, wbinfo) { 
if (url && (url != $wbwindow.WB_wombat_location.origin && $wbwindow.WB_wombat_location.href != "about:blank") && !starts_with(url, $wbwindow.WB_wombat_location.origin + "/")) { - throw new DOMException("Invalid history change: " + url); + //throw new DOMException("Invalid history change: " + url); } } else { url = $wbwindow.WB_wombat_location.href; @@ -855,7 +857,7 @@ var _WBWombat = function($wbwindow, wbinfo) { orig_func.call(this, state_obj, title, rewritten_url); - send_history_update(url, title); + send_history_update(state_obj, func_name, url, title); } $wbwindow.history[func_name] = rewritten_func; @@ -3036,6 +3038,52 @@ var _WBWombat = function($wbwindow, wbinfo) { // End Proxy Obj Override System + function init_history_replay($wbwindow, wbinfo) { // re-applies the pushState entries listed in wbinfo.history_state.states + if (!wbinfo.history_state) { + return; + } + + if ($wbwindow.__WB_replay_top != $wbwindow) { // only runs when $wbwindow is its own __WB_replay_top + return; + } + + // replace initial state + var orig_url = wbinfo.prefix + wbinfo.history_state.base_timestamp + wbinfo.mod + "/" + wbinfo.history_state.base_url; + $wbwindow.history.replaceState(wbinfo.history_state.base_state || null, "Title", orig_url); + + var replayed = false; + + $wbwindow.addEventListener("load", function() { + setTimeout(replay, 500); + }); + + function replay() { + if (replayed) { + return; + } + + if ($wbwindow.document.readyState != "complete") { // read the wrapped window's document, like every other access here + return; + } + + replayed = true; + + var states = wbinfo.history_state.states || []; // tolerate a missing states list + var lastState = states.length ? states[states.length - 1][0] : null; + + if (!states.length || $wbwindow.history.state == lastState) { // nothing to replay, or already at the last state + return; + } + + for (var i = 0; i < states.length; i++) { + console.log(JSON.stringify(states[i])); + $wbwindow.history.pushState.apply($wbwindow.history, states[i]); + } + + $wbwindow.dispatchEvent(new PopStateEvent('popstate', { state: lastState})); + }; + } + //============================================ function wombat_init(wbinfo) { init_paths(wbinfo); @@ -3044,6 +3092,8 @@ var _WBWombat = function($wbwindow, wbinfo) { init_wombat_loc($wbwindow); + 
init_history_replay($wbwindow, wbinfo); + // archival mode: init url-rewriting intercepts if (!wb_is_proxy) { init_wombat_top($wbwindow); diff --git a/pywb/templates/head_insert.html b/pywb/templates/head_insert.html index 5aa84b12..b3e84e88 100644 --- a/pywb/templates/head_insert.html +++ b/pywb/templates/head_insert.html @@ -15,6 +15,8 @@ wbinfo.proxy_magic = "{{ env.pywb_proxy_magic }}"; wbinfo.static_prefix = "{{ host_prefix }}/{{ static_path }}/"; + wbinfo.history_state = {{ history_state }}; + {% if not wb_url.is_banner_only %} wbinfo.wombat_ts = "{{ wombat_ts }}"; wbinfo.wombat_sec = "{{ wombat_sec }}";