1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

record-history experiments via custom metadata record, head_insert

Work in progress; highly experimental.
This commit is contained in:
Ilya Kreymer 2017-11-04 17:37:46 -07:00
parent 0c74616070
commit dc7b8956bb
4 changed files with 88 additions and 7 deletions

View File

@ -14,7 +14,7 @@ from pywb.utils.loaders import extract_client_cookie
from pywb.utils.io import BUFF_SIZE from pywb.utils.io import BUFF_SIZE
from pywb.utils.memento import MementoUtils from pywb.utils.memento import MementoUtils
from warcio.timeutils import http_date_to_timestamp, timestamp_to_http_date from warcio.timeutils import http_date_to_timestamp, timestamp_to_http_date, iso_date_to_timestamp
from warcio.bufferedreaders import BufferedReader from warcio.bufferedreaders import BufferedReader
from warcio.recordloader import ArcWarcRecordLoader from warcio.recordloader import ArcWarcRecordLoader
@ -24,7 +24,7 @@ from pywb.apps.wbrequestresponse import WbResponse
from pywb.rewrite.rewriteinputreq import RewriteInputRequest from pywb.rewrite.rewriteinputreq import RewriteInputRequest
from pywb.rewrite.templateview import JinjaEnv, HeadInsertView, TopFrameView, BaseInsertView from pywb.rewrite.templateview import JinjaEnv, HeadInsertView, TopFrameView, BaseInsertView
import re
from io import BytesIO from io import BytesIO
from copy import copy from copy import copy
@ -144,6 +144,7 @@ class RewriterApp(object):
full_prefix = host_prefix + rel_prefix full_prefix = host_prefix + rel_prefix
is_proxy = ('wsgiprox.proxy_host' in environ) is_proxy = ('wsgiprox.proxy_host' in environ)
is_ajax = self.is_ajax(environ)
response = self.handle_custom_response(environ, wb_url, response = self.handle_custom_response(environ, wb_url,
full_prefix, host_prefix, full_prefix, host_prefix,
@ -223,6 +224,13 @@ class RewriterApp(object):
res = self.cookie_tracker.get_cookie_headers(wb_url.url, urlrewriter, cookie_key) res = self.cookie_tracker.get_cookie_headers(wb_url.url, urlrewriter, cookie_key)
inputreq.extra_cookie, setcookie_headers = res inputreq.extra_cookie, setcookie_headers = res
# TWITTER TEST
if is_ajax:
print('AJAX')
m = re.match('https://twitter[.]com/[^/]+/status/([^/]+)', wb_url.url)
if m:
wb_url.url += '?conversation'
r = self._do_req(inputreq, wb_url, kwargs, skip) r = self._do_req(inputreq, wb_url, kwargs, skip)
if r.status_code >= 400: if r.status_code >= 400:
@ -293,6 +301,22 @@ class RewriterApp(object):
return resp return resp
if record.rec_type == 'metadata' and record.rec_headers.get('WARC-Profile') == 'history':
history_state = record.content_stream().read().decode('utf-8')
orig_wb_url = WbUrl(record.rec_headers.get('WARC-Refers-To-Target-URI'))
orig_wb_url.timestamp = iso_date_to_timestamp(record.rec_headers.get('WARC-Refers-To-Date'))
print(orig_wb_url.url, orig_wb_url.timestamp)
new_r = self._do_req(inputreq, orig_wb_url, kwargs, skip)
stream = BufferedReader(new_r.raw, block_size=BUFF_SIZE)
record = self.loader.parse_record_stream(stream,
ensure_http_headers=True)
print(record.rec_headers)
else:
history_state = 'undefined'
self._add_custom_params(cdx, r.headers, kwargs) self._add_custom_params(cdx, r.headers, kwargs)
if readd_range and record.http_headers.get_statuscode() == '200': if readd_range and record.http_headers.get_statuscode() == '200':
@ -318,7 +342,8 @@ class RewriterApp(object):
top_url, top_url,
environ, environ,
framed_replay, framed_replay,
config=self.config)) config=self.config,
history_state=history_state))
cookie_rewriter = None cookie_rewriter = None
if self.cookie_tracker: if self.cookie_tracker:

View File

@ -60,6 +60,10 @@ rules:
- url_prefix: 'com,twitter)/i/videos/tweet' - url_prefix: 'com,twitter)/i/videos/tweet'
fuzzy_lookup: '()' fuzzy_lookup: '()'
- url_prefix: 'com,twitter)/'
fuzzy_lookup: '.*(conversation)?.*'
# facebook rules # facebook rules

View File

@ -796,7 +796,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
} }
//============================================ //============================================
function send_history_update(url, title) { function send_history_update(state, type, url, title) {
if ($wbwindow.__WB_top_frame && $wbwindow == $wbwindow.__WB_replay_top) { if ($wbwindow.__WB_top_frame && $wbwindow == $wbwindow.__WB_replay_top) {
var message = { var message = {
"url": url, "url": url,
@ -805,6 +805,8 @@ var _WBWombat = function($wbwindow, wbinfo) {
"is_live": wb_info.is_live, "is_live": wb_info.is_live,
"title": title, "title": title,
"wb_type": "replace-url", "wb_type": "replace-url",
"state": state,
"change_type": type,
} }
$wbwindow.__WB_top_frame.postMessage(message, wb_info.top_host); $wbwindow.__WB_top_frame.postMessage(message, wb_info.top_host);
@ -817,7 +819,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
override_history_func("replaceState"); override_history_func("replaceState");
$wbwindow.addEventListener("popstate", function(event) { $wbwindow.addEventListener("popstate", function(event) {
send_history_update($wbwindow.WB_wombat_location.href, $wbwindow.document.title); send_history_update(event.state, "popState", $wbwindow.WB_wombat_location.href, $wbwindow.document.title);
}); });
} }
@ -847,7 +849,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
if (url && (url != $wbwindow.WB_wombat_location.origin && $wbwindow.WB_wombat_location.href != "about:blank") && if (url && (url != $wbwindow.WB_wombat_location.origin && $wbwindow.WB_wombat_location.href != "about:blank") &&
!starts_with(url, $wbwindow.WB_wombat_location.origin + "/")) { !starts_with(url, $wbwindow.WB_wombat_location.origin + "/")) {
throw new DOMException("Invalid history change: " + url); //throw new DOMException("Invalid history change: " + url);
} }
} else { } else {
url = $wbwindow.WB_wombat_location.href; url = $wbwindow.WB_wombat_location.href;
@ -855,7 +857,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
orig_func.call(this, state_obj, title, rewritten_url); orig_func.call(this, state_obj, title, rewritten_url);
send_history_update(url, title); send_history_update(state_obj, func_name, url, title);
} }
$wbwindow.history[func_name] = rewritten_func; $wbwindow.history[func_name] = rewritten_func;
@ -3036,6 +3038,52 @@ var _WBWombat = function($wbwindow, wbinfo) {
// End Proxy Obj Override System // End Proxy Obj Override System
/**
 * Re-apply a recorded sequence of History API states on the top replay frame.
 *
 * Reads `wbinfo.history_state`, which this function accesses as:
 *   - base_timestamp, base_url: combined with wbinfo.prefix and wbinfo.mod to
 *     form the URL installed as the initial history entry
 *   - base_state: state object for that initial entry (null when absent)
 *   - states: array of argument arrays, each passed to history.pushState()
 *
 * Does nothing when no history state was supplied or when $wbwindow is not
 * the top replay window.
 *
 * @param {Window} $wbwindow - window being initialized
 * @param {Object} wbinfo - replay info object injected by the page template
 */
function init_history_replay($wbwindow, wbinfo) {
    var history_state = wbinfo.history_state;

    if (!history_state) {
        return;
    }

    // only the top-most replay frame owns the visible history
    if ($wbwindow.__WB_replay_top != $wbwindow) {
        return;
    }

    // replace initial state
    var orig_url = wbinfo.prefix + history_state.base_timestamp + wbinfo.mod + "/" + history_state.base_url;
    $wbwindow.history.replaceState(history_state.base_state || null, "Title", orig_url);

    // guards against replaying more than once
    var replayed = false;

    // wait for load, then give page scripts a short head start before replaying
    $wbwindow.addEventListener("load", function() {
        setTimeout(replay, 500);
    });

    function replay() {
        if (replayed) {
            return;
        }

        // use the replayed window's own document, consistent with the rest of wombat
        if ($wbwindow.document.readyState != "complete") {
            return;
        }

        replayed = true;

        var states = history_state.states;

        // nothing recorded: avoid indexing into a missing or empty list
        if (!states || !states.length) {
            return;
        }

        var lastState = states[states.length - 1][0];

        // NOTE(review): history.state is typically a copy of the pushed object, so
        // this loose comparison likely only matches for primitive/null states -- confirm
        if ($wbwindow.history.state == lastState) {
            return;
        }

        for (var i = 0; i < states.length; i++) {
            $wbwindow.history.pushState.apply($wbwindow.history, states[i]);
        }

        // pushState() does not fire popstate itself; notify listeners of the final state
        $wbwindow.dispatchEvent(new PopStateEvent('popstate', { state: lastState}));
    }
}
//============================================ //============================================
function wombat_init(wbinfo) { function wombat_init(wbinfo) {
init_paths(wbinfo); init_paths(wbinfo);
@ -3044,6 +3092,8 @@ var _WBWombat = function($wbwindow, wbinfo) {
init_wombat_loc($wbwindow); init_wombat_loc($wbwindow);
init_history_replay($wbwindow, wbinfo);
// archival mode: init url-rewriting intercepts // archival mode: init url-rewriting intercepts
if (!wb_is_proxy) { if (!wb_is_proxy) {
init_wombat_top($wbwindow); init_wombat_top($wbwindow);

View File

@ -15,6 +15,8 @@
wbinfo.proxy_magic = "{{ env.pywb_proxy_magic }}"; wbinfo.proxy_magic = "{{ env.pywb_proxy_magic }}";
wbinfo.static_prefix = "{{ host_prefix }}/{{ static_path }}/"; wbinfo.static_prefix = "{{ host_prefix }}/{{ static_path }}/";
wbinfo.history_state = {{ history_state }};
{% if not wb_url.is_banner_only %} {% if not wb_url.is_banner_only %}
wbinfo.wombat_ts = "{{ wombat_ts }}"; wbinfo.wombat_ts = "{{ wombat_ts }}";
wbinfo.wombat_sec = "{{ wombat_sec }}"; wbinfo.wombat_sec = "{{ wombat_sec }}";