1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

record-history experiments via custom metadata record, head_insert

Work in progress; highly experimental.
This commit is contained in:
Ilya Kreymer 2017-11-04 17:37:46 -07:00
parent 0c74616070
commit dc7b8956bb
4 changed files with 88 additions and 7 deletions

View File

@ -14,7 +14,7 @@ from pywb.utils.loaders import extract_client_cookie
from pywb.utils.io import BUFF_SIZE
from pywb.utils.memento import MementoUtils
from warcio.timeutils import http_date_to_timestamp, timestamp_to_http_date
from warcio.timeutils import http_date_to_timestamp, timestamp_to_http_date, iso_date_to_timestamp
from warcio.bufferedreaders import BufferedReader
from warcio.recordloader import ArcWarcRecordLoader
@ -24,7 +24,7 @@ from pywb.apps.wbrequestresponse import WbResponse
from pywb.rewrite.rewriteinputreq import RewriteInputRequest
from pywb.rewrite.templateview import JinjaEnv, HeadInsertView, TopFrameView, BaseInsertView
import re
from io import BytesIO
from copy import copy
@ -144,6 +144,7 @@ class RewriterApp(object):
full_prefix = host_prefix + rel_prefix
is_proxy = ('wsgiprox.proxy_host' in environ)
is_ajax = self.is_ajax(environ)
response = self.handle_custom_response(environ, wb_url,
full_prefix, host_prefix,
@ -223,6 +224,13 @@ class RewriterApp(object):
res = self.cookie_tracker.get_cookie_headers(wb_url.url, urlrewriter, cookie_key)
inputreq.extra_cookie, setcookie_headers = res
# TWITTER TEST
if is_ajax:
print('AJAX')
m = re.match('https://twitter[.]com/[^/]+/status/([^/]+)', wb_url.url)
if m:
wb_url.url += '?conversation'
r = self._do_req(inputreq, wb_url, kwargs, skip)
if r.status_code >= 400:
@ -293,6 +301,22 @@ class RewriterApp(object):
return resp
if record.rec_type == 'metadata' and record.rec_headers.get('WARC-Profile') == 'history':
history_state = record.content_stream().read().decode('utf-8')
orig_wb_url = WbUrl(record.rec_headers.get('WARC-Refers-To-Target-URI'))
orig_wb_url.timestamp = iso_date_to_timestamp(record.rec_headers.get('WARC-Refers-To-Date'))
print(orig_wb_url.url, orig_wb_url.timestamp)
new_r = self._do_req(inputreq, orig_wb_url, kwargs, skip)
stream = BufferedReader(new_r.raw, block_size=BUFF_SIZE)
record = self.loader.parse_record_stream(stream,
ensure_http_headers=True)
print(record.rec_headers)
else:
history_state = 'undefined'
self._add_custom_params(cdx, r.headers, kwargs)
if readd_range and record.http_headers.get_statuscode() == '200':
@ -318,7 +342,8 @@ class RewriterApp(object):
top_url,
environ,
framed_replay,
config=self.config))
config=self.config,
history_state=history_state))
cookie_rewriter = None
if self.cookie_tracker:

View File

@ -60,6 +60,10 @@ rules:
- url_prefix: 'com,twitter)/i/videos/tweet'
fuzzy_lookup: '()'
- url_prefix: 'com,twitter)/'
fuzzy_lookup: '.*(conversation)?.*'
# facebook rules

View File

@ -796,7 +796,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
}
//============================================
function send_history_update(url, title) {
function send_history_update(state, type, url, title) {
if ($wbwindow.__WB_top_frame && $wbwindow == $wbwindow.__WB_replay_top) {
var message = {
"url": url,
@ -805,6 +805,8 @@ var _WBWombat = function($wbwindow, wbinfo) {
"is_live": wb_info.is_live,
"title": title,
"wb_type": "replace-url",
"state": state,
"change_type": type,
}
$wbwindow.__WB_top_frame.postMessage(message, wb_info.top_host);
@ -817,7 +819,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
override_history_func("replaceState");
$wbwindow.addEventListener("popstate", function(event) {
send_history_update($wbwindow.WB_wombat_location.href, $wbwindow.document.title);
send_history_update(event.state, "popState", $wbwindow.WB_wombat_location.href, $wbwindow.document.title);
});
}
@ -847,7 +849,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
if (url && (url != $wbwindow.WB_wombat_location.origin && $wbwindow.WB_wombat_location.href != "about:blank") &&
!starts_with(url, $wbwindow.WB_wombat_location.origin + "/")) {
throw new DOMException("Invalid history change: " + url);
//throw new DOMException("Invalid history change: " + url);
}
} else {
url = $wbwindow.WB_wombat_location.href;
@ -855,7 +857,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
orig_func.call(this, state_obj, title, rewritten_url);
send_history_update(url, title);
send_history_update(state_obj, func_name, url, title);
}
$wbwindow.history[func_name] = rewritten_func;
@ -3036,6 +3038,52 @@ var _WBWombat = function($wbwindow, wbinfo) {
// End Proxy Obj Override System
/**
 * Re-apply a recorded sequence of history states on the replayed top frame.
 *
 * Reads `wbinfo.history_state`, which this function expects to look like:
 *   - base_timestamp, base_url: combined with wbinfo.prefix and wbinfo.mod
 *     to build the URL installed via history.replaceState()
 *   - base_state: state object for that initial entry (null if falsy)
 *   - states: array of argument arrays, each passed to history.pushState()
 *     via apply(); element [0] of each entry is the state object
 *
 * Does nothing when there is no history_state or when $wbwindow is not the
 * replay top frame. After the document has finished loading, every entry in
 * `states` is pushed and a synthetic "popstate" event carrying the last state
 * is dispatched on $wbwindow.
 *
 * @param {Window} $wbwindow - window being initialized
 * @param {Object} wbinfo - replay info object (prefix, mod, history_state)
 */
function init_history_replay($wbwindow, wbinfo) {
    var history_state = wbinfo.history_state;

    if (!history_state) {
        return;
    }

    if ($wbwindow.__WB_replay_top != $wbwindow) {
        return;
    }

    // replace initial state
    var orig_url = wbinfo.prefix + history_state.base_timestamp + wbinfo.mod + "/" + history_state.base_url;
    $wbwindow.history.replaceState(history_state.base_state || null, "Title", orig_url);

    // guards against the state sequence being pushed more than once
    var replayed = false;

    function replay() {
        if (replayed) {
            return;
        }

        // check the document of the window being replayed, not the global one
        if ($wbwindow.document.readyState != "complete") {
            return;
        }

        replayed = true;

        var states = history_state.states;

        // nothing recorded beyond the base entry: avoid indexing an empty list
        if (!states || !states.length) {
            return;
        }

        var lastState = states[states.length - 1][0];

        // loose equality: null and undefined states compare equal,
        // object states compare by identity
        if ($wbwindow.history.state == lastState) {
            return;
        }

        for (var i = 0; i < states.length; i++) {
            $wbwindow.history.pushState.apply($wbwindow.history, states[i]);
        }

        $wbwindow.dispatchEvent(new PopStateEvent('popstate', { state: lastState }));
    }

    function schedule_replay() {
        setTimeout(replay, 500);
    }

    if ($wbwindow.document.readyState == "complete") {
        // document already finished loading, so "load" will not be
        // delivered to a listener added now; schedule directly
        schedule_replay();
    } else {
        $wbwindow.addEventListener("load", schedule_replay);
    }
}
//============================================
function wombat_init(wbinfo) {
init_paths(wbinfo);
@ -3044,6 +3092,8 @@ var _WBWombat = function($wbwindow, wbinfo) {
init_wombat_loc($wbwindow);
init_history_replay($wbwindow, wbinfo);
// archival mode: init url-rewriting intercepts
if (!wb_is_proxy) {
init_wombat_top($wbwindow);

View File

@ -15,6 +15,8 @@
wbinfo.proxy_magic = "{{ env.pywb_proxy_magic }}";
wbinfo.static_prefix = "{{ host_prefix }}/{{ static_path }}/";
wbinfo.history_state = {{ history_state }};
{% if not wb_url.is_banner_only %}
wbinfo.wombat_ts = "{{ wombat_ts }}";
wbinfo.wombat_sec = "{{ wombat_sec }}";