mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
Record-history experiments via a custom metadata record and head_insert.
Work in progress; highly experimental.
This commit is contained in:
parent
0c74616070
commit
dc7b8956bb
@ -14,7 +14,7 @@ from pywb.utils.loaders import extract_client_cookie
|
|||||||
from pywb.utils.io import BUFF_SIZE
|
from pywb.utils.io import BUFF_SIZE
|
||||||
from pywb.utils.memento import MementoUtils
|
from pywb.utils.memento import MementoUtils
|
||||||
|
|
||||||
from warcio.timeutils import http_date_to_timestamp, timestamp_to_http_date
|
from warcio.timeutils import http_date_to_timestamp, timestamp_to_http_date, iso_date_to_timestamp
|
||||||
from warcio.bufferedreaders import BufferedReader
|
from warcio.bufferedreaders import BufferedReader
|
||||||
from warcio.recordloader import ArcWarcRecordLoader
|
from warcio.recordloader import ArcWarcRecordLoader
|
||||||
|
|
||||||
@ -24,7 +24,7 @@ from pywb.apps.wbrequestresponse import WbResponse
|
|||||||
from pywb.rewrite.rewriteinputreq import RewriteInputRequest
|
from pywb.rewrite.rewriteinputreq import RewriteInputRequest
|
||||||
from pywb.rewrite.templateview import JinjaEnv, HeadInsertView, TopFrameView, BaseInsertView
|
from pywb.rewrite.templateview import JinjaEnv, HeadInsertView, TopFrameView, BaseInsertView
|
||||||
|
|
||||||
|
import re
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from copy import copy
|
from copy import copy
|
||||||
|
|
||||||
@ -144,6 +144,7 @@ class RewriterApp(object):
|
|||||||
full_prefix = host_prefix + rel_prefix
|
full_prefix = host_prefix + rel_prefix
|
||||||
|
|
||||||
is_proxy = ('wsgiprox.proxy_host' in environ)
|
is_proxy = ('wsgiprox.proxy_host' in environ)
|
||||||
|
is_ajax = self.is_ajax(environ)
|
||||||
|
|
||||||
response = self.handle_custom_response(environ, wb_url,
|
response = self.handle_custom_response(environ, wb_url,
|
||||||
full_prefix, host_prefix,
|
full_prefix, host_prefix,
|
||||||
@ -223,6 +224,13 @@ class RewriterApp(object):
|
|||||||
res = self.cookie_tracker.get_cookie_headers(wb_url.url, urlrewriter, cookie_key)
|
res = self.cookie_tracker.get_cookie_headers(wb_url.url, urlrewriter, cookie_key)
|
||||||
inputreq.extra_cookie, setcookie_headers = res
|
inputreq.extra_cookie, setcookie_headers = res
|
||||||
|
|
||||||
|
# TWITTER TEST
|
||||||
|
if is_ajax:
|
||||||
|
print('AJAX')
|
||||||
|
m = re.match('https://twitter[.]com/[^/]+/status/([^/]+)', wb_url.url)
|
||||||
|
if m:
|
||||||
|
wb_url.url += '?conversation'
|
||||||
|
|
||||||
r = self._do_req(inputreq, wb_url, kwargs, skip)
|
r = self._do_req(inputreq, wb_url, kwargs, skip)
|
||||||
|
|
||||||
if r.status_code >= 400:
|
if r.status_code >= 400:
|
||||||
@ -293,6 +301,22 @@ class RewriterApp(object):
|
|||||||
|
|
||||||
return resp
|
return resp
|
||||||
|
|
||||||
|
if record.rec_type == 'metadata' and record.rec_headers.get('WARC-Profile') == 'history':
|
||||||
|
history_state = record.content_stream().read().decode('utf-8')
|
||||||
|
orig_wb_url = WbUrl(record.rec_headers.get('WARC-Refers-To-Target-URI'))
|
||||||
|
orig_wb_url.timestamp = iso_date_to_timestamp(record.rec_headers.get('WARC-Refers-To-Date'))
|
||||||
|
print(orig_wb_url.url, orig_wb_url.timestamp)
|
||||||
|
new_r = self._do_req(inputreq, orig_wb_url, kwargs, skip)
|
||||||
|
|
||||||
|
stream = BufferedReader(new_r.raw, block_size=BUFF_SIZE)
|
||||||
|
record = self.loader.parse_record_stream(stream,
|
||||||
|
ensure_http_headers=True)
|
||||||
|
|
||||||
|
print(record.rec_headers)
|
||||||
|
|
||||||
|
else:
|
||||||
|
history_state = 'undefined'
|
||||||
|
|
||||||
self._add_custom_params(cdx, r.headers, kwargs)
|
self._add_custom_params(cdx, r.headers, kwargs)
|
||||||
|
|
||||||
if readd_range and record.http_headers.get_statuscode() == '200':
|
if readd_range and record.http_headers.get_statuscode() == '200':
|
||||||
@ -318,7 +342,8 @@ class RewriterApp(object):
|
|||||||
top_url,
|
top_url,
|
||||||
environ,
|
environ,
|
||||||
framed_replay,
|
framed_replay,
|
||||||
config=self.config))
|
config=self.config,
|
||||||
|
history_state=history_state))
|
||||||
|
|
||||||
cookie_rewriter = None
|
cookie_rewriter = None
|
||||||
if self.cookie_tracker:
|
if self.cookie_tracker:
|
||||||
|
@ -60,6 +60,10 @@ rules:
|
|||||||
- url_prefix: 'com,twitter)/i/videos/tweet'
|
- url_prefix: 'com,twitter)/i/videos/tweet'
|
||||||
|
|
||||||
fuzzy_lookup: '()'
|
fuzzy_lookup: '()'
|
||||||
|
|
||||||
|
- url_prefix: 'com,twitter)/'
|
||||||
|
|
||||||
|
fuzzy_lookup: '.*(conversation)?.*'
|
||||||
|
|
||||||
|
|
||||||
# facebook rules
|
# facebook rules
|
||||||
|
@ -796,7 +796,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
//============================================
|
//============================================
|
||||||
function send_history_update(url, title) {
|
function send_history_update(state, type, url, title) {
|
||||||
if ($wbwindow.__WB_top_frame && $wbwindow == $wbwindow.__WB_replay_top) {
|
if ($wbwindow.__WB_top_frame && $wbwindow == $wbwindow.__WB_replay_top) {
|
||||||
var message = {
|
var message = {
|
||||||
"url": url,
|
"url": url,
|
||||||
@ -805,6 +805,8 @@ var _WBWombat = function($wbwindow, wbinfo) {
|
|||||||
"is_live": wb_info.is_live,
|
"is_live": wb_info.is_live,
|
||||||
"title": title,
|
"title": title,
|
||||||
"wb_type": "replace-url",
|
"wb_type": "replace-url",
|
||||||
|
"state": state,
|
||||||
|
"change_type": type,
|
||||||
}
|
}
|
||||||
|
|
||||||
$wbwindow.__WB_top_frame.postMessage(message, wb_info.top_host);
|
$wbwindow.__WB_top_frame.postMessage(message, wb_info.top_host);
|
||||||
@ -817,7 +819,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
|
|||||||
override_history_func("replaceState");
|
override_history_func("replaceState");
|
||||||
|
|
||||||
$wbwindow.addEventListener("popstate", function(event) {
|
$wbwindow.addEventListener("popstate", function(event) {
|
||||||
send_history_update($wbwindow.WB_wombat_location.href, $wbwindow.document.title);
|
send_history_update(event.state, "popState", $wbwindow.WB_wombat_location.href, $wbwindow.document.title);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -847,7 +849,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
|
|||||||
|
|
||||||
if (url && (url != $wbwindow.WB_wombat_location.origin && $wbwindow.WB_wombat_location.href != "about:blank") &&
|
if (url && (url != $wbwindow.WB_wombat_location.origin && $wbwindow.WB_wombat_location.href != "about:blank") &&
|
||||||
!starts_with(url, $wbwindow.WB_wombat_location.origin + "/")) {
|
!starts_with(url, $wbwindow.WB_wombat_location.origin + "/")) {
|
||||||
throw new DOMException("Invalid history change: " + url);
|
//throw new DOMException("Invalid history change: " + url);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
url = $wbwindow.WB_wombat_location.href;
|
url = $wbwindow.WB_wombat_location.href;
|
||||||
@ -855,7 +857,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
|
|||||||
|
|
||||||
orig_func.call(this, state_obj, title, rewritten_url);
|
orig_func.call(this, state_obj, title, rewritten_url);
|
||||||
|
|
||||||
send_history_update(url, title);
|
send_history_update(state_obj, func_name, url, title);
|
||||||
}
|
}
|
||||||
|
|
||||||
$wbwindow.history[func_name] = rewritten_func;
|
$wbwindow.history[func_name] = rewritten_func;
|
||||||
@ -3036,6 +3038,52 @@ var _WBWombat = function($wbwindow, wbinfo) {
|
|||||||
// End Proxy Obj Override System
|
// End Proxy Obj Override System
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Re-apply a recorded sequence of History API states in the top replay frame.
 *
 * Reads `wbinfo.history_state` and does nothing when it is falsy or when
 * `$wbwindow` is not the replay top frame (`$wbwindow.__WB_replay_top`).
 * Otherwise it:
 *   1. immediately replaces the current history entry with the base entry
 *      built from `wbinfo.prefix`, `history_state.base_timestamp`,
 *      `wbinfo.mod` and `history_state.base_url`;
 *   2. 500ms after the window "load" event, pushes every entry of
 *      `history_state.states` through `history.pushState` and dispatches a
 *      synthetic "popstate" event carrying the last pushed state.
 *
 * Each entry of `history_state.states` is used as the argument list for
 * `pushState`, so index 0 of an entry is the state object.
 *
 * @param {Window} $wbwindow  window whose history is manipulated
 * @param {Object} wbinfo     replay info object; only `prefix`, `mod` and
 *                            `history_state` are read here
 */
function init_history_replay($wbwindow, wbinfo) {
    if (!wbinfo.history_state) {
        return;
    }

    if ($wbwindow.__WB_replay_top != $wbwindow) {
        return;
    }

    // replace initial state
    var orig_url = wbinfo.prefix + wbinfo.history_state.base_timestamp + wbinfo.mod + "/" + wbinfo.history_state.base_url;
    $wbwindow.history.replaceState(wbinfo.history_state.base_state || null, "Title", orig_url);

    // Guards against replaying more than once if "load" fires repeatedly
    var replayed = false;

    function replay() {
        if (replayed) {
            return;
        }

        // Use the document of the target window, not the global one, so the
        // check is correct when $wbwindow is not the script's own window
        if ($wbwindow.document.readyState != "complete") {
            return;
        }

        replayed = true;

        var states = wbinfo.history_state.states;

        // Nothing recorded beyond the base entry: avoid indexing an empty list
        if (!states || !states.length) {
            return;
        }

        var lastState = states[states.length - 1][0];

        // NOTE(review): loose equality only matches primitive states or the
        // very same object reference; structured states will likely never
        // compare equal here -- confirm the intended check.
        if ($wbwindow.history.state == lastState) {
            return;
        }

        for (var i = 0; i < states.length; i++) {
            $wbwindow.history.pushState.apply($wbwindow.history, states[i]);
        }

        // Notify the page as if the user had navigated to the final state
        $wbwindow.dispatchEvent(new PopStateEvent('popstate', { state: lastState}));
    }

    $wbwindow.addEventListener("load", function() {
        // Delay so the page's own load handlers can run first
        setTimeout(replay, 500);
    });
}
|
||||||
|
|
||||||
//============================================
|
//============================================
|
||||||
function wombat_init(wbinfo) {
|
function wombat_init(wbinfo) {
|
||||||
init_paths(wbinfo);
|
init_paths(wbinfo);
|
||||||
@ -3044,6 +3092,8 @@ var _WBWombat = function($wbwindow, wbinfo) {
|
|||||||
|
|
||||||
init_wombat_loc($wbwindow);
|
init_wombat_loc($wbwindow);
|
||||||
|
|
||||||
|
init_history_replay($wbwindow, wbinfo);
|
||||||
|
|
||||||
// archival mode: init url-rewriting intercepts
|
// archival mode: init url-rewriting intercepts
|
||||||
if (!wb_is_proxy) {
|
if (!wb_is_proxy) {
|
||||||
init_wombat_top($wbwindow);
|
init_wombat_top($wbwindow);
|
||||||
|
@ -15,6 +15,8 @@
|
|||||||
wbinfo.proxy_magic = "{{ env.pywb_proxy_magic }}";
|
wbinfo.proxy_magic = "{{ env.pywb_proxy_magic }}";
|
||||||
wbinfo.static_prefix = "{{ host_prefix }}/{{ static_path }}/";
|
wbinfo.static_prefix = "{{ host_prefix }}/{{ static_path }}/";
|
||||||
|
|
||||||
|
wbinfo.history_state = {{ history_state }};
|
||||||
|
|
||||||
{% if not wb_url.is_banner_only %}
|
{% if not wb_url.is_banner_only %}
|
||||||
wbinfo.wombat_ts = "{{ wombat_ts }}";
|
wbinfo.wombat_ts = "{{ wombat_ts }}";
|
||||||
wbinfo.wombat_sec = "{{ wombat_sec }}";
|
wbinfo.wombat_sec = "{{ wombat_sec }}";
|
||||||
|
Loading…
x
Reference in New Issue
Block a user