1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

record-history experiments via custom metadata record, head_insert (wip)

- check last url instead of last state, which may be empty
- add init_state tracking
- add accept filtering, enabled via special rule
This commit is contained in:
Ilya Kreymer 2017-11-04 17:37:46 -07:00
parent 0767bf80d5
commit 781b2aa393
9 changed files with 296 additions and 20 deletions

View File

@ -6,6 +6,10 @@ from werkzeug.exceptions import HTTPException, NotFound
from werkzeug.wsgi import pop_path_info
from six.moves.urllib.parse import urljoin
from six import iteritems
from io import BytesIO
from warcio.timeutils import timestamp_to_iso_date, timestamp_now
from warcio.timeutils import sec_to_timestamp, timestamp_to_sec
from warcio.utils import to_native_str
from wsgiprox.wsgiprox import WSGIProxMiddleware
@ -16,6 +20,7 @@ from pywb.recorder.recorderapp import RecorderApp
from pywb.utils.loaders import load_yaml_config
from pywb.utils.geventserver import GeventServer
from pywb.utils.io import StreamIter
from pywb.utils.format import query_to_dict
from pywb.warcserver.warcserver import WarcServer
@ -29,6 +34,7 @@ import os
import traceback
import requests
import logging
import json
# ============================================================================
@ -102,6 +108,73 @@ class FrontEndApp(object):
self.url_map.add(Rule(coll_prefix + '/<path:url>', endpoint=self.serve_content))
self.url_map.add(Rule('/_/add_waypoint', endpoint=self.serve_add_history))
def serve_add_history(self, environ):
    """Endpoint for the ``/_/add_waypoint`` route: save posted history.

    Only ``POST`` requests are processed. Any exception raised while
    saving is converted into an ``error_message`` entry, so the client
    always receives a JSON body.

    :param dict environ: the WSGI environment of the request
    :return: a JSON response wrapping the result dict
    """
    if environ.get('REQUEST_METHOD') == 'POST':
        try:
            payload = self.add_history(environ)
        except Exception as exc:
            # top-level boundary: report the failure to the client as JSON
            payload = {'error_message': str(exc)}
    else:
        payload = {'error_message': 'POST required'}

    return WbResponse.json_response(payload)
def add_history(self, environ):
    """Write the posted history JSON as a ``metadata`` record via the recorder.

    The request body is expected to be UTF-8 JSON containing ``states``,
    ``base_url``, ``base_timestamp`` and optionally ``final_url``. The raw
    body is sent unchanged with ``PUT`` to the recorder URL
    (``self.recorder_path`` plus ``&put_record=metadata``), with WARC
    headers marking it as a ``history`` profile record that refers back to
    the base url and timestamp.

    :param dict environ: the WSGI environment; ``QUERY_STRING`` must
        contain ``coll`` and ``wsgi.input`` must hold the JSON body
    :return: an empty dict on success (or when there are no states to
        save), otherwise a dict with an ``error`` / ``error_message`` key
    :raises ValueError: if the body is not valid JSON
    :raises KeyError: if ``base_timestamp`` or ``base_url`` is missing
    """
    if not self.recorder_path:
        return {'error': 'not recording'}

    params = query_to_dict(environ.get('QUERY_STRING'))
    if 'coll' not in params:
        return {'error': 'collection required'}

    upstream_url = self.recorder_path + '&put_record=metadata'

    hist_data = environ['wsgi.input'].read()
    hist_json = json.loads(hist_data.decode('utf-8'))

    # nothing worth recording unless at least one state was captured
    if not hist_json.get('states'):
        return {}

    # the record is stored under the last url reached, else the base url
    final_url = hist_json.get('final_url') or hist_json.get('base_url')

    upstream_url = upstream_url.format(url=final_url,
                                       coll=params['coll'].strip())

    now = timestamp_now()
    if now == hist_json['base_timestamp']:
        # Same second as the base capture: move one second forward.
        # timestamp_to_sec() needs the timestamp to convert; calling it
        # with no argument raised TypeError whenever this branch ran.
        now = sec_to_timestamp(timestamp_to_sec(now) + 1)

    headers = {'Content-Type': 'application/vnd.pywb-waypoint+json; charset=utf-8',
               'WARC-Refers-To-Target-URI': hist_json['base_url'],
               'WARC-Refers-To-Date': timestamp_to_iso_date(hist_json['base_timestamp']),
               'WARC-Profile': 'history',
               'WARC-Target-URI': final_url,
               'WARC-Date': timestamp_to_iso_date(now)
              }

    r = requests.put(upstream_url,
                     data=BytesIO(hist_data),
                     headers=headers,
                    )

    try:
        r.raise_for_status()
        res = r.json()
        # explicit check instead of ``assert``, which is removed under -O
        if res['success'] != 'true':
            raise ValueError('recorder did not report success')

        return {}
    except Exception:
        logging.exception('history save failed')
        return {'error_message': 'history save failed'}
def get_upstream_paths(self, port):
base_paths = {
'replay': self.REPLAY_API % port,
@ -269,6 +342,8 @@ class FrontEndApp(object):
if timemap_output:
metadata['output'] = timemap_output
environ['pywb.template_params'] = {'coll': coll}
try:
response = self.rewriterapp.render_content(wb_url_str, metadata, environ)
except UpstreamException as ue:

View File

@ -14,7 +14,7 @@ from pywb.utils.loaders import extract_client_cookie
from pywb.utils.io import BUFF_SIZE, OffsetLimitReader
from pywb.utils.memento import MementoUtils
from warcio.timeutils import http_date_to_timestamp, timestamp_to_http_date
from warcio.timeutils import http_date_to_timestamp, timestamp_to_http_date, iso_date_to_timestamp
from warcio.bufferedreaders import BufferedReader
from warcio.recordloader import ArcWarcRecordLoader
@ -24,7 +24,7 @@ from pywb.apps.wbrequestresponse import WbResponse
from pywb.rewrite.rewriteinputreq import RewriteInputRequest
from pywb.rewrite.templateview import JinjaEnv, HeadInsertView, TopFrameView, BaseInsertView
import re
from io import BytesIO
from copy import copy
@ -209,6 +209,7 @@ class RewriterApp(object):
full_prefix = host_prefix + rel_prefix
is_proxy = ('wsgiprox.proxy_host' in environ)
is_ajax = self.is_ajax(environ)
response = self.handle_custom_response(environ, wb_url,
full_prefix, host_prefix,
@ -264,6 +265,12 @@ class RewriterApp(object):
res = self.cookie_tracker.get_cookie_headers(wb_url.url, urlrewriter, cookie_key)
inputreq.extra_cookie, setcookie_headers = res
if is_ajax and kwargs.get('type') != 'record':
accept_filter = inputreq.get_accept_filter(wb_url.url)
if accept_filter:
kwargs['filter'] = '~mime:' + accept_filter
kwargs['matchType'] = 'prefix'
r = self._do_req(inputreq, wb_url, kwargs, skip_record)
if r.status_code >= 400:
@ -323,6 +330,8 @@ class RewriterApp(object):
return resp
record, history_state = self._resolve_history(record, inputreq, kwargs, skip_record)
self._add_custom_params(cdx, r.headers, kwargs)
if self._add_range(record, wb_url, range_start, range_end):
@ -342,7 +351,8 @@ class RewriterApp(object):
top_url,
environ,
framed_replay,
config=self.config))
config=self.config,
history_state=history_state))
cookie_rewriter = None
if self.cookie_tracker:
@ -378,6 +388,31 @@ class RewriterApp(object):
return response
def _resolve_history(self, record, inputreq, kwargs, skip_record):
history_state = None
while True:
if record.rec_type != 'metadata' or record.rec_headers.get('WARC-Profile') != 'history':
break
stream = record.content_stream()
try:
if not history_state:
history_state = stream.read().decode('utf-8')
finally:
stream.close()
orig_wb_url = WbUrl(record.rec_headers.get('WARC-Refers-To-Target-URI'))
orig_wb_url.timestamp = iso_date_to_timestamp(record.rec_headers.get('WARC-Refers-To-Date'))
orig_wb_url.type = orig_wb_url.REPLAY
kwargs['filter'] = '!status:302'
new_r = self._do_req(inputreq, orig_wb_url, kwargs, skip_record)
stream = BufferedReader(new_r.raw, block_size=BUFF_SIZE)
record = self.loader.parse_record_stream(stream,
ensure_http_headers=True)
return record, (history_state or 'undefined')
def format_response(self, response, wb_url, full_prefix, is_timegate, is_proxy):
memento_ts = None
if not isinstance(response, WbResponse):
@ -488,13 +523,17 @@ class RewriterApp(object):
params = {}
params['url'] = wb_url.url
params['closest'] = closest
params['matchType'] = 'exact'
params['matchType'] = kwargs.get('matchType', 'exact')
if wb_url.mod == 'vi_':
params['content_type'] = self.VIDEO_INFO_CONTENT_TYPE
upstream_url = self.get_upstream_url(wb_url, kwargs, params)
if 'filter' in kwargs:
upstream_url += '&filter=' + kwargs['filter']
r = requests.post(upstream_url,
data=BytesIO(req_data),
headers=headers,

View File

@ -141,7 +141,7 @@ class RecorderApp(object):
payload_length = req_stream.out.tell()
req_stream.out.seek(0)
record = self.writer.create_warc_record(uri=params['url'],
record = self.writer.create_warc_record(uri=params.get('url', ''),
record_type=record_type,
payload=req_stream.out,
length=payload_length,
@ -153,6 +153,10 @@ class RecorderApp(object):
msg = {'success': 'true',
'WARC-Date': record.rec_headers.get_header('WARC-Date')}
except:
import traceback
traceback.print_exc()
finally:
if req_stream:
req_stream.out.close()

View File

@ -38,6 +38,12 @@ class BaseContentRewriter(object):
if rule:
self.rules.append(rule)
self.accept_filter_prefixes = config.get('accept_filter_prefixes', [])
print(self.accept_filter_prefixes)
def allow_accept_filter(self, url):
    """Return True if ``url`` starts with any configured accept-filter prefix."""
    # str.startswith accepts a tuple and tests each prefix in turn
    return url.startswith(tuple(self.accept_filter_prefixes))
def parse_rewrite_rule(self, config):
rw_config = config.get('rewrite')
if not rw_config:

View File

@ -90,6 +90,16 @@ class RewriteInputRequest(DirectWSGIInputRequest):
return headers
def get_accept_filter(self, url):
    """Build a ``|``-joined mime filter from the request's Accept header.

    Returns ``None`` when the rewriter does not allow accept filtering for
    ``url`` or when no Accept header was sent. Otherwise the header is
    split on ``', '`` and every entry except the last is joined with
    ``|`` (so a single-entry header yields an empty string).
    """
    if self.rewriter.allow_accept_filter(url):
        accept = self.env.get('HTTP_ACCEPT')
        if accept:
            mime_types = accept.split(', ')
            # the final entry is left out of the filter
            return '|'.join(mime_types[:-1])

    return None
def extract_range(self):
use_206 = False
start = None

View File

@ -43,6 +43,10 @@ default_filters:
- match: '[?&](\w*(bust|ts)\w*=1[\d]{12,15})(?=&|$)'
replace: ''
accept_filter_prefixes:
- https://twitter.com/
rules:
# twitter rules
@ -60,7 +64,7 @@ rules:
- url_prefix: 'com,twitter)/i/videos/tweet'
fuzzy_lookup: '()'
# facebook rules
#=================================================================

View File

@ -23,6 +23,14 @@ function ContentFrame(content_info) {
this.last_url = content_info.url;
this.last_ts = content_info.request_ts;
window.wr_history = {
"base_url": content_info.url,
"base_timestamp": content_info.timestamp,
"states": [],
"init_state": null,
};
this.init_iframe = function() {
if (typeof(content_info.iframe) === "string") {
this.iframe = document.querySelector(content_info.iframe);
@ -111,13 +119,14 @@ function ContentFrame(content_info) {
var type = state.wb_type;
if (type == "load" || type == "replace-url") {
this.set_url(state);
this.set_url(state, type);
this.saveHistory(state);
} else if (type == "hashchange") {
this.inner_hash_changed(state);
}
}
this.set_url = function(state) {
this.set_url = function(state, type) {
if (state.url && (state.url != this.last_url || state.request_ts != this.last_ts)) {
var new_url = this.make_url(state.url, state.request_ts, false);
@ -173,4 +182,55 @@ function ContentFrame(content_info) {
this.pm_source = win;
return this;
}
this.saveHistory = function(message) {
    // Mirror history changes reported by the content frame into
    // window.wr_history and POST the result to /_/add_waypoint.
    // Messages not flagged is_live are ignored.
    if (!message.is_live) {
        return;
    }

    var hist = window.wr_history;
    var event = [message.state, message.title, message.url];

    if (message.wb_type == "load") {
        // a repeated load of the same base page carries nothing new
        if (hist.base_timestamp == message.base_ts &&
            hist.base_url == message.base_url) {
            return;
        }

        // new base page: start tracking from scratch
        hist.base_timestamp = message.base_ts;
        hist.base_url = message.base_url;
        hist.states = [];
        hist.init_state = event;
    }

    if (message.change_type == "popState") {
        // going back drops the latest state; nothing is sent
        hist.states.pop();
        return;
    }

    if (message.change_type == "replaceState") {
        if (message.url == hist.base_url) {
            return;
        }

        if (hist.states.length == 0) {
            // no pushed states yet, so this replaces the initial state
            hist.init_state = event;
        } else {
            hist.states[hist.states.length - 1] = event;
        }
        hist.final_url = message.url;
    } else if (message.change_type == "pushState") {
        hist.states.push(event);
        hist.final_url = message.url;
    }

    var data = JSON.stringify(hist);
    console.log(data);

    var xhr = new XMLHttpRequest();
    xhr.addEventListener("load", function(res) { console.log(xhr.responseText); });
    // escape the collection name so characters such as '&', '#' or spaces
    // cannot corrupt the query string
    xhr.open("POST", "/_/add_waypoint?coll=" + encodeURIComponent(message.coll));
    xhr.setRequestHeader('Content-type','application/json; charset=utf-8');
    xhr.send(data);
}
}

View File

@ -23,7 +23,6 @@ This file is part of pywb, https://github.com/webrecorder/pywb
var _WBWombat = function($wbwindow, wbinfo) {
// associative array for func->handler for message and storage events
function FuncMap() {
this._arr = [];
@ -794,14 +793,19 @@ var _WBWombat = function($wbwindow, wbinfo) {
}
//============================================
function send_history_update(url, title) {
function send_history_update(state, type, url, title) {
var message = {
"base_url": wb_info.url,
"base_ts": wb_info.timestamp,
"url": url,
"ts": wb_info.timestamp,
"ts": Date.__WB_ts_now(),
"request_ts": wb_info.request_ts,
"is_live": wb_info.is_live,
"title": title,
"wb_type": "replace-url",
"state": state,
"change_type": type,
"coll": wb_info.coll,
}
send_top_message(message);
@ -813,7 +817,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
override_history_func("replaceState");
$wbwindow.addEventListener("popstate", function(event) {
send_history_update($wbwindow.WB_wombat_location.href, $wbwindow.document.title);
send_history_update(event.state, "popState", $wbwindow.WB_wombat_location.href, $wbwindow.document.title);
});
}
@ -843,7 +847,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
if (url && (url != $wbwindow.WB_wombat_location.origin && $wbwindow.WB_wombat_location.href != "about:blank") &&
!starts_with(url, $wbwindow.WB_wombat_location.origin + "/")) {
throw new DOMException("Invalid history change: " + url);
//throw new DOMException("Invalid history change: " + url);
}
} else {
url = $wbwindow.WB_wombat_location.href;
@ -851,7 +855,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
orig_func.call(this, state_obj, title, rewritten_url);
send_history_update(url, title);
send_history_update(state_obj, func_name, url, title);
}
$wbwindow.history[func_name] = rewritten_func;
@ -1220,6 +1224,11 @@ var _WBWombat = function($wbwindow, wbinfo) {
$wbwindow.Date.__WB_timediff = timediff;
$wbwindow.Date.__WB_ts_now = function(precision) {
    // Current time as the digits of its ISO string (YYYYMMDDhhmmss...),
    // cut to `precision` characters; defaults to 14.
    var digits = new $wbwindow.Date().toISOString().replace(/[^\d]/g, "");
    return digits.substring(0, precision || 14);
}
Object.defineProperty($wbwindow.Date.prototype, "constructor", {value: $wbwindow.Date});
}
@ -2468,8 +2477,6 @@ var _WBWombat = function($wbwindow, wbinfo) {
orig_func_to_string.apply = orig_apply;
}
//============================================
function init_open_override()
{
@ -3047,7 +3054,8 @@ var _WBWombat = function($wbwindow, wbinfo) {
}
},
has: function(target, prop) {
return prop in $wbwindow;
return Reflect.has(target, prop) || Reflect.has($wbwindow, prop);
//return prop in $wbwindow;
},
ownKeys: function(target) {
return Object.getOwnPropertyNames($wbwindow).concat(Object.getOwnPropertySymbols($wbwindow));
@ -3055,7 +3063,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
getOwnPropertyDescriptor: function(target, key) {
// first try the underlying object's descriptor
// (to match defineProperty() behavior)
var descriptor = Object.getOwnPropertyDescriptor(target, key);
var descriptor = Object.getOwnPropertyDescriptor(target, key);
if (!descriptor) {
descriptor = Object.getOwnPropertyDescriptor($wbwindow, key);
// if using window's descriptor, must ensure it's configurable
@ -3137,6 +3145,65 @@ var _WBWombat = function($wbwindow, wbinfo) {
// End Proxy Obj Override System
// Re-apply recorded history states (wbinfo.history_state) on replay.
// Does nothing when no history state was provided or when $wbwindow is
// not the top replay window. The initial state is replaced immediately;
// the recorded states are pushed 500ms after the window's load event.
function init_history_replay($wbwindow, wbinfo) {
    var history_state = wbinfo.history_state;

    if (!history_state) {
        return;
    }

    if ($wbwindow.__WB_replay_top != $wbwindow) {
        return;
    }

    // replace initial state
    if (!history_state.init_state) {
        history_state.init_state = [history_state.base_state || $wbwindow.history.state,
                                    $wbwindow.document.title,
                                    history_state.base_url];
    }

    // rewrite the state's url into this collection's replay url
    history_state.init_state[2] = wbinfo.prefix + history_state.base_timestamp + wbinfo.mod + "/" + history_state.init_state[2];

    $wbwindow.history.replaceState.apply($wbwindow.history, history_state.init_state);

    var replayed = false;

    $wbwindow.addEventListener("load", function() {
        setTimeout(replay, 500);
    });

    // Push every recorded state once, then notify the page via popstate.
    function replay() {
        if (replayed) {
            return;
        }

        // check the replayed window's own document, not the global one
        if ($wbwindow.document.readyState != "complete") {
            return;
        }

        replayed = true;

        var states = history_state.states;

        // nothing recorded: indexing the last element would throw
        if (!states || !states.length) {
            return;
        }

        var last = states[states.length - 1];
        var lastState = last[0];

        // already at the final state or url: nothing to replay
        if ($wbwindow.history.state == lastState) {
            return;
        }

        if ($wbwindow.WB_wombat_location.href == last[2]) {
            return;
        }

        for (var i = 0; i < states.length; i++) {
            $wbwindow.history.pushState.apply($wbwindow.history, states[i]);
        }

        $wbwindow.dispatchEvent(new PopStateEvent('popstate', { state: lastState}));
    }
}
//============================================
function wombat_init(wbinfo) {
init_paths(wbinfo);
@ -3145,6 +3212,8 @@ var _WBWombat = function($wbwindow, wbinfo) {
init_wombat_loc($wbwindow);
init_history_replay($wbwindow, wbinfo);
// archival mode: init url-rewriting intercepts
if (!wb_is_proxy) {
init_wombat_top($wbwindow);
@ -3267,6 +3336,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
init_beacon_override();
}
// other overrides
// proxy mode: only using these overrides
@ -3321,7 +3391,6 @@ var _WBWombat = function($wbwindow, wbinfo) {
function notify_top(event) {
if (!$wbwindow.__WB_top_frame) {
var hash = $wbwindow.location.hash;
//var loc = window.location.href.replace(window.location.hash, "");
//loc = decodeURI(loc);
@ -3333,6 +3402,10 @@ var _WBWombat = function($wbwindow, wbinfo) {
return;
}
if ($wbwindow != $wbwindow.__WB_replay_top) {
return;
}
if (!$wbwindow.WB_wombat_location) {
return;
}
@ -3344,13 +3417,16 @@ var _WBWombat = function($wbwindow, wbinfo) {
}
var message = {
"base_url": wbinfo.url,
"base_ts": wbinfo.timestamp,
"url": $wbwindow.WB_wombat_location.href,
"ts": wbinfo.timestamp,
"ts": Date.__WB_ts_now(),
"request_ts": wbinfo.request_ts,
"is_live": wbinfo.is_live,
"title": $wbwindow.document ? $wbwindow.document.title : "",
"readyState": $wbwindow.document.readyState,
"wb_type": "load"
"coll": wbinfo.coll,
}
send_top_message(message);

View File

@ -15,6 +15,8 @@
wbinfo.proxy_magic = "{{ env.pywb_proxy_magic }}";
wbinfo.static_prefix = "{{ host_prefix }}/{{ static_path }}/";
wbinfo.history_state = {{ history_state }};
{% if not wb_url.is_banner_only %}
wbinfo.wombat_ts = "{{ wombat_ts }}";
wbinfo.wombat_sec = "{{ wombat_sec }}";