From 781b2aa3933d43e1bbfb435157b94a421a7064da Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 4 Nov 2017 17:37:46 -0700 Subject: [PATCH] record-history experiments via custom metadata record, head_insert (wip) - check last url instead of last state, which may be empty - add init_state tracking - add accept filtering, enabled via special rule --- pywb/apps/frontendapp.py | 75 +++++++++++++++++++++++ pywb/apps/rewriterapp.py | 47 +++++++++++++-- pywb/recorder/recorderapp.py | 6 +- pywb/rewrite/content_rewriter.py | 6 ++ pywb/rewrite/rewriteinputreq.py | 10 ++++ pywb/rules.yaml | 6 +- pywb/static/wb_frame.js | 64 +++++++++++++++++++- pywb/static/wombat.js | 100 +++++++++++++++++++++++++++---- pywb/templates/head_insert.html | 2 + 9 files changed, 296 insertions(+), 20 deletions(-) diff --git a/pywb/apps/frontendapp.py b/pywb/apps/frontendapp.py index 2defe115..47bda7b4 100644 --- a/pywb/apps/frontendapp.py +++ b/pywb/apps/frontendapp.py @@ -6,6 +6,10 @@ from werkzeug.exceptions import HTTPException, NotFound from werkzeug.wsgi import pop_path_info from six.moves.urllib.parse import urljoin from six import iteritems +from io import BytesIO + +from warcio.timeutils import timestamp_to_iso_date, timestamp_now +from warcio.timeutils import sec_to_timestamp, timestamp_to_sec from warcio.utils import to_native_str from wsgiprox.wsgiprox import WSGIProxMiddleware @@ -16,6 +20,7 @@ from pywb.recorder.recorderapp import RecorderApp from pywb.utils.loaders import load_yaml_config from pywb.utils.geventserver import GeventServer from pywb.utils.io import StreamIter +from pywb.utils.format import query_to_dict from pywb.warcserver.warcserver import WarcServer @@ -29,6 +34,7 @@ import os import traceback import requests import logging +import json # ============================================================================ @@ -102,6 +108,73 @@ class FrontEndApp(object): self.url_map.add(Rule(coll_prefix + '/', endpoint=self.serve_content)) + self.url_map.add(Rule('/_/add_waypoint', endpoint=self.serve_add_history)) + + def serve_add_history(self, environ): + if environ.get('REQUEST_METHOD') != 'POST': + res = {'error_message': 'POST required'} + else: + try: + res = self.add_history(environ) + except Exception as e: + res = {'error_message': str(e)} + + return WbResponse.json_response(res) + + def add_history(self, environ): + if not self.recorder_path: + return {'error': 'not recording'} + + params = query_to_dict(environ.get('QUERY_STRING')) + if 'coll' not in params: + return {'error': 'collection required'} + + upstream_url = self.recorder_path + '&put_record=metadata' + + hist_data = environ['wsgi.input'].read() + + hist_json = json.loads(hist_data.decode('utf-8')) + + if not hist_json.get('states'): + return {} + + base_url = hist_json.get('base_url') + + final_url = hist_json.get('final_url') + if not final_url: + final_url = base_url + + upstream_url = upstream_url.format(url=final_url, + coll=params['coll'].strip()) + + now = timestamp_now() + if now == hist_json['base_timestamp']: + now = sec_to_timestamp(timestamp_to_sec() + 1) + + headers = {'Content-Type': 'application/vnd.pywb-waypoint+json; charset=utf-8', + 'WARC-Refers-To-Target-URI': hist_json['base_url'], + 'WARC-Refers-To-Date': timestamp_to_iso_date(hist_json['base_timestamp']), + 'WARC-Profile': 'history', + 'WARC-Target-URI': final_url, + 'WARC-Date': timestamp_to_iso_date(now) + } + + r = requests.put(upstream_url, + data=BytesIO(hist_data), + headers=headers, + ) + + try: + r.raise_for_status() + res = r.json() + + assert(res['success'] == 'true') + return {} + + except Exception as e: + print(e) + return {'error_message': 'history save failed'} + def get_upstream_paths(self, port): base_paths = { 'replay': self.REPLAY_API % port, @@ -269,6 +342,8 @@ class FrontEndApp(object): if timemap_output: metadata['output'] = timemap_output + environ['pywb.template_params'] = {'coll': coll} + try: response = self.rewriterapp.render_content(wb_url_str, metadata, environ) except UpstreamException as ue: diff --git a/pywb/apps/rewriterapp.py b/pywb/apps/rewriterapp.py index b23e948e..1d3ad5e1 100644 --- a/pywb/apps/rewriterapp.py +++ b/pywb/apps/rewriterapp.py @@ -14,7 +14,7 @@ from pywb.utils.loaders import extract_client_cookie from pywb.utils.io import BUFF_SIZE, OffsetLimitReader from pywb.utils.memento import MementoUtils -from warcio.timeutils import http_date_to_timestamp, timestamp_to_http_date +from warcio.timeutils import http_date_to_timestamp, timestamp_to_http_date, iso_date_to_timestamp from warcio.bufferedreaders import BufferedReader from warcio.recordloader import ArcWarcRecordLoader @@ -24,7 +24,7 @@ from pywb.apps.wbrequestresponse import WbResponse from pywb.rewrite.rewriteinputreq import RewriteInputRequest from pywb.rewrite.templateview import JinjaEnv, HeadInsertView, TopFrameView, BaseInsertView - +import re from io import BytesIO from copy import copy @@ -209,6 +209,7 @@ class RewriterApp(object): full_prefix = host_prefix + rel_prefix is_proxy = ('wsgiprox.proxy_host' in environ) + is_ajax = self.is_ajax(environ) response = self.handle_custom_response(environ, wb_url, full_prefix, host_prefix, @@ -264,6 +265,12 @@ class RewriterApp(object): res = self.cookie_tracker.get_cookie_headers(wb_url.url, urlrewriter, cookie_key) inputreq.extra_cookie, setcookie_headers = res + if is_ajax and kwargs.get('type') != 'record': + accept_filter = inputreq.get_accept_filter(wb_url.url) + if accept_filter: + kwargs['filter'] = '~mime:' + accept_filter + kwargs['matchType'] = 'prefix' + r = self._do_req(inputreq, wb_url, kwargs, skip_record) if r.status_code >= 400: @@ -323,6 +330,8 @@ class RewriterApp(object): return resp + record, history_state = self._resolve_history(record, inputreq, kwargs, skip_record) + self._add_custom_params(cdx, r.headers, kwargs) if self._add_range(record, wb_url, range_start, range_end): @@ -342,7 +351,8 @@ class RewriterApp(object): top_url, environ, framed_replay, - config=self.config)) + config=self.config, + history_state=history_state)) cookie_rewriter = None if self.cookie_tracker: @@ -378,6 +388,31 @@ class RewriterApp(object): return response + def _resolve_history(self, record, inputreq, kwargs, skip_record): + history_state = None + while True: + if record.rec_type != 'metadata' or record.rec_headers.get('WARC-Profile') != 'history': + break + + stream = record.content_stream() + try: + if not history_state: + history_state = stream.read().decode('utf-8') + finally: + stream.close() + + orig_wb_url = WbUrl(record.rec_headers.get('WARC-Refers-To-Target-URI')) + orig_wb_url.timestamp = iso_date_to_timestamp(record.rec_headers.get('WARC-Refers-To-Date')) + orig_wb_url.type = orig_wb_url.REPLAY + kwargs['filter'] = '!status:302' + new_r = self._do_req(inputreq, orig_wb_url, kwargs, skip_record) + + stream = BufferedReader(new_r.raw, block_size=BUFF_SIZE) + record = self.loader.parse_record_stream(stream, + ensure_http_headers=True) + + return record, (history_state or 'undefined') + def format_response(self, response, wb_url, full_prefix, is_timegate, is_proxy): memento_ts = None if not isinstance(response, WbResponse): @@ -488,13 +523,17 @@ class RewriterApp(object): params = {} params['url'] = wb_url.url params['closest'] = closest - params['matchType'] = 'exact' + params['matchType'] = kwargs.get('matchType', 'exact') if wb_url.mod == 'vi_': params['content_type'] = self.VIDEO_INFO_CONTENT_TYPE upstream_url = self.get_upstream_url(wb_url, kwargs, params) + if 'filter' in kwargs: + upstream_url += '&filter=' + kwargs['filter'] + + r = requests.post(upstream_url, data=BytesIO(req_data), headers=headers, diff --git a/pywb/recorder/recorderapp.py b/pywb/recorder/recorderapp.py index 1eb35ffe..8681897e 100644 --- a/pywb/recorder/recorderapp.py +++ b/pywb/recorder/recorderapp.py @@ -141,7 +141,7 @@ class RecorderApp(object): payload_length = req_stream.out.tell() req_stream.out.seek(0) - record = self.writer.create_warc_record(uri=params['url'], + record = self.writer.create_warc_record(uri=params.get('url', ''), record_type=record_type, payload=req_stream.out, length=payload_length, @@ -153,6 +153,10 @@ class RecorderApp(object): msg = {'success': 'true', 'WARC-Date': record.rec_headers.get_header('WARC-Date')} + except: + import traceback + traceback.print_exc() + finally: if req_stream: req_stream.out.close() diff --git a/pywb/rewrite/content_rewriter.py b/pywb/rewrite/content_rewriter.py index 295e11e4..d7ff117f 100644 --- a/pywb/rewrite/content_rewriter.py +++ b/pywb/rewrite/content_rewriter.py @@ -38,6 +38,12 @@ class BaseContentRewriter(object): if rule: self.rules.append(rule) + self.accept_filter_prefixes = config.get('accept_filter_prefixes', []) + print(self.accept_filter_prefixes) + + def allow_accept_filter(self, url): + return any(url.startswith(prefix) for prefix in self.accept_filter_prefixes) + def parse_rewrite_rule(self, config): rw_config = config.get('rewrite') if not rw_config: diff --git a/pywb/rewrite/rewriteinputreq.py b/pywb/rewrite/rewriteinputreq.py index 21efb1f5..59813e2f 100644 --- a/pywb/rewrite/rewriteinputreq.py +++ b/pywb/rewrite/rewriteinputreq.py @@ -90,6 +90,16 @@ class RewriteInputRequest(DirectWSGIInputRequest): return headers + def get_accept_filter(self, url): + if not self.rewriter.allow_accept_filter(url): + return + + accept = self.env.get('HTTP_ACCEPT') + if not accept: + return + + return '|'.join(accept.split(', ')[:-1]) + def extract_range(self): use_206 = False start = None diff --git a/pywb/rules.yaml b/pywb/rules.yaml index 0c46919d..0dd181b0 100644 --- a/pywb/rules.yaml +++ b/pywb/rules.yaml @@ -43,6 +43,10 @@ default_filters: - match: '[?&](\w*(bust|ts)\w*=1[\d]{12,15})(?=&|$)' replace: '' +accept_filter_prefixes: + - https://twitter.com/ + + rules: # twitter rules @@ -60,7 +64,7 @@ rules: - url_prefix: 'com,twitter)/i/videos/tweet' fuzzy_lookup: '()' - + # facebook rules #================================================================= diff --git a/pywb/static/wb_frame.js b/pywb/static/wb_frame.js index d743e658..066d4b5b 100644 --- a/pywb/static/wb_frame.js +++ b/pywb/static/wb_frame.js @@ -23,6 +23,14 @@ function ContentFrame(content_info) { this.last_url = content_info.url; this.last_ts = content_info.request_ts; + window.wr_history = { + "base_url": content_info.url, + "base_timestamp": content_info.timestamp, + "states": [], + "init_state": null, + }; + + this.init_iframe = function() { if (typeof(content_info.iframe) === "string") { this.iframe = document.querySelector(content_info.iframe); @@ -111,13 +119,14 @@ function ContentFrame(content_info) { var type = state.wb_type; if (type == "load" || type == "replace-url") { - this.set_url(state); + this.set_url(state, type); + this.saveHistory(state); } else if (type == "hashchange") { this.inner_hash_changed(state); } } - this.set_url = function(state) { + this.set_url = function(state, type) { if (state.url && (state.url != this.last_url || state.request_ts != this.last_ts)) { var new_url = this.make_url(state.url, state.request_ts, false); @@ -173,4 +182,55 @@ function ContentFrame(content_info) { this.pm_source = win; return this; } + + this.saveHistory = function(message) { + if (!message.is_live) { + return; + } + + var event = [message.state, message.title, message.url]; + + if (message.wb_type == "load") { + if (window.wr_history.base_timestamp == message.base_ts && + window.wr_history.base_url == message.base_url) { + return; + } + + window.wr_history.base_timestamp = message.base_ts; + window.wr_history.base_url = message.base_url; + window.wr_history.states = []; + window.wr_history.init_state = event; + } + + if (message.change_type == "popState") { + window.wr_history.states.pop(); + return; + } + + if (message.change_type == "pushState" || message.change_type == "replaceState") { + if (message.change_type == "replaceState") { + if (message.url == window.wr_history.base_url) { + return; + } + + if (window.wr_history.states.length == 0) { + window.wr_history.init_state = event; + } else { + window.wr_history.states[window.wr_history.states.length - 1] = event; + } + } else { + window.wr_history.states.push(event); + } + window.wr_history.final_url = message.url; + } + + var data = JSON.stringify(window.wr_history); + console.log(data); + + var xhr = new XMLHttpRequest(); + xhr.addEventListener("load", function(res) { console.log(xhr.responseText); }); + xhr.open("POST", "/_/add_waypoint?coll=" + message.coll); + xhr.setRequestHeader('Content-type','application/json; charset=utf-8'); + xhr.send(data); + } } diff --git a/pywb/static/wombat.js b/pywb/static/wombat.js index 511d4253..ae34a055 100644 --- a/pywb/static/wombat.js +++ b/pywb/static/wombat.js @@ -23,7 +23,6 @@ This file is part of pywb, https://github.com/webrecorder/pywb var _WBWombat = function($wbwindow, wbinfo) { - // associative array for func->handler for message and storage events function FuncMap() { this._arr = []; @@ -794,14 +793,19 @@ var _WBWombat = function($wbwindow, wbinfo) { } //============================================ - function send_history_update(url, title) { + function send_history_update(state, type, url, title) { var message = { + "base_url": wb_info.url, + "base_ts": wb_info.timestamp, "url": url, - "ts": wb_info.timestamp, + "ts": Date.__WB_ts_now(), "request_ts": wb_info.request_ts, "is_live": wb_info.is_live, "title": title, "wb_type": "replace-url", + "state": state, + "change_type": type, + "coll": wb_info.coll, } send_top_message(message); @@ -813,7 +817,7 @@ var _WBWombat = function($wbwindow, wbinfo) { override_history_func("replaceState"); $wbwindow.addEventListener("popstate", function(event) { - send_history_update($wbwindow.WB_wombat_location.href, $wbwindow.document.title); + send_history_update(event.state, "popState", $wbwindow.WB_wombat_location.href, $wbwindow.document.title); }); } @@ -843,7 +847,7 @@ var _WBWombat = function($wbwindow, wbinfo) { if (url && (url != $wbwindow.WB_wombat_location.origin && $wbwindow.WB_wombat_location.href != "about:blank") && !starts_with(url, $wbwindow.WB_wombat_location.origin + "/")) { - throw new DOMException("Invalid history change: " + url); + //throw new DOMException("Invalid history change: " + url); } } else { url = $wbwindow.WB_wombat_location.href; @@ -851,7 +855,7 @@ var _WBWombat = function($wbwindow, wbinfo) { orig_func.call(this, state_obj, title, rewritten_url); - send_history_update(url, title); + send_history_update(state_obj, func_name, url, title); } $wbwindow.history[func_name] = rewritten_func; @@ -1220,6 +1224,11 @@ var _WBWombat = function($wbwindow, wbinfo) { $wbwindow.Date.__WB_timediff = timediff; + $wbwindow.Date.__WB_ts_now = function(precision) { + precision = precision || 14; + return new $wbwindow.Date().toISOString().replace(/[^\d]/g, "").substr(0, precision); + } + Object.defineProperty($wbwindow.Date.prototype, "constructor", {value: $wbwindow.Date}); } @@ -2468,8 +2477,6 @@ var _WBWombat = function($wbwindow, wbinfo) { orig_func_to_string.apply = orig_apply; } - - //============================================ function init_open_override() { @@ -3047,7 +3054,8 @@ var _WBWombat = function($wbwindow, wbinfo) { } }, has: function(target, prop) { - return prop in $wbwindow; + return Reflect.has(target, prop) || Reflect.has($wbwindow, prop); + //return prop in $wbwindow; }, ownKeys: function(target) { return Object.getOwnPropertyNames($wbwindow).concat(Object.getOwnPropertySymbols($wbwindow)); @@ -3055,7 +3063,7 @@ var _WBWombat = function($wbwindow, wbinfo) { getOwnPropertyDescriptor: function(target, key) { // first try the underlying object's descriptor // (to match defineProperty() behavior) - var descriptor = Object.getOwnPropertyDescriptor(target, key); + var descriptor = Object.getOwnPropertyDescriptor(target, key); if (!descriptor) { descriptor = Object.getOwnPropertyDescriptor($wbwindow, key); // if using window's descriptor, must ensure it's configurable @@ -3137,6 +3145,65 @@ var _WBWombat = function($wbwindow, wbinfo) { // End Proxy Obj Override System + function init_history_replay($wbwindow, wbinfo) { + if (!wbinfo.history_state) { + return; + } + + if ($wbwindow.__WB_replay_top != $wbwindow) { + return; + } + + // replace initial state + if (!wbinfo.history_state.init_state) { + wbinfo.history_state.init_state = [wbinfo.history_state.base_state || $wbwindow.history.state, + $wbwindow.document.title, + wbinfo.history_state.base_url]; + } + + wbinfo.history_state.init_state[2] = wbinfo.prefix + wbinfo.history_state.base_timestamp + wbinfo.mod + "/" + wbinfo.history_state.init_state[2]; + + $wbwindow.history.replaceState.apply($wbwindow.history, wbinfo.history_state.init_state); + + var replayed = false; + + $wbwindow.addEventListener("load", function() { + setTimeout(replay, 500); + }); + + function replay() { + if (replayed) { + return; + } + + if (document.readyState != "complete") { + return; + } + + replayed = true; + + var states = wbinfo.history_state.states; + var lastState = states[states.length - 1][0]; + + if ($wbwindow.history.state == lastState) { + return; + } + + if ($wbwindow.WB_wombat_location.href == states[states.length - 1][2]) { + return; + } + + for (var i = 0; i < states.length; i++) { + //if (states[i][2] == wbinfo.history_state.base_url) { + // continue; + //} + $wbwindow.history.pushState.apply($wbwindow.history, states[i]); + } + + $wbwindow.dispatchEvent(new PopStateEvent('popstate', { state: lastState})); + }; + } + //============================================ function wombat_init(wbinfo) { init_paths(wbinfo); @@ -3145,6 +3212,8 @@ var _WBWombat = function($wbwindow, wbinfo) { init_wombat_loc($wbwindow); + init_history_replay($wbwindow, wbinfo); + // archival mode: init url-rewriting intercepts if (!wb_is_proxy) { init_wombat_top($wbwindow); @@ -3267,6 +3336,7 @@ var _WBWombat = function($wbwindow, wbinfo) { init_beacon_override(); } + // other overrides // proxy mode: only using these overrides @@ -3321,7 +3391,6 @@ var _WBWombat = function($wbwindow, wbinfo) { function notify_top(event) { if (!$wbwindow.__WB_top_frame) { var hash = $wbwindow.location.hash; - //var loc = window.location.href.replace(window.location.hash, ""); //loc = decodeURI(loc); @@ -3333,6 +3402,10 @@ var _WBWombat = function($wbwindow, wbinfo) { return; } + if ($wbwindow != $wbwindow.__WB_replay_top) { + return; + } + if (!$wbwindow.WB_wombat_location) { return; } @@ -3344,13 +3417,16 @@ var _WBWombat = function($wbwindow, wbinfo) { } var message = { + "base_url": wbinfo.url, + "base_ts": wbinfo.timestamp, "url": $wbwindow.WB_wombat_location.href, - "ts": wbinfo.timestamp, + "ts": Date.__WB_ts_now(), "request_ts": wbinfo.request_ts, "is_live": wbinfo.is_live, "title": $wbwindow.document ? $wbwindow.document.title : "", "readyState": $wbwindow.document.readyState, "wb_type": "load" + "coll": wbinfo.coll, } send_top_message(message); diff --git a/pywb/templates/head_insert.html b/pywb/templates/head_insert.html index 5aa84b12..b3e84e88 100644 --- a/pywb/templates/head_insert.html +++ b/pywb/templates/head_insert.html @@ -15,6 +15,8 @@ wbinfo.proxy_magic = "{{ env.pywb_proxy_magic }}"; wbinfo.static_prefix = "{{ host_prefix }}/{{ static_path }}/"; + wbinfo.history_state = {{ history_state }}; + {% if not wb_url.is_banner_only %} wbinfo.wombat_ts = "{{ wombat_ts }}"; wbinfo.wombat_sec = "{{ wombat_sec }}";