mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
rewrite: wombat update 2.0! Using Object.defineProperty() to better
override .href and .hash properties when possible. .href returns original url, but on assignment rewrites before redirecting .hash proxies to location.hash Also added: - window.top -> window.WB_wombat_top - document.referrer -> document.WB_wombat_referrer - <source> html tag rewriting
This commit is contained in:
parent
e011da43f2
commit
23bb5bd175
@ -45,6 +45,7 @@ class HTMLRewriterMixin(object):
|
||||
'q': {'cite': defmod},
|
||||
'ref': {'href': 'oe_'},
|
||||
'script': {'src': 'js_'},
|
||||
'source': {'src': 'oe_'},
|
||||
'div': {'data-src': defmod,
|
||||
'data-uri': defmod},
|
||||
'li': {'data-src': defmod,
|
||||
|
@ -126,6 +126,8 @@ class JSLinkAndLocationRewriter(JSLinkOnlyRewriter):
|
||||
rules = rules + [
|
||||
(r'(?<!/)\blocation\b', RegexRewriter.add_prefix(prefix), 0),
|
||||
(r'(?<=document\.)domain', RegexRewriter.add_prefix(prefix), 0),
|
||||
(r'(?<=document\.)referrer', RegexRewriter.add_prefix(prefix), 0),
|
||||
(r'(?<=window\.)top', RegexRewriter.add_prefix(prefix), 0),
|
||||
]
|
||||
#import sys
|
||||
#sys.stderr.write('\n\n*** RULES:' + str(rules) + '\n\n')
|
||||
|
@ -21,6 +21,8 @@ class LiveRewriter(object):
|
||||
('HTTP_ACCEPT', 'Accept'),
|
||||
('HTTP_ACCEPT_LANGUAGE', 'Accept-Language'),
|
||||
('HTTP_ACCEPT_CHARSET', 'Accept-Charset'),
|
||||
('HTTP_ACCEPT_ENCODING', 'Accept-Encoding'),
|
||||
('HTTP_RANGE', 'Range'),
|
||||
('REL_REFERER', 'Referer'),
|
||||
]
|
||||
|
||||
@ -79,6 +81,7 @@ class LiveRewriter(object):
|
||||
stream=True,
|
||||
verify=False)
|
||||
|
||||
|
||||
statusline = str(response.status_code) + ' ' + response.reason
|
||||
|
||||
headers = response.headers.items()
|
||||
|
@ -18,7 +18,7 @@ This file is part of pywb.
|
||||
*/
|
||||
|
||||
//============================================
|
||||
// Wombat JS-Rewriting Library
|
||||
// Wombat JS-Rewriting Library v2.0
|
||||
//============================================
|
||||
WB_wombat_init = (function() {
|
||||
|
||||
@ -52,14 +52,22 @@ WB_wombat_init = (function() {
|
||||
return false;
|
||||
}
|
||||
|
||||
function starts_with(string, prefix) {
|
||||
if (string.indexOf(prefix) == 0) {
|
||||
return prefix;
|
||||
} else {
|
||||
return undefined;
|
||||
//============================================
|
||||
function starts_with(string, arr_or_prefix) {
|
||||
if (arr_or_prefix instanceof Array) {
|
||||
for (var i = 0; i < arr_or_prefix.length; i++) {
|
||||
if (string.indexOf(arr_or_prefix[i]) == 0) {
|
||||
return arr_or_prefix[i];
|
||||
}
|
||||
}
|
||||
} else if (string.indexOf(arr_or_prefix) == 0) {
|
||||
return arr_or_prefix;
|
||||
}
|
||||
|
||||
return undefined;
|
||||
}
|
||||
|
||||
//============================================
|
||||
function ends_with(str, suffix) {
|
||||
if (str.indexOf(suffix, str.length - suffix.length) !== -1) {
|
||||
return suffix;
|
||||
@ -76,38 +84,47 @@ WB_wombat_init = (function() {
|
||||
if (url != rewritten) {
|
||||
console.log('REWRITE: ' + url + ' -> ' + rewritten);
|
||||
} else {
|
||||
//console.log('NOT REWRITTEN ' + url);
|
||||
console.log('NOT REWRITTEN ' + url);
|
||||
}
|
||||
return rewritten;
|
||||
}
|
||||
|
||||
//============================================
|
||||
var HTTP_PREFIX = "http://";
|
||||
var HTTPS_PREFIX = "https://";
|
||||
var REL_PREFIX = "//";
|
||||
|
||||
var VALID_PREFIXES = [HTTP_PREFIX, HTTPS_PREFIX, REL_PREFIX];
|
||||
var IGNORE_PREFIXES = ["#", "about:", "data:", "mailto:", "javascript:"];
|
||||
|
||||
|
||||
//============================================
|
||||
function rewrite_url_(url) {
|
||||
var http_prefix = "http://";
|
||||
var https_prefix = "https://";
|
||||
var rel_prefix = "//";
|
||||
|
||||
// If not dealing with a string, just return it
|
||||
if (!url || (typeof url) != "string") {
|
||||
return url;
|
||||
}
|
||||
|
||||
// ignore anchors
|
||||
if (starts_with(url, "#")) {
|
||||
// just in case wombat reference made it into url!
|
||||
url = url.replace("WB_wombat_", "");
|
||||
|
||||
// ignore anchors, about, data
|
||||
if (starts_with(url, IGNORE_PREFIXES)) {
|
||||
return url;
|
||||
}
|
||||
|
||||
// If starts with prefix, no rewriting needed
|
||||
// Only check replay prefix (no date) as date may be different for each
|
||||
// capture
|
||||
if (starts_with(url, wb_replay_prefix)) {
|
||||
if (starts_with(url, wb_replay_prefix) || starts_with(url, window.location.origin + wb_replay_prefix)) {
|
||||
return url;
|
||||
}
|
||||
|
||||
// If server relative url, add prefix and original host
|
||||
if (url.charAt(0) == "/" && !starts_with(url, rel_prefix)) {
|
||||
if (url.charAt(0) == "/" && !starts_with(url, REL_PREFIX)) {
|
||||
|
||||
// Already a relative url, don't make any changes!
|
||||
if (url.indexOf(wb_capture_date_part) >= 0) {
|
||||
if (wb_capture_date_part && url.indexOf(wb_capture_date_part) >= 0) {
|
||||
return url;
|
||||
}
|
||||
|
||||
@ -116,9 +133,7 @@ WB_wombat_init = (function() {
|
||||
|
||||
// If full url starting with http://, add prefix
|
||||
|
||||
var prefix = starts_with(url, http_prefix) ||
|
||||
starts_with(url, https_prefix) ||
|
||||
starts_with(url, rel_prefix);
|
||||
var prefix = starts_with(url, VALID_PREFIXES);
|
||||
|
||||
if (prefix) {
|
||||
if (starts_with(url, prefix + window.location.host + '/')) {
|
||||
@ -130,25 +145,12 @@ WB_wombat_init = (function() {
|
||||
// May or may not be a hostname, call function to determine
|
||||
// If it is, add the prefix and make sure port is removed
|
||||
if (is_host_url(url) && !starts_with(url, window.location.host + '/')) {
|
||||
return wb_replay_date_prefix + http_prefix + url;
|
||||
return wb_replay_date_prefix + HTTP_PREFIX + url;
|
||||
}
|
||||
|
||||
return url;
|
||||
}
|
||||
|
||||
//============================================
|
||||
function copy_object_fields(obj) {
|
||||
var new_obj = {};
|
||||
|
||||
for (prop in obj) {
|
||||
if ((typeof obj[prop]) != "function") {
|
||||
new_obj[prop] = obj[prop];
|
||||
}
|
||||
}
|
||||
|
||||
return new_obj;
|
||||
}
|
||||
|
||||
//============================================
|
||||
function extract_orig(href) {
|
||||
if (!href) {
|
||||
@ -162,6 +164,20 @@ WB_wombat_init = (function() {
|
||||
// extract original url from wburl
|
||||
if (index > 0) {
|
||||
href = href.substr(index + 1);
|
||||
} else {
|
||||
index = href.indexOf(wb_replay_prefix);
|
||||
if (index >= 0) {
|
||||
href = href.substr(index + wb_replay_prefix.length);
|
||||
}
|
||||
if ((href.length > 4) &&
|
||||
(href.charAt(2) == "_") &&
|
||||
(href.charAt(3) == "/")) {
|
||||
href = href.substr(4);
|
||||
}
|
||||
|
||||
if (!starts_with(href, "http")) {
|
||||
href = HTTP_PREFIX + href;
|
||||
}
|
||||
}
|
||||
|
||||
// remove trailing slash
|
||||
@ -171,55 +187,142 @@ WB_wombat_init = (function() {
|
||||
|
||||
return href;
|
||||
}
|
||||
|
||||
|
||||
//============================================
|
||||
function copy_location_obj(loc) {
|
||||
var new_loc = copy_object_fields(loc);
|
||||
|
||||
new_loc._orig_loc = loc;
|
||||
new_loc._orig_href = loc.href;
|
||||
// Define custom property
|
||||
function defProp(obj, prop, value, set_func, get_func) {
|
||||
var key = "_" + prop;
|
||||
obj[key] = value;
|
||||
|
||||
try {
|
||||
Object.defineProperty(obj, prop, {
|
||||
configurable: false,
|
||||
enumerable: true,
|
||||
set: function(newval) {
|
||||
var result = set_func.call(obj, newval);
|
||||
if (result != undefined) {
|
||||
obj[key] = result;
|
||||
}
|
||||
},
|
||||
get: function() {
|
||||
if (get_func) {
|
||||
return get_func.call(obj, obj[key]);
|
||||
} else {
|
||||
return obj[key];
|
||||
}
|
||||
}
|
||||
});
|
||||
return true;
|
||||
} catch (e) {
|
||||
console.log(e);
|
||||
obj[prop] = value;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
//============================================
|
||||
//Define WombatLocation
|
||||
|
||||
function WombatLocation(loc) {
|
||||
this._orig_loc = loc;
|
||||
this._orig_href = loc.href;
|
||||
|
||||
// Rewrite replace and assign functions
|
||||
new_loc.replace = function(url) {
|
||||
this._orig_loc.replace(rewrite_url(url));
|
||||
this.replace = function(url) {
|
||||
return this._orig_loc.replace(rewrite_url(url));
|
||||
}
|
||||
new_loc.assign = function(url) {
|
||||
this._orig_loc.assign(rewrite_url(url));
|
||||
this.assign = function(url) {
|
||||
return this._orig_loc.assign(rewrite_url(url));
|
||||
}
|
||||
new_loc.reload = loc.reload;
|
||||
|
||||
this.reload = loc.reload;
|
||||
|
||||
// Adapted from:
|
||||
// https://gist.github.com/jlong/2428561
|
||||
var parser = document.createElement('a');
|
||||
parser.href = extract_orig(new_loc._orig_href);
|
||||
var href = extract_orig(this._orig_href);
|
||||
parser.href = href;
|
||||
|
||||
//console.log(this._orig_href + " -> " + tmp_href);
|
||||
this._autooverride = false;
|
||||
|
||||
var _set_hash = function(hash) {
|
||||
this._orig_loc.hash = hash;
|
||||
return this._orig_loc.hash;
|
||||
}
|
||||
|
||||
var _get_hash = function() {
|
||||
return this._orig_loc.hash;
|
||||
}
|
||||
|
||||
var _get_url_with_hash = function(url) {
|
||||
return url + this._orig_loc.hash;
|
||||
}
|
||||
|
||||
href = parser.href;
|
||||
var hash = parser.hash;
|
||||
|
||||
if (hash) {
|
||||
var hidx = href.lastIndexOf("#");
|
||||
if (hidx > 0) {
|
||||
href = href.substring(0, hidx);
|
||||
}
|
||||
}
|
||||
|
||||
if (Object.defineProperty) {
|
||||
var res1 = defProp(this, "href", href,
|
||||
this.assign,
|
||||
_get_url_with_hash);
|
||||
|
||||
var res2 = defProp(this, "hash", parser.hash,
|
||||
_set_hash,
|
||||
_get_hash);
|
||||
|
||||
this._autooverride = res1 && res2;
|
||||
}
|
||||
|
||||
this.host = parser.host;
|
||||
this.hostname = parser.hostname;
|
||||
|
||||
new_loc.hash = parser.hash;
|
||||
new_loc.host = parser.host;
|
||||
new_loc.hostname = parser.hostname;
|
||||
new_loc.href = parser.href;
|
||||
|
||||
if (new_loc.origin) {
|
||||
new_loc.origin = parser.origin;
|
||||
if (parser.origin) {
|
||||
this.origin = parser.origin;
|
||||
}
|
||||
|
||||
new_loc.pathname = parser.pathname;
|
||||
new_loc.port = parser.port
|
||||
new_loc.protocol = parser.protocol;
|
||||
new_loc.search = parser.search;
|
||||
this.pathname = parser.pathname;
|
||||
this.port = parser.port
|
||||
this.protocol = parser.protocol;
|
||||
this.search = parser.search;
|
||||
|
||||
new_loc.toString = function() {
|
||||
this.toString = function() {
|
||||
return this.href;
|
||||
}
|
||||
|
||||
return new_loc;
|
||||
|
||||
// Copy any remaining properties
|
||||
for (prop in loc) {
|
||||
if (this.hasOwnProperty(prop)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if ((typeof loc[prop]) != "function") {
|
||||
this[prop] = loc[prop];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//============================================
|
||||
function update_location(req_href, orig_href, actual_location) {
|
||||
if (!req_href || req_href == orig_href) {
|
||||
function update_location(req_href, orig_href, actual_location, wombat_loc) {
|
||||
if (!req_href) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (req_href == orig_href) {
|
||||
// Reset wombat loc to the unrewritten version
|
||||
//if (wombat_loc) {
|
||||
// wombat_loc.href = extract_orig(orig_href);
|
||||
//}
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
ext_orig = extract_orig(orig_href);
|
||||
ext_req = extract_orig(req_href);
|
||||
|
||||
@ -235,19 +338,19 @@ WB_wombat_init = (function() {
|
||||
}
|
||||
|
||||
//============================================
|
||||
function check_location_change(loc, is_top) {
|
||||
var locType = (typeof loc);
|
||||
function check_location_change(wombat_loc, is_top) {
|
||||
var locType = (typeof wombat_loc);
|
||||
|
||||
var actual_location = (is_top ? window.top.location : window.location);
|
||||
|
||||
//console.log(loc.href);
|
||||
|
||||
// String has been assigned to location, so assign it
|
||||
if (locType == "string") {
|
||||
update_location(loc, actual_location.href, actual_location)
|
||||
|
||||
update_location(wombat_loc, actual_location.href, actual_location);
|
||||
|
||||
} else if (locType == "object") {
|
||||
update_location(loc.href, loc._orig_href, actual_location);
|
||||
update_location(wombat_loc.href,
|
||||
wombat_loc._orig_href,
|
||||
actual_location);
|
||||
}
|
||||
}
|
||||
|
||||
@ -261,10 +364,21 @@ WB_wombat_init = (function() {
|
||||
|
||||
check_location_change(window.WB_wombat_location, false);
|
||||
|
||||
if (window.self.location != window.top.location) {
|
||||
// Only check top if its a different window
|
||||
if (window.self.WB_wombat_location != window.top.WB_wombat_location) {
|
||||
check_location_change(window.top.WB_wombat_location, true);
|
||||
}
|
||||
|
||||
// lochash = window.WB_wombat_location.hash;
|
||||
//
|
||||
// if (lochash) {
|
||||
// window.location.hash = lochash;
|
||||
//
|
||||
// //if (window.top.update_wb_url) {
|
||||
// // window.top.location.hash = lochash;
|
||||
// //}
|
||||
// }
|
||||
|
||||
wb_wombat_updating = false;
|
||||
}
|
||||
|
||||
@ -328,6 +442,7 @@ WB_wombat_init = (function() {
|
||||
window.XMLHttpRequest.prototype.open = open_rewritten;
|
||||
}
|
||||
|
||||
//============================================
|
||||
function init_worker_override() {
|
||||
if (!window.Worker) {
|
||||
return;
|
||||
@ -338,6 +453,7 @@ WB_wombat_init = (function() {
|
||||
window.Worker = undefined;
|
||||
}
|
||||
|
||||
//============================================
|
||||
function rewrite_attr(elem, name) {
|
||||
if (!elem || !elem.getAttribute) {
|
||||
return;
|
||||
@ -359,6 +475,7 @@ WB_wombat_init = (function() {
|
||||
elem.setAttribute(name, value);
|
||||
}
|
||||
|
||||
//============================================
|
||||
function init_dom_override() {
|
||||
if (!Node || !Node.prototype) {
|
||||
return;
|
||||
@ -376,9 +493,9 @@ WB_wombat_init = (function() {
|
||||
var desc;
|
||||
|
||||
if (child instanceof DocumentFragment) {
|
||||
desc = child.querySelectorAll("*[href],*[src]");
|
||||
//desc = child.querySelectorAll("*[href],*[src]");
|
||||
} else if (child.getElementsByTagName) {
|
||||
desc = child.getElementsByTagName("*");
|
||||
//desc = child.getElementsByTagName("*");
|
||||
}
|
||||
|
||||
if (desc) {
|
||||
@ -401,19 +518,55 @@ WB_wombat_init = (function() {
|
||||
//============================================
|
||||
function wombat_init(replay_prefix, capture_date, orig_host, timestamp) {
|
||||
wb_replay_prefix = replay_prefix;
|
||||
wb_replay_date_prefix = replay_prefix + capture_date + "/";
|
||||
wb_capture_date_part = "/" + capture_date + "/";
|
||||
|
||||
wb_orig_host = "http://" + orig_host;
|
||||
wb_replay_date_prefix = replay_prefix + capture_date + "em_/";
|
||||
|
||||
if (capture_date.length > 0) {
|
||||
wb_capture_date_part = "/" + capture_date + "/";
|
||||
} else {
|
||||
wb_capture_date_part = "";
|
||||
}
|
||||
|
||||
wb_orig_host = HTTP_PREFIX + orig_host;
|
||||
|
||||
// Location
|
||||
window.WB_wombat_location = copy_location_obj(window.self.location);
|
||||
document.WB_wombat_location = window.WB_wombat_location;
|
||||
var wombat_location = new WombatLocation(window.self.location);
|
||||
|
||||
if (wombat_location._autooverride) {
|
||||
|
||||
var setter = function(val) {
|
||||
if (typeof(val) == "string") {
|
||||
if (starts_with(val, "about:")) {
|
||||
return undefined;
|
||||
}
|
||||
this._WB_wombat_location.href = val;
|
||||
}
|
||||
}
|
||||
|
||||
defProp(window, "WB_wombat_location", wombat_location, setter);
|
||||
defProp(document, "WB_wombat_location", wombat_location, setter);
|
||||
} else {
|
||||
// Check quickly after page load
|
||||
setTimeout(check_all_locations, 500);
|
||||
|
||||
// Check periodically every few seconds
|
||||
setInterval(check_all_locations, 500);
|
||||
}
|
||||
|
||||
var is_framed = (window.top.update_wb_url != undefined);
|
||||
|
||||
//if (window.self.location != window.top.location) {
|
||||
// window.top.WB_wombat_location = copy_location_obj(window.top.location);
|
||||
//}
|
||||
window.top.WB_wombat_location = window.WB_wombat_location;
|
||||
if (window.self.location != window.top.location) {
|
||||
if (is_framed) {
|
||||
window.top.WB_wombat_location = window.WB_wombat_location;
|
||||
window.WB_wombat_top = window.self;
|
||||
} else {
|
||||
window.top.WB_wombat_location = new WombatLocation(window.top.location);
|
||||
|
||||
window.WB_wombat_top = window.top;
|
||||
}
|
||||
} else {
|
||||
window.WB_wombat_top = window.top;
|
||||
}
|
||||
|
||||
//if (window.opener) {
|
||||
// window.opener.WB_wombat_location = copy_location_obj(window.opener.location);
|
||||
@ -421,6 +574,7 @@ WB_wombat_init = (function() {
|
||||
|
||||
// Domain
|
||||
document.WB_wombat_domain = orig_host;
|
||||
document.WB_wombat_referrer = extract_orig(document.referrer);
|
||||
|
||||
// History
|
||||
copy_history_func(window.history, 'pushState');
|
||||
@ -434,15 +588,9 @@ WB_wombat_init = (function() {
|
||||
init_dom_override();
|
||||
|
||||
// Random
|
||||
init_seeded_random(timestamp);
|
||||
init_seeded_random(timestamp);
|
||||
}
|
||||
|
||||
// Check quickly after page load
|
||||
setTimeout(check_all_locations, 100);
|
||||
|
||||
// Check periodically every few seconds
|
||||
setInterval(check_all_locations, 500);
|
||||
|
||||
return wombat_init;
|
||||
|
||||
})(this);
|
||||
|
Loading…
x
Reference in New Issue
Block a user