1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

rewrite: wombat update 2.0! Using Object.defineProperty() to better

override .href and .hash properties when possible.
.href returns original url, but on assignment rewrites before redirecting
.hash proxies to location.hash
Also added:
- window.top -> window.WB_wombat_top
- document.referrer -> document.WB_wombat_referrer
- <source> html tag rewriting
This commit is contained in:
Ilya Kreymer 2014-04-18 19:30:48 -07:00
parent e011da43f2
commit 23bb5bd175
4 changed files with 242 additions and 88 deletions

View File

@ -45,6 +45,7 @@ class HTMLRewriterMixin(object):
'q': {'cite': defmod}, 'q': {'cite': defmod},
'ref': {'href': 'oe_'}, 'ref': {'href': 'oe_'},
'script': {'src': 'js_'}, 'script': {'src': 'js_'},
'source': {'src': 'oe_'},
'div': {'data-src': defmod, 'div': {'data-src': defmod,
'data-uri': defmod}, 'data-uri': defmod},
'li': {'data-src': defmod, 'li': {'data-src': defmod,

View File

@ -126,6 +126,8 @@ class JSLinkAndLocationRewriter(JSLinkOnlyRewriter):
rules = rules + [ rules = rules + [
(r'(?<!/)\blocation\b', RegexRewriter.add_prefix(prefix), 0), (r'(?<!/)\blocation\b', RegexRewriter.add_prefix(prefix), 0),
(r'(?<=document\.)domain', RegexRewriter.add_prefix(prefix), 0), (r'(?<=document\.)domain', RegexRewriter.add_prefix(prefix), 0),
(r'(?<=document\.)referrer', RegexRewriter.add_prefix(prefix), 0),
(r'(?<=window\.)top', RegexRewriter.add_prefix(prefix), 0),
] ]
#import sys #import sys
#sys.stderr.write('\n\n*** RULES:' + str(rules) + '\n\n') #sys.stderr.write('\n\n*** RULES:' + str(rules) + '\n\n')

View File

@ -21,6 +21,8 @@ class LiveRewriter(object):
('HTTP_ACCEPT', 'Accept'), ('HTTP_ACCEPT', 'Accept'),
('HTTP_ACCEPT_LANGUAGE', 'Accept-Language'), ('HTTP_ACCEPT_LANGUAGE', 'Accept-Language'),
('HTTP_ACCEPT_CHARSET', 'Accept-Charset'), ('HTTP_ACCEPT_CHARSET', 'Accept-Charset'),
('HTTP_ACCEPT_ENCODING', 'Accept-Encoding'),
('HTTP_RANGE', 'Range'),
('REL_REFERER', 'Referer'), ('REL_REFERER', 'Referer'),
] ]
@ -79,6 +81,7 @@ class LiveRewriter(object):
stream=True, stream=True,
verify=False) verify=False)
statusline = str(response.status_code) + ' ' + response.reason statusline = str(response.status_code) + ' ' + response.reason
headers = response.headers.items() headers = response.headers.items()

View File

@ -18,7 +18,7 @@ This file is part of pywb.
*/ */
//============================================ //============================================
// Wombat JS-Rewriting Library // Wombat JS-Rewriting Library v2.0
//============================================ //============================================
WB_wombat_init = (function() { WB_wombat_init = (function() {
@ -52,14 +52,22 @@ WB_wombat_init = (function() {
return false; return false;
} }
function starts_with(string, prefix) { //============================================
if (string.indexOf(prefix) == 0) { function starts_with(string, arr_or_prefix) {
return prefix; if (arr_or_prefix instanceof Array) {
} else { for (var i = 0; i < arr_or_prefix.length; i++) {
return undefined; if (string.indexOf(arr_or_prefix[i]) == 0) {
return arr_or_prefix[i];
}
}
} else if (string.indexOf(arr_or_prefix) == 0) {
return arr_or_prefix;
} }
return undefined;
} }
//============================================
function ends_with(str, suffix) { function ends_with(str, suffix) {
if (str.indexOf(suffix, str.length - suffix.length) !== -1) { if (str.indexOf(suffix, str.length - suffix.length) !== -1) {
return suffix; return suffix;
@ -76,38 +84,47 @@ WB_wombat_init = (function() {
if (url != rewritten) { if (url != rewritten) {
console.log('REWRITE: ' + url + ' -> ' + rewritten); console.log('REWRITE: ' + url + ' -> ' + rewritten);
} else { } else {
//console.log('NOT REWRITTEN ' + url); console.log('NOT REWRITTEN ' + url);
} }
return rewritten; return rewritten;
} }
//============================================
var HTTP_PREFIX = "http://";
var HTTPS_PREFIX = "https://";
var REL_PREFIX = "//";
var VALID_PREFIXES = [HTTP_PREFIX, HTTPS_PREFIX, REL_PREFIX];
var IGNORE_PREFIXES = ["#", "about:", "data:", "mailto:", "javascript:"];
//============================================
function rewrite_url_(url) { function rewrite_url_(url) {
var http_prefix = "http://";
var https_prefix = "https://";
var rel_prefix = "//";
// If not dealing with a string, just return it // If not dealing with a string, just return it
if (!url || (typeof url) != "string") { if (!url || (typeof url) != "string") {
return url; return url;
} }
// ignore anchors // just in case wombat reference made it into url!
if (starts_with(url, "#")) { url = url.replace("WB_wombat_", "");
// ignore anchors, about, data
if (starts_with(url, IGNORE_PREFIXES)) {
return url; return url;
} }
// If starts with prefix, no rewriting needed // If starts with prefix, no rewriting needed
// Only check replay prefix (no date) as date may be different for each // Only check replay prefix (no date) as date may be different for each
// capture // capture
if (starts_with(url, wb_replay_prefix)) { if (starts_with(url, wb_replay_prefix) || starts_with(url, window.location.origin + wb_replay_prefix)) {
return url; return url;
} }
// If server relative url, add prefix and original host // If server relative url, add prefix and original host
if (url.charAt(0) == "/" && !starts_with(url, rel_prefix)) { if (url.charAt(0) == "/" && !starts_with(url, REL_PREFIX)) {
// Already a relative url, don't make any changes! // Already a relative url, don't make any changes!
if (url.indexOf(wb_capture_date_part) >= 0) { if (wb_capture_date_part && url.indexOf(wb_capture_date_part) >= 0) {
return url; return url;
} }
@ -116,9 +133,7 @@ WB_wombat_init = (function() {
// If full url starting with http://, add prefix // If full url starting with http://, add prefix
var prefix = starts_with(url, http_prefix) || var prefix = starts_with(url, VALID_PREFIXES);
starts_with(url, https_prefix) ||
starts_with(url, rel_prefix);
if (prefix) { if (prefix) {
if (starts_with(url, prefix + window.location.host + '/')) { if (starts_with(url, prefix + window.location.host + '/')) {
@ -130,25 +145,12 @@ WB_wombat_init = (function() {
// May or may not be a hostname, call function to determine // May or may not be a hostname, call function to determine
// If it is, add the prefix and make sure port is removed // If it is, add the prefix and make sure port is removed
if (is_host_url(url) && !starts_with(url, window.location.host + '/')) { if (is_host_url(url) && !starts_with(url, window.location.host + '/')) {
return wb_replay_date_prefix + http_prefix + url; return wb_replay_date_prefix + HTTP_PREFIX + url;
} }
return url; return url;
} }
//============================================
function copy_object_fields(obj) {
var new_obj = {};
for (prop in obj) {
if ((typeof obj[prop]) != "function") {
new_obj[prop] = obj[prop];
}
}
return new_obj;
}
//============================================ //============================================
function extract_orig(href) { function extract_orig(href) {
if (!href) { if (!href) {
@ -162,6 +164,20 @@ WB_wombat_init = (function() {
// extract original url from wburl // extract original url from wburl
if (index > 0) { if (index > 0) {
href = href.substr(index + 1); href = href.substr(index + 1);
} else {
index = href.indexOf(wb_replay_prefix);
if (index >= 0) {
href = href.substr(index + wb_replay_prefix.length);
}
if ((href.length > 4) &&
(href.charAt(2) == "_") &&
(href.charAt(3) == "/")) {
href = href.substr(4);
}
if (!starts_with(href, "http")) {
href = HTTP_PREFIX + href;
}
} }
// remove trailing slash // remove trailing slash
@ -171,55 +187,142 @@ WB_wombat_init = (function() {
return href; return href;
} }
//============================================ //============================================
function copy_location_obj(loc) { // Define custom property
var new_loc = copy_object_fields(loc); function defProp(obj, prop, value, set_func, get_func) {
var key = "_" + prop;
new_loc._orig_loc = loc; obj[key] = value;
new_loc._orig_href = loc.href;
try {
Object.defineProperty(obj, prop, {
configurable: false,
enumerable: true,
set: function(newval) {
var result = set_func.call(obj, newval);
if (result != undefined) {
obj[key] = result;
}
},
get: function() {
if (get_func) {
return get_func.call(obj, obj[key]);
} else {
return obj[key];
}
}
});
return true;
} catch (e) {
console.log(e);
obj[prop] = value;
return false;
}
}
//============================================
//Define WombatLocation
function WombatLocation(loc) {
this._orig_loc = loc;
this._orig_href = loc.href;
// Rewrite replace and assign functions // Rewrite replace and assign functions
new_loc.replace = function(url) { this.replace = function(url) {
this._orig_loc.replace(rewrite_url(url)); return this._orig_loc.replace(rewrite_url(url));
} }
new_loc.assign = function(url) { this.assign = function(url) {
this._orig_loc.assign(rewrite_url(url)); return this._orig_loc.assign(rewrite_url(url));
} }
new_loc.reload = loc.reload; this.reload = loc.reload;
// Adapted from: // Adapted from:
// https://gist.github.com/jlong/2428561 // https://gist.github.com/jlong/2428561
var parser = document.createElement('a'); var parser = document.createElement('a');
parser.href = extract_orig(new_loc._orig_href); var href = extract_orig(this._orig_href);
parser.href = href;
//console.log(this._orig_href + " -> " + tmp_href);
this._autooverride = false;
var _set_hash = function(hash) {
this._orig_loc.hash = hash;
return this._orig_loc.hash;
}
var _get_hash = function() {
return this._orig_loc.hash;
}
var _get_url_with_hash = function(url) {
return url + this._orig_loc.hash;
}
href = parser.href;
var hash = parser.hash;
if (hash) {
var hidx = href.lastIndexOf("#");
if (hidx > 0) {
href = href.substring(0, hidx);
}
}
if (Object.defineProperty) {
var res1 = defProp(this, "href", href,
this.assign,
_get_url_with_hash);
var res2 = defProp(this, "hash", parser.hash,
_set_hash,
_get_hash);
this._autooverride = res1 && res2;
}
this.host = parser.host;
this.hostname = parser.hostname;
new_loc.hash = parser.hash; if (parser.origin) {
new_loc.host = parser.host; this.origin = parser.origin;
new_loc.hostname = parser.hostname;
new_loc.href = parser.href;
if (new_loc.origin) {
new_loc.origin = parser.origin;
} }
new_loc.pathname = parser.pathname; this.pathname = parser.pathname;
new_loc.port = parser.port this.port = parser.port
new_loc.protocol = parser.protocol; this.protocol = parser.protocol;
new_loc.search = parser.search; this.search = parser.search;
new_loc.toString = function() { this.toString = function() {
return this.href; return this.href;
} }
return new_loc; // Copy any remaining properties
for (prop in loc) {
if (this.hasOwnProperty(prop)) {
continue;
}
if ((typeof loc[prop]) != "function") {
this[prop] = loc[prop];
}
}
} }
//============================================ //============================================
function update_location(req_href, orig_href, actual_location) { function update_location(req_href, orig_href, actual_location, wombat_loc) {
if (!req_href || req_href == orig_href) { if (!req_href) {
return; return;
} }
if (req_href == orig_href) {
// Reset wombat loc to the unrewritten version
//if (wombat_loc) {
// wombat_loc.href = extract_orig(orig_href);
//}
return;
}
ext_orig = extract_orig(orig_href); ext_orig = extract_orig(orig_href);
ext_req = extract_orig(req_href); ext_req = extract_orig(req_href);
@ -235,19 +338,19 @@ WB_wombat_init = (function() {
} }
//============================================ //============================================
function check_location_change(loc, is_top) { function check_location_change(wombat_loc, is_top) {
var locType = (typeof loc); var locType = (typeof wombat_loc);
var actual_location = (is_top ? window.top.location : window.location); var actual_location = (is_top ? window.top.location : window.location);
//console.log(loc.href);
// String has been assigned to location, so assign it // String has been assigned to location, so assign it
if (locType == "string") { if (locType == "string") {
update_location(loc, actual_location.href, actual_location) update_location(wombat_loc, actual_location.href, actual_location);
} else if (locType == "object") { } else if (locType == "object") {
update_location(loc.href, loc._orig_href, actual_location); update_location(wombat_loc.href,
wombat_loc._orig_href,
actual_location);
} }
} }
@ -261,10 +364,21 @@ WB_wombat_init = (function() {
check_location_change(window.WB_wombat_location, false); check_location_change(window.WB_wombat_location, false);
if (window.self.location != window.top.location) { // Only check top if its a different window
if (window.self.WB_wombat_location != window.top.WB_wombat_location) {
check_location_change(window.top.WB_wombat_location, true); check_location_change(window.top.WB_wombat_location, true);
} }
// lochash = window.WB_wombat_location.hash;
//
// if (lochash) {
// window.location.hash = lochash;
//
// //if (window.top.update_wb_url) {
// // window.top.location.hash = lochash;
// //}
// }
wb_wombat_updating = false; wb_wombat_updating = false;
} }
@ -328,6 +442,7 @@ WB_wombat_init = (function() {
window.XMLHttpRequest.prototype.open = open_rewritten; window.XMLHttpRequest.prototype.open = open_rewritten;
} }
//============================================
function init_worker_override() { function init_worker_override() {
if (!window.Worker) { if (!window.Worker) {
return; return;
@ -338,6 +453,7 @@ WB_wombat_init = (function() {
window.Worker = undefined; window.Worker = undefined;
} }
//============================================
function rewrite_attr(elem, name) { function rewrite_attr(elem, name) {
if (!elem || !elem.getAttribute) { if (!elem || !elem.getAttribute) {
return; return;
@ -359,6 +475,7 @@ WB_wombat_init = (function() {
elem.setAttribute(name, value); elem.setAttribute(name, value);
} }
//============================================
function init_dom_override() { function init_dom_override() {
if (!Node || !Node.prototype) { if (!Node || !Node.prototype) {
return; return;
@ -376,9 +493,9 @@ WB_wombat_init = (function() {
var desc; var desc;
if (child instanceof DocumentFragment) { if (child instanceof DocumentFragment) {
desc = child.querySelectorAll("*[href],*[src]"); //desc = child.querySelectorAll("*[href],*[src]");
} else if (child.getElementsByTagName) { } else if (child.getElementsByTagName) {
desc = child.getElementsByTagName("*"); //desc = child.getElementsByTagName("*");
} }
if (desc) { if (desc) {
@ -401,19 +518,55 @@ WB_wombat_init = (function() {
//============================================ //============================================
function wombat_init(replay_prefix, capture_date, orig_host, timestamp) { function wombat_init(replay_prefix, capture_date, orig_host, timestamp) {
wb_replay_prefix = replay_prefix; wb_replay_prefix = replay_prefix;
wb_replay_date_prefix = replay_prefix + capture_date + "/";
wb_capture_date_part = "/" + capture_date + "/";
wb_orig_host = "http://" + orig_host; wb_replay_date_prefix = replay_prefix + capture_date + "em_/";
if (capture_date.length > 0) {
wb_capture_date_part = "/" + capture_date + "/";
} else {
wb_capture_date_part = "";
}
wb_orig_host = HTTP_PREFIX + orig_host;
// Location // Location
window.WB_wombat_location = copy_location_obj(window.self.location); var wombat_location = new WombatLocation(window.self.location);
document.WB_wombat_location = window.WB_wombat_location;
if (wombat_location._autooverride) {
var setter = function(val) {
if (typeof(val) == "string") {
if (starts_with(val, "about:")) {
return undefined;
}
this._WB_wombat_location.href = val;
}
}
defProp(window, "WB_wombat_location", wombat_location, setter);
defProp(document, "WB_wombat_location", wombat_location, setter);
} else {
// Check quickly after page load
setTimeout(check_all_locations, 500);
// Check periodically every few seconds
setInterval(check_all_locations, 500);
}
var is_framed = (window.top.update_wb_url != undefined);
//if (window.self.location != window.top.location) { if (window.self.location != window.top.location) {
// window.top.WB_wombat_location = copy_location_obj(window.top.location); if (is_framed) {
//} window.top.WB_wombat_location = window.WB_wombat_location;
window.top.WB_wombat_location = window.WB_wombat_location; window.WB_wombat_top = window.self;
} else {
window.top.WB_wombat_location = new WombatLocation(window.top.location);
window.WB_wombat_top = window.top;
}
} else {
window.WB_wombat_top = window.top;
}
//if (window.opener) { //if (window.opener) {
// window.opener.WB_wombat_location = copy_location_obj(window.opener.location); // window.opener.WB_wombat_location = copy_location_obj(window.opener.location);
@ -421,6 +574,7 @@ WB_wombat_init = (function() {
// Domain // Domain
document.WB_wombat_domain = orig_host; document.WB_wombat_domain = orig_host;
document.WB_wombat_referrer = extract_orig(document.referrer);
// History // History
copy_history_func(window.history, 'pushState'); copy_history_func(window.history, 'pushState');
@ -434,15 +588,9 @@ WB_wombat_init = (function() {
init_dom_override(); init_dom_override();
// Random // Random
init_seeded_random(timestamp); init_seeded_random(timestamp);
} }
// Check quickly after page load
setTimeout(check_all_locations, 100);
// Check periodically every few seconds
setInterval(check_all_locations, 500);
return wombat_init; return wombat_init;
})(this); })(this);