1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

rewrite: wombat update 2.0! Using Object.defineProperty() to better

override .href and .hash properties when possible.
.href returns original url, but on assignment rewrites before redirecting
.hash proxies to location.hash
Also added:
- window.top -> window.WB_wombat_top
- document.referrer -> document.WB_wombat_referrer
- <source> html tag rewriting
This commit is contained in:
Ilya Kreymer 2014-04-18 19:30:48 -07:00
parent e011da43f2
commit 23bb5bd175
4 changed files with 242 additions and 88 deletions

View File

@ -45,6 +45,7 @@ class HTMLRewriterMixin(object):
'q': {'cite': defmod},
'ref': {'href': 'oe_'},
'script': {'src': 'js_'},
'source': {'src': 'oe_'},
'div': {'data-src': defmod,
'data-uri': defmod},
'li': {'data-src': defmod,

View File

@ -126,6 +126,8 @@ class JSLinkAndLocationRewriter(JSLinkOnlyRewriter):
rules = rules + [
(r'(?<!/)\blocation\b', RegexRewriter.add_prefix(prefix), 0),
(r'(?<=document\.)domain', RegexRewriter.add_prefix(prefix), 0),
(r'(?<=document\.)referrer', RegexRewriter.add_prefix(prefix), 0),
(r'(?<=window\.)top', RegexRewriter.add_prefix(prefix), 0),
]
#import sys
#sys.stderr.write('\n\n*** RULES:' + str(rules) + '\n\n')

View File

@ -21,6 +21,8 @@ class LiveRewriter(object):
('HTTP_ACCEPT', 'Accept'),
('HTTP_ACCEPT_LANGUAGE', 'Accept-Language'),
('HTTP_ACCEPT_CHARSET', 'Accept-Charset'),
('HTTP_ACCEPT_ENCODING', 'Accept-Encoding'),
('HTTP_RANGE', 'Range'),
('REL_REFERER', 'Referer'),
]
@ -79,6 +81,7 @@ class LiveRewriter(object):
stream=True,
verify=False)
statusline = str(response.status_code) + ' ' + response.reason
headers = response.headers.items()

View File

@ -18,7 +18,7 @@ This file is part of pywb.
*/
//============================================
// Wombat JS-Rewriting Library
// Wombat JS-Rewriting Library v2.0
//============================================
WB_wombat_init = (function() {
@ -52,14 +52,22 @@ WB_wombat_init = (function() {
return false;
}
function starts_with(string, prefix) {
if (string.indexOf(prefix) == 0) {
return prefix;
} else {
return undefined;
//============================================
// Report which prefix, if any, `string` begins with.
//   string        - the text to inspect
//   arr_or_prefix - a single prefix, or an Array of candidate prefixes
//                   that are tried in order
// Returns the first matching prefix, or undefined when nothing matches.
function starts_with(string, arr_or_prefix) {
    // Treat a lone prefix as a one-element candidate list so both call
    // styles share a single scan.
    var candidates = (arr_or_prefix instanceof Array) ? arr_or_prefix : [arr_or_prefix];
    var idx = 0;

    while (idx < candidates.length) {
        var candidate = candidates[idx];
        if (string.indexOf(candidate) === 0) {
            return candidate;
        }
        idx += 1;
    }

    return undefined;
}
//============================================
function ends_with(str, suffix) {
if (str.indexOf(suffix, str.length - suffix.length) !== -1) {
return suffix;
@ -76,38 +84,47 @@ WB_wombat_init = (function() {
if (url != rewritten) {
console.log('REWRITE: ' + url + ' -> ' + rewritten);
} else {
//console.log('NOT REWRITTEN ' + url);
console.log('NOT REWRITTEN ' + url);
}
return rewritten;
}
//============================================
// Scheme prefixes. HTTP_PREFIX is also what gets prepended when an
// absolute original url has to be built from a bare host.
var HTTP_PREFIX = "http://";
var HTTPS_PREFIX = "https://";
// Scheme-relative prefix
var REL_PREFIX = "//";
// Prefixes that rewrite_url_ accepts as the start of a full url
var VALID_PREFIXES = [HTTP_PREFIX, HTTPS_PREFIX, REL_PREFIX];
// Urls starting with any of these are returned by rewrite_url_ without
// being prefixed
var IGNORE_PREFIXES = ["#", "about:", "data:", "mailto:", "javascript:"];
//============================================
function rewrite_url_(url) {
var http_prefix = "http://";
var https_prefix = "https://";
var rel_prefix = "//";
// If not dealing with a string, just return it
if (!url || (typeof url) != "string") {
return url;
}
// ignore anchors
if (starts_with(url, "#")) {
// just in case wombat reference made it into url!
url = url.replace("WB_wombat_", "");
// ignore anchors, about, data
if (starts_with(url, IGNORE_PREFIXES)) {
return url;
}
// If starts with prefix, no rewriting needed
// Only check replay prefix (no date) as date may be different for each
// capture
if (starts_with(url, wb_replay_prefix)) {
if (starts_with(url, wb_replay_prefix) || starts_with(url, window.location.origin + wb_replay_prefix)) {
return url;
}
// If server relative url, add prefix and original host
if (url.charAt(0) == "/" && !starts_with(url, rel_prefix)) {
if (url.charAt(0) == "/" && !starts_with(url, REL_PREFIX)) {
// Already a relative url, don't make any changes!
if (url.indexOf(wb_capture_date_part) >= 0) {
if (wb_capture_date_part && url.indexOf(wb_capture_date_part) >= 0) {
return url;
}
@ -116,9 +133,7 @@ WB_wombat_init = (function() {
// If full url starting with http://, https:// or //, add prefix
var prefix = starts_with(url, http_prefix) ||
starts_with(url, https_prefix) ||
starts_with(url, rel_prefix);
var prefix = starts_with(url, VALID_PREFIXES);
if (prefix) {
if (starts_with(url, prefix + window.location.host + '/')) {
@ -130,25 +145,12 @@ WB_wombat_init = (function() {
// May or may not be a hostname, call function to determine
// If it is, add the prefix and make sure port is removed
if (is_host_url(url) && !starts_with(url, window.location.host + '/')) {
return wb_replay_date_prefix + http_prefix + url;
return wb_replay_date_prefix + HTTP_PREFIX + url;
}
return url;
}
//============================================
// Build a new plain object holding every enumerable, non-function property
// of obj. for...in also visits inherited enumerable properties, so those
// are copied too. Values are copied by reference (shallow copy).
//   obj - the object to copy fields from; it is not modified
// Returns the newly created object.
function copy_object_fields(obj) {
    var new_obj = {};
    // Declare the loop variable locally: a bare `prop` creates an implicit
    // global and throws a ReferenceError in strict-mode code.
    for (var prop in obj) {
        if ((typeof obj[prop]) !== "function") {
            new_obj[prop] = obj[prop];
        }
    }
    return new_obj;
}
//============================================
function extract_orig(href) {
if (!href) {
@ -162,6 +164,20 @@ WB_wombat_init = (function() {
// extract original url from wburl
if (index > 0) {
href = href.substr(index + 1);
} else {
index = href.indexOf(wb_replay_prefix);
if (index >= 0) {
href = href.substr(index + wb_replay_prefix.length);
}
if ((href.length > 4) &&
(href.charAt(2) == "_") &&
(href.charAt(3) == "/")) {
href = href.substr(4);
}
if (!starts_with(href, "http")) {
href = HTTP_PREFIX + href;
}
}
// remove trailing slash
@ -171,55 +187,142 @@ WB_wombat_init = (function() {
return href;
}
//============================================
function copy_location_obj(loc) {
var new_loc = copy_object_fields(loc);
new_loc._orig_loc = loc;
new_loc._orig_href = loc.href;
// Define custom property
// Install an accessor property `prop` on `obj`, backed by a shadow field
// named "_" + prop.
//   obj      - object that receives the property
//   prop     - name of the property to define
//   value    - initial value placed in the shadow field
//   set_func - invoked with obj as `this` and the assigned value; a result
//              other than null/undefined replaces the stored value
//   get_func - optional; invoked with obj as `this` and the stored value,
//              and its result is what a read of the property yields
// Returns true when the accessor was installed. If Object.defineProperty
// throws, the error is logged, `value` is assigned as a plain property and
// false is returned.
function defProp(obj, prop, value, set_func, get_func) {
    var shadow = "_" + prop;
    obj[shadow] = value;

    var descriptor = {
        configurable: false,
        enumerable: true,
        set: function(newval) {
            var stored = set_func.call(obj, newval);
            // Setter declined to provide a value: keep the current one
            if (stored == undefined) {
                return;
            }
            obj[shadow] = stored;
        },
        get: function() {
            return get_func ? get_func.call(obj, obj[shadow]) : obj[shadow];
        }
    };

    try {
        Object.defineProperty(obj, prop, descriptor);
    } catch (e) {
        console.log(e);
        obj[prop] = value;
        return false;
    }

    return true;
}
//============================================
//Define WombatLocation
function WombatLocation(loc) {
this._orig_loc = loc;
this._orig_href = loc.href;
// Rewrite replace and assign functions
new_loc.replace = function(url) {
this._orig_loc.replace(rewrite_url(url));
this.replace = function(url) {
return this._orig_loc.replace(rewrite_url(url));
}
new_loc.assign = function(url) {
this._orig_loc.assign(rewrite_url(url));
this.assign = function(url) {
return this._orig_loc.assign(rewrite_url(url));
}
new_loc.reload = loc.reload;
this.reload = loc.reload;
// Adapted from:
// https://gist.github.com/jlong/2428561
var parser = document.createElement('a');
parser.href = extract_orig(new_loc._orig_href);
var href = extract_orig(this._orig_href);
parser.href = href;
//console.log(this._orig_href + " -> " + tmp_href);
this._autooverride = false;
var _set_hash = function(hash) {
this._orig_loc.hash = hash;
return this._orig_loc.hash;
}
var _get_hash = function() {
return this._orig_loc.hash;
}
var _get_url_with_hash = function(url) {
return url + this._orig_loc.hash;
}
href = parser.href;
var hash = parser.hash;
if (hash) {
var hidx = href.lastIndexOf("#");
if (hidx > 0) {
href = href.substring(0, hidx);
}
}
if (Object.defineProperty) {
var res1 = defProp(this, "href", href,
this.assign,
_get_url_with_hash);
var res2 = defProp(this, "hash", parser.hash,
_set_hash,
_get_hash);
this._autooverride = res1 && res2;
}
this.host = parser.host;
this.hostname = parser.hostname;
new_loc.hash = parser.hash;
new_loc.host = parser.host;
new_loc.hostname = parser.hostname;
new_loc.href = parser.href;
if (new_loc.origin) {
new_loc.origin = parser.origin;
if (parser.origin) {
this.origin = parser.origin;
}
new_loc.pathname = parser.pathname;
new_loc.port = parser.port
new_loc.protocol = parser.protocol;
new_loc.search = parser.search;
this.pathname = parser.pathname;
this.port = parser.port
this.protocol = parser.protocol;
this.search = parser.search;
new_loc.toString = function() {
this.toString = function() {
return this.href;
}
return new_loc;
// Copy any remaining properties
for (prop in loc) {
if (this.hasOwnProperty(prop)) {
continue;
}
if ((typeof loc[prop]) != "function") {
this[prop] = loc[prop];
}
}
}
//============================================
function update_location(req_href, orig_href, actual_location) {
if (!req_href || req_href == orig_href) {
function update_location(req_href, orig_href, actual_location, wombat_loc) {
if (!req_href) {
return;
}
if (req_href == orig_href) {
// Reset wombat loc to the unrewritten version
//if (wombat_loc) {
// wombat_loc.href = extract_orig(orig_href);
//}
return;
}
ext_orig = extract_orig(orig_href);
ext_req = extract_orig(req_href);
@ -235,19 +338,19 @@ WB_wombat_init = (function() {
}
//============================================
function check_location_change(loc, is_top) {
var locType = (typeof loc);
function check_location_change(wombat_loc, is_top) {
var locType = (typeof wombat_loc);
var actual_location = (is_top ? window.top.location : window.location);
//console.log(loc.href);
// String has been assigned to location, so assign it
if (locType == "string") {
update_location(loc, actual_location.href, actual_location)
update_location(wombat_loc, actual_location.href, actual_location);
} else if (locType == "object") {
update_location(loc.href, loc._orig_href, actual_location);
update_location(wombat_loc.href,
wombat_loc._orig_href,
actual_location);
}
}
@ -261,10 +364,21 @@ WB_wombat_init = (function() {
check_location_change(window.WB_wombat_location, false);
if (window.self.location != window.top.location) {
// Only check top if it's a different window
if (window.self.WB_wombat_location != window.top.WB_wombat_location) {
check_location_change(window.top.WB_wombat_location, true);
}
// lochash = window.WB_wombat_location.hash;
//
// if (lochash) {
// window.location.hash = lochash;
//
// //if (window.top.update_wb_url) {
// // window.top.location.hash = lochash;
// //}
// }
wb_wombat_updating = false;
}
@ -328,6 +442,7 @@ WB_wombat_init = (function() {
window.XMLHttpRequest.prototype.open = open_rewritten;
}
//============================================
function init_worker_override() {
if (!window.Worker) {
return;
@ -338,6 +453,7 @@ WB_wombat_init = (function() {
window.Worker = undefined;
}
//============================================
function rewrite_attr(elem, name) {
if (!elem || !elem.getAttribute) {
return;
@ -359,6 +475,7 @@ WB_wombat_init = (function() {
elem.setAttribute(name, value);
}
//============================================
function init_dom_override() {
if (!Node || !Node.prototype) {
return;
@ -376,9 +493,9 @@ WB_wombat_init = (function() {
var desc;
if (child instanceof DocumentFragment) {
desc = child.querySelectorAll("*[href],*[src]");
//desc = child.querySelectorAll("*[href],*[src]");
} else if (child.getElementsByTagName) {
desc = child.getElementsByTagName("*");
//desc = child.getElementsByTagName("*");
}
if (desc) {
@ -401,19 +518,55 @@ WB_wombat_init = (function() {
//============================================
function wombat_init(replay_prefix, capture_date, orig_host, timestamp) {
wb_replay_prefix = replay_prefix;
wb_replay_date_prefix = replay_prefix + capture_date + "/";
wb_capture_date_part = "/" + capture_date + "/";
wb_orig_host = "http://" + orig_host;
wb_replay_date_prefix = replay_prefix + capture_date + "em_/";
if (capture_date.length > 0) {
wb_capture_date_part = "/" + capture_date + "/";
} else {
wb_capture_date_part = "";
}
wb_orig_host = HTTP_PREFIX + orig_host;
// Location
window.WB_wombat_location = copy_location_obj(window.self.location);
document.WB_wombat_location = window.WB_wombat_location;
var wombat_location = new WombatLocation(window.self.location);
if (wombat_location._autooverride) {
var setter = function(val) {
if (typeof(val) == "string") {
if (starts_with(val, "about:")) {
return undefined;
}
this._WB_wombat_location.href = val;
}
}
defProp(window, "WB_wombat_location", wombat_location, setter);
defProp(document, "WB_wombat_location", wombat_location, setter);
} else {
// Check quickly after page load
setTimeout(check_all_locations, 500);
// Check periodically (every 500 ms)
setInterval(check_all_locations, 500);
}
var is_framed = (window.top.update_wb_url != undefined);
//if (window.self.location != window.top.location) {
// window.top.WB_wombat_location = copy_location_obj(window.top.location);
//}
window.top.WB_wombat_location = window.WB_wombat_location;
if (window.self.location != window.top.location) {
if (is_framed) {
window.top.WB_wombat_location = window.WB_wombat_location;
window.WB_wombat_top = window.self;
} else {
window.top.WB_wombat_location = new WombatLocation(window.top.location);
window.WB_wombat_top = window.top;
}
} else {
window.WB_wombat_top = window.top;
}
//if (window.opener) {
// window.opener.WB_wombat_location = copy_location_obj(window.opener.location);
@ -421,6 +574,7 @@ WB_wombat_init = (function() {
// Domain
document.WB_wombat_domain = orig_host;
document.WB_wombat_referrer = extract_orig(document.referrer);
// History
copy_history_func(window.history, 'pushState');
@ -434,15 +588,9 @@ WB_wombat_init = (function() {
init_dom_override();
// Random
init_seeded_random(timestamp);
init_seeded_random(timestamp);
}
// Check quickly after page load
setTimeout(check_all_locations, 100);
// Check periodically every few seconds
setInterval(check_all_locations, 500);
return wombat_init;
})(this);