mirror of https://github.com/webrecorder/pywb.git synced 2025-03-28 00:25:21 +01:00

929 lines
27 KiB

Copyright(c) 2013-2014 Ilya Kreymer. Released under the GNU General Public License.
This file is part of pywb, https://github.com/ikreymer/pywb
pywb is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
pywb is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with pywb. If not, see <http://www.gnu.org/licenses/>.
// Wombat JS-Rewriting Library v2.1
// Everything below is defined inside this function expression, whose result
// is assigned to the global _WBWombat.
// NOTE(review): presumably an immediately-invoked function -- the closing
// line of the expression is not visible in this copy; confirm upstream.
_WBWombat = (function() {
// Globals
// Replay prefix handed to wombat_init(); when falsy, rewrite_url_() and
// extract_orig() take their "proxy mode" paths.
var wb_replay_prefix;
// replay_prefix + capture_date + mod + "/" (set in wombat_init)
var wb_replay_date_prefix;
// "/" + capture_date + "/", or "" when capture_date is empty
var wb_capture_date_part;
// orig_scheme + "://"
var wb_orig_scheme;
// wb_orig_scheme + orig_host
var wb_orig_host;
// Re-entrancy flag read and written by check_all_locations()
var wb_wombat_updating = false;
// Heuristically decide whether a scheme-less string looks like a host url.
//
// str - string to test
//
// Returns true when str starts with "www.", or starts with "host:port"
// (port required) or a dotted-quad ip (port optional) followed by "/" or the
// end of the string, with the matched part shorter than 64 characters.
// Returns false otherwise.
function is_host_url(str) {
    // Good guess that it's a hostname
    if (str.indexOf("www.") === 0) {
        return true;
    }

    // hostname:port (port required)
    var matches = str.match(/^[\w-]+(\.[\w-_]+)+(:\d+)(\/|$)/);
    if (matches && (matches[0].length < 64)) {
        return true;
    }

    // ip:port
    matches = str.match(/^\d+\.\d+\.\d+\.\d+(:\d+)?(\/|$)/);
    if (matches && (matches[0].length < 64)) {
        return true;
    }

    return false;
}
// Return the prefix that `string` starts with, or undefined if none matches.
//
// string        - text to test
// arr_or_prefix - a single prefix string, or an array of candidate prefixes;
//                 for an array the first matching entry is returned
function starts_with(string, arr_or_prefix) {
    // Array.isArray also recognises arrays created in another frame,
    // which `instanceof Array` does not.
    if (Array.isArray(arr_or_prefix)) {
        for (var i = 0; i < arr_or_prefix.length; i++) {
            if (string.indexOf(arr_or_prefix[i]) === 0) {
                return arr_or_prefix[i];
            }
        }
    } else if (string.indexOf(arr_or_prefix) === 0) {
        return arr_or_prefix;
    }

    return undefined;
}
// Return `suffix` when `str` ends with it, otherwise undefined.
//
// str    - text to test
// suffix - expected ending
function ends_with(str, suffix) {
    // Start searching at the only position where a trailing match can begin.
    if (str.indexOf(suffix, str.length - suffix.length) !== -1) {
        return suffix;
    }

    return undefined;
}
// Rewriter used by the overrides below; bound to rewrite_url_.
// NOTE(review): presumably this can be pointed at rewrite_url_debug instead
// to log every rewrite -- confirm intent.
var rewrite_url = rewrite_url_;
// Logging wrapper around rewrite_url_().
//
// url - value to rewrite
//
// Logs whether the url was changed and returns the rewritten value.
function rewrite_url_debug(url) {
    var rewritten = rewrite_url_(url);

    if (url != rewritten) {
        console.log('REWRITE: ' + url + ' -> ' + rewritten);
    } else {
        console.log('NOT REWRITTEN ' + url);
    }

    return rewritten;
}
// Scheme prefixes recognised by the rewriter
var HTTP_PREFIX = "http://";
var HTTPS_PREFIX = "https://";
var REL_PREFIX = "//";

// Prefixes of absolute (or scheme-relative) urls; rewrite_url_() prepends the
// replay prefix to urls starting with one of these.
var VALID_PREFIXES = [HTTP_PREFIX, HTTPS_PREFIX, REL_PREFIX];

// Urls starting with one of these are returned unchanged by rewrite_url_()
var IGNORE_PREFIXES = ["#", "about:", "data:", "mailto:", "javascript:"];

// Filled in by init_bad_prefixes(); declared here so the assignment there
// does not create an implicit global.
var BAD_PREFIXES;
// Build BAD_PREFIXES: scheme + prefix combinations ("http:" / "https:" with
// zero or one slash before `prefix`) that rewrite_url_() detects and strips.
//
// prefix - string appended to each scheme variant
function init_bad_prefixes(prefix) {
    BAD_PREFIXES = ["http:" + prefix, "https:" + prefix,
                    "http:/" + prefix, "https:/" + prefix];
}
// Rewrite a url so that it points into the replay archive.
//
// url - value to rewrite; objects are converted with toString(), other
//       non-string values (and falsy values) are returned untouched
//
// Returns the rewritten url, or the input unchanged when no rewrite applies.
function rewrite_url_(url) {
    // If undefined (or otherwise falsy), just return it
    if (!url) {
        return url;
    }

    var urltype_ = (typeof url);

    // If object, use toString
    if (urltype_ == "object") {
        url = url.toString();
    } else if (urltype_ != "string") {
        return url;
    }

    // proxy mode: If no wb_replay_prefix, only rewrite https:// -> http://
    if (!wb_replay_prefix) {
        if (starts_with(url, HTTPS_PREFIX)) {
            return HTTP_PREFIX + url.substr(HTTPS_PREFIX.length);
        } else {
            return url;
        }
    }

    // just in case wombat reference made it into url!
    url = url.replace("WB_wombat_", "");

    // ignore anchors, about, data, mailto, javascript
    if (starts_with(url, IGNORE_PREFIXES)) {
        return url;
    }

    // If starts with prefix, no rewriting needed
    // Only check replay prefix (no date) as date may be different for each
    // capture
    if (starts_with(url, wb_replay_prefix) ||
        starts_with(url, window.location.origin + wb_replay_prefix)) {
        return url;
    }

    // If server relative url, add prefix and original host
    if (url.charAt(0) == "/" && !starts_with(url, REL_PREFIX)) {
        // Already contains the capture date part, don't make any changes!
        if (wb_capture_date_part && url.indexOf(wb_capture_date_part) >= 0) {
            return url;
        }

        return wb_replay_date_prefix + wb_orig_host + url;
    }

    // If full url starting with http://, https:// or //, add prefix
    var prefix = starts_with(url, VALID_PREFIXES);

    if (prefix) {
        // Url already points at the replay host itself
        if (starts_with(url, prefix + window.location.host + '/')) {
            return url;
        }
        return wb_replay_date_prefix + url;
    }

    // Check for common bad prefixes and remove them
    prefix = starts_with(url, BAD_PREFIXES);

    if (prefix) {
        url = extract_orig(url);
        return wb_replay_date_prefix + url;
    }

    // May or may not be a hostname, call function to determine.
    // If it is, add the replay prefix and the original scheme.
    if (is_host_url(url) && !starts_with(url, window.location.host + '/')) {
        return wb_replay_date_prefix + wb_orig_scheme + url;
    }

    return url;
}
// Recover the original url from a replay (rewritten) url.
//
// href - url or location-like value; falsy input yields ""
//
// In proxy mode (no wb_replay_prefix) href is returned unchanged. Otherwise
// the part before an embedded "/http" is dropped; failing that, the replay
// prefix and a leading "xx_/" segment (two characters, "_", "/") are
// stripped and "http://" is prepended if needed. One trailing slash is
// removed from the result.
function extract_orig(href) {
    if (!href) {
        return "";
    }

    // proxy mode: no extraction needed
    if (!wb_replay_prefix) {
        return href;
    }

    href = href.toString();

    // Search from position 1 so a url that itself begins with "/http" is
    // not treated as already extracted.
    var index = href.indexOf("/http", 1);

    // extract original url from wburl
    if (index > 0) {
        href = href.substr(index + 1);
    } else {
        index = href.indexOf(wb_replay_prefix);
        if (index >= 0) {
            href = href.substr(index + wb_replay_prefix.length);
        }

        if ((href.length > 4) &&
            (href.charAt(2) == "_") &&
            (href.charAt(3) == "/")) {
            href = href.substr(4);
        }

        if (!starts_with(href, "http")) {
            href = HTTP_PREFIX + href;
        }
    }

    // remove trailing slash
    if (ends_with(href, "/")) {
        href = href.substring(0, href.length - 1);
    }

    return href;
}
// Define custom property
//
// Installs an accessor property `prop` on `obj`, backed by obj["_" + prop].
//
// obj      - target object
// prop     - property name
// value    - initial value stored in the backing field
// set_func - called as set_func.call(obj, newval) on assignment; its result
//            replaces the backing field unless it is undefined/null
// get_func - optional; called as get_func.call(obj, stored) on read
//
// Returns true when the accessor was installed. If Object.defineProperty
// throws, the value is assigned as a plain property and false is returned.
function def_prop(obj, prop, value, set_func, get_func) {
    var key = "_" + prop;
    obj[key] = value;

    try {
        Object.defineProperty(obj, prop, {
            configurable: false,
            enumerable: true,
            set: function(newval) {
                var result = set_func.call(obj, newval);
                if (result != undefined) {
                    obj[key] = result;
                }
            },
            get: function() {
                if (get_func) {
                    return get_func.call(obj, obj[key]);
                } else {
                    return obj[key];
                }
            }
        });
        return true;
    } catch (e) {
        // Property could not be redefined: fall back to a plain assignment
        obj[prop] = value;
        return false;
    }
}
//Define WombatLocation
// Constructor for a stand-in location object: wraps the real location `loc`
// and exposes the url components of the original (un-rewritten) url.
// NOTE(review): closing braces and some statements are missing from this
// copy of the file; comments describe only what is visible.
function WombatLocation(loc) {
// Keep the real location object and the href it had at construction time
this._orig_loc = loc;
this._orig_href = loc.href;
// Rewrite replace and assign functions
this.replace = function(url) {
return this._orig_loc.replace(rewrite_url(url));
this.assign = function(url) {
return this._orig_loc.assign(rewrite_url(url));
this.reload = loc.reload;
// Adapted from:
// https://gist.github.com/jlong/2428561
// An anchor element is used to split the original url into components
var parser = document.createElement('a');
var href = extract_orig(this._orig_href);
parser.href = href;
// Set below to whether both accessor properties could be installed
this._autooverride = false;
// Accessor helpers; they read and write the hash of the real location
var _set_hash = function(hash) {
this._orig_loc.hash = hash;
return this._orig_loc.hash;
var _get_hash = function() {
return this._orig_loc.hash;
var _get_url_with_hash = function(url) {
return url + this._orig_loc.hash;
href = parser.href;
var hash = parser.hash;
// Strip the fragment from href (cut at the last "#")
if (hash) {
var hidx = href.lastIndexOf("#");
if (hidx > 0) {
href = href.substring(0, hidx);
if (Object.defineProperty) {
// NOTE(review): both def_prop calls end in a comma here -- their setter and
// getter arguments are missing from this copy. The helpers defined above
// (_set_hash, _get_hash, _get_url_with_hash) are presumably what was passed;
// confirm against upstream.
var res1 = def_prop(this, "href", href,
var res2 = def_prop(this, "hash", parser.hash,
this._autooverride = res1 && res2;
} else {
this.href = href;
this.hash = parser.hash;
// Remaining url components are copied from the parser element
this.host = parser.host;
this.hostname = parser.hostname;
if (parser.origin) {
this.origin = parser.origin;
this.pathname = parser.pathname;
this.port = parser.port
this.protocol = parser.protocol;
this.search = parser.search;
this.toString = function() {
return this.href;
// Copy any remaining properties
// NOTE(review): `prop` is not declared with var, so it is an implicit global.
// The body of the hasOwnProperty check is not visible in this copy.
for (prop in loc) {
if (this.hasOwnProperty(prop)) {
if ((typeof loc[prop]) != "function") {
this[prop] = loc[prop];
// Navigate the real location when the page assigned a new url to the wombat
// location.
//
// req_href        - url requested by the page
// orig_href       - href the real location currently has
// actual_location - real location object whose href is updated
// wombat_loc      - unused; kept for interface compatibility
//
// Nothing happens when req_href is empty, equal to orig_href, or resolves to
// the same original url as orig_href (or when orig_href has no original url).
function update_location(req_href, orig_href, actual_location, wombat_loc) {
    if (!req_href) {
        return;
    }

    if (req_href == orig_href) {
        return;
    }

    var ext_orig = extract_orig(orig_href);
    var ext_req = extract_orig(req_href);

    if (!ext_orig || ext_orig == ext_req) {
        return;
    }

    var final_href = rewrite_url(req_href);

    console.log(actual_location.href + ' -> ' + final_href);

    actual_location.href = final_href;
}
// Inspect the value currently stored as a wombat location and pass it to
// update_location() together with the real location.
//
// wombat_loc - value of WB_wombat_location (a string or an object)
// is_top     - truthy: compare against window.top.location;
//              otherwise against window.location
function check_location_change(wombat_loc, is_top) {
var locType = (typeof wombat_loc);
var actual_location = (is_top ? window.top.location : window.location);
// String has been assigned to location, so assign it
if (locType == "string") {
update_location(wombat_loc, actual_location.href, actual_location);
// NOTE(review): no statement for the object case is visible in this copy;
// presumably it calls update_location() with the object's href -- confirm
// against upstream.
} else if (locType == "object") {
// Poll the wombat locations of this window (and of the top window, if it
// holds a different one) for changes made by the page.
//
// Returns false when a check is already in progress, undefined otherwise.
function check_all_locations() {
    if (wb_wombat_updating) {
        return false;
    }

    wb_wombat_updating = true;

    try {
        check_location_change(window.WB_wombat_location, false);

        // Only check top if its a different window
        if (window.self.WB_wombat_location != window.top.WB_wombat_location) {
            check_location_change(window.top.WB_wombat_location, true);
        }
    } finally {
        // Always release the guard; otherwise one exception would make every
        // later poll return early.
        wb_wombat_updating = false;
    }
}
// Replace Math.random with a repeatable generator seeded from `seed`.
//
// seed - integer or decimal string; stored in Math.seed
//
// Adapted from:
// http://indiegamr.com/generate-repeatable-random-numbers-in-js/
function init_seeded_random(seed) {
    Math.seed = parseInt(seed, 10);

    // A NaN seed would make every Math.random() call return NaN
    if (isNaN(Math.seed)) {
        Math.seed = 0;
    }

    function seeded_random() {
        Math.seed = (Math.seed * 9301 + 49297) % 233280;
        return Math.seed / 233280;
    }

    Math.random = seeded_random;
}
// Wrap a history method (e.g. pushState) so its url argument is rewritten.
//
// history   - history object to patch
// func_name - name of the method to wrap
//
// The original method is kept as history["_orig_" + func_name]. Returns the
// wrapper, or undefined when the method does not exist.
function copy_history_func(history, func_name) {
    var orig_func = history[func_name];

    if (!orig_func) {
        return;
    }

    history['_orig_' + func_name] = orig_func;

    function rewritten_func(state_obj, title, url) {
        url = rewrite_url(url);
        return orig_func.call(history, state_obj, title, url);
    }

    history[func_name] = rewritten_func;

    return rewritten_func;
}
// Patch XMLHttpRequest.prototype.open so request urls are rewritten.
//
// Does nothing when XMLHttpRequest (or its prototype.open) is unavailable.
// Requests made on an object with a truthy `_no_rewrite` keep their url.
function init_ajax_rewrite() {
    if (!window.XMLHttpRequest ||
        !window.XMLHttpRequest.prototype ||
        !window.XMLHttpRequest.prototype.open) {
        return;
    }

    var orig = window.XMLHttpRequest.prototype.open;

    function open_rewritten(method, url, is_async, user, password) {
        if (!this._no_rewrite) {
            url = rewrite_url(url);
        }

        // defaults to true
        if (is_async != false) {
            is_async = true;
        }

        // extra check for correct scheme here.. maybe move to rewrite_url..
        // If the url uses the original scheme and the current page uses a
        // different one, switch the url to the current scheme.
        var curr_scheme = window.location.protocol + '//';

        if (starts_with(url, wb_orig_scheme) && (wb_orig_scheme != curr_scheme)) {
            url = curr_scheme + url.substring(wb_orig_scheme.length);
        }

        return orig.call(this, method, url, is_async, user, password);
    }

    window.XMLHttpRequest.prototype.open = open_rewritten;
}
// Patch Element.prototype.setAttribute so "src" and "href" values are
// rewritten before being set.
//
// Does nothing when Element (or its prototype.setAttribute) is unavailable.
// Elements with a truthy `_no_rewrite` keep their value.
function init_setAttribute_override()
{
    if (!window.Element ||
        !window.Element.prototype ||
        !window.Element.prototype.setAttribute) {
        return;
    }

    var orig_setAttribute = window.Element.prototype.setAttribute;

    window.Element.prototype.setAttribute = function(name, value) {
        if (name) {
            // attribute names are matched case-insensitively
            var lowername = name.toLowerCase();
            if (lowername == "src" || lowername == "href") {
                if (!this._no_rewrite) {
                    value = rewrite_url(value);
                }
            }
        }

        orig_setAttribute.call(this, name, value);
    };
}
// Replace window.Image with a wrapper whose instances get an overridden
// "src" attribute (see override_attr).
//
// The original constructor is kept as window.__Image. Does nothing when
// window.Image is unavailable.
function init_image_override() {
    var OrigImage = window.Image;

    if (!OrigImage) {
        return;
    }

    window.__Image = OrigImage;

    window.Image = function (width, height) {
        // Returning an object from a constructor makes `new window.Image()`
        // evaluate to the real image element.
        var image = new OrigImage(width, height);
        override_attr(image, "src");
        return image;
    };
}
// Replace window.Date so that "now" is shifted to the capture time.
//
// timestamp - capture time in seconds (number or decimal string)
//
// The original constructor and its now() are kept as window.__Date and
// window.__Date_now. Date.UTC and Date.parse are carried over unchanged.
function init_date_override(timestamp) {
    timestamp = parseInt(timestamp, 10) * 1000;

    var OrigDate = window.Date;
    var orig_now = OrigDate.now;

    var timezone = new OrigDate().getTimezoneOffset() * 60 * 1000;
    // Offset subtracted from the real clock by the overridden now()
    var timediff = orig_now.call(OrigDate) - (timestamp - timezone);

    window.__Date = OrigDate;
    window.__Date_now = orig_now;

    var utc = OrigDate.UTC;
    var parse = OrigDate.parse;

    window.Date = function (A, B, C, D, E, F, G) {
        // Apply doesn't work for constructors and Date doesn't
        // seem to like undefined args, so must explicitly
        // call constructor for each possible args 0..7
        if (A === undefined) {
            return new OrigDate(window.Date.now());
        } else if (B === undefined) {
            return new OrigDate(A);
        } else if (C === undefined) {
            return new OrigDate(A, B);
        } else if (D === undefined) {
            return new OrigDate(A, B, C);
        } else if (E === undefined) {
            return new OrigDate(A, B, C, D);
        } else if (F === undefined) {
            return new OrigDate(A, B, C, D, E);
        } else if (G === undefined) {
            return new OrigDate(A, B, C, D, E, F);
        } else {
            return new OrigDate(A, B, C, D, E, F, G);
        }
    };

    // Share the prototype so `x instanceof Date` keeps working for dates
    // created before or after the override.
    window.Date.prototype = OrigDate.prototype;

    window.Date.now = function() {
        // Call the captured original rather than a bare global lookup
        return orig_now.call(OrigDate) - timediff;
    };

    window.Date.UTC = utc;
    window.Date.parse = parse;
}
// Disable web workers by clearing window.Worker.
//
// Does nothing when window.Worker is not defined.
function init_worker_override() {
    if (!window.Worker) {
        return;
    }

    // for now, disabling workers until override of worker content can be supported
    // hopefully, pages depending on workers will have a fallback
    window.Worker = undefined;
}
// Re-set one attribute of an element so that it passes through setAttribute
// (and any rewriting installed there).
//
// elem - element; ignored when falsy or without getAttribute
// name - attribute name
// func - optional transform applied to the value before it is set
//
// Empty values and "javascript:" values are left untouched.
function rewrite_attr(elem, name, func) {
    if (!elem || !elem.getAttribute) {
        return;
    }

    var value = elem.getAttribute(name);

    if (!value) {
        return;
    }

    if (starts_with(value, "javascript:")) {
        return;
    }

    if (func) {
        value = func(value);
    }

    // this now handles the actual rewrite
    elem.setAttribute(name, value);
}
// Rewrite every url(...) reference inside a css style string.
//
// value - style text
//
// Returns the text with each url passed through rewrite_url(); surrounding
// quotes, backslashes and whitespace are preserved.
function rewrite_style(value)
{
    // Declared locally so no implicit global is created
    var STYLE_REGEX = /(url\s*\(\s*[\\"']*)([^)'"]+)([\\"']*\s*\))/g;

    function style_replacer(match, n1, n2, n3, offset, string) {
        return n1 + rewrite_url(n2) + n3;
    }

    return value.replace(STYLE_REGEX, style_replacer);
}
// Rewrite the url-carrying attributes of one element: "src" and "href" are
// re-set through rewrite_attr(), "style" is passed through rewrite_style().
//
// elem - element to process
function rewrite_elem(elem)
rewrite_attr(elem, "src");
rewrite_attr(elem, "href");
rewrite_attr(elem, "style", rewrite_style);
// NOTE(review): the body of this check is not visible in this copy;
// presumably the "crossorigin" attribute is removed -- confirm upstream.
if (elem && elem.getAttribute && elem.getAttribute("crossorigin")) {
// Turn an attribute into an accessor property on the object, so that
// `obj[attr] = x` goes through setAttribute and reads go through
// getAttribute.
//
// obj  - element-like object (needs getAttribute / setAttribute)
// attr - attribute / property name
function override_attr(obj, attr) {
    var setter = function(orig) {
        // The value is passed on as-is; setAttribute is where any rewriting
        // happens.
        var val = orig;
        this.setAttribute(attr, val);
        return val;
    };

    var getter = function(val) {
        // The stored value is ignored; the live attribute is returned
        var res = this.getAttribute(attr);
        return res;
    };

    var curr_src = obj.getAttribute(attr);

    def_prop(obj, attr, curr_src, setter, getter);
}
// Patch Node.prototype insertion methods so inserted nodes are processed.
// NOTE(review): this copy is missing closing braces and several statements
// (the guard body, the loop body, and any calls to replace_dom_func);
// comments describe only what is visible.
function init_dom_override() {
// Nothing to patch without Node.prototype
if (!Node || !Node.prototype) {
// Replace Node.prototype[funcname] with a wrapper around the original
function replace_dom_func(funcname) {
var orig = Node.prototype[funcname];
Node.prototype[funcname] = function() {
// The node being inserted is the first argument
var child = arguments[0];
// Descendants of the inserted node: link/iframe elements for a
// DocumentFragment, every element otherwise
var desc;
if (child instanceof DocumentFragment) {
desc = child.querySelectorAll("a[href], iframe[src]");
} else if (child.getElementsByTagName) {
desc = child.getElementsByTagName("*");
if (desc) {
// NOTE(review): loop body not visible; presumably each descendant is passed
// to rewrite_elem -- confirm upstream.
for (var i = 0; i < desc.length; i++) {
// Run the original DOM method with the original arguments
var created = orig.apply(this, arguments);
if (!created) {
if (created.tagName == "IFRAME") {
// Give the new frame a WB_wombat_location pointing at its own location
if (created.contentWindow) {
created.contentWindow.window.WB_wombat_location = created.contentWindow.window.location;
override_attr(created, "src");
// NOTE(review): SRC_TAGS is not defined anywhere in this copy.
} else if (created.tagName && starts_with(created.tagName, SRC_TAGS)) {
override_attr(created, "src");
return created;
// Patch postMessage so a specific targetOrigin is replaced by the origin of
// the current (replay) page; "*" and empty origins are passed through.
//
// Installed on window, Window.prototype and every accessible frame. Does
// nothing when Window.prototype.postMessage is unavailable.
function init_postmessage_override()
{
    if (!window.Window ||
        !window.Window.prototype ||
        !window.Window.prototype.postMessage) {
        return;
    }

    var orig = window.Window.prototype.postMessage;

    var postmessage_rewritten = function(message, targetOrigin, transfer) {
        if (targetOrigin && targetOrigin != "*") {
            targetOrigin = window.location.origin;
        }

        return orig.call(this, message, targetOrigin, transfer);
    };

    window.postMessage = postmessage_rewritten;
    window.Window.prototype.postMessage = postmessage_rewritten;

    for (var i = 0; i < window.frames.length; i++) {
        try {
            window.frames[i].postMessage = postmessage_rewritten;
        } catch (e) {
            // Best effort: a frame that rejects the assignment is skipped
        }
    }
}
// Patch window.open so the url argument is rewritten.
//
// Installed on window, Window.prototype and every accessible frame. Does
// nothing when Window.prototype.open is unavailable.
function init_open_override()
{
    if (!window.Window ||
        !window.Window.prototype ||
        !window.Window.prototype.open) {
        return;
    }

    var orig = window.Window.prototype.open;

    var open_rewritten = function(strUrl, strWindowName, strWindowFeatures) {
        strUrl = rewrite_url(strUrl);
        return orig.call(this, strUrl, strWindowName, strWindowFeatures);
    };

    window.open = open_rewritten;
    window.Window.prototype.open = open_rewritten;

    for (var i = 0; i < window.frames.length; i++) {
        try {
            window.frames[i].open = open_rewritten;
        } catch (e) {
            // Best effort: a frame that rejects the assignment is skipped
        }
    }
}
// Expose document.WB_wombat_cookie: reads return document.cookie, writes
// rewrite the cookie's Path attribute before storing it in document.cookie.
function init_cookies_override()
{
    var cookie_path_regex = /\bPath=\'?\"?([^;'"\s]+)/i;

    var get_cookie = function() {
        return document.cookie;
    };

    var set_cookie = function(value) {
        var matched = value.match(cookie_path_regex);

        // if has cookie path, rewrite and replace
        if (matched) {
            // Replace the path at the position where it was matched. A plain
            // value.replace(path, ...) would hit the first occurrence of the
            // same text, which may be inside the cookie value (e.g. "a=/").
            var path = matched[1];
            var path_start = matched.index + matched[0].length - path.length;

            value = value.substring(0, path_start) +
                    rewrite_url(path) +
                    value.substring(path_start + path.length);
        }

        document.cookie = value;
    };

    def_prop(document, "WB_wombat_cookie", document.cookie,
             set_cookie,
             get_cookie);
}
// Replace document.write with a version that parses the given markup with
// DOMParser instead of writing it directly.
function init_write_override()
document.write = function(string) {
// Parse the markup as a separate html document
var doc = new DOMParser().parseFromString(string, "text/html");
if (doc) {
var children = doc.body.children;
// NOTE(review): the loop body is not visible in this copy; presumably each
// parsed child is appended to the live document -- confirm upstream.
for (var i = 0; i < children.length; i++) {
// Entry point: store the replay settings in the module globals and install
// the wombat location objects and history overrides.
//
// replay_prefix - replay url prefix; falsy selects proxy mode
// capture_date  - capture date string used in the replay url
// orig_scheme   - scheme of the original page, without "://"
// orig_host     - host of the original page
// timestamp     - not used in the code visible here
// mod           - string appended after capture_date in the replay prefix
//
// NOTE(review): closing braces and several statements are missing from this
// copy; comments describe only what is visible.
function wombat_init(replay_prefix, capture_date, orig_scheme, orig_host, timestamp, mod) {
wb_replay_prefix = replay_prefix;
if (wb_replay_prefix) {
wb_replay_date_prefix = replay_prefix + capture_date + mod + "/";
if (capture_date.length > 0) {
wb_capture_date_part = "/" + capture_date + "/";
} else {
wb_capture_date_part = "";
wb_orig_scheme = orig_scheme + '://';
wb_orig_host = wb_orig_scheme + orig_host;
// Location
var wombat_location = new WombatLocation(window.self.location);
// When accessor properties could be installed on the location object,
// WB_wombat_location itself is defined as an accessor on window/document.
if (wombat_location._autooverride) {
// Assigning a string to WB_wombat_location sets the href of the stored
// location object ("_WB_wombat_location" is def_prop's backing field);
// "about:" urls are ignored.
var setter = function(val) {
if (typeof(val) == "string") {
if (starts_with(val, "about:")) {
return undefined;
this._WB_wombat_location.href = val;
def_prop(window, "WB_wombat_location", wombat_location, setter);
def_prop(document, "WB_wombat_location", wombat_location, setter);
} else {
// Fallback: plain properties, watched by polling check_all_locations
window.WB_wombat_location = wombat_location;
document.WB_wombat_location = wombat_location;
// Check quickly after page load
setTimeout(check_all_locations, 500);
// Check periodically (every 500 ms)
setInterval(check_all_locations, 500);
// Truthy when the top window carries a wbinfo object with is_frame set
var is_framed = (window.top.wbinfo && window.top.wbinfo.is_frame);
// Walk up the parent chain, stopping at the window whose parent is the
// top window (or itself)
function find_next_top(win) {
while ((win.parent != win) && (win.parent != win.top)) {
win = win.parent;
return win;
// Running inside a frame: set up the top window's wombat location and
// decide which window WB_wombat_top refers to
if (window.self.location != window.top.location) {
if (is_framed) {
window.top.WB_wombat_location = window.WB_wombat_location;
window.WB_wombat_top = find_next_top(window.self);
} else {
window.top.WB_wombat_location = new WombatLocation(window.top.location);
window.WB_wombat_top = window.top;
} else {
window.WB_wombat_top = window.top;
//if (window.opener) {
// window.opener.WB_wombat_location = copy_location_obj(window.opener.location);
// Domain
document.WB_wombat_domain = orig_host;
document.WB_wombat_referrer = extract_orig(document.referrer);
// History
copy_history_func(window.history, 'pushState');
copy_history_func(window.history, 'replaceState');
// NOTE(review): the section markers below have no code in this copy. The
// matching init_* functions are defined in this file but never called in the
// visible text; presumably the calls belong here -- confirm upstream.
// open
// postMessage
// write
// Ajax
// setAttribute
// Image
// Cookies
// DOM
// Random
// Date
// expose functions
this.extract_orig = extract_orig;
// The enclosing function returns wombat_init.
return wombat_init;
// Publish the module on window as well.
// NOTE(review): the line closing the enclosing function expression is not
// visible in this copy.
window._WBWombat = _WBWombat;