diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py index 36601e98..a855cda2 100644 --- a/pywb/rewrite/html_rewriter.py +++ b/pywb/rewrite/html_rewriter.py @@ -45,6 +45,7 @@ class HTMLRewriterMixin(object): 'q': {'cite': defmod}, 'ref': {'href': 'oe_'}, 'script': {'src': 'js_'}, + 'source': {'src': 'oe_'}, 'div': {'data-src': defmod, 'data-uri': defmod}, 'li': {'data-src': defmod, diff --git a/pywb/rewrite/regex_rewriters.py b/pywb/rewrite/regex_rewriters.py index 5f429339..0fffc7c2 100644 --- a/pywb/rewrite/regex_rewriters.py +++ b/pywb/rewrite/regex_rewriters.py @@ -126,6 +126,8 @@ class JSLinkAndLocationRewriter(JSLinkOnlyRewriter): rules = rules + [ (r'(? ' + rewritten); } else { - //console.log('NOT REWRITTEN ' + url); + console.log('NOT REWRITTEN ' + url); } return rewritten; } + + //============================================ + var HTTP_PREFIX = "http://"; + var HTTPS_PREFIX = "https://"; + var REL_PREFIX = "//"; + + var VALID_PREFIXES = [HTTP_PREFIX, HTTPS_PREFIX, REL_PREFIX]; + var IGNORE_PREFIXES = ["#", "about:", "data:", "mailto:", "javascript:"]; + + //============================================ function rewrite_url_(url) { - var http_prefix = "http://"; - var https_prefix = "https://"; - var rel_prefix = "//"; - // If not dealing with a string, just return it if (!url || (typeof url) != "string") { return url; } - // ignore anchors - if (starts_with(url, "#")) { + // just in case wombat reference made it into url! + url = url.replace("WB_wombat_", ""); + + // ignore anchors, about, data + if (starts_with(url, IGNORE_PREFIXES)) { return url; } // If starts with prefix, no rewriting needed // Only check replay prefix (no date) as date may be different for each // capture - if (starts_with(url, wb_replay_prefix)) { + if (starts_with(url, wb_replay_prefix) || starts_with(url, window.location.origin + wb_replay_prefix)) { return url; } // If server relative url, add prefix and original host - if (url.charAt(0) == "/" && !starts_with(url, rel_prefix)) { + if (url.charAt(0) == "/" && !starts_with(url, REL_PREFIX)) { // Already a relative url, don't make any changes! - if (url.indexOf(wb_capture_date_part) >= 0) { + if (wb_capture_date_part && url.indexOf(wb_capture_date_part) >= 0) { return url; } @@ -116,9 +133,7 @@ WB_wombat_init = (function() { // If full url starting with http://, add prefix - var prefix = starts_with(url, http_prefix) || - starts_with(url, https_prefix) || - starts_with(url, rel_prefix); + var prefix = starts_with(url, VALID_PREFIXES); if (prefix) { if (starts_with(url, prefix + window.location.host + '/')) { @@ -130,25 +145,12 @@ WB_wombat_init = (function() { // May or may not be a hostname, call function to determine // If it is, add the prefix and make sure port is removed if (is_host_url(url) && !starts_with(url, window.location.host + '/')) { - return wb_replay_date_prefix + http_prefix + url; + return wb_replay_date_prefix + HTTP_PREFIX + url; } return url; } - //============================================ - function copy_object_fields(obj) { - var new_obj = {}; - - for (prop in obj) { - if ((typeof obj[prop]) != "function") { - new_obj[prop] = obj[prop]; - } - } - - return new_obj; - } - //============================================ function extract_orig(href) { if (!href) { @@ -162,6 +164,20 @@ WB_wombat_init = (function() { // extract original url from wburl if (index > 0) { href = href.substr(index + 1); + } else { + index = href.indexOf(wb_replay_prefix); + if (index >= 0) { + href = href.substr(index + wb_replay_prefix.length); + } + if ((href.length > 4) && + (href.charAt(2) == "_") && + (href.charAt(3) == "/")) { + href = href.substr(4); + } + + if (!starts_with(href, "http")) { + href = HTTP_PREFIX + href; + } } // remove trailing slash @@ -171,55 +187,142 @@ WB_wombat_init = (function() { return href; } - + //============================================ - function copy_location_obj(loc) { - var new_loc = copy_object_fields(loc); - - new_loc._orig_loc = loc; - new_loc._orig_href = loc.href; + // Define custom property + function defProp(obj, prop, value, set_func, get_func) { + var key = "_" + prop; + obj[key] = value; + + try { + Object.defineProperty(obj, prop, { + configurable: false, + enumerable: true, + set: function(newval) { + var result = set_func.call(obj, newval); + if (result != undefined) { + obj[key] = result; + } + }, + get: function() { + if (get_func) { + return get_func.call(obj, obj[key]); + } else { + return obj[key]; + } + } + }); + return true; + } catch (e) { + console.log(e); + obj[prop] = value; + return false; + } + } + + //============================================ + //Define WombatLocation + + function WombatLocation(loc) { + this._orig_loc = loc; + this._orig_href = loc.href; // Rewrite replace and assign functions - new_loc.replace = function(url) { - this._orig_loc.replace(rewrite_url(url)); + this.replace = function(url) { + return this._orig_loc.replace(rewrite_url(url)); } - new_loc.assign = function(url) { - this._orig_loc.assign(rewrite_url(url)); + this.assign = function(url) { + return this._orig_loc.assign(rewrite_url(url)); } - new_loc.reload = loc.reload; - + this.reload = loc.reload; + // Adapted from: // https://gist.github.com/jlong/2428561 var parser = document.createElement('a'); - parser.href = extract_orig(new_loc._orig_href); + var href = extract_orig(this._orig_href); + parser.href = href; + + //console.log(this._orig_href + " -> " + tmp_href); + this._autooverride = false; + + var _set_hash = function(hash) { + this._orig_loc.hash = hash; + return this._orig_loc.hash; + } + + var _get_hash = function() { + return this._orig_loc.hash; + } + + var _get_url_with_hash = function(url) { + return url + this._orig_loc.hash; + } + + href = parser.href; + var hash = parser.hash; + + if (hash) { + var hidx = href.lastIndexOf("#"); + if (hidx > 0) { + href = href.substring(0, hidx); + } + } + + if (Object.defineProperty) { + var res1 = defProp(this, "href", href, + this.assign, + _get_url_with_hash); + + var res2 = defProp(this, "hash", parser.hash, + _set_hash, + _get_hash); + + this._autooverride = res1 && res2; + } + + this.host = parser.host; + this.hostname = parser.hostname; - new_loc.hash = parser.hash; - new_loc.host = parser.host; - new_loc.hostname = parser.hostname; - new_loc.href = parser.href; - - if (new_loc.origin) { - new_loc.origin = parser.origin; + if (parser.origin) { + this.origin = parser.origin; } - new_loc.pathname = parser.pathname; - new_loc.port = parser.port - new_loc.protocol = parser.protocol; - new_loc.search = parser.search; + this.pathname = parser.pathname; + this.port = parser.port + this.protocol = parser.protocol; + this.search = parser.search; - new_loc.toString = function() { + this.toString = function() { return this.href; } - - return new_loc; + + // Copy any remaining properties + for (prop in loc) { + if (this.hasOwnProperty(prop)) { + continue; + } + + if ((typeof loc[prop]) != "function") { + this[prop] = loc[prop]; + } + } } //============================================ - function update_location(req_href, orig_href, actual_location) { - if (!req_href || req_href == orig_href) { + function update_location(req_href, orig_href, actual_location, wombat_loc) { + if (!req_href) { return; } + if (req_href == orig_href) { + // Reset wombat loc to the unrewritten version + //if (wombat_loc) { + // wombat_loc.href = extract_orig(orig_href); + //} + return; + } + + ext_orig = extract_orig(orig_href); ext_req = extract_orig(req_href); @@ -235,19 +338,19 @@ WB_wombat_init = (function() { } //============================================ - function check_location_change(loc, is_top) { - var locType = (typeof loc); + function check_location_change(wombat_loc, is_top) { + var locType = (typeof wombat_loc); var actual_location = (is_top ? window.top.location : window.location); - //console.log(loc.href); - // String has been assigned to location, so assign it if (locType == "string") { - update_location(loc, actual_location.href, actual_location) - + update_location(wombat_loc, actual_location.href, actual_location); + } else if (locType == "object") { - update_location(loc.href, loc._orig_href, actual_location); + update_location(wombat_loc.href, + wombat_loc._orig_href, + actual_location); } } @@ -261,10 +364,21 @@ WB_wombat_init = (function() { check_location_change(window.WB_wombat_location, false); - if (window.self.location != window.top.location) { + // Only check top if its a different window + if (window.self.WB_wombat_location != window.top.WB_wombat_location) { check_location_change(window.top.WB_wombat_location, true); } +// lochash = window.WB_wombat_location.hash; +// +// if (lochash) { +// window.location.hash = lochash; +// +// //if (window.top.update_wb_url) { +// // window.top.location.hash = lochash; +// //} +// } + wb_wombat_updating = false; } @@ -328,6 +442,7 @@ WB_wombat_init = (function() { window.XMLHttpRequest.prototype.open = open_rewritten; } + //============================================ function init_worker_override() { if (!window.Worker) { return; @@ -338,6 +453,7 @@ WB_wombat_init = (function() { window.Worker = undefined; } + //============================================ function rewrite_attr(elem, name) { if (!elem || !elem.getAttribute) { return; @@ -359,6 +475,7 @@ WB_wombat_init = (function() { elem.setAttribute(name, value); } + //============================================ function init_dom_override() { if (!Node || !Node.prototype) { return; @@ -376,9 +493,9 @@ WB_wombat_init = (function() { var desc; if (child instanceof DocumentFragment) { - desc = child.querySelectorAll("*[href],*[src]"); + //desc = child.querySelectorAll("*[href],*[src]"); } else if (child.getElementsByTagName) { - desc = child.getElementsByTagName("*"); + //desc = child.getElementsByTagName("*"); } if (desc) { @@ -401,19 +518,55 @@ WB_wombat_init = (function() { //============================================ function wombat_init(replay_prefix, capture_date, orig_host, timestamp) { wb_replay_prefix = replay_prefix; - wb_replay_date_prefix = replay_prefix + capture_date + "/"; - wb_capture_date_part = "/" + capture_date + "/"; - wb_orig_host = "http://" + orig_host; + wb_replay_date_prefix = replay_prefix + capture_date + "em_/"; + + if (capture_date.length > 0) { + wb_capture_date_part = "/" + capture_date + "/"; + } else { + wb_capture_date_part = ""; + } + + wb_orig_host = HTTP_PREFIX + orig_host; // Location - window.WB_wombat_location = copy_location_obj(window.self.location); - document.WB_wombat_location = window.WB_wombat_location; + var wombat_location = new WombatLocation(window.self.location); + + if (wombat_location._autooverride) { + + var setter = function(val) { + if (typeof(val) == "string") { + if (starts_with(val, "about:")) { + return undefined; + } + this._WB_wombat_location.href = val; + } + } + + defProp(window, "WB_wombat_location", wombat_location, setter); + defProp(document, "WB_wombat_location", wombat_location, setter); + } else { + // Check quickly after page load + setTimeout(check_all_locations, 500); + + // Check periodically every few seconds + setInterval(check_all_locations, 500); + } + + var is_framed = (window.top.update_wb_url != undefined); - //if (window.self.location != window.top.location) { - // window.top.WB_wombat_location = copy_location_obj(window.top.location); - //} - window.top.WB_wombat_location = window.WB_wombat_location; + if (window.self.location != window.top.location) { + if (is_framed) { + window.top.WB_wombat_location = window.WB_wombat_location; + window.WB_wombat_top = window.self; + } else { + window.top.WB_wombat_location = new WombatLocation(window.top.location); + + window.WB_wombat_top = window.top; + } + } else { + window.WB_wombat_top = window.top; + } //if (window.opener) { // window.opener.WB_wombat_location = copy_location_obj(window.opener.location); @@ -421,6 +574,7 @@ WB_wombat_init = (function() { // Domain document.WB_wombat_domain = orig_host; + document.WB_wombat_referrer = extract_orig(document.referrer); // History copy_history_func(window.history, 'pushState'); @@ -434,15 +588,9 @@ WB_wombat_init = (function() { init_dom_override(); // Random - init_seeded_random(timestamp); + init_seeded_random(timestamp); } - // Check quickly after page load - setTimeout(check_all_locations, 100); - - // Check periodically every few seconds - setInterval(check_all_locations, 500); - return wombat_init; })(this);