From b4d4be8a64fba7e18e55b0f760785414289b5ea5 Mon Sep 17 00:00:00 2001 From: John Berlin Date: Mon, 20 Aug 2018 16:12:43 -0400 Subject: [PATCH] Advanced preservation of media query based style rules and complete preservation of srcset values to fix https://github.com/webrecorder/webrecorder/issues/64. (#359) wombat.js: - Finalized PreserveWorker that preserves srcset values and Media Query values - Deferred extraction and preservation of the values to be preserved so that the UI thread is not clobbered - Hooked into places where wombat rewrites the values we are interested in wombatPreservationWorker.js: - Updated handling of srcset extraction now that we are sending wombat srcset rewrites - Added check to see if we have seen a URL to be fetched - Added light polyfill of Promise and fetch if they are not defined in wombatPreservationWorker.js, for Safari wombat.spec.js: - Updated to include values necessary to work with PWorker changes. --- karma-tests/wombat.spec.js | 8 + pywb/static/wombat.js | 242 ++++++++++++++++++++---- pywb/static/wombatPreservationWorker.js | 205 ++++++++++++++++++++ 3 files changed, 420 insertions(+), 35 deletions(-) create mode 100644 pywb/static/wombatPreservationWorker.js diff --git a/karma-tests/wombat.spec.js b/karma-tests/wombat.spec.js index 343c295c..bdaf49f0 100644 --- a/karma-tests/wombat.spec.js +++ b/karma-tests/wombat.spec.js @@ -127,6 +127,8 @@ describe('WombatJS', function () { wbinfo = { wombat_opts: {}, wombat_ts: '', + is_live: false, + top_url: '' }; }, wombatScript: wombatScript, @@ -142,6 +144,8 @@ describe('WombatJS', function () { wombat_opts: {}, prefix: window.location.origin, wombat_ts: '', + is_live: false, + top_url: '' }; }, wombatScript: wombatScript, @@ -179,6 +183,8 @@ describe('WombatJS', function () { wombat_opts: {}, prefix: window.location.origin, wombat_ts: '', + is_live: false, + top_url: '' }; }, wombatScript: wombatScript, @@ -199,6 +205,8 @@ describe('WombatJS', function () { initScript: function () { 
wbinfo = { wombat_opts: {}, + is_live: false, + top_url: '' }; }, wombatScript: wombatScript, diff --git a/pywb/static/wombat.js b/pywb/static/wombat.js index 74f4dc46..3f565a8c 100644 --- a/pywb/static/wombat.js +++ b/pywb/static/wombat.js @@ -78,6 +78,9 @@ var _WBWombat = function($wbwindow, wbinfo) { var wb_setAttribute = $wbwindow.Element.prototype.setAttribute; var wb_getAttribute = $wbwindow.Element.prototype.getAttribute; var wb_funToString = Function.prototype.toString; + var WBPreserWorker; + var wbSheetMediaQChecker; + var wbUsePresWorker = $wbwindow.Worker != null && wbinfo.is_live; var wb_info; @@ -1326,6 +1329,131 @@ var _WBWombat = function($wbwindow, wbinfo) { } //============================================ + function initPreserveWorker() { + if (!wbUsePresWorker) { + return; + } + + var Preserver = (function(Worker) { + function PWorker(prefix, mod) { + if (!(this instanceof PWorker)) { + return new PWorker(prefix, mod); + } + if ($wbwindow === $wbwindow.__WB_replay_top) { + // we are top and can will own this worker + // setup URL for the kewl case + var workerURL = wbinfo.static_prefix + + 'wombatPreservationWorker.js?prefix=' + + encodeURIComponent(prefix) + '&mod=' + + encodeURIComponent(mod); + this.worker = new Worker(workerURL); + } else { + this.worker = null; + } + } + + PWorker.prototype.deferredSheetExtraction = function(rules) { + // if no rules this a no op + if (rules.length === 0) return; + function extract() { + // loop through each rule of the stylesheet + var media = []; + for (var j = 0; j < rules.length; ++j) { + var rule = rules[j]; + if (rule instanceof CSSMediaRule) { + // we are a media rule so get its text + media.push(rule.cssText); + } + } + if (media.length > 0) { + // we have some media rules to preserve + WBPreserWorker.preserveMedia(media); + } + } + // defer things until next time the Promise.resolve Qs are cleared + $wbwindow.Promise.resolve().then(extract); + }; + + PWorker.prototype.terminate = function() { + // 
terminate the worker, a no op when not replay top + if ($wbwindow === $wbwindow.__WB_replay_top) { + this.worker.terminate(); + } + }; + + PWorker.prototype.postMessage = function(msg) { + if ($wbwindow === $wbwindow.__WB_replay_top) { + // we are actually replay top so send directly to worker + this.worker.postMessage(msg); + } else { + // send message to replay top + $wbwindow.__WB_replay_top.__orig_postMessage({ + 'wb_type': 'pworker', 'msg': msg, + }, '*'); + } + }; + + PWorker.prototype.preserveSrcset = function(srcset) { + // send values from rewrite_srcset to the worker + this.postMessage({ + 'type': 'values', + 'srcset': {'values': srcset, 'presplit': true}, + }); + }; + + PWorker.prototype.preserveMedia = function(media) { + // send CSSMediaRule values to the worker + this.postMessage({'type': 'values', 'media': media}) + }; + + PWorker.prototype.extractFromLocalDoc = function() { + // get the values to be preserved from the documents stylesheets + // and all elements with a srcset + var media = []; + var srcset = []; + var sheets = $wbwindow.document.styleSheets; + var i = 0; + for (; i < sheets.length; ++i) { + var sheet = sheets[i]; + var rules = sheet.cssRules; + for (var j = 0; j < rules.length; ++j) { + var rule = rules[j]; + if (rule instanceof CSSMediaRule) { + media.push(rule.cssText); + } + } + } + var srcsetElems = $wbwindow.document.querySelectorAll('*[srcset]'); + for (i = 0; i < srcsetElems.length; i++) { + var srcsetElem = srcsetElems[i]; + if (wb_getAttribute) { + srcset.push(wb_getAttribute.call(srcsetElem,'srcset')); + } else { + srcset.push(srcsetElem.getAttribute('srcset')); + } + } + this.postMessage({ + 'type': 'values', + 'media': media, + 'srcset': {'values': srcset, 'presplit': false}, + }); + }; + + return PWorker; + })($wbwindow.Worker); + + WBPreserWorker = new Preserver(wb_abs_prefix, wbinfo.mod); + + wbSheetMediaQChecker = function checkStyle () { + // used only for link[rel='stylesheet'] so we remove our listener + 
this.removeEventListener('load', wbSheetMediaQChecker); + // check no op condition + if (this.sheet == null) return; + // defer extraction to be nice :) + WBPreserWorker.deferredSheetExtraction(this.sheet.cssRules); + }; + } + function rewriteWorker(workerUrl) { var fetch = true; var makeBlob = false; @@ -1521,7 +1649,10 @@ var _WBWombat = function($wbwindow, wbinfo) { for (var i = 0; i < values.length; i++) { values[i] = rewrite_url(values[i].trim()); } - + if (wbUsePresWorker) { + // send post split values to preservation worker + WBPreserWorker.preserveSrcset(values); + } return values.join(", "); } @@ -1617,33 +1748,59 @@ var _WBWombat = function($wbwindow, wbinfo) { } var changed; - - if (elem.tagName == "STYLE") { - var new_content = rewrite_style(elem.textContent); - if (elem.textContent != new_content) { - elem.textContent = new_content; - changed = true; - } - } else if (elem.tagName == "OBJECT") { - changed = rewrite_attr(elem, "data", true); - } else if (elem.tagName == "FORM") { - changed = rewrite_attr(elem, "action", true); - //} else if (elem.tagName == "INPUT") { - // changed = rewrite_attr(elem, "value", true); - } else if (elem.tagName == "IFRAME" || elem.tagName == "FRAME") { - changed = rewrite_frame_src(elem, "src"); - } else if (elem.tagName == "SCRIPT") { - changed = rewrite_script(elem); - } else if (elem.tagName == "image") { - changed = rewrite_attr(elem, "xlink:href"); - } else if (elem instanceof SVGElement && elem.hasAttribute('filter')) { - changed = rewrite_attr(elem, 'filter'); - } else { - changed = rewrite_attr(elem, "src"); - changed = rewrite_attr(elem, "srcset") || changed; - changed = rewrite_attr(elem, "href") || changed; - changed = rewrite_attr(elem, "style") || changed; - changed = rewrite_attr(elem, "poster") || changed; + // we use a switch now cause perf and complexity + switch (elem.tagName) { + case 'STYLE': + var new_content = rewrite_style(elem.textContent); + if (elem.textContent !== new_content) { + 
elem.textContent = new_content; + changed = true; + if (wbUsePresWorker && elem.sheet != null) { + // we have a stylesheet so lets be nice to UI thread + // and defer extraction + WBPreserWorker.deferredSheetExtraction(elem.sheet.cssRules); + } + } + break; + case 'LINK': + changed = rewrite_attr(elem, 'href'); + if (wbUsePresWorker && elem.rel === 'stylesheet') { + // we can only check link[rel='stylesheet'] when it loads + elem.addEventListener('load', wbSheetMediaQChecker); + } + break; + case 'IMG': + changed = rewrite_attr(elem, 'src'); + changed = rewrite_attr(elem, 'srcset') || changed; + changed = rewrite_attr(elem, 'style') || changed; + break; + case 'OBJECT': + changed = rewrite_attr(elem, "data", true); + break; + case 'FORM': + changed = rewrite_attr(elem, "action", true); + break; + case 'IFRAME': + case 'FRAME': + changed = rewrite_frame_src(elem, "src"); + break; + case 'SCRIPT': + changed = rewrite_script(elem); + break; + case 'image': + changed = rewrite_attr(elem, "xlink:href"); + break; + default: + if (elem instanceof SVGElement && elem.hasAttribute('filter')) { + changed = rewrite_attr(elem, 'filter'); + } else { + changed = rewrite_attr(elem, 'src'); + changed = rewrite_attr(elem, 'srcset') || changed; + changed = rewrite_attr(elem, 'href') || changed; + changed = rewrite_attr(elem, 'style') || changed; + changed = rewrite_attr(elem, 'poster') || changed; + } + break; } if (elem.getAttribute) { @@ -1657,7 +1814,6 @@ var _WBWombat = function($wbwindow, wbinfo) { changed = true; } } - return changed; } @@ -2030,14 +2186,18 @@ var _WBWombat = function($wbwindow, wbinfo) { var res = orig; if (!this._no_rewrite) { //init_iframe_insert_obs(this); - if (this.tagName == "STYLE") { + if (this.tagName === "STYLE") { res = rewrite_style(orig); } else { res = rewrite_html(orig); } } orig_setter.call(this, res); - } + if (wbUsePresWorker && this.tagName === 'STYLE' && this.sheet != null) { + // got preserve all the things + 
WBPreserWorker.deferredSheetExtraction(this.sheet.rules); + } + }; var getter = function() { var res = orig_getter.call(this); @@ -2045,7 +2205,7 @@ var _WBWombat = function($wbwindow, wbinfo) { res = res.replace(wb_unrewrite_rx, ""); } return res; - } + }; def_prop(obj, prop, setter, rewrite_getter ? getter : orig_getter); } @@ -3464,6 +3624,7 @@ var _WBWombat = function($wbwindow, wbinfo) { initFontFaceOverride($wbwindow); // Worker override (experimental) + initPreserveWorker(); init_web_worker_override(); init_service_worker_override(); initSharedWorkerOverride(); @@ -3490,7 +3651,7 @@ var _WBWombat = function($wbwindow, wbinfo) { initInsertAdjacentElementOverride(); - // iframe.contentWindow and iframe.contentDocument overrides to + // iframe.contentWindow and iframe.contentDocument overrides to // ensure wombat is inited on the iframe $wbwindow! override_iframe_content_access("contentWindow"); override_iframe_content_access("contentDocument"); @@ -3619,6 +3780,10 @@ var _WBWombat = function($wbwindow, wbinfo) { return; } + if ($wbwindow.document.readyState === "complete" && wbUsePresWorker) { + WBPreserWorker.extractFromLocalDoc(); + } + if ($wbwindow != $wbwindow.__WB_replay_top) { return; } @@ -3643,12 +3808,12 @@ var _WBWombat = function($wbwindow, wbinfo) { "title": $wbwindow.document ? 
$wbwindow.document.title : "", "readyState": $wbwindow.document.readyState, "wb_type": "load" - } + }; send_top_message(message); } - if ($wbwindow.document.readyState == "complete") { + if ($wbwindow.document.readyState === "complete") { notify_top(); } else if ($wbwindow.addEventListener) { $wbwindow.document.addEventListener("readystatechange", notify_top); @@ -3728,6 +3893,13 @@ var _WBWombat = function($wbwindow, wbinfo) { // Fix .parent only if not embeddable, otherwise leave for accessing embedding window if (!wb_opts.embedded && (replay_top == $wbwindow)) { + if (wbUsePresWorker) { + $wbwindow.addEventListener("message", function(event) { + if (event.data && event.data.wb_type === 'pworker') { + WBPreserWorker.postMessage(event.data.msg); + } + }, false); + } $wbwindow.__WB_orig_parent = $wbwindow.parent; $wbwindow.parent = replay_top; } diff --git a/pywb/static/wombatPreservationWorker.js b/pywb/static/wombatPreservationWorker.js new file mode 100644 index 00000000..02858a37 --- /dev/null +++ b/pywb/static/wombatPreservationWorker.js @@ -0,0 +1,205 @@ +'use strict'; +// thanks wombat +var STYLE_REGEX = /(url\s*\(\s*[\\"']*)([^)'"]+)([\\"']*\s*\))/gi; +var IMPORT_REGEX = /(@import\s+[\\"']*)([^)'";]+)([\\"']*\s*;?)/gi; +var srcsetSplit = /\s*(\S*\s+[\d.]+[wx]),|(?:\s*,(?:\s+|(?=https?:)))/; +// the preserver instance for this worker +var preserver = null; + +function noop() {} + +if (typeof self.Promise === 'undefined') { + // not kewl we must polyfill Promise + self.Promise = function (executor) { + executor(noop, noop); + }; + self.Promise.prototype.then = function (cb) { + if (cb) cb(); + return this; + }; + self.Promise.prototype.catch = function () { + return this; + }; + self.Promise.all = function (values) { + return new Promise(noop); + }; +} + +if (typeof self.fetch === 'undefined') { + // not kewl we must polyfill fetch. 
+ self.fetch = function (url) { + return new Promise(function (resolve) { + var xhr = new XMLHttpRequest(); + xhr.open('GET', url); + xhr.send(); + resolve(); + }); + }; +} + +self.onmessage = function (event) { + var data = event.data; + switch (data.type) { + case 'values': + preserver.preserveMediaSrcset(data); + break; + } +}; + +function pMap(p) { + // mapping function to ensure each fetch promises catch has a no op cb + return p.catch(noop); +} + +function Preserver(prefix, mod) { + if (!(this instanceof Preserver)) { + return new Preserver(prefix, mod); + } + this.prefix = prefix; + this.mod = mod; + this.prefixMod = prefix + mod; + // relative url, WorkerLocation is set by owning document + this.relative = prefix.split(location.origin)[1]; + // schemeless url + this.schemeless = '/' + this.relative; + // local cache of URLs fetched, to reduce server load + this.seen = {}; + // counter used to know when to clear seen (count > 2500) + this.seenCount = 0; + // array of promises returned by fetch(URL) + this.fetches = []; + // array of URL to be fetched + this.queue = []; + // should we queue a URL or not + this.queuing = false; + this.urlExtractor = this.urlExtractor.bind(this); + this.fetchDone = this.fetchDone.bind(this); +} + +Preserver.prototype.fixupURL = function (url) { + // attempt to fix up the url and do our best to ensure we can get dat 200 OK! 
+ if (url.indexOf(this.prefixMod) === 0) { + return url; + } + if (url.indexOf(this.relative) === 0) { + return url.replace(this.relative, this.prefix); + } + if (url.indexOf(this.schemeless) === 0) { + return url.replace(this.schemeless, this.prefix); + } + if (url.indexOf(this.prefix) !== 0) { + return this.prefix + url; + } + return url; +}; + +Preserver.prototype.safeFetch = function (url) { + var fixedURL = this.fixupURL(url); + // check to see if we have seen this url before in order + // to lessen the load against the server content is preserved from + if (this.seen[url] != null) return; + this.seen[url] = true; + if (this.queuing) { + // we are currently waiting for a batch of fetches to complete + return this.queue.push(fixedURL); + } + // queue this urls fetch + this.fetches.push(fetch(fixedURL)); +}; + +Preserver.prototype.urlExtractor = function (match, n1, n2, n3, offset, string) { + // Same function as style_replacer in wombat.rewrite_style, n2 is our URL + this.safeFetch(n2); + return n1 + n2 + n3; +}; + +Preserver.prototype.fetchDone = function () { + // clear our fetches array in place + // https://www.ecma-international.org/ecma-262/9.0/index.html#sec-properties-of-array-instances-length + this.fetches.length = 0; + // indicate we no longer need to Q + this.queuing = false; + if (this.queue.length > 0) { + // we have a Q of some length drain it + this.drainQ(); + } else if (this.seenCount > 2500) { + // we seen 2500 URLs so lets free some memory as at this point + // we will probably see some more. GC it! 
+ this.seen = {}; + this.seenCount = 0; + } +}; + +Preserver.prototype.fetchAll = function () { + // if we are queuing or have no fetches this is a no op + if (this.queuing) return; + if (this.fetches.length === 0) return; + // we are about to fetch queue anything that comes our way + this.queuing = true; + // initiate fetches by turning the initial fetch promises + // into rejctionless promises and "await" all + Promise.all(this.fetches.map(pMap)) + .then(this.fetchDone) + .catch(this.fetchDone); +}; + +Preserver.prototype.drainQ = function () { + // clear our Q in place and fill our fetches array + while (this.queue.length > 0) { + this.fetches.push(fetch(this.queue.shift())); + } + // fetch all the things + this.fetchAll(); +}; + +Preserver.prototype.extractMedia = function (mediaRules) { + // this is a broken down rewrite_style + if (mediaRules == null) return; + for (var i = 0; i < mediaRules.length; i++) { + var rule = mediaRules[i]; + rule.replace(STYLE_REGEX, this.urlExtractor); + rule.replace(IMPORT_REGEX, this.urlExtractor); + } +}; + +Preserver.prototype.extractSrcset = function (srcsets) { + if (srcsets == null || srcsets.values == null) return; + var srcsetValues = srcsets.values; + // was srcsets from rewrite_srcset and if so no need to split + var presplit = srcsets.presplit; + for (var i = 0; i < srcsetValues.length; i++) { + var srcset = srcsetValues[i]; + if (presplit) { + // was rewrite_srcset so just ensure we just + // grab the URL not width/height key + this.safeFetch(srcset.split(' ')[0]); + } else { + // was from extract from local doc so we need to duplicate work + var values = srcset.split(srcsetSplit).filter(Boolean); + for (var j = 0; j < values.length; j++) { + var value = values[j].trim(); + if (value.length > 0) { + this.safeFetch(value.split(' ')[0]); + } + } + } + } +}; + +Preserver.prototype.preserveMediaSrcset = function (data) { + // we got a message and now we preserve! 
+ // these calls turn into no ops if they have no work + this.extractMedia(data.media); + this.extractSrcset(data.srcset); + this.fetchAll(); +}; + +// initialize ourselves from the query params :) +try { + var loc = new self.URL(location); + preserver = new Preserver(loc.searchParams.get('prefix'), loc.searchParams.get('mod')); +} catch (e) { + // likely we are in an older version of safari + var search = decodeURIComponent(location.search.split('?')[1]).split('&'); + preserver = new Preserver(search[0].substr(search[0].indexOf('=') + 1), search[1].substr(search[1].indexOf('=') + 1)); +}