1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Advanced preservation of media query based style rules and complete preservation of srcset values to fix https://github.com/webrecorder/webrecorder/issues/64. (#359)

wombat.js:
- Finalized PreserveWorker that preserves srcset values and Media Query values
- Deferred extraction and preservation of the values to be preserved so that the UI thread is not clobbered
- Hooked into places where wombat rewrites the values we are interested in
wombatPreservationWorker.js:
- Updated handling of srcset extraction now that we are sending wombat srcset rewrites
- Added check to see if we have seen a URL to be fetched
- Added light polyfill of Promise and fetch if they are not defined in wombatPreservationWorker.js, for safari
wombat.spec.js
- Updated to include values necessary to work with PWorker changes.
This commit is contained in:
John Berlin 2018-08-20 16:12:43 -04:00 committed by Ilya Kreymer
parent 841687fcc0
commit b4d4be8a64
3 changed files with 420 additions and 35 deletions

View File

@ -127,6 +127,8 @@ describe('WombatJS', function () {
wbinfo = {
wombat_opts: {},
wombat_ts: '',
is_live: false,
top_url: ''
};
},
wombatScript: wombatScript,
@ -142,6 +144,8 @@ describe('WombatJS', function () {
wombat_opts: {},
prefix: window.location.origin,
wombat_ts: '',
is_live: false,
top_url: ''
};
},
wombatScript: wombatScript,
@ -179,6 +183,8 @@ describe('WombatJS', function () {
wombat_opts: {},
prefix: window.location.origin,
wombat_ts: '',
is_live: false,
top_url: ''
};
},
wombatScript: wombatScript,
@ -199,6 +205,8 @@ describe('WombatJS', function () {
initScript: function () {
wbinfo = {
wombat_opts: {},
is_live: false,
top_url: ''
};
},
wombatScript: wombatScript,

View File

@ -78,6 +78,9 @@ var _WBWombat = function($wbwindow, wbinfo) {
var wb_setAttribute = $wbwindow.Element.prototype.setAttribute;
var wb_getAttribute = $wbwindow.Element.prototype.getAttribute;
var wb_funToString = Function.prototype.toString;
var WBPreserWorker;
var wbSheetMediaQChecker;
var wbUsePresWorker = $wbwindow.Worker != null && wbinfo.is_live;
var wb_info;
@ -1326,6 +1329,131 @@ var _WBWombat = function($wbwindow, wbinfo) {
}
//============================================
// Creates the preservation worker facade (WBPreserWorker) and the load
// listener used for link[rel='stylesheet'] elements (wbSheetMediaQChecker).
// This is a no op unless wbUsePresWorker is set.
function initPreserveWorker() {
  if (!wbUsePresWorker) {
    return;
  }

  // Reading cssRules throws (SecurityError) for stylesheets the page is not
  // allowed to inspect, so such sheets are treated as having no rules
  // instead of aborting the whole extraction.
  function readRules(sheet) {
    try {
      return sheet.cssRules;
    } catch (e) {
      return null;
    }
  }

  // Appends the cssText of every CSSMediaRule found in rules to media.
  // A missing rule list is a no op.
  function collectMediaRules(rules, media) {
    if (rules == null) return;
    for (var j = 0; j < rules.length; ++j) {
      var rule = rules[j];
      if (rule instanceof CSSMediaRule) {
        // we are a media rule so get its text
        media.push(rule.cssText);
      }
    }
  }

  var Preserver = (function(Worker) {
    // Facade over the preservation worker. Only the replay top window owns
    // a real Worker; every other window forwards its messages to replay top.
    function PWorker(prefix, mod) {
      if (!(this instanceof PWorker)) {
        return new PWorker(prefix, mod);
      }
      if ($wbwindow === $wbwindow.__WB_replay_top) {
        // we are replay top and will own this worker,
        // the worker reads prefix and mod back from its URL
        var workerURL = wbinfo.static_prefix +
          'wombatPreservationWorker.js?prefix=' +
          encodeURIComponent(prefix) + '&mod=' +
          encodeURIComponent(mod);
        this.worker = new Worker(workerURL);
      } else {
        this.worker = null;
      }
    }

    // Extracts the media rules from rules at a later point in time and
    // sends them to the worker. A null or empty rule list is a no op.
    PWorker.prototype.deferredSheetExtraction = function(rules) {
      if (rules == null || rules.length === 0) return;
      var pworker = this;
      function extract() {
        var media = [];
        collectMediaRules(rules, media);
        if (media.length > 0) {
          // we have some media rules to preserve
          pworker.preserveMedia(media);
        }
      }
      if ($wbwindow.Promise) {
        // defer things until next time the Promise.resolve Qs are cleared
        $wbwindow.Promise.resolve().then(extract);
      } else {
        // no Promise in this window, still defer rather than throw
        $wbwindow.setTimeout(extract, 0);
      }
    };

    // Terminates the worker, a no op when not replay top.
    PWorker.prototype.terminate = function() {
      if ($wbwindow === $wbwindow.__WB_replay_top) {
        this.worker.terminate();
      }
    };

    // Delivers msg to the worker, directly when we are replay top and
    // otherwise wrapped in a 'pworker' message posted to replay top.
    PWorker.prototype.postMessage = function(msg) {
      if ($wbwindow === $wbwindow.__WB_replay_top) {
        // we are actually replay top so send directly to worker
        this.worker.postMessage(msg);
      } else {
        // send message to replay top
        $wbwindow.__WB_replay_top.__orig_postMessage({
          'wb_type': 'pworker', 'msg': msg,
        }, '*');
      }
    };

    // Sends already split srcset values (from rewrite_srcset) to the worker.
    PWorker.prototype.preserveSrcset = function(srcset) {
      this.postMessage({
        'type': 'values',
        'srcset': {'values': srcset, 'presplit': true},
      });
    };

    // Sends the cssText of CSSMediaRules to the worker.
    PWorker.prototype.preserveMedia = function(media) {
      this.postMessage({'type': 'values', 'media': media});
    };

    // Gathers the values to be preserved from the document's stylesheets
    // and from all elements with a srcset attribute, then sends them to
    // the worker in a single message.
    PWorker.prototype.extractFromLocalDoc = function() {
      var media = [];
      var srcset = [];
      var sheets = $wbwindow.document.styleSheets;
      var i = 0;
      for (; i < sheets.length; ++i) {
        // an unreadable sheet must not stop the remaining extraction
        collectMediaRules(readRules(sheets[i]), media);
      }
      var srcsetElems = $wbwindow.document.querySelectorAll('*[srcset]');
      for (i = 0; i < srcsetElems.length; i++) {
        var srcsetElem = srcsetElems[i];
        if (wb_getAttribute) {
          srcset.push(wb_getAttribute.call(srcsetElem, 'srcset'));
        } else {
          srcset.push(srcsetElem.getAttribute('srcset'));
        }
      }
      this.postMessage({
        'type': 'values',
        'media': media,
        'srcset': {'values': srcset, 'presplit': false},
      });
    };

    return PWorker;
  })($wbwindow.Worker);

  WBPreserWorker = new Preserver(wb_abs_prefix, wbinfo.mod);

  wbSheetMediaQChecker = function checkStyle () {
    // used only for link[rel='stylesheet'] so we remove our listener
    this.removeEventListener('load', wbSheetMediaQChecker);
    // check no op condition
    if (this.sheet == null) return;
    // defer extraction to be nice :)
    WBPreserWorker.deferredSheetExtraction(readRules(this.sheet));
  };
}
function rewriteWorker(workerUrl) {
var fetch = true;
var makeBlob = false;
@ -1521,7 +1649,10 @@ var _WBWombat = function($wbwindow, wbinfo) {
for (var i = 0; i < values.length; i++) {
values[i] = rewrite_url(values[i].trim());
}
if (wbUsePresWorker) {
// send post split values to preservation worker
WBPreserWorker.preserveSrcset(values);
}
return values.join(", ");
}
@ -1617,33 +1748,59 @@ var _WBWombat = function($wbwindow, wbinfo) {
}
var changed;
if (elem.tagName == "STYLE") {
var new_content = rewrite_style(elem.textContent);
if (elem.textContent != new_content) {
elem.textContent = new_content;
changed = true;
}
} else if (elem.tagName == "OBJECT") {
changed = rewrite_attr(elem, "data", true);
} else if (elem.tagName == "FORM") {
changed = rewrite_attr(elem, "action", true);
//} else if (elem.tagName == "INPUT") {
// changed = rewrite_attr(elem, "value", true);
} else if (elem.tagName == "IFRAME" || elem.tagName == "FRAME") {
changed = rewrite_frame_src(elem, "src");
} else if (elem.tagName == "SCRIPT") {
changed = rewrite_script(elem);
} else if (elem.tagName == "image") {
changed = rewrite_attr(elem, "xlink:href");
} else if (elem instanceof SVGElement && elem.hasAttribute('filter')) {
changed = rewrite_attr(elem, 'filter');
} else {
changed = rewrite_attr(elem, "src");
changed = rewrite_attr(elem, "srcset") || changed;
changed = rewrite_attr(elem, "href") || changed;
changed = rewrite_attr(elem, "style") || changed;
changed = rewrite_attr(elem, "poster") || changed;
// we use a switch now cause perf and complexity
switch (elem.tagName) {
case 'STYLE':
var new_content = rewrite_style(elem.textContent);
if (elem.textContent !== new_content) {
elem.textContent = new_content;
changed = true;
if (wbUsePresWorker && elem.sheet != null) {
// we have a stylesheet so lets be nice to UI thread
// and defer extraction
WBPreserWorker.deferredSheetExtraction(elem.sheet.cssRules);
}
}
break;
case 'LINK':
changed = rewrite_attr(elem, 'href');
if (wbUsePresWorker && elem.rel === 'stylesheet') {
// we can only check link[rel='stylesheet'] when it loads
elem.addEventListener('load', wbSheetMediaQChecker);
}
break;
case 'IMG':
changed = rewrite_attr(elem, 'src');
changed = rewrite_attr(elem, 'srcset') || changed;
changed = rewrite_attr(elem, 'style') || changed;
break;
case 'OBJECT':
changed = rewrite_attr(elem, "data", true);
break;
case 'FORM':
changed = rewrite_attr(elem, "action", true);
break;
case 'IFRAME':
case 'FRAME':
changed = rewrite_frame_src(elem, "src");
break;
case 'SCRIPT':
changed = rewrite_script(elem);
break;
case 'image':
changed = rewrite_attr(elem, "xlink:href");
break;
default:
if (elem instanceof SVGElement && elem.hasAttribute('filter')) {
changed = rewrite_attr(elem, 'filter');
} else {
changed = rewrite_attr(elem, 'src');
changed = rewrite_attr(elem, 'srcset') || changed;
changed = rewrite_attr(elem, 'href') || changed;
changed = rewrite_attr(elem, 'style') || changed;
changed = rewrite_attr(elem, 'poster') || changed;
}
break;
}
if (elem.getAttribute) {
@ -1657,7 +1814,6 @@ var _WBWombat = function($wbwindow, wbinfo) {
changed = true;
}
}
return changed;
}
@ -2030,14 +2186,18 @@ var _WBWombat = function($wbwindow, wbinfo) {
var res = orig;
if (!this._no_rewrite) {
//init_iframe_insert_obs(this);
if (this.tagName == "STYLE") {
if (this.tagName === "STYLE") {
res = rewrite_style(orig);
} else {
res = rewrite_html(orig);
}
}
orig_setter.call(this, res);
}
if (wbUsePresWorker && this.tagName === 'STYLE' && this.sheet != null) {
// got preserve all the things
WBPreserWorker.deferredSheetExtraction(this.sheet.rules);
}
};
var getter = function() {
var res = orig_getter.call(this);
@ -2045,7 +2205,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
res = res.replace(wb_unrewrite_rx, "");
}
return res;
}
};
def_prop(obj, prop, setter, rewrite_getter ? getter : orig_getter);
}
@ -3464,6 +3624,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
initFontFaceOverride($wbwindow);
// Worker override (experimental)
initPreserveWorker();
init_web_worker_override();
init_service_worker_override();
initSharedWorkerOverride();
@ -3490,7 +3651,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
initInsertAdjacentElementOverride();
// iframe.contentWindow and iframe.contentDocument overrides to
// iframe.contentWindow and iframe.contentDocument overrides to
// ensure wombat is inited on the iframe $wbwindow!
override_iframe_content_access("contentWindow");
override_iframe_content_access("contentDocument");
@ -3619,6 +3780,10 @@ var _WBWombat = function($wbwindow, wbinfo) {
return;
}
if ($wbwindow.document.readyState === "complete" && wbUsePresWorker) {
WBPreserWorker.extractFromLocalDoc();
}
if ($wbwindow != $wbwindow.__WB_replay_top) {
return;
}
@ -3643,12 +3808,12 @@ var _WBWombat = function($wbwindow, wbinfo) {
"title": $wbwindow.document ? $wbwindow.document.title : "",
"readyState": $wbwindow.document.readyState,
"wb_type": "load"
}
};
send_top_message(message);
}
if ($wbwindow.document.readyState == "complete") {
if ($wbwindow.document.readyState === "complete") {
notify_top();
} else if ($wbwindow.addEventListener) {
$wbwindow.document.addEventListener("readystatechange", notify_top);
@ -3728,6 +3893,13 @@ var _WBWombat = function($wbwindow, wbinfo) {
// Fix .parent only if not embeddable, otherwise leave for accessing embedding window
if (!wb_opts.embedded && (replay_top == $wbwindow)) {
if (wbUsePresWorker) {
$wbwindow.addEventListener("message", function(event) {
if (event.data && event.data.wb_type === 'pworker') {
WBPreserWorker.postMessage(event.data.msg);
}
}, false);
}
$wbwindow.__WB_orig_parent = $wbwindow.parent;
$wbwindow.parent = replay_top;
}

View File

@ -0,0 +1,205 @@
'use strict';
// The regular expressions below are borrowed from wombat (thanks wombat).
// Matches CSS url(...) values: group 1 is the opening `url(` plus any quote
// characters, group 2 is the URL itself and group 3 is the closing part.
var STYLE_REGEX = /(url\s*\(\s*[\\"']*)([^)'"]+)([\\"']*\s*\))/gi;
// Matches CSS @import statements with the same three group layout
// (opening, URL, closing).
var IMPORT_REGEX = /(@import\s+[\\"']*)([^)'";]+)([\\"']*\s*;?)/gi;
// Separator used to split a raw srcset attribute value into its
// individual "URL descriptor" candidates.
var srcsetSplit = /\s*(\S*\s+[\d.]+[wx]),|(?:\s*,(?:\s+|(?=https?:)))/;
// the preserver instance for this worker, assigned during initialization
// at the bottom of this file
var preserver = null;
// Shared do-nothing callback (used by the polyfills and to silence
// rejected fetch promises).
function noop() {}
if (typeof self.Promise === 'undefined') {
  // No native Promise in this worker: install a deliberately tiny stand-in.
  // It is NOT a spec compliant Promise, it only supports the call shapes
  // used in this file (constructor, then, catch and Promise.all).
  (function installLightPromise(scope) {
    // the executor is run right away with do-nothing resolve/reject
    var LightPromise = function (executor) {
      executor(noop, noop);
    };

    // callbacks are invoked immediately and without a value
    LightPromise.prototype.then = function (cb) {
      if (cb) {
        cb();
      }
      return this;
    };

    // rejections are never observed, so catch only keeps chaining alive
    LightPromise.prototype.catch = function () {
      return this;
    };

    // the supplied values are ignored, a fresh stand-in is returned
    LightPromise.all = function (values) {
      return new Promise(noop);
    };

    scope.Promise = LightPromise;
  })(self);
}
if (typeof self.fetch === 'undefined') {
  // No native fetch in this worker: polyfill it with a plain GET
  // XMLHttpRequest. The response is never inspected by this file, we only
  // need the request to be made so the server can preserve the resource.
  self.fetch = function (url) {
    return new Promise(function (resolve, reject) {
      var xhr = new XMLHttpRequest();
      xhr.open('GET', url);
      // Settle only once the request has actually finished so that the
      // batching done via Promise.all really waits for in-flight requests.
      xhr.onload = function () {
        resolve();
      };
      // Every failure mode must settle the promise too, otherwise a batch
      // would never complete and queued URLs would never be fetched.
      xhr.onerror = xhr.onabort = xhr.ontimeout = function () {
        reject(new TypeError('Network request failed'));
      };
      xhr.send();
    });
  };
}
// Entry point for messages posted to this worker. Only messages whose type
// is 'values' are acted upon, they are handed to the preserver instance.
self.onmessage = function (msgEvent) {
  var payload = msgEvent.data;
  if (payload.type === 'values') {
    preserver.preserveMediaSrcset(payload);
  }
};
// Mapping function handed to Array.prototype.map: attaches a no op
// rejection handler to a fetch promise and returns the resulting promise,
// so a failed fetch can not reject the batch it is part of.
function pMap(fetchPromise) {
  var rejectionless = fetchPromise.catch(noop);
  return rejectionless;
}
// Preserver: requests every URL found in CSS media rules and srcset values
// so that the server the content is preserved from sees (and can preserve)
// them. URLs are requested in batches: while a batch is in flight newly
// discovered URLs are queued and requested once the batch has settled.
//
// prefix - the replay prefix URLs are expected to start with
// mod    - the modifier appended to prefix for already rewritten URLs
//
// May be called with or without `new`.
function Preserver(prefix, mod) {
  if (!(this instanceof Preserver)) {
    return new Preserver(prefix, mod);
  }
  this.prefix = prefix;
  this.mod = mod;
  this.prefixMod = prefix + mod;
  // relative url, WorkerLocation is set by owning document
  // (undefined when prefix does not contain location.origin)
  this.relative = prefix.split(location.origin)[1];
  // schemeless url
  this.schemeless = '/' + this.relative;
  // local cache of URLs fetched, to reduce server load. Created without a
  // prototype so URLs such as "constructor" are not reported as seen.
  this.seen = Object.create(null);
  // number of entries in seen, used to know when to clear it (count > 2500)
  this.seenCount = 0;
  // array of promises returned by fetch(URL)
  this.fetches = [];
  // array of URL to be fetched
  this.queue = [];
  // should we queue a URL or not
  this.queuing = false;
  // both are used detached from the instance (as callbacks)
  this.urlExtractor = this.urlExtractor.bind(this);
  this.fetchDone = this.fetchDone.bind(this);
}

// Attempts to turn url into one that starts with the replay prefix.
// Returns url unchanged when it already starts with prefix + mod or prefix.
Preserver.prototype.fixupURL = function (url) {
  if (url.indexOf(this.prefixMod) === 0) {
    return url;
  }
  if (url.indexOf(this.relative) === 0) {
    return url.replace(this.relative, this.prefix);
  }
  if (url.indexOf(this.schemeless) === 0) {
    return url.replace(this.schemeless, this.prefix);
  }
  if (url.indexOf(this.prefix) !== 0) {
    return this.prefix + url;
  }
  return url;
};

// Requests url unless it has been seen before. While a batch is in flight
// the fixed up URL is queued instead (the new queue length is returned in
// that case, otherwise nothing is returned).
Preserver.prototype.safeFetch = function (url) {
  // nothing to request for empty values, and data: URLs carry their
  // content inline so prefixing and requesting them is pointless
  if (typeof url !== 'string' || url.length === 0) return;
  if (url.slice(0, 5).toLowerCase() === 'data:') return;
  // check to see if we have seen this url before in order
  // to lessen the load against the server content is preserved from
  if (this.seen[url] != null) return;
  this.seen[url] = true;
  // keep the counter in step with the cache so fetchDone can clear it
  this.seenCount += 1;
  var fixedURL = this.fixupURL(url);
  if (this.queuing) {
    // we are currently waiting for a batch of fetches to complete
    return this.queue.push(fixedURL);
  }
  // start this urls fetch, it is awaited by the next fetchAll
  this.fetches.push(fetch(fixedURL));
};

// String.prototype.replace callback for STYLE_REGEX and IMPORT_REGEX.
// Same shape as style_replacer in wombat.rewrite_style: n2 is the URL,
// the matched text is returned unchanged.
Preserver.prototype.urlExtractor = function (match, n1, n2, n3, offset, string) {
  this.safeFetch(n2);
  return n1 + n2 + n3;
};

// Called once the current batch of fetches has settled.
Preserver.prototype.fetchDone = function () {
  // clear our fetches array in place
  // https://www.ecma-international.org/ecma-262/9.0/index.html#sec-properties-of-array-instances-length
  this.fetches.length = 0;
  // indicate we no longer need to Q
  this.queuing = false;
  if (this.queue.length > 0) {
    // we have a Q of some length drain it
    this.drainQ();
  } else if (this.seenCount > 2500) {
    // we have seen more than 2500 URLs, free the cache as at this point
    // we will probably see some more
    this.seen = Object.create(null);
    this.seenCount = 0;
  }
};

// Starts waiting on the pending fetches as one batch. A no op when a batch
// is already in flight or when there is nothing to wait for.
Preserver.prototype.fetchAll = function () {
  if (this.queuing) return;
  if (this.fetches.length === 0) return;
  // from now on queue anything that comes our way
  this.queuing = true;
  // turn the fetch promises into rejectionless promises and "await" all
  Promise.all(this.fetches.map(pMap))
    .then(this.fetchDone)
    .catch(this.fetchDone);
};

// Starts a fetch for every queued URL, empties the queue in place and
// waits on the resulting batch.
Preserver.prototype.drainQ = function () {
  var pending = this.queue;
  for (var i = 0; i < pending.length; i++) {
    this.fetches.push(fetch(pending[i]));
  }
  pending.length = 0;
  // fetch all the things
  this.fetchAll();
};

// Requests every url(...) and @import URL found in the supplied array of
// media rule texts. This is a broken down rewrite_style: replace is only
// used to walk the matches, its result is discarded.
Preserver.prototype.extractMedia = function (mediaRules) {
  if (mediaRules == null) return;
  for (var i = 0; i < mediaRules.length; i++) {
    var rule = mediaRules[i];
    rule.replace(STYLE_REGEX, this.urlExtractor);
    rule.replace(IMPORT_REGEX, this.urlExtractor);
  }
};

// Requests the URLs of the supplied srcset values.
// srcsets.values   - array of srcset strings
// srcsets.presplit - true when each value is a single "URL descriptor"
//                    candidate (from rewrite_srcset), false when each value
//                    is a raw srcset attribute that still must be split
Preserver.prototype.extractSrcset = function (srcsets) {
  if (srcsets == null || srcsets.values == null) return;
  var srcsetValues = srcsets.values;
  var presplit = srcsets.presplit;
  for (var i = 0; i < srcsetValues.length; i++) {
    var srcset = srcsetValues[i];
    // a missing attribute value has nothing to extract
    if (typeof srcset !== 'string') continue;
    if (presplit) {
      // was rewrite_srcset so just ensure we just
      // grab the URL not width/height key
      this.safeFetch(srcset.split(' ')[0]);
    } else {
      // was from extract from local doc so we need to duplicate work
      var values = srcset.split(srcsetSplit).filter(Boolean);
      for (var j = 0; j < values.length; j++) {
        var value = values[j].trim();
        if (value.length > 0) {
          this.safeFetch(value.split(' ')[0]);
        }
      }
    }
  }
};

// Handles a 'values' message: extracts and requests the URLs of its media
// and srcset members (both are optional) and waits on the resulting batch.
Preserver.prototype.preserveMediaSrcset = function (data) {
  this.extractMedia(data.media);
  this.extractSrcset(data.srcset);
  this.fetchAll();
};
// initialize ourselves from the query params :)
// The owning document passes `prefix` and `mod` in this worker's URL, each
// encoded with encodeURIComponent.
try {
  var loc = new self.URL(location);
  preserver = new Preserver(loc.searchParams.get('prefix'), loc.searchParams.get('mod'));
} catch (e) {
  // likely we are in an older version of safari, parse the query by hand.
  // Split on '&' BEFORE decoding so an encoded '&' inside a value can not
  // break a parameter apart, and look parameters up by name rather than
  // by position.
  var search = (location.search.split('?')[1] || '').split('&');
  var params = {};
  for (var pIdx = 0; pIdx < search.length; pIdx++) {
    var eqIdx = search[pIdx].indexOf('=');
    if (eqIdx === -1) continue;
    params[decodeURIComponent(search[pIdx].slice(0, eqIdx))] =
      decodeURIComponent(search[pIdx].slice(eqIdx + 1));
  }
  preserver = new Preserver(params.prefix, params.mod);
}