1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Exposed AutoFetchWorker on window in proxy-mode (#389)

Added methods to AutoFetchWorker in proxy mode that allow external JS to initiate checks
Updated the actual proxy mode worker implementation to match the functionality added
This commit is contained in:
John Berlin 2018-12-13 21:48:16 -05:00 committed by Ilya Kreymer
parent 2c8d607b18
commit 9597a632c8
2 changed files with 158 additions and 119 deletions

View File

@ -8,6 +8,7 @@ var FullImgQDrainLen = 10;
var DefaultNumAvFetches = 5;
var FullAVQDrainLen = 5;
var DataURLPrefix = 'data:';
var FetchDelay = 1000;
// the autofetcher instance for this worker
var autofetcher = null;
@ -49,6 +50,9 @@ self.onmessage = function (event) {
case 'values':
autofetcher.autofetchMediaSrcset(data);
break;
case 'fetch-all':
autofetcher.justFetch(data);
break;
}
};
@ -74,9 +78,8 @@ function AutoFetcher() {
}
AutoFetcher.prototype.delay = function () {
// 2 second delay seem reasonable
return new Promise(function (resolve, reject) {
setTimeout(resolve, 2000);
setTimeout(resolve, FetchDelay);
});
};
@ -270,4 +273,13 @@ AutoFetcher.prototype.autofetchMediaSrcset = function (data) {
this.fetchAV();
};
AutoFetcher.prototype.justFetch = function (data) {
// we got a message containing only urls to be fetched
if (data == null || data.values == null) return;
for (var i = 0; i < data.values.length; ++i) {
this.queueNonAVURL(data.values[i]);
}
this.fetchImgs();
};
autofetcher = new AutoFetcher();

View File

@ -174,8 +174,8 @@ var _WBWombat = function ($wbwindow, wbinfo) {
if (!(this instanceof AutoFetchWorkerProxyMode)) {
return new AutoFetchWorkerProxyMode();
}
this.checkIntervalTime = 15000;
this.checkIntervalCB = this.checkIntervalCB.bind(this);
this.checkIntervalTime = 15000;
this.elemSelector = ['img', 'source', 'video', 'audio'].map(function (which) {
if (which === 'source') {
return ['picture > ', 'video > ', 'audio >'].map(function (parent) {
@ -185,21 +185,20 @@ var _WBWombat = function ($wbwindow, wbinfo) {
return which + '[srcset], ' + which + '[data-srcset], ' + which + '[data-src]';
}
}).join(', ');
// use our origins reference to the document in order for us to parse stylesheets :/
this.styleTag = document.createElement('style');
this.styleTag.id = '$wrStyleParser$';
document.documentElement.appendChild(this.styleTag);
if (isTop) {
// Cannot directly load our worker from the proxy origin into the current origin
// however we fetch it from proxy origin and can blob it into the current origin :)
var self = this;
var afwpm = this;
fetch(wbAutoFetchWorkerPrefix)
.then(function (res) {
return res.text().then(function (text) {
var blob = new Blob([text], {"type": "text/javascript"});
self.worker = new $wbwindow.Worker(URL.createObjectURL(blob));
// use our origins reference to the document in order for us to parse stylesheets :/
self.styleTag = document.createElement('style');
self.styleTag.id = '$wrStyleParser$';
document.documentElement.appendChild(self.styleTag);
self.startCheckingInterval();
var blob = new Blob([text], { "type": "text/javascript" });
afwpm.worker = new $wbwindow.Worker(URL.createObjectURL(blob));
afwpm.startCheckingInterval();
});
});
} else {
@ -211,26 +210,40 @@ var _WBWombat = function ($wbwindow, wbinfo) {
}
$wbwindow.top.postMessage(msg, '*');
},
"terminate": function () {
}
"terminate": function () {}
};
this.startCheckingInterval();
}
}
AutoFetchWorkerProxyMode.prototype.resumeCheckInterval = function () {
// if the checkInterval is null (it is not active) restart the check interval
if (this.checkInterval == null) {
this.checkInterval = setInterval(this.checkIntervalCB, this.checkIntervalTime);
}
};
AutoFetchWorkerProxyMode.prototype.pauseCheckInterval = function () {
// if the checkInterval is not null (it is active) clear the check interval
if (this.checkInterval != null) {
clearInterval(this.checkInterval);
this.checkInterval = null;
}
};
AutoFetchWorkerProxyMode.prototype.startCheckingInterval = function () {
// if document ready state is complete do first extraction and start check polling
// otherwise wait for document ready state to complete to extract and start check polling
var self = this;
var afwpm = this;
if ($wbwindow.document.readyState === "complete") {
this.extractFromLocalDoc();
setInterval(this.checkIntervalCB, this.checkIntervalTime);
this.checkInterval = setInterval(this.checkIntervalCB, this.checkIntervalTime);
} else {
var i = setInterval(function () {
if ($wbwindow.document.readyState === "complete") {
self.extractFromLocalDoc();
afwpm.extractFromLocalDoc();
clearInterval(i);
setInterval(self.checkIntervalCB, self.checkIntervalTime);
afwpm.checkInterval = setInterval(afwpm.checkIntervalCB, afwpm.checkIntervalTime);
}
}, 1000);
}
@ -245,147 +258,161 @@ var _WBWombat = function ($wbwindow, wbinfo) {
this.worker.terminate();
};
AutoFetchWorkerProxyMode.prototype.postMessage = function (msg, deferred) {
if (deferred) {
var self = this;
return Promise.resolve().then(function () {
self.worker.postMessage(msg);
});
}
AutoFetchWorkerProxyMode.prototype.justFetch = function (urls) {
this.worker.postMessage({ 'type': 'fetch-all', 'values': urls });
};
AutoFetchWorkerProxyMode.prototype.postMessage = function (msg) {
this.worker.postMessage(msg);
};
AutoFetchWorkerProxyMode.prototype.extractMediaRules = function (rules, href) {
// We are in proxy mode and must include a URL to resolve relative URLs in media rules
if (!rules) return [];
var rvlen = rules.length;
var text = [];
var rule;
for (var i = 0; i < rvlen; ++i) {
rule = rules[i];
if (rule.type === CSSRule.MEDIA_RULE) {
text.push({"cssText": rule.cssText, "resolve": href});
}
}
return text;
};
AutoFetchWorkerProxyMode.prototype.corsCSSFetch = function (href) {
// because this JS in proxy mode operates as it would on the live web
// the rules of CORS apply and we cannot rely on URLs being rewritten correctly
// fetch the cross origin css file and then parse it using a style tag to get the rules
var url = location.protocol + '//' + wb_info.proxy_magic + '/proxy-fetch/' + href;
var aaw = this;
return fetch(url).then(function (res) {
return res.text().then(function (text) {
aaw.styleTag.textContent = text;
var sheet = aaw.styleTag.sheet || {};
return aaw.extractMediaRules(sheet.cssRules || sheet.rules, href);
});
}).catch(function (error) {
return [];
});
};
AutoFetchWorkerProxyMode.prototype.shouldSkipSheet = function (sheet) {
// we skip extracting rules from sheets if they are from our parsing style or come from pywb
if (sheet.id === '$wrStyleParser$') return true;
return !!(sheet.href && sheet.href.indexOf(wb_info.proxy_magic) !== -1);
};
AutoFetchWorkerProxyMode.prototype.getImgAVElems = function () {
var elem, srcv, mod;
var results = { 'srcset': [], 'src': []} ;
var baseURI = $wbwindow.document.baseURI;
var elems = $wbwindow.document.querySelectorAll(this.elemSelector);
for (var i = 0; i < elems.length; i++) {
elem = elems[i];
// we want the original src value in order to resolve URLs in the worker when needed
srcv = elem.src ? elem.src : null;
// get the correct mod in order to inform the backing worker where the URL(s) are from
mod = elem.tagName === "SOURCE" ?
elem.parentElement.tagName === "PICTURE" ? 'im_' : 'oe_'
: elem.tagName === "IMG" ? 'im_' : 'oe_';
if (elem.srcset) {
results.srcset.push({ 'srcset': elem.srcset, 'resolve': srcv || baseURI, 'mod': mod });
}
if (elem.dataset.srcset) {
results.srcset.push({ 'srcset': elem.dataset.srcset, 'resolve': srcv || baseURI, 'mod': mod });
}
if (elem.dataset.src) {
results.src.push({'src': elem.dataset.src, 'resolve': srcv || baseURI, 'mod': mod});
}
if (elem.tagName === "SOURCE" && srcv) {
results.src.push({'src': srcv, 'resolve': baseURI, 'mod': mod});
AutoFetchWorkerProxyMode.prototype.validateSrcV = function (srcV) {
// returns null if the supplied value is not usable for resolving rel URLs
// otherwise returns the supplied value
if (!srcV || srcV.indexOf('data:') === 0 || srcV.indexOf('blob:') === 0) return null;
return srcV;
};
AutoFetchWorkerProxyMode.prototype.fetchCSSAndExtract = function (cssURL) {
// because this JS in proxy mode operates as it would on the live web
// the rules of CORS apply and we cannot rely on URLs being rewritten correctly
// fetch the cross origin css file and then parse it using a style tag to get the rules
var url = location.protocol + '//' + wb_info.proxy_magic + '/proxy-fetch/' + cssURL;
var afwpm = this;
return fetch(url).then(function (res) {
return res.text().then(function (text) {
afwpm.styleTag.textContent = text;
return afwpm.extractMediaRules(afwpm.styleTag.sheet, cssURL);
});
}).catch(function (error) {
return [];
});
};
AutoFetchWorkerProxyMode.prototype.extractMediaRules = function (sheet, baseURI) {
// We are in proxy mode and must include a URL to resolve relative URLs in media rules
var results = [];
if (!sheet) return results;
var rules = sheet.cssRules || sheet.rules;
if (!rules || rules.length === 0) return results;
var len = rules.length;
var resolve = sheet.href || baseURI;
for (var i = 0; i < len; ++i) {
var rule = rules[i];
if (rule.type === CSSRule.MEDIA_RULE) {
results.push({ "cssText": rule.cssText, "resolve": resolve });
}
}
return results;
};
AutoFetchWorkerProxyMode.prototype.extractFromLocalDoc = function () {
AutoFetchWorkerProxyMode.prototype.extractSrcSrcsetFrom = function (fromElem, baseURI) {
// retrieve the auto-fetched elements from the supplied dom node
var elems = fromElem.querySelectorAll(this.elemSelector);
var len = elems.length;
var msg = {'type': 'values', 'srcset': [], 'src': []};
for (var i = 0; i < len; i++) {
var elem = elems[i];
// we want the original src value in order to resolve URLs in the worker when needed
var srcv = this.validateSrcV(elem.src);
var resolve = srcv || baseURI;
// get the correct mod in order to inform the backing worker where the URL(s) are from
var mod = elem.tagName === "SOURCE" ?
elem.parentElement.tagName === "PICTURE" ? 'im_' : 'oe_'
: elem.tagName === "IMG" ? 'im_' : 'oe_';
if (elem.srcset) {
msg.srcset.push({'srcset': elem.srcset, 'resolve': resolve, 'mod': mod});
}
if (elem.dataset.srcset) {
msg.srcset.push({'srcset': elem.dataset.srcset, 'resolve': resolve, 'mod': mod});
}
if (elem.dataset.src) {
msg.src.push({'src': elem.dataset.src, 'resolve': resolve, 'mod': mod});
}
if (elem.tagName === "SOURCE" && srcv) {
msg.src.push({'src': srcv, 'resolve': baseURI, 'mod': mod});
}
}
// send what we have extracted, if anything, to the worker for processing
if (msg.srcset.length || msg.src.length) {
this.postMessage(msg);
}
};
AutoFetchWorkerProxyMode.prototype.checkStyleSheets = function (doc) {
var media = [];
var deferredMediaURLS = [];
var sheet;
var resolve;
// We must use the window reference passed to us to access this origins stylesheets
var styleSheets = $wbwindow.document.styleSheets;
for (var i = 0; i < styleSheets.length; i++) {
sheet = styleSheets[i];
var deferredMediaExtraction = [];
var styleSheets = doc.styleSheets;
var sheetLen = styleSheets.length;
for (var i = 0; i < sheetLen; i++) {
var sheet = styleSheets[i];
// if the sheet belongs to our parser node we must skip it
if (!this.shouldSkipSheet(sheet)) {
try {
// if no error is thrown due to cross origin sheet the urls then just add
// the resolved URLS if any to the media urls array
if (sheet.cssRules != null) {
resolve = sheet.href || $wbwindow.document.baseURI;
media = media.concat(this.extractMediaRules(sheet.cssRules, resolve));
if (sheet.cssRules || sheet.rules) {
var extracted = this.extractMediaRules(sheet, doc.baseURI);
if (extracted.length) {
media = media.concat(extracted);
}
} else if (sheet.href != null) {
// depending on the browser cross origin stylesheets will have their
// cssRules property null but href non-null
deferredMediaURLS.push(this.corsCSSFetch(sheet.href));
deferredMediaExtraction.push(this.fetchCSSAndExtract(sheet.href));
}
} catch (error) {
// the stylesheet is cross origin and we must re-fetch via PYWB to get the contents for checking
deferredMediaURLS.push(this.corsCSSFetch(sheet.href));
if (sheet.href != null) {
deferredMediaExtraction.push(this.fetchCSSAndExtract(sheet.href));
}
}
}
}
// We must use the window reference passed to us to access this origins elements with srcset attr
// like cssRule handling we must include a URL to resolve relative URLs by
var results = this.getImgAVElems();
var msg = { 'type': 'values' };
// send what we have extracted, if anything, to the worker for processing
if (media.length > 0) {
msg.media = media;
}
if (results.srcset) {
msg.srcset = results.srcset;
}
if (results.src) {
msg.src = results.src;
}
if (msg.media || msg.srcset || msg.src) {
this.postMessage(msg);
if (media.length) {
// send
this.postMessage({'type': 'values', 'media': media});
}
if (deferredMediaURLS.length > 0) {
if (deferredMediaExtraction.length) {
// wait for all our deferred fetching and extraction of cross origin
// stylesheets to complete and then send those values, if any, to the worker
var aaw = this;
Promise.all(deferredMediaURLS).then(function (values) {
var results = [];
while (values.length > 0) {
results = results.concat(values.shift());
}
if (results.length > 0) {
aaw.postMessage({'type': 'values', 'media': results});
var afwpm = this;
Promise.all(deferredMediaExtraction).then(function (results) {
if (results.length === 0) return;
var len = results.length;
var media = [];
for (var i = 0; i < len; ++i) {
media = media.concat(results[i]);
}
afwpm.postMessage({'type': 'values', 'media': media });
});
}
};
AutoFetchWorkerProxyMode.prototype.extractFromLocalDoc = function () {
// check for data-[src,srcset] and auto-fetched elems with srcset first
this.extractSrcSrcsetFrom($wbwindow.document, $wbwindow.document.baseURI);
// we must use the window reference passed to us to access this origins stylesheets
this.checkStyleSheets($wbwindow.document);
};
WBAutoFetchWorker = new AutoFetchWorkerProxyMode();
// expose AutoFetchWorkerProxyMode
Object.defineProperty(window, '$WBAutoFetchWorker$', {
'enumerable': false,
'value': WBAutoFetchWorker
});
if (isTop) {
$wbwindow.addEventListener("message", function (event) {
if (event.data && event.data.wb_type === 'aaworker') {