1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-25 23:47:47 +01:00
pywb/pywb/static/autoFetchWorker.js
John Berlin d869f7ef48 auto-fetch: (#484)
- reworked both proxy and non-proxy mode backing workers to no longer fetch in burst mode but as sent, with a maximum of 20 fetches running at a time
 - added just-fetch to non-proxy mode backing worker
 - updated the auto fetch worker abstraction in non-proxy mode used by wombat to be exposed like in proxy mode and ensured that the value property of the srcset object is used when sending rewritten srcset values to the backing worker
  - combined the backing worker proxy & non-proxy mode into a single file
  - added rollup config for the backing auto fetch worker
2019-06-27 22:01:45 -07:00

372 lines
11 KiB
JavaScript

'use strict';
// Regular expressions borrowed from wombat.
// STYLE_REGEX matches CSS url(...) values and IMPORT_REGEX matches CSS @import
// targets; in both, the second capture group is the URL.
var STYLE_REGEX = /(url\s*\(\s*[\\"']*)([^)'"]+)([\\"']*\s*\))/gi;
var IMPORT_REGEX = /(@import\s*[\\"']*)([^)'";]+)([\\"']*\s*;?)/gi;
// separator used to split a srcset attribute value into its candidates
var srcsetSplit = /\s*(\S*\s+[\d.]+[wx]),|(?:\s*,(?:\s+|(?=https?:)))/;
// URLs starting with this prefix are never fetched
var DataURLPrefix = 'data:';
// upper bound on the number of fetches in flight at the same time
var MaxRunningFetches = 15;
// number of fetches currently in flight
var runningFetches = 0;
// array of URLs waiting for a free fetch slot
var queue = [];
// URLs that were already fetched or queued, keyed by URL
var seen = {};
// the URL used to resolve relative URLs found in the cssText of the media
// rule currently being processed (proxy mode only)
var currentResolver = null;
// Worker configuration. The have* flags record whether the environment
// provides Promise / fetch natively; the remaining fields are filled in later
// from the `init` query parameter, or proxyMode is switched on when the
// worker URL carries no `init` parameter.
var config = {};
config.havePromise = 'undefined' !== typeof self.Promise;
config.haveFetch = 'undefined' !== typeof self.fetch;
config.proxyMode = false;
config.mod = null;
config.prefix = null;
config.prefixMod = null;
config.relative = null;
config.rwRe = null;
if (!config.havePromise) {
  // No native Promise: install a minimal stand-in that is just enough for the
  // `fetch(url).then(...).catch(...)` chain in fetchURL to run without error.
  //
  // The stand-in never settles and never invokes the callbacks handed to
  // then/catch. Without a native Promise the XHR based fetch fallback is the
  // only fetch implementation, and it reports completion itself by calling
  // fetchDoneOrErrored once the request has finished. Invoking the `then`
  // callback here as well would count every fetch as finished twice: once
  // immediately (before the request completed) and once more on completion,
  // which drives runningFetches negative and disables the concurrency limit.
  self.Promise = function(executor) {
    // run the executor synchronously; resolve/reject are no-ops
    executor(noop, noop);
  };
  self.Promise.prototype.then = function(cb) {
    // intentionally does not call cb, see the note above
    return this;
  };
  self.Promise.prototype.catch = function() {
    return this;
  };
  self.Promise.all = function(values) {
    // returns a stand-in promise; `values` is not inspected
    return new Promise(noop);
  };
}
if (!config.haveFetch) {
  // No native fetch: provide an XMLHttpRequest backed replacement that issues
  // an asynchronous GET and resolves (with no value) once the request reaches
  // readyState 4.
  self.fetch = function(url) {
    function startRequest(resolve) {
      var request = new XMLHttpRequest();
      request.open('GET', url, true);
      request.onreadystatechange = function() {
        if (request.readyState !== 4) {
          return;
        }
        // when Promise is polyfilled too, completion is signalled directly
        if (!config.havePromise) {
          fetchDoneOrErrored();
        }
        resolve();
      };
      request.send();
    }
    return new Promise(startRequest);
  };
}
// Non-proxy mode is configured through an `init` query parameter on the
// worker URL holding a JSON object; when it is absent the worker runs in
// proxy mode.
if (location.search.indexOf('init') !== -1) {
  (function() {
    var init;
    if (typeof self.URL === 'function') {
      var loc = new self.URL(location.href);
      init = JSON.parse(loc.searchParams.get('init'));
    } else {
      // no URL constructor available: pull the JSON out of the query by hand
      var search = decodeURIComponent(location.search.split('?')[1]).split('&');
      init = JSON.parse(search[0].substr(search[0].indexOf('=') + 1));
      init.prefix = decodeURIComponent(init.prefix);
      // decode baseURI from its own value (it used to be overwritten with the
      // decoded prefix); left untouched when the init object carries none
      if (init.baseURI != null) {
        init.baseURI = decodeURIComponent(init.baseURI);
      }
    }
    config.prefix = init.prefix;
    config.mod = init.mod;
    config.prefixMod = init.prefix + init.mod;
    config.rwRe = new RegExp(init.rwRe, 'g');
    // the part of the prefix that follows this worker's origin
    config.relative = init.prefix.split(location.origin)[1];
    config.schemeless = '/' + config.relative;
  })();
} else {
  config.proxyMode = true;
}
// Message entry point: dispatches on the `type` field of the posted data.
// 'values' messages go to autoFetch, 'fetch-all' messages go to justFetch and
// every other type is ignored.
self.onmessage = function(event) {
  var message = event.data;
  var kind = message.type;
  if (kind === 'values') {
    autoFetch(message);
  } else if (kind === 'fetch-all') {
    justFetch(message);
  }
};
/**
 * Does nothing and returns undefined. Used as the placeholder resolve/reject
 * callback by the Promise stand-in.
 */
function noop() {
  return undefined;
}
/**
 * Completion callback for a fetch (success or failure): gives back the slot
 * the fetch occupied and then starts queued fetches via fetchFromQ.
 */
function fetchDoneOrErrored() {
  runningFetches--;
  fetchFromQ();
}
/**
 * Starts a fetch for the given URL.
 * The running-fetch counter is incremented before the request is issued and
 * fetchDoneOrErrored is attached as both the fulfilment and the rejection
 * handler of the returned promise chain.
 * @param {string} urlToBeFetched - the URL to request
 */
function fetchURL(urlToBeFetched) {
  runningFetches++;
  var pending = fetch(urlToBeFetched);
  var afterSuccess = pending.then(fetchDoneOrErrored);
  afterSuccess.catch(fetchDoneOrErrored);
}
/**
 * Fetches the URL right away when a fetch slot is free, otherwise appends it
 * to the queue. Empty values, data: URLs and URLs that were already handed to
 * this function are ignored.
 * @param {?string} urlToBeFetched - the URL to fetch
 */
function queueOrFetch(urlToBeFetched) {
  if (!urlToBeFetched || urlToBeFetched.indexOf(DataURLPrefix) === 0) {
    return;
  }
  // `seen` is a plain object, so a bare property lookup would also find
  // members inherited from Object.prototype and wrongly report URLs such as
  // "constructor" or "toString" as already seen. Only own keys count.
  if (Object.prototype.hasOwnProperty.call(seen, urlToBeFetched)) {
    return;
  }
  seen[urlToBeFetched] = true;
  if (runningFetches >= MaxRunningFetches) {
    // all slots busy, fetchFromQ picks it up once a fetch finishes
    queue.push(urlToBeFetched);
    return;
  }
  fetchURL(urlToBeFetched);
}
/**
 * Starts queued fetches, oldest first, for as long as the queue is not empty
 * and fewer than MaxRunningFetches fetches are running.
 */
function fetchFromQ() {
  while (runningFetches < MaxRunningFetches && queue.length > 0) {
    var nextURL = queue.shift();
    fetchURL(nextURL);
  }
}
/**
 * Resolves a URL against a base URL.
 * @param {string} url - the URL to resolve
 * @param {string} base - the URL to resolve against
 * @returns {?string} the resolved absolute URL, or null when the URL
 * constructor rejects the input
 */
function maybeResolveURL(url, base) {
  var resolved = null;
  try {
    resolved = new URL(url, base).href;
  } catch (error) {
    resolved = null;
  }
  return resolved;
}
/**
 * Resolves a URL against a resolver URL without ever throwing.
 * @param {string} url - the URL to resolve
 * @param {?string} resolver - the URL to resolve against
 * @returns {string} the resolved absolute URL; the unchanged input URL when
 * no resolver is given or when the URL constructor rejects the input
 */
function safeResolve(url, resolver) {
  if (!resolver) {
    return url;
  }
  try {
    return new URL(url, resolver).href;
  } catch (error) {
    return url;
  }
}
/**
 * Turns a URL that starts with the relative or the schemeless form of the
 * rewrite prefix into a full URL by swapping that leading part for
 * config.prefix. The relative form is checked first.
 * @param {string} url - the URL to inspect
 * @returns {?string} the fixed up URL, or null when the URL starts with
 * neither form
 */
function maybeFixUpRelSchemelessPrefix(url) {
  var leadingForms = [config.relative, config.schemeless];
  for (var i = 0; i < leadingForms.length; i++) {
    var leading = leadingForms[i];
    if (url.indexOf(leading) === 0) {
      return url.replace(leading, config.prefix);
    }
  }
  return null;
}
/**
 * Attempts to turn a URL into a full, rewritten URL that can be fetched.
 * Tried in order: the URL already matches the rewrite regex, the URL starts
 * with the relative/schemeless rewrite prefix, resolution against the tag's
 * src, resolution against the document's base URI. When all of them fail the
 * URL is appended to config.prefixMod.
 * @param {string} url - the URL to fix up
 * @param {Object} resolveOpts - options used for the fix up
 * @param {?string} [resolveOpts.mod] - rewrite modifier, 'mp_' when falsy
 * @param {?string} [resolveOpts.tagSrc] - src of the tag the URL came from
 * @param {?string} [resolveOpts.docBaseURI] - base URI of the document
 * @returns {string} the fixed up URL
 */
function maybeFixUpURL(url, resolveOpts) {
  // config.rwRe carries the `g` flag, which makes test() resume from the
  // lastIndex left behind by the previous successful match. Reset it so
  // every URL is checked from its first character, otherwise an already
  // rewritten URL can be reported as not rewritten and get prefixed twice.
  config.rwRe.lastIndex = 0;
  if (config.rwRe.test(url)) {
    return url;
  }
  var mod = resolveOpts.mod || 'mp_';
  // first check for / (relative) or // (schemeless) rewritten urls
  var maybeFixed = maybeFixUpRelSchemelessPrefix(url);
  if (maybeFixed != null) {
    return maybeFixed;
  }
  // resolve URL against tag src
  if (resolveOpts.tagSrc != null) {
    maybeFixed = maybeResolveURL(url, resolveOpts.tagSrc);
    if (maybeFixed != null) {
      return config.prefix + mod + '/' + maybeFixed;
    }
  }
  // last attempt: resolve against the originating document's base URI
  if (resolveOpts.docBaseURI) {
    maybeFixed = maybeResolveURL(url, resolveOpts.docBaseURI);
    if (maybeFixed != null) {
      return config.prefix + mod + '/' + maybeFixed;
    }
  }
  // nothing else to try, prepend the configured prefix and modifier
  return config.prefixMod + '/' + url;
}
/**
 * String.prototype.replace callback for STYLE_REGEX / IMPORT_REGEX, modelled
 * on style_replacer in wombat.rewrite_style. Hands the captured URL (n2) to
 * queueOrFetch and returns the three captures joined back together, which
 * leaves the matched text unchanged.
 */
function urlExtractor(match, n1, n2, n3, offset, string) {
  var capturedURL = n2;
  queueOrFetch(capturedURL);
  return n1 + capturedURL + n3;
}
/**
 * Proxy mode counterpart of urlExtractor, used as a String.prototype.replace
 * callback. The captured URL (n2) is passed through safeResolve together with
 * currentResolver (set by handleMediaProxyMode to the rule's `resolve` value)
 * and the result is handed to queueOrFetch. Returns the three captures joined
 * back together, which leaves the matched text unchanged.
 */
function urlExtractorProxyMode(match, n1, n2, n3, offset, string) {
  var resolvedURL = safeResolve(n2, currentResolver);
  queueOrFetch(resolvedURL);
  return n1 + n2 + n3;
}
/**
 * Non-proxy mode: runs every CSS text string through STYLE_REGEX and then
 * IMPORT_REGEX with urlExtractor as the replace callback, so that each URL
 * the expressions capture is queued or fetched. The replaced strings are
 * discarded.
 * @param {?Array<string>} mediaRules - CSS text strings, may be null or empty
 */
function handleMedia(mediaRules) {
  if (mediaRules == null) return;
  var index = 0;
  while (index < mediaRules.length) {
    var cssText = mediaRules[index];
    var afterStyleScan = cssText.replace(STYLE_REGEX, urlExtractor);
    afterStyleScan.replace(IMPORT_REGEX, urlExtractor);
    index += 1;
  }
}
/**
 * Proxy mode: for every rule, publishes the rule's `resolve` URL through the
 * module-level currentResolver and then runs the rule's cssText through
 * STYLE_REGEX and IMPORT_REGEX with urlExtractorProxyMode as the replace
 * callback. Sharing the resolver through a variable avoids creating a new
 * callback function per rule.
 * @param {?Array<{cssText: string, resolve: string}>} mediaRules - may be
 * null or empty
 */
function handleMediaProxyMode(mediaRules) {
  if (mediaRules == null) return;
  var index = 0;
  while (index < mediaRules.length) {
    var rule = mediaRules[index];
    currentResolver = rule.resolve;
    var afterStyleScan = rule.cssText.replace(STYLE_REGEX, urlExtractorProxyMode);
    afterStyleScan.replace(IMPORT_REGEX, urlExtractorProxyMode);
    index += 1;
  }
}
/**
 * Non-proxy mode: queues or fetches the URLs of src values after fixing them
 * up with maybeFixUpURL.
 * The message carries either a single `value` (with its `mod`) or a `values`
 * array of `{src, mod}` entries. A message with neither, and entries without
 * a src, are skipped instead of raising a TypeError that would abort the
 * processing of the remainder of the message.
 * @param {{value: ?string, mod: ?string, values: ?Array<{src: string, mod: string}>}} srcValues
 * @param {{docBaseURI: ?string}} context - supplies the document base URI
 * used for resolution
 */
function handleSrc(srcValues, context) {
  var resolveOpts = { docBaseURI: context.docBaseURI };
  if (srcValues.value) {
    resolveOpts.mod = srcValues.mod;
    return queueOrFetch(maybeFixUpURL(srcValues.value.trim(), resolveOpts));
  }
  var values = srcValues.values;
  if (values == null) return;
  var len = values.length;
  for (var i = 0; i < len; i++) {
    var value = values[i];
    if (value == null || !value.src) continue;
    resolveOpts.mod = value.mod;
    queueOrFetch(maybeFixUpURL(value.src, resolveOpts));
  }
}
/**
 * Proxy mode: every entry carries a `src` URL and a `resolve` URL; the src is
 * passed through safeResolve together with the resolve URL and the result is
 * queued or fetched.
 * @param {?Array<{src: string, resolve: string}>} srcValues - may be null or
 * empty
 */
function handleSrcProxyMode(srcValues) {
  if (srcValues == null) return;
  var index = 0;
  while (index < srcValues.length) {
    var entry = srcValues[index];
    var resolved = safeResolve(entry.src, entry.resolve);
    queueOrFetch(resolved);
    index += 1;
  }
}
/**
 * Splits a raw srcset attribute value into its candidates and queues or
 * fetches the URL of each candidate after fixing it up with maybeFixUpURL.
 * The URL is separated from its width/density descriptor on any run of
 * whitespace (space, tab, newline, ...), and candidates that contain only
 * whitespace are skipped so that no empty URL is fixed up and fetched.
 * @param {?string} ssV - the srcset attribute value
 * @param {Object} resolveOpts - options passed through to maybeFixUpURL
 */
function extractSrcSetNotPreSplit(ssV, resolveOpts) {
  if (!ssV) return;
  // the value comes straight from the document, so it still has to be split
  var srcsetValues = ssV.split(srcsetSplit);
  for (var i = 0; i < srcsetValues.length; i++) {
    // the split yields undefined / empty pieces between candidates
    if (!srcsetValues[i]) continue;
    // grab the URL, not the width/density descriptor
    var value = srcsetValues[i].trim().split(/\s+/)[0];
    if (!value) continue;
    queueOrFetch(maybeFixUpURL(value, resolveOpts));
  }
}
/**
 * Queues or fetches the URL of every entry of an already split srcset.
 * Each entry is trimmed and the URL is separated from its width/density
 * descriptor on any run of whitespace, so that entries with leading
 * whitespace or a tab/newline separator still yield their URL. Empty entries
 * are skipped.
 * @param {Array<string>} srcsets - srcset candidates ("<url> <descriptor>")
 */
function extractSrcset(srcsets) {
  for (var i = 0; i < srcsets.length; i++) {
    var candidate = srcsets[i];
    if (!candidate) continue;
    // grab the URL, not the width/density descriptor
    queueOrFetch(candidate.trim().split(/\s+/)[0]);
  }
}
/**
 * Non-proxy mode: queues or fetches the URLs of srcset values.
 * The message carries either
 *  - a single `value`: an array of already split srcset strings when
 *    `presplit` is set, otherwise a raw srcset string that still has to be
 *    split, or
 *  - a `values` array of `{srcset, mod, tagSrc}` entries.
 * A message with neither, and null entries, are skipped instead of raising a
 * TypeError.
 * @param {?Object} srcset - the srcset part of the message
 * @param {{docBaseURI: ?string}} context - supplies the document base URI
 * used for resolution
 */
function handleSrcset(srcset, context) {
  if (srcset == null) return;
  // option bag read by maybeFixUpURL (docBaseURI, mod, tagSrc)
  var resolveOpts = {
    docBaseURI: context.docBaseURI,
    mod: null,
    tagSrc: null
  };
  if (srcset.value) {
    resolveOpts.mod = srcset.mod;
    if (!srcset.presplit) {
      // extract URLs from the srcset string
      return extractSrcSetNotPreSplit(srcset.value, resolveOpts);
    }
    // we have an array of srcset URL strings
    return extractSrcset(srcset.value);
  }
  // we have an array of values, each with its own mod and tag src
  var values = srcset.values;
  if (values == null) return;
  var len = values.length;
  for (var i = 0; i < len; i++) {
    var ssv = values[i];
    if (ssv == null) continue;
    resolveOpts.mod = ssv.mod;
    resolveOpts.tagSrc = ssv.tagSrc;
    extractSrcSetNotPreSplit(ssv.srcset, resolveOpts);
  }
}
/**
 * Proxy mode: every entry carries the raw value of a srcset attribute and a
 * URL to resolve relative URLs against. Each srcset is split into its
 * candidates and the URL of every candidate is passed through safeResolve
 * and then queued or fetched.
 * The URL is separated from its width/density descriptor on any run of
 * whitespace (space, tab, newline, ...).
 * @param {?Array<{srcset: string, resolve: string}>} srcsets - may be null
 */
function handleSrcsetProxyMode(srcsets) {
  if (srcsets == null) return;
  var length = srcsets.length;
  var extractedSrcSet, srcsetValue, ssSplit, j;
  for (var i = 0; i < length; i++) {
    extractedSrcSet = srcsets[i];
    ssSplit = extractedSrcSet.srcset.split(srcsetSplit);
    for (j = 0; j < ssSplit.length; j++) {
      // the split yields undefined / empty pieces between candidates
      if (!ssSplit[j]) continue;
      srcsetValue = ssSplit[j].trim();
      if (!srcsetValue) continue;
      // grab the URL, not the width/density descriptor
      queueOrFetch(
        safeResolve(srcsetValue.split(/\s+/)[0], extractedSrcSet.resolve)
      );
    }
  }
}
/**
 * Handles a 'values' message: passes the media, src and srcset parts of the
 * message, when present, to the handler matching the current mode (proxy or
 * non-proxy). The non-proxy src/srcset handlers additionally receive the
 * message's context, or `{ docBaseURI: null }` when the message has none.
 * @param {Object} data - the message data
 */
function autoFetch(data) {
  var inProxyMode = config.proxyMode;
  if (data.media) {
    var mediaHandler = inProxyMode ? handleMediaProxyMode : handleMedia;
    mediaHandler(data.media);
  }
  if (data.src) {
    if (!inProxyMode) {
      handleSrc(data.src, data.context || { docBaseURI: null });
    } else {
      handleSrcProxyMode(data.src);
    }
  }
  if (data.srcset) {
    if (!inProxyMode) {
      handleSrcset(data.srcset, data.context || { docBaseURI: null });
    } else {
      handleSrcsetProxyMode(data.srcset);
    }
  }
}
/**
 * Handles a 'fetch-all' message: every entry of `data.values` is handed to
 * queueOrFetch as is. Does nothing when the message or its values are
 * null/undefined.
 * @param {?{values: ?Array<string>}} data - the message data
 */
function justFetch(data) {
  if (data == null) return;
  if (data.values == null) return;
  var index = 0;
  while (index < data.values.length) {
    queueOrFetch(data.values[index]);
    index += 1;
  }
}