mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
auto-fetch: (#484)
- reworked both the proxy and non-proxy mode backing workers so that they no longer fetch in burst mode but instead fetch URLs as they are sent, with a maximum of 20 fetches running at a time - added just-fetch to the non-proxy mode backing worker - updated the auto fetch worker abstraction used by wombat in non-proxy mode so that it is exposed the same way as in proxy mode, and ensured that the value property of the srcset object is used when sending rewritten srcset values to the backing worker - combined the proxy and non-proxy mode backing workers into a single file - added a rollup config for the backing auto fetch worker
This commit is contained in:
parent
193607eed8
commit
06513c2592
@ -1,20 +1,29 @@
|
||||
'use strict';
|
||||
// thanks wombat
|
||||
var STYLE_REGEX = /(url\s*\(\s*[\\"']*)([^)'"]+)([\\"']*\s*\))/gi;
|
||||
var IMPORT_REGEX = /(@import\s+[\\"']*)([^)'";]+)([\\"']*\s*;?)/gi;
|
||||
var IMPORT_REGEX = /(@import\s*[\\"']*)([^)'";]+)([\\"']*\s*;?)/gi;
|
||||
var srcsetSplit = /\s*(\S*\s+[\d.]+[wx]),|(?:\s*,(?:\s+|(?=https?:)))/;
|
||||
var DefaultNumImFetches = 30;
|
||||
var FullImgQDrainLen = 10;
|
||||
var DefaultNumAvFetches = 5;
|
||||
var FullAVQDrainLen = 5;
|
||||
var MaxRunningFetches = 15;
|
||||
var DataURLPrefix = 'data:';
|
||||
var seen = {};
|
||||
// array of URLs to be fetched
|
||||
var queue = [];
|
||||
var runningFetches = 0;
|
||||
// a URL to resolve relative URLs found in the cssText of CSSMedia rules.
|
||||
var currentResolver = null;
|
||||
|
||||
// the autofetcher instance for this worker
|
||||
var autofetcher = null;
|
||||
var config = {
|
||||
havePromise: typeof self.Promise !== 'undefined',
|
||||
haveFetch: typeof self.fetch !== 'undefined',
|
||||
proxyMode: false,
|
||||
mod: null,
|
||||
prefix: null,
|
||||
prefixMod: null,
|
||||
relative: null,
|
||||
rwRe: null
|
||||
};
|
||||
|
||||
function noop() {}
|
||||
|
||||
if (typeof self.Promise === 'undefined') {
|
||||
if (!config.havePromise) {
|
||||
// not kewl we must polyfill Promise
|
||||
self.Promise = function(executor) {
|
||||
executor(noop, noop);
|
||||
@ -31,157 +40,97 @@ if (typeof self.Promise === 'undefined') {
|
||||
};
|
||||
}
|
||||
|
||||
if (typeof self.fetch === 'undefined') {
|
||||
if (!config.haveFetch) {
|
||||
// not kewl we must polyfill fetch.
|
||||
self.fetch = function(url) {
|
||||
return new Promise(function(resolve) {
|
||||
var xhr = new XMLHttpRequest();
|
||||
xhr.open('GET', url);
|
||||
xhr.open('GET', url, true);
|
||||
xhr.onreadystatechange = function() {
|
||||
if (xhr.readyState === 4) {
|
||||
if (!config.havePromise) {
|
||||
fetchDoneOrErrored();
|
||||
}
|
||||
resolve();
|
||||
}
|
||||
};
|
||||
xhr.send();
|
||||
resolve();
|
||||
});
|
||||
};
|
||||
}
|
||||
|
||||
if (location.search.indexOf('init') !== -1) {
|
||||
(function() {
|
||||
var init;
|
||||
if (typeof self.URL === 'function') {
|
||||
var loc = new self.URL(location.href);
|
||||
init = JSON.parse(loc.searchParams.get('init'));
|
||||
} else {
|
||||
var search = decodeURIComponent(location.search.split('?')[1]).split('&');
|
||||
init = JSON.parse(search[0].substr(search[0].indexOf('=') + 1));
|
||||
init.prefix = decodeURIComponent(init.prefix);
|
||||
init.baseURI = decodeURIComponent(init.prefix);
|
||||
}
|
||||
config.prefix = init.prefix;
|
||||
config.mod = init.mod;
|
||||
config.prefixMod = init.prefix + init.mod;
|
||||
config.rwRe = new RegExp(init.rwRe, 'g');
|
||||
config.relative = init.prefix.split(location.origin)[1];
|
||||
config.schemeless = '/' + config.relative;
|
||||
})();
|
||||
} else {
|
||||
config.proxyMode = true;
|
||||
}
|
||||
|
||||
self.onmessage = function(event) {
|
||||
var data = event.data;
|
||||
switch (data.type) {
|
||||
case 'values':
|
||||
autofetcher.autoFetch(data);
|
||||
autoFetch(data);
|
||||
break;
|
||||
case 'fetch-all':
|
||||
justFetch(data);
|
||||
break;
|
||||
}
|
||||
};
|
||||
|
||||
function AutoFetcher(init) {
|
||||
if (!(this instanceof AutoFetcher)) {
|
||||
return new AutoFetcher(init);
|
||||
}
|
||||
this.prefix = init.prefix;
|
||||
this.mod = init.mod;
|
||||
this.prefixMod = init.prefix + init.mod;
|
||||
this.rwRe = new RegExp(init.rwRe);
|
||||
// relative url, WorkerLocation is set by owning document
|
||||
this.relative = init.prefix.split(location.origin)[1];
|
||||
// schemeless url
|
||||
this.schemeless = '/' + this.relative;
|
||||
// local cache of URLs fetched, to reduce server load
|
||||
this.seen = {};
|
||||
// array of URLs to be fetched
|
||||
this.queue = [];
|
||||
this.avQueue = [];
|
||||
// should we queue a URL or not
|
||||
this.queuing = false;
|
||||
this.queuingAV = false;
|
||||
this.urlExtractor = this.urlExtractor.bind(this);
|
||||
this.imgFetchDone = this.imgFetchDone.bind(this);
|
||||
this.avFetchDone = this.avFetchDone.bind(this);
|
||||
function noop() {}
|
||||
|
||||
function fetchDoneOrErrored() {
|
||||
runningFetches -= 1;
|
||||
fetchFromQ();
|
||||
}
|
||||
|
||||
AutoFetcher.prototype.delay = function() {
|
||||
// 2 second delay seem reasonable
|
||||
return new Promise(function(resolve, reject) {
|
||||
setTimeout(resolve, 2000);
|
||||
});
|
||||
};
|
||||
function fetchURL(urlToBeFetched) {
|
||||
runningFetches += 1;
|
||||
fetch(urlToBeFetched)
|
||||
.then(fetchDoneOrErrored)
|
||||
.catch(fetchDoneOrErrored);
|
||||
}
|
||||
|
||||
AutoFetcher.prototype.imgFetchDone = function() {
|
||||
if (this.queue.length > 0) {
|
||||
// we have a Q of some length drain it
|
||||
var autofetcher = this;
|
||||
this.delay().then(function() {
|
||||
autofetcher.queuing = false;
|
||||
autofetcher.fetchImgs();
|
||||
});
|
||||
} else {
|
||||
this.queuing = false;
|
||||
}
|
||||
};
|
||||
|
||||
AutoFetcher.prototype.avFetchDone = function() {
|
||||
if (this.avQueue.length > 0) {
|
||||
// we have a Q of some length drain it
|
||||
var autofetcher = this;
|
||||
this.delay().then(function() {
|
||||
autofetcher.queuingAV = false;
|
||||
autofetcher.fetchAV();
|
||||
});
|
||||
} else {
|
||||
this.queuingAV = false;
|
||||
}
|
||||
};
|
||||
|
||||
AutoFetcher.prototype.fetchAV = function() {
|
||||
if (this.queuingAV || this.avQueue.length === 0) {
|
||||
function queueOrFetch(urlToBeFetched) {
|
||||
if (
|
||||
!urlToBeFetched ||
|
||||
urlToBeFetched.indexOf(DataURLPrefix) === 0 ||
|
||||
seen[urlToBeFetched] != null
|
||||
) {
|
||||
return;
|
||||
}
|
||||
// the number of fetches is limited to a maximum of DefaultNumAvFetches + FullAVQDrainLen outstanding fetches
|
||||
// the baseline maximum number of fetches is DefaultNumAvFetches but if the size(avQueue) <= FullAVQDrainLen
|
||||
// we add them to the current batch. Because audio video resources might be big
|
||||
// we limit how many we fetch at a time drastically
|
||||
this.queuingAV = true;
|
||||
var runningFetchers = [];
|
||||
while (
|
||||
this.avQueue.length > 0 &&
|
||||
runningFetchers.length <= DefaultNumAvFetches
|
||||
) {
|
||||
runningFetchers.push(fetch(this.avQueue.shift()).catch(noop));
|
||||
}
|
||||
if (this.avQueue.length <= FullAVQDrainLen) {
|
||||
while (this.avQueue.length > 0) {
|
||||
runningFetchers.push(fetch(this.avQueue.shift()).catch(noop));
|
||||
}
|
||||
}
|
||||
Promise.all(runningFetchers)
|
||||
.then(this.avFetchDone)
|
||||
.catch(this.avFetchDone);
|
||||
};
|
||||
|
||||
AutoFetcher.prototype.fetchImgs = function() {
|
||||
if (this.queuing || this.queue.length === 0) {
|
||||
seen[urlToBeFetched] = true;
|
||||
if (runningFetches >= MaxRunningFetches) {
|
||||
queue.push(urlToBeFetched);
|
||||
return;
|
||||
}
|
||||
// the number of fetches is limited to a maximum of DefaultNumImFetches + FullImgQDrainLen outstanding fetches
|
||||
// the baseline maximum number of fetches is DefaultNumImFetches but if the size(queue) <= FullImgQDrainLen
|
||||
// we add them to the current batch
|
||||
this.queuing = true;
|
||||
var runningFetchers = [];
|
||||
while (
|
||||
this.queue.length > 0 &&
|
||||
runningFetchers.length <= DefaultNumImFetches
|
||||
) {
|
||||
runningFetchers.push(fetch(this.queue.shift()).catch(noop));
|
||||
fetchURL(urlToBeFetched);
|
||||
}
|
||||
|
||||
function fetchFromQ() {
|
||||
while (queue.length && runningFetches < MaxRunningFetches) {
|
||||
fetchURL(queue.shift());
|
||||
}
|
||||
if (this.queue.length <= FullImgQDrainLen) {
|
||||
while (this.queue.length > 0) {
|
||||
runningFetchers.push(fetch(this.queue.shift()).catch(noop));
|
||||
}
|
||||
}
|
||||
Promise.all(runningFetchers)
|
||||
.then(this.imgFetchDone)
|
||||
.catch(this.imgFetchDone);
|
||||
};
|
||||
}
|
||||
|
||||
AutoFetcher.prototype.queueNonAVURL = function(url) {
|
||||
// ensure we do not request data urls
|
||||
if (url.indexOf(DataURLPrefix) === 0) return;
|
||||
// check to see if we have seen this url before in order
|
||||
// to lessen the load against the server content is fetched from
|
||||
if (this.seen[url] != null) return;
|
||||
this.seen[url] = true;
|
||||
this.queue.push(url);
|
||||
};
|
||||
|
||||
AutoFetcher.prototype.queueAVURL = function(url) {
|
||||
// ensure we do not request data urls
|
||||
if (url.indexOf(DataURLPrefix) === 0) return;
|
||||
// check to see if we have seen this url before in order
|
||||
// to lessen the load against the server content is fetched from
|
||||
if (this.seen[url] != null) return;
|
||||
this.seen[url] = true;
|
||||
this.avQueue.push(url);
|
||||
};
|
||||
|
||||
AutoFetcher.prototype.maybeResolveURL = function(url, base) {
|
||||
function maybeResolveURL(url, base) {
|
||||
// given a url and base url returns a resolved full URL or
|
||||
// null if resolution was unsuccessful
|
||||
try {
|
||||
@ -190,99 +139,129 @@ AutoFetcher.prototype.maybeResolveURL = function(url, base) {
|
||||
} catch (e) {
|
||||
return null;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
AutoFetcher.prototype.maybeFixUpRelSchemelessPrefix = function(url) {
|
||||
function safeResolve(url, resolver) {
|
||||
// Guard against the exception thrown by the URL constructor if the URL or resolver is bad
|
||||
// if resolver is undefined/null then this function passes url through
|
||||
var resolvedURL = url;
|
||||
if (resolver) {
|
||||
try {
|
||||
var _url = new URL(url, resolver);
|
||||
return _url.href;
|
||||
} catch (e) {
|
||||
resolvedURL = url;
|
||||
}
|
||||
}
|
||||
return resolvedURL;
|
||||
}
|
||||
|
||||
function maybeFixUpRelSchemelessPrefix(url) {
|
||||
// attempt to ensure rewritten relative or schemeless URLs become full URLS!
|
||||
// otherwise returns null if this did not happen
|
||||
if (url.indexOf(this.relative) === 0) {
|
||||
return url.replace(this.relative, this.prefix);
|
||||
if (url.indexOf(config.relative) === 0) {
|
||||
return url.replace(config.relative, config.prefix);
|
||||
}
|
||||
if (url.indexOf(this.schemeless) === 0) {
|
||||
return url.replace(this.schemeless, this.prefix);
|
||||
if (url.indexOf(config.schemeless) === 0) {
|
||||
return url.replace(config.schemeless, config.prefix);
|
||||
}
|
||||
return null;
|
||||
};
|
||||
}
|
||||
|
||||
AutoFetcher.prototype.maybeFixUpURL = function(url, resolveOpts) {
|
||||
function maybeFixUpURL(url, resolveOpts) {
|
||||
// attempt to fix up the url and do our best to ensure we can get dat 200 OK!
|
||||
if (this.rwRe.test(url)) {
|
||||
if (config.rwRe.test(url)) {
|
||||
return url;
|
||||
}
|
||||
var mod = resolveOpts.mod || 'mp_';
|
||||
// first check for / (relative) or // (schemeless) rewritten urls
|
||||
var maybeFixed = this.maybeFixUpRelSchemelessPrefix(url);
|
||||
var maybeFixed = maybeFixUpRelSchemelessPrefix(url);
|
||||
if (maybeFixed != null) {
|
||||
return maybeFixed;
|
||||
}
|
||||
// resolve URL against tag src
|
||||
if (resolveOpts.tagSrc != null) {
|
||||
maybeFixed = this.maybeResolveURL(url, resolveOpts.tagSrc);
|
||||
maybeFixed = maybeResolveURL(url, resolveOpts.tagSrc);
|
||||
if (maybeFixed != null) {
|
||||
return this.prefix + mod + '/' + maybeFixed;
|
||||
return config.prefix + mod + '/' + maybeFixed;
|
||||
}
|
||||
}
|
||||
// finally last attempt resolve the originating documents base URI
|
||||
if (resolveOpts.docBaseURI) {
|
||||
maybeFixed = this.maybeResolveURL(url, resolveOpts.docBaseURI);
|
||||
maybeFixed = maybeResolveURL(url, resolveOpts.docBaseURI);
|
||||
if (maybeFixed != null) {
|
||||
return this.prefix + mod + '/' + maybeFixed;
|
||||
return config.prefix + mod + '/' + maybeFixed;
|
||||
}
|
||||
}
|
||||
// not much to do now.....
|
||||
return this.prefixMod + '/' + url;
|
||||
};
|
||||
return config.prefixMod + '/' + url;
|
||||
}
|
||||
|
||||
AutoFetcher.prototype.urlExtractor = function(
|
||||
match,
|
||||
n1,
|
||||
n2,
|
||||
n3,
|
||||
offset,
|
||||
string
|
||||
) {
|
||||
function urlExtractor(match, n1, n2, n3, offset, string) {
|
||||
// Same function as style_replacer in wombat.rewrite_style, n2 is our URL
|
||||
this.queueNonAVURL(n2);
|
||||
queueOrFetch(n2);
|
||||
return n1 + n2 + n3;
|
||||
};
|
||||
}
|
||||
|
||||
AutoFetcher.prototype.handleMedia = function(mediaRules) {
|
||||
function urlExtractorProxyMode(match, n1, n2, n3, offset, string) {
|
||||
// Replacer callback used with STYLE_REGEX / IMPORT_REGEX in proxy mode (same role as style_replacer in wombat.rewrite_style), n2 is our URL
|
||||
// the module-level currentResolver (assigned by handleMediaProxyMode) is the URL which the browser would normally
|
||||
// resolve relative urls with (URL of the stylesheet); safeResolve performs the resolution in an exceptionless manner
|
||||
// (when resolution fails, or no resolver is set, n2 is handed to queueOrFetch unchanged); the matched text is returned as is
|
||||
queueOrFetch(safeResolve(n2, currentResolver));
|
||||
return n1 + n2 + n3;
|
||||
}
|
||||
|
||||
function handleMedia(mediaRules) {
|
||||
// this is a broken down rewrite_style
|
||||
if (mediaRules == null || mediaRules.length === 0) return;
|
||||
// var rules = mediaRules.values;
|
||||
for (var i = 0; i < mediaRules.length; i++) {
|
||||
mediaRules[i]
|
||||
.replace(STYLE_REGEX, this.urlExtractor)
|
||||
.replace(IMPORT_REGEX, this.urlExtractor);
|
||||
.replace(STYLE_REGEX, urlExtractor)
|
||||
.replace(IMPORT_REGEX, urlExtractor);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
AutoFetcher.prototype.handleSrc = function(srcValues, context) {
|
||||
function handleMediaProxyMode(mediaRules) {
|
||||
// this is a broken down rewrite_style
|
||||
if (mediaRules == null || mediaRules.length === 0) return;
|
||||
for (var i = 0; i < mediaRules.length; i++) {
|
||||
// set currentResolver to the value of this stylesheets URL, done to ensure we do not have to
|
||||
// create functions on each loop iteration because we potentially create a new `URL` object
|
||||
// twice per iteration
|
||||
currentResolver = mediaRules[i].resolve;
|
||||
mediaRules[i].cssText
|
||||
.replace(STYLE_REGEX, urlExtractorProxyMode)
|
||||
.replace(IMPORT_REGEX, urlExtractorProxyMode);
|
||||
}
|
||||
}
|
||||
|
||||
function handleSrc(srcValues, context) {
|
||||
var resolveOpts = { docBaseURI: context.docBaseURI };
|
||||
if (srcValues.value) {
|
||||
resolveOpts.mod = srcValues.mod;
|
||||
if (resolveOpts.mod === 1) {
|
||||
return this.queueNonAVURL(
|
||||
this.maybeFixUpURL(srcValues.value.trim(), resolveOpts)
|
||||
);
|
||||
}
|
||||
return this.queueAVURL(
|
||||
this.maybeFixUpURL(srcValues.value.trim(), resolveOpts)
|
||||
);
|
||||
return queueOrFetch(maybeFixUpURL(srcValues.value.trim(), resolveOpts));
|
||||
}
|
||||
var len = srcValues.values.length;
|
||||
for (var i = 0; i < len; i++) {
|
||||
var value = srcValues.values[i];
|
||||
resolveOpts.mod = value.mod;
|
||||
if (resolveOpts.mod === 'im_') {
|
||||
this.queueNonAVURL(this.maybeFixUpURL(value.src, resolveOpts));
|
||||
} else {
|
||||
this.queueAVURL(this.maybeFixUpURL(value.src, resolveOpts));
|
||||
}
|
||||
queueOrFetch(maybeFixUpURL(value.src, resolveOpts));
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
AutoFetcher.prototype.extractSrcSetNotPreSplit = function(ssV, resolveOpts) {
|
||||
function handleSrcProxyMode(srcValues) {
|
||||
// proxy mode: each entry of srcValues carries a src value (srcVal.src) and
|
||||
// a URL to resolve it against (srcVal.resolve). Every src is resolved with safeResolve and handed to queueOrFetch
|
||||
if (srcValues == null || srcValues.length === 0) return;
|
||||
var srcVal;
|
||||
for (var i = 0; i < srcValues.length; i++) {
|
||||
srcVal = srcValues[i];
|
||||
queueOrFetch(safeResolve(srcVal.src, srcVal.resolve));
|
||||
}
|
||||
}
|
||||
|
||||
function extractSrcSetNotPreSplit(ssV, resolveOpts) {
|
||||
if (!ssV) return;
|
||||
// was from extract from local doc so we need to duplicate work
|
||||
var srcsetValues = ssV.split(srcsetSplit);
|
||||
@ -290,41 +269,38 @@ AutoFetcher.prototype.extractSrcSetNotPreSplit = function(ssV, resolveOpts) {
|
||||
// grab the URL not width/height key
|
||||
if (srcsetValues[i]) {
|
||||
var value = srcsetValues[i].trim().split(' ')[0];
|
||||
var maybeResolvedURL = this.maybeFixUpURL(value.trim(), resolveOpts);
|
||||
if (resolveOpts.mod === 'im_') {
|
||||
this.queueNonAVURL(maybeResolvedURL);
|
||||
} else {
|
||||
this.queueAVURL(maybeResolvedURL);
|
||||
}
|
||||
var maybeResolvedURL = maybeFixUpURL(value.trim(), resolveOpts);
|
||||
queueOrFetch(maybeResolvedURL);
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
AutoFetcher.prototype.extractSrcset = function(srcsets, context) {
|
||||
function extractSrcset(srcsets) {
|
||||
// was rewrite_srcset and only need to q
|
||||
for (var i = 0; i < srcsets.length; i++) {
|
||||
// grab the URL not width/height key
|
||||
var url = srcsets[i].split(' ')[0];
|
||||
if (context.mod === 'im_') {
|
||||
this.queueNonAVURL(url);
|
||||
} else {
|
||||
this.queueAVURL(url);
|
||||
}
|
||||
queueOrFetch(url);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
AutoFetcher.prototype.handleSrcset = function(srcset, context) {
|
||||
var resolveOpts = { docBaseURI: context.docBaseURI };
|
||||
function handleSrcset(srcset, context) {
|
||||
if (srcset == null) return;
|
||||
var resolveOpts = {
|
||||
docBaseURI: context.docBaseURI,
|
||||
mode: null,
|
||||
tagSrc: null
|
||||
};
|
||||
if (srcset.value) {
|
||||
// we have a single value, this srcset came from either
|
||||
// preserveDataSrcset (not presplit) preserveSrcset (presplit)
|
||||
resolveOpts.mod = srcset.mod;
|
||||
if (!srcset.presplit) {
|
||||
// extract URLs from the srcset string
|
||||
return this.extractSrcSetNotPreSplit(srcset.value, resolveOpts);
|
||||
return extractSrcSetNotPreSplit(srcset.value, resolveOpts);
|
||||
}
|
||||
// we have an array of srcset URL strings
|
||||
return this.extractSrcset(srcset.value, resolveOpts);
|
||||
return extractSrcset(srcset.value);
|
||||
}
|
||||
// we have an array of values, these srcsets came from extractFromLocalDoc
|
||||
var len = srcset.values.length;
|
||||
@ -332,38 +308,64 @@ AutoFetcher.prototype.handleSrcset = function(srcset, context) {
|
||||
var ssv = srcset.values[i];
|
||||
resolveOpts.mod = ssv.mod;
|
||||
resolveOpts.tagSrc = ssv.tagSrc;
|
||||
this.extractSrcSetNotPreSplit(ssv.srcset, resolveOpts);
|
||||
extractSrcSetNotPreSplit(ssv.srcset, resolveOpts);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
AutoFetcher.prototype.autoFetch = function(data) {
|
||||
function handleSrcsetProxyMode(srcsets) {
|
||||
// preservation worker in proxy mode sends us the value of the srcset attribute of an element
|
||||
// and a URL to correctly resolve relative URLS. Thus we must recreate rewrite_srcset logic here
|
||||
if (srcsets == null) return;
|
||||
var length = srcsets.length;
|
||||
var extractedSrcSet, srcsetValue, ssSplit, j;
|
||||
for (var i = 0; i < length; i++) {
|
||||
extractedSrcSet = srcsets[i];
|
||||
ssSplit = extractedSrcSet.srcset.split(srcsetSplit);
|
||||
for (j = 0; j < ssSplit.length; j++) {
|
||||
if (ssSplit[j]) {
|
||||
srcsetValue = ssSplit[j].trim();
|
||||
if (srcsetValue) {
|
||||
queueOrFetch(
|
||||
safeResolve(srcsetValue.split(' ')[0], extractedSrcSet.resolve)
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function autoFetch(data) {
|
||||
// we got a message and now we autofetch!
|
||||
// these calls turn into no ops if they have no work
|
||||
if (data.media) {
|
||||
this.handleMedia(data.media);
|
||||
if (config.proxyMode) {
|
||||
handleMediaProxyMode(data.media);
|
||||
} else {
|
||||
handleMedia(data.media);
|
||||
}
|
||||
}
|
||||
|
||||
if (data.src) {
|
||||
this.handleSrc(data.src, data.context || {});
|
||||
if (config.proxyMode) {
|
||||
handleSrcProxyMode(data.src);
|
||||
} else {
|
||||
handleSrc(data.src, data.context || { docBaseURI: null });
|
||||
}
|
||||
}
|
||||
|
||||
if (data.srcset) {
|
||||
this.handleSrcset(data.srcset, data.context || {});
|
||||
if (config.proxyMode) {
|
||||
handleSrcsetProxyMode(data.srcset);
|
||||
} else {
|
||||
handleSrcset(data.srcset, data.context || { docBaseURI: null });
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function justFetch(data) {
|
||||
// we got a message containing only urls to be fetched
|
||||
if (data == null || data.values == null) return;
|
||||
for (var i = 0; i < data.values.length; ++i) {
|
||||
queueOrFetch(data.values[i]);
|
||||
}
|
||||
|
||||
this.fetchImgs();
|
||||
this.fetchAV();
|
||||
};
|
||||
|
||||
// initialize ourselves from the query params :)
|
||||
try {
|
||||
var loc = new self.URL(location.href);
|
||||
autofetcher = new AutoFetcher(JSON.parse(loc.searchParams.get('init')));
|
||||
} catch (e) {
|
||||
// likely we are in an older version of safari
|
||||
var search = decodeURIComponent(location.search.split('?')[1]).split('&');
|
||||
var init = JSON.parse(search[0].substr(search[0].indexOf('=') + 1));
|
||||
init.prefix = decodeURIComponent(init.prefix);
|
||||
init.baseURI = decodeURIComponent(init.baseURI);
|
||||
autofetcher = new AutoFetcher(init);
|
||||
}
|
||||
|
@ -1,303 +0,0 @@
|
||||
'use strict';
|
||||
// thanks wombat
|
||||
var STYLE_REGEX = /(url\s*\(\s*[\\"']*)([^)'"]+)([\\"']*\s*\))/gi;
|
||||
var IMPORT_REGEX = /(@import\s+[\\"']*)([^)'";]+)([\\"']*\s*;?)/gi;
|
||||
var srcsetSplit = /\s*(\S*\s+[\d.]+[wx]),|(?:\s*,(?:\s+|(?=https?:)))/;
|
||||
var DefaultNumImFetches = 30;
|
||||
var FullImgQDrainLen = 10;
|
||||
var DefaultNumAvFetches = 5;
|
||||
var FullAVQDrainLen = 5;
|
||||
var DataURLPrefix = 'data:';
|
||||
var FetchDelay = 1000;
|
||||
// the autofetcher instance for this worker
|
||||
var autofetcher = null;
|
||||
|
||||
function noop() {}
|
||||
|
||||
if (typeof self.Promise === 'undefined') {
|
||||
// not kewl we must polyfill Promise
|
||||
self.Promise = function(executor) {
|
||||
executor(noop, noop);
|
||||
};
|
||||
self.Promise.prototype.then = function(cb) {
|
||||
if (cb) cb();
|
||||
return this;
|
||||
};
|
||||
self.Promise.prototype.catch = function() {
|
||||
return this;
|
||||
};
|
||||
self.Promise.all = function(values) {
|
||||
return new Promise(noop);
|
||||
};
|
||||
}
|
||||
|
||||
if (typeof self.fetch === 'undefined') {
|
||||
// not kewl we must polyfill fetch.
|
||||
self.fetch = function(url) {
|
||||
return new Promise(function(resolve) {
|
||||
var xhr = new XMLHttpRequest();
|
||||
xhr.open('GET', url);
|
||||
xhr.send();
|
||||
resolve();
|
||||
});
|
||||
};
|
||||
}
|
||||
|
||||
self.onmessage = function(event) {
|
||||
var data = event.data;
|
||||
switch (data.type) {
|
||||
case 'values':
|
||||
autofetcher.autofetchMediaSrcset(data);
|
||||
break;
|
||||
case 'fetch-all':
|
||||
autofetcher.justFetch(data);
|
||||
break;
|
||||
}
|
||||
};
|
||||
|
||||
function AutoFetcher() {
|
||||
if (!(this instanceof AutoFetcher)) {
|
||||
return new AutoFetcher();
|
||||
}
|
||||
// local cache of URLs fetched, to reduce server load
|
||||
this.seen = {};
|
||||
// array of URLs to be fetched
|
||||
this.queue = [];
|
||||
this.avQueue = [];
|
||||
// should we queue a URL or not
|
||||
this.queuing = false;
|
||||
// a URL to resolve relative URLs found in the cssText of CSSMedia rules.
|
||||
this.currentResolver = null;
|
||||
// should we queue a URL or not
|
||||
this.queuing = false;
|
||||
this.queuingAV = false;
|
||||
this.urlExtractor = this.urlExtractor.bind(this);
|
||||
this.imgFetchDone = this.imgFetchDone.bind(this);
|
||||
this.avFetchDone = this.avFetchDone.bind(this);
|
||||
}
|
||||
|
||||
AutoFetcher.prototype.delay = function() {
|
||||
return new Promise(function(resolve, reject) {
|
||||
setTimeout(resolve, FetchDelay);
|
||||
});
|
||||
};
|
||||
|
||||
AutoFetcher.prototype.imgFetchDone = function() {
|
||||
if (this.queue.length > 0) {
|
||||
// we have a Q of some length drain it
|
||||
var autofetcher = this;
|
||||
this.delay().then(function() {
|
||||
autofetcher.queuing = false;
|
||||
autofetcher.fetchImgs();
|
||||
});
|
||||
} else {
|
||||
this.queuing = false;
|
||||
}
|
||||
};
|
||||
|
||||
AutoFetcher.prototype.avFetchDone = function() {
|
||||
if (this.avQueue.length > 0) {
|
||||
// we have a Q of some length drain it
|
||||
var autofetcher = this;
|
||||
this.delay().then(function() {
|
||||
autofetcher.queuingAV = false;
|
||||
autofetcher.fetchAV();
|
||||
});
|
||||
} else {
|
||||
this.queuingAV = false;
|
||||
}
|
||||
};
|
||||
|
||||
/**
 * Starts the next batch of audio/video fetches, unless a batch is already
 * in flight (queuingAV) or avQueue is empty.
 *
 * Up to DefaultNumAvFetches URLs are taken from the front of avQueue. If what
 * is left afterwards is no longer than FullAVQDrainLen, those URLs join the
 * same batch, so a batch never exceeds DefaultNumAvFetches + FullAVQDrainLen
 * requests. Audio/video resources might be big, hence the small limits.
 * avFetchDone is chained onto Promise.all of the batch.
 */
AutoFetcher.prototype.fetchAV = function() {
  if (this.queuingAV || this.avQueue.length === 0) {
    return;
  }
  this.queuingAV = true;
  var runningFetchers = [];
  // strict `<` keeps the baseline at exactly DefaultNumAvFetches; `<=` started
  // one extra fetch and let a batch exceed the documented maximum
  while (
    this.avQueue.length > 0 &&
    runningFetchers.length < DefaultNumAvFetches
  ) {
    // individual failures are swallowed so one bad URL cannot reject the batch
    runningFetchers.push(fetch(this.avQueue.shift()).catch(noop));
  }
  if (this.avQueue.length <= FullAVQDrainLen) {
    // only a short tail remains: finish the queue in this batch
    while (this.avQueue.length > 0) {
      runningFetchers.push(fetch(this.avQueue.shift()).catch(noop));
    }
  }
  Promise.all(runningFetchers)
    .then(this.avFetchDone)
    .catch(this.avFetchDone);
};
|
||||
|
||||
/**
 * Starts the next batch of non audio/video fetches, unless a batch is already
 * in flight (queuing) or queue is empty.
 *
 * Up to DefaultNumImFetches URLs are taken from the front of queue. If what is
 * left afterwards is no longer than FullImgQDrainLen, those URLs join the same
 * batch, so a batch never exceeds DefaultNumImFetches + FullImgQDrainLen
 * requests. imgFetchDone is chained onto Promise.all of the batch.
 */
AutoFetcher.prototype.fetchImgs = function() {
  if (this.queuing || this.queue.length === 0) {
    return;
  }
  this.queuing = true;
  var runningFetchers = [];
  // strict `<` keeps the baseline at exactly DefaultNumImFetches; `<=` started
  // one extra fetch and let a batch exceed the documented maximum
  while (
    this.queue.length > 0 &&
    runningFetchers.length < DefaultNumImFetches
  ) {
    // individual failures are swallowed so one bad URL cannot reject the batch
    runningFetchers.push(fetch(this.queue.shift()).catch(noop));
  }
  if (this.queue.length <= FullImgQDrainLen) {
    // only a short tail remains: finish the queue in this batch
    while (this.queue.length > 0) {
      runningFetchers.push(fetch(this.queue.shift()).catch(noop));
    }
  }
  Promise.all(runningFetchers)
    .then(this.imgFetchDone)
    .catch(this.imgFetchDone);
};
|
||||
|
||||
AutoFetcher.prototype.queueNonAVURL = function(url) {
|
||||
// ensure we do not request data urls
|
||||
if (url.indexOf(DataURLPrefix) === 0) return;
|
||||
// check to see if we have seen this url before in order
|
||||
// to lessen the load against the server content is fetched from
|
||||
if (this.seen[url] != null) return;
|
||||
this.seen[url] = true;
|
||||
this.queue.push(url);
|
||||
};
|
||||
|
||||
AutoFetcher.prototype.queueAVURL = function(url) {
|
||||
// ensure we do not request data urls
|
||||
if (url.indexOf(DataURLPrefix) === 0) return;
|
||||
// check to see if we have seen this url before in order
|
||||
// to lessen the load against the server content is fetched from
|
||||
if (this.seen[url] != null) return;
|
||||
this.seen[url] = true;
|
||||
this.avQueue.push(url);
|
||||
};
|
||||
|
||||
AutoFetcher.prototype.safeResolve = function(url, resolver) {
|
||||
// Guard against the exception thrown by the URL constructor if the URL or resolver is bad
|
||||
// if resolver is undefined/null then this function passes url through
|
||||
var resolvedURL = url;
|
||||
if (resolver) {
|
||||
try {
|
||||
resolvedURL = new URL(url, resolver).href;
|
||||
} catch (e) {
|
||||
resolvedURL = url;
|
||||
}
|
||||
}
|
||||
return resolvedURL;
|
||||
};
|
||||
|
||||
AutoFetcher.prototype.urlExtractor = function(
|
||||
match,
|
||||
n1,
|
||||
n2,
|
||||
n3,
|
||||
offset,
|
||||
string
|
||||
) {
|
||||
// Same function as style_replacer in wombat.rewrite_style, n2 is our URL
|
||||
// this.currentResolver is set to the URL which the browser would normally
|
||||
// resolve relative urls with (URL of the stylesheet) in an exceptionless manner
|
||||
// (resolvedURL will be undefined if an error occurred)
|
||||
var resolvedURL = this.safeResolve(n2, this.currentResolver);
|
||||
if (resolvedURL) {
|
||||
this.queueNonAVURL(resolvedURL);
|
||||
}
|
||||
return n1 + n2 + n3;
|
||||
};
|
||||
|
||||
AutoFetcher.prototype.extractMedia = function(mediaRules) {
|
||||
// this is a broken down rewrite_style
|
||||
if (mediaRules == null) return;
|
||||
for (var i = 0; i < mediaRules.length; i++) {
|
||||
// set currentResolver to the value of this stylesheets URL, done to ensure we do not have to
|
||||
// create functions on each loop iteration because we potentially create a new `URL` object
|
||||
// twice per iteration
|
||||
this.currentResolver = mediaRules[i].resolve;
|
||||
mediaRules[i].cssText
|
||||
.replace(STYLE_REGEX, this.urlExtractor)
|
||||
.replace(IMPORT_REGEX, this.urlExtractor);
|
||||
}
|
||||
};
|
||||
|
||||
/**
 * Queues every URL found in the srcset strings sent by the preservation
 * worker in proxy mode.
 *
 * @param {Array} srcsets entries read as `{ srcset, resolve, mod }`: the raw
 *   srcset attribute value, the URL used to resolve relative URLs, and the
 *   modifier; 'im_' selects the non audio/video queue, anything else the
 *   audio/video queue. A null/undefined argument is a no-op.
 */
AutoFetcher.prototype.extractSrcset = function(srcsets) {
  if (srcsets == null) return;
  var length = srcsets.length;
  var extractedSrcSet, srcsetValue, ssSplit, resolvedURL, j;
  for (var i = 0; i < length; i++) {
    extractedSrcSet = srcsets[i];
    // srcsetSplit has a capture group, so the pieces include captured
    // candidates as well as empty/undefined entries, which are skipped below
    ssSplit = extractedSrcSet.srcset.split(srcsetSplit);
    for (j = 0; j < ssSplit.length; j++) {
      if (!ssSplit[j]) continue;
      srcsetValue = ssSplit[j].trim();
      if (srcsetValue.length === 0) continue;
      // keep the URL, drop the width/density descriptor; safeResolve does not
      // throw and hands the URL back unchanged when it cannot be resolved
      resolvedURL = this.safeResolve(
        srcsetValue.split(' ')[0],
        extractedSrcSet.resolve
      );
      if (!resolvedURL) continue;
      if (extractedSrcSet.mod === 'im_') {
        this.queueNonAVURL(resolvedURL);
      } else {
        this.queueAVURL(resolvedURL);
      }
    }
  }
};
|
||||
|
||||
AutoFetcher.prototype.extractSrc = function(srcVals) {
|
||||
// preservation worker in proxy mode sends us the value of the srcset attribute of an element
|
||||
// and a URL to correctly resolve relative URLS. Thus we must recreate rewrite_srcset logic here
|
||||
if (srcVals == null || srcVals.length === 0) return;
|
||||
var length = srcVals.length;
|
||||
var srcVal;
|
||||
for (var i = 0; i < length; i++) {
|
||||
srcVal = srcVals[i];
|
||||
var resolvedURL = this.safeResolve(srcVal.src, srcVal.resolve);
|
||||
if (resolvedURL) {
|
||||
if (srcVal.mod === 'im_') {
|
||||
this.queueNonAVURL(resolvedURL);
|
||||
} else {
|
||||
this.queueAVURL(resolvedURL);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
AutoFetcher.prototype.autofetchMediaSrcset = function(data) {
|
||||
// we got a message and now we autofetch!
|
||||
// these calls turn into no ops if they have no work
|
||||
this.extractMedia(data.media);
|
||||
this.extractSrcset(data.srcset);
|
||||
this.extractSrc(data.src);
|
||||
this.fetchImgs();
|
||||
this.fetchAV();
|
||||
};
|
||||
|
||||
AutoFetcher.prototype.justFetch = function(data) {
|
||||
// we got a message containing only urls to be fetched
|
||||
if (data == null || data.values == null) return;
|
||||
for (var i = 0; i < data.values.length; ++i) {
|
||||
this.queueNonAVURL(data.values[i]);
|
||||
}
|
||||
this.fetchImgs();
|
||||
};
|
||||
|
||||
autofetcher = new AutoFetcher();
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright(c) 2013-2018 Rhizome and Ilya Kreymer. Released under the GNU General Public License.
|
||||
Copyright(c) 2013-2018 Rhizome and Contributors. Released under the GNU General Public License.
|
||||
|
||||
This file is part of pywb, https://github.com/webrecorder/pywb
|
||||
|
||||
|
@ -363,7 +363,7 @@ class TestProxyAutoFetchWorkerEndPoints(BaseTestProxy):
|
||||
|
||||
def test_proxy_worker_options_request(self, scheme):
|
||||
expected_origin = '{0}://example.com'.format(scheme)
|
||||
res = requests.options('{0}://pywb.proxy/static/autoFetchWorkerProxyMode.js'.format(scheme),
|
||||
res = requests.options('{0}://pywb.proxy/static/autoFetchWorker.js'.format(scheme),
|
||||
headers=dict(Origin=expected_origin),
|
||||
proxies=self.proxies, verify=self.root_ca_file)
|
||||
|
||||
@ -372,7 +372,7 @@ class TestProxyAutoFetchWorkerEndPoints(BaseTestProxy):
|
||||
|
||||
def test_proxy_worker_fetch(self, scheme):
|
||||
origin = '{0}://example.com'.format(scheme)
|
||||
url = '{0}://pywb.proxy/static/autoFetchWorkerProxyMode.js'.format(scheme)
|
||||
url = '{0}://pywb.proxy/static/autoFetchWorker.js'.format(scheme)
|
||||
res = requests.get(url,
|
||||
headers=dict(Origin=origin),
|
||||
proxies=self.proxies, verify=self.root_ca_file)
|
||||
@ -380,11 +380,11 @@ class TestProxyAutoFetchWorkerEndPoints(BaseTestProxy):
|
||||
assert res.ok
|
||||
assert res.headers.get('Content-Type') == 'application/javascript'
|
||||
assert res.headers.get('Access-Control-Allow-Origin') == origin
|
||||
assert 'AutoFetcher.prototype.safeResolve' in res.text
|
||||
assert 'function handleSrcsetProxyMode' in res.text
|
||||
|
||||
res = requests.get(url, proxies=self.proxies, verify=self.root_ca_file)
|
||||
|
||||
assert res.ok
|
||||
assert res.headers.get('Content-Type') == 'application/javascript'
|
||||
assert res.headers.get('Access-Control-Allow-Origin') == '*'
|
||||
assert 'AutoFetcher.prototype.safeResolve' in res.text
|
||||
assert 'function handleSrcsetProxyMode' in res.text
|
||||
|
Loading…
x
Reference in New Issue
Block a user