1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

auto-fetch: (#484)

- reworked both the proxy and non-proxy mode backing workers to no longer fetch in bursts; URLs are now fetched as they are sent to the worker, with a maximum of 20 fetches running at a time
 - added just-fetch to non-proxy mode backing worker
 - updated the auto fetch worker abstraction in non-proxy mode used by wombat to be exposed as it is in proxy mode, and ensured that the value property of the srcset object is used when sending rewritten srcset values to the backing worker
  - combined the backing worker proxy & non-proxy mode into a single file
  - added rollup config for back auto fetch worker
This commit is contained in:
John Berlin 2019-06-28 01:01:45 -04:00 committed by Ilya Kreymer
parent 193607eed8
commit 06513c2592
6 changed files with 233 additions and 534 deletions

View File

@ -1,20 +1,29 @@
'use strict';
// thanks wombat
var STYLE_REGEX = /(url\s*\(\s*[\\"']*)([^)'"]+)([\\"']*\s*\))/gi;
var IMPORT_REGEX = /(@import\s+[\\"']*)([^)'";]+)([\\"']*\s*;?)/gi;
var IMPORT_REGEX = /(@import\s*[\\"']*)([^)'";]+)([\\"']*\s*;?)/gi;
var srcsetSplit = /\s*(\S*\s+[\d.]+[wx]),|(?:\s*,(?:\s+|(?=https?:)))/;
var DefaultNumImFetches = 30;
var FullImgQDrainLen = 10;
var DefaultNumAvFetches = 5;
var FullAVQDrainLen = 5;
var MaxRunningFetches = 15;
var DataURLPrefix = 'data:';
var seen = {};
// array of URLs to be fetched
var queue = [];
var runningFetches = 0;
// a URL to resolve relative URLs found in the cssText of CSSMedia rules.
var currentResolver = null;
// the autofetcher instance for this worker
var autofetcher = null;
var config = {
havePromise: typeof self.Promise !== 'undefined',
haveFetch: typeof self.fetch !== 'undefined',
proxyMode: false,
mod: null,
prefix: null,
prefixMod: null,
relative: null,
rwRe: null
};
function noop() {}
if (typeof self.Promise === 'undefined') {
if (!config.havePromise) {
// not kewl we must polyfill Promise
self.Promise = function(executor) {
executor(noop, noop);
@ -31,157 +40,97 @@ if (typeof self.Promise === 'undefined') {
};
}
if (typeof self.fetch === 'undefined') {
if (!config.haveFetch) {
// not kewl we must polyfill fetch.
self.fetch = function(url) {
return new Promise(function(resolve) {
var xhr = new XMLHttpRequest();
xhr.open('GET', url);
xhr.open('GET', url, true);
xhr.onreadystatechange = function() {
if (xhr.readyState === 4) {
if (!config.havePromise) {
fetchDoneOrErrored();
}
resolve();
}
};
xhr.send();
resolve();
});
};
}
// Decide which mode this worker runs in. When the worker URL carries an `init` query
// parameter it holds the JSON rewrite configuration (non-proxy mode); otherwise the
// worker runs in proxy mode.
if (location.search.indexOf('init') !== -1) {
  (function() {
    var init;
    if (typeof self.URL === 'function') {
      var loc = new self.URL(location.href);
      init = JSON.parse(loc.searchParams.get('init'));
    } else {
      // no URL constructor available: pull the first query parameter out by hand
      var search = decodeURIComponent(location.search.split('?')[1]).split('&');
      init = JSON.parse(search[0].substr(search[0].indexOf('=') + 1));
      init.prefix = decodeURIComponent(init.prefix);
      // decode baseURI from its own value; it used to be overwritten with the
      // (already decoded) prefix
      if (init.baseURI != null) {
        init.baseURI = decodeURIComponent(init.baseURI);
      }
    }
    config.prefix = init.prefix;
    config.mod = init.mod;
    config.prefixMod = init.prefix + init.mod;
    // No `g` flag: as far as this file shows the regex is only consumed through
    // `.test()`, and a global regex keeps `lastIndex` between calls, which makes
    // consecutive tests against different URLs give inconsistent answers.
    config.rwRe = new RegExp(init.rwRe);
    // relative form of the prefix (everything after this worker's origin)
    config.relative = init.prefix.split(location.origin)[1];
    // schemeless form of the prefix
    config.schemeless = '/' + config.relative;
  })();
} else {
  config.proxyMode = true;
}
self.onmessage = function(event) {
var data = event.data;
switch (data.type) {
case 'values':
autofetcher.autoFetch(data);
autoFetch(data);
break;
case 'fetch-all':
justFetch(data);
break;
}
};
function AutoFetcher(init) {
if (!(this instanceof AutoFetcher)) {
return new AutoFetcher(init);
}
this.prefix = init.prefix;
this.mod = init.mod;
this.prefixMod = init.prefix + init.mod;
this.rwRe = new RegExp(init.rwRe);
// relative url, WorkerLocation is set by owning document
this.relative = init.prefix.split(location.origin)[1];
// schemeless url
this.schemeless = '/' + this.relative;
// local cache of URLs fetched, to reduce server load
this.seen = {};
// array of URLs to be fetched
this.queue = [];
this.avQueue = [];
// should we queue a URL or not
this.queuing = false;
this.queuingAV = false;
this.urlExtractor = this.urlExtractor.bind(this);
this.imgFetchDone = this.imgFetchDone.bind(this);
this.avFetchDone = this.avFetchDone.bind(this);
function noop() {}
function fetchDoneOrErrored() {
  // a fetch settled (successfully or not): release its slot first,
  // then let queued URLs use the freed capacity
  runningFetches = runningFetches - 1;
  fetchFromQ();
}
AutoFetcher.prototype.delay = function() {
// 2 second delay seem reasonable
return new Promise(function(resolve, reject) {
setTimeout(resolve, 2000);
});
};
function fetchURL(urlToBeFetched) {
  // Start one fetch and count it as running until it settles.
  runningFetches += 1;
  // The completion callback is registered as both the fulfilment and the rejection
  // handler of a single then(). Chaining .then(cb).catch(cb) would invoke cb a second
  // time if cb itself threw, decrementing runningFetches twice for one fetch.
  fetch(urlToBeFetched).then(fetchDoneOrErrored, fetchDoneOrErrored);
}
AutoFetcher.prototype.imgFetchDone = function() {
if (this.queue.length > 0) {
// we have a Q of some length drain it
var autofetcher = this;
this.delay().then(function() {
autofetcher.queuing = false;
autofetcher.fetchImgs();
});
} else {
this.queuing = false;
}
};
AutoFetcher.prototype.avFetchDone = function() {
if (this.avQueue.length > 0) {
// we have a Q of some length drain it
var autofetcher = this;
this.delay().then(function() {
autofetcher.queuingAV = false;
autofetcher.fetchAV();
});
} else {
this.queuingAV = false;
}
};
AutoFetcher.prototype.fetchAV = function() {
if (this.queuingAV || this.avQueue.length === 0) {
function queueOrFetch(urlToBeFetched) {
if (
!urlToBeFetched ||
urlToBeFetched.indexOf(DataURLPrefix) === 0 ||
seen[urlToBeFetched] != null
) {
return;
}
// the number of fetches is limited to a maximum of DefaultNumAvFetches + FullAVQDrainLen outstanding fetches
// the baseline maximum number of fetches is DefaultNumAvFetches but if the size(avQueue) <= FullAVQDrainLen
// we add them to the current batch. Because audio video resources might be big
// we limit how many we fetch at a time drastically
this.queuingAV = true;
var runningFetchers = [];
while (
this.avQueue.length > 0 &&
runningFetchers.length <= DefaultNumAvFetches
) {
runningFetchers.push(fetch(this.avQueue.shift()).catch(noop));
}
if (this.avQueue.length <= FullAVQDrainLen) {
while (this.avQueue.length > 0) {
runningFetchers.push(fetch(this.avQueue.shift()).catch(noop));
}
}
Promise.all(runningFetchers)
.then(this.avFetchDone)
.catch(this.avFetchDone);
};
AutoFetcher.prototype.fetchImgs = function() {
if (this.queuing || this.queue.length === 0) {
seen[urlToBeFetched] = true;
if (runningFetches >= MaxRunningFetches) {
queue.push(urlToBeFetched);
return;
}
// the number of fetches is limited to a maximum of DefaultNumImFetches + FullImgQDrainLen outstanding fetches
// the baseline maximum number of fetches is DefaultNumImFetches but if the size(queue) <= FullImgQDrainLen
// we add them to the current batch
this.queuing = true;
var runningFetchers = [];
while (
this.queue.length > 0 &&
runningFetchers.length <= DefaultNumImFetches
) {
runningFetchers.push(fetch(this.queue.shift()).catch(noop));
fetchURL(urlToBeFetched);
}
function fetchFromQ() {
while (queue.length && runningFetches < MaxRunningFetches) {
fetchURL(queue.shift());
}
if (this.queue.length <= FullImgQDrainLen) {
while (this.queue.length > 0) {
runningFetchers.push(fetch(this.queue.shift()).catch(noop));
}
}
Promise.all(runningFetchers)
.then(this.imgFetchDone)
.catch(this.imgFetchDone);
};
}
AutoFetcher.prototype.queueNonAVURL = function(url) {
// ensure we do not request data urls
if (url.indexOf(DataURLPrefix) === 0) return;
// check to see if we have seen this url before in order
// to lessen the load against the server content is fetched from
if (this.seen[url] != null) return;
this.seen[url] = true;
this.queue.push(url);
};
AutoFetcher.prototype.queueAVURL = function(url) {
// ensure we do not request data urls
if (url.indexOf(DataURLPrefix) === 0) return;
// check to see if we have seen this url before in order
// to lessen the load against the server content is fetched from
if (this.seen[url] != null) return;
this.seen[url] = true;
this.avQueue.push(url);
};
AutoFetcher.prototype.maybeResolveURL = function(url, base) {
function maybeResolveURL(url, base) {
// given a url and base url returns a resolved full URL or
// null if resolution was unsuccessful
try {
@ -190,99 +139,129 @@ AutoFetcher.prototype.maybeResolveURL = function(url, base) {
} catch (e) {
return null;
}
};
}
AutoFetcher.prototype.maybeFixUpRelSchemelessPrefix = function(url) {
function safeResolve(url, resolver) {
  // Resolve `url` against `resolver` without ever throwing.
  // Without a resolver, or when the URL constructor rejects the pair,
  // the url is handed back untouched.
  if (!resolver) {
    return url;
  }
  try {
    return new URL(url, resolver).href;
  } catch (err) {
    return url;
  }
}
function maybeFixUpRelSchemelessPrefix(url) {
// attempt to ensure rewritten relative or schemeless URLs become full URLS!
// otherwise returns null if this did not happen
if (url.indexOf(this.relative) === 0) {
return url.replace(this.relative, this.prefix);
if (url.indexOf(config.relative) === 0) {
return url.replace(config.relative, config.prefix);
}
if (url.indexOf(this.schemeless) === 0) {
return url.replace(this.schemeless, this.prefix);
if (url.indexOf(config.schemeless) === 0) {
return url.replace(config.schemeless, config.prefix);
}
return null;
};
}
AutoFetcher.prototype.maybeFixUpURL = function(url, resolveOpts) {
function maybeFixUpURL(url, resolveOpts) {
// attempt to fix up the url and do our best to ensure we can get dat 200 OK!
if (this.rwRe.test(url)) {
if (config.rwRe.test(url)) {
return url;
}
var mod = resolveOpts.mod || 'mp_';
// first check for / (relative) or // (schemeless) rewritten urls
var maybeFixed = this.maybeFixUpRelSchemelessPrefix(url);
var maybeFixed = maybeFixUpRelSchemelessPrefix(url);
if (maybeFixed != null) {
return maybeFixed;
}
// resolve URL against tag src
if (resolveOpts.tagSrc != null) {
maybeFixed = this.maybeResolveURL(url, resolveOpts.tagSrc);
maybeFixed = maybeResolveURL(url, resolveOpts.tagSrc);
if (maybeFixed != null) {
return this.prefix + mod + '/' + maybeFixed;
return config.prefix + mod + '/' + maybeFixed;
}
}
// finally last attempt resolve the originating documents base URI
if (resolveOpts.docBaseURI) {
maybeFixed = this.maybeResolveURL(url, resolveOpts.docBaseURI);
maybeFixed = maybeResolveURL(url, resolveOpts.docBaseURI);
if (maybeFixed != null) {
return this.prefix + mod + '/' + maybeFixed;
return config.prefix + mod + '/' + maybeFixed;
}
}
// not much to do now.....
return this.prefixMod + '/' + url;
};
return config.prefixMod + '/' + url;
}
AutoFetcher.prototype.urlExtractor = function(
match,
n1,
n2,
n3,
offset,
string
) {
function urlExtractor(match, n1, n2, n3, offset, string) {
// Same function as style_replacer in wombat.rewrite_style, n2 is our URL
this.queueNonAVURL(n2);
queueOrFetch(n2);
return n1 + n2 + n3;
};
}
AutoFetcher.prototype.handleMedia = function(mediaRules) {
function urlExtractorProxyMode(match, n1, n2, n3, offset, string) {
  // String.replace callback used for CSS text in proxy mode; n2 is the URL capture.
  // currentResolver holds the stylesheet URL the browser would resolve relative URLs
  // against; safeResolve never throws and hands n2 back unchanged when it cannot resolve.
  var absoluteURL = safeResolve(n2, currentResolver);
  queueOrFetch(absoluteURL);
  // give the matched text back unchanged so the CSS string is not altered
  return n1 + n2 + n3;
}
function handleMedia(mediaRules) {
// this is a broken down rewrite_style
if (mediaRules == null || mediaRules.length === 0) return;
// var rules = mediaRules.values;
for (var i = 0; i < mediaRules.length; i++) {
mediaRules[i]
.replace(STYLE_REGEX, this.urlExtractor)
.replace(IMPORT_REGEX, this.urlExtractor);
.replace(STYLE_REGEX, urlExtractor)
.replace(IMPORT_REGEX, urlExtractor);
}
};
}
AutoFetcher.prototype.handleSrc = function(srcValues, context) {
function handleMediaProxyMode(mediaRules) {
  // Proxy-mode counterpart of handleMedia: every rule carries its cssText plus the URL
  // (resolve) that relative URLs inside that text must be resolved against.
  if (mediaRules == null || mediaRules.length === 0) return;
  var idx = 0;
  while (idx < mediaRules.length) {
    var rule = mediaRules[idx];
    // publish the resolver through the shared variable so one replace callback
    // can serve every rule instead of building a closure per iteration
    currentResolver = rule.resolve;
    var afterUrlPass = rule.cssText.replace(STYLE_REGEX, urlExtractorProxyMode);
    afterUrlPass.replace(IMPORT_REGEX, urlExtractorProxyMode);
    idx += 1;
  }
}
function handleSrc(srcValues, context) {
var resolveOpts = { docBaseURI: context.docBaseURI };
if (srcValues.value) {
resolveOpts.mod = srcValues.mod;
if (resolveOpts.mod === 1) {
return this.queueNonAVURL(
this.maybeFixUpURL(srcValues.value.trim(), resolveOpts)
);
}
return this.queueAVURL(
this.maybeFixUpURL(srcValues.value.trim(), resolveOpts)
);
return queueOrFetch(maybeFixUpURL(srcValues.value.trim(), resolveOpts));
}
var len = srcValues.values.length;
for (var i = 0; i < len; i++) {
var value = srcValues.values[i];
resolveOpts.mod = value.mod;
if (resolveOpts.mod === 'im_') {
this.queueNonAVURL(this.maybeFixUpURL(value.src, resolveOpts));
} else {
this.queueAVURL(this.maybeFixUpURL(value.src, resolveOpts));
}
queueOrFetch(maybeFixUpURL(value.src, resolveOpts));
}
};
}
AutoFetcher.prototype.extractSrcSetNotPreSplit = function(ssV, resolveOpts) {
function handleSrcProxyMode(srcValues) {
  // In proxy mode each entry carries a src value plus the URL it should be resolved
  // against; resolve every entry and hand it to the fetch queue.
  if (srcValues == null || srcValues.length === 0) return;
  var idx = 0;
  while (idx < srcValues.length) {
    var entry = srcValues[idx];
    var absoluteURL = safeResolve(entry.src, entry.resolve);
    queueOrFetch(absoluteURL);
    idx += 1;
  }
}
function extractSrcSetNotPreSplit(ssV, resolveOpts) {
if (!ssV) return;
// was from extract from local doc so we need to duplicate work
var srcsetValues = ssV.split(srcsetSplit);
@ -290,41 +269,38 @@ AutoFetcher.prototype.extractSrcSetNotPreSplit = function(ssV, resolveOpts) {
// grab the URL not width/height key
if (srcsetValues[i]) {
var value = srcsetValues[i].trim().split(' ')[0];
var maybeResolvedURL = this.maybeFixUpURL(value.trim(), resolveOpts);
if (resolveOpts.mod === 'im_') {
this.queueNonAVURL(maybeResolvedURL);
} else {
this.queueAVURL(maybeResolvedURL);
}
var maybeResolvedURL = maybeFixUpURL(value.trim(), resolveOpts);
queueOrFetch(maybeResolvedURL);
}
}
};
}
AutoFetcher.prototype.extractSrcset = function(srcsets, context) {
function extractSrcset(srcsets) {
// was rewrite_srcset and only need to q
for (var i = 0; i < srcsets.length; i++) {
// grab the URL not width/height key
var url = srcsets[i].split(' ')[0];
if (context.mod === 'im_') {
this.queueNonAVURL(url);
} else {
this.queueAVURL(url);
}
queueOrFetch(url);
}
};
}
AutoFetcher.prototype.handleSrcset = function(srcset, context) {
var resolveOpts = { docBaseURI: context.docBaseURI };
function handleSrcset(srcset, context) {
if (srcset == null) return;
var resolveOpts = {
docBaseURI: context.docBaseURI,
mode: null,
tagSrc: null
};
if (srcset.value) {
// we have a single value, this srcset came from either
// preserveDataSrcset (not presplit) preserveSrcset (presplit)
resolveOpts.mod = srcset.mod;
if (!srcset.presplit) {
// extract URLs from the srcset string
return this.extractSrcSetNotPreSplit(srcset.value, resolveOpts);
return extractSrcSetNotPreSplit(srcset.value, resolveOpts);
}
// we have an array of srcset URL strings
return this.extractSrcset(srcset.value, resolveOpts);
return extractSrcset(srcset.value);
}
// we have an array of values, these srcsets came from extractFromLocalDoc
var len = srcset.values.length;
@ -332,38 +308,64 @@ AutoFetcher.prototype.handleSrcset = function(srcset, context) {
var ssv = srcset.values[i];
resolveOpts.mod = ssv.mod;
resolveOpts.tagSrc = ssv.tagSrc;
this.extractSrcSetNotPreSplit(ssv.srcset, resolveOpts);
extractSrcSetNotPreSplit(ssv.srcset, resolveOpts);
}
};
}
AutoFetcher.prototype.autoFetch = function(data) {
function handleSrcsetProxyMode(srcsets) {
  // In proxy mode we receive raw srcset attribute values together with the URL to
  // resolve them against, so the srcset has to be split into candidates here.
  if (srcsets == null) return;
  for (var i = 0, total = srcsets.length; i < total; i++) {
    var entry = srcsets[i];
    var candidates = entry.srcset.split(srcsetSplit);
    for (var k = 0; k < candidates.length; k++) {
      // the split regex has a capture group, so the result can hold
      // undefined or empty entries; skip those
      var candidate = candidates[k] ? candidates[k].trim() : '';
      if (!candidate) continue;
      // keep only the URL, dropping the width/density descriptor
      queueOrFetch(safeResolve(candidate.split(' ')[0], entry.resolve));
    }
  }
}
function autoFetch(data) {
// we got a message and now we autofetch!
// these calls turn into no ops if they have no work
if (data.media) {
this.handleMedia(data.media);
if (config.proxyMode) {
handleMediaProxyMode(data.media);
} else {
handleMedia(data.media);
}
}
if (data.src) {
this.handleSrc(data.src, data.context || {});
if (config.proxyMode) {
handleSrcProxyMode(data.src);
} else {
handleSrc(data.src, data.context || { docBaseURI: null });
}
}
if (data.srcset) {
this.handleSrcset(data.srcset, data.context || {});
if (config.proxyMode) {
handleSrcsetProxyMode(data.srcset);
} else {
handleSrcset(data.srcset, data.context || { docBaseURI: null });
}
}
}
function justFetch(data) {
// we got a message containing only urls to be fetched
if (data == null || data.values == null) return;
for (var i = 0; i < data.values.length; ++i) {
queueOrFetch(data.values[i]);
}
this.fetchImgs();
this.fetchAV();
};
// initialize ourselves from the query params :)
try {
var loc = new self.URL(location.href);
autofetcher = new AutoFetcher(JSON.parse(loc.searchParams.get('init')));
} catch (e) {
// likely we are in an older version of safari
var search = decodeURIComponent(location.search.split('?')[1]).split('&');
var init = JSON.parse(search[0].substr(search[0].indexOf('=') + 1));
init.prefix = decodeURIComponent(init.prefix);
init.baseURI = decodeURIComponent(init.baseURI);
autofetcher = new AutoFetcher(init);
}

View File

@ -1,303 +0,0 @@
'use strict';
// thanks wombat
var STYLE_REGEX = /(url\s*\(\s*[\\"']*)([^)'"]+)([\\"']*\s*\))/gi;
var IMPORT_REGEX = /(@import\s+[\\"']*)([^)'";]+)([\\"']*\s*;?)/gi;
var srcsetSplit = /\s*(\S*\s+[\d.]+[wx]),|(?:\s*,(?:\s+|(?=https?:)))/;
var DefaultNumImFetches = 30;
var FullImgQDrainLen = 10;
var DefaultNumAvFetches = 5;
var FullAVQDrainLen = 5;
var DataURLPrefix = 'data:';
var FetchDelay = 1000;
// the autofetcher instance for this worker
var autofetcher = null;
function noop() {}
// Install a minimal stand-in when the environment has no Promise. It is NOT a real
// promise implementation: nothing is deferred and no values or errors are propagated.
if (typeof self.Promise === 'undefined') {
// not kewl we must polyfill Promise
// the executor runs synchronously and receives no-op resolve/reject functions,
// so settling the "promise" has no effect
self.Promise = function(executor) {
executor(noop, noop);
};
// then() invokes the callback immediately (with no argument) and returns the same object
self.Promise.prototype.then = function(cb) {
if (cb) cb();
return this;
};
// catch() never invokes its handler; it only keeps the chain going
self.Promise.prototype.catch = function() {
return this;
};
// all() ignores the supplied values and returns a fresh stand-in
self.Promise.all = function(values) {
return new Promise(noop);
};
}
// Install an XMLHttpRequest based stand-in when the environment has no fetch.
// The returned promise resolves with no value once the request has finished.
if (typeof self.fetch === 'undefined') {
  // not kewl we must polyfill fetch.
  self.fetch = function(url) {
    return new Promise(function(resolve) {
      var xhr = new XMLHttpRequest();
      xhr.open('GET', url, true);
      // resolve only when the request is done (readyState 4 is reached for failed
      // requests too); resolving right after send() reported completion before the
      // request had finished
      xhr.onreadystatechange = function() {
        if (xhr.readyState === 4) {
          resolve();
        }
      };
      xhr.send();
    });
  };
}
// Dispatch messages from the owning page to the autofetcher instance.
// Messages with any other type are ignored.
self.onmessage = function(event) {
  var message = event.data;
  var kind = message.type;
  if (kind === 'values') {
    // extracted media rules / srcset / src values
    autofetcher.autofetchMediaSrcset(message);
  } else if (kind === 'fetch-all') {
    // a plain list of URLs
    autofetcher.justFetch(message);
  }
};
// Holds the de-duplication cache, the two fetch queues and their state flags.
function AutoFetcher() {
  // allow construction without `new`
  if (!(this instanceof AutoFetcher)) {
    return new AutoFetcher();
  }
  // local cache of URLs fetched, to reduce server load
  this.seen = {};
  // URLs waiting to be fetched: non audio/video URLs and audio/video URLs
  this.queue = [];
  this.avQueue = [];
  // true while a batch taken from the corresponding queue is being fetched
  this.queuing = false;
  this.queuingAV = false;
  // a URL to resolve relative URLs found in the cssText of CSSMedia rules.
  this.currentResolver = null;
  // these are handed out as bare callbacks (String.replace / promise handlers),
  // so they must keep their receiver
  this.urlExtractor = this.urlExtractor.bind(this);
  this.imgFetchDone = this.imgFetchDone.bind(this);
  this.avFetchDone = this.avFetchDone.bind(this);
}
// Returns a promise that resolves (with no value) after FetchDelay milliseconds;
// used to pause between fetch batches.
AutoFetcher.prototype.delay = function() {
  return new Promise(function(resolve) {
    setTimeout(resolve, FetchDelay);
  });
};
// Called when a batch of non audio/video fetches has finished.
AutoFetcher.prototype.imgFetchDone = function() {
  if (this.queue.length > 0) {
    // more URLs are waiting: pause, then release the flag and start the next batch
    var fetcher = this;
    this.delay().then(function() {
      fetcher.queuing = false;
      fetcher.fetchImgs();
    });
    return;
  }
  // nothing left, allow the next fetchImgs call to run
  this.queuing = false;
};
// Called when a batch of audio/video fetches has finished.
AutoFetcher.prototype.avFetchDone = function() {
  if (this.avQueue.length > 0) {
    // more URLs are waiting: pause, then release the flag and start the next batch
    var fetcher = this;
    this.delay().then(function() {
      fetcher.queuingAV = false;
      fetcher.fetchAV();
    });
    return;
  }
  // nothing left, allow the next fetchAV call to run
  this.queuingAV = false;
};
// Starts the next batch of audio/video fetches unless one is already running
// or there is nothing queued.
AutoFetcher.prototype.fetchAV = function() {
  if (this.queuingAV || this.avQueue.length === 0) {
    return;
  }
  // the number of fetches is limited to a maximum of DefaultNumAvFetches + FullAVQDrainLen outstanding fetches
  // the baseline maximum number of fetches is DefaultNumAvFetches but if the size(avQueue) <= FullAVQDrainLen
  // we add them to the current batch. Because audio video resources might be big
  // we limit how many we fetch at a time drastically
  this.queuingAV = true;
  var runningFetchers = [];
  // strictly less than: `<=` started DefaultNumAvFetches + 1 fetches and so
  // exceeded the documented limit by one
  while (
    this.avQueue.length > 0 &&
    runningFetchers.length < DefaultNumAvFetches
  ) {
    // individual failures are swallowed so one bad URL does not fail the batch
    runningFetchers.push(fetch(this.avQueue.shift()).catch(noop));
  }
  if (this.avQueue.length <= FullAVQDrainLen) {
    while (this.avQueue.length > 0) {
      runningFetchers.push(fetch(this.avQueue.shift()).catch(noop));
    }
  }
  Promise.all(runningFetchers)
    .then(this.avFetchDone)
    .catch(this.avFetchDone);
};
// Starts the next batch of non audio/video fetches unless one is already running
// or there is nothing queued.
AutoFetcher.prototype.fetchImgs = function() {
  if (this.queuing || this.queue.length === 0) {
    return;
  }
  // the number of fetches is limited to a maximum of DefaultNumImFetches + FullImgQDrainLen outstanding fetches
  // the baseline maximum number of fetches is DefaultNumImFetches but if the size(queue) <= FullImgQDrainLen
  // we add them to the current batch
  this.queuing = true;
  var runningFetchers = [];
  // strictly less than: `<=` started DefaultNumImFetches + 1 fetches and so
  // exceeded the documented limit by one
  while (
    this.queue.length > 0 &&
    runningFetchers.length < DefaultNumImFetches
  ) {
    // individual failures are swallowed so one bad URL does not fail the batch
    runningFetchers.push(fetch(this.queue.shift()).catch(noop));
  }
  if (this.queue.length <= FullImgQDrainLen) {
    while (this.queue.length > 0) {
      runningFetchers.push(fetch(this.queue.shift()).catch(noop));
    }
  }
  Promise.all(runningFetchers)
    .then(this.imgFetchDone)
    .catch(this.imgFetchDone);
};
// Adds a non audio/video URL to the fetch queue unless it is empty, a data URL,
// or has been queued before.
AutoFetcher.prototype.queueNonAVURL = function(url) {
  // nothing to fetch (also avoids calling indexOf on null/undefined)
  if (!url) return;
  // ensure we do not request data urls
  if (url.indexOf(DataURLPrefix) === 0) return;
  // check to see if we have seen this url before in order
  // to lessen the load against the server content is fetched from.
  // Own-property check: a plain lookup would treat keys inherited from
  // Object.prototype (e.g. "constructor") as already seen
  if (Object.prototype.hasOwnProperty.call(this.seen, url)) return;
  this.seen[url] = true;
  this.queue.push(url);
};
// Adds an audio/video URL to the audio/video fetch queue unless it is empty,
// a data URL, or has been queued before.
AutoFetcher.prototype.queueAVURL = function(url) {
  // nothing to fetch (also avoids calling indexOf on null/undefined)
  if (!url) return;
  // ensure we do not request data urls
  if (url.indexOf(DataURLPrefix) === 0) return;
  // check to see if we have seen this url before in order
  // to lessen the load against the server content is fetched from.
  // Own-property check: a plain lookup would treat keys inherited from
  // Object.prototype (e.g. "constructor") as already seen
  if (Object.prototype.hasOwnProperty.call(this.seen, url)) return;
  this.seen[url] = true;
  this.avQueue.push(url);
};
// Resolves `url` against `resolver` without ever throwing. When no resolver is
// given, or the URL constructor rejects the pair, the url is returned unchanged.
AutoFetcher.prototype.safeResolve = function(url, resolver) {
  if (!resolver) {
    return url;
  }
  try {
    return new URL(url, resolver).href;
  } catch (err) {
    return url;
  }
};
// String.replace callback for CSS text (same role as style_replacer in
// wombat.rewrite_style); n2 is the URL capture.
AutoFetcher.prototype.urlExtractor = function(match, n1, n2, n3, offset, string) {
  // this.currentResolver is the stylesheet URL the browser would resolve relative
  // URLs against; safeResolve never throws and returns n2 itself when it cannot resolve
  var absoluteURL = this.safeResolve(n2, this.currentResolver);
  if (absoluteURL) {
    this.queueNonAVURL(absoluteURL);
  }
  // give the matched text back unchanged
  return n1 + n2 + n3;
};
// Scans the cssText of each media rule for url(...) and @import targets;
// the URLs themselves are queued by this.urlExtractor.
AutoFetcher.prototype.extractMedia = function(mediaRules) {
  if (mediaRules == null) return;
  var idx = 0;
  while (idx < mediaRules.length) {
    var rule = mediaRules[idx];
    // publish this rule's stylesheet URL on the instance so the single bound
    // urlExtractor callback can resolve relative URLs without a per-rule closure
    this.currentResolver = rule.resolve;
    var afterUrlPass = rule.cssText.replace(STYLE_REGEX, this.urlExtractor);
    afterUrlPass.replace(IMPORT_REGEX, this.urlExtractor);
    idx += 1;
  }
};
// Queues every URL found in the supplied srcset attribute values.
AutoFetcher.prototype.extractSrcset = function(srcsets) {
  // preservation worker in proxy mode sends us the value of the srcset attribute of an element
  // and a URL to correctly resolve relative URLS. Thus we must recreate rewrite_srcset logic here
  if (srcsets == null) return;
  var length = srcsets.length;
  for (var i = 0; i < length; i++) {
    var extractedSrcSet = srcsets[i];
    var ssSplit = extractedSrcSet.srcset.split(srcsetSplit);
    for (var j = 0; j < ssSplit.length; j++) {
      // the split regex has a capture group, so the result can hold
      // undefined or empty entries; skip those
      if (!ssSplit[j]) continue;
      var srcsetValue = ssSplit[j].trim();
      if (srcsetValue.length === 0) continue;
      // keep only the URL (drop the width/density descriptor) and resolve it
      // without throwing
      var resolvedURL = this.safeResolve(
        srcsetValue.split(' ')[0],
        extractedSrcSet.resolve
      );
      if (!resolvedURL) continue;
      // 'im_' entries go to the non audio/video queue, everything else to the AV queue
      if (extractedSrcSet.mod === 'im_') {
        this.queueNonAVURL(resolvedURL);
      } else {
        this.queueAVURL(resolvedURL);
      }
    }
  }
};
// Queues the src values sent by the page. Each entry carries the src, the URL to
// resolve it against and a mod: 'im_' entries go to the non audio/video queue,
// everything else to the audio/video queue.
AutoFetcher.prototype.extractSrc = function(srcVals) {
  if (srcVals == null || srcVals.length === 0) return;
  for (var i = 0, total = srcVals.length; i < total; i++) {
    var entry = srcVals[i];
    var absoluteURL = this.safeResolve(entry.src, entry.resolve);
    if (!absoluteURL) continue;
    if (entry.mod === 'im_') {
      this.queueNonAVURL(absoluteURL);
    } else {
      this.queueAVURL(absoluteURL);
    }
  }
};
// Handles a 'values' message: queue everything it carries, then start fetching.
AutoFetcher.prototype.autofetchMediaSrcset = function(data) {
  var fetcher = this;
  // queueing phase; each extractor returns early when its part of the message is missing
  fetcher.extractMedia(data.media);
  fetcher.extractSrcset(data.srcset);
  fetcher.extractSrc(data.src);
  // fetching phase; both return early when their queue is empty or a batch is running
  fetcher.fetchImgs();
  fetcher.fetchAV();
};
// Handles a 'fetch-all' message: data.values is a plain list of URLs, all of which
// are queued as non audio/video URLs before fetching starts.
AutoFetcher.prototype.justFetch = function(data) {
  if (data == null || data.values == null) return;
  var urls = data.values;
  for (var idx = 0; idx < urls.length; idx += 1) {
    this.queueNonAVURL(urls[idx]);
  }
  this.fetchImgs();
};
// create the single AutoFetcher instance that the self.onmessage handler dispatches to
autofetcher = new AutoFetcher();

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -1,5 +1,5 @@
/*
Copyright(c) 2013-2018 Rhizome and Ilya Kreymer. Released under the GNU General Public License.
Copyright(c) 2013-2018 Rhizome and Contributors. Released under the GNU General Public License.
This file is part of pywb, https://github.com/webrecorder/pywb

View File

@ -363,7 +363,7 @@ class TestProxyAutoFetchWorkerEndPoints(BaseTestProxy):
def test_proxy_worker_options_request(self, scheme):
expected_origin = '{0}://example.com'.format(scheme)
res = requests.options('{0}://pywb.proxy/static/autoFetchWorkerProxyMode.js'.format(scheme),
res = requests.options('{0}://pywb.proxy/static/autoFetchWorker.js'.format(scheme),
headers=dict(Origin=expected_origin),
proxies=self.proxies, verify=self.root_ca_file)
@ -372,7 +372,7 @@ class TestProxyAutoFetchWorkerEndPoints(BaseTestProxy):
def test_proxy_worker_fetch(self, scheme):
origin = '{0}://example.com'.format(scheme)
url = '{0}://pywb.proxy/static/autoFetchWorkerProxyMode.js'.format(scheme)
url = '{0}://pywb.proxy/static/autoFetchWorker.js'.format(scheme)
res = requests.get(url,
headers=dict(Origin=origin),
proxies=self.proxies, verify=self.root_ca_file)
@ -380,11 +380,11 @@ class TestProxyAutoFetchWorkerEndPoints(BaseTestProxy):
assert res.ok
assert res.headers.get('Content-Type') == 'application/javascript'
assert res.headers.get('Access-Control-Allow-Origin') == origin
assert 'AutoFetcher.prototype.safeResolve' in res.text
assert 'function handleSrcsetProxyMode' in res.text
res = requests.get(url, proxies=self.proxies, verify=self.root_ca_file)
assert res.ok
assert res.headers.get('Content-Type') == 'application/javascript'
assert res.headers.get('Access-Control-Allow-Origin') == '*'
assert 'AutoFetcher.prototype.safeResolve' in res.text
assert 'function handleSrcsetProxyMode' in res.text