1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Automatic fetching of picture > source[srcset] fixes #414 (#415)

- added to the auto-fetch worker of both wombat and wombatProxymode
- added utility function isImageSrcset to wombat for determining if the srcset values being rewritten are from either an image tag or a source tag within a picture tag
- added utility function isImageDataSrcset to wombat to check for img/source data-srcset attributes
- reworked the backing auto-fetch worker to now queue all URLs and perform fetch batching with maximum batch size of 60. A delay of 2 seconds is applied after each batch.

Ensured that the srcset values sent to the auto-fetch worker can be resolved in non-proxy mode fixes #413
Renamed the auto-fetch class used in proxy mode from AutoFetchWorker to AutoFetchWorkerProxyMode
Added checking of script tag types application/json and text/template to rewrite_script
This commit is contained in:
John Berlin 2018-11-20 14:43:18 -05:00 committed by Ilya Kreymer
parent 3e0bb49ae1
commit f78bac9474
4 changed files with 223 additions and 103 deletions

View File

@ -59,8 +59,6 @@ function AutoFetcher(init) {
this.schemeless = '/' + this.relative;
// local cache of URLs fetched, to reduce server load
this.seen = {};
// array of promises returned by fetch(URL)
this.fetches = [];
// array of URL to be fetched
this.queue = [];
// should we queue a URL or not
@ -86,61 +84,62 @@ AutoFetcher.prototype.fixupURL = function (url) {
return url;
};
AutoFetcher.prototype.safeFetch = function (url) {
AutoFetcher.prototype.queueURL = function (url) {
// ensure we do not request data urls
if (url.indexOf('data:') === 0) return;
// check to see if we have seen this url before in order
// to lessen the load against the server content is fetched from
if (this.seen[url] != null) return;
this.seen[url] = true;
if (this.queuing) {
// we are currently waiting for a batch of fetches to complete
return this.queue.push(url);
}
// fetch this url
this.fetches.push(fetch(url));
this.queue.push(url);
};
AutoFetcher.prototype.urlExtractor = function (match, n1, n2, n3, offset, string) {
// Same function as style_replacer in wombat.rewrite_style, n2 is our URL
this.safeFetch(this.fixupURL(n2));
this.queueURL(this.fixupURL(n2));
return n1 + n2 + n3;
};
// Returns a promise that is fulfilled once a fixed 2 second timer fires.
// The promise is never rejected. fetchDone waits on it before starting
// another batch.
AutoFetcher.prototype.delay = function () {
    var waitMs = 2000;
    return new Promise(function (done) {
        setTimeout(done, waitMs);
    });
};
AutoFetcher.prototype.fetchDone = function () {
// indicate we no longer need to Q
this.queuing = false;
if (this.queue.length > 0) {
// we have a Q of some length drain it
this.drainQ();
var autofetcher = this;
this.delay().then(function () {
autofetcher.fetchAll();
});
}
};
// Starts the next batch of fetches from the front of this.queue.
//
// Does nothing while a batch is already in flight (this.queuing) or when the
// queue is empty. Otherwise up to 50 URLs are taken off the queue; if 10 or
// fewer URLs would then be left over they are folded into the same batch, so
// a batch never holds more than 60 fetches. this.fetchDone is invoked once
// every fetch of the batch has settled.
AutoFetcher.prototype.fetchAll = function () {
    if (this.queuing || this.queue.length === 0) {
        return;
    }
    var baseBatchSize = 50;
    var maxLeftover = 10;
    // mark a batch as in flight so that re-entrant calls return early
    this.queuing = true;
    var runningFetchers = [];
    // strictly less-than: `<= 50` let a 51st fetch in, allowing 61 per batch
    while (this.queue.length > 0 && runningFetchers.length < baseBatchSize) {
        // each fetch gets its own catch(noop) so that a single failed request
        // cannot reject the Promise.all below (noop is defined elsewhere in
        // this file, presumably an empty function)
        runningFetchers.push(fetch(this.queue.shift()).catch(noop));
    }
    if (this.queue.length <= maxLeftover) {
        // small remainder: not worth a separate delayed batch
        while (this.queue.length > 0) {
            runningFetchers.push(fetch(this.queue.shift()).catch(noop));
        }
    }
    Promise.all(runningFetchers)
        .then(this.fetchDone)
        .catch(this.fetchDone);
};
AutoFetcher.prototype.drainQ = function () {
// clear our Q in place and fill our fetches array
while (this.queue.length > 0) {
this.fetches.push(fetch(this.queue.shift()));
}
// fetch all the things
this.fetchAll();
};
AutoFetcher.prototype.extractMedia = function (mediaRules) {
// this is a broken down rewrite_style
if (mediaRules == null || mediaRules.values === null) return;
@ -185,9 +184,11 @@ AutoFetcher.prototype.fixupURLSrcSet = function (url, tagSrc, context) {
return maybeFixed;
}
// resolve URL against tag src
maybeFixed = this.maybeResolveURL(url, tagSrc);
if (maybeFixed != null) {
return this.prefix + 'im_/' + maybeFixed;
if (tagSrc != null) {
maybeFixed = this.maybeResolveURL(url, tagSrc);
if (maybeFixed != null) {
return this.prefix + 'im_/' + maybeFixed;
}
}
// finally last attempt resolve the originating documents base URI
maybeFixed = this.maybeResolveURL(url, context.docBaseURI);
@ -210,7 +211,7 @@ AutoFetcher.prototype.extractSrcset = function (srcsets, context) {
// was rewrite_srcset so just ensure we just
for (var i = 0; i < srcsetValues.length; i++) {
// grab the URL not width/height key
this.safeFetch(srcsetValues[i].split(' ')[0]);
this.queueURL(srcsetValues[i].split(' ')[0]);
}
};
@ -224,7 +225,7 @@ AutoFetcher.prototype.srcsetNotPreSplit = function (values, context) {
// grab the URL not width/height key
if (Boolean(srcsetValues[j])) {
var value = srcsetValues[j].trim().split(' ')[0];
this.safeFetch(this.fixupURLSrcSet(value, tagSrc, context));
this.queueURL(this.fixupURLSrcSet(value, tagSrc, context));
}
}
}

View File

@ -53,8 +53,6 @@ function AutoFetcher() {
}
// local cache of URLs fetched, to reduce server load
this.seen = {};
// array of promises returned by fetch(URL)
this.fetches = [];
// array of URL to be fetched
this.queue = [];
// should we queue a URL or not
@ -65,19 +63,14 @@ function AutoFetcher() {
this.fetchDone = this.fetchDone.bind(this);
}
AutoFetcher.prototype.safeFetch = function (url) {
AutoFetcher.prototype.queueURL = function (url) {
// ensure we do not request data urls
if (url.indexOf('data:') === 0) return;
// check to see if we have seen this url before in order
// to lessen the load against the server content is autofetchd from
if (this.seen[url] != null) return;
this.seen[url] = true;
if (this.queuing) {
// we are currently waiting for a batch of fetches to complete
return this.queue.push(url);
}
// fetch this url
this.fetches.push(fetch(url));
this.queue.push(url);
};
AutoFetcher.prototype.safeResolve = function (url, resolver) {
@ -102,47 +95,52 @@ AutoFetcher.prototype.urlExtractor = function (match, n1, n2, n3, offset, string
// (resolvedURL will be undefined if an error occurred)
var resolvedURL = this.safeResolve(n2, this.currentResolver);
if (resolvedURL) {
this.safeFetch(resolvedURL);
this.queueURL(resolvedURL);
}
return n1 + n2 + n3;
};
// Returns a promise that is fulfilled once a fixed 2 second timer fires.
// The promise is never rejected. fetchDone waits on it before starting
// another batch.
AutoFetcher.prototype.delay = function () {
    var waitMs = 2000;
    return new Promise(function (done) {
        setTimeout(done, waitMs);
    });
};
AutoFetcher.prototype.fetchDone = function () {
// indicate we no longer need to Q
this.queuing = false;
if (this.queue.length > 0) {
// we have a Q of some length drain it
this.drainQ();
var autofetcher = this;
// wait 2 seconds before doing another batch
this.delay().then(function () {
autofetcher.fetchAll();
});
}
};
// Starts the next batch of fetches from the front of this.queue.
//
// Does nothing while a batch is already in flight (this.queuing) or when the
// queue is empty. Otherwise up to 50 URLs are taken off the queue; if 10 or
// fewer URLs would then be left over they are folded into the same batch, so
// a batch never holds more than 60 fetches. this.fetchDone (bound to this
// instance in the constructor) is invoked once every fetch has settled.
AutoFetcher.prototype.fetchAll = function () {
    if (this.queuing || this.queue.length === 0) {
        return;
    }
    var baseBatchSize = 50;
    var maxLeftover = 10;
    // mark a batch as in flight so that re-entrant calls return early
    this.queuing = true;
    var runningFetchers = [];
    // strictly less-than: `<= 50` let a 51st fetch in, allowing 61 per batch
    while (this.queue.length > 0 && runningFetchers.length < baseBatchSize) {
        // each fetch gets its own catch(noop) so that a single failed request
        // cannot reject the Promise.all below (noop is defined elsewhere in
        // this file, presumably an empty function)
        runningFetchers.push(fetch(this.queue.shift()).catch(noop));
    }
    if (this.queue.length <= maxLeftover) {
        // small remainder: not worth a separate delayed batch
        while (this.queue.length > 0) {
            runningFetchers.push(fetch(this.queue.shift()).catch(noop));
        }
    }
    Promise.all(runningFetchers)
        .then(this.fetchDone)
        .catch(this.fetchDone);
};
AutoFetcher.prototype.drainQ = function () {
// clear our Q in place and fill our fetches array
while (this.queue.length > 0) {
this.fetches.push(fetch(this.queue.shift()));
}
// fetch all the things
this.fetchAll();
};
AutoFetcher.prototype.extractMedia = function (mediaRules) {
// this is a broken down rewrite_style
if (mediaRules == null) return;
@ -173,7 +171,7 @@ AutoFetcher.prototype.extractSrcset = function (srcsets) {
// resolve the URL in an exceptionless manner (resolvedURL will be undefined if an error occurred)
var resolvedURL = this.safeResolve(srcsetValue.split(' ')[0], extractedSrcSet.resolve);
if (resolvedURL) {
this.safeFetch(resolvedURL);
this.queueURL(resolvedURL);
}
}
}

View File

@ -186,6 +186,16 @@ var _WBWombat = function($wbwindow, wbinfo) {
}
}
// Tells whether an element's srcset refers to images: true for an IMG, and
// truthy for a SOURCE whose parent element is a PICTURE.
// Note: the result is not always a strict boolean. A SOURCE without a parent
// yields the (falsy) parentElement value itself.
function isImageSrcset(elem) {
    var tag = elem.tagName;
    if (tag === 'IMG') return true;
    if (tag !== 'SOURCE') return false;
    var parent = elem.parentElement;
    return parent && parent.tagName === 'PICTURE';
}
// True when elem passes isImageSrcset (an IMG, or a SOURCE inside a PICTURE)
// and has a data-srcset attribute, i.e. elem.dataset.srcset is neither null
// nor undefined. An empty data-srcset still counts as present.
function isImageDataSrcset(elem) {
    return isImageSrcset(elem) ? elem.dataset.srcset != null : false;
}
//============================================
function is_host_url(str) {
// Good guess that's its a hostname
@ -1152,7 +1162,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
} else if (lowername == "style") {
value = rewrite_style(value);
} else if (lowername == "srcset") {
value = rewrite_srcset(value, this.tagName === 'IMG');
value = rewrite_srcset(value, isImageSrcset(this));
}
}
orig_setAttribute.call(this, name, value);
@ -1423,16 +1433,74 @@ var _WBWombat = function($wbwindow, wbinfo) {
}, true);
};
// Sends a single data-srcset value (as seen by rewrite_attr) to the worker.
// The message is posted deferred so the page viewer sees the images first.
//
// srcset: the raw data-srcset string of an img / picture > source element.
//
// The message uses the same shape as the other non-presplit senders in this
// file: 'values' is an array of {srcset: ...} entries, and a 'context' carrying
// the document base URI is included. The raw string was previously sent as
// 'values' with no context, which the worker cannot resolve URLs against.
AutoFetchWorker.prototype.preserveDataSrcset = function (srcset) {
    // nothing to preserve for a missing or empty attribute value
    if (!srcset) return;
    this.postMessage({
        'type': 'values',
        'srcset': {'values': [{srcset: srcset}], 'presplit': false},
        'context': {
            'docBaseURI': $wbwindow.document.baseURI
        }
    }, true);
};
AutoFetchWorker.prototype.preserveMedia = function (media) {
// send CSSMediaRule values to the worker
this.postMessage({'type': 'values', 'media': media})
this.postMessage({'type': 'values', 'media': media}, true);
};
// Reads the srcset attribute of elem. When wb_getAttribute is set it is
// invoked with elem as its receiver, otherwise the element's own getAttribute
// is used.
// NOTE(review): wb_getAttribute is presumably the original, un-overridden
// getAttribute - confirm against where it is assigned.
AutoFetchWorker.prototype.extractSrcset = function (elem) {
    return wb_getAttribute
        ? wb_getAttribute.call(elem, 'srcset')
        : elem.getAttribute('srcset');
};
// Collects the data-srcset values of every img and every picture > source in
// the document and, when at least one non-empty value was found, posts them
// to the worker (deferred) together with the document base URI.
AutoFetchWorker.prototype.checkForPictureSourceDataSrcsets = function () {
    var doc = $wbwindow.document;
    var candidates = doc.querySelectorAll('img[data-srcset], source[data-srcset]');
    var found = [];
    var node, parent, value;
    for (var idx = 0; idx < candidates.length; idx++) {
        node = candidates[idx];
        if (node.tagName === 'SOURCE') {
            // only source elements directly inside a picture are image sources
            parent = node.parentElement;
            if (!parent || parent.tagName !== 'PICTURE') continue;
        }
        value = node.dataset.srcset;
        // skip empty data-srcset attributes
        if (value) found.push({srcset: value});
    }
    if (found.length === 0) return;
    this.postMessage({
        'type': 'values',
        'srcset': {'values': found, 'presplit': false},
        'context': {
            'docBaseURI': doc.baseURI
        }
    }, true);
};
// Builds the list of srcset entries for the document:
//  - img[srcset]              -> {tagSrc: <img src>, srcset: <attribute value>}
//  - picture > source[srcset] -> {srcset: <attribute value>}
// source elements whose parent is not a picture are ignored.
// Returns the (possibly empty) array of entries.
AutoFetchWorker.prototype.extractImgPictureSourceSrcsets = function () {
    var collected = [];
    var nodes = $wbwindow.document.querySelectorAll('img[srcset], source[srcset]');
    var node, parent;
    for (var n = 0; n < nodes.length; n++) {
        node = nodes[n];
        if (node.tagName !== 'SOURCE') {
            collected.push({tagSrc: node.src, srcset: this.extractSrcset(node)});
            continue;
        }
        parent = node.parentElement;
        if (parent && parent.tagName === 'PICTURE') {
            collected.push({srcset: this.extractSrcset(node)});
        }
    }
    return collected;
};
AutoFetchWorker.prototype.extractFromLocalDoc = function () {
// get the values to be preserved from the documents stylesheets
// and all elements with a srcset
var media = [];
var srcset = [];
var sheets = $wbwindow.document.styleSheets;
var i = 0;
for (; i < sheets.length; ++i) {
@ -1444,16 +1512,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
}
}
}
var srcsetElems = $wbwindow.document.querySelectorAll('img[srcset]');
for (i = 0; i < srcsetElems.length; i++) {
var ssv = {tagSrc: srcsetElems[i].src};
if (wb_getAttribute) {
ssv.srcset = wb_getAttribute.call(srcsetElems[i], 'srcset');
} else {
ssv.srcset = srcsetElems[i].getAttribute('srcset');
}
srcset.push(ssv);
}
var srcset = this.extractImgPictureSourceSrcsets();
// send the extracted values to the worker deferred
// to ensure the page viewer sees the images first
this.postMessage({
@ -1464,6 +1523,12 @@ var _WBWombat = function($wbwindow, wbinfo) {
'docBaseURI': $wbwindow.document.baseURI
}
}, true);
// deffer the checking of img/source data-srcset
// so that we do not clobber the UI thread
var self = this;
Promise.resolve().then(function () {
self.checkForPictureSourceDataSrcsets();
});
};
WBAutoFetchWorker = new AutoFetchWorker(wb_abs_prefix, wbinfo.mod);
@ -1615,7 +1680,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
} else if (name == "style") {
new_value = rewrite_style(value);
} else if (name == "srcset") {
new_value = rewrite_srcset(value, elem.tagName === 'IMG');
new_value = rewrite_srcset(value, isImageSrcset(elem));
} else {
// Only rewrite if absolute url
if (abs_url_only && !starts_with(value, VALID_PREFIXES)) {
@ -1623,6 +1688,9 @@ var _WBWombat = function($wbwindow, wbinfo) {
}
var mod = rwModForElement(elem, name);
new_value = rewrite_url(value, false, mod, elem.ownerDocument);
if (wbUseAFWorker && isImageDataSrcset(elem)) {
WBAutoFetchWorker.preserveDataSrcset(elem.dataset.srcset);
}
}
if (new_value != value) {
@ -1724,7 +1792,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
if (elem.getAttribute("src") || !elem.textContent || !$wbwindow.Proxy) {
return rewrite_attr(elem, "src");
}
if (elem.type && (elem.type === 'application/json' || elem.type.indexOf('text/template') !== -1)) return;
if (elem.textContent.indexOf("_____WB$wombat$assign$function_____") >= 0) {
return false;
}
@ -2029,7 +2097,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
if (mod == "cs_" && orig.indexOf("data:text/css") == 0) {
val = rewrite_inline_style(orig);
} else if (attr == "srcset") {
val = rewrite_srcset(orig, this.tagName === 'IMG');
val = rewrite_srcset(orig, isImageSrcset(this));
} else if (this.tagName === 'LINK' && attr === 'href') {
var relV = this.rel;
if (relV === 'import' || relV === 'preload') {

View File

@ -169,9 +169,9 @@ var _WBWombat = function ($wbwindow, wbinfo) {
var isTop = $wbwindow.self === $wbwindow.top;
function AutoFetchWorker() {
if (!(this instanceof AutoFetchWorker)) {
return new AutoFetchWorker();
function AutoFetchWorkerProxyMode() {
if (!(this instanceof AutoFetchWorkerProxyMode)) {
return new AutoFetchWorkerProxyMode();
}
this.checkIntervalTime = 15000;
this.checkIntervalCB = this.checkIntervalCB.bind(this);
@ -206,7 +206,7 @@ var _WBWombat = function ($wbwindow, wbinfo) {
}
}
AutoFetchWorker.prototype.startCheckingInterval = function () {
AutoFetchWorkerProxyMode.prototype.startCheckingInterval = function () {
// if document ready state is complete do first extraction and start check polling
// otherwise wait for document ready state to complete to extract and start check polling
var self = this;
@ -224,20 +224,26 @@ var _WBWombat = function ($wbwindow, wbinfo) {
}
};
AutoFetchWorker.prototype.checkIntervalCB = function () {
AutoFetchWorkerProxyMode.prototype.checkIntervalCB = function () {
this.extractFromLocalDoc();
};
AutoFetchWorker.prototype.terminate = function () {
AutoFetchWorkerProxyMode.prototype.terminate = function () {
// terminate the worker, a no op when not replay top
this.worker.terminate();
};
AutoFetchWorker.prototype.postMessage = function (msg) {
AutoFetchWorkerProxyMode.prototype.postMessage = function (msg, deferred) {
if (deferred) {
var self = this;
return Promise.resolve().then(function () {
self.worker.postMessage(msg);
});
}
this.worker.postMessage(msg);
};
AutoFetchWorker.prototype.extractMediaRules = function (rules, href) {
AutoFetchWorkerProxyMode.prototype.extractMediaRules = function (rules, href) {
// We are in proxy mode and must include a URL to resolve relative URLs in media rules
if (!rules) return [];
var rvlen = rules.length;
@ -252,7 +258,7 @@ var _WBWombat = function ($wbwindow, wbinfo) {
return text;
};
AutoFetchWorker.prototype.corsCSSFetch = function (href) {
AutoFetchWorkerProxyMode.prototype.corsCSSFetch = function (href) {
// because this JS in proxy mode operates as it would on the live web
// the rules of CORS apply and we cannot rely on URLs being rewritten correctly
// fetch the cross origin css file and then parse it using a style tag to get the rules
@ -269,17 +275,64 @@ var _WBWombat = function ($wbwindow, wbinfo) {
});
};
AutoFetchWorker.prototype.shouldSkipSheet = function (sheet) {
AutoFetchWorkerProxyMode.prototype.shouldSkipSheet = function (sheet) {
// we skip extracting rules from sheets if they are from our parsing style or come from pywb
if (sheet.id === '$wrStyleParser$') return true;
return !!(sheet.href && sheet.href.indexOf(wb_info.proxy_magic) !== -1);
};
// Builds the list of srcset entries for the document. Each entry carries the
// element's srcset and the URL ('resolve') that relative srcset URLs are to be
// resolved against:
//  - picture > source[srcset] -> resolved against the document base URI
//  - img[srcset]              -> resolved against the img's src, or against
//                                the document base URI when the img has no
//                                usable src
// source elements whose parent is not a picture are ignored.
// Returns the (possibly empty) array of entries.
AutoFetchWorkerProxyMode.prototype.extractImgPictureSourceSrcsets = function () {
    var baseURI = $wbwindow.document.baseURI;
    var ssElements = $wbwindow.document.querySelectorAll('img[srcset], source[srcset]');
    var srcset = [];
    var elem, src;
    for (var i = 0; i < ssElements.length; i++) {
        elem = ssElements[i];
        if (elem.tagName === 'SOURCE') {
            if (elem.parentElement && elem.parentElement.tagName === 'PICTURE') {
                srcset.push({srcset: elem.srcset, resolve: baseURI});
            }
        } else {
            src = elem.src;
            srcset.push({
                srcset: elem.srcset,
                // an img without a src attribute reports '' (not null or ' '),
                // so treat any empty / whitespace-only src as absent
                resolve: typeof src === 'string' && src.trim() !== '' ? src : baseURI
            });
        }
    }
    return srcset;
};
// Collects the data-srcset values of every img and every picture > source in
// the document, pairing each with the URL ('resolve') relative URLs are to be
// resolved against, and posts them to the worker (deferred) when at least one
// non-empty value was found.
// NOTE(review): extractFromLocalDoc in this class posts 'srcset' as a bare
// array, whereas this message wraps it as {values, presplit} - confirm the
// proxy mode worker accepts both shapes.
AutoFetchWorkerProxyMode.prototype.checkForPictureSourceDataSrcsets = function () {
    var baseURI = $wbwindow.document.baseURI;
    var dataSS = $wbwindow.document.querySelectorAll('img[data-srcset], source[data-srcset]');
    var srcset = [];
    var elem, src;
    for (var i = 0; i < dataSS.length; i++) {
        elem = dataSS[i];
        if (elem.tagName === 'SOURCE') {
            // only source elements directly inside a picture are image sources
            if (elem.parentElement && elem.parentElement.tagName === 'PICTURE' && elem.dataset.srcset) {
                srcset.push({srcset: elem.dataset.srcset, resolve: baseURI});
            }
        } else if (elem.dataset.srcset) {
            src = elem.src;
            srcset.push({
                srcset: elem.dataset.srcset,
                // an img without a src attribute reports '' (not null or ' '),
                // so treat any empty / whitespace-only src as absent
                resolve: typeof src === 'string' && src.trim() !== '' ? src : baseURI
            });
        }
    }
    if (srcset.length) {
        this.postMessage({
            'type': 'values',
            'srcset': {'values': srcset, 'presplit': false},
            'context': {
                'docBaseURI': $wbwindow.document.baseURI
            }
        }, true);
    }
};
AutoFetchWorker.prototype.extractFromLocalDoc = function () {
AutoFetchWorkerProxyMode.prototype.extractFromLocalDoc = function () {
var i = 0;
var media = [];
var deferredMediaURLS = [];
var srcset = [];
var sheet;
var resolve;
// We must use the window reference passed to us to access this origins stylesheets
@ -307,17 +360,11 @@ var _WBWombat = function ($wbwindow, wbinfo) {
}
// We must use the window reference passed to us to access this origins elements with srcset attr
// like cssRule handling we must include a URL to resolve relative URLs by
var srcsetElems = $wbwindow.document.querySelectorAll('img[srcset]');
var ssElem, resolveAgainst;
for (i = 0; i < srcsetElems.length; i++) {
ssElem = srcsetElems[i];
resolveAgainst = ssElem.src != null && ssElem.src !== ' ' ? ssElem.src : $wbwindow.document.baseURI;
srcset.push({'srcset': ssElem.srcset, 'resolve': resolveAgainst});
}
var srcset = this.extractImgPictureSourceSrcsets();
// send what we have extracted, if anything, to the worker for processing
if (media.length > 0 || srcset.length > 0) {
this.postMessage({'type': 'values', 'media': media, 'srcset': srcset});
this.postMessage({'type': 'values', 'media': media, 'srcset': srcset}, true);
}
if (deferredMediaURLS.length > 0) {
@ -334,9 +381,15 @@ var _WBWombat = function ($wbwindow, wbinfo) {
}
});
}
// deffer the checking of img/source data-srcset
// so that we do not clobber the UI thread
var self = this;
Promise.resolve().then(function () {
self.checkForPictureSourceDataSrcsets();
});
};
WBAutoFetchWorker = new AutoFetchWorker();
WBAutoFetchWorker = new AutoFetchWorkerProxyMode();
if (isTop) {
$wbwindow.addEventListener("message", function (event) {