diff --git a/pywb/static/autoFetchWorker.js b/pywb/static/autoFetchWorker.js index fe0fdcef..5baba7a1 100644 --- a/pywb/static/autoFetchWorker.js +++ b/pywb/static/autoFetchWorker.js @@ -59,8 +59,6 @@ function AutoFetcher(init) { this.schemeless = '/' + this.relative; // local cache of URLs fetched, to reduce server load this.seen = {}; - // array of promises returned by fetch(URL) - this.fetches = []; // array of URL to be fetched this.queue = []; // should we queue a URL or not @@ -86,61 +84,62 @@ AutoFetcher.prototype.fixupURL = function (url) { return url; }; -AutoFetcher.prototype.safeFetch = function (url) { +AutoFetcher.prototype.queueURL = function (url) { + // ensure we do not request data urls + if (url.indexOf('data:') === 0) return; // check to see if we have seen this url before in order // to lessen the load against the server content is fetched from if (this.seen[url] != null) return; this.seen[url] = true; - if (this.queuing) { - // we are currently waiting for a batch of fetches to complete - return this.queue.push(url); - } - // fetch this url - this.fetches.push(fetch(url)); + this.queue.push(url); }; AutoFetcher.prototype.urlExtractor = function (match, n1, n2, n3, offset, string) { // Same function as style_replacer in wombat.rewrite_style, n2 is our URL - this.safeFetch(this.fixupURL(n2)); + this.queueURL(this.fixupURL(n2)); return n1 + n2 + n3; }; +AutoFetcher.prototype.delay = function () { + // a 2 second delay seems reasonable + return new Promise(function (resolve, reject) { + setTimeout(resolve, 2000); + }); +}; + AutoFetcher.prototype.fetchDone = function () { - // indicate we no longer need to Q this.queuing = false; if (this.queue.length > 0) { // we have a Q of some length drain it - this.drainQ(); + var autofetcher = this; + this.delay().then(function () { + autofetcher.fetchAll(); + }); } }; AutoFetcher.prototype.fetchAll = function () { - // if we are queuing or have no fetches this is a no op - if (this.queuing) return; - if
(this.fetches.length === 0) return; - // we are about to fetch queue anything that comes our way + if (this.queuing || this.queue.length === 0) { + return; + } + // each batch starts at most 50 fetches taken from the front of the queue + // but if 10 or fewer URLs would be left over they are added to the + // current batch, so a batch never has more than 60 outstanding fetches this.queuing = true; - /// initiate fetches by turning the initial fetch promises - // into rejctionless promises and "await" all clearing - // our fetches array in place var runningFetchers = []; - while (this.fetches.length > 0) { - runningFetchers.push(this.fetches.shift().catch(noop)) + while (this.queue.length > 0 && runningFetchers.length < 50) { + runningFetchers.push(fetch(this.queue.shift()).catch(noop)); + } + if (this.queue.length <= 10) { + while (this.queue.length > 0) { + runningFetchers.push(fetch(this.queue.shift()).catch(noop)); + } } Promise.all(runningFetchers) .then(this.fetchDone) .catch(this.fetchDone); }; -AutoFetcher.prototype.drainQ = function () { - // clear our Q in place and fill our fetches array - while (this.queue.length > 0) { - this.fetches.push(fetch(this.queue.shift())); - } - // fetch all the things - this.fetchAll(); -}; - AutoFetcher.prototype.extractMedia = function (mediaRules) { // this is a broken down rewrite_style if (mediaRules == null || mediaRules.values === null) return; @@ -185,9 +184,11 @@ AutoFetcher.prototype.fixupURLSrcSet = function (url, tagSrc, context) { return maybeFixed; } // resolve URL against tag src - maybeFixed = this.maybeResolveURL(url, tagSrc); - if (maybeFixed != null) { - return this.prefix + 'im_/' + maybeFixed; + if (tagSrc != null) { + maybeFixed = this.maybeResolveURL(url, tagSrc); + if (maybeFixed != null) { + return this.prefix + 'im_/' + maybeFixed; + } } // finally last attempt resolve the originating documents base URI maybeFixed = this.maybeResolveURL(url, context.docBaseURI); @@ -210,7 +211,7 @@ AutoFetcher.prototype.extractSrcset =
function (srcsets, context) { // was rewrite_srcset so just ensure we just for (var i = 0; i < srcsetValues.length; i++) { // grab the URL not width/height key - this.safeFetch(srcsetValues[i].split(' ')[0]); + this.queueURL(srcsetValues[i].split(' ')[0]); } }; @@ -224,7 +225,7 @@ AutoFetcher.prototype.srcsetNotPreSplit = function (values, context) { // grab the URL not width/height key if (Boolean(srcsetValues[j])) { var value = srcsetValues[j].trim().split(' ')[0]; - this.safeFetch(this.fixupURLSrcSet(value, tagSrc, context)); + this.queueURL(this.fixupURLSrcSet(value, tagSrc, context)); } } } diff --git a/pywb/static/autoFetchWorkerProxyMode.js b/pywb/static/autoFetchWorkerProxyMode.js index 95832840..813c5994 100644 --- a/pywb/static/autoFetchWorkerProxyMode.js +++ b/pywb/static/autoFetchWorkerProxyMode.js @@ -53,8 +53,6 @@ function AutoFetcher() { } // local cache of URLs fetched, to reduce server load this.seen = {}; - // array of promises returned by fetch(URL) - this.fetches = []; // array of URL to be fetched this.queue = []; // should we queue a URL or not @@ -65,19 +63,14 @@ function AutoFetcher() { this.fetchDone = this.fetchDone.bind(this); } -AutoFetcher.prototype.safeFetch = function (url) { +AutoFetcher.prototype.queueURL = function (url) { // ensure we do not request data urls if (url.indexOf('data:') === 0) return; // check to see if we have seen this url before in order // to lessen the load against the server content is autofetchd from if (this.seen[url] != null) return; this.seen[url] = true; - if (this.queuing) { - // we are currently waiting for a batch of fetches to complete - return this.queue.push(url); - } - // fetch this url - this.fetches.push(fetch(url)); + this.queue.push(url); }; AutoFetcher.prototype.safeResolve = function (url, resolver) { @@ -102,47 +95,52 @@ AutoFetcher.prototype.urlExtractor = function (match, n1, n2, n3, offset, string // (resolvedURL will be undefined if an error occurred) var resolvedURL = 
this.safeResolve(n2, this.currentResolver); if (resolvedURL) { - this.safeFetch(resolvedURL); + this.queueURL(resolvedURL); } return n1 + n2 + n3; }; +AutoFetcher.prototype.delay = function () { + // a 2 second delay seems reasonable + return new Promise(function (resolve, reject) { + setTimeout(resolve, 2000); + }); +}; + AutoFetcher.prototype.fetchDone = function () { - // indicate we no longer need to Q this.queuing = false; if (this.queue.length > 0) { // we have a Q of some length drain it - this.drainQ(); + var autofetcher = this; + // wait 2 seconds before doing another batch + this.delay().then(function () { + autofetcher.fetchAll(); + }); } }; AutoFetcher.prototype.fetchAll = function () { - // if we are queuing or have no fetches this is a no op - if (this.queuing) return; - if (this.fetches.length === 0) return; - // we are about to fetch queue anything that comes our way + if (this.queuing || this.queue.length === 0) { + return; + } + // each batch starts at most 50 fetches taken from the front of the queue + // but if 10 or fewer URLs would be left over they are added to the + // current batch, so a batch never has more than 60 outstanding fetches this.queuing = true; this.queuing = true; - // initiate fetches by turning the initial fetch promises - // into rejctionless promises and "await" all clearing - // our fetches array in place var runningFetchers = []; - while (this.fetches.length > 0) { - runningFetchers.push(this.fetches.shift().catch(noop)) + while (this.queue.length > 0 && runningFetchers.length < 50) { + runningFetchers.push(fetch(this.queue.shift()).catch(noop)); + } + if (this.queue.length <= 10) { + while (this.queue.length > 0) { + runningFetchers.push(fetch(this.queue.shift()).catch(noop)); + } } Promise.all(runningFetchers) .then(this.fetchDone) .catch(this.fetchDone); }; -AutoFetcher.prototype.drainQ = function () { - // clear our Q in place and fill our fetches array - while (this.queue.length > 0) { - this.fetches.push(fetch(this.queue.shift())); - } - //
fetch all the things - this.fetchAll(); -}; - AutoFetcher.prototype.extractMedia = function (mediaRules) { // this is a broken down rewrite_style if (mediaRules == null) return; @@ -173,7 +171,7 @@ AutoFetcher.prototype.extractSrcset = function (srcsets) { // resolve the URL in an exceptionless manner (resolvedURL will be undefined if an error occurred) var resolvedURL = this.safeResolve(srcsetValue.split(' ')[0], extractedSrcSet.resolve); if (resolvedURL) { - this.safeFetch(resolvedURL); + this.queueURL(resolvedURL); } } } diff --git a/pywb/static/wombat.js b/pywb/static/wombat.js index 216058e9..87df270d 100644 --- a/pywb/static/wombat.js +++ b/pywb/static/wombat.js @@ -186,6 +186,16 @@ var _WBWombat = function($wbwindow, wbinfo) { } } + function isImageSrcset(elem) { + if (elem.tagName === 'IMG') return true; + return elem.tagName === 'SOURCE' && elem.parentElement && elem.parentElement.tagName === 'PICTURE'; + } + + function isImageDataSrcset(elem) { + if (isImageSrcset(elem)) return elem.dataset.srcset != null; + return false; + } + //============================================ function is_host_url(str) { // Good guess that's its a hostname @@ -1152,7 +1162,7 @@ var _WBWombat = function($wbwindow, wbinfo) { } else if (lowername == "style") { value = rewrite_style(value); } else if (lowername == "srcset") { - value = rewrite_srcset(value, this.tagName === 'IMG'); + value = rewrite_srcset(value, isImageSrcset(this)); } } orig_setAttribute.call(this, name, value); @@ -1423,16 +1433,74 @@ var _WBWombat = function($wbwindow, wbinfo) { }, true); }; + AutoFetchWorker.prototype.preserveDataSrcset = function (srcset) { + // send values from rewrite_attr srcset to the worker deferred + // to ensure the page viewer sees the images first + this.postMessage({ + 'type': 'values', + 'srcset': {'values': srcset, 'presplit': false}, + }, true); + }; + AutoFetchWorker.prototype.preserveMedia = function (media) { // send CSSMediaRule values to the worker - 
this.postMessage({'type': 'values', 'media': media}) + this.postMessage({'type': 'values', 'media': media}, true); + }; + + AutoFetchWorker.prototype.extractSrcset = function (elem) { + if (wb_getAttribute) { + return wb_getAttribute.call(elem, 'srcset'); + } + return elem.getAttribute('srcset'); + }; + + AutoFetchWorker.prototype.checkForPictureSourceDataSrcsets = function () { + var dataSS = $wbwindow.document.querySelectorAll('img[data-srcset], source[data-srcset]'); + var elem; + var srcset = []; + for (var i = 0; i < dataSS.length; i++) { + elem = dataSS[i]; + if (elem.tagName === 'SOURCE') { + if (elem.parentElement && elem.parentElement.tagName === 'PICTURE' && elem.dataset.srcset) { + srcset.push({srcset: elem.dataset.srcset}); + } + } else if (elem.dataset.srcset) { + srcset.push({srcset: elem.dataset.srcset}); + } + } + if (srcset.length) { + this.postMessage({ + 'type': 'values', + 'srcset': {'values': srcset, 'presplit': false}, + 'context': { + 'docBaseURI': $wbwindow.document.baseURI + } + }, true); + } + }; + + AutoFetchWorker.prototype.extractImgPictureSourceSrcsets = function () { + var i; + var elem = null; + var srcset = []; + var ssElements = $wbwindow.document.querySelectorAll('img[srcset], source[srcset]'); + for (i = 0; i < ssElements.length; i++) { + elem = ssElements[i]; + if (elem.tagName === 'SOURCE') { + if (elem.parentElement && elem.parentElement.tagName === 'PICTURE') { + srcset.push({srcset: this.extractSrcset(elem)}); + } + } else { + srcset.push({tagSrc: elem.src, srcset: this.extractSrcset(elem)}); + } + } + return srcset; }; AutoFetchWorker.prototype.extractFromLocalDoc = function () { // get the values to be preserved from the documents stylesheets // and all elements with a srcset var media = []; - var srcset = []; var sheets = $wbwindow.document.styleSheets; var i = 0; for (; i < sheets.length; ++i) { @@ -1444,16 +1512,7 @@ var _WBWombat = function($wbwindow, wbinfo) { } } } - var srcsetElems = 
$wbwindow.document.querySelectorAll('img[srcset]'); - for (i = 0; i < srcsetElems.length; i++) { - var ssv = {tagSrc: srcsetElems[i].src}; - if (wb_getAttribute) { - ssv.srcset = wb_getAttribute.call(srcsetElems[i], 'srcset'); - } else { - ssv.srcset = srcsetElems[i].getAttribute('srcset'); - } - srcset.push(ssv); - } + var srcset = this.extractImgPictureSourceSrcsets(); // send the extracted values to the worker deferred // to ensure the page viewer sees the images first this.postMessage({ @@ -1464,6 +1523,12 @@ var _WBWombat = function($wbwindow, wbinfo) { 'docBaseURI': $wbwindow.document.baseURI } }, true); + // defer the checking of img/source data-srcset to a promise callback + // so that it runs once the current synchronous work has finished + var self = this; + Promise.resolve().then(function () { + self.checkForPictureSourceDataSrcsets(); + }); }; WBAutoFetchWorker = new AutoFetchWorker(wb_abs_prefix, wbinfo.mod); @@ -1615,7 +1680,7 @@ var _WBWombat = function($wbwindow, wbinfo) { } else if (name == "style") { new_value = rewrite_style(value); } else if (name == "srcset") { - new_value = rewrite_srcset(value, elem.tagName === 'IMG'); + new_value = rewrite_srcset(value, isImageSrcset(elem)); } else { // Only rewrite if absolute url if (abs_url_only && !starts_with(value, VALID_PREFIXES)) { @@ -1623,6 +1688,9 @@ var _WBWombat = function($wbwindow, wbinfo) { } var mod = rwModForElement(elem, name); new_value = rewrite_url(value, false, mod, elem.ownerDocument); + if (wbUseAFWorker && isImageDataSrcset(elem)) { + WBAutoFetchWorker.preserveDataSrcset(elem.dataset.srcset); + } } if (new_value != value) { @@ -1724,7 +1792,7 @@ var _WBWombat = function($wbwindow, wbinfo) { if (elem.getAttribute("src") || !elem.textContent || !$wbwindow.Proxy) { return rewrite_attr(elem, "src"); } - + if (elem.type && (elem.type === 'application/json' || elem.type.indexOf('text/template') !== -1)) return; if (elem.textContent.indexOf("_____WB$wombat$assign$function_____") >= 0) { return false; } @@ -2029,7 +2097,7 @@ var
_WBWombat = function($wbwindow, wbinfo) { if (mod == "cs_" && orig.indexOf("data:text/css") == 0) { val = rewrite_inline_style(orig); } else if (attr == "srcset") { - val = rewrite_srcset(orig, this.tagName === 'IMG'); + val = rewrite_srcset(orig, isImageSrcset(this)); } else if (this.tagName === 'LINK' && attr === 'href') { var relV = this.rel; if (relV === 'import' || relV === 'preload') { diff --git a/pywb/static/wombatProxyMode.js b/pywb/static/wombatProxyMode.js index 324f3da7..1e6ca162 100644 --- a/pywb/static/wombatProxyMode.js +++ b/pywb/static/wombatProxyMode.js @@ -169,9 +169,9 @@ var _WBWombat = function ($wbwindow, wbinfo) { var isTop = $wbwindow.self === $wbwindow.top; - function AutoFetchWorker() { - if (!(this instanceof AutoFetchWorker)) { - return new AutoFetchWorker(); + function AutoFetchWorkerProxyMode() { + if (!(this instanceof AutoFetchWorkerProxyMode)) { + return new AutoFetchWorkerProxyMode(); } this.checkIntervalTime = 15000; this.checkIntervalCB = this.checkIntervalCB.bind(this); @@ -206,7 +206,7 @@ var _WBWombat = function ($wbwindow, wbinfo) { } } - AutoFetchWorker.prototype.startCheckingInterval = function () { + AutoFetchWorkerProxyMode.prototype.startCheckingInterval = function () { // if document ready state is complete do first extraction and start check polling // otherwise wait for document ready state to complete to extract and start check polling var self = this; @@ -224,20 +224,26 @@ var _WBWombat = function ($wbwindow, wbinfo) { } }; - AutoFetchWorker.prototype.checkIntervalCB = function () { + AutoFetchWorkerProxyMode.prototype.checkIntervalCB = function () { this.extractFromLocalDoc(); }; - AutoFetchWorker.prototype.terminate = function () { + AutoFetchWorkerProxyMode.prototype.terminate = function () { // terminate the worker, a no op when not replay top this.worker.terminate(); }; - AutoFetchWorker.prototype.postMessage = function (msg) { + AutoFetchWorkerProxyMode.prototype.postMessage = function (msg, deferred) { + if 
(deferred) { + var self = this; + return Promise.resolve().then(function () { + self.worker.postMessage(msg); + }); + } this.worker.postMessage(msg); }; - AutoFetchWorker.prototype.extractMediaRules = function (rules, href) { + AutoFetchWorkerProxyMode.prototype.extractMediaRules = function (rules, href) { // We are in proxy mode and must include a URL to resolve relative URLs in media rules if (!rules) return []; var rvlen = rules.length; @@ -252,7 +258,7 @@ var _WBWombat = function ($wbwindow, wbinfo) { return text; }; - AutoFetchWorker.prototype.corsCSSFetch = function (href) { + AutoFetchWorkerProxyMode.prototype.corsCSSFetch = function (href) { // because this JS in proxy mode operates as it would on the live web // the rules of CORS apply and we cannot rely on URLs being rewritten correctly // fetch the cross origin css file and then parse it using a style tag to get the rules @@ -269,17 +275,64 @@ var _WBWombat = function ($wbwindow, wbinfo) { }); }; - AutoFetchWorker.prototype.shouldSkipSheet = function (sheet) { + AutoFetchWorkerProxyMode.prototype.shouldSkipSheet = function (sheet) { // we skip extracting rules from sheets if they are from our parsing style or come from pywb if (sheet.id === '$wrStyleParser$') return true; return !!(sheet.href && sheet.href.indexOf(wb_info.proxy_magic) !== -1); }; + + AutoFetchWorkerProxyMode.prototype.extractImgPictureSourceSrcsets = function () { + var i; + var elem; + var srcset = []; + var baseURI = $wbwindow.document.baseURI; + var ssElements = $wbwindow.document.querySelectorAll('img[srcset], source[srcset]'); + for (i = 0; i < ssElements.length; i++) { + elem = ssElements[i]; + if (elem.tagName === 'SOURCE') { + if (elem.parentElement && elem.parentElement.tagName === 'PICTURE') { + srcset.push({srcset: elem.srcset, resolve: baseURI}); + } + } else { + srcset.push({ + srcset: elem.srcset, + resolve: elem.src != null && elem.src !== ' ' ? 
elem.src : baseURI + }); + } + } + return srcset; + }; + + AutoFetchWorkerProxyMode.prototype.checkForPictureSourceDataSrcsets = function () { + var baseURI = $wbwindow.document.baseURI; + var dataSS = $wbwindow.document.querySelectorAll('img[data-srcset], source[data-srcset]'); + var elem; + var srcset = []; + for (var i = 0; i < dataSS.length; i++) { + elem = dataSS[i]; + if (elem.tagName === 'SOURCE') { + if (elem.parentElement && elem.parentElement.tagName === 'PICTURE' && elem.dataset.srcset) { + srcset.push({srcset: elem.dataset.srcset, resolve: baseURI}); + } + } else if (elem.dataset.srcset) { + srcset.push({srcset: elem.dataset.srcset, resolve: elem.src != null && elem.src !== ' ' ? elem.src : baseURI}); + } + } + if (srcset.length) { + this.postMessage({ + 'type': 'values', + 'srcset': {'values': srcset, 'presplit': false}, + 'context': { + 'docBaseURI': $wbwindow.document.baseURI + } + }, true); + } + }; - AutoFetchWorker.prototype.extractFromLocalDoc = function () { + AutoFetchWorkerProxyMode.prototype.extractFromLocalDoc = function () { var i = 0; var media = []; var deferredMediaURLS = []; - var srcset = []; var sheet; var resolve; // We must use the window reference passed to us to access this origins stylesheets @@ -307,17 +360,11 @@ var _WBWombat = function ($wbwindow, wbinfo) { } // We must use the window reference passed to us to access this origins elements with srcset attr // like cssRule handling we must include a URL to resolve relative URLs by - var srcsetElems = $wbwindow.document.querySelectorAll('img[srcset]'); - var ssElem, resolveAgainst; - for (i = 0; i < srcsetElems.length; i++) { - ssElem = srcsetElems[i]; - resolveAgainst = ssElem.src != null && ssElem.src !== ' ' ? 
ssElem.src : $wbwindow.document.baseURI; - srcset.push({'srcset': ssElem.srcset, 'resolve': resolveAgainst}); - } + var srcset = this.extractImgPictureSourceSrcsets(); // send what we have extracted, if anything, to the worker for processing if (media.length > 0 || srcset.length > 0) { - this.postMessage({'type': 'values', 'media': media, 'srcset': srcset}); + this.postMessage({'type': 'values', 'media': media, 'srcset': srcset}, true); } if (deferredMediaURLS.length > 0) { @@ -334,9 +381,15 @@ var _WBWombat = function ($wbwindow, wbinfo) { } }); } + // defer the checking of img/source data-srcset to a promise callback + // so that it runs once the current synchronous work has finished + var self = this; + Promise.resolve().then(function () { + self.checkForPictureSourceDataSrcsets(); + }); }; - WBAutoFetchWorker = new AutoFetchWorker(); + WBAutoFetchWorker = new AutoFetchWorkerProxyMode(); if (isTop) { $wbwindow.addEventListener("message", function (event) {