From 82f2dace64497e457bb2237d10b566ac0707b228 Mon Sep 17 00:00:00 2001 From: John Berlin Date: Tue, 23 Oct 2018 15:52:58 -0400 Subject: [PATCH] autoFetchWorker.js improvements: (#397) - ensured that autoFetchWorker uses full srcset URLs - resolves the URL against the img.src or document.baseURI if not rewritten - otherwise ensures the rewritten URL is not relative or schemeless wombat.js: - AutoFetchWorker updated extractFromLocalDoc to send URL resolution information to the worker - defer extractFromLocalDoc and preserveSrcset postMessages to ensure page viewer can see the images first --- pywb/static/autoFetchWorker.js | 94 ++++++++++++++++++++++++++-------- pywb/static/wombat.js | 84 +++++++++++++++++------------- 2 files changed, 121 insertions(+), 57 deletions(-) diff --git a/pywb/static/autoFetchWorker.js b/pywb/static/autoFetchWorker.js index b5d46ba9..fe0fdcef 100644 --- a/pywb/static/autoFetchWorker.js +++ b/pywb/static/autoFetchWorker.js @@ -50,7 +50,6 @@ function AutoFetcher(init) { if (!(this instanceof AutoFetcher)) { return new AutoFetcher(init); } - this.proxyMode = init.proxyMode; this.prefix = init.prefix; this.mod = init.mod; this.prefixMod = init.prefix + init.mod; @@ -88,14 +87,13 @@ AutoFetcher.prototype.fixupURL = function (url) { }; AutoFetcher.prototype.safeFetch = function (url) { - var fixedURL = this.fixupURL(url); // check to see if we have seen this url before in order // to lessen the load against the server content is fetched from if (this.seen[url] != null) return; this.seen[url] = true; if (this.queuing) { // we are currently waiting for a batch of fetches to complete - return this.queue.push(fixedURL); + return this.queue.push(url); } // fetch this url this.fetches.push(fetch(url)); @@ -103,7 +101,7 @@ AutoFetcher.prototype.safeFetch = function (url) { AutoFetcher.prototype.urlExtractor = function (match, n1, n2, n3, offset, string) { // Same function as style_replacer in wombat.rewrite_style, n2 is our URL - this.safeFetch(n2); + this.safeFetch(this.fixupURL(n2)); return n1 + n2 + n3; }; @@ -154,27 +152,79 @@ AutoFetcher.prototype.extractMedia = function (mediaRules) { } }; -AutoFetcher.prototype.extractSrcset = function (srcsets) { +AutoFetcher.prototype.maybeFixUpRelSchemelessPrefix = function (url) { + // attempt to ensure rewritten relative or schemeless URLs become full URLS! + // otherwise returns null if this did not happen + if (url.indexOf(this.relative) === 0) { + return url.replace(this.relative, this.prefix); + } + if (url.indexOf(this.schemeless) === 0) { + return url.replace(this.schemeless, this.prefix); + } + return null; +}; + +AutoFetcher.prototype.maybeResolveURL = function (url, base) { + // given a url and base url returns a resolved full URL or + // null if resolution was unsuccessful + try { + var _url = new URL(url, base); + return _url.href; + } catch (e) { + return null; + } +}; + + +AutoFetcher.prototype.fixupURLSrcSet = function (url, tagSrc, context) { + // attempt to fix up the url and do our best to ensure we can get dat 200 OK! + if (url.indexOf(this.prefix) !== 0) { + // first check for / (relative) or // (schemeless) rewritten urls + var maybeFixed = this.maybeFixUpRelSchemelessPrefix(url); + if (maybeFixed != null) { + return maybeFixed; + } + // resolve URL against tag src + maybeFixed = this.maybeResolveURL(url, tagSrc); + if (maybeFixed != null) { + return this.prefix + 'im_/' + maybeFixed; + } + // finally last attempt resolve the originating documents base URI + maybeFixed = this.maybeResolveURL(url, context.docBaseURI); + if (maybeFixed != null) { + return this.prefix + 'im_/' + maybeFixed; + } + // not much to do now..... + return this.prefixMod + '/' + url; + } + return url; +}; + +AutoFetcher.prototype.extractSrcset = function (srcsets, context) { if (srcsets == null || srcsets.values == null) return; var srcsetValues = srcsets.values; - // was srcsets from rewrite_srcset and if so no need to split - var presplit = srcsets.presplit; + if (!srcsets.presplit) { + // was from extract from local doc so we need to duplicate work + return this.srcsetNotPreSplit(srcsetValues, context); + } + // was rewrite_srcset so just ensure we just for (var i = 0; i < srcsetValues.length; i++) { - var srcset = srcsetValues[i]; - if (presplit) { - // was rewrite_srcset so just ensure we just + // grab the URL not width/height key + this.safeFetch(srcsetValues[i].split(' ')[0]); + } +}; + +AutoFetcher.prototype.srcsetNotPreSplit = function (values, context) { + // was from extract from local doc so we need to duplicate work + var j; + for (var i = 0; i < values.length; i++) { + var srcsetValues = values[i].srcset.split(srcsetSplit); + var tagSrc = values[i].tagSrc; + for (j = 0; j < srcsetValues.length; j++) { // grab the URL not width/height key - this.safeFetch(srcset.split(' ')[0]); - } else { - // was from extract from local doc so we need to duplicate work - var values = srcset.split(srcsetSplit); - for (var j = 0; j < values.length; j++) { - if (Boolean(values[j])) { - var value = values[j].trim(); - if (value.length > 0) { - this.safeFetch(value.split(' ')[0]); - } - } + if (Boolean(srcsetValues[j])) { + var value = srcsetValues[j].trim().split(' ')[0]; + this.safeFetch(this.fixupURLSrcSet(value, tagSrc, context)); } } } @@ -184,7 +234,7 @@ AutoFetcher.prototype.autofetchMediaSrcset = function (data) { // we got a message and now we autofetch! // these calls turn into no ops if they have no work this.extractMedia(data.media); - this.extractSrcset(data.srcset); + this.extractSrcset(data.srcset, data.context); this.fetchAll(); }; diff --git a/pywb/static/wombat.js b/pywb/static/wombat.js index 50be9d40..b7bf3c64 100644 --- a/pywb/static/wombat.js +++ b/pywb/static/wombat.js @@ -1151,7 +1151,7 @@ var _WBWombat = function($wbwindow, wbinfo) { } else if (lowername == "style") { value = rewrite_style(value); } else if (lowername == "srcset") { - value = rewrite_srcset(value); + value = rewrite_srcset(value, this.tagName === 'IMG'); } } orig_setAttribute.call(this, name, value); @@ -1403,16 +1403,23 @@ var _WBWombat = function($wbwindow, wbinfo) { this.worker.terminate(); }; - AutoFetchWorker.prototype.postMessage = function (msg) { + AutoFetchWorker.prototype.postMessage = function (msg, deferred) { + if (deferred) { + var self = this; + return Promise.resolve().then(function () { + self.worker.postMessage(msg); + }); + } this.worker.postMessage(msg); }; AutoFetchWorker.prototype.preserveSrcset = function (srcset) { - // send values from rewrite_srcset to the worker + // send values from rewrite_srcset to the worker deferred + // to ensure the page viewer sees the images first this.postMessage({ 'type': 'values', 'srcset': {'values': srcset, 'presplit': true}, - }); + }, true); }; AutoFetchWorker.prototype.preserveMedia = function (media) { @@ -1421,36 +1428,42 @@ var _WBWombat = function($wbwindow, wbinfo) { }; AutoFetchWorker.prototype.extractFromLocalDoc = function () { - // get the values to be preserved from the documents stylesheets - // and all elements with a srcset - var media = []; - var srcset = []; - var sheets = $wbwindow.document.styleSheets; - var i = 0; - for (; i < sheets.length; ++i) { - var rules = sheets[i].cssRules; - for (var j = 0; j < rules.length; ++j) { - var rule = rules[j]; - if (rule.type === CSSRule.MEDIA_RULE) { - media.push(rule.cssText); - } + // get the values to be preserved from the documents stylesheets + // and all elements with a srcset + var media = []; + var srcset = []; + var sheets = $wbwindow.document.styleSheets; + var i = 0; + for (; i < sheets.length; ++i) { + var rules = sheets[i].cssRules; + for (var j = 0; j < rules.length; ++j) { + var rule = rules[j]; + if (rule.type === CSSRule.MEDIA_RULE) { + media.push(rule.cssText); } } - var srcsetElems = $wbwindow.document.querySelectorAll('img[srcset]'); - for (i = 0; i < srcsetElems.length; i++) { - var srcsetElem = srcsetElems[i]; - if (wb_getAttribute) { - srcset.push(wb_getAttribute.call(srcsetElem, 'srcset')); - } else { - srcset.push(srcsetElem.getAttribute('srcset')); - } + } + var srcsetElems = $wbwindow.document.querySelectorAll('img[srcset]'); + for (i = 0; i < srcsetElems.length; i++) { + var ssv = {tagSrc: srcsetElems[i].src}; + if (wb_getAttribute) { + ssv.srcset = wb_getAttribute.call(srcsetElems[i], 'srcset'); + } else { + ssv.srcset = srcsetElems[i].getAttribute('srcset'); } - this.postMessage({ - 'type': 'values', - 'media': media, - 'srcset': {'values': srcset, 'presplit': false}, - }); - }; + srcset.push(ssv); + } + // send the extracted values to the worker deferred + // to ensure the page viewer sees the images first + this.postMessage({ + 'type': 'values', + 'media': media, + 'srcset': {'values': srcset, 'presplit': false}, + 'context': { + 'docBaseURI': $wbwindow.document.baseURI + } + }, true); + }; WBAutoFetchWorker = new AutoFetchWorker(wb_abs_prefix, wbinfo.mod); @@ -1601,7 +1614,7 @@ var _WBWombat = function($wbwindow, wbinfo) { } else if (name == "style") { new_value = rewrite_style(value); } else if (name == "srcset") { - new_value = rewrite_srcset(value); + new_value = rewrite_srcset(value, elem.tagName === 'IMG'); } else { // Only rewrite if absolute url if (abs_url_only && !starts_with(value, VALID_PREFIXES)) { @@ -1643,7 +1656,7 @@ var _WBWombat = function($wbwindow, wbinfo) { } //============================================ - function rewrite_srcset(value) + function rewrite_srcset(value, isImage) { if (!value) { return ""; @@ -1655,7 +1668,8 @@ var _WBWombat = function($wbwindow, wbinfo) { for (var i = 0; i < values.length; i++) { values[i] = rewrite_url(values[i].trim()); } - if (wbUseAFWorker) { + + if (wbUseAFWorker && isImage) { // send post split values to preservation worker WBAutoFetchWorker.preserveSrcset(values); } @@ -2004,7 +2018,7 @@ var _WBWombat = function($wbwindow, wbinfo) { if (mod == "cs_" && orig.indexOf("data:text/css") == 0) { val = rewrite_inline_style(orig); } else if (attr == "srcset") { - val = rewrite_srcset(orig); + val = rewrite_srcset(orig, this.tagName === 'IMG'); } else if (this.tagName === 'LINK' && attr === 'href') { var relV = this.rel; if (relV === 'import' || relV === 'preload') {