1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

autoFetchWorker.js improvements: (#397)

- ensured that autoFetchWorker uses full srcset URLs
- resolves the URL against the img.src or document.baseURI if not rewritten
- otherwise ensures the rewritten URL is not relative or schemeless
wombat.js:
- updated AutoFetchWorker's extractFromLocalDoc to send URL resolution information to the worker
- deferred the extractFromLocalDoc and preserveSrcset postMessages to ensure the page viewer can see the images first
This commit is contained in:
John Berlin 2018-10-23 15:52:58 -04:00 committed by Ilya Kreymer
parent a9e4b5c469
commit 82f2dace64
2 changed files with 121 additions and 57 deletions

View File

@ -50,7 +50,6 @@ function AutoFetcher(init) {
if (!(this instanceof AutoFetcher)) { if (!(this instanceof AutoFetcher)) {
return new AutoFetcher(init); return new AutoFetcher(init);
} }
this.proxyMode = init.proxyMode;
this.prefix = init.prefix; this.prefix = init.prefix;
this.mod = init.mod; this.mod = init.mod;
this.prefixMod = init.prefix + init.mod; this.prefixMod = init.prefix + init.mod;
@ -88,14 +87,13 @@ AutoFetcher.prototype.fixupURL = function (url) {
}; };
AutoFetcher.prototype.safeFetch = function (url) { AutoFetcher.prototype.safeFetch = function (url) {
var fixedURL = this.fixupURL(url);
// check to see if we have seen this url before in order // check to see if we have seen this url before in order
// to lessen the load against the server content is fetched from // to lessen the load against the server content is fetched from
if (this.seen[url] != null) return; if (this.seen[url] != null) return;
this.seen[url] = true; this.seen[url] = true;
if (this.queuing) { if (this.queuing) {
// we are currently waiting for a batch of fetches to complete // we are currently waiting for a batch of fetches to complete
return this.queue.push(fixedURL); return this.queue.push(url);
} }
// fetch this url // fetch this url
this.fetches.push(fetch(url)); this.fetches.push(fetch(url));
@ -103,7 +101,7 @@ AutoFetcher.prototype.safeFetch = function (url) {
AutoFetcher.prototype.urlExtractor = function (match, n1, n2, n3, offset, string) { AutoFetcher.prototype.urlExtractor = function (match, n1, n2, n3, offset, string) {
// Same function as style_replacer in wombat.rewrite_style, n2 is our URL // Same function as style_replacer in wombat.rewrite_style, n2 is our URL
this.safeFetch(n2); this.safeFetch(this.fixupURL(n2));
return n1 + n2 + n3; return n1 + n2 + n3;
}; };
@ -154,27 +152,79 @@ AutoFetcher.prototype.extractMedia = function (mediaRules) {
} }
}; };
AutoFetcher.prototype.extractSrcset = function (srcsets) { AutoFetcher.prototype.maybeFixUpRelSchemelessPrefix = function (url) {
// attempt to ensure rewritten relative or schemeless URLs become full URLS!
// otherwise returns null if this did not happen
if (url.indexOf(this.relative) === 0) {
return url.replace(this.relative, this.prefix);
}
if (url.indexOf(this.schemeless) === 0) {
return url.replace(this.schemeless, this.prefix);
}
return null;
};
AutoFetcher.prototype.maybeResolveURL = function (url, base) {
// given a url and base url returns a resolved full URL or
// null if resolution was unsuccessful
try {
var _url = new URL(url, base);
return _url.href;
} catch (e) {
return null;
}
};
AutoFetcher.prototype.fixupURLSrcSet = function (url, tagSrc, context) {
// attempt to fix up the url and do our best to ensure we can get dat 200 OK!
if (url.indexOf(this.prefix) !== 0) {
// first check for / (relative) or // (schemeless) rewritten urls
var maybeFixed = this.maybeFixUpRelSchemelessPrefix(url);
if (maybeFixed != null) {
return maybeFixed;
}
// resolve URL against tag src
maybeFixed = this.maybeResolveURL(url, tagSrc);
if (maybeFixed != null) {
return this.prefix + 'im_/' + maybeFixed;
}
// finally last attempt resolve the originating documents base URI
maybeFixed = this.maybeResolveURL(url, context.docBaseURI);
if (maybeFixed != null) {
return this.prefix + 'im_/' + maybeFixed;
}
// not much to do now.....
return this.prefixMod + '/' + url;
}
return url;
};
AutoFetcher.prototype.extractSrcset = function (srcsets, context) {
if (srcsets == null || srcsets.values == null) return; if (srcsets == null || srcsets.values == null) return;
var srcsetValues = srcsets.values; var srcsetValues = srcsets.values;
// was srcsets from rewrite_srcset and if so no need to split if (!srcsets.presplit) {
var presplit = srcsets.presplit; // was from extract from local doc so we need to duplicate work
return this.srcsetNotPreSplit(srcsetValues, context);
}
// was rewrite_srcset so just ensure we just
for (var i = 0; i < srcsetValues.length; i++) { for (var i = 0; i < srcsetValues.length; i++) {
var srcset = srcsetValues[i]; // grab the URL not width/height key
if (presplit) { this.safeFetch(srcsetValues[i].split(' ')[0]);
// was rewrite_srcset so just ensure we just }
};
AutoFetcher.prototype.srcsetNotPreSplit = function (values, context) {
// was from extract from local doc so we need to duplicate work
var j;
for (var i = 0; i < values.length; i++) {
var srcsetValues = values[i].srcset.split(srcsetSplit);
var tagSrc = values[i].tagSrc;
for (j = 0; j < srcsetValues.length; j++) {
// grab the URL not width/height key // grab the URL not width/height key
this.safeFetch(srcset.split(' ')[0]); if (Boolean(srcsetValues[j])) {
} else { var value = srcsetValues[j].trim().split(' ')[0];
// was from extract from local doc so we need to duplicate work this.safeFetch(this.fixupURLSrcSet(value, tagSrc, context));
var values = srcset.split(srcsetSplit);
for (var j = 0; j < values.length; j++) {
if (Boolean(values[j])) {
var value = values[j].trim();
if (value.length > 0) {
this.safeFetch(value.split(' ')[0]);
}
}
} }
} }
} }
@ -184,7 +234,7 @@ AutoFetcher.prototype.autofetchMediaSrcset = function (data) {
// we got a message and now we autofetch! // we got a message and now we autofetch!
// these calls turn into no ops if they have no work // these calls turn into no ops if they have no work
this.extractMedia(data.media); this.extractMedia(data.media);
this.extractSrcset(data.srcset); this.extractSrcset(data.srcset, data.context);
this.fetchAll(); this.fetchAll();
}; };

View File

@ -1151,7 +1151,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
} else if (lowername == "style") { } else if (lowername == "style") {
value = rewrite_style(value); value = rewrite_style(value);
} else if (lowername == "srcset") { } else if (lowername == "srcset") {
value = rewrite_srcset(value); value = rewrite_srcset(value, this.tagName === 'IMG');
} }
} }
orig_setAttribute.call(this, name, value); orig_setAttribute.call(this, name, value);
@ -1403,16 +1403,23 @@ var _WBWombat = function($wbwindow, wbinfo) {
this.worker.terminate(); this.worker.terminate();
}; };
AutoFetchWorker.prototype.postMessage = function (msg) { AutoFetchWorker.prototype.postMessage = function (msg, deferred) {
if (deferred) {
var self = this;
return Promise.resolve().then(function () {
self.worker.postMessage(msg);
});
}
this.worker.postMessage(msg); this.worker.postMessage(msg);
}; };
AutoFetchWorker.prototype.preserveSrcset = function (srcset) { AutoFetchWorker.prototype.preserveSrcset = function (srcset) {
// send values from rewrite_srcset to the worker // send values from rewrite_srcset to the worker deferred
// to ensure the page viewer sees the images first
this.postMessage({ this.postMessage({
'type': 'values', 'type': 'values',
'srcset': {'values': srcset, 'presplit': true}, 'srcset': {'values': srcset, 'presplit': true},
}); }, true);
}; };
AutoFetchWorker.prototype.preserveMedia = function (media) { AutoFetchWorker.prototype.preserveMedia = function (media) {
@ -1421,36 +1428,42 @@ var _WBWombat = function($wbwindow, wbinfo) {
}; };
AutoFetchWorker.prototype.extractFromLocalDoc = function () { AutoFetchWorker.prototype.extractFromLocalDoc = function () {
// get the values to be preserved from the documents stylesheets // get the values to be preserved from the documents stylesheets
// and all elements with a srcset // and all elements with a srcset
var media = []; var media = [];
var srcset = []; var srcset = [];
var sheets = $wbwindow.document.styleSheets; var sheets = $wbwindow.document.styleSheets;
var i = 0; var i = 0;
for (; i < sheets.length; ++i) { for (; i < sheets.length; ++i) {
var rules = sheets[i].cssRules; var rules = sheets[i].cssRules;
for (var j = 0; j < rules.length; ++j) { for (var j = 0; j < rules.length; ++j) {
var rule = rules[j]; var rule = rules[j];
if (rule.type === CSSRule.MEDIA_RULE) { if (rule.type === CSSRule.MEDIA_RULE) {
media.push(rule.cssText); media.push(rule.cssText);
}
} }
} }
var srcsetElems = $wbwindow.document.querySelectorAll('img[srcset]'); }
for (i = 0; i < srcsetElems.length; i++) { var srcsetElems = $wbwindow.document.querySelectorAll('img[srcset]');
var srcsetElem = srcsetElems[i]; for (i = 0; i < srcsetElems.length; i++) {
if (wb_getAttribute) { var ssv = {tagSrc: srcsetElems[i].src};
srcset.push(wb_getAttribute.call(srcsetElem, 'srcset')); if (wb_getAttribute) {
} else { ssv.srcset = wb_getAttribute.call(srcsetElems[i], 'srcset');
srcset.push(srcsetElem.getAttribute('srcset')); } else {
} ssv.srcset = srcsetElems[i].getAttribute('srcset');
} }
this.postMessage({ srcset.push(ssv);
'type': 'values', }
'media': media, // send the extracted values to the worker deferred
'srcset': {'values': srcset, 'presplit': false}, // to ensure the page viewer sees the images first
}); this.postMessage({
}; 'type': 'values',
'media': media,
'srcset': {'values': srcset, 'presplit': false},
'context': {
'docBaseURI': $wbwindow.document.baseURI
}
}, true);
};
WBAutoFetchWorker = new AutoFetchWorker(wb_abs_prefix, wbinfo.mod); WBAutoFetchWorker = new AutoFetchWorker(wb_abs_prefix, wbinfo.mod);
@ -1601,7 +1614,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
} else if (name == "style") { } else if (name == "style") {
new_value = rewrite_style(value); new_value = rewrite_style(value);
} else if (name == "srcset") { } else if (name == "srcset") {
new_value = rewrite_srcset(value); new_value = rewrite_srcset(value, elem.tagName === 'IMG');
} else { } else {
// Only rewrite if absolute url // Only rewrite if absolute url
if (abs_url_only && !starts_with(value, VALID_PREFIXES)) { if (abs_url_only && !starts_with(value, VALID_PREFIXES)) {
@ -1643,7 +1656,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
} }
//============================================ //============================================
function rewrite_srcset(value) function rewrite_srcset(value, isImage)
{ {
if (!value) { if (!value) {
return ""; return "";
@ -1655,7 +1668,8 @@ var _WBWombat = function($wbwindow, wbinfo) {
for (var i = 0; i < values.length; i++) { for (var i = 0; i < values.length; i++) {
values[i] = rewrite_url(values[i].trim()); values[i] = rewrite_url(values[i].trim());
} }
if (wbUseAFWorker) {
if (wbUseAFWorker && isImage) {
// send post split values to preservation worker // send post split values to preservation worker
WBAutoFetchWorker.preserveSrcset(values); WBAutoFetchWorker.preserveSrcset(values);
} }
@ -2004,7 +2018,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
if (mod == "cs_" && orig.indexOf("data:text/css") == 0) { if (mod == "cs_" && orig.indexOf("data:text/css") == 0) {
val = rewrite_inline_style(orig); val = rewrite_inline_style(orig);
} else if (attr == "srcset") { } else if (attr == "srcset") {
val = rewrite_srcset(orig); val = rewrite_srcset(orig, this.tagName === 'IMG');
} else if (this.tagName === 'LINK' && attr === 'href') { } else if (this.tagName === 'LINK' && attr === 'href') {
var relV = this.rel; var relV = this.rel;
if (relV === 'import' || relV === 'preload') { if (relV === 'import' || relV === 'preload') {