1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

autoFetchWorker.js improvements: (#397)

- ensured that autoFetchWorker uses full srcset URLs
- resolves the URL against the img.src or document.baseURI if not rewritten
- otherwise ensures the rewritten URL is not relative or schemeless
wombat.js:
- AutoFetchWorker updated extractFromLocalDoc to send URL resolution information to the worker
- defer extractFromLocalDoc and preserveSrcset postMessages to ensure page viewer can see the images first
This commit is contained in:
John Berlin 2018-10-23 15:52:58 -04:00 committed by Ilya Kreymer
parent a9e4b5c469
commit 82f2dace64
2 changed files with 121 additions and 57 deletions

View File

@ -50,7 +50,6 @@ function AutoFetcher(init) {
if (!(this instanceof AutoFetcher)) {
return new AutoFetcher(init);
}
this.proxyMode = init.proxyMode;
this.prefix = init.prefix;
this.mod = init.mod;
this.prefixMod = init.prefix + init.mod;
@ -88,14 +87,13 @@ AutoFetcher.prototype.fixupURL = function (url) {
};
AutoFetcher.prototype.safeFetch = function (url) {
var fixedURL = this.fixupURL(url);
// check to see if we have seen this url before in order
// to lessen the load against the server content is fetched from
if (this.seen[url] != null) return;
this.seen[url] = true;
if (this.queuing) {
// we are currently waiting for a batch of fetches to complete
return this.queue.push(fixedURL);
return this.queue.push(url);
}
// fetch this url
this.fetches.push(fetch(url));
@ -103,7 +101,7 @@ AutoFetcher.prototype.safeFetch = function (url) {
AutoFetcher.prototype.urlExtractor = function (match, n1, n2, n3, offset, string) {
// Hands the captured URL (n2) to safeFetch and returns the three captured
// pieces re-joined unchanged (n1 + n2 + n3).
// The parameter list has the shape of a String.prototype.replace callback;
// presumably that is how it is invoked -- confirm against the caller.
// Same function as style_replacer in wombat.rewrite_style, n2 is our URL
// NOTE(review): two safeFetch calls are visible here, one with the raw n2 and
// one with n2 passed through fixupURL. This looks like a before/after pair
// from a diff rendering; confirm which single call is intended.
this.safeFetch(n2);
this.safeFetch(this.fixupURL(n2));
return n1 + n2 + n3;
};
@ -154,27 +152,79 @@ AutoFetcher.prototype.extractMedia = function (mediaRules) {
}
};
AutoFetcher.prototype.extractSrcset = function (srcsets) {
AutoFetcher.prototype.maybeFixUpRelSchemelessPrefix = function (url) {
  // A rewritten URL may start with the relative or the schemeless form of the
  // prefix instead of the full one. When it does, swap that leading form for
  // this.prefix so the result is a full URL.
  // Returns the fixed URL, or null when the URL starts with neither form.
  var partialPrefixes = [this.relative, this.schemeless];
  for (var idx = 0; idx < partialPrefixes.length; idx++) {
    var partial = partialPrefixes[idx];
    if (url.indexOf(partial) === 0) {
      return url.replace(partial, this.prefix);
    }
  }
  return null;
};
AutoFetcher.prototype.maybeResolveURL = function (url, base) {
  // Resolve url against base using the URL constructor.
  // Returns the resolved absolute href, or null when the constructor
  // rejects the url/base pair.
  var resolved = null;
  try {
    resolved = new URL(url, base).href;
  } catch (error) {
    // resolution failed; resolved stays null
  }
  return resolved;
};
AutoFetcher.prototype.fixupURLSrcSet = function (url, tagSrc, context) {
  // Best-effort conversion of a srcset URL into a full, prefixed URL.
  //   url     - a single URL taken from a srcset value
  //   tagSrc  - the src of the tag the srcset came from, used as a base
  //   context - optional object; its docBaseURI is used as a second base
  // Returns url unchanged when it already starts with this.prefix, otherwise
  // the first successful result of the steps below.
  if (url.indexOf(this.prefix) === 0) {
    return url;
  }
  // first check for / (relative) or // (schemeless) rewritten urls
  var maybeFixed = this.maybeFixUpRelSchemelessPrefix(url);
  if (maybeFixed != null) {
    return maybeFixed;
  }
  // resolve URL against tag src
  maybeFixed = this.maybeResolveURL(url, tagSrc);
  if (maybeFixed != null) {
    return this.prefix + 'im_/' + maybeFixed;
  }
  // last attempt: resolve against the originating document's base URI.
  // context may be absent, so guard the property access; otherwise a missing
  // context would throw instead of reaching the fallback below.
  if (context != null) {
    maybeFixed = this.maybeResolveURL(url, context.docBaseURI);
    if (maybeFixed != null) {
      return this.prefix + 'im_/' + maybeFixed;
    }
  }
  // nothing resolved: fall back to prefix + mod followed by the raw url
  return this.prefixMod + '/' + url;
};
AutoFetcher.prototype.extractSrcset = function (srcsets, context) {
if (srcsets == null || srcsets.values == null) return;
var srcsetValues = srcsets.values;
// was srcsets from rewrite_srcset and if so no need to split
var presplit = srcsets.presplit;
if (!srcsets.presplit) {
// was from extract from local doc so we need to duplicate work
return this.srcsetNotPreSplit(srcsetValues, context);
}
// was rewrite_srcset so just ensure we just
for (var i = 0; i < srcsetValues.length; i++) {
var srcset = srcsetValues[i];
if (presplit) {
// was rewrite_srcset so just ensure we just
// grab the URL not width/height key
this.safeFetch(srcsetValues[i].split(' ')[0]);
}
};
AutoFetcher.prototype.srcsetNotPreSplit = function (values, context) {
// was from extract from local doc so we need to duplicate work
var j;
for (var i = 0; i < values.length; i++) {
var srcsetValues = values[i].srcset.split(srcsetSplit);
var tagSrc = values[i].tagSrc;
for (j = 0; j < srcsetValues.length; j++) {
// grab the URL not width/height key
this.safeFetch(srcset.split(' ')[0]);
} else {
// was from extract from local doc so we need to duplicate work
var values = srcset.split(srcsetSplit);
for (var j = 0; j < values.length; j++) {
if (Boolean(values[j])) {
var value = values[j].trim();
if (value.length > 0) {
this.safeFetch(value.split(' ')[0]);
}
}
if (Boolean(srcsetValues[j])) {
var value = srcsetValues[j].trim().split(' ')[0];
this.safeFetch(this.fixupURLSrcSet(value, tagSrc, context));
}
}
}
@ -184,7 +234,7 @@ AutoFetcher.prototype.autofetchMediaSrcset = function (data) {
// we got a message and now we autofetch!
// these calls turn into no ops if they have no work
this.extractMedia(data.media);
this.extractSrcset(data.srcset);
this.extractSrcset(data.srcset, data.context);
this.fetchAll();
};

View File

@ -1151,7 +1151,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
} else if (lowername == "style") {
value = rewrite_style(value);
} else if (lowername == "srcset") {
value = rewrite_srcset(value);
value = rewrite_srcset(value, this.tagName === 'IMG');
}
}
orig_setAttribute.call(this, name, value);
@ -1403,16 +1403,23 @@ var _WBWombat = function($wbwindow, wbinfo) {
this.worker.terminate();
};
AutoFetchWorker.prototype.postMessage = function (msg) {
AutoFetchWorker.prototype.postMessage = function (msg, deferred) {
if (deferred) {
var self = this;
return Promise.resolve().then(function () {
self.worker.postMessage(msg);
});
}
this.worker.postMessage(msg);
};
AutoFetchWorker.prototype.preserveSrcset = function (srcset) {
// send values from rewrite_srcset to the worker
// send values from rewrite_srcset to the worker deferred
// to ensure the page viewer sees the images first
this.postMessage({
'type': 'values',
'srcset': {'values': srcset, 'presplit': true},
});
}, true);
};
AutoFetchWorker.prototype.preserveMedia = function (media) {
@ -1421,36 +1428,42 @@ var _WBWombat = function($wbwindow, wbinfo) {
};
AutoFetchWorker.prototype.extractFromLocalDoc = function () {
// get the values to be preserved from the documents stylesheets
// and all elements with a srcset
var media = [];
var srcset = [];
var sheets = $wbwindow.document.styleSheets;
var i = 0;
for (; i < sheets.length; ++i) {
var rules = sheets[i].cssRules;
for (var j = 0; j < rules.length; ++j) {
var rule = rules[j];
if (rule.type === CSSRule.MEDIA_RULE) {
media.push(rule.cssText);
}
// get the values to be preserved from the documents stylesheets
// and all elements with a srcset
var media = [];
var srcset = [];
var sheets = $wbwindow.document.styleSheets;
var i = 0;
for (; i < sheets.length; ++i) {
var rules = sheets[i].cssRules;
for (var j = 0; j < rules.length; ++j) {
var rule = rules[j];
if (rule.type === CSSRule.MEDIA_RULE) {
media.push(rule.cssText);
}
}
var srcsetElems = $wbwindow.document.querySelectorAll('img[srcset]');
for (i = 0; i < srcsetElems.length; i++) {
var srcsetElem = srcsetElems[i];
if (wb_getAttribute) {
srcset.push(wb_getAttribute.call(srcsetElem, 'srcset'));
} else {
srcset.push(srcsetElem.getAttribute('srcset'));
}
}
var srcsetElems = $wbwindow.document.querySelectorAll('img[srcset]');
for (i = 0; i < srcsetElems.length; i++) {
var ssv = {tagSrc: srcsetElems[i].src};
if (wb_getAttribute) {
ssv.srcset = wb_getAttribute.call(srcsetElems[i], 'srcset');
} else {
ssv.srcset = srcsetElems[i].getAttribute('srcset');
}
this.postMessage({
'type': 'values',
'media': media,
'srcset': {'values': srcset, 'presplit': false},
});
};
srcset.push(ssv);
}
// send the extracted values to the worker deferred
// to ensure the page viewer sees the images first
this.postMessage({
'type': 'values',
'media': media,
'srcset': {'values': srcset, 'presplit': false},
'context': {
'docBaseURI': $wbwindow.document.baseURI
}
}, true);
};
WBAutoFetchWorker = new AutoFetchWorker(wb_abs_prefix, wbinfo.mod);
@ -1601,7 +1614,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
} else if (name == "style") {
new_value = rewrite_style(value);
} else if (name == "srcset") {
new_value = rewrite_srcset(value);
new_value = rewrite_srcset(value, elem.tagName === 'IMG');
} else {
// Only rewrite if absolute url
if (abs_url_only && !starts_with(value, VALID_PREFIXES)) {
@ -1643,7 +1656,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
}
//============================================
function rewrite_srcset(value)
function rewrite_srcset(value, isImage)
{
if (!value) {
return "";
@ -1655,7 +1668,8 @@ var _WBWombat = function($wbwindow, wbinfo) {
for (var i = 0; i < values.length; i++) {
values[i] = rewrite_url(values[i].trim());
}
if (wbUseAFWorker) {
if (wbUseAFWorker && isImage) {
// send post split values to preservation worker
WBAutoFetchWorker.preserveSrcset(values);
}
@ -2004,7 +2018,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
if (mod == "cs_" && orig.indexOf("data:text/css") == 0) {
val = rewrite_inline_style(orig);
} else if (attr == "srcset") {
val = rewrite_srcset(orig);
val = rewrite_srcset(orig, this.tagName === 'IMG');
} else if (this.tagName === 'LINK' && attr === 'href') {
var relV = this.rel;
if (relV === 'import' || relV === 'preload') {