mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
autoFetchWorker.js improvements: (#397)
- ensured that autoFetchWorker uses full srcset URLs
- resolves the URL against the img.src or document.baseURI if it was not rewritten
- otherwise ensures the rewritten URL is not relative or schemeless

wombat.js:
- AutoFetchWorker: updated extractFromLocalDoc to send URL resolution information to the worker
- defer the extractFromLocalDoc and preserveSrcset postMessages to ensure the page viewer can see the images first
This commit is contained in:
parent
a9e4b5c469
commit
82f2dace64
@ -50,7 +50,6 @@ function AutoFetcher(init) {
|
|||||||
if (!(this instanceof AutoFetcher)) {
|
if (!(this instanceof AutoFetcher)) {
|
||||||
return new AutoFetcher(init);
|
return new AutoFetcher(init);
|
||||||
}
|
}
|
||||||
this.proxyMode = init.proxyMode;
|
|
||||||
this.prefix = init.prefix;
|
this.prefix = init.prefix;
|
||||||
this.mod = init.mod;
|
this.mod = init.mod;
|
||||||
this.prefixMod = init.prefix + init.mod;
|
this.prefixMod = init.prefix + init.mod;
|
||||||
@ -88,14 +87,13 @@ AutoFetcher.prototype.fixupURL = function (url) {
|
|||||||
};
|
};
|
||||||
|
|
||||||
AutoFetcher.prototype.safeFetch = function (url) {
|
AutoFetcher.prototype.safeFetch = function (url) {
|
||||||
var fixedURL = this.fixupURL(url);
|
|
||||||
// check to see if we have seen this url before in order
|
// check to see if we have seen this url before in order
|
||||||
// to lessen the load against the server content is fetched from
|
// to lessen the load against the server content is fetched from
|
||||||
if (this.seen[url] != null) return;
|
if (this.seen[url] != null) return;
|
||||||
this.seen[url] = true;
|
this.seen[url] = true;
|
||||||
if (this.queuing) {
|
if (this.queuing) {
|
||||||
// we are currently waiting for a batch of fetches to complete
|
// we are currently waiting for a batch of fetches to complete
|
||||||
return this.queue.push(fixedURL);
|
return this.queue.push(url);
|
||||||
}
|
}
|
||||||
// fetch this url
|
// fetch this url
|
||||||
this.fetches.push(fetch(url));
|
this.fetches.push(fetch(url));
|
||||||
@ -103,7 +101,7 @@ AutoFetcher.prototype.safeFetch = function (url) {
|
|||||||
|
|
||||||
AutoFetcher.prototype.urlExtractor = function (match, n1, n2, n3, offset, string) {
|
AutoFetcher.prototype.urlExtractor = function (match, n1, n2, n3, offset, string) {
|
||||||
// Same function as style_replacer in wombat.rewrite_style, n2 is our URL
|
// Same function as style_replacer in wombat.rewrite_style, n2 is our URL
|
||||||
this.safeFetch(n2);
|
this.safeFetch(this.fixupURL(n2));
|
||||||
return n1 + n2 + n3;
|
return n1 + n2 + n3;
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -154,27 +152,79 @@ AutoFetcher.prototype.extractMedia = function (mediaRules) {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
AutoFetcher.prototype.extractSrcset = function (srcsets) {
|
AutoFetcher.prototype.maybeFixUpRelSchemelessPrefix = function (url) {
|
||||||
|
// attempt to ensure rewritten relative or schemeless URLs become full URLS!
|
||||||
|
// otherwise returns null if this did not happen
|
||||||
|
if (url.indexOf(this.relative) === 0) {
|
||||||
|
return url.replace(this.relative, this.prefix);
|
||||||
|
}
|
||||||
|
if (url.indexOf(this.schemeless) === 0) {
|
||||||
|
return url.replace(this.schemeless, this.prefix);
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
};
|
||||||
|
|
||||||
|
AutoFetcher.prototype.maybeResolveURL = function (url, base) {
|
||||||
|
// given a url and base url returns a resolved full URL or
|
||||||
|
// null if resolution was unsuccessful
|
||||||
|
try {
|
||||||
|
var _url = new URL(url, base);
|
||||||
|
return _url.href;
|
||||||
|
} catch (e) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
AutoFetcher.prototype.fixupURLSrcSet = function (url, tagSrc, context) {
|
||||||
|
// attempt to fix up the url and do our best to ensure we can get dat 200 OK!
|
||||||
|
if (url.indexOf(this.prefix) !== 0) {
|
||||||
|
// first check for / (relative) or // (schemeless) rewritten urls
|
||||||
|
var maybeFixed = this.maybeFixUpRelSchemelessPrefix(url);
|
||||||
|
if (maybeFixed != null) {
|
||||||
|
return maybeFixed;
|
||||||
|
}
|
||||||
|
// resolve URL against tag src
|
||||||
|
maybeFixed = this.maybeResolveURL(url, tagSrc);
|
||||||
|
if (maybeFixed != null) {
|
||||||
|
return this.prefix + 'im_/' + maybeFixed;
|
||||||
|
}
|
||||||
|
// finally last attempt resolve the originating documents base URI
|
||||||
|
maybeFixed = this.maybeResolveURL(url, context.docBaseURI);
|
||||||
|
if (maybeFixed != null) {
|
||||||
|
return this.prefix + 'im_/' + maybeFixed;
|
||||||
|
}
|
||||||
|
// not much to do now.....
|
||||||
|
return this.prefixMod + '/' + url;
|
||||||
|
}
|
||||||
|
return url;
|
||||||
|
};
|
||||||
|
|
||||||
|
AutoFetcher.prototype.extractSrcset = function (srcsets, context) {
|
||||||
if (srcsets == null || srcsets.values == null) return;
|
if (srcsets == null || srcsets.values == null) return;
|
||||||
var srcsetValues = srcsets.values;
|
var srcsetValues = srcsets.values;
|
||||||
// was srcsets from rewrite_srcset and if so no need to split
|
if (!srcsets.presplit) {
|
||||||
var presplit = srcsets.presplit;
|
// was from extract from local doc so we need to duplicate work
|
||||||
|
return this.srcsetNotPreSplit(srcsetValues, context);
|
||||||
|
}
|
||||||
|
// was rewrite_srcset so just ensure we just
|
||||||
for (var i = 0; i < srcsetValues.length; i++) {
|
for (var i = 0; i < srcsetValues.length; i++) {
|
||||||
var srcset = srcsetValues[i];
|
// grab the URL not width/height key
|
||||||
if (presplit) {
|
this.safeFetch(srcsetValues[i].split(' ')[0]);
|
||||||
// was rewrite_srcset so just ensure we just
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
AutoFetcher.prototype.srcsetNotPreSplit = function (values, context) {
|
||||||
|
// was from extract from local doc so we need to duplicate work
|
||||||
|
var j;
|
||||||
|
for (var i = 0; i < values.length; i++) {
|
||||||
|
var srcsetValues = values[i].srcset.split(srcsetSplit);
|
||||||
|
var tagSrc = values[i].tagSrc;
|
||||||
|
for (j = 0; j < srcsetValues.length; j++) {
|
||||||
// grab the URL not width/height key
|
// grab the URL not width/height key
|
||||||
this.safeFetch(srcset.split(' ')[0]);
|
if (Boolean(srcsetValues[j])) {
|
||||||
} else {
|
var value = srcsetValues[j].trim().split(' ')[0];
|
||||||
// was from extract from local doc so we need to duplicate work
|
this.safeFetch(this.fixupURLSrcSet(value, tagSrc, context));
|
||||||
var values = srcset.split(srcsetSplit);
|
|
||||||
for (var j = 0; j < values.length; j++) {
|
|
||||||
if (Boolean(values[j])) {
|
|
||||||
var value = values[j].trim();
|
|
||||||
if (value.length > 0) {
|
|
||||||
this.safeFetch(value.split(' ')[0]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -184,7 +234,7 @@ AutoFetcher.prototype.autofetchMediaSrcset = function (data) {
|
|||||||
// we got a message and now we autofetch!
|
// we got a message and now we autofetch!
|
||||||
// these calls turn into no ops if they have no work
|
// these calls turn into no ops if they have no work
|
||||||
this.extractMedia(data.media);
|
this.extractMedia(data.media);
|
||||||
this.extractSrcset(data.srcset);
|
this.extractSrcset(data.srcset, data.context);
|
||||||
this.fetchAll();
|
this.fetchAll();
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -1151,7 +1151,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
|
|||||||
} else if (lowername == "style") {
|
} else if (lowername == "style") {
|
||||||
value = rewrite_style(value);
|
value = rewrite_style(value);
|
||||||
} else if (lowername == "srcset") {
|
} else if (lowername == "srcset") {
|
||||||
value = rewrite_srcset(value);
|
value = rewrite_srcset(value, this.tagName === 'IMG');
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
orig_setAttribute.call(this, name, value);
|
orig_setAttribute.call(this, name, value);
|
||||||
@ -1403,16 +1403,23 @@ var _WBWombat = function($wbwindow, wbinfo) {
|
|||||||
this.worker.terminate();
|
this.worker.terminate();
|
||||||
};
|
};
|
||||||
|
|
||||||
AutoFetchWorker.prototype.postMessage = function (msg) {
|
AutoFetchWorker.prototype.postMessage = function (msg, deferred) {
|
||||||
|
if (deferred) {
|
||||||
|
var self = this;
|
||||||
|
return Promise.resolve().then(function () {
|
||||||
|
self.worker.postMessage(msg);
|
||||||
|
});
|
||||||
|
}
|
||||||
this.worker.postMessage(msg);
|
this.worker.postMessage(msg);
|
||||||
};
|
};
|
||||||
|
|
||||||
AutoFetchWorker.prototype.preserveSrcset = function (srcset) {
|
AutoFetchWorker.prototype.preserveSrcset = function (srcset) {
|
||||||
// send values from rewrite_srcset to the worker
|
// send values from rewrite_srcset to the worker deferred
|
||||||
|
// to ensure the page viewer sees the images first
|
||||||
this.postMessage({
|
this.postMessage({
|
||||||
'type': 'values',
|
'type': 'values',
|
||||||
'srcset': {'values': srcset, 'presplit': true},
|
'srcset': {'values': srcset, 'presplit': true},
|
||||||
});
|
}, true);
|
||||||
};
|
};
|
||||||
|
|
||||||
AutoFetchWorker.prototype.preserveMedia = function (media) {
|
AutoFetchWorker.prototype.preserveMedia = function (media) {
|
||||||
@ -1421,36 +1428,42 @@ var _WBWombat = function($wbwindow, wbinfo) {
|
|||||||
};
|
};
|
||||||
|
|
||||||
AutoFetchWorker.prototype.extractFromLocalDoc = function () {
|
AutoFetchWorker.prototype.extractFromLocalDoc = function () {
|
||||||
// get the values to be preserved from the documents stylesheets
|
// get the values to be preserved from the documents stylesheets
|
||||||
// and all elements with a srcset
|
// and all elements with a srcset
|
||||||
var media = [];
|
var media = [];
|
||||||
var srcset = [];
|
var srcset = [];
|
||||||
var sheets = $wbwindow.document.styleSheets;
|
var sheets = $wbwindow.document.styleSheets;
|
||||||
var i = 0;
|
var i = 0;
|
||||||
for (; i < sheets.length; ++i) {
|
for (; i < sheets.length; ++i) {
|
||||||
var rules = sheets[i].cssRules;
|
var rules = sheets[i].cssRules;
|
||||||
for (var j = 0; j < rules.length; ++j) {
|
for (var j = 0; j < rules.length; ++j) {
|
||||||
var rule = rules[j];
|
var rule = rules[j];
|
||||||
if (rule.type === CSSRule.MEDIA_RULE) {
|
if (rule.type === CSSRule.MEDIA_RULE) {
|
||||||
media.push(rule.cssText);
|
media.push(rule.cssText);
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
var srcsetElems = $wbwindow.document.querySelectorAll('img[srcset]');
|
}
|
||||||
for (i = 0; i < srcsetElems.length; i++) {
|
var srcsetElems = $wbwindow.document.querySelectorAll('img[srcset]');
|
||||||
var srcsetElem = srcsetElems[i];
|
for (i = 0; i < srcsetElems.length; i++) {
|
||||||
if (wb_getAttribute) {
|
var ssv = {tagSrc: srcsetElems[i].src};
|
||||||
srcset.push(wb_getAttribute.call(srcsetElem, 'srcset'));
|
if (wb_getAttribute) {
|
||||||
} else {
|
ssv.srcset = wb_getAttribute.call(srcsetElems[i], 'srcset');
|
||||||
srcset.push(srcsetElem.getAttribute('srcset'));
|
} else {
|
||||||
}
|
ssv.srcset = srcsetElems[i].getAttribute('srcset');
|
||||||
}
|
}
|
||||||
this.postMessage({
|
srcset.push(ssv);
|
||||||
'type': 'values',
|
}
|
||||||
'media': media,
|
// send the extracted values to the worker deferred
|
||||||
'srcset': {'values': srcset, 'presplit': false},
|
// to ensure the page viewer sees the images first
|
||||||
});
|
this.postMessage({
|
||||||
};
|
'type': 'values',
|
||||||
|
'media': media,
|
||||||
|
'srcset': {'values': srcset, 'presplit': false},
|
||||||
|
'context': {
|
||||||
|
'docBaseURI': $wbwindow.document.baseURI
|
||||||
|
}
|
||||||
|
}, true);
|
||||||
|
};
|
||||||
|
|
||||||
WBAutoFetchWorker = new AutoFetchWorker(wb_abs_prefix, wbinfo.mod);
|
WBAutoFetchWorker = new AutoFetchWorker(wb_abs_prefix, wbinfo.mod);
|
||||||
|
|
||||||
@ -1601,7 +1614,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
|
|||||||
} else if (name == "style") {
|
} else if (name == "style") {
|
||||||
new_value = rewrite_style(value);
|
new_value = rewrite_style(value);
|
||||||
} else if (name == "srcset") {
|
} else if (name == "srcset") {
|
||||||
new_value = rewrite_srcset(value);
|
new_value = rewrite_srcset(value, elem.tagName === 'IMG');
|
||||||
} else {
|
} else {
|
||||||
// Only rewrite if absolute url
|
// Only rewrite if absolute url
|
||||||
if (abs_url_only && !starts_with(value, VALID_PREFIXES)) {
|
if (abs_url_only && !starts_with(value, VALID_PREFIXES)) {
|
||||||
@ -1643,7 +1656,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
//============================================
|
//============================================
|
||||||
function rewrite_srcset(value)
|
function rewrite_srcset(value, isImage)
|
||||||
{
|
{
|
||||||
if (!value) {
|
if (!value) {
|
||||||
return "";
|
return "";
|
||||||
@ -1655,7 +1668,8 @@ var _WBWombat = function($wbwindow, wbinfo) {
|
|||||||
for (var i = 0; i < values.length; i++) {
|
for (var i = 0; i < values.length; i++) {
|
||||||
values[i] = rewrite_url(values[i].trim());
|
values[i] = rewrite_url(values[i].trim());
|
||||||
}
|
}
|
||||||
if (wbUseAFWorker) {
|
|
||||||
|
if (wbUseAFWorker && isImage) {
|
||||||
// send post split values to preservation worker
|
// send post split values to preservation worker
|
||||||
WBAutoFetchWorker.preserveSrcset(values);
|
WBAutoFetchWorker.preserveSrcset(values);
|
||||||
}
|
}
|
||||||
@ -2004,7 +2018,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
|
|||||||
if (mod == "cs_" && orig.indexOf("data:text/css") == 0) {
|
if (mod == "cs_" && orig.indexOf("data:text/css") == 0) {
|
||||||
val = rewrite_inline_style(orig);
|
val = rewrite_inline_style(orig);
|
||||||
} else if (attr == "srcset") {
|
} else if (attr == "srcset") {
|
||||||
val = rewrite_srcset(orig);
|
val = rewrite_srcset(orig, this.tagName === 'IMG');
|
||||||
} else if (this.tagName === 'LINK' && attr === 'href') {
|
} else if (this.tagName === 'LINK' && attr === 'href') {
|
||||||
var relV = this.rel;
|
var relV = this.rel;
|
||||||
if (relV === 'import' || relV === 'preload') {
|
if (relV === 'import' || relV === 'preload') {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user