mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
autoFetchWorker.js improvements: (#397)
- ensured that autoFetchWorker uses full srcset URLs - resolves the URL against the img.src or document.baseURI if not rewritten - otherwise ensures the rewritten URL is not relative or schemeless wombat.js: - AutoFetchWorker updated extractFromLocalDoc to send URL resolution information to the worker - defer extractFromLocalDoc and preserveSrcset postMessages to ensure page viewer can see the images first
This commit is contained in:
parent
a9e4b5c469
commit
82f2dace64
@ -50,7 +50,6 @@ function AutoFetcher(init) {
|
||||
if (!(this instanceof AutoFetcher)) {
|
||||
return new AutoFetcher(init);
|
||||
}
|
||||
this.proxyMode = init.proxyMode;
|
||||
this.prefix = init.prefix;
|
||||
this.mod = init.mod;
|
||||
this.prefixMod = init.prefix + init.mod;
|
||||
@ -88,14 +87,13 @@ AutoFetcher.prototype.fixupURL = function (url) {
|
||||
};
|
||||
|
||||
AutoFetcher.prototype.safeFetch = function (url) {
|
||||
var fixedURL = this.fixupURL(url);
|
||||
// check to see if we have seen this url before in order
|
||||
// to lessen the load against the server content is fetched from
|
||||
if (this.seen[url] != null) return;
|
||||
this.seen[url] = true;
|
||||
if (this.queuing) {
|
||||
// we are currently waiting for a batch of fetches to complete
|
||||
return this.queue.push(fixedURL);
|
||||
return this.queue.push(url);
|
||||
}
|
||||
// fetch this url
|
||||
this.fetches.push(fetch(url));
|
||||
@ -103,7 +101,7 @@ AutoFetcher.prototype.safeFetch = function (url) {
|
||||
|
||||
AutoFetcher.prototype.urlExtractor = function (match, n1, n2, n3, offset, string) {
|
||||
// Same function as style_replacer in wombat.rewrite_style, n2 is our URL
|
||||
this.safeFetch(n2);
|
||||
this.safeFetch(this.fixupURL(n2));
|
||||
return n1 + n2 + n3;
|
||||
};
|
||||
|
||||
@ -154,27 +152,79 @@ AutoFetcher.prototype.extractMedia = function (mediaRules) {
|
||||
}
|
||||
};
|
||||
|
||||
AutoFetcher.prototype.extractSrcset = function (srcsets) {
|
||||
/**
 * Tries to turn a rewritten URL that begins with a partial form of the
 * replay prefix into one that begins with the full `this.prefix`.
 *
 * Two partial forms are checked, in order: `this.relative` and
 * `this.schemeless` (presumably the path-only and `//host/path` variants of
 * the prefix; they are set outside this block -- confirm in the constructor).
 *
 * @param {string} url - the URL to inspect
 * @returns {?string} the URL with its leading partial prefix replaced by
 *   `this.prefix`, or null when the URL starts with neither partial form
 */
AutoFetcher.prototype.maybeFixUpRelSchemelessPrefix = function (url) {
    var partialPrefixes = [this.relative, this.schemeless];
    for (var k = 0; k < partialPrefixes.length; k++) {
        var partial = partialPrefixes[k];
        // only a match at the very start of the URL counts
        if (url.indexOf(partial) === 0) {
            return url.replace(partial, this.prefix);
        }
    }
    return null;
};
|
||||
|
||||
/**
 * Resolves `url` against `base` using the URL constructor.
 *
 * @param {string} url - the (possibly relative) URL to resolve
 * @param {string} [base] - the base URL to resolve against
 * @returns {?string} the resolved absolute URL (its `href`), or null when the
 *   URL constructor throws (e.g. a relative `url` with a missing/invalid base)
 */
AutoFetcher.prototype.maybeResolveURL = function (url, base) {
    var resolved = null;
    try {
        resolved = new URL(url, base).href;
    } catch (ignored) {
        // resolution failure is an expected outcome here; signal it with null
        resolved = null;
    }
    return resolved;
};
|
||||
|
||||
|
||||
/**
 * Best-effort conversion of a single srcset URL into a full URL that starts
 * with the replay prefix, so that fetching it has the best chance of success.
 *
 * Strategy, first match wins:
 *  1. the URL already starts with `this.prefix`: returned unchanged
 *  2. the URL starts with a partial (relative / schemeless) form of the
 *     prefix: fixed via `maybeFixUpRelSchemelessPrefix`
 *  3. the URL resolves against the tag's `src`: prefix + 'im_/' + resolved
 *  4. the URL resolves against the originating document's base URI:
 *     prefix + 'im_/' + resolved
 *  5. otherwise: `this.prefixMod + '/' + url`
 *
 * @param {string} url - the srcset URL to fix up
 * @param {string} [tagSrc] - the `src` of the element the srcset came from
 * @param {{docBaseURI: string}} [context] - URL resolution information sent
 *   with the message; may be absent, in which case step 4 is attempted
 *   without a base instead of throwing
 * @returns {string} the fixed-up URL
 */
AutoFetcher.prototype.fixupURLSrcSet = function (url, tagSrc, context) {
    // already a full, rewritten URL: nothing to do
    if (url.indexOf(this.prefix) === 0) {
        return url;
    }
    // first check for / (relative) or // (schemeless) rewritten urls
    var maybeFixed = this.maybeFixUpRelSchemelessPrefix(url);
    if (maybeFixed != null) {
        return maybeFixed;
    }
    // resolve URL against tag src
    maybeFixed = this.maybeResolveURL(url, tagSrc);
    if (maybeFixed != null) {
        return this.prefix + 'im_/' + maybeFixed;
    }
    // last attempt: resolve against the originating document's base URI.
    // The context is optional, so guard the property access rather than
    // letting a missing context abort the whole srcset extraction.
    var docBaseURI = context != null ? context.docBaseURI : undefined;
    maybeFixed = this.maybeResolveURL(url, docBaseURI);
    if (maybeFixed != null) {
        return this.prefix + 'im_/' + maybeFixed;
    }
    // not much to do now: fall back to prefixing the raw value
    return this.prefixMod + '/' + url;
};
|
||||
|
||||
AutoFetcher.prototype.extractSrcset = function (srcsets, context) {
|
||||
if (srcsets == null || srcsets.values == null) return;
|
||||
var srcsetValues = srcsets.values;
|
||||
// was srcsets from rewrite_srcset and if so no need to split
|
||||
var presplit = srcsets.presplit;
|
||||
if (!srcsets.presplit) {
|
||||
// was from extract from local doc so we need to duplicate work
|
||||
return this.srcsetNotPreSplit(srcsetValues, context);
|
||||
}
|
||||
// was rewrite_srcset so just ensure we just
|
||||
for (var i = 0; i < srcsetValues.length; i++) {
|
||||
var srcset = srcsetValues[i];
|
||||
if (presplit) {
|
||||
// was rewrite_srcset so just ensure we just
|
||||
// grab the URL not width/height key
|
||||
this.safeFetch(srcsetValues[i].split(' ')[0]);
|
||||
}
|
||||
};
|
||||
|
||||
AutoFetcher.prototype.srcsetNotPreSplit = function (values, context) {
|
||||
// was from extract from local doc so we need to duplicate work
|
||||
var j;
|
||||
for (var i = 0; i < values.length; i++) {
|
||||
var srcsetValues = values[i].srcset.split(srcsetSplit);
|
||||
var tagSrc = values[i].tagSrc;
|
||||
for (j = 0; j < srcsetValues.length; j++) {
|
||||
// grab the URL not width/height key
|
||||
this.safeFetch(srcset.split(' ')[0]);
|
||||
} else {
|
||||
// was from extract from local doc so we need to duplicate work
|
||||
var values = srcset.split(srcsetSplit);
|
||||
for (var j = 0; j < values.length; j++) {
|
||||
if (Boolean(values[j])) {
|
||||
var value = values[j].trim();
|
||||
if (value.length > 0) {
|
||||
this.safeFetch(value.split(' ')[0]);
|
||||
}
|
||||
}
|
||||
if (Boolean(srcsetValues[j])) {
|
||||
var value = srcsetValues[j].trim().split(' ')[0];
|
||||
this.safeFetch(this.fixupURLSrcSet(value, tagSrc, context));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -184,7 +234,7 @@ AutoFetcher.prototype.autofetchMediaSrcset = function (data) {
|
||||
// we got a message and now we autofetch!
|
||||
// these calls turn into no ops if they have no work
|
||||
this.extractMedia(data.media);
|
||||
this.extractSrcset(data.srcset);
|
||||
this.extractSrcset(data.srcset, data.context);
|
||||
this.fetchAll();
|
||||
};
|
||||
|
||||
|
@ -1151,7 +1151,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
|
||||
} else if (lowername == "style") {
|
||||
value = rewrite_style(value);
|
||||
} else if (lowername == "srcset") {
|
||||
value = rewrite_srcset(value);
|
||||
value = rewrite_srcset(value, this.tagName === 'IMG');
|
||||
}
|
||||
}
|
||||
orig_setAttribute.call(this, name, value);
|
||||
@ -1403,16 +1403,23 @@ var _WBWombat = function($wbwindow, wbinfo) {
|
||||
this.worker.terminate();
|
||||
};
|
||||
|
||||
AutoFetchWorker.prototype.postMessage = function (msg) {
|
||||
/**
 * Forwards a message to the underlying worker (`this.worker`).
 *
 * @param {*} msg - the message to hand to the worker's postMessage
 * @param {boolean} [deferred] - when truthy the message is sent from a
 *   resolved-promise callback instead of synchronously
 * @returns {Promise|undefined} a promise that settles after the deferred send,
 *   or undefined when the message was sent immediately
 */
AutoFetchWorker.prototype.postMessage = function (msg, deferred) {
    if (!deferred) {
        // immediate delivery
        this.worker.postMessage(msg);
        return;
    }
    // deferred delivery: the worker is looked up when the callback runs
    var afw = this;
    return Promise.resolve().then(function () {
        afw.worker.postMessage(msg);
    });
};
|
||||
|
||||
AutoFetchWorker.prototype.preserveSrcset = function (srcset) {
|
||||
// send values from rewrite_srcset to the worker
|
||||
// send values from rewrite_srcset to the worker deferred
|
||||
// to ensure the page viewer sees the images first
|
||||
this.postMessage({
|
||||
'type': 'values',
|
||||
'srcset': {'values': srcset, 'presplit': true},
|
||||
});
|
||||
}, true);
|
||||
};
|
||||
|
||||
AutoFetchWorker.prototype.preserveMedia = function (media) {
|
||||
@ -1421,36 +1428,42 @@ var _WBWombat = function($wbwindow, wbinfo) {
|
||||
};
|
||||
|
||||
AutoFetchWorker.prototype.extractFromLocalDoc = function () {
|
||||
// get the values to be preserved from the documents stylesheets
|
||||
// and all elements with a srcset
|
||||
var media = [];
|
||||
var srcset = [];
|
||||
var sheets = $wbwindow.document.styleSheets;
|
||||
var i = 0;
|
||||
for (; i < sheets.length; ++i) {
|
||||
var rules = sheets[i].cssRules;
|
||||
for (var j = 0; j < rules.length; ++j) {
|
||||
var rule = rules[j];
|
||||
if (rule.type === CSSRule.MEDIA_RULE) {
|
||||
media.push(rule.cssText);
|
||||
}
|
||||
// get the values to be preserved from the documents stylesheets
|
||||
// and all elements with a srcset
|
||||
var media = [];
|
||||
var srcset = [];
|
||||
var sheets = $wbwindow.document.styleSheets;
|
||||
var i = 0;
|
||||
for (; i < sheets.length; ++i) {
|
||||
var rules = sheets[i].cssRules;
|
||||
for (var j = 0; j < rules.length; ++j) {
|
||||
var rule = rules[j];
|
||||
if (rule.type === CSSRule.MEDIA_RULE) {
|
||||
media.push(rule.cssText);
|
||||
}
|
||||
}
|
||||
var srcsetElems = $wbwindow.document.querySelectorAll('img[srcset]');
|
||||
for (i = 0; i < srcsetElems.length; i++) {
|
||||
var srcsetElem = srcsetElems[i];
|
||||
if (wb_getAttribute) {
|
||||
srcset.push(wb_getAttribute.call(srcsetElem, 'srcset'));
|
||||
} else {
|
||||
srcset.push(srcsetElem.getAttribute('srcset'));
|
||||
}
|
||||
}
|
||||
var srcsetElems = $wbwindow.document.querySelectorAll('img[srcset]');
|
||||
for (i = 0; i < srcsetElems.length; i++) {
|
||||
var ssv = {tagSrc: srcsetElems[i].src};
|
||||
if (wb_getAttribute) {
|
||||
ssv.srcset = wb_getAttribute.call(srcsetElems[i], 'srcset');
|
||||
} else {
|
||||
ssv.srcset = srcsetElems[i].getAttribute('srcset');
|
||||
}
|
||||
this.postMessage({
|
||||
'type': 'values',
|
||||
'media': media,
|
||||
'srcset': {'values': srcset, 'presplit': false},
|
||||
});
|
||||
};
|
||||
srcset.push(ssv);
|
||||
}
|
||||
// send the extracted values to the worker deferred
|
||||
// to ensure the page viewer sees the images first
|
||||
this.postMessage({
|
||||
'type': 'values',
|
||||
'media': media,
|
||||
'srcset': {'values': srcset, 'presplit': false},
|
||||
'context': {
|
||||
'docBaseURI': $wbwindow.document.baseURI
|
||||
}
|
||||
}, true);
|
||||
};
|
||||
|
||||
WBAutoFetchWorker = new AutoFetchWorker(wb_abs_prefix, wbinfo.mod);
|
||||
|
||||
@ -1601,7 +1614,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
|
||||
} else if (name == "style") {
|
||||
new_value = rewrite_style(value);
|
||||
} else if (name == "srcset") {
|
||||
new_value = rewrite_srcset(value);
|
||||
new_value = rewrite_srcset(value, elem.tagName === 'IMG');
|
||||
} else {
|
||||
// Only rewrite if absolute url
|
||||
if (abs_url_only && !starts_with(value, VALID_PREFIXES)) {
|
||||
@ -1643,7 +1656,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
|
||||
}
|
||||
|
||||
//============================================
|
||||
function rewrite_srcset(value)
|
||||
function rewrite_srcset(value, isImage)
|
||||
{
|
||||
if (!value) {
|
||||
return "";
|
||||
@ -1655,7 +1668,8 @@ var _WBWombat = function($wbwindow, wbinfo) {
|
||||
for (var i = 0; i < values.length; i++) {
|
||||
values[i] = rewrite_url(values[i].trim());
|
||||
}
|
||||
if (wbUseAFWorker) {
|
||||
|
||||
if (wbUseAFWorker && isImage) {
|
||||
// send post split values to preservation worker
|
||||
WBAutoFetchWorker.preserveSrcset(values);
|
||||
}
|
||||
@ -2004,7 +2018,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
|
||||
if (mod == "cs_" && orig.indexOf("data:text/css") == 0) {
|
||||
val = rewrite_inline_style(orig);
|
||||
} else if (attr == "srcset") {
|
||||
val = rewrite_srcset(orig);
|
||||
val = rewrite_srcset(orig, this.tagName === 'IMG');
|
||||
} else if (this.tagName === 'LINK' && attr === 'href') {
|
||||
var relV = this.rel;
|
||||
if (relV === 'import' || relV === 'preload') {
|
||||
|
Loading…
x
Reference in New Issue
Block a user