mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-26 07:49:24 +01:00
wombat: - I: function overrides applied by wombat now better appear to be the original new function name same as originals when possible - I: WombatLocation now looks and behaves more like the original Location interface - I: The custom storage class now looks and behaves more like the original Storage - I: SVG image rewriting has been improved: both the href and xlink:href deprecated since SVG2 now rewritten always - I: document.open now handles the case of creation of a new window - I: Request object rewriting of the readonly href property is now correctly handled - I: EventTarget.addEventListener, removeEventListener overrides now preserve the original this argument of the wrapped listener - A: document.close override to ensure wombat is initialized after write or writeln usage - A: reconstruction of <doctype...> in rewriteHTMLComplete IFF it was included in the original string of HTML - A: document.body setter override to ensure rewriting of the new body or frameset - A: Attr.[value, nodeValue, textContent] added setter override to perform URL rewrites - A: SVGElements rewriting of the filter, style, xlink:href, href, and src attributes - A: HTMLTrackElement rewriting of the src attribute of the - A: HTMLQuoteElement and HTMLModElement rewriting of the cite attribute - A: Worklet.addModule: Loads JS module specified by a URL. - A: HTMLHyperlinkElementUtils overrides to the areaelement - A: ShadowRootoverrides to: innerHTML even though inherites from DocumentFragement and Node it still has innerHTML getter setter. - A: ShadowRoot, Element, DocumentFragment append, prepend: adds strings of HTML or a new Node inherited from ParentNode - A: StylePropertyMap override: New way to access and set CSS properties. - A: Response.redirecthttps rewriting of the URL argument. - A: UIEvent, MouseEvent, TouchEvent, KeyboardEvent, WheelEvent, InputEvent, and CompositionEven constructor and init{even-name} overrides in order to ensure that wombats JS Proxy usage does not affect their defined behaviors - A: XSLTProcessor override to ensure its usage is not affected by wombats JS Proxy usage. - A: navigator.unregisterProtocolHandler: Same override as existing navigator.registerProtocolHandler but from the inverse operation - A: PresentationRequest: Constructor takes a URL or an array of URLs. - A: EventSource and WebSocket override in order to ensure that they do not cause live leaks - A: overrides for the child node interface - Fix: autofetch worker creatation of the backing worker when it is operating within an execution context with a null origin tests: - A: 559 tests specific to wombat and client side rewritting pywb: - Fix: a few broken tests due to iana.org requiring a user agent in its requests rewrite: - introduced a new JSWorkerRewriter class in order to support rewriting via wombat workers in the context of all supported worker variants via - ensured rewriter app correctly sets the static prefix ci: - Modified travis.yml to specifically enumerate jobs documentation: - Documented new wombat, wombat proxy moded, wombat workers auto-fetch: - switched to mutation observer when in proxy mode so that the behaviors can operate in tandem with the autofetcher
304 lines
9.1 KiB
JavaScript
304 lines
9.1 KiB
JavaScript
'use strict';
|
|
// thanks wombat
|
|
var STYLE_REGEX = /(url\s*\(\s*[\\"']*)([^)'"]+)([\\"']*\s*\))/gi;
|
|
var IMPORT_REGEX = /(@import\s+[\\"']*)([^)'";]+)([\\"']*\s*;?)/gi;
|
|
var srcsetSplit = /\s*(\S*\s+[\d.]+[wx]),|(?:\s*,(?:\s+|(?=https?:)))/;
|
|
var DefaultNumImFetches = 30;
|
|
var FullImgQDrainLen = 10;
|
|
var DefaultNumAvFetches = 5;
|
|
var FullAVQDrainLen = 5;
|
|
var DataURLPrefix = 'data:';
|
|
var FetchDelay = 1000;
|
|
// the autofetcher instance for this worker
|
|
var autofetcher = null;
|
|
|
|
function noop() {}
|
|
|
|
if (typeof self.Promise === 'undefined') {
|
|
// not kewl we must polyfill Promise
|
|
self.Promise = function(executor) {
|
|
executor(noop, noop);
|
|
};
|
|
self.Promise.prototype.then = function(cb) {
|
|
if (cb) cb();
|
|
return this;
|
|
};
|
|
self.Promise.prototype.catch = function() {
|
|
return this;
|
|
};
|
|
self.Promise.all = function(values) {
|
|
return new Promise(noop);
|
|
};
|
|
}
|
|
|
|
if (typeof self.fetch === 'undefined') {
|
|
// not kewl we must polyfill fetch.
|
|
self.fetch = function(url) {
|
|
return new Promise(function(resolve) {
|
|
var xhr = new XMLHttpRequest();
|
|
xhr.open('GET', url);
|
|
xhr.send();
|
|
resolve();
|
|
});
|
|
};
|
|
}
|
|
|
|
self.onmessage = function(event) {
|
|
var data = event.data;
|
|
switch (data.type) {
|
|
case 'values':
|
|
autofetcher.autofetchMediaSrcset(data);
|
|
break;
|
|
case 'fetch-all':
|
|
autofetcher.justFetch(data);
|
|
break;
|
|
}
|
|
};
|
|
|
|
function AutoFetcher() {
|
|
if (!(this instanceof AutoFetcher)) {
|
|
return new AutoFetcher();
|
|
}
|
|
// local cache of URLs fetched, to reduce server load
|
|
this.seen = {};
|
|
// array of URLs to be fetched
|
|
this.queue = [];
|
|
this.avQueue = [];
|
|
// should we queue a URL or not
|
|
this.queuing = false;
|
|
// a URL to resolve relative URLs found in the cssText of CSSMedia rules.
|
|
this.currentResolver = null;
|
|
// should we queue a URL or not
|
|
this.queuing = false;
|
|
this.queuingAV = false;
|
|
this.urlExtractor = this.urlExtractor.bind(this);
|
|
this.imgFetchDone = this.imgFetchDone.bind(this);
|
|
this.avFetchDone = this.avFetchDone.bind(this);
|
|
}
|
|
|
|
AutoFetcher.prototype.delay = function() {
|
|
return new Promise(function(resolve, reject) {
|
|
setTimeout(resolve, FetchDelay);
|
|
});
|
|
};
|
|
|
|
AutoFetcher.prototype.imgFetchDone = function() {
|
|
if (this.queue.length > 0) {
|
|
// we have a Q of some length drain it
|
|
var autofetcher = this;
|
|
this.delay().then(function() {
|
|
autofetcher.queuing = false;
|
|
autofetcher.fetchImgs();
|
|
});
|
|
} else {
|
|
this.queuing = false;
|
|
}
|
|
};
|
|
|
|
AutoFetcher.prototype.avFetchDone = function() {
|
|
if (this.avQueue.length > 0) {
|
|
// we have a Q of some length drain it
|
|
var autofetcher = this;
|
|
this.delay().then(function() {
|
|
autofetcher.queuingAV = false;
|
|
autofetcher.fetchAV();
|
|
});
|
|
} else {
|
|
this.queuingAV = false;
|
|
}
|
|
};
|
|
|
|
AutoFetcher.prototype.fetchAV = function() {
|
|
if (this.queuingAV || this.avQueue.length === 0) {
|
|
return;
|
|
}
|
|
// the number of fetches is limited to a maximum of DefaultNumAvFetches + FullAVQDrainLen outstanding fetches
|
|
// the baseline maximum number of fetches is DefaultNumAvFetches but if the size(avQueue) <= FullAVQDrainLen
|
|
// we add them to the current batch. Because audio video resources might be big
|
|
// we limit how many we fetch at a time drastically
|
|
this.queuingAV = true;
|
|
var runningFetchers = [];
|
|
while (
|
|
this.avQueue.length > 0 &&
|
|
runningFetchers.length <= DefaultNumAvFetches
|
|
) {
|
|
runningFetchers.push(fetch(this.avQueue.shift()).catch(noop));
|
|
}
|
|
if (this.avQueue.length <= FullAVQDrainLen) {
|
|
while (this.avQueue.length > 0) {
|
|
runningFetchers.push(fetch(this.avQueue.shift()).catch(noop));
|
|
}
|
|
}
|
|
Promise.all(runningFetchers)
|
|
.then(this.avFetchDone)
|
|
.catch(this.avFetchDone);
|
|
};
|
|
|
|
AutoFetcher.prototype.fetchImgs = function() {
|
|
if (this.queuing || this.queue.length === 0) {
|
|
return;
|
|
}
|
|
// the number of fetches is limited to a maximum of DefaultNumImFetches + FullImgQDrainLen outstanding fetches
|
|
// the baseline maximum number of fetches is DefaultNumImFetches but if the size(queue) <= FullImgQDrainLen
|
|
// we add them to the current batch
|
|
this.queuing = true;
|
|
var runningFetchers = [];
|
|
while (
|
|
this.queue.length > 0 &&
|
|
runningFetchers.length <= DefaultNumImFetches
|
|
) {
|
|
runningFetchers.push(fetch(this.queue.shift()).catch(noop));
|
|
}
|
|
if (this.queue.length <= FullImgQDrainLen) {
|
|
while (this.queue.length > 0) {
|
|
runningFetchers.push(fetch(this.queue.shift()).catch(noop));
|
|
}
|
|
}
|
|
Promise.all(runningFetchers)
|
|
.then(this.imgFetchDone)
|
|
.catch(this.imgFetchDone);
|
|
};
|
|
|
|
AutoFetcher.prototype.queueNonAVURL = function(url) {
|
|
// ensure we do not request data urls
|
|
if (url.indexOf(DataURLPrefix) === 0) return;
|
|
// check to see if we have seen this url before in order
|
|
// to lessen the load against the server content is fetched from
|
|
if (this.seen[url] != null) return;
|
|
this.seen[url] = true;
|
|
this.queue.push(url);
|
|
};
|
|
|
|
AutoFetcher.prototype.queueAVURL = function(url) {
|
|
// ensure we do not request data urls
|
|
if (url.indexOf(DataURLPrefix) === 0) return;
|
|
// check to see if we have seen this url before in order
|
|
// to lessen the load against the server content is fetched from
|
|
if (this.seen[url] != null) return;
|
|
this.seen[url] = true;
|
|
this.avQueue.push(url);
|
|
};
|
|
|
|
AutoFetcher.prototype.safeResolve = function(url, resolver) {
|
|
// Guard against the exception thrown by the URL constructor if the URL or resolver is bad
|
|
// if resolver is undefined/null then this function passes url through
|
|
var resolvedURL = url;
|
|
if (resolver) {
|
|
try {
|
|
resolvedURL = new URL(url, resolver).href;
|
|
} catch (e) {
|
|
resolvedURL = url;
|
|
}
|
|
}
|
|
return resolvedURL;
|
|
};
|
|
|
|
AutoFetcher.prototype.urlExtractor = function(
|
|
match,
|
|
n1,
|
|
n2,
|
|
n3,
|
|
offset,
|
|
string
|
|
) {
|
|
// Same function as style_replacer in wombat.rewrite_style, n2 is our URL
|
|
// this.currentResolver is set to the URL which the browser would normally
|
|
// resolve relative urls with (URL of the stylesheet) in an exceptionless manner
|
|
// (resolvedURL will be undefined if an error occurred)
|
|
var resolvedURL = this.safeResolve(n2, this.currentResolver);
|
|
if (resolvedURL) {
|
|
this.queueNonAVURL(resolvedURL);
|
|
}
|
|
return n1 + n2 + n3;
|
|
};
|
|
|
|
AutoFetcher.prototype.extractMedia = function(mediaRules) {
|
|
// this is a broken down rewrite_style
|
|
if (mediaRules == null) return;
|
|
for (var i = 0; i < mediaRules.length; i++) {
|
|
// set currentResolver to the value of this stylesheets URL, done to ensure we do not have to
|
|
// create functions on each loop iteration because we potentially create a new `URL` object
|
|
// twice per iteration
|
|
this.currentResolver = mediaRules[i].resolve;
|
|
mediaRules[i].cssText
|
|
.replace(STYLE_REGEX, this.urlExtractor)
|
|
.replace(IMPORT_REGEX, this.urlExtractor);
|
|
}
|
|
};
|
|
|
|
AutoFetcher.prototype.extractSrcset = function(srcsets) {
|
|
// preservation worker in proxy mode sends us the value of the srcset attribute of an element
|
|
// and a URL to correctly resolve relative URLS. Thus we must recreate rewrite_srcset logic here
|
|
if (srcsets == null) return;
|
|
var length = srcsets.length;
|
|
var extractedSrcSet, srcsetValue, ssSplit, j;
|
|
for (var i = 0; i < length; i++) {
|
|
extractedSrcSet = srcsets[i];
|
|
ssSplit = extractedSrcSet.srcset.split(srcsetSplit);
|
|
console.log(ssSplit);
|
|
for (j = 0; j < ssSplit.length; j++) {
|
|
if (ssSplit[j]) {
|
|
srcsetValue = ssSplit[j].trim();
|
|
if (srcsetValue.length > 0) {
|
|
// resolve the URL in an exceptionless manner (resolvedURL will be undefined if an error occurred)
|
|
var resolvedURL = this.safeResolve(
|
|
srcsetValue.split(' ')[0],
|
|
extractedSrcSet.resolve
|
|
);
|
|
if (resolvedURL) {
|
|
if (extractedSrcSet.mod === 'im_') {
|
|
this.queueNonAVURL(resolvedURL);
|
|
} else {
|
|
this.queueAVURL(resolvedURL);
|
|
}
|
|
} else {
|
|
console.log(resolvedURL);
|
|
}
|
|
} else {
|
|
console.log(srcsetValue);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
};
|
|
|
|
AutoFetcher.prototype.extractSrc = function(srcVals) {
|
|
// preservation worker in proxy mode sends us the value of the srcset attribute of an element
|
|
// and a URL to correctly resolve relative URLS. Thus we must recreate rewrite_srcset logic here
|
|
if (srcVals == null || srcVals.length === 0) return;
|
|
var length = srcVals.length;
|
|
var srcVal;
|
|
for (var i = 0; i < length; i++) {
|
|
srcVal = srcVals[i];
|
|
var resolvedURL = this.safeResolve(srcVal.src, srcVal.resolve);
|
|
if (resolvedURL) {
|
|
if (srcVal.mod === 'im_') {
|
|
this.queueNonAVURL(resolvedURL);
|
|
} else {
|
|
this.queueAVURL(resolvedURL);
|
|
}
|
|
}
|
|
}
|
|
};
|
|
|
|
AutoFetcher.prototype.autofetchMediaSrcset = function(data) {
|
|
// we got a message and now we autofetch!
|
|
// these calls turn into no ops if they have no work
|
|
this.extractMedia(data.media);
|
|
this.extractSrcset(data.srcset);
|
|
this.extractSrc(data.src);
|
|
this.fetchImgs();
|
|
this.fetchAV();
|
|
};
|
|
|
|
AutoFetcher.prototype.justFetch = function(data) {
|
|
// we got a message containing only urls to be fetched
|
|
if (data == null || data.values == null) return;
|
|
for (var i = 0; i < data.values.length; ++i) {
|
|
this.queueNonAVURL(data.values[i]);
|
|
}
|
|
this.fetchImgs();
|
|
};
|
|
|
|
autofetcher = new AutoFetcher();
|