1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00
- Fix: a few broken tests due to iana.org requiring a user agent in its requests
rewrite:
  - introduced a new JSWorkerRewriter class in order to support rewriting via wombat workers in the context of all supported worker variants via
  - ensured rewriter app correctly sets the static prefix
wombat:
 - add wombat as submodule!
This commit is contained in:
John Berlin 2019-05-15 14:42:51 -04:00 committed by Ilya Kreymer
parent 77f8bb6476
commit 22b4297fc5
25 changed files with 709 additions and 5675 deletions

3
.gitmodules vendored Normal file
View File

@ -0,0 +1,3 @@
[submodule "wombat"]
path = wombat
url = https://github.com/webrecorder/wombat

View File

@ -1,4 +0,0 @@
NODE_BIN_DIR=../node_modules/.bin
test:
$(NODE_BIN_DIR)/karma start --single-run

View File

@ -1,9 +0,0 @@
<html>
<head><meta charset="UTF-8"></head>
<body>
<!-- This is a dummy page used in
tests of Wombat's live-rewriting
functionality.
!-->
</body>
</html>

View File

@ -1,108 +0,0 @@
// Options handed to the Sauce Labs launcher. When TRAVIS_JOB_NUMBER is set,
// the launcher is told not to start its own tunnel and to use the tunnel
// identified by that job number instead.
// see https://github.com/karma-runner/karma-sauce-launcher/issues/73
var sauceLabsConfig = process.env.TRAVIS_JOB_NUMBER
  ? {
    testName: 'pywb Client Tests',
    startConnect: false,
    tunnelIdentifier: process.env.TRAVIS_JOB_NUMBER,
  }
  : { testName: 'pywb Client Tests' };

// Path (relative to the Karma basePath) of the script under test.
var WOMBAT_JS_PATH = 'pywb/static/wombat.js';

// Browsers run remotely through Sauce Labs, keyed by launcher name.
var sauceLaunchers = (function () {
  // every remote launcher shares the same base and differs by browser name
  function onSauceLabs(browserName) {
    return { base: 'SauceLabs', browserName: browserName };
  }

  // Safari is the only entry pinned to a platform and version
  var safari = onSauceLabs('safari');
  safari.platform = 'OS X 10.11';
  safari.version = '9.0';

  return {
    sl_chrome: onSauceLabs('chrome'),
    sl_firefox: onSauceLabs('firefox'),
    sl_safari: safari,
    sl_edge: onSauceLabs('MicrosoftEdge'),
  };
})();

// Browsers started on the local machine, keyed by launcher name.
var localLaunchers = { localFirefox: { base: 'Firefox' } };
// Choose the set of browsers Karma will drive: the Sauce Labs launchers when
// both SAUCE_USERNAME and SAUCE_ACCESS_KEY are present in the environment,
// otherwise only the local launchers (with a warning explaining why).
var customLaunchers = {};
if (process.env['SAUCE_USERNAME'] && process.env['SAUCE_ACCESS_KEY']) {
  customLaunchers = sauceLaunchers;
} else {
  // every fragment ends with its own separator so that the concatenated
  // message does not run two sentences together
  console.error('Sauce Labs account details not set, ' +
    'Karma tests will be run only against local browsers. ' +
    'Set SAUCE_USERNAME and SAUCE_ACCESS_KEY environment variables to ' +
    'run tests against Sauce Labs browsers');
  customLaunchers = localLaunchers;
}
module.exports = function(config) {
config.set({
basePath: '../',
frameworks: ['mocha', 'chai'],
files: [
{
pattern: WOMBAT_JS_PATH,
watched: true,
included: false,
served: true,
},
{
pattern: 'karma-tests/dummy.html',
included: false,
served: true,
},
'karma-tests/*.spec.js',
],
preprocessors: {},
reporters: ['progress'],
port: 9876,
colors: true,
logLevel: config.LOG_INFO,
autoWatch: true,
sauceLabs: sauceLabsConfig,
// Set extended timeouts to account for the slowness
// in connecting to remote browsers (eg. when using
// Sauce Labs)
//
// See https://oligofren.wordpress.com/2014/05/27/running-karma-tests-on-browserstack/
captureTimeout: 3 * 60000,
browserNoActivityTimeout: 30 * 1000,
browserDisconnectTimeout: 10 * 1000,
browserDisconnectTolerance: 1,
customLaunchers: customLaunchers,
browsers: Object.keys(customLaunchers),
singleRun: false,
concurrency: Infinity
})
};

View File

@ -1,225 +0,0 @@
var DEFAULT_TIMEOUT = 20000;

/**
 * Creates a new document in an <iframe> and runs a WombatJS test case in it.
 *
 * A new <iframe> is used for each test so that each case is run with fresh
 * Document and Window objects, since Wombat monkey-patches many Document and
 * Window functions.
 *
 * The completion callback is invoked exactly once: with no argument when every
 * step succeeds, or with an Error when a step throws or when the frame reports
 * an error through reportError. The <iframe> is removed from the document
 * whether the steps succeed or fail.
 *
 * @param {Object} testCase - the test description
 * @param {Function} testCase.initScript - evaluated in the frame before Wombat
 *   is loaded (the specs use it to define wbinfo)
 * @param {string} testCase.wombatScript - source of wombat.js, evaluated in the frame
 * @param {string} [testCase.html] - markup assigned to the frame's body
 * @param {Function} [testCase.testScript] - evaluated in the frame last
 * @param {Function} done - mocha-style completion callback
 */
function runWombatTest(testCase, done) {
  // create an <iframe>
  var testFrame = document.createElement('iframe');
  testFrame.src = '/base/karma-tests/dummy.html';
  document.body.appendChild(testFrame);

  // guards against calling done() more than once, e.g. when the frame
  // reports an error and the remaining steps still run to completion
  var settled = false;
  function settle(err) {
    if (settled) {
      return;
    }
    settled = true;
    if (err) {
      done(err);
    } else {
      done();
    }
  }

  testFrame.contentWindow.addEventListener('load', function () {
    var testDocument = testFrame.contentDocument;
    var failure = null;

    // NOTE: func is serialized with toString() and evaluated in the frame,
    // so it must not rely on variables captured from this scope
    function runFunctionInIFrame(func) {
      testFrame.contentWindow.eval('(' + func.toString() + ')()');
    }

    // expose an error reporting function to the <iframe>
    window.reportError = function (ex) {
      settle(new Error(ex));
    };

    // expose utility methods for assertion testing in tests.
    // (We used to expose chai asserts here but Karma's default
    // error reporter replaces URLs in exception messages with
    // the corresponding file paths, which is unhelpful for us
    // since assert.equal() will often be called with URLs in our tests)
    window.assert = {
      equal: function (a, b) {
        if (a !== b) {
          console.error('Mismatch between', a, 'and', b);
          throw new Error('AssertionError');
        }
      }
    };

    try {
      runFunctionInIFrame(function () {
        // re-assign the iframe's console object to the parent window's
        // console so that messages are intercepted by Karma
        // and output to wherever it is configured to send
        // console logs (typically stdout)
        console = window.parent.console;

        window.onerror = function (message, url, line, col, error) {
          if (error) {
            console.log(error.stack);
          }
          reportError(new Error(message));
        };

        // expose the parent's assertion helpers to the test script
        window.assert = window.parent.assert;
        window.reportError = window.parent.reportError;

        // helpers which check whether DOM property overrides are supported
        // in the current browser
        window.domTests = {
          areDOMPropertiesConfigurable: function () {
            var descriptor = Object.getOwnPropertyDescriptor(Node.prototype, 'baseURI');
            if (descriptor && !descriptor.configurable) {
              return false;
            } else {
              return true;
            }
          }
        };
      });

      try {
        runFunctionInIFrame(testCase.initScript);
      } catch (e) {
        throw new Error('Configuring Wombat failed: ' + e.toString());
      }

      try {
        testFrame.contentWindow.eval(testCase.wombatScript);
        runFunctionInIFrame(function () {
          new window._WBWombat(window, wbinfo);
        });
      } catch (e) {
        console.error(e.stack);
        throw new Error('Initializing WombatJS failed: ' + e.toString());
      }

      if (testCase.html) {
        testDocument.body.innerHTML = testCase.html;
      }

      if (testCase.testScript) {
        try {
          runFunctionInIFrame(testCase.testScript);
        } catch (e) {
          throw new Error('Test script failed: ' + e.toString());
        }
      }
    } catch (e) {
      // hand the failure to the callback instead of letting it escape the
      // event listener, where it would only surface as an uncaught error
      failure = e;
    } finally {
      // never leave a frame behind, even when a step failed
      testFrame.remove();
    }

    settle(failure);
  });
}
// Mocha suite for WombatJS. Every case is handed to runWombatTest(), which
// evaluates the case's initScript, the wombat.js source, and the optional
// testScript inside a fresh <iframe>.
// NOTE(review): initScript/testScript are serialized with toString() before
// being evaluated in the frame, so they presumably must stay self-contained
// (no references to variables declared in this file) - confirm before refactoring.
describe('WombatJS', function () {
// apply the extended timeout to every test in this suite
this.timeout(DEFAULT_TIMEOUT);
// source text of wombat.js, filled in once by the before() hook below
var wombatScript;
before(function (done) {
// load the source of the WombatJS content
// rewriting script
var req = new XMLHttpRequest();
req.open('GET', '/base/pywb/static/wombat.js');
req.onload = function () {
wombatScript = req.responseText;
done();
};
req.send();
});
// smoke test: no html and no testScript, so this passes when the init script
// and Wombat's constructor run without throwing
it('should load', function (done) {
runWombatTest({
initScript: function () {
wbinfo = {
wombat_opts: {},
wombat_ts: '',
is_live: false,
top_url: ''
};
},
wombatScript: wombatScript,
}, done);
});
describe('anchor rewriting', function () {
// shared test case, rebuilt before each test; the individual tests only
// attach their own testScript to it
var config;
beforeEach(function () {
config = {
initScript: function () {
wbinfo = {
wombat_opts: {},
wombat_scheme: 'http',
prefix: window.location.origin,
wombat_ts: '',
is_live: false,
top_url: ''
};
},
wombatScript: wombatScript,
html: '<a href="foobar.html" id="link">A link</a>',
};
});
// the assertion is only made when the browser allows DOM property overrides
// (see domTests.areDOMPropertiesConfigurable in runWombatTest)
it('should rewrite links in dynamically injected <a> tags', function (done) {
config.testScript = function () {
if (domTests.areDOMPropertiesConfigurable()) {
var link = document.getElementById('link');
assert.equal(link.href, 'http:///base/karma-tests/foobar.html');
}
};
runWombatTest(config, done);
});
// the link's string conversion must agree with its href property
it('toString() should return the rewritten URL', function (done) {
config.testScript = function () {
if (domTests.areDOMPropertiesConfigurable()) {
var link = document.getElementById('link');
assert.equal(link.href, link.toString());
}
};
runWombatTest(config, done);
});
});
describe('base URL overrides', function () {
// baseURI must always be a string; its exact value is only checked when DOM
// property overrides are supported by the browser
it('document.baseURI should return the original URL', function (done) {
runWombatTest({
initScript: function () {
wbinfo = {
wombat_opts: {},
prefix: window.location.origin,
wombat_ts: '',
wombat_scheme: 'http',
is_live: false,
top_url: ''
};
},
wombatScript: wombatScript,
testScript: function () {
var baseURI = document.baseURI;
if (typeof baseURI !== 'string') {
throw new Error('baseURI is not a string');
}
if (domTests.areDOMPropertiesConfigurable()) {
assert.equal(baseURI, 'http:///base/karma-tests/dummy.html');
}
},
}, done);
});
// assigning href on a freshly created <base> element must read back unchanged
// (this case configures no prefix in wbinfo)
it('should allow base.href to be assigned', function (done) {
runWombatTest({
initScript: function () {
wbinfo = {
wombat_opts: {},
wombat_scheme: 'http',
is_live: false,
top_url: ''
};
},
wombatScript: wombatScript,
testScript: function () {
'use strict';
var baseElement = document.createElement('base');
baseElement.href = 'http://foobar.com/base';
assert.equal(baseElement.href, 'http://foobar.com/base');
},
}, done);
});
});
});

View File

@ -238,7 +238,8 @@ class RewriterApp(object):
host_prefix = self.get_host_prefix(environ) host_prefix = self.get_host_prefix(environ)
rel_prefix = self.get_rel_prefix(environ) rel_prefix = self.get_rel_prefix(environ)
full_prefix = host_prefix + rel_prefix full_prefix = host_prefix + rel_prefix
pywb_static_prefix = environ.get('pywb.host_prefix', '') + environ.get('pywb.app_prefix', '') + environ.get(
'pywb.static_prefix', '/static/')
is_proxy = ('wsgiprox.proxy_host' in environ) is_proxy = ('wsgiprox.proxy_host' in environ)
response = self.handle_custom_response(environ, wb_url, response = self.handle_custom_response(environ, wb_url,
@ -257,7 +258,8 @@ class RewriterApp(object):
urlrewriter = UrlRewriter(wb_url, urlrewriter = UrlRewriter(wb_url,
prefix=full_prefix, prefix=full_prefix,
full_prefix=full_prefix, full_prefix=full_prefix,
rel_prefix=rel_prefix) rel_prefix=rel_prefix,
pywb_static_prefix=pywb_static_prefix)
framed_replay = self.framed_replay framed_replay = self.framed_replay

View File

@ -15,6 +15,8 @@ from pywb.utils.io import StreamIter, BUFF_SIZE
from pywb.utils.loaders import load_yaml_config, load_py_name from pywb.utils.loaders import load_yaml_config, load_py_name
WORKER_MODS = {"wkr_", "sw_"} # type: Set[str]
# ============================================================================ # ============================================================================
class BaseContentRewriter(object): class BaseContentRewriter(object):
@ -423,8 +425,8 @@ class RewriteInfo(object):
def _resolve_text_type(self, text_type): def _resolve_text_type(self, text_type):
mod = self.url_rewriter.wburl.mod mod = self.url_rewriter.wburl.mod
if mod == 'sw_' or mod == 'wkr_': if mod in WORKER_MODS:
return None return 'js-worker'
if text_type == 'css' and mod == 'js_': if text_type == 'css' and mod == 'js_':
text_type = 'css' text_type = 'css'
@ -495,7 +497,7 @@ class RewriteInfo(object):
return True return True
def is_url_rw(self): def is_url_rw(self):
if self.url_rewriter.wburl.mod in ('id_', 'bn_', 'sw_', 'wkr_'): if self.url_rewriter.wburl.mod in ('id_', 'bn_', 'wkrf_'):
return False return False
return True return True

View File

@ -15,6 +15,8 @@ from pywb.rewrite.rewrite_dash import RewriteDASH
from pywb.rewrite.rewrite_hls import RewriteHLS from pywb.rewrite.rewrite_hls import RewriteHLS
from pywb.rewrite.rewrite_amf import RewriteAMF from pywb.rewrite.rewrite_amf import RewriteAMF
from pywb.rewrite.rewrite_js_workers import JSWorkerRewriter
from pywb import DEFAULT_RULES_FILE from pywb import DEFAULT_RULES_FILE
import copy import copy
@ -34,6 +36,7 @@ class DefaultRewriter(BaseContentRewriter):
'js': JSLocationOnlyRewriter, 'js': JSLocationOnlyRewriter,
'js-proxy': JSNoneRewriter, 'js-proxy': JSNoneRewriter,
'js-worker': JSWorkerRewriter,
'json': JSONPRewriter, 'json': JSONPRewriter,

View File

@ -58,7 +58,7 @@ class HTMLRewriterMixin(StreamingRewriter):
'embed': {'src': 'oe_'}, 'embed': {'src': 'oe_'},
'head': {'': defmod}, # for head rewriting 'head': {'': defmod}, # for head rewriting
'iframe': {'src': 'if_'}, 'iframe': {'src': 'if_'},
'image': {'src': 'im_', 'xlink:href': 'im_'}, 'image': {'src': 'im_', 'xlink:href': 'im_', 'href': 'im_'},
'img': {'src': 'im_', 'img': {'src': 'im_',
'srcset': 'im_'}, 'srcset': 'im_'},
'ins': {'cite': defmod}, 'ins': {'cite': defmod},
@ -74,7 +74,7 @@ class HTMLRewriterMixin(StreamingRewriter):
'q': {'cite': defmod}, 'q': {'cite': defmod},
'ref': {'href': 'oe_'}, 'ref': {'href': 'oe_'},
'script': {'src': 'js_', 'xlink:href': 'js_'}, # covers both HTML and SVG script tags 'script': {'src': 'js_', 'xlink:href': 'js_'}, # covers both HTML and SVG script tags
'source': {'src': 'oe_'}, 'source': {'src': 'oe_', 'srcset': 'oe_'},
'video': {'src': 'oe_', 'video': {'src': 'oe_',
'poster': 'im_'}, 'poster': 'im_'},
} }

View File

@ -63,48 +63,59 @@ class RxRules(object):
class JSWombatProxyRules(RxRules): class JSWombatProxyRules(RxRules):
def __init__(self): def __init__(self):
local_init_func = '\nvar {0} = function(name) {{\ local_init_func = '\nvar {0} = function(name) {{\
return (self._wb_wombat && self._wb_wombat.local_init &&\ return (self._wb_wombat && self._wb_wombat.local_init && \
self._wb_wombat.local_init(name)) || self[name]; }};\n\ self._wb_wombat.local_init(name)) || self[name]; }};\n\
if (!self.__WB_pmw) {{ self.__WB_pmw = function(obj) {{ return obj; }} }}\n\ if (!self.__WB_pmw) {{ self.__WB_pmw = function(obj) {{ return obj; }} }}\n\
{{\n' {{\n'
local_check_this_fn = 'var {0} = function (thisObj) {{ \
if (thisObj && thisObj._WB_wombat_obj_proxy) return thisObj._WB_wombat_obj_proxy; return thisObj; }};'
local_init_func_name = '_____WB$wombat$assign$function_____' local_init_func_name = '_____WB$wombat$assign$function_____'
local_var_line = 'let {0} = {1}("{0}");' local_var_line = 'let {0} = {1}("{0}");'
this_rw = '(this && this._WB_wombat_obj_proxy || this)' local_check_this_func_name = '_____WB$wombat$check$this$function_____'
check_loc = '(self.__WB_check_loc && self.__WB_check_loc(location) || {}).href = ' # we must use a function to perform the this check because most minifiers reduce the number of statements
# by turning everything into one or more expressions. Our previous rewrite was a logical expression,
# (this && this._WB_wombat_obj_proxy || this), that would cause the outer expression to be invalid when
# it was used as the LHS of certain expressions,
# e.g. assignment expressions containing a non-parenthesized logical expression.
# By using a function, the injected expression is a call expression that plays nicely in those cases
this_rw = '_____WB$wombat$check$this$function_____(this)'
check_loc = '((self.__WB_check_loc && self.__WB_check_loc(location)) || {}).href = '
self.local_objs = [ self.local_objs = [
'window', 'window',
'self', 'self',
'document', 'document',
'location', 'location',
'top', 'top',
'parent', 'parent',
'frames', 'frames',
'opener'] 'opener'
]
local_declares = '\n'.join([local_var_line.format(obj, local_init_func_name) for obj in self.local_objs]) local_declares = '\n'.join([local_var_line.format(obj, local_init_func_name) for obj in self.local_objs])
prop_str = '|'.join(self.local_objs) prop_str = '|'.join(self.local_objs)
rules = [ rules = [
(r'(?<=\.)postMessage\b\(', self.add_prefix('__WB_pmw(self).'), 0), (r'(?<=\.)postMessage\b\(', self.add_prefix('__WB_pmw(self).'), 0),
(r'(?<![$.])\s*location\b\s*[=]\s*(?![=])', self.add_suffix(check_loc), 0), (r'(?<![$.])\s*location\b\s*[=]\s*(?![=])', self.add_suffix(check_loc), 0),
(r'\breturn\s+this\b\s*(?![.$])', self.replace_str(this_rw), 0), (r'\breturn\s+this\b\s*(?![.$])', self.replace_str(this_rw), 0),
(r'(?<=[\n])\s*this\b(?=(?:\.(?:{0})\b))'.format(prop_str), self.replace_str(';' + this_rw), 0), (r'(?<=[\n])\s*this\b(?=(?:\.(?:{0})\b))'.format(prop_str), self.replace_str(';' + this_rw), 0),
(r'(?<![$.])\s*this\b(?=(?:\.(?:{0})\b))'.format(prop_str), self.replace_str(this_rw), 0), (r'(?<![$.])\s*this\b(?=(?:\.(?:{0})\b))'.format(prop_str), self.replace_str(this_rw), 0),
(r'(?<=[=])\s*this\b\s*(?![.$])', self.replace_str(this_rw), 0), (r'(?<=[=])\s*this\b\s*(?![.$])', self.replace_str(this_rw), 0),
('\}(?:\s*\))?\s*\(this\)', self.replace_str(this_rw), 0), ('\}(?:\s*\))?\s*\(this\)', self.replace_str(this_rw), 0),
(r'(?<=[^|&][|&]{2})\s*this\b\s*(?![|&.$]([^|&]|$))', self.replace_str(this_rw), 0), (r'(?<=[^|&][|&]{2})\s*this\b\s*(?![|&.$]([^|&]|$))', self.replace_str(this_rw), 0),
] ]
super(JSWombatProxyRules, self).__init__(rules) super(JSWombatProxyRules, self).__init__(rules)
self.first_buff = local_init_func.format(local_init_func_name) + local_declares self.first_buff = local_check_this_fn.format(local_check_this_func_name) + local_init_func.format(
local_init_func_name) + local_declares + '\n\n'
self.last_buff = '\n\n}' self.last_buff = '\n\n}'

View File

@ -0,0 +1,30 @@
from pywb.rewrite.content_rewriter import StreamingRewriter, WORKER_MODS
__all__ = ["JSWorkerRewriter"]
INJECT = "(function() { self.importScripts('%s'); new WBWombat(%s); })();"
INIT = "{'prefix': '%s', 'prefixMod': '%s/', 'originalURL': '%s'}"
class JSWorkerRewriter(StreamingRewriter):
    """A simple rewriter for web worker and service worker scripts.

    The only rewriting performed here is the injection of the init code
    for wombatWorkers.js, which allows all of the worker variants to
    operate as they would on the live web.
    """

    def __init__(self, url_rewriter, align_to_line=True, first_buff=''):
        """Initialize a new JSWorkerRewriter

        :param UrlRewriter url_rewriter: The url rewriter for this rewrite
        :param bool align_to_line: Should the response stream be aligned to line boundaries
        :param str first_buff: The first string to be added to the rewrite
        :rtype: None
        """
        super(JSWorkerRewriter, self).__init__(url_rewriter, align_to_line, first_buff)

        target = self.url_rewriter.wburl
        if target.mod not in WORKER_MODS:
            # not a worker modifier: keep whatever first_buff the base class set
            return

        # location of the worker-side wombat script under the static prefix
        worker_script = self.url_rewriter.pywb_static_prefix + "wombatWorkers.js"
        replay_prefix = self.url_rewriter.full_prefix
        # options object passed to the WBWombat constructor inside the worker
        wombat_options = INIT % (replay_prefix, replay_prefix + 'wkrf_', target.url)
        self.first_buff = INJECT % (worker_script, wombat_options)

View File

@ -235,24 +235,22 @@ class TestContentRewriter(object):
def test_rewrite_sw_add_headers(self): def test_rewrite_sw_add_headers(self):
headers = {'Content-Type': 'application/x-javascript'} headers = {'Content-Type': 'application/x-javascript'}
content = 'function() { location.href = "http://example.com/"; }' content = "function() { location.href = 'http://example.com/'; }"
headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701sw_') headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701sw_')
assert ('Content-Type', 'application/x-javascript') in headers.headers assert ('Content-Type', 'application/x-javascript') in headers.headers
assert ('Service-Worker-Allowed', 'http://localhost:8080/prefix/201701mp_/http://example.com/') in headers.headers assert ('Service-Worker-Allowed', 'http://localhost:8080/prefix/201701mp_/http://example.com/') in headers.headers
exp = 'function() { location.href = "http://example.com/"; }' assert "self.importScripts('wombatWorkers.js');" in b''.join(gen).decode('utf-8')
assert b''.join(gen).decode('utf-8') == exp
def test_rewrite_worker(self): def test_rewrite_worker(self):
headers = {'Content-Type': 'application/x-javascript'} headers = {'Content-Type': 'application/x-javascript'}
content = 'importScripts("http://example.com/js.js")' content = "importScripts('http://example.com/js.js')"
rwheaders, gen, is_rw = self.rewrite_record(headers, content, ts='201701wkr_') rwheaders, gen, is_rw = self.rewrite_record(headers, content, ts='201701wkr_')
exp = 'importScripts("http://example.com/js.js")' assert "self.importScripts('wombatWorkers.js');" in b''.join(gen).decode('utf-8')
assert b''.join(gen).decode('utf-8') == exp
def test_banner_only_no_cookie_rewrite(self): def test_banner_only_no_cookie_rewrite(self):
headers = {'Set-Cookie': 'foo=bar; Expires=Wed, 13 Jan 2021 22:23:01 GMT; Path=/', headers = {'Set-Cookie': 'foo=bar; Expires=Wed, 13 Jan 2021 22:23:01 GMT; Path=/',

View File

@ -389,7 +389,7 @@ r"""
# parse attr with js proxy, rewrite location assignment # parse attr with js proxy, rewrite location assignment
>>> parse('<html><a href="javascript:location=\'foo.html\'"></a></html>', js_proxy=True) >>> parse('<html><a href="javascript:location=\'foo.html\'"></a></html>', js_proxy=True)
<html><a href="javascript:{ location=(self.__WB_check_loc && self.__WB_check_loc(location) || {}).href = 'foo.html' }"></a></html> <html><a href="javascript:{ location=((self.__WB_check_loc && self.__WB_check_loc(location)) || {}).href = 'foo.html' }"></a></html>
# parse attr with js proxy, assigning to location.href, no location assignment rewrite needed # parse attr with js proxy, assigning to location.href, no location assignment rewrite needed
>>> parse('<html><a href="javascript:location.href=\'foo.html\'"></a></html>', js_proxy=True) >>> parse('<html><a href="javascript:location.href=\'foo.html\'"></a></html>', js_proxy=True)

View File

@ -131,49 +131,49 @@ r"""
#================================================================= #=================================================================
>>> _test_js_obj_proxy('var foo = this; location = bar') >>> _test_js_obj_proxy('var foo = this; location = bar')
'var foo = (this && this._WB_wombat_obj_proxy || this); location = (self.__WB_check_loc && self.__WB_check_loc(location) || {}).href = bar' 'var foo = _____WB$wombat$check$this$function_____(this); location = ((self.__WB_check_loc && self.__WB_check_loc(location)) || {}).href = bar'
>>> _test_js_obj_proxy('var that = this\n location = bar') >>> _test_js_obj_proxy('var that = this\n location = bar')
'var that = (this && this._WB_wombat_obj_proxy || this)\n location = (self.__WB_check_loc && self.__WB_check_loc(location) || {}).href = bar' 'var that = _____WB$wombat$check$this$function_____(this)\n location = ((self.__WB_check_loc && self.__WB_check_loc(location)) || {}).href = bar'
>>> _test_js_obj_proxy('location = "xyz"') >>> _test_js_obj_proxy('location = "xyz"')
'location = (self.__WB_check_loc && self.__WB_check_loc(location) || {}).href = "xyz"' 'location = ((self.__WB_check_loc && self.__WB_check_loc(location)) || {}).href = "xyz"'
>>> _test_js_obj_proxy('var foo = this.location') >>> _test_js_obj_proxy('var foo = this.location')
'var foo = (this && this._WB_wombat_obj_proxy || this).location' 'var foo = _____WB$wombat$check$this$function_____(this).location'
>>> _test_js_obj_proxy('A = B\nthis.location = "foo"') >>> _test_js_obj_proxy('A = B\nthis.location = "foo"')
'A = B\n;(this && this._WB_wombat_obj_proxy || this).location = "foo"' 'A = B\n;_____WB$wombat$check$this$function_____(this).location = "foo"'
>>> _test_js_obj_proxy('var foo = this.location2') >>> _test_js_obj_proxy('var foo = this.location2')
'var foo = this.location2' 'var foo = this.location2'
>>> _test_js_obj_proxy('func(Function("return this"));') >>> _test_js_obj_proxy('func(Function("return this"));')
'func(Function("return (this && this._WB_wombat_obj_proxy || this)"));' 'func(Function("return _____WB$wombat$check$this$function_____(this)"));'
>>> _test_js_obj_proxy('A.call(function() { return this });') >>> _test_js_obj_proxy('A.call(function() { return this });')
'A.call(function() { return (this && this._WB_wombat_obj_proxy || this) });' 'A.call(function() { return _____WB$wombat$check$this$function_____(this) });'
>>> _test_js_obj_proxy('this.document.location = foo') >>> _test_js_obj_proxy('this.document.location = foo')
'(this && this._WB_wombat_obj_proxy || this).document.location = foo' '_____WB$wombat$check$this$function_____(this).document.location = foo'
>>> _test_js_obj_proxy('if (that != this) { ... }') >>> _test_js_obj_proxy('if (that != this) { ... }')
'if (that != (this && this._WB_wombat_obj_proxy || this)) { ... }' 'if (that != _____WB$wombat$check$this$function_____(this)) { ... }'
>>> _test_js_obj_proxy('function(){...} (this)') >>> _test_js_obj_proxy('function(){...} (this)')
'function(){...} ((this && this._WB_wombat_obj_proxy || this))' 'function(){...} (_____WB$wombat$check$this$function_____(this))'
>>> _test_js_obj_proxy('function(){...} ) (this); foo(this)') >>> _test_js_obj_proxy('function(){...} ) (this); foo(this)')
'function(){...} ) ((this && this._WB_wombat_obj_proxy || this)); foo(this)' 'function(){...} ) (_____WB$wombat$check$this$function_____(this)); foo(this)'
>>> _test_js_obj_proxy('var foo = that || this ;') >>> _test_js_obj_proxy('var foo = that || this ;')
'var foo = that || (this && this._WB_wombat_obj_proxy || this) ;' 'var foo = that || _____WB$wombat$check$this$function_____(this) ;'
>>> _test_js_obj_proxy('a||this||that') >>> _test_js_obj_proxy('a||this||that')
'a||(this && this._WB_wombat_obj_proxy || this)||that' 'a||_____WB$wombat$check$this$function_____(this)||that'
>>> _test_js_obj_proxy('a||this)') >>> _test_js_obj_proxy('a||this)')
'a||(this && this._WB_wombat_obj_proxy || this))' 'a||_____WB$wombat$check$this$function_____(this))'
# not rewritten # not rewritten
>>> _test_js_obj_proxy('var window = this$') >>> _test_js_obj_proxy('var window = this$')
@ -207,7 +207,7 @@ r"""
'this. alocation = http://example.com/' 'this. alocation = http://example.com/'
>>> _test_js_obj_proxy(r'this. location = http://example.com/') >>> _test_js_obj_proxy(r'this. location = http://example.com/')
'this. location = (self.__WB_check_loc && self.__WB_check_loc(location) || {}).href = http://example.com/' 'this. location = ((self.__WB_check_loc && self.__WB_check_loc(location)) || {}).href = http://example.com/'

View File

@ -23,7 +23,7 @@ class UrlRewriter(object):
REL_PATH = '/' REL_PATH = '/'
def __init__(self, wburl, prefix='', full_prefix=None, rel_prefix=None, def __init__(self, wburl, prefix='', full_prefix=None, rel_prefix=None,
root_path=None, cookie_scope=None, rewrite_opts=None): root_path=None, cookie_scope=None, rewrite_opts=None, pywb_static_prefix=None):
self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl) self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl)
self.prefix = prefix self.prefix = prefix
self.full_prefix = full_prefix or prefix self.full_prefix = full_prefix or prefix
@ -36,10 +36,22 @@ class UrlRewriter(object):
self.prefix_abs = self.prefix and self.prefix.startswith(self.PROTOCOLS) self.prefix_abs = self.prefix and self.prefix.startswith(self.PROTOCOLS)
self.cookie_scope = cookie_scope self.cookie_scope = cookie_scope
self.rewrite_opts = rewrite_opts or {} self.rewrite_opts = rewrite_opts or {}
self._pywb_static_prefix = pywb_static_prefix
if self.rewrite_opts.get('punycode_links'): if self.rewrite_opts.get('punycode_links'):
self.wburl._do_percent_encode = False self.wburl._do_percent_encode = False
@property
def pywb_static_prefix(self):
"""Returns the static path URL
:rtype: str
"""
if self._pywb_static_prefix is None:
return ''
if self._pywb_static_prefix.startswith(self.PROTOCOLS):
return self._pywb_static_prefix
return self.urljoin(self.full_prefix, self._pywb_static_prefix)
def rewrite(self, url, mod=None, force_abs=False): def rewrite(self, url, mod=None, force_abs=False):
# if special protocol, no rewriting at all # if special protocol, no rewriting at all
if url.startswith(self.NO_REWRITE_URI_PREFIX): if url.startswith(self.NO_REWRITE_URI_PREFIX):

View File

@ -15,338 +15,355 @@ var autofetcher = null;
function noop() {} function noop() {}
if (typeof self.Promise === 'undefined') { if (typeof self.Promise === 'undefined') {
// not kewl we must polyfill Promise // not kewl we must polyfill Promise
self.Promise = function (executor) { self.Promise = function(executor) {
executor(noop, noop); executor(noop, noop);
}; };
self.Promise.prototype.then = function (cb) { self.Promise.prototype.then = function(cb) {
if (cb) cb(); if (cb) cb();
return this; return this;
}; };
self.Promise.prototype.catch = function () { self.Promise.prototype.catch = function() {
return this; return this;
}; };
self.Promise.all = function (values) { self.Promise.all = function(values) {
return new Promise(noop); return new Promise(noop);
}; };
} }
if (typeof self.fetch === 'undefined') { if (typeof self.fetch === 'undefined') {
// not kewl we must polyfill fetch. // not kewl we must polyfill fetch.
self.fetch = function (url) { self.fetch = function(url) {
return new Promise(function (resolve) { return new Promise(function(resolve) {
var xhr = new XMLHttpRequest(); var xhr = new XMLHttpRequest();
xhr.open('GET', url); xhr.open('GET', url);
xhr.send(); xhr.send();
resolve(); resolve();
}); });
}; };
} }
self.onmessage = function (event) { self.onmessage = function(event) {
var data = event.data; var data = event.data;
switch (data.type) { switch (data.type) {
case 'values': case 'values':
autofetcher.autoFetch(data); autofetcher.autoFetch(data);
break; break;
} }
}; };
function AutoFetcher(init) { function AutoFetcher(init) {
if (!(this instanceof AutoFetcher)) { if (!(this instanceof AutoFetcher)) {
return new AutoFetcher(init); return new AutoFetcher(init);
} }
this.prefix = init.prefix; this.prefix = init.prefix;
this.mod = init.mod; this.mod = init.mod;
this.prefixMod = init.prefix + init.mod; this.prefixMod = init.prefix + init.mod;
this.rwRe = new RegExp(init.rwRe); this.rwRe = new RegExp(init.rwRe);
// relative url, WorkerLocation is set by owning document // relative url, WorkerLocation is set by owning document
this.relative = init.prefix.split(location.origin)[1]; this.relative = init.prefix.split(location.origin)[1];
// schemeless url // schemeless url
this.schemeless = '/' + this.relative; this.schemeless = '/' + this.relative;
// local cache of URLs fetched, to reduce server load // local cache of URLs fetched, to reduce server load
this.seen = {}; this.seen = {};
// array of URLs to be fetched // array of URLs to be fetched
this.queue = []; this.queue = [];
this.avQueue = []; this.avQueue = [];
// should we queue a URL or not // should we queue a URL or not
this.queuing = false; this.queuing = false;
this.queuingAV = false; this.queuingAV = false;
this.urlExtractor = this.urlExtractor.bind(this); this.urlExtractor = this.urlExtractor.bind(this);
this.imgFetchDone = this.imgFetchDone.bind(this); this.imgFetchDone = this.imgFetchDone.bind(this);
this.avFetchDone = this.avFetchDone.bind(this); this.avFetchDone = this.avFetchDone.bind(this);
} }
AutoFetcher.prototype.delay = function () { AutoFetcher.prototype.delay = function() {
// 2 second delay seem reasonable // 2 second delay seem reasonable
return new Promise(function (resolve, reject) { return new Promise(function(resolve, reject) {
setTimeout(resolve, 2000); setTimeout(resolve, 2000);
});
};
AutoFetcher.prototype.imgFetchDone = function() {
if (this.queue.length > 0) {
// we have a Q of some length drain it
var autofetcher = this;
this.delay().then(function() {
autofetcher.queuing = false;
autofetcher.fetchImgs();
}); });
} else {
this.queuing = false;
}
}; };
AutoFetcher.prototype.imgFetchDone = function () { AutoFetcher.prototype.avFetchDone = function() {
if (this.queue.length > 0) { if (this.avQueue.length > 0) {
// we have a Q of some length drain it // we have a Q of some length drain it
var autofetcher = this; var autofetcher = this;
this.delay().then(function () { this.delay().then(function() {
autofetcher.queuing = false; autofetcher.queuingAV = false;
autofetcher.fetchImgs(); autofetcher.fetchAV();
}); });
} else { } else {
this.queuing = false; this.queuingAV = false;
} }
}; };
AutoFetcher.prototype.avFetchDone = function () { AutoFetcher.prototype.fetchAV = function() {
if (this.avQueue.length > 0) { if (this.queuingAV || this.avQueue.length === 0) {
// we have a Q of some length drain it return;
var autofetcher = this; }
this.delay().then(function () { // the number of fetches is limited to a maximum of DefaultNumAvFetches + FullAVQDrainLen outstanding fetches
autofetcher.queuingAV = false; // the baseline maximum number of fetches is DefaultNumAvFetches but if the size(avQueue) <= FullAVQDrainLen
autofetcher.fetchAV(); // we add them to the current batch. Because audio video resources might be big
}); // we limit how many we fetch at a time drastically
} else { this.queuingAV = true;
this.queuingAV = false; var runningFetchers = [];
while (
this.avQueue.length > 0 &&
runningFetchers.length <= DefaultNumAvFetches
) {
runningFetchers.push(fetch(this.avQueue.shift()).catch(noop));
}
if (this.avQueue.length <= FullAVQDrainLen) {
while (this.avQueue.length > 0) {
runningFetchers.push(fetch(this.avQueue.shift()).catch(noop));
} }
}
Promise.all(runningFetchers)
.then(this.avFetchDone)
.catch(this.avFetchDone);
}; };
AutoFetcher.prototype.fetchAV = function () { AutoFetcher.prototype.fetchImgs = function() {
if (this.queuingAV || this.avQueue.length === 0) { if (this.queuing || this.queue.length === 0) {
return; return;
}
// the number of fetches is limited to a maximum of DefaultNumImFetches + FullImgQDrainLen outstanding fetches
// the baseline maximum number of fetches is DefaultNumImFetches but if the size(queue) <= FullImgQDrainLen
// we add them to the current batch
this.queuing = true;
var runningFetchers = [];
while (
this.queue.length > 0 &&
runningFetchers.length <= DefaultNumImFetches
) {
runningFetchers.push(fetch(this.queue.shift()).catch(noop));
}
if (this.queue.length <= FullImgQDrainLen) {
while (this.queue.length > 0) {
runningFetchers.push(fetch(this.queue.shift()).catch(noop));
} }
// the number of fetches is limited to a maximum of DefaultNumAvFetches + FullAVQDrainLen outstanding fetches }
// the baseline maximum number of fetches is DefaultNumAvFetches but if the size(avQueue) <= FullAVQDrainLen Promise.all(runningFetchers)
// we add them to the current batch. Because audio video resources might be big .then(this.imgFetchDone)
// we limit how many we fetch at a time drastically .catch(this.imgFetchDone);
this.queuingAV = true;
var runningFetchers = [];
while (this.avQueue.length > 0 && runningFetchers.length <= DefaultNumAvFetches) {
runningFetchers.push(fetch(this.avQueue.shift()).catch(noop))
}
if (this.avQueue.length <= FullAVQDrainLen) {
while (this.avQueue.length > 0) {
runningFetchers.push(fetch(this.avQueue.shift()).catch(noop))
}
}
Promise.all(runningFetchers)
.then(this.avFetchDone)
.catch(this.avFetchDone);
}; };
AutoFetcher.prototype.fetchImgs = function () { AutoFetcher.prototype.queueNonAVURL = function(url) {
if (this.queuing || this.queue.length === 0) { // ensure we do not request data urls
return; if (url.indexOf(DataURLPrefix) === 0) return;
} // check to see if we have seen this url before in order
// the number of fetches is limited to a maximum of DefaultNumImFetches + FullImgQDrainLen outstanding fetches // to lessen the load against the server content is fetched from
// the baseline maximum number of fetches is DefaultNumImFetches but if the size(queue) <= FullImgQDrainLen if (this.seen[url] != null) return;
// we add them to the current batch this.seen[url] = true;
this.queuing = true; this.queue.push(url);
var runningFetchers = [];
while (this.queue.length > 0 && runningFetchers.length <= DefaultNumImFetches) {
runningFetchers.push(fetch(this.queue.shift()).catch(noop))
}
if (this.queue.length <= FullImgQDrainLen) {
while (this.queue.length > 0) {
runningFetchers.push(fetch(this.queue.shift()).catch(noop))
}
}
Promise.all(runningFetchers)
.then(this.imgFetchDone)
.catch(this.imgFetchDone);
}; };
AutoFetcher.prototype.queueNonAVURL = function (url) { AutoFetcher.prototype.queueAVURL = function(url) {
// ensure we do not request data urls // ensure we do not request data urls
if (url.indexOf(DataURLPrefix) === 0) return; if (url.indexOf(DataURLPrefix) === 0) return;
// check to see if we have seen this url before in order // check to see if we have seen this url before in order
// to lessen the load against the server content is fetched from // to lessen the load against the server content is fetched from
if (this.seen[url] != null) return; if (this.seen[url] != null) return;
this.seen[url] = true; this.seen[url] = true;
this.queue.push(url); this.avQueue.push(url);
}; };
AutoFetcher.prototype.queueAVURL = function (url) { AutoFetcher.prototype.maybeResolveURL = function(url, base) {
// ensure we do not request data urls // given a url and base url returns a resolved full URL or
if (url.indexOf(DataURLPrefix) === 0) return; // null if resolution was unsuccessful
// check to see if we have seen this url before in order try {
// to lessen the load against the server content is fetched from var _url = new URL(url, base);
if (this.seen[url] != null) return; return _url.href;
this.seen[url] = true; } catch (e) {
this.avQueue.push(url);
};
AutoFetcher.prototype.maybeResolveURL = function (url, base) {
// given a url and base url returns a resolved full URL or
// null if resolution was unsuccessful
try {
var _url = new URL(url, base);
return _url.href;
} catch (e) {
return null;
}
};
AutoFetcher.prototype.maybeFixUpRelSchemelessPrefix = function (url) {
// attempt to ensure rewritten relative or schemeless URLs become full URLS!
// otherwise returns null if this did not happen
if (url.indexOf(this.relative) === 0) {
return url.replace(this.relative, this.prefix);
}
if (url.indexOf(this.schemeless) === 0) {
return url.replace(this.schemeless, this.prefix);
}
return null; return null;
}
}; };
AutoFetcher.prototype.maybeFixUpURL = function (url, resolveOpts) { AutoFetcher.prototype.maybeFixUpRelSchemelessPrefix = function(url) {
// attempt to fix up the url and do our best to ensure we can get dat 200 OK! // attempt to ensure rewritten relative or schemeless URLs become full URLS!
if (this.rwRe.test(url)) { // otherwise returns null if this did not happen
return url; if (url.indexOf(this.relative) === 0) {
} return url.replace(this.relative, this.prefix);
var mod = resolveOpts.mod || 'mp_'; }
// first check for / (relative) or // (schemeless) rewritten urls if (url.indexOf(this.schemeless) === 0) {
var maybeFixed = this.maybeFixUpRelSchemelessPrefix(url); return url.replace(this.schemeless, this.prefix);
}
return null;
};
AutoFetcher.prototype.maybeFixUpURL = function(url, resolveOpts) {
// attempt to fix up the url and do our best to ensure we can get dat 200 OK!
if (this.rwRe.test(url)) {
return url;
}
var mod = resolveOpts.mod || 'mp_';
// first check for / (relative) or // (schemeless) rewritten urls
var maybeFixed = this.maybeFixUpRelSchemelessPrefix(url);
if (maybeFixed != null) {
return maybeFixed;
}
// resolve URL against tag src
if (resolveOpts.tagSrc != null) {
maybeFixed = this.maybeResolveURL(url, resolveOpts.tagSrc);
if (maybeFixed != null) { if (maybeFixed != null) {
return maybeFixed; return this.prefix + mod + '/' + maybeFixed;
} }
// resolve URL against tag src }
if (resolveOpts.tagSrc != null) { // finally last attempt resolve the originating documents base URI
maybeFixed = this.maybeResolveURL(url, resolveOpts.tagSrc); if (resolveOpts.docBaseURI) {
if (maybeFixed != null) { maybeFixed = this.maybeResolveURL(url, resolveOpts.docBaseURI);
return this.prefix + mod + '/' + maybeFixed; if (maybeFixed != null) {
} return this.prefix + mod + '/' + maybeFixed;
} }
// finally last attempt resolve the originating documents base URI }
if (resolveOpts.docBaseURI) { // not much to do now.....
maybeFixed = this.maybeResolveURL(url, resolveOpts.docBaseURI); return this.prefixMod + '/' + url;
if (maybeFixed != null) {
return this.prefix + mod + '/' + maybeFixed;
}
}
// not much to do now.....
return this.prefixMod + '/' + url;
}; };
AutoFetcher.prototype.urlExtractor = function (match, n1, n2, n3, offset, string) { AutoFetcher.prototype.urlExtractor = function(
// Same function as style_replacer in wombat.rewrite_style, n2 is our URL match,
this.queueNonAVURL(n2); n1,
return n1 + n2 + n3; n2,
n3,
offset,
string
) {
// Same function as style_replacer in wombat.rewrite_style, n2 is our URL
this.queueNonAVURL(n2);
return n1 + n2 + n3;
}; };
AutoFetcher.prototype.handleMedia = function (mediaRules) { AutoFetcher.prototype.handleMedia = function(mediaRules) {
// this is a broken down rewrite_style // this is a broken down rewrite_style
if (mediaRules == null || mediaRules.length === 0) return; if (mediaRules == null || mediaRules.length === 0) return;
// var rules = mediaRules.values; // var rules = mediaRules.values;
for (var i = 0; i < mediaRules.length; i++) { for (var i = 0; i < mediaRules.length; i++) {
mediaRules[i] mediaRules[i]
.replace(STYLE_REGEX, this.urlExtractor) .replace(STYLE_REGEX, this.urlExtractor)
.replace(IMPORT_REGEX, this.urlExtractor); .replace(IMPORT_REGEX, this.urlExtractor);
} }
}; };
AutoFetcher.prototype.handleSrc = function (srcValues, context) { AutoFetcher.prototype.handleSrc = function(srcValues, context) {
var resolveOpts = { 'docBaseURI': context.docBaseURI }; var resolveOpts = { docBaseURI: context.docBaseURI };
if (srcValues.value) { if (srcValues.value) {
resolveOpts.mod = srcValues.mod; resolveOpts.mod = srcValues.mod;
if (resolveOpts.mod === 1) { if (resolveOpts.mod === 1) {
return this.queueNonAVURL(this.maybeFixUpURL(srcValues.value.trim(), resolveOpts)); return this.queueNonAVURL(
} this.maybeFixUpURL(srcValues.value.trim(), resolveOpts)
return this.queueAVURL(this.maybeFixUpURL(srcValues.value.trim(), resolveOpts)); );
} }
var len = srcValues.values.length; return this.queueAVURL(
for (var i = 0; i < len; i++) { this.maybeFixUpURL(srcValues.value.trim(), resolveOpts)
var value = srcValues.values[i]; );
resolveOpts.mod = value.mod; }
if (resolveOpts.mod === 'im_') { var len = srcValues.values.length;
this.queueNonAVURL(this.maybeFixUpURL(value.src, resolveOpts)); for (var i = 0; i < len; i++) {
} else { var value = srcValues.values[i];
this.queueAVURL(this.maybeFixUpURL(value.src, resolveOpts)); resolveOpts.mod = value.mod;
} if (resolveOpts.mod === 'im_') {
this.queueNonAVURL(this.maybeFixUpURL(value.src, resolveOpts));
} else {
this.queueAVURL(this.maybeFixUpURL(value.src, resolveOpts));
} }
}
}; };
AutoFetcher.prototype.extractSrcSetNotPreSplit = function (ssV, resolveOpts) { AutoFetcher.prototype.extractSrcSetNotPreSplit = function(ssV, resolveOpts) {
// was from extract from local doc so we need to duplicate work if (!ssV) return;
var srcsetValues = ssV.split(srcsetSplit); // was from extract from local doc so we need to duplicate work
for (var i = 0; i < srcsetValues.length; i++) { var srcsetValues = ssV.split(srcsetSplit);
// grab the URL not width/height key for (var i = 0; i < srcsetValues.length; i++) {
if (srcsetValues[i]) { // grab the URL not width/height key
var value = srcsetValues[i].trim().split(' ')[0]; if (srcsetValues[i]) {
var maybeResolvedURL = this.maybeFixUpURL(value.trim(), resolveOpts); var value = srcsetValues[i].trim().split(' ')[0];
if (resolveOpts.mod === 'im_') { var maybeResolvedURL = this.maybeFixUpURL(value.trim(), resolveOpts);
this.queueNonAVURL(maybeResolvedURL); if (resolveOpts.mod === 'im_') {
} else { this.queueNonAVURL(maybeResolvedURL);
this.queueAVURL(maybeResolvedURL); } else {
} this.queueAVURL(maybeResolvedURL);
} }
} }
}
}; };
AutoFetcher.prototype.extractSrcset = function (srcsets, context) { AutoFetcher.prototype.extractSrcset = function(srcsets, context) {
// was rewrite_srcset and only need to q // was rewrite_srcset and only need to q
for (var i = 0; i < srcsets.length; i++) { for (var i = 0; i < srcsets.length; i++) {
// grab the URL not width/height key // grab the URL not width/height key
var url = srcsets[i].split(' ')[0]; var url = srcsets[i].split(' ')[0];
if (context.mod === 'im_') { if (context.mod === 'im_') {
this.queueNonAVURL(url); this.queueNonAVURL(url);
} else { } else {
this.queueAVURL(url); this.queueAVURL(url);
}
} }
}
}; };
AutoFetcher.prototype.handleSrcset = function (srcset, context) { AutoFetcher.prototype.handleSrcset = function(srcset, context) {
var resolveOpts = { 'docBaseURI': context.docBaseURI }; var resolveOpts = { docBaseURI: context.docBaseURI };
if (srcset.value) { if (srcset.value) {
// we have a single value, this srcset came from either // we have a single value, this srcset came from either
// preserveDataSrcset (not presplit) preserveSrcset (presplit) // preserveDataSrcset (not presplit) preserveSrcset (presplit)
resolveOpts.mod = srcset.mod; resolveOpts.mod = srcset.mod;
if (!srcset.presplit) { if (!srcset.presplit) {
// extract URLs from the srcset string // extract URLs from the srcset string
return this.extractSrcSetNotPreSplit(srcset.value, resolveOpts); return this.extractSrcSetNotPreSplit(srcset.value, resolveOpts);
}
// we have an array of srcset URL strings
return this.extractSrcset(srcset.value, resolveOpts);
}
// we have an array of values, these srcsets came from extractFromLocalDoc
var len = srcset.values.length;
for (var i = 0; i < len; i++) {
var ssv = srcset.values[i];
resolveOpts.mod = ssv.mod;
resolveOpts.tagSrc = ssv.tagSrc;
this.extractSrcSetNotPreSplit(ssv.srcset, resolveOpts);
} }
// we have an array of srcset URL strings
return this.extractSrcset(srcset.value, resolveOpts);
}
// we have an array of values, these srcsets came from extractFromLocalDoc
var len = srcset.values.length;
for (var i = 0; i < len; i++) {
var ssv = srcset.values[i];
resolveOpts.mod = ssv.mod;
resolveOpts.tagSrc = ssv.tagSrc;
this.extractSrcSetNotPreSplit(ssv.srcset, resolveOpts);
}
}; };
AutoFetcher.prototype.autoFetch = function(data) {
// we got a message and now we autofetch!
// these calls turn into no ops if they have no work
if (data.media) {
this.handleMedia(data.media);
}
AutoFetcher.prototype.autoFetch = function (data) { if (data.src) {
// we got a message and now we autofetch! this.handleSrc(data.src, data.context || {});
// these calls turn into no ops if they have no work }
if (data.media) {
this.handleMedia(data.media);
}
if (data.src) { if (data.srcset) {
this.handleSrc(data.src, data.context || {}); this.handleSrcset(data.srcset, data.context || {});
} }
if (data.srcset) { this.fetchImgs();
this.handleSrcset(data.srcset, data.context || {}); this.fetchAV();
}
this.fetchImgs();
this.fetchAV();
}; };
// initialize ourselves from the query params :) // initialize ourselves from the query params :)
try { try {
var loc = new self.URL(location.href); var loc = new self.URL(location.href);
autofetcher = new AutoFetcher(JSON.parse(loc.searchParams.get('init'))); autofetcher = new AutoFetcher(JSON.parse(loc.searchParams.get('init')));
} catch (e) { } catch (e) {
// likely we are in an older version of safari // likely we are in an older version of safari
var search = decodeURIComponent(location.search.split('?')[1]).split('&'); var search = decodeURIComponent(location.search.split('?')[1]).split('&');
var init = JSON.parse(search[0].substr(search[0].indexOf('=') + 1)); var init = JSON.parse(search[0].substr(search[0].indexOf('=') + 1));
init.prefix = decodeURIComponent(init.prefix); init.prefix = decodeURIComponent(init.prefix);
init.baseURI = decodeURIComponent(init.baseURI); init.baseURI = decodeURIComponent(init.baseURI);
autofetcher = new AutoFetcher(init); autofetcher = new AutoFetcher(init);
} }

View File

@ -15,271 +15,289 @@ var autofetcher = null;
function noop() {} function noop() {}
if (typeof self.Promise === 'undefined') { if (typeof self.Promise === 'undefined') {
// not kewl we must polyfill Promise // not kewl we must polyfill Promise
self.Promise = function (executor) { self.Promise = function(executor) {
executor(noop, noop); executor(noop, noop);
}; };
self.Promise.prototype.then = function (cb) { self.Promise.prototype.then = function(cb) {
if (cb) cb(); if (cb) cb();
return this; return this;
}; };
self.Promise.prototype.catch = function () { self.Promise.prototype.catch = function() {
return this; return this;
}; };
self.Promise.all = function (values) { self.Promise.all = function(values) {
return new Promise(noop); return new Promise(noop);
}; };
} }
if (typeof self.fetch === 'undefined') { if (typeof self.fetch === 'undefined') {
// not kewl we must polyfill fetch. // not kewl we must polyfill fetch.
self.fetch = function (url) { self.fetch = function(url) {
return new Promise(function (resolve) { return new Promise(function(resolve) {
var xhr = new XMLHttpRequest(); var xhr = new XMLHttpRequest();
xhr.open('GET', url); xhr.open('GET', url);
xhr.send(); xhr.send();
resolve(); resolve();
}); });
}; };
} }
self.onmessage = function (event) { self.onmessage = function(event) {
var data = event.data; var data = event.data;
switch (data.type) { switch (data.type) {
case 'values': case 'values':
autofetcher.autofetchMediaSrcset(data); autofetcher.autofetchMediaSrcset(data);
break; break;
case 'fetch-all': case 'fetch-all':
autofetcher.justFetch(data); autofetcher.justFetch(data);
break; break;
} }
}; };
function AutoFetcher() { function AutoFetcher() {
if (!(this instanceof AutoFetcher)) { if (!(this instanceof AutoFetcher)) {
return new AutoFetcher(); return new AutoFetcher();
} }
// local cache of URLs fetched, to reduce server load // local cache of URLs fetched, to reduce server load
this.seen = {}; this.seen = {};
// array of URLs to be fetched // array of URLs to be fetched
this.queue = []; this.queue = [];
this.avQueue = []; this.avQueue = [];
// should we queue a URL or not // should we queue a URL or not
this.queuing = false; this.queuing = false;
// a URL to resolve relative URLs found in the cssText of CSSMedia rules. // a URL to resolve relative URLs found in the cssText of CSSMedia rules.
this.currentResolver = null; this.currentResolver = null;
// should we queue a URL or not // should we queue a URL or not
this.queuing = false; this.queuing = false;
this.queuingAV = false; this.queuingAV = false;
this.urlExtractor = this.urlExtractor.bind(this); this.urlExtractor = this.urlExtractor.bind(this);
this.imgFetchDone = this.imgFetchDone.bind(this); this.imgFetchDone = this.imgFetchDone.bind(this);
this.avFetchDone = this.avFetchDone.bind(this); this.avFetchDone = this.avFetchDone.bind(this);
} }
AutoFetcher.prototype.delay = function () { AutoFetcher.prototype.delay = function() {
return new Promise(function (resolve, reject) { return new Promise(function(resolve, reject) {
setTimeout(resolve, FetchDelay); setTimeout(resolve, FetchDelay);
});
};
AutoFetcher.prototype.imgFetchDone = function() {
if (this.queue.length > 0) {
// we have a Q of some length drain it
var autofetcher = this;
this.delay().then(function() {
autofetcher.queuing = false;
autofetcher.fetchImgs();
}); });
} else {
this.queuing = false;
}
}; };
AutoFetcher.prototype.imgFetchDone = function () { AutoFetcher.prototype.avFetchDone = function() {
if (this.queue.length > 0) { if (this.avQueue.length > 0) {
// we have a Q of some length drain it // we have a Q of some length drain it
var autofetcher = this; var autofetcher = this;
this.delay().then(function () { this.delay().then(function() {
autofetcher.queuing = false; autofetcher.queuingAV = false;
autofetcher.fetchImgs(); autofetcher.fetchAV();
}); });
} else { } else {
this.queuing = false; this.queuingAV = false;
} }
}; };
AutoFetcher.prototype.avFetchDone = function () { AutoFetcher.prototype.fetchAV = function() {
if (this.avQueue.length > 0) { if (this.queuingAV || this.avQueue.length === 0) {
// we have a Q of some length drain it return;
var autofetcher = this; }
this.delay().then(function () { // the number of fetches is limited to a maximum of DefaultNumAvFetches + FullAVQDrainLen outstanding fetches
autofetcher.queuingAV = false; // the baseline maximum number of fetches is DefaultNumAvFetches but if the size(avQueue) <= FullAVQDrainLen
autofetcher.fetchAV(); // we add them to the current batch. Because audio video resources might be big
}); // we limit how many we fetch at a time drastically
} else { this.queuingAV = true;
this.queuingAV = false; var runningFetchers = [];
while (
this.avQueue.length > 0 &&
runningFetchers.length <= DefaultNumAvFetches
) {
runningFetchers.push(fetch(this.avQueue.shift()).catch(noop));
}
if (this.avQueue.length <= FullAVQDrainLen) {
while (this.avQueue.length > 0) {
runningFetchers.push(fetch(this.avQueue.shift()).catch(noop));
} }
}
Promise.all(runningFetchers)
.then(this.avFetchDone)
.catch(this.avFetchDone);
}; };
AutoFetcher.prototype.fetchAV = function () { AutoFetcher.prototype.fetchImgs = function() {
if (this.queuingAV || this.avQueue.length === 0) { if (this.queuing || this.queue.length === 0) {
return; return;
}
// the number of fetches is limited to a maximum of DefaultNumImFetches + FullImgQDrainLen outstanding fetches
// the baseline maximum number of fetches is DefaultNumImFetches but if the size(queue) <= FullImgQDrainLen
// we add them to the current batch
this.queuing = true;
var runningFetchers = [];
while (
this.queue.length > 0 &&
runningFetchers.length <= DefaultNumImFetches
) {
runningFetchers.push(fetch(this.queue.shift()).catch(noop));
}
if (this.queue.length <= FullImgQDrainLen) {
while (this.queue.length > 0) {
runningFetchers.push(fetch(this.queue.shift()).catch(noop));
} }
// the number of fetches is limited to a maximum of DefaultNumAvFetches + FullAVQDrainLen outstanding fetches }
// the baseline maximum number of fetches is DefaultNumAvFetches but if the size(avQueue) <= FullAVQDrainLen Promise.all(runningFetchers)
// we add them to the current batch. Because audio video resources might be big .then(this.imgFetchDone)
// we limit how many we fetch at a time drastically .catch(this.imgFetchDone);
this.queuingAV = true;
var runningFetchers = [];
while (this.avQueue.length > 0 && runningFetchers.length <= DefaultNumAvFetches) {
runningFetchers.push(fetch(this.avQueue.shift()).catch(noop))
}
if (this.avQueue.length <= FullAVQDrainLen) {
while (this.avQueue.length > 0) {
runningFetchers.push(fetch(this.avQueue.shift()).catch(noop))
}
}
Promise.all(runningFetchers)
.then(this.avFetchDone)
.catch(this.avFetchDone);
}; };
AutoFetcher.prototype.fetchImgs = function () { AutoFetcher.prototype.queueNonAVURL = function(url) {
if (this.queuing || this.queue.length === 0) { // ensure we do not request data urls
return; if (url.indexOf(DataURLPrefix) === 0) return;
} // check to see if we have seen this url before in order
// the number of fetches is limited to a maximum of DefaultNumImFetches + FullImgQDrainLen outstanding fetches // to lessen the load against the server content is fetched from
// the baseline maximum number of fetches is DefaultNumImFetches but if the size(queue) <= FullImgQDrainLen if (this.seen[url] != null) return;
// we add them to the current batch this.seen[url] = true;
this.queuing = true; this.queue.push(url);
var runningFetchers = [];
while (this.queue.length > 0 && runningFetchers.length <= DefaultNumImFetches) {
runningFetchers.push(fetch(this.queue.shift()).catch(noop))
}
if (this.queue.length <= FullImgQDrainLen) {
while (this.queue.length > 0) {
runningFetchers.push(fetch(this.queue.shift()).catch(noop))
}
}
Promise.all(runningFetchers)
.then(this.imgFetchDone)
.catch(this.imgFetchDone);
}; };
AutoFetcher.prototype.queueNonAVURL = function (url) { AutoFetcher.prototype.queueAVURL = function(url) {
// ensure we do not request data urls // ensure we do not request data urls
if (url.indexOf(DataURLPrefix) === 0) return; if (url.indexOf(DataURLPrefix) === 0) return;
// check to see if we have seen this url before in order // check to see if we have seen this url before in order
// to lessen the load against the server content is fetched from // to lessen the load against the server content is fetched from
if (this.seen[url] != null) return; if (this.seen[url] != null) return;
this.seen[url] = true; this.seen[url] = true;
this.queue.push(url); this.avQueue.push(url);
}; };
AutoFetcher.prototype.queueAVURL = function (url) { AutoFetcher.prototype.safeResolve = function(url, resolver) {
// ensure we do not request data urls // Guard against the exception thrown by the URL constructor if the URL or resolver is bad
if (url.indexOf(DataURLPrefix) === 0) return; // if resolver is undefined/null then this function passes url through
// check to see if we have seen this url before in order var resolvedURL = url;
// to lessen the load against the server content is fetched from if (resolver) {
if (this.seen[url] != null) return; try {
this.seen[url] = true; resolvedURL = new URL(url, resolver).href;
this.avQueue.push(url); } catch (e) {
}; resolvedURL = url;
AutoFetcher.prototype.safeResolve = function (url, resolver) {
// Guard against the exception thrown by the URL constructor if the URL or resolver is bad
// if resolver is undefined/null then this function passes url through
var resolvedURL = url;
if (resolver) {
try {
resolvedURL = (new URL(url, resolver)).href
} catch (e) {
resolvedURL = url;
}
} }
return resolvedURL; }
return resolvedURL;
}; };
AutoFetcher.prototype.urlExtractor = function(
AutoFetcher.prototype.urlExtractor = function (match, n1, n2, n3, offset, string) { match,
// Same function as style_replacer in wombat.rewrite_style, n2 is our URL n1,
// this.currentResolver is set to the URL which the browser would normally n2,
// resolve relative urls with (URL of the stylesheet) in an exceptionless manner n3,
// (resolvedURL will be undefined if an error occurred) offset,
var resolvedURL = this.safeResolve(n2, this.currentResolver); string
if (resolvedURL) { ) {
this.queueNonAVURL(resolvedURL); // Same function as style_replacer in wombat.rewrite_style, n2 is our URL
} // this.currentResolver is set to the URL which the browser would normally
return n1 + n2 + n3; // resolve relative urls with (URL of the stylesheet) in an exceptionless manner
// (resolvedURL will be undefined if an error occurred)
var resolvedURL = this.safeResolve(n2, this.currentResolver);
if (resolvedURL) {
this.queueNonAVURL(resolvedURL);
}
return n1 + n2 + n3;
}; };
AutoFetcher.prototype.extractMedia = function (mediaRules) { AutoFetcher.prototype.extractMedia = function(mediaRules) {
// this is a broken down rewrite_style // this is a broken down rewrite_style
if (mediaRules == null) return; if (mediaRules == null) return;
for (var i = 0; i < mediaRules.length; i++) { for (var i = 0; i < mediaRules.length; i++) {
// set currentResolver to the value of this stylesheets URL, done to ensure we do not have to // set currentResolver to the value of this stylesheets URL, done to ensure we do not have to
// create functions on each loop iteration because we potentially create a new `URL` object // create functions on each loop iteration because we potentially create a new `URL` object
// twice per iteration // twice per iteration
this.currentResolver = mediaRules[i].resolve; this.currentResolver = mediaRules[i].resolve;
mediaRules[i].cssText mediaRules[i].cssText
.replace(STYLE_REGEX, this.urlExtractor) .replace(STYLE_REGEX, this.urlExtractor)
.replace(IMPORT_REGEX, this.urlExtractor); .replace(IMPORT_REGEX, this.urlExtractor);
} }
}; };
AutoFetcher.prototype.extractSrcset = function (srcsets) { AutoFetcher.prototype.extractSrcset = function(srcsets) {
// preservation worker in proxy mode sends us the value of the srcset attribute of an element // preservation worker in proxy mode sends us the value of the srcset attribute of an element
// and a URL to correctly resolve relative URLS. Thus we must recreate rewrite_srcset logic here // and a URL to correctly resolve relative URLS. Thus we must recreate rewrite_srcset logic here
if (srcsets == null) return; if (srcsets == null) return;
var length = srcsets.length; var length = srcsets.length;
var extractedSrcSet, srcsetValue, ssSplit, j; var extractedSrcSet, srcsetValue, ssSplit, j;
for (var i = 0; i < length; i++) { for (var i = 0; i < length; i++) {
extractedSrcSet = srcsets[i]; extractedSrcSet = srcsets[i];
ssSplit = extractedSrcSet.srcset.split(srcsetSplit); ssSplit = extractedSrcSet.srcset.split(srcsetSplit);
for (j = 0; j < ssSplit.length; j++) { console.log(ssSplit);
if (ssSplit[j]) { for (j = 0; j < ssSplit.length; j++) {
srcsetValue = ssSplit[j].trim(); if (ssSplit[j]) {
if (srcsetValue.length > 0) { srcsetValue = ssSplit[j].trim();
// resolve the URL in an exceptionless manner (resolvedURL will be undefined if an error occurred) if (srcsetValue.length > 0) {
var resolvedURL = this.safeResolve(srcsetValue.split(' ')[0], extractedSrcSet.resolve); // resolve the URL in an exceptionless manner (resolvedURL will be undefined if an error occurred)
if (resolvedURL) { var resolvedURL = this.safeResolve(
if (extractedSrcSet.mod === 'im_') { srcsetValue.split(' ')[0],
this.queueNonAVURL(resolvedURL); extractedSrcSet.resolve
} else { );
this.queueAVURL(resolvedURL); if (resolvedURL) {
} if (extractedSrcSet.mod === 'im_') {
} this.queueNonAVURL(resolvedURL);
}
}
}
}
};
AutoFetcher.prototype.extractSrc = function (srcVals) {
// preservation worker in proxy mode sends us the value of the srcset attribute of an element
// and a URL to correctly resolve relative URLS. Thus we must recreate rewrite_srcset logic here
if (srcVals == null || srcVals.length === 0) return;
var length = srcVals.length;
var srcVal;
for (var i = 0; i < length; i++) {
srcVal = srcVals[i];
var resolvedURL = this.safeResolve(srcVal.src, srcVal.resolve);
if (resolvedURL) {
if (srcVal.mod === 'im_') {
this.queueNonAVURL(resolvedURL);
} else { } else {
this.queueAVURL(resolvedURL); this.queueAVURL(resolvedURL);
} }
} else {
console.log(resolvedURL);
}
} else {
console.log(srcsetValue);
} }
}
} }
}
}; };
AutoFetcher.prototype.extractSrc = function(srcVals) {
AutoFetcher.prototype.autofetchMediaSrcset = function (data) { // preservation worker in proxy mode sends us the value of the srcset attribute of an element
// we got a message and now we autofetch! // and a URL to correctly resolve relative URLS. Thus we must recreate rewrite_srcset logic here
// these calls turn into no ops if they have no work if (srcVals == null || srcVals.length === 0) return;
this.extractMedia(data.media); var length = srcVals.length;
this.extractSrcset(data.srcset); var srcVal;
this.extractSrc(data.src); for (var i = 0; i < length; i++) {
this.fetchImgs(); srcVal = srcVals[i];
this.fetchAV(); var resolvedURL = this.safeResolve(srcVal.src, srcVal.resolve);
if (resolvedURL) {
if (srcVal.mod === 'im_') {
this.queueNonAVURL(resolvedURL);
} else {
this.queueAVURL(resolvedURL);
}
}
}
}; };
AutoFetcher.prototype.justFetch = function (data) { AutoFetcher.prototype.autofetchMediaSrcset = function(data) {
// we got a message containing only urls to be fetched // we got a message and now we autofetch!
if (data == null || data.values == null) return; // these calls turn into no ops if they have no work
for (var i = 0; i < data.values.length; ++i) { this.extractMedia(data.media);
this.queueNonAVURL(data.values[i]); this.extractSrcset(data.srcset);
} this.extractSrc(data.src);
this.fetchImgs(); this.fetchImgs();
this.fetchAV();
};
AutoFetcher.prototype.justFetch = function(data) {
// we got a message containing only urls to be fetched
if (data == null || data.values == null) return;
for (var i = 0; i < data.values.length; ++i) {
this.queueNonAVURL(data.values[i]);
}
this.fetchImgs();
}; };
autofetcher = new AutoFetcher(); autofetcher = new AutoFetcher();

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -1,82 +0,0 @@
// pywb mini rewriter for injection into web worker scripts
/**
 * Installs URL-rewriting overrides inside a worker global scope (`self`).
 *
 * Patches `self.XMLHttpRequest.prototype.open`, `self.importScripts` and
 * `self.fetch` (each only when present) so that the URLs they receive are
 * prepended with `info.prefix` before the original implementation is called.
 *
 * @param {Object} info - rewriter configuration
 * @param {string} [info.prefix] - string prepended to every rewritten URL;
 *   when falsy, URLs are only resolved, never prefixed
 * @param {string} [info.originalURL] - base URL used to resolve URLs that
 *   start with `/`
 */
function WBWombat(info) {
  // Resolve origURL against info.originalURL; fall back to the input when the
  // URL constructor rejects it.
  function maybeResolveURL(origURL) {
    try {
      return new URL(origURL, info.originalURL).href;
    } catch (e) {
      return origURL;
    }
  }

  // Returns the rewritten form of `url`, always as a string.
  function rewrite_url(url) {
    // Callers may hand us URL objects (or other non-strings); the native APIs
    // stringify them, so do the same instead of throwing on `.indexOf`.
    url = String(url);
    // blob: and data: URLs are self-contained and must not be prefixed.
    if (url.indexOf('blob:') === 0 || url.indexOf('data:') === 0) return url;
    // Do not prefix a URL that already carries the prefix.
    if (info.prefix && url.indexOf(info.prefix) === 0) return url;
    if (url && info.originalURL && url.indexOf('/') === 0) {
      url = maybeResolveURL(url);
    }
    if (info.prefix) {
      return info.prefix + url;
    }
    return url;
  }

  function init_ajax_rewrite() {
    // Not every worker scope exposes XMLHttpRequest; skip rather than abort
    // the remaining overrides.
    if (self.XMLHttpRequest == null) return;
    var orig = self.XMLHttpRequest.prototype.open;
    function open_rewritten(method, url, asyncFlag, user, password) {
      var rwURL = rewrite_url(url);
      // defaults to true: loose comparison on purpose, so that undefined/null
      // become true while false/0/'' are passed through unchanged
      var isAsync = asyncFlag != false ? true : asyncFlag;
      var result = orig.call(this, method, rwURL, isAsync, user, password);
      if (rwURL.indexOf('data:') !== 0) {
        this.setRequestHeader('X-Pywb-Requested-With', 'XMLHttpRequest');
      }
      return result;
    }
    self.XMLHttpRequest.prototype.open = open_rewritten;
  }
  init_ajax_rewrite();

  function rewriteArgs(argsObj) {
    // recreate the original arguments object just with URLs rewritten
    var newArgObj = new Array(argsObj.length);
    for (var i = 0; i < newArgObj.length; i++) {
      newArgObj[i] = rewrite_url(argsObj[i]);
    }
    return newArgObj;
  }

  var origImportScripts = self.importScripts;
  if (typeof origImportScripts === 'function') {
    self.importScripts = function importScripts() {
      // rewrite the arguments object and call original function via fn.apply
      return origImportScripts.apply(this, rewriteArgs(arguments));
    };
  }

  if (self.fetch != null) {
    // this fetch is Worker.fetch
    var orig_fetch = self.fetch;
    self.fetch = function(input, init_opts) {
      var rwInput = input;
      var inputType = typeof input;
      if (inputType === 'string') {
        rwInput = rewrite_url(input);
      } else if (inputType === 'object' && input !== null) {
        if (input.url) {
          // Request-like input: rebuild it only when the URL actually changed
          var new_url = rewrite_url(input.url);
          if (new_url !== input.url) {
            rwInput = new Request(new_url, input);
          }
        } else if (input.href) {
          // URL-like input (has href but no url property)
          rwInput = rewrite_url(input.href);
        }
      }
      // Copy the options so the caller's object is not mutated when we force
      // credentials to be included.
      var opts = {};
      for (var key in init_opts) {
        opts[key] = init_opts[key];
      }
      opts.credentials = 'include';
      return orig_fetch.call(this, rwInput, opts);
    };
  }
}

View File

@ -1 +1 @@
__version__ = '2.2.20190410' __version__ = '2.3.0.dev0'

View File

@ -23,6 +23,9 @@ def fmod_sl(request):
# ============================================================================ # ============================================================================
class BaseConfigTest(BaseTestClass): class BaseConfigTest(BaseTestClass):
lint_app = True lint_app = True
extra_headers = {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'
}
@classmethod @classmethod
def get_test_app(cls, config_file, custom_config=None): def get_test_app(cls, config_file, custom_config=None):
@ -62,21 +65,34 @@ class BaseConfigTest(BaseTestClass):
assert resp.content_length > 0 assert resp.content_length > 0
def get(self, url, fmod, *args, **kwargs): def get(self, url, fmod, *args, **kwargs):
self.__ensure_headers(kwargs)
app = self.testapp if fmod else self.testapp_non_frame app = self.testapp if fmod else self.testapp_non_frame
return app.get(url.format(fmod), *args, **kwargs) return app.get(url.format(fmod), *args, **kwargs)
def post(self, url, fmod, *args, **kwargs): def post(self, url, fmod, *args, **kwargs):
self.__ensure_headers(kwargs)
app = self.testapp if fmod else self.testapp_non_frame app = self.testapp if fmod else self.testapp_non_frame
return app.post(url.format(fmod), *args, **kwargs) return app.post(url.format(fmod), *args, **kwargs)
def post_json(self, url, fmod, *args, **kwargs): def post_json(self, url, fmod, *args, **kwargs):
self.__ensure_headers(kwargs)
app = self.testapp if fmod else self.testapp_non_frame app = self.testapp if fmod else self.testapp_non_frame
return app.post_json(url.format(fmod), *args, **kwargs) return app.post_json(url.format(fmod), *args, **kwargs)
def head(self, url, fmod, *args, **kwargs): def head(self, url, fmod, *args, **kwargs):
self.__ensure_headers(kwargs)
app = self.testapp if fmod else self.testapp_non_frame app = self.testapp if fmod else self.testapp_non_frame
return app.head(url.format(fmod), *args, **kwargs) return app.head(url.format(fmod), *args, **kwargs)
def __ensure_headers(self, kwargs):
if 'headers' in kwargs:
headers = kwargs.get('headers')
else:
headers = kwargs['headers'] = {}
if isinstance(headers, dict) and 'User-Agent' not in headers:
headers['User-Agent'] = self.extra_headers['User-Agent']
#============================================================================= #=============================================================================
class CollsDirMixin(TempDirTests): class CollsDirMixin(TempDirTests):

View File

@ -31,7 +31,7 @@ class TestRootColl(BaseConfigTest):
def test_root_replay_redir(self, fmod): def test_root_replay_redir(self, fmod):
resp = self.get('/20140128051539{0}/http://www.iana.org/domains/example', fmod) resp = self.get('/20140128051539{0}/http://www.iana.org/domains/example', fmod)
assert resp.status_int == 302 assert resp.status_int in (301, 302)
assert resp.headers['Location'] == 'http://localhost:80/20140128051539{0}/https://www.iana.org/domains/reserved'.format(fmod) assert resp.headers['Location'] == 'http://localhost:80/20140128051539{0}/https://www.iana.org/domains/reserved'.format(fmod)

1
wombat Submodule

@ -0,0 +1 @@
Subproject commit 0b0c171a4f0f34114ba3cefd5ba80304515f4ef8