1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00
- Fix: a few broken tests due to iana.org requiring a user agent in its requests
rewrite:
  - introduced a new JSWorkerRewriter class in order to support rewriting, via wombat workers, in the context of all supported worker variants
  - ensured rewriter app correctly sets the static prefix
wombat:
 - add wombat as submodule!
This commit is contained in:
John Berlin 2019-05-15 14:42:51 -04:00 committed by Ilya Kreymer
parent 77f8bb6476
commit 22b4297fc5
25 changed files with 709 additions and 5675 deletions

3
.gitmodules vendored Normal file
View File

@ -0,0 +1,3 @@
[submodule "wombat"]
path = wombat
url = https://github.com/webrecorder/wombat

View File

@ -1,4 +0,0 @@
NODE_BIN_DIR=../node_modules/.bin
test:
$(NODE_BIN_DIR)/karma start --single-run

View File

@ -1,9 +0,0 @@
<html>
<head><meta charset="UTF-8"></head>
<body>
<!-- This is a dummy page used in
tests of Wombat's live-rewriting
functionality.
!-->
</body>
</html>

View File

@ -1,108 +0,0 @@
// Settings passed to Karma under the `sauceLabs` key.
//
// When TRAVIS_JOB_NUMBER is set, `startConnect` is turned off and the job
// number is used as the tunnel identifier; see
// https://github.com/karma-runner/karma-sauce-launcher/issues/73
var sauceLabsConfig = Object.assign(
  { testName: 'pywb Client Tests' },
  process.env.TRAVIS_JOB_NUMBER
    ? {
        startConnect: false,
        tunnelIdentifier: process.env.TRAVIS_JOB_NUMBER,
      }
    : {}
);
// Path of the Wombat script that the Karma config serves to the tests.
var WOMBAT_JS_PATH = 'pywb/static/wombat.js';

// Browser launchers used when Sauce Labs credentials are available.
var sauceLaunchers = {
  sl_chrome: {
    base: 'SauceLabs',
    browserName: 'chrome',
  },
  sl_firefox: {
    base: 'SauceLabs',
    browserName: 'firefox',
  },
  sl_safari: {
    base: 'SauceLabs',
    browserName: 'safari',
    platform: 'OS X 10.11',
    version: '9.0',
  },
  sl_edge: {
    base: 'SauceLabs',
    browserName: 'MicrosoftEdge',
  },
};

// Fallback launchers used when no Sauce Labs credentials are set.
var localLaunchers = {
  localFirefox: {
    base: 'Firefox',
  },
};

// Pick the launcher table: Sauce Labs when both SAUCE_USERNAME and
// SAUCE_ACCESS_KEY are set, otherwise the local browsers only.
var customLaunchers = {};
if (process.env['SAUCE_USERNAME'] && process.env['SAUCE_ACCESS_KEY']) {
  customLaunchers = sauceLaunchers;
} else {
  // Each fragment ends with a space (or is the last one) so the sentences
  // do not run together in the printed message.
  console.error('Sauce Labs account details not set, ' +
    'Karma tests will be run only against local browsers. ' +
    'Set SAUCE_USERNAME and SAUCE_ACCESS_KEY environment variables to ' +
    'run tests against Sauce Labs browsers');
  customLaunchers = localLaunchers;
}
module.exports = function(config) {
config.set({
basePath: '../',
frameworks: ['mocha', 'chai'],
files: [
{
pattern: WOMBAT_JS_PATH,
watched: true,
included: false,
served: true,
},
{
pattern: 'karma-tests/dummy.html',
included: false,
served: true,
},
'karma-tests/*.spec.js',
],
preprocessors: {},
reporters: ['progress'],
port: 9876,
colors: true,
logLevel: config.LOG_INFO,
autoWatch: true,
sauceLabs: sauceLabsConfig,
// Set extended timeouts to account for the slowness
// in connecting to remote browsers (eg. when using
// Sauce Labs)
//
// See https://oligofren.wordpress.com/2014/05/27/running-karma-tests-on-browserstack/
captureTimeout: 3 * 60000,
browserNoActivityTimeout: 30 * 1000,
browserDisconnectTimeout: 10 * 1000,
browserDisconnectTolerance: 1,
customLaunchers: customLaunchers,
browsers: Object.keys(customLaunchers),
singleRun: false,
concurrency: Infinity
})
};

View File

@ -1,225 +0,0 @@
var DEFAULT_TIMEOUT = 20000;

/**
 * Creates a new document in an <iframe> and runs a WombatJS test case in it.
 *
 * A new <iframe> is used for each test so that each case is run with fresh
 * Document and Window objects, since Wombat monkey-patches many Document and
 * Window functions.
 *
 * Once the iframe has loaded, the following happens in order:
 *   1. helper globals (console, assert, reportError, domTests) are installed
 *      in the iframe,
 *   2. `testCase.initScript` is evaluated in the iframe,
 *   3. `testCase.wombatScript` is evaluated and `_WBWombat` is constructed,
 *   4. `testCase.html`, if given, replaces the iframe's body markup,
 *   5. `testCase.testScript`, if given, is evaluated in the iframe.
 *
 * Functions are passed to the iframe as source text (`func.toString()`), so
 * they cannot rely on variables captured from the enclosing scope.
 *
 * The iframe is always removed once the steps finish. Any failure in the steps
 * is reported through `done(error)` rather than thrown from the load listener.
 *
 * @param {Object} testCase
 * @param {Function} testCase.initScript - run in the iframe before Wombat loads.
 * @param {string} testCase.wombatScript - source text of the Wombat script.
 * @param {string} [testCase.html] - markup assigned to the iframe body.
 * @param {Function} [testCase.testScript] - run in the iframe after Wombat init.
 * @param {Function} done - completion callback; called with no arguments on
 *   success or with an Error on failure.
 */
function runWombatTest(testCase, done) {
  // create an <iframe>
  var testFrame = document.createElement('iframe');
  testFrame.src = '/base/karma-tests/dummy.html';
  document.body.appendChild(testFrame);

  testFrame.contentWindow.addEventListener('load', function () {
    var testDocument = testFrame.contentDocument;

    function runFunctionInIFrame(func) {
      testFrame.contentWindow.eval('(' + func.toString() + ')()');
    }

    // expose an error reporting function to the <iframe>
    window.reportError = function (ex) {
      done(new Error(ex));
    };

    // expose utility methods for assertion testing in tests.
    // (We used to expose chai asserts here but Karma's default
    // error reporter replaces URLs in exception messages with
    // the corresponding file paths, which is unhelpful for us
    // since assert.equal() will often be called with URLs in our tests)
    window.assert = {
      equal: function (a, b) {
        if (a !== b) {
          console.error('Mismatch between', a, 'and', b);
          throw new Error('AssertionError');
        }
      }
    };

    var failure = null;
    try {
      runFunctionInIFrame(function () {
        // re-assign the iframe's console object to the parent window's
        // console so that messages are intercepted by Karma
        // and output to wherever it is configured to send
        // console logs (typically stdout)
        console = window.parent.console;
        window.onerror = function (message, url, line, col, error) {
          if (error) {
            console.log(error.stack);
          }
          reportError(new Error(message));
        };

        // expose the parent window's assertion helper and error reporter
        // to the test script
        window.assert = window.parent.assert;
        window.reportError = window.parent.reportError;

        // helpers which check whether DOM property overrides are supported
        // in the current browser
        window.domTests = {
          areDOMPropertiesConfigurable: function () {
            var descriptor = Object.getOwnPropertyDescriptor(Node.prototype, 'baseURI');
            if (descriptor && !descriptor.configurable) {
              return false;
            } else {
              return true;
            }
          }
        };
      });

      try {
        runFunctionInIFrame(testCase.initScript);
      } catch (e) {
        throw new Error('Configuring Wombat failed: ' + e.toString());
      }

      try {
        testFrame.contentWindow.eval(testCase.wombatScript);
        runFunctionInIFrame(function () {
          new window._WBWombat(window, wbinfo);
        });
      } catch (e) {
        console.error(e.stack);
        throw new Error('Initializing WombatJS failed: ' + e.toString());
      }

      if (testCase.html) {
        testDocument.body.innerHTML = testCase.html;
      }

      if (testCase.testScript) {
        try {
          runFunctionInIFrame(testCase.testScript);
        } catch (e) {
          throw new Error('Test script failed: ' + e.toString());
        }
      }
    } catch (e) {
      // remember the failure so it can be handed to done() below
      failure = e;
    } finally {
      // never leave the iframe attached, even when a step failed
      testFrame.remove();
    }

    if (failure) {
      done(failure);
    } else {
      done();
    }
  });
}
// Mocha suite for WombatJS.
//
// Every case goes through runWombatTest(). The `initScript` and `testScript`
// functions are handed to the iframe as source text, so they must be
// self-contained: they may only use globals that exist inside the iframe
// (e.g. `wbinfo`, `assert`, `domTests`), never variables from this file.
describe('WombatJS', function () {
  this.timeout(DEFAULT_TIMEOUT);

  // source text of wombat.js, fetched once before the cases run
  var wombatScript;

  before(function (done) {
    // load the source of the WombatJS content
    // rewriting script
    var req = new XMLHttpRequest();
    req.open('GET', '/base/pywb/static/wombat.js');
    req.onload = function () {
      // onload also fires for HTTP error responses; do not treat an error
      // page as the Wombat source
      if (req.status < 200 || req.status >= 300) {
        done(new Error('Failed to load wombat.js: HTTP status ' + req.status));
        return;
      }
      wombatScript = req.responseText;
      done();
    };
    req.onerror = function () {
      // fail right away instead of waiting for the suite timeout
      done(new Error('Failed to load wombat.js: network error'));
    };
    req.send();
  });

  it('should load', function (done) {
    runWombatTest({
      initScript: function () {
        wbinfo = {
          wombat_opts: {},
          wombat_ts: '',
          is_live: false,
          top_url: ''
        };
      },
      wombatScript: wombatScript,
    }, done);
  });

  describe('anchor rewriting', function () {
    var config;

    beforeEach(function () {
      config = {
        initScript: function () {
          wbinfo = {
            wombat_opts: {},
            wombat_scheme: 'http',
            prefix: window.location.origin,
            wombat_ts: '',
            is_live: false,
            top_url: ''
          };
        },
        wombatScript: wombatScript,
        html: '<a href="foobar.html" id="link">A link</a>',
      };
    });

    it('should rewrite links in dynamically injected <a> tags', function (done) {
      config.testScript = function () {
        if (domTests.areDOMPropertiesConfigurable()) {
          var link = document.getElementById('link');
          assert.equal(link.href, 'http:///base/karma-tests/foobar.html');
        }
      };
      runWombatTest(config, done);
    });

    it('toString() should return the rewritten URL', function (done) {
      config.testScript = function () {
        if (domTests.areDOMPropertiesConfigurable()) {
          var link = document.getElementById('link');
          assert.equal(link.href, link.toString());
        }
      };
      runWombatTest(config, done);
    });
  });

  describe('base URL overrides', function () {
    it('document.baseURI should return the original URL', function (done) {
      runWombatTest({
        initScript: function () {
          wbinfo = {
            wombat_opts: {},
            prefix: window.location.origin,
            wombat_ts: '',
            wombat_scheme: 'http',
            is_live: false,
            top_url: ''
          };
        },
        wombatScript: wombatScript,
        testScript: function () {
          var baseURI = document.baseURI;
          if (typeof baseURI !== 'string') {
            throw new Error('baseURI is not a string');
          }
          if (domTests.areDOMPropertiesConfigurable()) {
            assert.equal(baseURI, 'http:///base/karma-tests/dummy.html');
          }
        },
      }, done);
    });

    it('should allow base.href to be assigned', function (done) {
      runWombatTest({
        initScript: function () {
          wbinfo = {
            wombat_opts: {},
            wombat_scheme: 'http',
            is_live: false,
            top_url: ''
          };
        },
        wombatScript: wombatScript,
        testScript: function () {
          'use strict';
          var baseElement = document.createElement('base');
          baseElement.href = 'http://foobar.com/base';
          assert.equal(baseElement.href, 'http://foobar.com/base');
        },
      }, done);
    });
  });
});

View File

@ -238,7 +238,8 @@ class RewriterApp(object):
host_prefix = self.get_host_prefix(environ) host_prefix = self.get_host_prefix(environ)
rel_prefix = self.get_rel_prefix(environ) rel_prefix = self.get_rel_prefix(environ)
full_prefix = host_prefix + rel_prefix full_prefix = host_prefix + rel_prefix
pywb_static_prefix = environ.get('pywb.host_prefix', '') + environ.get('pywb.app_prefix', '') + environ.get(
'pywb.static_prefix', '/static/')
is_proxy = ('wsgiprox.proxy_host' in environ) is_proxy = ('wsgiprox.proxy_host' in environ)
response = self.handle_custom_response(environ, wb_url, response = self.handle_custom_response(environ, wb_url,
@ -257,7 +258,8 @@ class RewriterApp(object):
urlrewriter = UrlRewriter(wb_url, urlrewriter = UrlRewriter(wb_url,
prefix=full_prefix, prefix=full_prefix,
full_prefix=full_prefix, full_prefix=full_prefix,
rel_prefix=rel_prefix) rel_prefix=rel_prefix,
pywb_static_prefix=pywb_static_prefix)
framed_replay = self.framed_replay framed_replay = self.framed_replay

View File

@ -15,6 +15,8 @@ from pywb.utils.io import StreamIter, BUFF_SIZE
from pywb.utils.loaders import load_yaml_config, load_py_name from pywb.utils.loaders import load_yaml_config, load_py_name
WORKER_MODS = {"wkr_", "sw_"} # type: Set[str]
# ============================================================================ # ============================================================================
class BaseContentRewriter(object): class BaseContentRewriter(object):
@ -423,8 +425,8 @@ class RewriteInfo(object):
def _resolve_text_type(self, text_type): def _resolve_text_type(self, text_type):
mod = self.url_rewriter.wburl.mod mod = self.url_rewriter.wburl.mod
if mod == 'sw_' or mod == 'wkr_': if mod in WORKER_MODS:
return None return 'js-worker'
if text_type == 'css' and mod == 'js_': if text_type == 'css' and mod == 'js_':
text_type = 'css' text_type = 'css'
@ -495,7 +497,7 @@ class RewriteInfo(object):
return True return True
def is_url_rw(self): def is_url_rw(self):
if self.url_rewriter.wburl.mod in ('id_', 'bn_', 'sw_', 'wkr_'): if self.url_rewriter.wburl.mod in ('id_', 'bn_', 'wkrf_'):
return False return False
return True return True

View File

@ -15,6 +15,8 @@ from pywb.rewrite.rewrite_dash import RewriteDASH
from pywb.rewrite.rewrite_hls import RewriteHLS from pywb.rewrite.rewrite_hls import RewriteHLS
from pywb.rewrite.rewrite_amf import RewriteAMF from pywb.rewrite.rewrite_amf import RewriteAMF
from pywb.rewrite.rewrite_js_workers import JSWorkerRewriter
from pywb import DEFAULT_RULES_FILE from pywb import DEFAULT_RULES_FILE
import copy import copy
@ -34,6 +36,7 @@ class DefaultRewriter(BaseContentRewriter):
'js': JSLocationOnlyRewriter, 'js': JSLocationOnlyRewriter,
'js-proxy': JSNoneRewriter, 'js-proxy': JSNoneRewriter,
'js-worker': JSWorkerRewriter,
'json': JSONPRewriter, 'json': JSONPRewriter,

View File

@ -58,7 +58,7 @@ class HTMLRewriterMixin(StreamingRewriter):
'embed': {'src': 'oe_'}, 'embed': {'src': 'oe_'},
'head': {'': defmod}, # for head rewriting 'head': {'': defmod}, # for head rewriting
'iframe': {'src': 'if_'}, 'iframe': {'src': 'if_'},
'image': {'src': 'im_', 'xlink:href': 'im_'}, 'image': {'src': 'im_', 'xlink:href': 'im_', 'href': 'im_'},
'img': {'src': 'im_', 'img': {'src': 'im_',
'srcset': 'im_'}, 'srcset': 'im_'},
'ins': {'cite': defmod}, 'ins': {'cite': defmod},
@ -74,7 +74,7 @@ class HTMLRewriterMixin(StreamingRewriter):
'q': {'cite': defmod}, 'q': {'cite': defmod},
'ref': {'href': 'oe_'}, 'ref': {'href': 'oe_'},
'script': {'src': 'js_', 'xlink:href': 'js_'}, # covers both HTML and SVG script tags 'script': {'src': 'js_', 'xlink:href': 'js_'}, # covers both HTML and SVG script tags
'source': {'src': 'oe_'}, 'source': {'src': 'oe_', 'srcset': 'oe_'},
'video': {'src': 'oe_', 'video': {'src': 'oe_',
'poster': 'im_'}, 'poster': 'im_'},
} }

View File

@ -67,14 +67,24 @@ return (self._wb_wombat && self._wb_wombat.local_init &&\
self._wb_wombat.local_init(name)) || self[name]; }};\n\ self._wb_wombat.local_init(name)) || self[name]; }};\n\
if (!self.__WB_pmw) {{ self.__WB_pmw = function(obj) {{ return obj; }} }}\n\ if (!self.__WB_pmw) {{ self.__WB_pmw = function(obj) {{ return obj; }} }}\n\
{{\n' {{\n'
local_check_this_fn = 'var {0} = function (thisObj) {{ \
if (thisObj && thisObj._WB_wombat_obj_proxy) return thisObj._WB_wombat_obj_proxy; return thisObj; }};'
local_init_func_name = '_____WB$wombat$assign$function_____' local_init_func_name = '_____WB$wombat$assign$function_____'
local_var_line = 'let {0} = {1}("{0}");' local_var_line = 'let {0} = {1}("{0}");'
this_rw = '(this && this._WB_wombat_obj_proxy || this)' local_check_this_func_name = '_____WB$wombat$check$this$function_____'
check_loc = '(self.__WB_check_loc && self.__WB_check_loc(location) || {}).href = ' # we must use a function to perform the this check because most minifiers reduce the number of statements
# by turning everything into one or more expressions. Our previous rewrite was a logical expression,
# (this && this._WB_wombat_obj_proxy || this), that would cause the outer expression to be invalid when
# it was used as the LHS of certain expressions,
# e.g. assignment expressions containing a non-parenthesized logical expression.
# By using a function, the expression injected is a call expression that plays nicely in those cases
this_rw = '_____WB$wombat$check$this$function_____(this)'
check_loc = '((self.__WB_check_loc && self.__WB_check_loc(location)) || {}).href = '
self.local_objs = [ self.local_objs = [
'window', 'window',
@ -84,8 +94,8 @@ if (!self.__WB_pmw) {{ self.__WB_pmw = function(obj) {{ return obj; }} }}\n\
'top', 'top',
'parent', 'parent',
'frames', 'frames',
'opener'] 'opener'
]
local_declares = '\n'.join([local_var_line.format(obj, local_init_func_name) for obj in self.local_objs]) local_declares = '\n'.join([local_var_line.format(obj, local_init_func_name) for obj in self.local_objs])
@ -104,7 +114,8 @@ if (!self.__WB_pmw) {{ self.__WB_pmw = function(obj) {{ return obj; }} }}\n\
super(JSWombatProxyRules, self).__init__(rules) super(JSWombatProxyRules, self).__init__(rules)
self.first_buff = local_init_func.format(local_init_func_name) + local_declares self.first_buff = local_check_this_fn.format(local_check_this_func_name) + local_init_func.format(
local_init_func_name) + local_declares + '\n\n'
self.last_buff = '\n\n}' self.last_buff = '\n\n}'

View File

@ -0,0 +1,30 @@
import json

from pywb.rewrite.content_rewriter import StreamingRewriter, WORKER_MODS
__all__ = ["JSWorkerRewriter"]
# Snippet placed in front of the worker script: imports wombatWorkers.js and
# constructs WBWombat with the init options.
INJECT = "(function() { self.importScripts('%s'); new WBWombat(%s); })();"

# Template for the WBWombat init options. Each %s slot receives a value that
# has already been encoded with json.dumps, so quotes and backslashes in the
# prefix or the original URL cannot break out of the generated string literal.
INIT = '{"prefix": %s, "prefixMod": %s, "originalURL": %s}'


class JSWorkerRewriter(StreamingRewriter):
    """A simple rewriter for rewriting web or service workers.

    The only rewriting that occurs is the injection of the init code
    for wombatWorkers.js, which is set as ``first_buff`` when the URL's
    modifier is one of ``WORKER_MODS``.
    """

    def __init__(self, url_rewriter, align_to_line=True, first_buff=''):
        """Initialize a new JSWorkerRewriter

        :param UrlRewriter url_rewriter: The url rewriter for this rewrite
        :param bool align_to_line: Should the response stream be aligned to line boundaries
        :param str first_buff: The first string to be added to the rewrite
        :rtype: None
        """
        super(JSWorkerRewriter, self).__init__(url_rewriter, align_to_line, first_buff)
        wb_url = self.url_rewriter.wburl
        if wb_url.mod in WORKER_MODS:
            rw_url = self.url_rewriter.pywb_static_prefix + "wombatWorkers.js"
            prefix = self.url_rewriter.full_prefix
            # JSON-encode every value: the original URL comes from the request
            # and may contain characters that are special inside a JS string.
            init = INIT % (
                json.dumps(prefix),
                json.dumps(prefix + 'wkrf_/'),
                json.dumps(wb_url.url),
            )
            self.first_buff = INJECT % (rw_url, init)

View File

@ -235,24 +235,22 @@ class TestContentRewriter(object):
def test_rewrite_sw_add_headers(self): def test_rewrite_sw_add_headers(self):
headers = {'Content-Type': 'application/x-javascript'} headers = {'Content-Type': 'application/x-javascript'}
content = 'function() { location.href = "http://example.com/"; }' content = "function() { location.href = 'http://example.com/'; }"
headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701sw_') headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701sw_')
assert ('Content-Type', 'application/x-javascript') in headers.headers assert ('Content-Type', 'application/x-javascript') in headers.headers
assert ('Service-Worker-Allowed', 'http://localhost:8080/prefix/201701mp_/http://example.com/') in headers.headers assert ('Service-Worker-Allowed', 'http://localhost:8080/prefix/201701mp_/http://example.com/') in headers.headers
exp = 'function() { location.href = "http://example.com/"; }' assert "self.importScripts('wombatWorkers.js');" in b''.join(gen).decode('utf-8')
assert b''.join(gen).decode('utf-8') == exp
def test_rewrite_worker(self): def test_rewrite_worker(self):
headers = {'Content-Type': 'application/x-javascript'} headers = {'Content-Type': 'application/x-javascript'}
content = 'importScripts("http://example.com/js.js")' content = "importScripts('http://example.com/js.js')"
rwheaders, gen, is_rw = self.rewrite_record(headers, content, ts='201701wkr_') rwheaders, gen, is_rw = self.rewrite_record(headers, content, ts='201701wkr_')
exp = 'importScripts("http://example.com/js.js")' assert "self.importScripts('wombatWorkers.js');" in b''.join(gen).decode('utf-8')
assert b''.join(gen).decode('utf-8') == exp
def test_banner_only_no_cookie_rewrite(self): def test_banner_only_no_cookie_rewrite(self):
headers = {'Set-Cookie': 'foo=bar; Expires=Wed, 13 Jan 2021 22:23:01 GMT; Path=/', headers = {'Set-Cookie': 'foo=bar; Expires=Wed, 13 Jan 2021 22:23:01 GMT; Path=/',

View File

@ -389,7 +389,7 @@ r"""
# parse attr with js proxy, rewrite location assignment # parse attr with js proxy, rewrite location assignment
>>> parse('<html><a href="javascript:location=\'foo.html\'"></a></html>', js_proxy=True) >>> parse('<html><a href="javascript:location=\'foo.html\'"></a></html>', js_proxy=True)
<html><a href="javascript:{ location=(self.__WB_check_loc && self.__WB_check_loc(location) || {}).href = 'foo.html' }"></a></html> <html><a href="javascript:{ location=((self.__WB_check_loc && self.__WB_check_loc(location)) || {}).href = 'foo.html' }"></a></html>
# parse attr with js proxy, assigning to location.href, no location assignment rewrite needed # parse attr with js proxy, assigning to location.href, no location assignment rewrite needed
>>> parse('<html><a href="javascript:location.href=\'foo.html\'"></a></html>', js_proxy=True) >>> parse('<html><a href="javascript:location.href=\'foo.html\'"></a></html>', js_proxy=True)

View File

@ -131,49 +131,49 @@ r"""
#================================================================= #=================================================================
>>> _test_js_obj_proxy('var foo = this; location = bar') >>> _test_js_obj_proxy('var foo = this; location = bar')
'var foo = (this && this._WB_wombat_obj_proxy || this); location = (self.__WB_check_loc && self.__WB_check_loc(location) || {}).href = bar' 'var foo = _____WB$wombat$check$this$function_____(this); location = ((self.__WB_check_loc && self.__WB_check_loc(location)) || {}).href = bar'
>>> _test_js_obj_proxy('var that = this\n location = bar') >>> _test_js_obj_proxy('var that = this\n location = bar')
'var that = (this && this._WB_wombat_obj_proxy || this)\n location = (self.__WB_check_loc && self.__WB_check_loc(location) || {}).href = bar' 'var that = _____WB$wombat$check$this$function_____(this)\n location = ((self.__WB_check_loc && self.__WB_check_loc(location)) || {}).href = bar'
>>> _test_js_obj_proxy('location = "xyz"') >>> _test_js_obj_proxy('location = "xyz"')
'location = (self.__WB_check_loc && self.__WB_check_loc(location) || {}).href = "xyz"' 'location = ((self.__WB_check_loc && self.__WB_check_loc(location)) || {}).href = "xyz"'
>>> _test_js_obj_proxy('var foo = this.location') >>> _test_js_obj_proxy('var foo = this.location')
'var foo = (this && this._WB_wombat_obj_proxy || this).location' 'var foo = _____WB$wombat$check$this$function_____(this).location'
>>> _test_js_obj_proxy('A = B\nthis.location = "foo"') >>> _test_js_obj_proxy('A = B\nthis.location = "foo"')
'A = B\n;(this && this._WB_wombat_obj_proxy || this).location = "foo"' 'A = B\n;_____WB$wombat$check$this$function_____(this).location = "foo"'
>>> _test_js_obj_proxy('var foo = this.location2') >>> _test_js_obj_proxy('var foo = this.location2')
'var foo = this.location2' 'var foo = this.location2'
>>> _test_js_obj_proxy('func(Function("return this"));') >>> _test_js_obj_proxy('func(Function("return this"));')
'func(Function("return (this && this._WB_wombat_obj_proxy || this)"));' 'func(Function("return _____WB$wombat$check$this$function_____(this)"));'
>>> _test_js_obj_proxy('A.call(function() { return this });') >>> _test_js_obj_proxy('A.call(function() { return this });')
'A.call(function() { return (this && this._WB_wombat_obj_proxy || this) });' 'A.call(function() { return _____WB$wombat$check$this$function_____(this) });'
>>> _test_js_obj_proxy('this.document.location = foo') >>> _test_js_obj_proxy('this.document.location = foo')
'(this && this._WB_wombat_obj_proxy || this).document.location = foo' '_____WB$wombat$check$this$function_____(this).document.location = foo'
>>> _test_js_obj_proxy('if (that != this) { ... }') >>> _test_js_obj_proxy('if (that != this) { ... }')
'if (that != (this && this._WB_wombat_obj_proxy || this)) { ... }' 'if (that != _____WB$wombat$check$this$function_____(this)) { ... }'
>>> _test_js_obj_proxy('function(){...} (this)') >>> _test_js_obj_proxy('function(){...} (this)')
'function(){...} ((this && this._WB_wombat_obj_proxy || this))' 'function(){...} (_____WB$wombat$check$this$function_____(this))'
>>> _test_js_obj_proxy('function(){...} ) (this); foo(this)') >>> _test_js_obj_proxy('function(){...} ) (this); foo(this)')
'function(){...} ) ((this && this._WB_wombat_obj_proxy || this)); foo(this)' 'function(){...} ) (_____WB$wombat$check$this$function_____(this)); foo(this)'
>>> _test_js_obj_proxy('var foo = that || this ;') >>> _test_js_obj_proxy('var foo = that || this ;')
'var foo = that || (this && this._WB_wombat_obj_proxy || this) ;' 'var foo = that || _____WB$wombat$check$this$function_____(this) ;'
>>> _test_js_obj_proxy('a||this||that') >>> _test_js_obj_proxy('a||this||that')
'a||(this && this._WB_wombat_obj_proxy || this)||that' 'a||_____WB$wombat$check$this$function_____(this)||that'
>>> _test_js_obj_proxy('a||this)') >>> _test_js_obj_proxy('a||this)')
'a||(this && this._WB_wombat_obj_proxy || this))' 'a||_____WB$wombat$check$this$function_____(this))'
# not rewritten # not rewritten
>>> _test_js_obj_proxy('var window = this$') >>> _test_js_obj_proxy('var window = this$')
@ -207,7 +207,7 @@ r"""
'this. alocation = http://example.com/' 'this. alocation = http://example.com/'
>>> _test_js_obj_proxy(r'this. location = http://example.com/') >>> _test_js_obj_proxy(r'this. location = http://example.com/')
'this. location = (self.__WB_check_loc && self.__WB_check_loc(location) || {}).href = http://example.com/' 'this. location = ((self.__WB_check_loc && self.__WB_check_loc(location)) || {}).href = http://example.com/'

View File

@ -23,7 +23,7 @@ class UrlRewriter(object):
REL_PATH = '/' REL_PATH = '/'
def __init__(self, wburl, prefix='', full_prefix=None, rel_prefix=None, def __init__(self, wburl, prefix='', full_prefix=None, rel_prefix=None,
root_path=None, cookie_scope=None, rewrite_opts=None): root_path=None, cookie_scope=None, rewrite_opts=None, pywb_static_prefix=None):
self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl) self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl)
self.prefix = prefix self.prefix = prefix
self.full_prefix = full_prefix or prefix self.full_prefix = full_prefix or prefix
@ -36,10 +36,22 @@ class UrlRewriter(object):
self.prefix_abs = self.prefix and self.prefix.startswith(self.PROTOCOLS) self.prefix_abs = self.prefix and self.prefix.startswith(self.PROTOCOLS)
self.cookie_scope = cookie_scope self.cookie_scope = cookie_scope
self.rewrite_opts = rewrite_opts or {} self.rewrite_opts = rewrite_opts or {}
self._pywb_static_prefix = pywb_static_prefix
if self.rewrite_opts.get('punycode_links'): if self.rewrite_opts.get('punycode_links'):
self.wburl._do_percent_encode = False self.wburl._do_percent_encode = False
@property
def pywb_static_prefix(self):
    """Return the URL of the static path.

    An unset prefix yields the empty string. A prefix that already starts
    with one of ``PROTOCOLS`` is returned unchanged; any other prefix is
    joined onto ``full_prefix``.

    :rtype: str
    """
    static_prefix = self._pywb_static_prefix
    if static_prefix is None:
        return ''
    if not static_prefix.startswith(self.PROTOCOLS):
        static_prefix = self.urljoin(self.full_prefix, static_prefix)
    return static_prefix
def rewrite(self, url, mod=None, force_abs=False): def rewrite(self, url, mod=None, force_abs=False):
# if special protocol, no rewriting at all # if special protocol, no rewriting at all
if url.startswith(self.NO_REWRITE_URI_PREFIX): if url.startswith(self.NO_REWRITE_URI_PREFIX):

View File

@ -120,12 +120,15 @@ AutoFetcher.prototype.fetchAV = function () {
// we limit how many we fetch at a time drastically // we limit how many we fetch at a time drastically
this.queuingAV = true; this.queuingAV = true;
var runningFetchers = []; var runningFetchers = [];
while (this.avQueue.length > 0 && runningFetchers.length <= DefaultNumAvFetches) { while (
runningFetchers.push(fetch(this.avQueue.shift()).catch(noop)) this.avQueue.length > 0 &&
runningFetchers.length <= DefaultNumAvFetches
) {
runningFetchers.push(fetch(this.avQueue.shift()).catch(noop));
} }
if (this.avQueue.length <= FullAVQDrainLen) { if (this.avQueue.length <= FullAVQDrainLen) {
while (this.avQueue.length > 0) { while (this.avQueue.length > 0) {
runningFetchers.push(fetch(this.avQueue.shift()).catch(noop)) runningFetchers.push(fetch(this.avQueue.shift()).catch(noop));
} }
} }
Promise.all(runningFetchers) Promise.all(runningFetchers)
@ -142,12 +145,15 @@ AutoFetcher.prototype.fetchImgs = function () {
// we add them to the current batch // we add them to the current batch
this.queuing = true; this.queuing = true;
var runningFetchers = []; var runningFetchers = [];
while (this.queue.length > 0 && runningFetchers.length <= DefaultNumImFetches) { while (
runningFetchers.push(fetch(this.queue.shift()).catch(noop)) this.queue.length > 0 &&
runningFetchers.length <= DefaultNumImFetches
) {
runningFetchers.push(fetch(this.queue.shift()).catch(noop));
} }
if (this.queue.length <= FullImgQDrainLen) { if (this.queue.length <= FullImgQDrainLen) {
while (this.queue.length > 0) { while (this.queue.length > 0) {
runningFetchers.push(fetch(this.queue.shift()).catch(noop)) runningFetchers.push(fetch(this.queue.shift()).catch(noop));
} }
} }
Promise.all(runningFetchers) Promise.all(runningFetchers)
@ -227,7 +233,14 @@ AutoFetcher.prototype.maybeFixUpURL = function (url, resolveOpts) {
return this.prefixMod + '/' + url; return this.prefixMod + '/' + url;
}; };
AutoFetcher.prototype.urlExtractor = function (match, n1, n2, n3, offset, string) { AutoFetcher.prototype.urlExtractor = function(
match,
n1,
n2,
n3,
offset,
string
) {
// Same function as style_replacer in wombat.rewrite_style, n2 is our URL // Same function as style_replacer in wombat.rewrite_style, n2 is our URL
this.queueNonAVURL(n2); this.queueNonAVURL(n2);
return n1 + n2 + n3; return n1 + n2 + n3;
@ -245,13 +258,17 @@ AutoFetcher.prototype.handleMedia = function (mediaRules) {
}; };
AutoFetcher.prototype.handleSrc = function(srcValues, context) { AutoFetcher.prototype.handleSrc = function(srcValues, context) {
var resolveOpts = { 'docBaseURI': context.docBaseURI }; var resolveOpts = { docBaseURI: context.docBaseURI };
if (srcValues.value) { if (srcValues.value) {
resolveOpts.mod = srcValues.mod; resolveOpts.mod = srcValues.mod;
if (resolveOpts.mod === 1) { if (resolveOpts.mod === 1) {
return this.queueNonAVURL(this.maybeFixUpURL(srcValues.value.trim(), resolveOpts)); return this.queueNonAVURL(
this.maybeFixUpURL(srcValues.value.trim(), resolveOpts)
);
} }
return this.queueAVURL(this.maybeFixUpURL(srcValues.value.trim(), resolveOpts)); return this.queueAVURL(
this.maybeFixUpURL(srcValues.value.trim(), resolveOpts)
);
} }
var len = srcValues.values.length; var len = srcValues.values.length;
for (var i = 0; i < len; i++) { for (var i = 0; i < len; i++) {
@ -266,6 +283,7 @@ AutoFetcher.prototype.handleSrc = function (srcValues, context) {
}; };
AutoFetcher.prototype.extractSrcSetNotPreSplit = function(ssV, resolveOpts) { AutoFetcher.prototype.extractSrcSetNotPreSplit = function(ssV, resolveOpts) {
if (!ssV) return;
// was from extract from local doc so we need to duplicate work // was from extract from local doc so we need to duplicate work
var srcsetValues = ssV.split(srcsetSplit); var srcsetValues = ssV.split(srcsetSplit);
for (var i = 0; i < srcsetValues.length; i++) { for (var i = 0; i < srcsetValues.length; i++) {
@ -296,7 +314,7 @@ AutoFetcher.prototype.extractSrcset = function (srcsets, context) {
}; };
AutoFetcher.prototype.handleSrcset = function(srcset, context) { AutoFetcher.prototype.handleSrcset = function(srcset, context) {
var resolveOpts = { 'docBaseURI': context.docBaseURI }; var resolveOpts = { docBaseURI: context.docBaseURI };
if (srcset.value) { if (srcset.value) {
// we have a single value, this srcset came from either // we have a single value, this srcset came from either
// preserveDataSrcset (not presplit) preserveSrcset (presplit) // preserveDataSrcset (not presplit) preserveSrcset (presplit)
@ -318,7 +336,6 @@ AutoFetcher.prototype.handleSrcset = function (srcset, context) {
} }
}; };
AutoFetcher.prototype.autoFetch = function(data) { AutoFetcher.prototype.autoFetch = function(data) {
// we got a message and now we autofetch! // we got a message and now we autofetch!
// these calls turn into no ops if they have no work // these calls turn into no ops if they have no work

View File

@ -31,7 +31,6 @@ if (typeof self.Promise === 'undefined') {
}; };
} }
if (typeof self.fetch === 'undefined') { if (typeof self.fetch === 'undefined') {
// not kewl we must polyfill fetch. // not kewl we must polyfill fetch.
self.fetch = function(url) { self.fetch = function(url) {
@ -119,12 +118,15 @@ AutoFetcher.prototype.fetchAV = function () {
// we limit how many we fetch at a time drastically // we limit how many we fetch at a time drastically
this.queuingAV = true; this.queuingAV = true;
var runningFetchers = []; var runningFetchers = [];
while (this.avQueue.length > 0 && runningFetchers.length <= DefaultNumAvFetches) { while (
runningFetchers.push(fetch(this.avQueue.shift()).catch(noop)) this.avQueue.length > 0 &&
runningFetchers.length <= DefaultNumAvFetches
) {
runningFetchers.push(fetch(this.avQueue.shift()).catch(noop));
} }
if (this.avQueue.length <= FullAVQDrainLen) { if (this.avQueue.length <= FullAVQDrainLen) {
while (this.avQueue.length > 0) { while (this.avQueue.length > 0) {
runningFetchers.push(fetch(this.avQueue.shift()).catch(noop)) runningFetchers.push(fetch(this.avQueue.shift()).catch(noop));
} }
} }
Promise.all(runningFetchers) Promise.all(runningFetchers)
@ -141,12 +143,15 @@ AutoFetcher.prototype.fetchImgs = function () {
// we add them to the current batch // we add them to the current batch
this.queuing = true; this.queuing = true;
var runningFetchers = []; var runningFetchers = [];
while (this.queue.length > 0 && runningFetchers.length <= DefaultNumImFetches) { while (
runningFetchers.push(fetch(this.queue.shift()).catch(noop)) this.queue.length > 0 &&
runningFetchers.length <= DefaultNumImFetches
) {
runningFetchers.push(fetch(this.queue.shift()).catch(noop));
} }
if (this.queue.length <= FullImgQDrainLen) { if (this.queue.length <= FullImgQDrainLen) {
while (this.queue.length > 0) { while (this.queue.length > 0) {
runningFetchers.push(fetch(this.queue.shift()).catch(noop)) runningFetchers.push(fetch(this.queue.shift()).catch(noop));
} }
} }
Promise.all(runningFetchers) Promise.all(runningFetchers)
@ -180,7 +185,7 @@ AutoFetcher.prototype.safeResolve = function (url, resolver) {
var resolvedURL = url; var resolvedURL = url;
if (resolver) { if (resolver) {
try { try {
resolvedURL = (new URL(url, resolver)).href resolvedURL = new URL(url, resolver).href;
} catch (e) { } catch (e) {
resolvedURL = url; resolvedURL = url;
} }
@ -188,8 +193,14 @@ AutoFetcher.prototype.safeResolve = function (url, resolver) {
return resolvedURL; return resolvedURL;
}; };
AutoFetcher.prototype.urlExtractor = function(
AutoFetcher.prototype.urlExtractor = function (match, n1, n2, n3, offset, string) { match,
n1,
n2,
n3,
offset,
string
) {
// Same function as style_replacer in wombat.rewrite_style, n2 is our URL // Same function as style_replacer in wombat.rewrite_style, n2 is our URL
// this.currentResolver is set to the URL which the browser would normally // this.currentResolver is set to the URL which the browser would normally
// resolve relative urls with (URL of the stylesheet) in an exceptionless manner // resolve relative urls with (URL of the stylesheet) in an exceptionless manner
@ -224,19 +235,27 @@ AutoFetcher.prototype.extractSrcset = function (srcsets) {
for (var i = 0; i < length; i++) { for (var i = 0; i < length; i++) {
extractedSrcSet = srcsets[i]; extractedSrcSet = srcsets[i];
ssSplit = extractedSrcSet.srcset.split(srcsetSplit); ssSplit = extractedSrcSet.srcset.split(srcsetSplit);
console.log(ssSplit);
for (j = 0; j < ssSplit.length; j++) { for (j = 0; j < ssSplit.length; j++) {
if (ssSplit[j]) { if (ssSplit[j]) {
srcsetValue = ssSplit[j].trim(); srcsetValue = ssSplit[j].trim();
if (srcsetValue.length > 0) { if (srcsetValue.length > 0) {
// resolve the URL in an exceptionless manner (resolvedURL will be undefined if an error occurred) // resolve the URL in an exceptionless manner (resolvedURL will be undefined if an error occurred)
var resolvedURL = this.safeResolve(srcsetValue.split(' ')[0], extractedSrcSet.resolve); var resolvedURL = this.safeResolve(
srcsetValue.split(' ')[0],
extractedSrcSet.resolve
);
if (resolvedURL) { if (resolvedURL) {
if (extractedSrcSet.mod === 'im_') { if (extractedSrcSet.mod === 'im_') {
this.queueNonAVURL(resolvedURL); this.queueNonAVURL(resolvedURL);
} else { } else {
this.queueAVURL(resolvedURL); this.queueAVURL(resolvedURL);
} }
} else {
console.log(resolvedURL);
} }
} else {
console.log(srcsetValue);
} }
} }
} }
@ -262,7 +281,6 @@ AutoFetcher.prototype.extractSrc = function (srcVals) {
} }
}; };
AutoFetcher.prototype.autofetchMediaSrcset = function(data) { AutoFetcher.prototype.autofetchMediaSrcset = function(data) {
// we got a message and now we autofetch! // we got a message and now we autofetch!
// these calls turn into no ops if they have no work // these calls turn into no ops if they have no work

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -1,82 +0,0 @@
// pywb mini rewriter for injection into web worker scripts
/**
 * Installs URL-rewriting wrappers around the worker-global networking entry
 * points: XMLHttpRequest.prototype.open, importScripts and, when available,
 * fetch. Every URL handed to those APIs is passed through rewrite_url first.
 *
 * @param {Object} info
 * @param {string} [info.prefix] - string prepended to every rewritten URL
 * @param {string} [info.originalURL] - base URL used to resolve URLs that
 *   start with '/'
 */
function WBWombat(info) {
  /**
   * Resolves origURL against info.originalURL, returning origURL unchanged
   * when resolution throws.
   */
  function maybeResolveURL(origURL) {
    try {
      return new URL(origURL, info.originalURL).href;
    } catch (e) {
      return origURL;
    }
  }

  /**
   * Returns the prefixed form of url.
   * blob: and data: URLs, and URLs that already start with info.prefix,
   * are returned untouched.
   */
  function rewrite_url(url) {
    // the wrapped APIs stringify their URL argument (e.g. URL objects), so do
    // the same here instead of throwing on the missing indexOf method
    if (typeof url !== 'string') {
      url = String(url);
    }
    // self-contained URLs must not be routed through the prefix
    if (url.indexOf('blob:') === 0 || url.indexOf('data:') === 0) return url;
    // already rewritten, adding the prefix a second time would break the URL
    if (info.prefix && url.indexOf(info.prefix) === 0) return url;
    if (url && info.originalURL && url.indexOf('/') === 0) {
      url = maybeResolveURL(url);
    }
    if (info.prefix) {
      return info.prefix + url;
    }
    return url;
  }

  /** Wraps XMLHttpRequest.prototype.open so the request URL is rewritten. */
  function init_ajax_rewrite() {
    var orig = self.XMLHttpRequest.prototype.open;

    function open_rewritten(method, url, async, user, password) {
      url = rewrite_url(url);
      // defaults to true (loose comparison kept on purpose: only values that
      // loosely equal false keep the request synchronous)
      if (async != false) {
        async = true;
      }
      var result = orig.call(this, method, url, async, user, password);
      // rewrite_url leaves data: URLs untouched, so this check can see them
      if (url.indexOf('data:') !== 0) {
        this.setRequestHeader('X-Pywb-Requested-With', 'XMLHttpRequest');
      }
      return result;
    }

    self.XMLHttpRequest.prototype.open = open_rewritten;
  }

  init_ajax_rewrite();

  /** Builds a plain array holding the rewritten form of every argument. */
  function rewriteArgs(argsObj) {
    // recreate the original arguments object just with URLs rewritten
    var newArgObj = new Array(argsObj.length);
    for (var i = 0; i < newArgObj.length; i++) {
      newArgObj[i] = rewrite_url(argsObj[i]);
    }
    return newArgObj;
  }

  var origImportScripts = self.importScripts;

  self.importScripts = function importScripts() {
    // rewrite the arguments object and call original function via fn.apply
    var rwArgs = rewriteArgs(arguments);
    return origImportScripts.apply(this, rwArgs);
  };

  if (self.fetch != null) {
    // this fetch is Worker.fetch
    var orig_fetch = self.fetch;

    self.fetch = function(input, init_opts) {
      var inputType = typeof input;
      if (inputType === 'string') {
        input = rewrite_url(input);
      } else if (inputType === 'object' && input !== null) {
        if (input.url) {
          // Request-like input: rebuild it around the rewritten URL
          var new_url = rewrite_url(input.url);
          if (new_url !== input.url) {
            input = new Request(new_url, input);
          }
        } else if (input.href) {
          // URL-like input: fetch accepts it, so rewrite its string form
          input = rewrite_url(input.href);
        }
      }
      // copy the options so the caller's object is never modified
      var opts = {};
      if (init_opts) {
        for (var key in init_opts) {
          opts[key] = init_opts[key];
        }
      }
      opts['credentials'] = 'include';
      return orig_fetch.call(this, input, opts);
    };
  }
}

View File

@ -1 +1 @@
__version__ = '2.2.20190410' __version__ = '2.3.0.dev0'

View File

@ -23,6 +23,9 @@ def fmod_sl(request):
# ============================================================================ # ============================================================================
class BaseConfigTest(BaseTestClass): class BaseConfigTest(BaseTestClass):
lint_app = True lint_app = True
extra_headers = {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'
}
@classmethod @classmethod
def get_test_app(cls, config_file, custom_config=None): def get_test_app(cls, config_file, custom_config=None):
@ -62,21 +65,34 @@ class BaseConfigTest(BaseTestClass):
assert resp.content_length > 0 assert resp.content_length > 0
def get(self, url, fmod, *args, **kwargs): def get(self, url, fmod, *args, **kwargs):
self.__ensure_headers(kwargs)
app = self.testapp if fmod else self.testapp_non_frame app = self.testapp if fmod else self.testapp_non_frame
return app.get(url.format(fmod), *args, **kwargs) return app.get(url.format(fmod), *args, **kwargs)
def post(self, url, fmod, *args, **kwargs): def post(self, url, fmod, *args, **kwargs):
self.__ensure_headers(kwargs)
app = self.testapp if fmod else self.testapp_non_frame app = self.testapp if fmod else self.testapp_non_frame
return app.post(url.format(fmod), *args, **kwargs) return app.post(url.format(fmod), *args, **kwargs)
def post_json(self, url, fmod, *args, **kwargs): def post_json(self, url, fmod, *args, **kwargs):
self.__ensure_headers(kwargs)
app = self.testapp if fmod else self.testapp_non_frame app = self.testapp if fmod else self.testapp_non_frame
return app.post_json(url.format(fmod), *args, **kwargs) return app.post_json(url.format(fmod), *args, **kwargs)
def head(self, url, fmod, *args, **kwargs): def head(self, url, fmod, *args, **kwargs):
self.__ensure_headers(kwargs)
app = self.testapp if fmod else self.testapp_non_frame app = self.testapp if fmod else self.testapp_non_frame
return app.head(url.format(fmod), *args, **kwargs) return app.head(url.format(fmod), *args, **kwargs)
def __ensure_headers(self, kwargs):
if 'headers' in kwargs:
headers = kwargs.get('headers')
else:
headers = kwargs['headers'] = {}
if isinstance(headers, dict) and 'User-Agent' not in headers:
headers['User-Agent'] = self.extra_headers['User-Agent']
#============================================================================= #=============================================================================
class CollsDirMixin(TempDirTests): class CollsDirMixin(TempDirTests):

View File

@ -31,7 +31,7 @@ class TestRootColl(BaseConfigTest):
def test_root_replay_redir(self, fmod): def test_root_replay_redir(self, fmod):
resp = self.get('/20140128051539{0}/http://www.iana.org/domains/example', fmod) resp = self.get('/20140128051539{0}/http://www.iana.org/domains/example', fmod)
assert resp.status_int == 302 assert resp.status_int in (301, 302)
assert resp.headers['Location'] == 'http://localhost:80/20140128051539{0}/https://www.iana.org/domains/reserved'.format(fmod) assert resp.headers['Location'] == 'http://localhost:80/20140128051539{0}/https://www.iana.org/domains/reserved'.format(fmod)

1
wombat Submodule

@ -0,0 +1 @@
Subproject commit 0b0c171a4f0f34114ba3cefd5ba80304515f4ef8