1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-14 15:53:28 +01:00
- Fix: a few broken tests due to iana.org requiring a user agent in its requests
rewrite:
  - introduced a new JSWorkerRewriter class to support rewriting, via wombat workers, in the context of all supported worker variants
  - ensured rewriter app correctly sets the static prefix
wombat:
 - add wombat as submodule!
This commit is contained in:
John Berlin 2019-05-15 14:42:51 -04:00 committed by Ilya Kreymer
parent 77f8bb6476
commit 22b4297fc5
25 changed files with 709 additions and 5675 deletions

3
.gitmodules vendored Normal file
View File

@ -0,0 +1,3 @@
[submodule "wombat"]
path = wombat
url = https://github.com/webrecorder/wombat

View File

@ -1,4 +0,0 @@
NODE_BIN_DIR=../node_modules/.bin
test:
$(NODE_BIN_DIR)/karma start --single-run

View File

@ -1,9 +0,0 @@
<html>
<head><meta charset="UTF-8"></head>
<body>
<!-- This is a dummy page used in
tests of Wombat's live-rewriting
functionality.
-->
</body>
</html>

View File

@ -1,108 +0,0 @@
// Sauce Labs options; passed to Karma through the `sauceLabs` key of the
// configuration object exported by this file.
var sauceLabsConfig = { testName: 'pywb Client Tests' };

// When a Travis job number is present in the environment, switch off
// `startConnect` and use the job number as the tunnel identifier.
// Background: https://github.com/karma-runner/karma-sauce-launcher/issues/73
var travisJobNumber = process.env.TRAVIS_JOB_NUMBER;
if (travisJobNumber) {
  Object.assign(sauceLabsConfig, {
    startConnect: false,
    tunnelIdentifier: travisJobNumber,
  });
}
// Location of the Wombat build under test; used as a Karma file pattern
// (presumably resolved against the `basePath` set in the exported config).
var WOMBAT_JS_PATH = 'pywb/static/wombat.js';
// Launchers used when Sauce Labs credentials are present in the environment.
// Every entry builds on the 'SauceLabs' base launcher and names the browser
// to request.
var sauceLaunchers = {
sl_chrome: {
base: 'SauceLabs',
browserName: 'chrome',
},
sl_firefox: {
base: 'SauceLabs',
browserName: 'firefox',
},
// Safari is the only launcher pinned to a specific platform and version.
sl_safari: {
base: 'SauceLabs',
browserName: 'safari',
platform: 'OS X 10.11',
version: '9.0',
},
sl_edge: {
base: 'SauceLabs',
browserName: 'MicrosoftEdge',
},
};
// Launchers used when no Sauce Labs credentials are available: a single
// launcher built on the 'Firefox' base.
var localLaunchers = {
localFirefox: {
base: 'Firefox',
},
};
// Choose which launchers Karma will use: the Sauce Labs launchers when both
// SAUCE_USERNAME and SAUCE_ACCESS_KEY are set in the environment, otherwise
// the local launchers (with an explanatory message on stderr).
var customLaunchers = {};
if (process.env['SAUCE_USERNAME'] && process.env['SAUCE_ACCESS_KEY']) {
  customLaunchers = sauceLaunchers;
} else {
  // The fragments are concatenated into one message, so each sentence must
  // carry its own trailing space.
  console.error('Sauce Labs account details not set, ' +
    'Karma tests will be run only against local browsers. ' +
    'Set SAUCE_USERNAME and SAUCE_ACCESS_KEY environment variables to ' +
    'run tests against Sauce Labs browsers');
  customLaunchers = localLaunchers;
}
module.exports = function(config) {
config.set({
basePath: '../',
frameworks: ['mocha', 'chai'],
files: [
{
pattern: WOMBAT_JS_PATH,
watched: true,
included: false,
served: true,
},
{
pattern: 'karma-tests/dummy.html',
included: false,
served: true,
},
'karma-tests/*.spec.js',
],
preprocessors: {},
reporters: ['progress'],
port: 9876,
colors: true,
logLevel: config.LOG_INFO,
autoWatch: true,
sauceLabs: sauceLabsConfig,
// Set extended timeouts to account for the slowness
// in connecting to remote browsers (eg. when using
// Sauce Labs)
//
// See https://oligofren.wordpress.com/2014/05/27/running-karma-tests-on-browserstack/
captureTimeout: 3 * 60000,
browserNoActivityTimeout: 30 * 1000,
browserDisconnectTimeout: 10 * 1000,
browserDisconnectTolerance: 1,
customLaunchers: customLaunchers,
browsers: Object.keys(customLaunchers),
singleRun: false,
concurrency: Infinity
})
};

View File

@ -1,225 +0,0 @@
// Timeout passed to this.timeout() for the whole 'WombatJS' suite
// (Mocha timeouts are expressed in milliseconds).
var DEFAULT_TIMEOUT = 20000;
// Creates a new document in an <iframe> and runs a WombatJS test case in it.
//
// A new <iframe> is used for each test so that each case is run with fresh
// Document and Window objects, since Wombat monkey-patches many Document and
// Window functions.
//
// testCase fields read here:
//   initScript   - function run inside the iframe before Wombat is evaluated
//   wombatScript - source text evaluated inside the iframe to define Wombat
//   html         - optional markup assigned to the iframe's <body>
//   testScript   - optional function run inside the iframe as the last step
// done - completion callback; called exactly once, with an Error when any
//        step failed and with no argument otherwise.
//
// The iframe is removed from the document on success and on failure.
//
function runWombatTest(testCase, done) {
  var testFrame = document.createElement('iframe');
  var finished = false;

  // Settles the test once: detaches the iframe and notifies the caller.
  // Later calls (e.g. an error reported after completion) are ignored so
  // that `done` is never invoked twice.
  function finish(err) {
    if (finished) {
      return;
    }
    finished = true;
    testFrame.remove();
    if (err) {
      done(err);
    } else {
      done();
    }
  }

  testFrame.src = '/base/karma-tests/dummy.html';
  document.body.appendChild(testFrame);

  testFrame.contentWindow.addEventListener('load', function () {
    var testDocument = testFrame.contentDocument;

    // The function is serialised with toString() and evaluated inside the
    // iframe, so it must not rely on variables closed over in this file.
    function runFunctionInIFrame(func) {
      testFrame.contentWindow.eval('(' + func.toString() + ')()');
    }

    // expose an error reporting function to the <iframe>
    window.reportError = function (ex) {
      finish(new Error(ex));
    };

    // expose utility methods for assertion testing in tests.
    // (We used to expose chai asserts here but Karma's default
    // error reporter replaces URLs in exception messages with
    // the corresponding file paths, which is unhelpful for us
    // since assert.equal() will often be called with URLs in our tests)
    window.assert = {
      equal: function (a, b) {
        if (a !== b) {
          console.error('Mismatch between', a, 'and', b);
          throw new Error('AssertionError');
        }
      }
    };

    try {
      runFunctionInIFrame(function () {
        // re-assign the iframe's console object to the parent window's
        // console so that messages are intercepted by Karma
        // and output to wherever it is configured to send
        // console logs (typically stdout)
        console = window.parent.console;

        window.onerror = function (message, url, line, col, error) {
          if (error) {
            console.log(error.stack);
          }
          reportError(new Error(message));
        };

        // expose the parent's minimal assert helper and error reporter
        // to the test script
        window.assert = window.parent.assert;
        window.reportError = window.parent.reportError;

        // helpers which check whether DOM property overrides are supported
        // in the current browser
        window.domTests = {
          areDOMPropertiesConfigurable: function () {
            var descriptor = Object.getOwnPropertyDescriptor(Node.prototype, 'baseURI');
            if (descriptor && !descriptor.configurable) {
              return false;
            } else {
              return true;
            }
          }
        };
      });

      try {
        runFunctionInIFrame(testCase.initScript);
      } catch (e) {
        throw new Error('Configuring Wombat failed: ' + e.toString());
      }

      try {
        testFrame.contentWindow.eval(testCase.wombatScript);
        runFunctionInIFrame(function () {
          new window._WBWombat(window, wbinfo);
        });
      } catch (e) {
        console.error(e.stack);
        throw new Error('Initializing WombatJS failed: ' + e.toString());
      }

      if (testCase.html) {
        testDocument.body.innerHTML = testCase.html;
      }

      if (testCase.testScript) {
        try {
          runFunctionInIFrame(testCase.testScript);
        } catch (e) {
          throw new Error('Test script failed: ' + e.toString());
        }
      }
    } catch (err) {
      // Report the failure through `done` instead of letting it escape the
      // event listener; finish() also removes the iframe.
      finish(err);
      return;
    }

    finish();
  });
}
// Test suite for the WombatJS client-side rewriting script. Each case is run
// through runWombatTest with the Wombat source loaded once in `before`.
//
// The initScript/testScript functions below are handed to runWombatTest
// together with the Wombat source; `wbinfo` is assigned without a
// declaration on purpose so that it becomes a global.
describe('WombatJS', function () {
  this.timeout(DEFAULT_TIMEOUT);

  var wombatScript;

  before(function (done) {
    // load the source of the WombatJS content
    // rewriting script
    var req = new XMLHttpRequest();
    req.open('GET', '/base/pywb/static/wombat.js');
    req.onload = function () {
      // `load` also fires for 4xx/5xx responses; an error page must not be
      // mistaken for the Wombat source.
      if (req.status < 200 || req.status >= 300) {
        done(new Error('Failed to load wombat.js: HTTP ' + req.status));
        return;
      }
      wombatScript = req.responseText;
      done();
    };
    // Fail fast on a network error instead of waiting for the suite timeout.
    req.onerror = function () {
      done(new Error('Failed to load wombat.js: network error'));
    };
    req.send();
  });

  it('should load', function (done) {
    runWombatTest({
      initScript: function () {
        wbinfo = {
          wombat_opts: {},
          wombat_ts: '',
          is_live: false,
          top_url: ''
        };
      },
      wombatScript: wombatScript,
    }, done);
  });

  describe('anchor rewriting', function () {
    var config;

    // A fresh config per test: each case attaches its own testScript.
    beforeEach(function () {
      config = {
        initScript: function () {
          wbinfo = {
            wombat_opts: {},
            wombat_scheme: 'http',
            prefix: window.location.origin,
            wombat_ts: '',
            is_live: false,
            top_url: ''
          };
        },
        wombatScript: wombatScript,
        html: '<a href="foobar.html" id="link">A link</a>',
      };
    });

    it('should rewrite links in dynamically injected <a> tags', function (done) {
      config.testScript = function () {
        // the assertion is skipped when the domTests helper reports that
        // DOM properties are not configurable in this browser
        if (domTests.areDOMPropertiesConfigurable()) {
          var link = document.getElementById('link');
          assert.equal(link.href, 'http:///base/karma-tests/foobar.html');
        }
      };
      runWombatTest(config, done);
    });

    it('toString() should return the rewritten URL', function (done) {
      config.testScript = function () {
        if (domTests.areDOMPropertiesConfigurable()) {
          var link = document.getElementById('link');
          assert.equal(link.href, link.toString());
        }
      };
      runWombatTest(config, done);
    });
  });

  describe('base URL overrides', function () {
    it('document.baseURI should return the original URL', function (done) {
      runWombatTest({
        initScript: function () {
          wbinfo = {
            wombat_opts: {},
            prefix: window.location.origin,
            wombat_ts: '',
            wombat_scheme: 'http',
            is_live: false,
            top_url: ''
          };
        },
        wombatScript: wombatScript,
        testScript: function () {
          var baseURI = document.baseURI;
          if (typeof baseURI !== 'string') {
            throw new Error('baseURI is not a string');
          }
          if (domTests.areDOMPropertiesConfigurable()) {
            assert.equal(baseURI, 'http:///base/karma-tests/dummy.html');
          }
        },
      }, done);
    });

    it('should allow base.href to be assigned', function (done) {
      runWombatTest({
        initScript: function () {
          wbinfo = {
            wombat_opts: {},
            wombat_scheme: 'http',
            is_live: false,
            top_url: ''
          };
        },
        wombatScript: wombatScript,
        testScript: function () {
          'use strict';
          var baseElement = document.createElement('base');
          baseElement.href = 'http://foobar.com/base';
          assert.equal(baseElement.href, 'http://foobar.com/base');
        },
      }, done);
    });
  });
});

View File

@ -238,7 +238,8 @@ class RewriterApp(object):
host_prefix = self.get_host_prefix(environ)
rel_prefix = self.get_rel_prefix(environ)
full_prefix = host_prefix + rel_prefix
pywb_static_prefix = environ.get('pywb.host_prefix', '') + environ.get('pywb.app_prefix', '') + environ.get(
'pywb.static_prefix', '/static/')
is_proxy = ('wsgiprox.proxy_host' in environ)
response = self.handle_custom_response(environ, wb_url,
@ -257,7 +258,8 @@ class RewriterApp(object):
urlrewriter = UrlRewriter(wb_url,
prefix=full_prefix,
full_prefix=full_prefix,
rel_prefix=rel_prefix)
rel_prefix=rel_prefix,
pywb_static_prefix=pywb_static_prefix)
framed_replay = self.framed_replay

View File

@ -15,6 +15,8 @@ from pywb.utils.io import StreamIter, BUFF_SIZE
from pywb.utils.loaders import load_yaml_config, load_py_name
WORKER_MODS = {"wkr_", "sw_"} # type: Set[str]
# ============================================================================
class BaseContentRewriter(object):
@ -423,8 +425,8 @@ class RewriteInfo(object):
def _resolve_text_type(self, text_type):
mod = self.url_rewriter.wburl.mod
if mod == 'sw_' or mod == 'wkr_':
return None
if mod in WORKER_MODS:
return 'js-worker'
if text_type == 'css' and mod == 'js_':
text_type = 'css'
@ -495,7 +497,7 @@ class RewriteInfo(object):
return True
def is_url_rw(self):
if self.url_rewriter.wburl.mod in ('id_', 'bn_', 'sw_', 'wkr_'):
if self.url_rewriter.wburl.mod in ('id_', 'bn_', 'wkrf_'):
return False
return True

View File

@ -15,6 +15,8 @@ from pywb.rewrite.rewrite_dash import RewriteDASH
from pywb.rewrite.rewrite_hls import RewriteHLS
from pywb.rewrite.rewrite_amf import RewriteAMF
from pywb.rewrite.rewrite_js_workers import JSWorkerRewriter
from pywb import DEFAULT_RULES_FILE
import copy
@ -34,6 +36,7 @@ class DefaultRewriter(BaseContentRewriter):
'js': JSLocationOnlyRewriter,
'js-proxy': JSNoneRewriter,
'js-worker': JSWorkerRewriter,
'json': JSONPRewriter,

View File

@ -58,7 +58,7 @@ class HTMLRewriterMixin(StreamingRewriter):
'embed': {'src': 'oe_'},
'head': {'': defmod}, # for head rewriting
'iframe': {'src': 'if_'},
'image': {'src': 'im_', 'xlink:href': 'im_'},
'image': {'src': 'im_', 'xlink:href': 'im_', 'href': 'im_'},
'img': {'src': 'im_',
'srcset': 'im_'},
'ins': {'cite': defmod},
@ -74,7 +74,7 @@ class HTMLRewriterMixin(StreamingRewriter):
'q': {'cite': defmod},
'ref': {'href': 'oe_'},
'script': {'src': 'js_', 'xlink:href': 'js_'}, # covers both HTML and SVG script tags
'source': {'src': 'oe_'},
'source': {'src': 'oe_', 'srcset': 'oe_'},
'video': {'src': 'oe_',
'poster': 'im_'},
}

View File

@ -63,48 +63,59 @@ class RxRules(object):
class JSWombatProxyRules(RxRules):
def __init__(self):
local_init_func = '\nvar {0} = function(name) {{\
return (self._wb_wombat && self._wb_wombat.local_init &&\
return (self._wb_wombat && self._wb_wombat.local_init && \
self._wb_wombat.local_init(name)) || self[name]; }};\n\
if (!self.__WB_pmw) {{ self.__WB_pmw = function(obj) {{ return obj; }} }}\n\
{{\n'
local_check_this_fn = 'var {0} = function (thisObj) {{ \
if (thisObj && thisObj._WB_wombat_obj_proxy) return thisObj._WB_wombat_obj_proxy; return thisObj; }};'
local_init_func_name = '_____WB$wombat$assign$function_____'
local_var_line = 'let {0} = {1}("{0}");'
this_rw = '(this && this._WB_wombat_obj_proxy || this)'
local_check_this_func_name = '_____WB$wombat$check$this$function_____'
check_loc = '(self.__WB_check_loc && self.__WB_check_loc(location) || {}).href = '
# we must use a function to perform the this check because most minifiers reduce the number of statements
# by turning everything into one or more expressions. Our previous rewrite was a logical expression,
# (this && this._WB_wombat_obj_proxy || this), that would cause the outer expression to be invalid when
# it was used as the LHS of certain expressions,
# e.g. assignment expressions containing a non-parenthesized logical expression.
# By using a function, the injected expression is a call expression that plays nicely in those cases
this_rw = '_____WB$wombat$check$this$function_____(this)'
check_loc = '((self.__WB_check_loc && self.__WB_check_loc(location)) || {}).href = '
self.local_objs = [
'window',
'self',
'document',
'location',
'top',
'parent',
'frames',
'opener']
'window',
'self',
'document',
'location',
'top',
'parent',
'frames',
'opener'
]
local_declares = '\n'.join([local_var_line.format(obj, local_init_func_name) for obj in self.local_objs])
prop_str = '|'.join(self.local_objs)
rules = [
(r'(?<=\.)postMessage\b\(', self.add_prefix('__WB_pmw(self).'), 0),
(r'(?<![$.])\s*location\b\s*[=]\s*(?![=])', self.add_suffix(check_loc), 0),
(r'\breturn\s+this\b\s*(?![.$])', self.replace_str(this_rw), 0),
(r'(?<=[\n])\s*this\b(?=(?:\.(?:{0})\b))'.format(prop_str), self.replace_str(';' + this_rw), 0),
(r'(?<![$.])\s*this\b(?=(?:\.(?:{0})\b))'.format(prop_str), self.replace_str(this_rw), 0),
(r'(?<=[=])\s*this\b\s*(?![.$])', self.replace_str(this_rw), 0),
('\}(?:\s*\))?\s*\(this\)', self.replace_str(this_rw), 0),
(r'(?<=[^|&][|&]{2})\s*this\b\s*(?![|&.$]([^|&]|$))', self.replace_str(this_rw), 0),
(r'(?<=\.)postMessage\b\(', self.add_prefix('__WB_pmw(self).'), 0),
(r'(?<![$.])\s*location\b\s*[=]\s*(?![=])', self.add_suffix(check_loc), 0),
(r'\breturn\s+this\b\s*(?![.$])', self.replace_str(this_rw), 0),
(r'(?<=[\n])\s*this\b(?=(?:\.(?:{0})\b))'.format(prop_str), self.replace_str(';' + this_rw), 0),
(r'(?<![$.])\s*this\b(?=(?:\.(?:{0})\b))'.format(prop_str), self.replace_str(this_rw), 0),
(r'(?<=[=])\s*this\b\s*(?![.$])', self.replace_str(this_rw), 0),
('\}(?:\s*\))?\s*\(this\)', self.replace_str(this_rw), 0),
(r'(?<=[^|&][|&]{2})\s*this\b\s*(?![|&.$]([^|&]|$))', self.replace_str(this_rw), 0),
]
super(JSWombatProxyRules, self).__init__(rules)
self.first_buff = local_init_func.format(local_init_func_name) + local_declares
self.first_buff = local_check_this_fn.format(local_check_this_func_name) + local_init_func.format(
local_init_func_name) + local_declares + '\n\n'
self.last_buff = '\n\n}'

View File

@ -0,0 +1,30 @@
from pywb.rewrite.content_rewriter import StreamingRewriter, WORKER_MODS
__all__ = ["JSWorkerRewriter"]
INJECT = "(function() { self.importScripts('%s'); new WBWombat(%s); })();"
INIT = "{'prefix': '%s', 'prefixMod': '%s/', 'originalURL': '%s'}"
class JSWorkerRewriter(StreamingRewriter):
    """A simple rewriter for rewriting web or service workers.

    The only rewriting that occurs is the injection of the init code
    for wombatWorkers.js, which is set as the first buffer of the rewrite.

    This allows all worker variants to operate as expected.
    """

    def __init__(self, url_rewriter, align_to_line=True, first_buff=''):
        """Initialize a new JSWorkerRewriter

        :param UrlRewriter url_rewriter: The url rewriter for this rewrite
        :param bool align_to_line: Should the response stream be aligned to line boundaries
        :param str first_buff: The first string to be added to the rewrite;
            replaced by the worker init code when the URL's modifier is a worker modifier
        :rtype: None
        """
        # Function-scope import keeps this block self-contained.
        import json

        super(JSWorkerRewriter, self).__init__(url_rewriter, align_to_line, first_buff)
        wb_url = self.url_rewriter.wburl
        if wb_url.mod in WORKER_MODS:
            rw_url = self.url_rewriter.pywb_static_prefix + "wombatWorkers.js"
            prefix = self.url_rewriter.full_prefix
            # The archived URL is untrusted input. Serialise the init object
            # with json.dumps rather than the module-level INIT template:
            # INIT interpolates into single-quoted JS strings without
            # escaping, so a quote or backslash in the URL would break (or
            # inject into) the generated script. A JSON object is a valid
            # JS object literal, so WBWombat receives the same properties.
            init = json.dumps({
                'prefix': prefix,
                'prefixMod': prefix + 'wkrf_' + '/',
                'originalURL': wb_url.url,
            })
            self.first_buff = INJECT % (rw_url, init)

View File

@ -235,24 +235,22 @@ class TestContentRewriter(object):
def test_rewrite_sw_add_headers(self):
headers = {'Content-Type': 'application/x-javascript'}
content = 'function() { location.href = "http://example.com/"; }'
content = "function() { location.href = 'http://example.com/'; }"
headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701sw_')
assert ('Content-Type', 'application/x-javascript') in headers.headers
assert ('Service-Worker-Allowed', 'http://localhost:8080/prefix/201701mp_/http://example.com/') in headers.headers
exp = 'function() { location.href = "http://example.com/"; }'
assert b''.join(gen).decode('utf-8') == exp
assert "self.importScripts('wombatWorkers.js');" in b''.join(gen).decode('utf-8')
def test_rewrite_worker(self):
headers = {'Content-Type': 'application/x-javascript'}
content = 'importScripts("http://example.com/js.js")'
content = "importScripts('http://example.com/js.js')"
rwheaders, gen, is_rw = self.rewrite_record(headers, content, ts='201701wkr_')
exp = 'importScripts("http://example.com/js.js")'
assert b''.join(gen).decode('utf-8') == exp
assert "self.importScripts('wombatWorkers.js');" in b''.join(gen).decode('utf-8')
def test_banner_only_no_cookie_rewrite(self):
headers = {'Set-Cookie': 'foo=bar; Expires=Wed, 13 Jan 2021 22:23:01 GMT; Path=/',

View File

@ -389,7 +389,7 @@ r"""
# parse attr with js proxy, rewrite location assignment
>>> parse('<html><a href="javascript:location=\'foo.html\'"></a></html>', js_proxy=True)
<html><a href="javascript:{ location=(self.__WB_check_loc && self.__WB_check_loc(location) || {}).href = 'foo.html' }"></a></html>
<html><a href="javascript:{ location=((self.__WB_check_loc && self.__WB_check_loc(location)) || {}).href = 'foo.html' }"></a></html>
# parse attr with js proxy, assigning to location.href, no location assignment rewrite needed
>>> parse('<html><a href="javascript:location.href=\'foo.html\'"></a></html>', js_proxy=True)

View File

@ -131,49 +131,49 @@ r"""
#=================================================================
>>> _test_js_obj_proxy('var foo = this; location = bar')
'var foo = (this && this._WB_wombat_obj_proxy || this); location = (self.__WB_check_loc && self.__WB_check_loc(location) || {}).href = bar'
'var foo = _____WB$wombat$check$this$function_____(this); location = ((self.__WB_check_loc && self.__WB_check_loc(location)) || {}).href = bar'
>>> _test_js_obj_proxy('var that = this\n location = bar')
'var that = (this && this._WB_wombat_obj_proxy || this)\n location = (self.__WB_check_loc && self.__WB_check_loc(location) || {}).href = bar'
'var that = _____WB$wombat$check$this$function_____(this)\n location = ((self.__WB_check_loc && self.__WB_check_loc(location)) || {}).href = bar'
>>> _test_js_obj_proxy('location = "xyz"')
'location = (self.__WB_check_loc && self.__WB_check_loc(location) || {}).href = "xyz"'
'location = ((self.__WB_check_loc && self.__WB_check_loc(location)) || {}).href = "xyz"'
>>> _test_js_obj_proxy('var foo = this.location')
'var foo = (this && this._WB_wombat_obj_proxy || this).location'
'var foo = _____WB$wombat$check$this$function_____(this).location'
>>> _test_js_obj_proxy('A = B\nthis.location = "foo"')
'A = B\n;(this && this._WB_wombat_obj_proxy || this).location = "foo"'
'A = B\n;_____WB$wombat$check$this$function_____(this).location = "foo"'
>>> _test_js_obj_proxy('var foo = this.location2')
'var foo = this.location2'
>>> _test_js_obj_proxy('func(Function("return this"));')
'func(Function("return (this && this._WB_wombat_obj_proxy || this)"));'
'func(Function("return _____WB$wombat$check$this$function_____(this)"));'
>>> _test_js_obj_proxy('A.call(function() { return this });')
'A.call(function() { return (this && this._WB_wombat_obj_proxy || this) });'
>>> _test_js_obj_proxy('A.call(function() { return this });')
'A.call(function() { return _____WB$wombat$check$this$function_____(this) });'
>>> _test_js_obj_proxy('this.document.location = foo')
'(this && this._WB_wombat_obj_proxy || this).document.location = foo'
'_____WB$wombat$check$this$function_____(this).document.location = foo'
>>> _test_js_obj_proxy('if (that != this) { ... }')
'if (that != (this && this._WB_wombat_obj_proxy || this)) { ... }'
'if (that != _____WB$wombat$check$this$function_____(this)) { ... }'
>>> _test_js_obj_proxy('function(){...} (this)')
'function(){...} ((this && this._WB_wombat_obj_proxy || this))'
'function(){...} (_____WB$wombat$check$this$function_____(this))'
>>> _test_js_obj_proxy('function(){...} ) (this); foo(this)')
'function(){...} ) ((this && this._WB_wombat_obj_proxy || this)); foo(this)'
'function(){...} ) (_____WB$wombat$check$this$function_____(this)); foo(this)'
>>> _test_js_obj_proxy('var foo = that || this ;')
'var foo = that || (this && this._WB_wombat_obj_proxy || this) ;'
'var foo = that || _____WB$wombat$check$this$function_____(this) ;'
>>> _test_js_obj_proxy('a||this||that')
'a||(this && this._WB_wombat_obj_proxy || this)||that'
'a||_____WB$wombat$check$this$function_____(this)||that'
>>> _test_js_obj_proxy('a||this)')
'a||(this && this._WB_wombat_obj_proxy || this))'
'a||_____WB$wombat$check$this$function_____(this))'
# not rewritten
>>> _test_js_obj_proxy('var window = this$')
@ -207,7 +207,7 @@ r"""
'this. alocation = http://example.com/'
>>> _test_js_obj_proxy(r'this. location = http://example.com/')
'this. location = (self.__WB_check_loc && self.__WB_check_loc(location) || {}).href = http://example.com/'
'this. location = ((self.__WB_check_loc && self.__WB_check_loc(location)) || {}).href = http://example.com/'

View File

@ -23,7 +23,7 @@ class UrlRewriter(object):
REL_PATH = '/'
def __init__(self, wburl, prefix='', full_prefix=None, rel_prefix=None,
root_path=None, cookie_scope=None, rewrite_opts=None):
root_path=None, cookie_scope=None, rewrite_opts=None, pywb_static_prefix=None):
self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl)
self.prefix = prefix
self.full_prefix = full_prefix or prefix
@ -36,10 +36,22 @@ class UrlRewriter(object):
self.prefix_abs = self.prefix and self.prefix.startswith(self.PROTOCOLS)
self.cookie_scope = cookie_scope
self.rewrite_opts = rewrite_opts or {}
self._pywb_static_prefix = pywb_static_prefix
if self.rewrite_opts.get('punycode_links'):
self.wburl._do_percent_encode = False
@property
def pywb_static_prefix(self):
"""Returns the static path URL
:rtype: str
"""
if self._pywb_static_prefix is None:
return ''
if self._pywb_static_prefix.startswith(self.PROTOCOLS):
return self._pywb_static_prefix
return self.urljoin(self.full_prefix, self._pywb_static_prefix)
def rewrite(self, url, mod=None, force_abs=False):
# if special protocol, no rewriting at all
if url.startswith(self.NO_REWRITE_URI_PREFIX):

View File

@ -15,338 +15,355 @@ var autofetcher = null;
function noop() {}
if (typeof self.Promise === 'undefined') {
// not kewl we must polyfill Promise
self.Promise = function (executor) {
executor(noop, noop);
};
self.Promise.prototype.then = function (cb) {
if (cb) cb();
return this;
};
self.Promise.prototype.catch = function () {
return this;
};
self.Promise.all = function (values) {
return new Promise(noop);
};
// not kewl we must polyfill Promise
self.Promise = function(executor) {
executor(noop, noop);
};
self.Promise.prototype.then = function(cb) {
if (cb) cb();
return this;
};
self.Promise.prototype.catch = function() {
return this;
};
self.Promise.all = function(values) {
return new Promise(noop);
};
}
if (typeof self.fetch === 'undefined') {
// not kewl we must polyfill fetch.
self.fetch = function (url) {
return new Promise(function (resolve) {
var xhr = new XMLHttpRequest();
xhr.open('GET', url);
xhr.send();
resolve();
});
};
// not kewl we must polyfill fetch.
self.fetch = function(url) {
return new Promise(function(resolve) {
var xhr = new XMLHttpRequest();
xhr.open('GET', url);
xhr.send();
resolve();
});
};
}
self.onmessage = function (event) {
var data = event.data;
switch (data.type) {
case 'values':
autofetcher.autoFetch(data);
break;
}
self.onmessage = function(event) {
var data = event.data;
switch (data.type) {
case 'values':
autofetcher.autoFetch(data);
break;
}
};
function AutoFetcher(init) {
if (!(this instanceof AutoFetcher)) {
return new AutoFetcher(init);
}
this.prefix = init.prefix;
this.mod = init.mod;
this.prefixMod = init.prefix + init.mod;
this.rwRe = new RegExp(init.rwRe);
// relative url, WorkerLocation is set by owning document
this.relative = init.prefix.split(location.origin)[1];
// schemeless url
this.schemeless = '/' + this.relative;
// local cache of URLs fetched, to reduce server load
this.seen = {};
// array of URLs to be fetched
this.queue = [];
this.avQueue = [];
// should we queue a URL or not
this.queuing = false;
this.queuingAV = false;
this.urlExtractor = this.urlExtractor.bind(this);
this.imgFetchDone = this.imgFetchDone.bind(this);
this.avFetchDone = this.avFetchDone.bind(this);
if (!(this instanceof AutoFetcher)) {
return new AutoFetcher(init);
}
this.prefix = init.prefix;
this.mod = init.mod;
this.prefixMod = init.prefix + init.mod;
this.rwRe = new RegExp(init.rwRe);
// relative url, WorkerLocation is set by owning document
this.relative = init.prefix.split(location.origin)[1];
// schemeless url
this.schemeless = '/' + this.relative;
// local cache of URLs fetched, to reduce server load
this.seen = {};
// array of URLs to be fetched
this.queue = [];
this.avQueue = [];
// should we queue a URL or not
this.queuing = false;
this.queuingAV = false;
this.urlExtractor = this.urlExtractor.bind(this);
this.imgFetchDone = this.imgFetchDone.bind(this);
this.avFetchDone = this.avFetchDone.bind(this);
}
AutoFetcher.prototype.delay = function () {
// 2 second delay seem reasonable
return new Promise(function (resolve, reject) {
setTimeout(resolve, 2000);
AutoFetcher.prototype.delay = function() {
// 2 second delay seem reasonable
return new Promise(function(resolve, reject) {
setTimeout(resolve, 2000);
});
};
AutoFetcher.prototype.imgFetchDone = function() {
if (this.queue.length > 0) {
// we have a Q of some length drain it
var autofetcher = this;
this.delay().then(function() {
autofetcher.queuing = false;
autofetcher.fetchImgs();
});
} else {
this.queuing = false;
}
};
AutoFetcher.prototype.imgFetchDone = function () {
if (this.queue.length > 0) {
// we have a Q of some length drain it
var autofetcher = this;
this.delay().then(function () {
autofetcher.queuing = false;
autofetcher.fetchImgs();
});
} else {
this.queuing = false;
}
AutoFetcher.prototype.avFetchDone = function() {
if (this.avQueue.length > 0) {
// we have a Q of some length drain it
var autofetcher = this;
this.delay().then(function() {
autofetcher.queuingAV = false;
autofetcher.fetchAV();
});
} else {
this.queuingAV = false;
}
};
AutoFetcher.prototype.avFetchDone = function () {
if (this.avQueue.length > 0) {
// we have a Q of some length drain it
var autofetcher = this;
this.delay().then(function () {
autofetcher.queuingAV = false;
autofetcher.fetchAV();
});
} else {
this.queuingAV = false;
AutoFetcher.prototype.fetchAV = function() {
if (this.queuingAV || this.avQueue.length === 0) {
return;
}
// the number of fetches is limited to a maximum of DefaultNumAvFetches + FullAVQDrainLen outstanding fetches
// the baseline maximum number of fetches is DefaultNumAvFetches but if the size(avQueue) <= FullAVQDrainLen
// we add them to the current batch. Because audio video resources might be big
// we limit how many we fetch at a time drastically
this.queuingAV = true;
var runningFetchers = [];
while (
this.avQueue.length > 0 &&
runningFetchers.length <= DefaultNumAvFetches
) {
runningFetchers.push(fetch(this.avQueue.shift()).catch(noop));
}
if (this.avQueue.length <= FullAVQDrainLen) {
while (this.avQueue.length > 0) {
runningFetchers.push(fetch(this.avQueue.shift()).catch(noop));
}
}
Promise.all(runningFetchers)
.then(this.avFetchDone)
.catch(this.avFetchDone);
};
AutoFetcher.prototype.fetchAV = function () {
if (this.queuingAV || this.avQueue.length === 0) {
return;
AutoFetcher.prototype.fetchImgs = function() {
if (this.queuing || this.queue.length === 0) {
return;
}
// the number of fetches is limited to a maximum of DefaultNumImFetches + FullImgQDrainLen outstanding fetches
// the baseline maximum number of fetches is DefaultNumImFetches but if the size(queue) <= FullImgQDrainLen
// we add them to the current batch
this.queuing = true;
var runningFetchers = [];
while (
this.queue.length > 0 &&
runningFetchers.length <= DefaultNumImFetches
) {
runningFetchers.push(fetch(this.queue.shift()).catch(noop));
}
if (this.queue.length <= FullImgQDrainLen) {
while (this.queue.length > 0) {
runningFetchers.push(fetch(this.queue.shift()).catch(noop));
}
// the number of fetches is limited to a maximum of DefaultNumAvFetches + FullAVQDrainLen outstanding fetches
// the baseline maximum number of fetches is DefaultNumAvFetches but if the size(avQueue) <= FullAVQDrainLen
// we add them to the current batch. Because audio video resources might be big
// we limit how many we fetch at a time drastically
this.queuingAV = true;
var runningFetchers = [];
while (this.avQueue.length > 0 && runningFetchers.length <= DefaultNumAvFetches) {
runningFetchers.push(fetch(this.avQueue.shift()).catch(noop))
}
if (this.avQueue.length <= FullAVQDrainLen) {
while (this.avQueue.length > 0) {
runningFetchers.push(fetch(this.avQueue.shift()).catch(noop))
}
}
Promise.all(runningFetchers)
.then(this.avFetchDone)
.catch(this.avFetchDone);
}
Promise.all(runningFetchers)
.then(this.imgFetchDone)
.catch(this.imgFetchDone);
};
AutoFetcher.prototype.fetchImgs = function () {
if (this.queuing || this.queue.length === 0) {
return;
}
// the number of fetches is limited to a maximum of DefaultNumImFetches + FullImgQDrainLen outstanding fetches
// the baseline maximum number of fetches is DefaultNumImFetches but if the size(queue) <= FullImgQDrainLen
// we add them to the current batch
this.queuing = true;
var runningFetchers = [];
while (this.queue.length > 0 && runningFetchers.length <= DefaultNumImFetches) {
runningFetchers.push(fetch(this.queue.shift()).catch(noop))
}
if (this.queue.length <= FullImgQDrainLen) {
while (this.queue.length > 0) {
runningFetchers.push(fetch(this.queue.shift()).catch(noop))
}
}
Promise.all(runningFetchers)
.then(this.imgFetchDone)
.catch(this.imgFetchDone);
AutoFetcher.prototype.queueNonAVURL = function(url) {
// ensure we do not request data urls
if (url.indexOf(DataURLPrefix) === 0) return;
// check to see if we have seen this url before in order
// to lessen the load against the server content is fetched from
if (this.seen[url] != null) return;
this.seen[url] = true;
this.queue.push(url);
};
AutoFetcher.prototype.queueNonAVURL = function (url) {
// ensure we do not request data urls
if (url.indexOf(DataURLPrefix) === 0) return;
// check to see if we have seen this url before in order
// to lessen the load against the server content is fetched from
if (this.seen[url] != null) return;
this.seen[url] = true;
this.queue.push(url);
AutoFetcher.prototype.queueAVURL = function(url) {
// ensure we do not request data urls
if (url.indexOf(DataURLPrefix) === 0) return;
// check to see if we have seen this url before in order
// to lessen the load against the server content is fetched from
if (this.seen[url] != null) return;
this.seen[url] = true;
this.avQueue.push(url);
};
AutoFetcher.prototype.queueAVURL = function (url) {
// ensure we do not request data urls
if (url.indexOf(DataURLPrefix) === 0) return;
// check to see if we have seen this url before in order
// to lessen the load against the server content is fetched from
if (this.seen[url] != null) return;
this.seen[url] = true;
this.avQueue.push(url);
};
AutoFetcher.prototype.maybeResolveURL = function (url, base) {
// given a url and base url returns a resolved full URL or
// null if resolution was unsuccessful
try {
var _url = new URL(url, base);
return _url.href;
} catch (e) {
return null;
}
};
AutoFetcher.prototype.maybeFixUpRelSchemelessPrefix = function (url) {
// attempt to ensure rewritten relative or schemeless URLs become full URLS!
// otherwise returns null if this did not happen
if (url.indexOf(this.relative) === 0) {
return url.replace(this.relative, this.prefix);
}
if (url.indexOf(this.schemeless) === 0) {
return url.replace(this.schemeless, this.prefix);
}
/**
 * Attempt to turn `url` into a full URL by resolving it against `base`.
 * @param {string} url - the URL to be resolved
 * @param {string} base - the base URL used for resolution
 * @return {?string} the full URL, or null if resolution was unsuccessful
 */
AutoFetcher.prototype.maybeResolveURL = function(url, base) {
  var fullURL;
  try {
    // the URL constructor throws when url/base can not be parsed
    fullURL = new URL(url, base);
  } catch (e) {
    return null;
  }
  return fullURL.href;
};
AutoFetcher.prototype.maybeFixUpURL = function (url, resolveOpts) {
// attempt to fix up the url and do our best to ensure we can get dat 200 OK!
if (this.rwRe.test(url)) {
return url;
}
var mod = resolveOpts.mod || 'mp_';
// first check for / (relative) or // (schemeless) rewritten urls
var maybeFixed = this.maybeFixUpRelSchemelessPrefix(url);
/**
 * If `url` starts with `this.relative` or `this.schemeless`, swap that
 * leading part for `this.prefix` so the URL becomes a full URL.
 * @param {string} url - the URL to be fixed up
 * @return {?string} the fixed up URL, or null when neither prefix matched
 */
AutoFetcher.prototype.maybeFixUpRelSchemelessPrefix = function(url) {
  // checked in order: relative first, then schemeless
  var rewrittenPrefixes = [this.relative, this.schemeless];
  for (var idx = 0; idx < rewrittenPrefixes.length; idx++) {
    var candidate = rewrittenPrefixes[idx];
    if (url.indexOf(candidate) === 0) {
      return url.replace(candidate, this.prefix);
    }
  }
  return null;
};
AutoFetcher.prototype.maybeFixUpURL = function(url, resolveOpts) {
// attempt to fix up the url and do our best to ensure we can get dat 200 OK!
if (this.rwRe.test(url)) {
return url;
}
var mod = resolveOpts.mod || 'mp_';
// first check for / (relative) or // (schemeless) rewritten urls
var maybeFixed = this.maybeFixUpRelSchemelessPrefix(url);
if (maybeFixed != null) {
return maybeFixed;
}
// resolve URL against tag src
if (resolveOpts.tagSrc != null) {
maybeFixed = this.maybeResolveURL(url, resolveOpts.tagSrc);
if (maybeFixed != null) {
return maybeFixed;
return this.prefix + mod + '/' + maybeFixed;
}
// resolve URL against tag src
if (resolveOpts.tagSrc != null) {
maybeFixed = this.maybeResolveURL(url, resolveOpts.tagSrc);
if (maybeFixed != null) {
return this.prefix + mod + '/' + maybeFixed;
}
}
// finally last attempt resolve the originating documents base URI
if (resolveOpts.docBaseURI) {
maybeFixed = this.maybeResolveURL(url, resolveOpts.docBaseURI);
if (maybeFixed != null) {
return this.prefix + mod + '/' + maybeFixed;
}
// finally last attempt resolve the originating documents base URI
if (resolveOpts.docBaseURI) {
maybeFixed = this.maybeResolveURL(url, resolveOpts.docBaseURI);
if (maybeFixed != null) {
return this.prefix + mod + '/' + maybeFixed;
}
}
// not much to do now.....
return this.prefixMod + '/' + url;
}
// not much to do now.....
return this.prefixMod + '/' + url;
};
AutoFetcher.prototype.urlExtractor = function (match, n1, n2, n3, offset, string) {
// Same function as style_replacer in wombat.rewrite_style, n2 is our URL
this.queueNonAVURL(n2);
return n1 + n2 + n3;
/**
 * Replacer callback for String.prototype.replace (same role as
 * style_replacer in wombat.rewrite_style): queues the URL captured by the
 * second group and hands the matched text back unchanged.
 * @param {string} match - the full match (unused)
 * @param {string} n1 - text captured before the URL
 * @param {string} n2 - the URL
 * @param {string} n3 - text captured after the URL
 * @param {number} offset - match offset (unused)
 * @param {string} string - the string being searched (unused)
 * @return {string} n1 + n2 + n3
 */
AutoFetcher.prototype.urlExtractor = function(match, n1, n2, n3, offset, string) {
  var unchanged = n1 + n2 + n3;
  this.queueNonAVURL(n2);
  return unchanged;
};
AutoFetcher.prototype.handleMedia = function (mediaRules) {
// this is a broken down rewrite_style
if (mediaRules == null || mediaRules.length === 0) return;
// var rules = mediaRules.values;
for (var i = 0; i < mediaRules.length; i++) {
mediaRules[i]
.replace(STYLE_REGEX, this.urlExtractor)
.replace(IMPORT_REGEX, this.urlExtractor);
}
/**
 * Run the style and import regular expressions over each media rule text so
 * that `this.urlExtractor` queues every URL they match (a broken down
 * rewrite_style). The result of the replacement itself is discarded.
 * @param {?Array<string>} mediaRules - texts to scan, may be null or empty
 */
AutoFetcher.prototype.handleMedia = function(mediaRules) {
  var numRules = mediaRules == null ? 0 : mediaRules.length;
  for (var idx = 0; idx < numRules; idx++) {
    // replace is only used to drive urlExtractor over every match
    var afterStyle = mediaRules[idx].replace(STYLE_REGEX, this.urlExtractor);
    afterStyle.replace(IMPORT_REGEX, this.urlExtractor);
  }
};
AutoFetcher.prototype.handleSrc = function (srcValues, context) {
var resolveOpts = { 'docBaseURI': context.docBaseURI };
if (srcValues.value) {
resolveOpts.mod = srcValues.mod;
if (resolveOpts.mod === 1) {
return this.queueNonAVURL(this.maybeFixUpURL(srcValues.value.trim(), resolveOpts));
}
return this.queueAVURL(this.maybeFixUpURL(srcValues.value.trim(), resolveOpts));
AutoFetcher.prototype.handleSrc = function(srcValues, context) {
var resolveOpts = { docBaseURI: context.docBaseURI };
if (srcValues.value) {
resolveOpts.mod = srcValues.mod;
if (resolveOpts.mod === 1) {
return this.queueNonAVURL(
this.maybeFixUpURL(srcValues.value.trim(), resolveOpts)
);
}
var len = srcValues.values.length;
for (var i = 0; i < len; i++) {
var value = srcValues.values[i];
resolveOpts.mod = value.mod;
if (resolveOpts.mod === 'im_') {
this.queueNonAVURL(this.maybeFixUpURL(value.src, resolveOpts));
} else {
this.queueAVURL(this.maybeFixUpURL(value.src, resolveOpts));
}
return this.queueAVURL(
this.maybeFixUpURL(srcValues.value.trim(), resolveOpts)
);
}
var len = srcValues.values.length;
for (var i = 0; i < len; i++) {
var value = srcValues.values[i];
resolveOpts.mod = value.mod;
if (resolveOpts.mod === 'im_') {
this.queueNonAVURL(this.maybeFixUpURL(value.src, resolveOpts));
} else {
this.queueAVURL(this.maybeFixUpURL(value.src, resolveOpts));
}
}
};
AutoFetcher.prototype.extractSrcSetNotPreSplit = function (ssV, resolveOpts) {
// was from extract from local doc so we need to duplicate work
var srcsetValues = ssV.split(srcsetSplit);
for (var i = 0; i < srcsetValues.length; i++) {
// grab the URL not width/height key
if (srcsetValues[i]) {
var value = srcsetValues[i].trim().split(' ')[0];
var maybeResolvedURL = this.maybeFixUpURL(value.trim(), resolveOpts);
if (resolveOpts.mod === 'im_') {
this.queueNonAVURL(maybeResolvedURL);
} else {
this.queueAVURL(maybeResolvedURL);
}
}
/**
 * Split a raw srcset string, fix up each URL it lists and queue it.
 * URLs go to the non-AV queue when `resolveOpts.mod` is 'im_', otherwise to
 * the AV queue.
 * @param {?string} ssV - the srcset string, nothing happens when it is falsy
 * @param {Object} resolveOpts - options handed to maybeFixUpURL, its `mod`
 * property selects the queue
 */
AutoFetcher.prototype.extractSrcSetNotPreSplit = function(ssV, resolveOpts) {
  if (!ssV) return;
  // the string was not split by the sender so that work is duplicated here
  var candidates = ssV.split(srcsetSplit);
  for (var idx = 0; idx < candidates.length; idx++) {
    var candidate = candidates[idx];
    if (!candidate) continue;
    // keep the URL (first space separated token), drop the width/height key
    var rawURL = candidate.trim().split(' ')[0];
    var fixedURL = this.maybeFixUpURL(rawURL.trim(), resolveOpts);
    if (resolveOpts.mod === 'im_') {
      this.queueNonAVURL(fixedURL);
    } else {
      this.queueAVURL(fixedURL);
    }
  }
};
AutoFetcher.prototype.extractSrcset = function (srcsets, context) {
// was rewrite_srcset and only need to q
for (var i = 0; i < srcsets.length; i++) {
// grab the URL not width/height key
var url = srcsets[i].split(' ')[0];
if (context.mod === 'im_') {
this.queueNonAVURL(url);
} else {
this.queueAVURL(url);
}
/**
 * Queue the URL of every entry of an already split srcset (was
 * rewrite_srcset, here the URLs only need to be queued).
 * @param {Array<string>} srcsets - srcset entries, each "URL [descriptor]"
 * @param {Object} context - its `mod` property selects the queue: 'im_'
 * queues on the non-AV queue, anything else on the AV queue
 */
AutoFetcher.prototype.extractSrcset = function(srcsets, context) {
  var idx = 0;
  while (idx < srcsets.length) {
    // the URL is the first space separated token of the entry
    var entryURL = srcsets[idx].split(' ')[0];
    if (context.mod === 'im_') {
      this.queueNonAVURL(entryURL);
    } else {
      this.queueAVURL(entryURL);
    }
    idx++;
  }
};
AutoFetcher.prototype.handleSrcset = function (srcset, context) {
var resolveOpts = { 'docBaseURI': context.docBaseURI };
if (srcset.value) {
// we have a single value, this srcset came from either
// preserveDataSrcset (not presplit) preserveSrcset (presplit)
resolveOpts.mod = srcset.mod;
if (!srcset.presplit) {
// extract URLs from the srcset string
return this.extractSrcSetNotPreSplit(srcset.value, resolveOpts);
}
// we have an array of srcset URL strings
return this.extractSrcset(srcset.value, resolveOpts);
}
// we have an array of values, these srcsets came from extractFromLocalDoc
var len = srcset.values.length;
for (var i = 0; i < len; i++) {
var ssv = srcset.values[i];
resolveOpts.mod = ssv.mod;
resolveOpts.tagSrc = ssv.tagSrc;
this.extractSrcSetNotPreSplit(ssv.srcset, resolveOpts);
/**
 * Queue the URLs of the srcset(s) contained in a message.
 *
 * `srcset.value` holds a single srcset: an array of entries when
 * `srcset.presplit` is truthy, otherwise the raw attribute string.
 * Without `srcset.value`, `srcset.values` is expected to be an array of
 * `{ mod, tagSrc, srcset }` objects (sent by extractFromLocalDoc).
 * @param {Object} srcset - the srcset part of the message
 * @param {Object} context - supplies `docBaseURI` for URL resolution
 */
AutoFetcher.prototype.handleSrcset = function(srcset, context) {
  var resolveOpts = { docBaseURI: context.docBaseURI };
  if (!srcset.value) {
    // many values, each carries its own mod and the src of its tag
    var entries = srcset.values;
    var numEntries = entries.length;
    for (var idx = 0; idx < numEntries; idx++) {
      var entry = entries[idx];
      resolveOpts.mod = entry.mod;
      resolveOpts.tagSrc = entry.tagSrc;
      this.extractSrcSetNotPreSplit(entry.srcset, resolveOpts);
    }
    return;
  }
  // single value, came from preserveDataSrcset (not presplit) or
  // preserveSrcset (presplit)
  resolveOpts.mod = srcset.mod;
  if (srcset.presplit) {
    // already an array of srcset URL strings
    return this.extractSrcset(srcset.value, resolveOpts);
  }
  // still a string, the URLs must be extracted from it
  return this.extractSrcSetNotPreSplit(srcset.value, resolveOpts);
};
AutoFetcher.prototype.autoFetch = function(data) {
// we got a message and now we autofetch!
// these calls turn into no ops if they have no work
if (data.media) {
this.handleMedia(data.media);
}
AutoFetcher.prototype.autoFetch = function (data) {
// we got a message and now we autofetch!
// these calls turn into no ops if they have no work
if (data.media) {
this.handleMedia(data.media);
}
if (data.src) {
this.handleSrc(data.src, data.context || {});
}
if (data.src) {
this.handleSrc(data.src, data.context || {});
}
if (data.srcset) {
this.handleSrcset(data.srcset, data.context || {});
}
if (data.srcset) {
this.handleSrcset(data.srcset, data.context || {});
}
this.fetchImgs();
this.fetchAV();
this.fetchImgs();
this.fetchAV();
};
// initialize ourselves from the query params :)
try {
var loc = new self.URL(location.href);
autofetcher = new AutoFetcher(JSON.parse(loc.searchParams.get('init')));
var loc = new self.URL(location.href);
autofetcher = new AutoFetcher(JSON.parse(loc.searchParams.get('init')));
} catch (e) {
// likely we are in an older version of safari
var search = decodeURIComponent(location.search.split('?')[1]).split('&');
var init = JSON.parse(search[0].substr(search[0].indexOf('=') + 1));
init.prefix = decodeURIComponent(init.prefix);
init.baseURI = decodeURIComponent(init.baseURI);
autofetcher = new AutoFetcher(init);
// likely we are in an older version of safari
var search = decodeURIComponent(location.search.split('?')[1]).split('&');
var init = JSON.parse(search[0].substr(search[0].indexOf('=') + 1));
init.prefix = decodeURIComponent(init.prefix);
init.baseURI = decodeURIComponent(init.baseURI);
autofetcher = new AutoFetcher(init);
}

View File

@ -15,271 +15,289 @@ var autofetcher = null;
// shared do-nothing callback: passed to the executor by the Promise polyfill
// and used as the rejection handler of each fetch so a failed fetch is ignored
function noop() {}
if (typeof self.Promise === 'undefined') {
  // not kewl we must polyfill Promise
  // This stand in only keeps Promise using code from throwing, it does not
  // track state: the executor receives no-op resolve/reject functions.
  self.Promise = function(executor) {
    executor(noop, noop);
  };
  // `then` invokes the callback immediately, without a value
  self.Promise.prototype.then = function(cb) {
    if (cb) cb();
    return this;
  };
  // rejections are never observed, so `catch` only keeps the chain going
  self.Promise.prototype.catch = function() {
    return this;
  };
  // the supplied values are ignored
  self.Promise.all = function(values) {
    return new Promise(noop);
  };
}
if (typeof self.fetch === 'undefined') {
  // not kewl we must polyfill fetch.
  // Fire and forget: a GET is issued through XMLHttpRequest and the promise
  // is resolved right after `send`, no response is ever handed back.
  self.fetch = function(url) {
    var fireRequest = function(resolve) {
      var request = new XMLHttpRequest();
      request.open('GET', url);
      request.send();
      resolve();
    };
    return new Promise(fireRequest);
  };
}
self.onmessage = function (event) {
var data = event.data;
switch (data.type) {
case 'values':
autofetcher.autofetchMediaSrcset(data);
break;
case 'fetch-all':
autofetcher.justFetch(data);
break;
}
// Dispatch an incoming message on its `type`:
//  - 'values'    media rules / srcset / src values to extract URLs from
//  - 'fetch-all' a plain list of URLs to be fetched
// messages of any other type are ignored
self.onmessage = function(event) {
  var message = event.data;
  if (message.type === 'values') {
    autofetcher.autofetchMediaSrcset(message);
  } else if (message.type === 'fetch-all') {
    autofetcher.justFetch(message);
  }
};
/**
 * Holds the fetch queues and the bookkeeping shared by the prototype
 * methods. May be called with or without `new`.
 * @constructor
 */
function AutoFetcher() {
  // called as a plain function, hand back a proper instance
  if (!(this instanceof AutoFetcher)) {
    return new AutoFetcher();
  }
  // local cache of URLs fetched, to reduce server load
  this.seen = {};
  // URLs waiting to be fetched: `queue` is drained by fetchImgs,
  // `avQueue` by fetchAV
  this.queue = [];
  this.avQueue = [];
  // set while a batch of the matching queue is being fetched
  this.queuing = false;
  this.queuingAV = false;
  // a URL to resolve relative URLs found in the cssText of CSSMedia rules.
  this.currentResolver = null;
  // these methods are passed around as bare callbacks, pin their `this`
  this.urlExtractor = this.urlExtractor.bind(this);
  this.imgFetchDone = this.imgFetchDone.bind(this);
  this.avFetchDone = this.avFetchDone.bind(this);
}
AutoFetcher.prototype.delay = function () {
return new Promise(function (resolve, reject) {
setTimeout(resolve, FetchDelay);
AutoFetcher.prototype.delay = function() {
return new Promise(function(resolve, reject) {
setTimeout(resolve, FetchDelay);
});
};
/**
 * Called once a batch started by fetchImgs has settled. When URLs remain in
 * `this.queue` the next batch is started after `this.delay()`, otherwise
 * the queuing flag is simply cleared.
 */
AutoFetcher.prototype.imgFetchDone = function() {
  if (this.queue.length > 0) {
    // we have a Q of some length drain it
    var fetcher = this;
    this.delay().then(function resumeImgFetching() {
      // the flag must be cleared first, fetchImgs bails out while it is set
      fetcher.queuing = false;
      fetcher.fetchImgs();
    });
    return;
  }
  this.queuing = false;
};
AutoFetcher.prototype.imgFetchDone = function () {
if (this.queue.length > 0) {
// we have a Q of some length drain it
var autofetcher = this;
this.delay().then(function () {
autofetcher.queuing = false;
autofetcher.fetchImgs();
});
} else {
this.queuing = false;
}
/**
 * Called once a batch started by fetchAV has settled. When URLs remain in
 * `this.avQueue` the next batch is started after `this.delay()`, otherwise
 * the queuingAV flag is simply cleared.
 */
AutoFetcher.prototype.avFetchDone = function() {
  if (this.avQueue.length > 0) {
    // we have a Q of some length drain it
    var fetcher = this;
    this.delay().then(function resumeAVFetching() {
      // fetchAV returns early while queuingAV is set, so clear it first
      fetcher.queuingAV = false;
      fetcher.fetchAV();
    });
    return;
  }
  this.queuingAV = false;
};
AutoFetcher.prototype.avFetchDone = function () {
if (this.avQueue.length > 0) {
// we have a Q of some length drain it
var autofetcher = this;
this.delay().then(function () {
autofetcher.queuingAV = false;
autofetcher.fetchAV();
});
} else {
this.queuingAV = false;
/**
 * Start fetching the next batch of URLs from `this.avQueue`.
 *
 * Does nothing while a batch is outstanding (`this.queuingAV`) or when the
 * queue is empty. Audio/video resources might be big, so the batch size is
 * bounded by DefaultNumAvFetches; if FullAVQDrainLen or fewer URLs remain
 * after the batch was filled they are added to it as well.
 * `this.avFetchDone` runs once every fetch of the batch has settled.
 */
AutoFetcher.prototype.fetchAV = function() {
  if (this.queuingAV || this.avQueue.length === 0) {
    return;
  }
  this.queuingAV = true;
  var fetcher = this;
  var runningFetchers = [];
  // takes the next URL off the queue and fetches it, failures are ignored
  function startNextFetch() {
    runningFetchers.push(fetch(fetcher.avQueue.shift()).catch(noop));
  }
  // NOTE(review): `<=` admits DefaultNumAvFetches + 1 fetches into the
  // batch - confirm whether `<` was intended
  while (
    this.avQueue.length > 0 &&
    runningFetchers.length <= DefaultNumAvFetches
  ) {
    startNextFetch();
  }
  if (this.avQueue.length <= FullAVQDrainLen) {
    // only a few URLs are left, drain them with this batch
    while (this.avQueue.length > 0) {
      startNextFetch();
    }
  }
  Promise.all(runningFetchers)
    .then(this.avFetchDone)
    .catch(this.avFetchDone);
};
AutoFetcher.prototype.fetchAV = function () {
if (this.queuingAV || this.avQueue.length === 0) {
return;
AutoFetcher.prototype.fetchImgs = function() {
if (this.queuing || this.queue.length === 0) {
return;
}
// the number of fetches is limited to a maximum of DefaultNumImFetches + FullImgQDrainLen outstanding fetches
// the baseline maximum number of fetches is DefaultNumImFetches but if the size(queue) <= FullImgQDrainLen
// we add them to the current batch
this.queuing = true;
var runningFetchers = [];
while (
this.queue.length > 0 &&
runningFetchers.length <= DefaultNumImFetches
) {
runningFetchers.push(fetch(this.queue.shift()).catch(noop));
}
if (this.queue.length <= FullImgQDrainLen) {
while (this.queue.length > 0) {
runningFetchers.push(fetch(this.queue.shift()).catch(noop));
}
// the number of fetches is limited to a maximum of DefaultNumAvFetches + FullAVQDrainLen outstanding fetches
// the baseline maximum number of fetches is DefaultNumAvFetches but if the size(avQueue) <= FullAVQDrainLen
// we add them to the current batch. Because audio video resources might be big
// we limit how many we fetch at a time drastically
this.queuingAV = true;
var runningFetchers = [];
while (this.avQueue.length > 0 && runningFetchers.length <= DefaultNumAvFetches) {
runningFetchers.push(fetch(this.avQueue.shift()).catch(noop))
}
if (this.avQueue.length <= FullAVQDrainLen) {
while (this.avQueue.length > 0) {
runningFetchers.push(fetch(this.avQueue.shift()).catch(noop))
}
}
Promise.all(runningFetchers)
.then(this.avFetchDone)
.catch(this.avFetchDone);
}
Promise.all(runningFetchers)
.then(this.imgFetchDone)
.catch(this.imgFetchDone);
};
AutoFetcher.prototype.fetchImgs = function () {
if (this.queuing || this.queue.length === 0) {
return;
}
// the number of fetches is limited to a maximum of DefaultNumImFetches + FullImgQDrainLen outstanding fetches
// the baseline maximum number of fetches is DefaultNumImFetches but if the size(queue) <= FullImgQDrainLen
// we add them to the current batch
this.queuing = true;
var runningFetchers = [];
while (this.queue.length > 0 && runningFetchers.length <= DefaultNumImFetches) {
runningFetchers.push(fetch(this.queue.shift()).catch(noop))
}
if (this.queue.length <= FullImgQDrainLen) {
while (this.queue.length > 0) {
runningFetchers.push(fetch(this.queue.shift()).catch(noop))
}
}
Promise.all(runningFetchers)
.then(this.imgFetchDone)
.catch(this.imgFetchDone);
/**
 * Add a URL to the non-AV fetch queue (`this.queue`).
 * Data URLs and URLs already recorded in `this.seen` are ignored.
 * @param {string} url - the URL to be queued
 */
AutoFetcher.prototype.queueNonAVURL = function(url) {
  // a data URL needs no request and a URL we have seen was queued before,
  // skipping both lessens the load against the server content is fetched from
  if (url.indexOf(DataURLPrefix) === 0 || this.seen[url] != null) return;
  this.seen[url] = true;
  this.queue.push(url);
};
AutoFetcher.prototype.queueNonAVURL = function (url) {
// ensure we do not request data urls
if (url.indexOf(DataURLPrefix) === 0) return;
// check to see if we have seen this url before in order
// to lessen the load against the server content is fetched from
if (this.seen[url] != null) return;
this.seen[url] = true;
this.queue.push(url);
/**
 * Add a URL to the AV fetch queue (`this.avQueue`).
 * Data URLs and URLs already recorded in `this.seen` are ignored.
 * @param {string} url - the URL to be queued
 */
AutoFetcher.prototype.queueAVURL = function(url) {
  // a data URL needs no request and a URL we have seen was queued before,
  // skipping both lessens the load against the server content is fetched from
  if (url.indexOf(DataURLPrefix) === 0 || this.seen[url] != null) return;
  this.seen[url] = true;
  this.avQueue.push(url);
};
/**
 * Record `url` as seen and append it to `this.avQueue`, unless it is a data
 * URL or has been recorded before.
 * @param {string} url - the URL to be queued
 */
AutoFetcher.prototype.queueAVURL = function (url) {
  // data urls are never requested
  var isDataURL = url.indexOf(DataURLPrefix) === 0;
  if (isDataURL) return;
  // each URL is only queued once, to reduce the load on the server
  var alreadySeen = this.seen[url] != null;
  if (alreadySeen) return;
  this.seen[url] = true;
  this.avQueue.push(url);
};
AutoFetcher.prototype.safeResolve = function (url, resolver) {
// Guard against the exception thrown by the URL constructor if the URL or resolver is bad
// if resolver is undefined/null then this function passes url through
var resolvedURL = url;
if (resolver) {
try {
resolvedURL = (new URL(url, resolver)).href
} catch (e) {
resolvedURL = url;
}
/**
 * Resolve `url` against `resolver` without ever throwing.
 * @param {string} url - the URL to be resolved
 * @param {?string} resolver - the URL to resolve against
 * @return {string} the resolved URL; `url` itself when `resolver` is falsy
 * or when the URL constructor rejects the pair
 */
AutoFetcher.prototype.safeResolve = function(url, resolver) {
  // nothing to resolve against, pass the url through
  if (!resolver) return url;
  try {
    return new URL(url, resolver).href;
  } catch (e) {
    // bad url or resolver, fall back to what we were given
    return url;
  }
};
AutoFetcher.prototype.urlExtractor = function (match, n1, n2, n3, offset, string) {
// Same function as style_replacer in wombat.rewrite_style, n2 is our URL
// this.currentResolver is set to the URL which the browser would normally
// resolve relative urls with (URL of the stylesheet) in an exceptionless manner
// (resolvedURL will be undefined if an error occurred)
var resolvedURL = this.safeResolve(n2, this.currentResolver);
if (resolvedURL) {
this.queueNonAVURL(resolvedURL);
}
return n1 + n2 + n3;
/**
 * Replacer callback for String.prototype.replace (same role as
 * style_replacer in wombat.rewrite_style): queues the URL captured by the
 * second group and hands the matched text back unchanged.
 *
 * The URL is first resolved against `this.currentResolver`, the URL the
 * browser would normally resolve relative URLs with (URL of the
 * stylesheet). safeResolve never throws and falls back to the unresolved
 * URL, so only an empty URL is skipped.
 * @param {string} match - the full match (unused)
 * @param {string} n1 - text captured before the URL
 * @param {string} n2 - the URL
 * @param {string} n3 - text captured after the URL
 * @param {number} offset - match offset (unused)
 * @param {string} string - the string being searched (unused)
 * @return {string} n1 + n2 + n3
 */
AutoFetcher.prototype.urlExtractor = function(match, n1, n2, n3, offset, string) {
  var absoluteURL = this.safeResolve(n2, this.currentResolver);
  if (absoluteURL) this.queueNonAVURL(absoluteURL);
  return n1 + n2 + n3;
};
AutoFetcher.prototype.extractMedia = function (mediaRules) {
// this is a broken down rewrite_style
if (mediaRules == null) return;
for (var i = 0; i < mediaRules.length; i++) {
// set currentResolver to the value of this stylesheets URL, done to ensure we do not have to
// create functions on each loop iteration because we potentially create a new `URL` object
// twice per iteration
this.currentResolver = mediaRules[i].resolve;
mediaRules[i].cssText
.replace(STYLE_REGEX, this.urlExtractor)
.replace(IMPORT_REGEX, this.urlExtractor);
}
/**
 * Queue the URLs found in the cssText of each media rule (a broken down
 * rewrite_style). The result of the replacement itself is discarded.
 * @param {?Array<{cssText: string, resolve: ?string}>} mediaRules - the
 * rules to scan, `resolve` is the URL relative URLs are resolved against
 */
AutoFetcher.prototype.extractMedia = function(mediaRules) {
  if (mediaRules == null) return;
  for (var idx = 0; idx < mediaRules.length; idx++) {
    var rule = mediaRules[idx];
    // urlExtractor reads the resolver from the instance, this avoids having
    // to create a new callback for every rule
    this.currentResolver = rule.resolve;
    var afterStyle = rule.cssText.replace(STYLE_REGEX, this.urlExtractor);
    afterStyle.replace(IMPORT_REGEX, this.urlExtractor);
  }
};
/**
 * Queue every URL listed in the supplied srcset attribute values.
 *
 * The preservation worker in proxy mode sends the raw srcset value of an
 * element plus a URL to resolve relative URLs against, so the
 * rewrite_srcset logic is recreated here.
 * @param {?Array<{srcset: string, resolve: ?string, mod: string}>} srcsets -
 * a `mod` of 'im_' selects the non-AV queue, anything else the AV queue
 */
AutoFetcher.prototype.extractSrcset = function (srcsets) {
  if (srcsets == null) return;
  var total = srcsets.length;
  for (var i = 0; i < total; i++) {
    var entry = srcsets[i];
    var parts = entry.srcset.split(srcsetSplit);
    for (var j = 0; j < parts.length; j++) {
      if (!parts[j]) continue;
      var trimmed = parts[j].trim();
      if (trimmed.length === 0) continue;
      // the URL is the first space separated token; safeResolve never
      // throws and falls back to the unresolved URL
      var resolvedURL = this.safeResolve(trimmed.split(' ')[0], entry.resolve);
      if (!resolvedURL) continue;
      if (entry.mod === 'im_') {
        this.queueNonAVURL(resolvedURL);
      } else {
        this.queueAVURL(resolvedURL);
      }
    }
  }
};
AutoFetcher.prototype.extractSrc = function (srcVals) {
// preservation worker in proxy mode sends us the value of the srcset attribute of an element
// and a URL to correctly resolve relative URLS. Thus we must recreate rewrite_srcset logic here
if (srcVals == null || srcVals.length === 0) return;
var length = srcVals.length;
var srcVal;
for (var i = 0; i < length; i++) {
srcVal = srcVals[i];
var resolvedURL = this.safeResolve(srcVal.src, srcVal.resolve);
if (resolvedURL) {
if (srcVal.mod === 'im_') {
this.queueNonAVURL(resolvedURL);
/**
 * Queue every URL listed in the supplied srcset attribute values.
 *
 * The preservation worker in proxy mode sends the raw srcset value of an
 * element plus a URL to resolve relative URLs against, so the
 * rewrite_srcset logic is recreated here.
 * @param {?Array<{srcset: string, resolve: ?string, mod: string}>} srcsets -
 * a `mod` of 'im_' selects the non-AV queue, anything else the AV queue
 */
AutoFetcher.prototype.extractSrcset = function(srcsets) {
  if (srcsets == null) return;
  var length = srcsets.length;
  var extractedSrcSet, srcsetValue, ssSplit, resolvedURL, j;
  for (var i = 0; i < length; i++) {
    extractedSrcSet = srcsets[i];
    ssSplit = extractedSrcSet.srcset.split(srcsetSplit);
    for (j = 0; j < ssSplit.length; j++) {
      // empty pieces produced by the split carry no URL
      if (!ssSplit[j]) continue;
      srcsetValue = ssSplit[j].trim();
      if (srcsetValue.length === 0) continue;
      // the URL is the first space separated token; safeResolve never
      // throws and falls back to the unresolved URL
      resolvedURL = this.safeResolve(
        srcsetValue.split(' ')[0],
        extractedSrcSet.resolve
      );
      if (!resolvedURL) continue;
      if (extractedSrcSet.mod === 'im_') {
        this.queueNonAVURL(resolvedURL);
      } else {
        this.queueAVURL(resolvedURL);
      }
    }
  }
};
AutoFetcher.prototype.autofetchMediaSrcset = function (data) {
// we got a message and now we autofetch!
// these calls turn into no ops if they have no work
this.extractMedia(data.media);
this.extractSrcset(data.srcset);
this.extractSrc(data.src);
this.fetchImgs();
this.fetchAV();
/**
 * Queue the URL of every supplied src value, resolved against the URL that
 * was sent along with it.
 * @param {?Array<{src: string, resolve: ?string, mod: string}>} srcVals -
 * a `mod` of 'im_' selects the non-AV queue, anything else the AV queue
 */
AutoFetcher.prototype.extractSrc = function(srcVals) {
  if (srcVals == null || srcVals.length === 0) return;
  var total = srcVals.length;
  for (var idx = 0; idx < total; idx++) {
    var entry = srcVals[idx];
    // safeResolve never throws and falls back to the unresolved URL
    var resolvedURL = this.safeResolve(entry.src, entry.resolve);
    if (!resolvedURL) continue;
    if (entry.mod === 'im_') {
      this.queueNonAVURL(resolvedURL);
    } else {
      this.queueAVURL(resolvedURL);
    }
  }
};
AutoFetcher.prototype.justFetch = function (data) {
// we got a message containing only urls to be fetched
if (data == null || data.values == null) return;
for (var i = 0; i < data.values.length; ++i) {
this.queueNonAVURL(data.values[i]);
}
this.fetchImgs();
/**
 * Handle a message of type 'values': queue the URLs found in its `media`,
 * `srcset` and `src` properties and then start draining both queues.
 * @param {Object} data - the message data
 */
AutoFetcher.prototype.autofetchMediaSrcset = function(data) {
// we got a message and now we autofetch!
// these calls turn into no ops if they have no work
// (each extract* method returns early when its argument is null/undefined)
this.extractMedia(data.media);
this.extractSrcset(data.srcset);
this.extractSrc(data.src);
// start fetching only after everything has been queued
this.fetchImgs();
this.fetchAV();
};
/**
 * Handle a message that only contains URLs to be fetched: every entry of
 * `data.values` is put on the non-AV queue, then the queue is drained.
 * @param {?{values: ?Array<string>}} data - the message data
 */
AutoFetcher.prototype.justFetch = function(data) {
  var urls = data == null ? null : data.values;
  if (urls == null) return;
  for (var idx = 0; idx < urls.length; idx++) {
    this.queueNonAVURL(urls[idx]);
  }
  this.fetchImgs();
};
autofetcher = new AutoFetcher();

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -1,82 +0,0 @@
// pywb mini rewriter for injection into web worker scripts
/**
 * Patch the worker global scope (`self`) so that requests made through
 * XMLHttpRequest, importScripts and fetch are sent to the rewritten URL.
 *
 * @param {Object} info
 * @param {string} [info.prefix] - prepended to every rewritten URL
 * @param {string} [info.originalURL] - URL that URLs starting with `/` are
 * resolved against before the prefix is added
 */
function WBWombat(info) {
  // resolve origURL against the original URL, falling back to the input
  function maybeResolveURL(origURL) {
    try {
      var resolved = new URL(origURL, info.originalURL);
      return resolved.href;
    } catch (e) {
      return origURL;
    }
  }

  function startsWith(str, prefix) {
    return str.indexOf(prefix) === 0;
  }

  function rewrite_url(url) {
    // nothing to rewrite, let the native function deal with it
    if (url == null) return url;
    // URL objects are valid arguments of open/fetch/importScripts,
    // rewrite their string form rather than throwing on a missing indexOf
    if (typeof url !== 'string') url = String(url);
    if (!url) return url;
    // blob and data URLs are not fetched from the server
    if (startsWith(url, 'blob:') || startsWith(url, 'data:')) return url;
    // already rewritten, do not add the prefix a second time
    if (info.prefix && startsWith(url, info.prefix)) return url;
    if (info.originalURL && startsWith(url, '/')) {
      url = maybeResolveURL(url);
    }
    if (info.prefix) {
      return info.prefix + url;
    }
    return url;
  }

  function init_ajax_rewrite() {
    var orig = self.XMLHttpRequest.prototype.open;
    function open_rewritten(method, url, async, user, password) {
      url = rewrite_url(url);
      // defaults to true
      if (async != false) {
        async = true;
      }
      var result = orig.call(this, method, url, async, user, password);
      if (typeof url !== 'string' || url.indexOf('data:') !== 0) {
        this.setRequestHeader('X-Pywb-Requested-With', 'XMLHttpRequest');
      }
      return result;
    }
    self.XMLHttpRequest.prototype.open = open_rewritten;
  }

  init_ajax_rewrite();

  function rewriteArgs(argsObj) {
    // recreate the original arguments object just with URLs rewritten
    var newArgObj = new Array(argsObj.length);
    for (var i = 0; i < newArgObj.length; i++) {
      newArgObj[i] = rewrite_url(argsObj[i]);
    }
    return newArgObj;
  }

  var origImportScripts = self.importScripts;
  self.importScripts = function importScripts() {
    // rewrite the arguments object and call original function via fn.apply
    var rwArgs = rewriteArgs(arguments);
    return origImportScripts.apply(this, rwArgs);
  };

  // shallow copy so the options object owned by the caller is left untouched
  function copyInit(init_opts) {
    var copy = {};
    if (init_opts) {
      for (var key in init_opts) {
        if (Object.prototype.hasOwnProperty.call(init_opts, key)) {
          copy[key] = init_opts[key];
        }
      }
    }
    return copy;
  }

  if (self.fetch != null) {
    // this fetch is Worker.fetch
    var orig_fetch = self.fetch;
    self.fetch = function(input, init_opts) {
      var inputType = typeof input;
      if (inputType === 'string') {
        input = rewrite_url(input);
      } else if (inputType === 'object' && input) {
        if (input.url) {
          // a Request, only rebuilt when its URL actually changed
          var new_url = rewrite_url(input.url);
          if (new_url !== input.url) {
            input = new Request(new_url, input);
          }
        } else if (typeof input.href === 'string') {
          // a URL object, would otherwise be fetched without rewriting
          input = rewrite_url(input.href);
        }
      }
      var opts = copyInit(init_opts);
      opts['credentials'] = 'include';
      return orig_fetch.call(this, input, opts);
    };
  }
}

View File

@ -1 +1 @@
__version__ = '2.2.20190410'
__version__ = '2.3.0.dev0'

View File

@ -23,6 +23,9 @@ def fmod_sl(request):
# ============================================================================
class BaseConfigTest(BaseTestClass):
lint_app = True
extra_headers = {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'
}
@classmethod
def get_test_app(cls, config_file, custom_config=None):
@ -62,21 +65,34 @@ class BaseConfigTest(BaseTestClass):
assert resp.content_length > 0
def get(self, url, fmod, *args, **kwargs):
    """Issue a GET through the framed or the non-framed test app.

    ``url`` is a format string that receives ``fmod``. A truthy ``fmod``
    selects ``self.testapp``, otherwise ``self.testapp_non_frame`` is used.
    A default User-Agent header is ensured before the request is made; all
    remaining arguments are forwarded to the app's ``get``.
    """
    self.__ensure_headers(kwargs)
    if fmod:
        target_app = self.testapp
    else:
        target_app = self.testapp_non_frame
    return target_app.get(url.format(fmod), *args, **kwargs)
def post(self, url, fmod, *args, **kwargs):
    """Issue a POST through the framed or the non-framed test app.

    ``url`` is a format string that receives ``fmod``. A truthy ``fmod``
    selects ``self.testapp``, otherwise ``self.testapp_non_frame`` is used.
    A default User-Agent header is ensured before the request is made; all
    remaining arguments are forwarded to the app's ``post``.
    """
    self.__ensure_headers(kwargs)
    if fmod:
        target_app = self.testapp
    else:
        target_app = self.testapp_non_frame
    return target_app.post(url.format(fmod), *args, **kwargs)
def post_json(self, url, fmod, *args, **kwargs):
    """Issue a JSON POST through the framed or the non-framed test app.

    ``url`` is a format string that receives ``fmod``. A truthy ``fmod``
    selects ``self.testapp``, otherwise ``self.testapp_non_frame`` is used.
    A default User-Agent header is ensured before the request is made; all
    remaining arguments are forwarded to the app's ``post_json``.
    """
    self.__ensure_headers(kwargs)
    if fmod:
        target_app = self.testapp
    else:
        target_app = self.testapp_non_frame
    return target_app.post_json(url.format(fmod), *args, **kwargs)
def head(self, url, fmod, *args, **kwargs):
    """Issue a HEAD through the framed or the non-framed test app.

    ``url`` is a format string that receives ``fmod``. A truthy ``fmod``
    selects ``self.testapp``, otherwise ``self.testapp_non_frame`` is used.
    A default User-Agent header is ensured before the request is made; all
    remaining arguments are forwarded to the app's ``head``.
    """
    self.__ensure_headers(kwargs)
    if fmod:
        target_app = self.testapp
    else:
        target_app = self.testapp_non_frame
    return target_app.head(url.format(fmod), *args, **kwargs)
def __ensure_headers(self, kwargs):
    """Make sure the request ``kwargs`` carry a User-Agent header.

    When no ``headers`` entry exists an empty dict is installed. The
    User-Agent from ``self.extra_headers`` is added to the headers in place
    only if they are a dict that does not define one already; any other
    kind of ``headers`` value is left untouched.
    """
    # setdefault hands back the caller supplied value when the key exists,
    # otherwise it stores and returns the fresh dict
    headers = kwargs.setdefault('headers', {})
    if not isinstance(headers, dict):
        return
    if 'User-Agent' in headers:
        return
    headers['User-Agent'] = self.extra_headers['User-Agent']
#=============================================================================
class CollsDirMixin(TempDirTests):

View File

@ -31,7 +31,7 @@ class TestRootColl(BaseConfigTest):
def test_root_replay_redir(self, fmod):
resp = self.get('/20140128051539{0}/http://www.iana.org/domains/example', fmod)
assert resp.status_int == 302
assert resp.status_int in (301, 302)
assert resp.headers['Location'] == 'http://localhost:80/20140128051539{0}/https://www.iana.org/domains/reserved'.format(fmod)

1
wombat Submodule

@ -0,0 +1 @@
Subproject commit 0b0c171a4f0f34114ba3cefd5ba80304515f4ef8