mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
Server-side rewriting of script[src='js/...'] and link rel='import' (#334)
* Updated html_rewriter.py to account for rewriting of script[src] values that are super relative (http://fotopaulmartens.netcam.nl/vucht.php) and added link rel='import' rewriting. Updated test_html_rewriter.py for super relative script[src] rewriting and link rel='import'. Updated wombat to account for the new rewriting of script[src] (http://fotopaulmartens.netcam.nl/vucht.php). Changed the postMessage override in wombat to use $wbwindow rather than window to fix Google Calendar replay / recording (http://qasrcc.org/events/calendar/).
* Updated tests for forcing absolute rewriting and fixed merge conflicts.
* wombat: extracted removal and retrieval of __wb_orig_src into their own functions.
This commit is contained in:
parent
ac5b4da9eb
commit
bb5d46d19b
@ -221,7 +221,7 @@ class HTMLRewriterMixin(StreamingRewriter):
|
||||
url = urlunsplit((scheme, netloc, path, query, frag))
|
||||
return url
|
||||
|
||||
def _rewrite_url(self, value, mod=None):
|
||||
def _rewrite_url(self, value, mod=None, force_abs=False):
|
||||
if not value:
|
||||
return ''
|
||||
|
||||
@ -230,7 +230,7 @@ class HTMLRewriterMixin(StreamingRewriter):
|
||||
return ''
|
||||
|
||||
unesc_value = self.try_unescape(value)
|
||||
rewritten_value = self.url_rewriter.rewrite(unesc_value, mod)
|
||||
rewritten_value = self.url_rewriter.rewrite(unesc_value, mod, force_abs)
|
||||
|
||||
if unesc_value != value and rewritten_value != unesc_value:
|
||||
rewritten_value = rewritten_value.replace(unesc_value, value)
|
||||
@ -379,6 +379,14 @@ class HTMLRewriterMixin(StreamingRewriter):
|
||||
rw_mod = self.defmod
|
||||
attr_value = self._rewrite_url(attr_value, rw_mod)
|
||||
|
||||
elif tag == 'script' and attr_name == 'src':
|
||||
rw_mod = handler.get(attr_name)
|
||||
ov = attr_value
|
||||
attr_value = self._rewrite_url(attr_value, rw_mod)
|
||||
if attr_value == ov and not ov.startswith(self.url_rewriter.NO_REWRITE_URI_PREFIX):
|
||||
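# URL came back unchanged and is not a no-rewrite scheme, so it is likely a bare relative path (e.g. src='js/...'); force absolute rewriting, because a PHP-served script may be served with an HTML MIME type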
|
||||
attr_value = self._rewrite_url(attr_value, rw_mod, True)
|
||||
self._write_attr('__wb_orig_src', ov, empty_attr=None)
|
||||
else:
|
||||
# rewrite url using tag handler
|
||||
rw_mod = handler.get(attr_name)
|
||||
@ -408,6 +416,10 @@ class HTMLRewriterMixin(StreamingRewriter):
|
||||
preload = self.get_attr(tag_attrs, 'as')
|
||||
rw_mod = self.PRELOAD_TYPES.get(preload, rw_mod)
|
||||
|
||||
# for html imports with an optional as (google exclusive)
|
||||
elif rel == 'import':
|
||||
rw_mod = 'mp_'
|
||||
|
||||
elif rel == 'stylesheet':
|
||||
rw_mod = 'cs_'
|
||||
|
||||
|
@ -211,6 +211,10 @@ r"""
|
||||
>>> parse('<script type="application/json">{"embed top test": "http://example.com/a/b/c.html"}</script>')
|
||||
<script type="application/json">{"embed top test": "http://example.com/a/b/c.html"}</script>
|
||||
|
||||
# Script tag with super relative src
|
||||
>>> parse('<script src="js/fun.js"></script>')
|
||||
<script __wb_orig_src="js/fun.js" src="/web/20131226101010js_/http://example.com/some/path/js/fun.js"></script>
|
||||
|
||||
# Script tag + crossorigin + integrity
|
||||
>>> parse('<script src="/js/scripts.js" crossorigin="anonymous" integrity="ABC"></script>')
|
||||
<script src="/web/20131226101010js_/http://example.com/js/scripts.js" _crossorigin="anonymous" _integrity="ABC"></script>
|
||||
@ -260,7 +264,7 @@ r"""
|
||||
<html><head><script src="cool.js"></script><script src="/web/20131226101010js_/http://example.com/other.js"></script></head><body>Test</body></html>
|
||||
|
||||
>>> parse('<html><script src="other.js"></script></html>', head_insert = '<script src="cool.js"></script>')
|
||||
<html><script src="cool.js"></script><script src="other.js"></script></html>
|
||||
<html><script src="cool.js"></script><script __wb_orig_src="other.js" src="/web/20131226101010js_/http://example.com/some/path/other.js"></script></html>
|
||||
|
||||
>>> parse('<html><head/><body>Test</body></html>', head_insert = '<script src="cool.js"></script>')
|
||||
<html><head><script src="cool.js"></script></head><body>Test</body></html>
|
||||
@ -310,10 +314,37 @@ r"""
|
||||
>>> parse('<link rel="preload" as="video" href="http://example.com/some/other/path">')
|
||||
<link rel="preload" as="video" href="/web/20131226101010oe_/http://example.com/some/other/path">
|
||||
|
||||
>>> parse('<link rel="preload" as="worker" href="http://example.com/some/other/path">')
|
||||
<link rel="preload" as="worker" href="/web/20131226101010js_/http://example.com/some/other/path">
|
||||
|
||||
>>> parse('<link rel="preload" as="font" href="http://example.com/some/other/path">')
|
||||
<link rel="preload" as="font" href="/web/20131226101010oe_/http://example.com/some/other/path">
|
||||
|
||||
>>> parse('<link rel="preload" as="audio" href="http://example.com/some/other/path">')
|
||||
<link rel="preload" as="audio" href="/web/20131226101010oe_/http://example.com/some/other/path">
|
||||
|
||||
>>> parse('<link rel="preload" as="embed" href="http://example.com/some/other/path">')
|
||||
<link rel="preload" as="embed" href="/web/20131226101010oe_/http://example.com/some/other/path">
|
||||
|
||||
>>> parse('<link rel="preload" as="object" href="http://example.com/some/other/path">')
|
||||
<link rel="preload" as="object" href="/web/20131226101010oe_/http://example.com/some/other/path">
|
||||
|
||||
>>> parse('<link rel="preload" as="track" href="http://example.com/some/other/path">')
|
||||
<link rel="preload" as="track" href="/web/20131226101010oe_/http://example.com/some/other/path">
|
||||
|
||||
>>> parse('<link rel="preload" as="fetch" href="http://example.com/some/other/path">')
|
||||
<link rel="preload" as="fetch" href="/web/20131226101010mp_/http://example.com/some/other/path">
|
||||
|
||||
# stylesheet
|
||||
>>> parse('<link rel="stylesheet" href="http://example.com/some/other/path">')
|
||||
<link rel="stylesheet" href="/web/20131226101010cs_/http://example.com/some/other/path">
|
||||
|
||||
# rel='import'
|
||||
>>> parse('<link rel="import" href="http://example.com/componemts/app.html">')
|
||||
<link rel="import" href="/web/20131226101010mp_/http://example.com/componemts/app.html">
|
||||
|
||||
>>> parse('<link rel="import" as="document" href="http://example.com/componemts/app.html">')
|
||||
<link rel="import" as="document" href="/web/20131226101010mp_/http://example.com/componemts/app.html">
|
||||
|
||||
# doctype
|
||||
>>> parse('<!doctype html PUBLIC "public">')
|
||||
|
@ -148,6 +148,15 @@
|
||||
>>> x = SchemeOnlyUrlRewriter('http://example.com'); x.rebase_rewriter('https://example.com/') == x
|
||||
True
|
||||
|
||||
# forcing absolute url rewrites
|
||||
>>> UrlRewriter('http://example.com/vucht.php', 'http://localhost:8080/live/').rewrite('js/bundle.php?v=1', 'js_', True)
|
||||
'/live/js_/http://example.com/js/bundle.php?v=1'
|
||||
|
||||
>>> UrlRewriter('http://example.com/vucht.php', 'http://localhost:8080/live/').rewrite('js/bundle.php?v=1', 'js_')
|
||||
'js/bundle.php?v=1'
|
||||
|
||||
>>> SchemeOnlyUrlRewriter('https://example.com/abc').rewrite('//example.com/abc', force_abs=True)
|
||||
'//example.com/abc'
|
||||
"""
|
||||
|
||||
|
||||
|
@ -40,7 +40,7 @@ class UrlRewriter(object):
|
||||
if self.rewrite_opts.get('punycode_links'):
|
||||
self.wburl._do_percent_encode = False
|
||||
|
||||
def rewrite(self, url, mod=None):
|
||||
def rewrite(self, url, mod=None, force_abs=False):
|
||||
# if special protocol, no rewriting at all
|
||||
if url.startswith(self.NO_REWRITE_URI_PREFIX):
|
||||
return url
|
||||
@ -63,7 +63,7 @@ class UrlRewriter(object):
|
||||
if url.startswith(self.REL_SCHEME):
|
||||
is_abs = True
|
||||
scheme_rel = True
|
||||
elif (not is_abs and
|
||||
elif (not force_abs and not is_abs and
|
||||
not url.startswith(self.REL_PATH) and
|
||||
self.PARENT_PATH not in url):
|
||||
return url
|
||||
@ -165,7 +165,7 @@ class IdentityUrlRewriter(UrlRewriter):
|
||||
"""
|
||||
No rewriting performed, return original url
|
||||
"""
|
||||
def rewrite(self, url, mod=None):
|
||||
def rewrite(self, url, mod=None, force_abs=False):
|
||||
return url
|
||||
|
||||
def get_new_url(self, **kwargs):
|
||||
@ -197,7 +197,7 @@ class SchemeOnlyUrlRewriter(IdentityUrlRewriter):
|
||||
else:
|
||||
self.opposite_scheme = 'https'
|
||||
|
||||
def rewrite(self, url, mod=None):
|
||||
def rewrite(self, url, mod=None, force_abs=False):
|
||||
if url.startswith(self.opposite_scheme + '://'):
|
||||
url = self.url_scheme + url[len(self.opposite_scheme):]
|
||||
|
||||
|
@ -153,6 +153,27 @@ var _WBWombat = function($wbwindow, wbinfo) {
|
||||
return mod;
|
||||
}
|
||||
|
||||
function removeWBOSRC(elem) {
|
||||
if (elem.tagName === 'SCRIPT' && !elem.__$removedWBOSRC$__) {
|
||||
if (elem.hasAttribute('__wb_orig_src')) {
|
||||
elem.removeAttribute('__wb_orig_src');
|
||||
}
|
||||
elem.__$removedWBOSRC$__ = true;
|
||||
}
|
||||
}
|
||||
|
||||
function retrieveWBOSRC(elem) {
|
||||
if (elem.tagName === 'SCRIPT' && !elem.__$removedWBOSRC$__) {
|
||||
var maybeWBOSRC;
|
||||
if (wb_getAttribute) {
|
||||
maybeWBOSRC = wb_getAttribute.call(elem, '__wb_orig_src');
|
||||
} else {
|
||||
maybeWBOSRC = elem.getAttribute('__wb_orig_src');
|
||||
}
|
||||
return maybeWBOSRC;
|
||||
}
|
||||
}
|
||||
|
||||
//============================================
|
||||
function is_host_url(str) {
|
||||
// Good guess that it's a hostname
|
||||
@ -703,7 +724,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
|
||||
this.reload = function() {
|
||||
return this._orig_loc.reload();
|
||||
}
|
||||
|
||||
|
||||
this.orig_getter = function(prop) {
|
||||
return this._orig_loc[prop];
|
||||
}
|
||||
@ -713,7 +734,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
|
||||
}
|
||||
|
||||
init_loc_override(this, this.orig_setter, this.orig_getter);
|
||||
|
||||
|
||||
set_loc(this, orig_loc.href);
|
||||
|
||||
this.toString = function() {
|
||||
@ -1089,6 +1110,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
|
||||
}
|
||||
|
||||
//============================================
|
||||
|
||||
function init_setAttribute_override()
|
||||
{
|
||||
if (!$wbwindow.Element ||
|
||||
@ -1110,6 +1132,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
|
||||
value = rewrite_inline_style(value);
|
||||
|
||||
} else if (should_rewrite_attr(this.tagName, lowername)) {
|
||||
removeWBOSRC(this);
|
||||
if (!this._no_rewrite) {
|
||||
var mod = rwModForElement(this, lowername);
|
||||
value = rewrite_url(value, false, mod);
|
||||
@ -1129,7 +1152,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
|
||||
{
|
||||
if (!$wbwindow.Element ||
|
||||
!$wbwindow.Element.prototype ||
|
||||
!$wbwindow.Element.prototype.setAttribute) {
|
||||
!$wbwindow.Element.prototype.getAttribute) {
|
||||
return;
|
||||
}
|
||||
|
||||
@ -1140,6 +1163,10 @@ var _WBWombat = function($wbwindow, wbinfo) {
|
||||
var result = orig_getAttribute.call(this, name);
|
||||
|
||||
if (should_rewrite_attr(this.tagName, name)) {
|
||||
var maybeWBOSRC = retrieveWBOSRC(this);
|
||||
if (maybeWBOSRC) {
|
||||
return maybeWBOSRC;
|
||||
}
|
||||
result = extract_orig(result);
|
||||
} else if (starts_with(name, "data-") && starts_with(result, VALID_PREFIXES)) {
|
||||
result = extract_orig(result);
|
||||
@ -1409,6 +1436,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
|
||||
}
|
||||
|
||||
if (new_value != value) {
|
||||
removeWBOSRC(elem);
|
||||
wb_setAttribute.call(elem, name, new_value);
|
||||
return true;
|
||||
}
|
||||
@ -1785,6 +1813,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
|
||||
}
|
||||
val = rewrite_url(orig, false, mod);
|
||||
} else {
|
||||
removeWBOSRC(this);
|
||||
val = rewrite_url(orig, false, mod);
|
||||
}
|
||||
|
||||
@ -1864,7 +1893,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
|
||||
override_attr($wbwindow.HTMLMetaElement.prototype, "content", "mp_");
|
||||
|
||||
override_attr($wbwindow.HTMLFormElement.prototype, "action", "mp_");
|
||||
|
||||
|
||||
override_anchor_elem();
|
||||
|
||||
var style_proto = $wbwindow.CSSStyleDeclaration.prototype;
|
||||
@ -1913,7 +1942,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
|
||||
|
||||
for (var i = 0; i < URL_PROPS.length; i++) {
|
||||
save_prop(URL_PROPS[i]);
|
||||
}
|
||||
}
|
||||
|
||||
var anchor_setter = function(prop, value) {
|
||||
var func = anchor_orig["set_" + prop];
|
||||
@ -2046,7 +2075,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
|
||||
//}
|
||||
text = rewrite_html(text);
|
||||
}
|
||||
|
||||
|
||||
return orig_insertAdjacentHTML.call(this, position, text);
|
||||
}
|
||||
|
||||
@ -2286,7 +2315,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
|
||||
}
|
||||
|
||||
var orig = $wbwindow.postMessage;
|
||||
|
||||
|
||||
$wbwindow.__orig_postMessage = orig;
|
||||
|
||||
// use this_obj.__WB_source not window to fix google calendar embeds, pm_origin sets this.__WB_source
|
||||
|
Loading…
x
Reference in New Issue
Block a user