From bb5d46d19b8740753504bbfba361e24da7c31777 Mon Sep 17 00:00:00 2001
From: John Berlin
Date: Thu, 14 Jun 2018 13:56:46 -0400
Subject: [PATCH] Server-side rewriting of script[src='js/...'] and link
rel='import' (#334)
* Updated html_rewriter.py to rewrite script[src] values that are "super relative", i.e. bare relative paths such as 'js/...' (http://fotopaulmartens.netcam.nl/vucht.php), and added link rel='import' rewriting
Updated test_html_rewriter.py for super rel script[src] rewriting and link rel='import'
Updated wombat to account for the new rewriting of script[src] (http://fotopaulmartens.netcam.nl/vucht.php)
Changed the postMessage override in wombat to use $wbwindow rather than window to fix google calendar replay / recording (http://qasrcc.org/events/calendar/)
* Updated tests for forcing absolute and fixed merge conflicts
* wombat: extracted removal and retrieval of __wb_orig_src into their own functions
---
pywb/rewrite/html_rewriter.py | 16 +++++++--
pywb/rewrite/test/test_html_rewriter.py | 33 ++++++++++++++++++-
pywb/rewrite/test/test_url_rewriter.py | 9 ++++++
pywb/rewrite/url_rewriter.py | 8 ++---
pywb/static/wombat.js | 43 +++++++++++++++++++++----
5 files changed, 95 insertions(+), 14 deletions(-)
diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py
index ccc8bf02..a77e02b1 100644
--- a/pywb/rewrite/html_rewriter.py
+++ b/pywb/rewrite/html_rewriter.py
@@ -221,7 +221,7 @@ class HTMLRewriterMixin(StreamingRewriter):
url = urlunsplit((scheme, netloc, path, query, frag))
return url
- def _rewrite_url(self, value, mod=None):
+ def _rewrite_url(self, value, mod=None, force_abs=False):
if not value:
return ''
@@ -230,7 +230,7 @@ class HTMLRewriterMixin(StreamingRewriter):
return ''
unesc_value = self.try_unescape(value)
- rewritten_value = self.url_rewriter.rewrite(unesc_value, mod)
+ rewritten_value = self.url_rewriter.rewrite(unesc_value, mod, force_abs)
if unesc_value != value and rewritten_value != unesc_value:
rewritten_value = rewritten_value.replace(unesc_value, value)
@@ -379,6 +379,14 @@ class HTMLRewriterMixin(StreamingRewriter):
rw_mod = self.defmod
attr_value = self._rewrite_url(attr_value, rw_mod)
+ elif tag == 'script' and attr_name == 'src':
+ rw_mod = handler.get(attr_name)
+ ov = attr_value
+ attr_value = self._rewrite_url(attr_value, rw_mod)
+ if attr_value == ov and not ov.startswith(self.url_rewriter.NO_REWRITE_URI_PREFIX):
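+ # URL came back unchanged but is not a no-rewrite URI, so it is likely a bare relative path (src='js/...'); force absolute rewriting so the script mod is applied (presumably because e.g. PHP-served JS can otherwise be treated as HTML)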
+ attr_value = self._rewrite_url(attr_value, rw_mod, True)
+ self._write_attr('__wb_orig_src', ov, empty_attr=None)
else:
# rewrite url using tag handler
rw_mod = handler.get(attr_name)
@@ -408,6 +416,10 @@ class HTMLRewriterMixin(StreamingRewriter):
preload = self.get_attr(tag_attrs, 'as')
rw_mod = self.PRELOAD_TYPES.get(preload, rw_mod)
+ # for HTML imports (link rel='import', optionally with an 'as' attribute); a Google Chrome-only feature
+ elif rel == 'import':
+ rw_mod = 'mp_'
+
elif rel == 'stylesheet':
rw_mod = 'cs_'
diff --git a/pywb/rewrite/test/test_html_rewriter.py b/pywb/rewrite/test/test_html_rewriter.py
index 7c4ff26b..d85b7b78 100644
--- a/pywb/rewrite/test/test_html_rewriter.py
+++ b/pywb/rewrite/test/test_html_rewriter.py
@@ -211,6 +211,10 @@ r"""
>>> parse('')
+# Script tag with super relative src
+>>> parse('')
+
+
# Script tag + crossorigin + integrity
>>> parse('')
@@ -260,7 +264,7 @@ r"""
Test
>>> parse('', head_insert = '')
-
+
>>> parse('
Test', head_insert = '')
Test
@@ -310,10 +314,37 @@ r"""
>>> parse('')
+>>> parse('')
+
+
+>>> parse('')
+
+
+>>> parse('')
+
+
+>>> parse('')
+
+
+>>> parse('')
+
+
+>>> parse('')
+
+
+>>> parse('')
+
+
# stylesheet
>>> parse('')
+# rel='import'
+>>> parse('')
+
+
+>>> parse('')
+
# doctype
>>> parse('')
diff --git a/pywb/rewrite/test/test_url_rewriter.py b/pywb/rewrite/test/test_url_rewriter.py
index da243cd7..2fb537e4 100644
--- a/pywb/rewrite/test/test_url_rewriter.py
+++ b/pywb/rewrite/test/test_url_rewriter.py
@@ -148,6 +148,15 @@
>>> x = SchemeOnlyUrlRewriter('http://example.com'); x.rebase_rewriter('https://example.com/') == x
True
+# forcing absolute url rewrites
+>>> UrlRewriter('http://example.com/vucht.php', 'http://localhost:8080/live/').rewrite('js/bundle.php?v=1', 'js_', True)
+'/live/js_/http://example.com/js/bundle.php?v=1'
+
+>>> UrlRewriter('http://example.com/vucht.php', 'http://localhost:8080/live/').rewrite('js/bundle.php?v=1', 'js_')
+'js/bundle.php?v=1'
+
+>>> SchemeOnlyUrlRewriter('https://example.com/abc').rewrite('//example.com/abc', force_abs=True)
+'//example.com/abc'
"""
diff --git a/pywb/rewrite/url_rewriter.py b/pywb/rewrite/url_rewriter.py
index 6a42e3bf..d608ad1e 100644
--- a/pywb/rewrite/url_rewriter.py
+++ b/pywb/rewrite/url_rewriter.py
@@ -40,7 +40,7 @@ class UrlRewriter(object):
if self.rewrite_opts.get('punycode_links'):
self.wburl._do_percent_encode = False
- def rewrite(self, url, mod=None):
+ def rewrite(self, url, mod=None, force_abs=False):
# if special protocol, no rewriting at all
if url.startswith(self.NO_REWRITE_URI_PREFIX):
return url
@@ -63,7 +63,7 @@ class UrlRewriter(object):
if url.startswith(self.REL_SCHEME):
is_abs = True
scheme_rel = True
- elif (not is_abs and
+ elif (not force_abs and not is_abs and
not url.startswith(self.REL_PATH) and
self.PARENT_PATH not in url):
return url
@@ -165,7 +165,7 @@ class IdentityUrlRewriter(UrlRewriter):
"""
No rewriting performed, return original url
"""
- def rewrite(self, url, mod=None):
+ def rewrite(self, url, mod=None, force_abs=False):
return url
def get_new_url(self, **kwargs):
@@ -197,7 +197,7 @@ class SchemeOnlyUrlRewriter(IdentityUrlRewriter):
else:
self.opposite_scheme = 'https'
- def rewrite(self, url, mod=None):
+ def rewrite(self, url, mod=None, force_abs=False):
if url.startswith(self.opposite_scheme + '://'):
url = self.url_scheme + url[len(self.opposite_scheme):]
diff --git a/pywb/static/wombat.js b/pywb/static/wombat.js
index 336fcf5b..86622931 100644
--- a/pywb/static/wombat.js
+++ b/pywb/static/wombat.js
@@ -153,6 +153,27 @@ var _WBWombat = function($wbwindow, wbinfo) {
return mod;
}
+ function removeWBOSRC(elem) { // one-time removal of the '__wb_orig_src' attribute from a SCRIPT element
+ if (elem.tagName === 'SCRIPT' && !elem.__$removedWBOSRC$__) { // skip non-SCRIPT elements and ones already processed
+ if (elem.hasAttribute('__wb_orig_src')) {
+ elem.removeAttribute('__wb_orig_src');
+ }
+ elem.__$removedWBOSRC$__ = true; // set even when the attribute was absent; retrieveWBOSRC checks this flag too
+ }
+ }
+
+ function retrieveWBOSRC(elem) { // returns the '__wb_orig_src' attribute value of a SCRIPT element not yet flagged by removeWBOSRC
+ if (elem.tagName === 'SCRIPT' && !elem.__$removedWBOSRC$__) {
+ var maybeWBOSRC;
+ if (wb_getAttribute) { // wb_getAttribute is defined outside this hunk; presumably the saved original getAttribute -- confirm
+ maybeWBOSRC = wb_getAttribute.call(elem, '__wb_orig_src');
+ } else {
+ maybeWBOSRC = elem.getAttribute('__wb_orig_src');
+ }
+ return maybeWBOSRC;
+ }
+ } // implicitly returns undefined for non-SCRIPT elements or once the element has been flagged
+
//============================================
function is_host_url(str) {
// Good guess that's its a hostname
@@ -703,7 +724,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
this.reload = function() {
return this._orig_loc.reload();
}
-
+
this.orig_getter = function(prop) {
return this._orig_loc[prop];
}
@@ -713,7 +734,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
}
init_loc_override(this, this.orig_setter, this.orig_getter);
-
+
set_loc(this, orig_loc.href);
this.toString = function() {
@@ -1089,6 +1110,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
}
//============================================
+
function init_setAttribute_override()
{
if (!$wbwindow.Element ||
@@ -1110,6 +1132,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
value = rewrite_inline_style(value);
} else if (should_rewrite_attr(this.tagName, lowername)) {
+ removeWBOSRC(this);
if (!this._no_rewrite) {
var mod = rwModForElement(this, lowername);
value = rewrite_url(value, false, mod);
@@ -1129,7 +1152,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
{
if (!$wbwindow.Element ||
!$wbwindow.Element.prototype ||
- !$wbwindow.Element.prototype.setAttribute) {
+ !$wbwindow.Element.prototype.getAttribute) {
return;
}
@@ -1140,6 +1163,10 @@ var _WBWombat = function($wbwindow, wbinfo) {
var result = orig_getAttribute.call(this, name);
if (should_rewrite_attr(this.tagName, name)) {
+ var maybeWBOSRC = retrieveWBOSRC(this);
+ if (maybeWBOSRC) {
+ return maybeWBOSRC;
+ }
result = extract_orig(result);
} else if (starts_with(name, "data-") && starts_with(result, VALID_PREFIXES)) {
result = extract_orig(result);
@@ -1409,6 +1436,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
}
if (new_value != value) {
+ removeWBOSRC(elem);
wb_setAttribute.call(elem, name, new_value);
return true;
}
@@ -1785,6 +1813,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
}
val = rewrite_url(orig, false, mod);
} else {
+ removeWBOSRC(this);
val = rewrite_url(orig, false, mod);
}
@@ -1864,7 +1893,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
override_attr($wbwindow.HTMLMetaElement.prototype, "content", "mp_");
override_attr($wbwindow.HTMLFormElement.prototype, "action", "mp_");
-
+
override_anchor_elem();
var style_proto = $wbwindow.CSSStyleDeclaration.prototype;
@@ -1913,7 +1942,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
for (var i = 0; i < URL_PROPS.length; i++) {
save_prop(URL_PROPS[i]);
- }
+ }
var anchor_setter = function(prop, value) {
var func = anchor_orig["set_" + prop];
@@ -2046,7 +2075,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
//}
text = rewrite_html(text);
}
-
+
return orig_insertAdjacentHTML.call(this, position, text);
}
@@ -2286,7 +2315,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
}
var orig = $wbwindow.postMessage;
-
+
$wbwindow.__orig_postMessage = orig;
// use this_obj.__WB_source not window to fix google calendar embeds, pm_origin sets this.__WB_source