1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Server-side rewriting of script[src='js/...'] and link rel='import' (#334)

* Updated html_rewriter.py to account for rewriting of script[src] values that are super relative (http://fotopaulmartens.netcam.nl/vucht.php) and added link rel='import' rewriting
Updated test_html_rewriter.py for super rel script[src] rewriting and link rel='import'
Updated wombat to account for the new rewriting of script[src]  (http://fotopaulmartens.netcam.nl/vucht.php)
Changed the postMessage override in wombat to use $wbwindow rather than window to fix google calendar replay / recording (http://qasrcc.org/events/calendar/)

* Updated tests for forcing absolute and fixed merge conflicts

* wombat: extracted removal and retrieval of __wb_orig_src into their own functions
This commit is contained in:
John Berlin 2018-06-14 13:56:46 -04:00 committed by Ilya Kreymer
parent ac5b4da9eb
commit bb5d46d19b
5 changed files with 95 additions and 14 deletions

View File

@ -221,7 +221,7 @@ class HTMLRewriterMixin(StreamingRewriter):
url = urlunsplit((scheme, netloc, path, query, frag))
return url
def _rewrite_url(self, value, mod=None):
def _rewrite_url(self, value, mod=None, force_abs=False):
if not value:
return ''
@ -230,7 +230,7 @@ class HTMLRewriterMixin(StreamingRewriter):
return ''
unesc_value = self.try_unescape(value)
rewritten_value = self.url_rewriter.rewrite(unesc_value, mod)
rewritten_value = self.url_rewriter.rewrite(unesc_value, mod, force_abs)
if unesc_value != value and rewritten_value != unesc_value:
rewritten_value = rewritten_value.replace(unesc_value, value)
@ -379,6 +379,14 @@ class HTMLRewriterMixin(StreamingRewriter):
rw_mod = self.defmod
attr_value = self._rewrite_url(attr_value, rw_mod)
elif tag == 'script' and attr_name == 'src':
rw_mod = handler.get(attr_name)
ov = attr_value
attr_value = self._rewrite_url(attr_value, rw_mod)
if attr_value == ov and not ov.startswith(self.url_rewriter.NO_REWRITE_URI_PREFIX):
# URL not skipped, likely src='js/....', forcing abs to make sure, cause PHP MIME(JS) === HTML
attr_value = self._rewrite_url(attr_value, rw_mod, True)
self._write_attr('__wb_orig_src', ov, empty_attr=None)
else:
# rewrite url using tag handler
rw_mod = handler.get(attr_name)
@ -408,6 +416,10 @@ class HTMLRewriterMixin(StreamingRewriter):
preload = self.get_attr(tag_attrs, 'as')
rw_mod = self.PRELOAD_TYPES.get(preload, rw_mod)
# for html imports with an optional as (google exclusive)
elif rel == 'import':
rw_mod = 'mp_'
elif rel == 'stylesheet':
rw_mod = 'cs_'

View File

@ -211,6 +211,10 @@ r"""
>>> parse('<script type="application/json">{"embed top test": "http://example.com/a/b/c.html"}</script>')
<script type="application/json">{"embed top test": "http://example.com/a/b/c.html"}</script>
# Script tag with super relative src
>>> parse('<script src="js/fun.js"></script>')
<script __wb_orig_src="js/fun.js" src="/web/20131226101010js_/http://example.com/some/path/js/fun.js"></script>
# Script tag + crossorigin + integrity
>>> parse('<script src="/js/scripts.js" crossorigin="anonymous" integrity="ABC"></script>')
<script src="/web/20131226101010js_/http://example.com/js/scripts.js" _crossorigin="anonymous" _integrity="ABC"></script>
@ -260,7 +264,7 @@ r"""
<html><head><script src="cool.js"></script><script src="/web/20131226101010js_/http://example.com/other.js"></script></head><body>Test</body></html>
>>> parse('<html><script src="other.js"></script></html>', head_insert = '<script src="cool.js"></script>')
<html><script src="cool.js"></script><script src="other.js"></script></html>
<html><script src="cool.js"></script><script __wb_orig_src="other.js" src="/web/20131226101010js_/http://example.com/some/path/other.js"></script></html>
>>> parse('<html><head/><body>Test</body></html>', head_insert = '<script src="cool.js"></script>')
<html><head><script src="cool.js"></script></head><body>Test</body></html>
@ -310,10 +314,37 @@ r"""
>>> parse('<link rel="preload" as="video" href="http://example.com/some/other/path">')
<link rel="preload" as="video" href="/web/20131226101010oe_/http://example.com/some/other/path">
>>> parse('<link rel="preload" as="worker" href="http://example.com/some/other/path">')
<link rel="preload" as="worker" href="/web/20131226101010js_/http://example.com/some/other/path">
>>> parse('<link rel="preload" as="font" href="http://example.com/some/other/path">')
<link rel="preload" as="font" href="/web/20131226101010oe_/http://example.com/some/other/path">
>>> parse('<link rel="preload" as="audio" href="http://example.com/some/other/path">')
<link rel="preload" as="audio" href="/web/20131226101010oe_/http://example.com/some/other/path">
>>> parse('<link rel="preload" as="embed" href="http://example.com/some/other/path">')
<link rel="preload" as="embed" href="/web/20131226101010oe_/http://example.com/some/other/path">
>>> parse('<link rel="preload" as="object" href="http://example.com/some/other/path">')
<link rel="preload" as="object" href="/web/20131226101010oe_/http://example.com/some/other/path">
>>> parse('<link rel="preload" as="track" href="http://example.com/some/other/path">')
<link rel="preload" as="track" href="/web/20131226101010oe_/http://example.com/some/other/path">
>>> parse('<link rel="preload" as="fetch" href="http://example.com/some/other/path">')
<link rel="preload" as="fetch" href="/web/20131226101010mp_/http://example.com/some/other/path">
# stylesheet
>>> parse('<link rel="stylesheet" href="http://example.com/some/other/path">')
<link rel="stylesheet" href="/web/20131226101010cs_/http://example.com/some/other/path">
# rel='import'
>>> parse('<link rel="import" href="http://example.com/componemts/app.html">')
<link rel="import" href="/web/20131226101010mp_/http://example.com/componemts/app.html">
>>> parse('<link rel="import" as="document" href="http://example.com/componemts/app.html">')
<link rel="import" as="document" href="/web/20131226101010mp_/http://example.com/componemts/app.html">
# doctype
>>> parse('<!doctype html PUBLIC "public">')

View File

@ -148,6 +148,15 @@
>>> x = SchemeOnlyUrlRewriter('http://example.com'); x.rebase_rewriter('https://example.com/') == x
True
# forcing absolute url rewrites
>>> UrlRewriter('http://example.com/vucht.php', 'http://localhost:8080/live/').rewrite('js/bundle.php?v=1', 'js_', True)
'/live/js_/http://example.com/js/bundle.php?v=1'
>>> UrlRewriter('http://example.com/vucht.php', 'http://localhost:8080/live/').rewrite('js/bundle.php?v=1', 'js_')
'js/bundle.php?v=1'
>>> SchemeOnlyUrlRewriter('https://example.com/abc').rewrite('//example.com/abc', force_abs=True)
'//example.com/abc'
"""

View File

@ -40,7 +40,7 @@ class UrlRewriter(object):
if self.rewrite_opts.get('punycode_links'):
self.wburl._do_percent_encode = False
def rewrite(self, url, mod=None):
def rewrite(self, url, mod=None, force_abs=False):
# if special protocol, no rewriting at all
if url.startswith(self.NO_REWRITE_URI_PREFIX):
return url
@ -63,7 +63,7 @@ class UrlRewriter(object):
if url.startswith(self.REL_SCHEME):
is_abs = True
scheme_rel = True
elif (not is_abs and
elif (not force_abs and not is_abs and
not url.startswith(self.REL_PATH) and
self.PARENT_PATH not in url):
return url
@ -165,7 +165,7 @@ class IdentityUrlRewriter(UrlRewriter):
"""
No rewriting performed, return original url
"""
def rewrite(self, url, mod=None):
def rewrite(self, url, mod=None, force_abs=False):
return url
def get_new_url(self, **kwargs):
@ -197,7 +197,7 @@ class SchemeOnlyUrlRewriter(IdentityUrlRewriter):
else:
self.opposite_scheme = 'https'
def rewrite(self, url, mod=None):
def rewrite(self, url, mod=None, force_abs=False):
if url.startswith(self.opposite_scheme + '://'):
url = self.url_scheme + url[len(self.opposite_scheme):]

View File

@ -153,6 +153,27 @@ var _WBWombat = function($wbwindow, wbinfo) {
return mod;
}
// Drops the '__wb_orig_src' marker attribute from a SCRIPT element, at most
// once per element. Elements with any other tagName, and elements already
// flagged via the '__$removedWBOSRC$__' expando property, are left untouched.
function removeWBOSRC(elem) {
    // Nothing to do for non-script elements or ones handled previously.
    if (elem.tagName !== 'SCRIPT' || elem.__$removedWBOSRC$__) {
        return;
    }

    var markerAttr = '__wb_orig_src';
    if (elem.hasAttribute(markerAttr)) {
        elem.removeAttribute(markerAttr);
    }

    // Flag the element even when the attribute was absent, so later calls
    // (and retrieveWBOSRC) short-circuit for this element.
    elem.__$removedWBOSRC$__ = true;
}
// Reads the '__wb_orig_src' marker attribute of a SCRIPT element.
// Returns undefined for elements with any other tagName and for elements
// already flagged via '__$removedWBOSRC$__'; otherwise returns whatever the
// attribute lookup yields.
function retrieveWBOSRC(elem) {
    if (elem.tagName !== 'SCRIPT' || elem.__$removedWBOSRC$__) {
        return;
    }

    // Use wb_getAttribute when it is set, falling back to the element's own
    // getAttribute otherwise.
    // NOTE(review): wb_getAttribute is presumably the saved, un-overridden
    // Element.prototype.getAttribute -- confirm against its definition.
    return wb_getAttribute
        ? wb_getAttribute.call(elem, '__wb_orig_src')
        : elem.getAttribute('__wb_orig_src');
}
//============================================
function is_host_url(str) {
// Good guess that it's a hostname
@ -703,7 +724,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
this.reload = function() {
return this._orig_loc.reload();
}
this.orig_getter = function(prop) {
return this._orig_loc[prop];
}
@ -713,7 +734,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
}
init_loc_override(this, this.orig_setter, this.orig_getter);
set_loc(this, orig_loc.href);
this.toString = function() {
@ -1089,6 +1110,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
}
//============================================
function init_setAttribute_override()
{
if (!$wbwindow.Element ||
@ -1110,6 +1132,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
value = rewrite_inline_style(value);
} else if (should_rewrite_attr(this.tagName, lowername)) {
removeWBOSRC(this);
if (!this._no_rewrite) {
var mod = rwModForElement(this, lowername);
value = rewrite_url(value, false, mod);
@ -1129,7 +1152,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
{
if (!$wbwindow.Element ||
!$wbwindow.Element.prototype ||
!$wbwindow.Element.prototype.setAttribute) {
!$wbwindow.Element.prototype.getAttribute) {
return;
}
@ -1140,6 +1163,10 @@ var _WBWombat = function($wbwindow, wbinfo) {
var result = orig_getAttribute.call(this, name);
if (should_rewrite_attr(this.tagName, name)) {
var maybeWBOSRC = retrieveWBOSRC(this);
if (maybeWBOSRC) {
return maybeWBOSRC;
}
result = extract_orig(result);
} else if (starts_with(name, "data-") && starts_with(result, VALID_PREFIXES)) {
result = extract_orig(result);
@ -1409,6 +1436,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
}
if (new_value != value) {
removeWBOSRC(elem);
wb_setAttribute.call(elem, name, new_value);
return true;
}
@ -1785,6 +1813,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
}
val = rewrite_url(orig, false, mod);
} else {
removeWBOSRC(this);
val = rewrite_url(orig, false, mod);
}
@ -1864,7 +1893,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
override_attr($wbwindow.HTMLMetaElement.prototype, "content", "mp_");
override_attr($wbwindow.HTMLFormElement.prototype, "action", "mp_");
override_anchor_elem();
var style_proto = $wbwindow.CSSStyleDeclaration.prototype;
@ -1913,7 +1942,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
for (var i = 0; i < URL_PROPS.length; i++) {
save_prop(URL_PROPS[i]);
}
}
var anchor_setter = function(prop, value) {
var func = anchor_orig["set_" + prop];
@ -2046,7 +2075,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
//}
text = rewrite_html(text);
}
return orig_insertAdjacentHTML.call(this, position, text);
}
@ -2286,7 +2315,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
}
var orig = $wbwindow.postMessage;
$wbwindow.__orig_postMessage = orig;
// use this_obj.__WB_source not window to fix google calendar embeds, pm_origin sets this.__WB_source