diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py index ccc8bf02..a77e02b1 100644 --- a/pywb/rewrite/html_rewriter.py +++ b/pywb/rewrite/html_rewriter.py @@ -221,7 +221,7 @@ class HTMLRewriterMixin(StreamingRewriter): url = urlunsplit((scheme, netloc, path, query, frag)) return url - def _rewrite_url(self, value, mod=None): + def _rewrite_url(self, value, mod=None, force_abs=False): if not value: return '' @@ -230,7 +230,7 @@ class HTMLRewriterMixin(StreamingRewriter): return '' unesc_value = self.try_unescape(value) - rewritten_value = self.url_rewriter.rewrite(unesc_value, mod) + rewritten_value = self.url_rewriter.rewrite(unesc_value, mod, force_abs) if unesc_value != value and rewritten_value != unesc_value: rewritten_value = rewritten_value.replace(unesc_value, value) @@ -379,6 +379,14 @@ class HTMLRewriterMixin(StreamingRewriter): rw_mod = self.defmod attr_value = self._rewrite_url(attr_value, rw_mod) + elif tag == 'script' and attr_name == 'src': + rw_mod = handler.get(attr_name) + ov = attr_value + attr_value = self._rewrite_url(attr_value, rw_mod) + if attr_value == ov and not ov.startswith(self.url_rewriter.NO_REWRITE_URI_PREFIX): + # URL not skipped, likely src='js/....', forcing abs to make sure, cause PHP MIME(JS) === HTML + attr_value = self._rewrite_url(attr_value, rw_mod, True) + self._write_attr('__wb_orig_src', ov, empty_attr=None) else: # rewrite url using tag handler rw_mod = handler.get(attr_name) @@ -408,6 +416,10 @@ class HTMLRewriterMixin(StreamingRewriter): preload = self.get_attr(tag_attrs, 'as') rw_mod = self.PRELOAD_TYPES.get(preload, rw_mod) + # for html imports with an optional as (google exclusive) + elif rel == 'import': + rw_mod = 'mp_' + elif rel == 'stylesheet': rw_mod = 'cs_' diff --git a/pywb/rewrite/test/test_html_rewriter.py b/pywb/rewrite/test/test_html_rewriter.py index 7c4ff26b..d85b7b78 100644 --- a/pywb/rewrite/test/test_html_rewriter.py +++ b/pywb/rewrite/test/test_html_rewriter.py @@ -211,6 
+211,10 @@ r""" >>> parse('') +# Script tag with a bare relative src (no scheme or leading slash) +>>> parse('') + + # Script tag + crossorigin + integrity >>> parse('') @@ -260,7 +264,7 @@ r"""
Test >>> parse('', head_insert = '') - + >>> parse('Test', head_insert = '') Test @@ -310,10 +314,37 @@ r""" >>> parse('') +>>> parse('') + + +>>> parse('') + + +>>> parse('') + + +>>> parse('') + + +>>> parse('') + + +>>> parse('') + + +>>> parse('') + + # stylesheet >>> parse('') +# rel='import' +>>> parse('') + + +>>> parse('') + # doctype >>> parse('') diff --git a/pywb/rewrite/test/test_url_rewriter.py b/pywb/rewrite/test/test_url_rewriter.py index da243cd7..2fb537e4 100644 --- a/pywb/rewrite/test/test_url_rewriter.py +++ b/pywb/rewrite/test/test_url_rewriter.py @@ -148,6 +148,15 @@ >>> x = SchemeOnlyUrlRewriter('http://example.com'); x.rebase_rewriter('https://example.com/') == x True +# forcing absolute url rewrites +>>> UrlRewriter('http://example.com/vucht.php', 'http://localhost:8080/live/').rewrite('js/bundle.php?v=1', 'js_', True) +'/live/js_/http://example.com/js/bundle.php?v=1' + +>>> UrlRewriter('http://example.com/vucht.php', 'http://localhost:8080/live/').rewrite('js/bundle.php?v=1', 'js_') +'js/bundle.php?v=1' + +>>> SchemeOnlyUrlRewriter('https://example.com/abc').rewrite('//example.com/abc', force_abs=True) +'//example.com/abc' """ diff --git a/pywb/rewrite/url_rewriter.py b/pywb/rewrite/url_rewriter.py index 6a42e3bf..d608ad1e 100644 --- a/pywb/rewrite/url_rewriter.py +++ b/pywb/rewrite/url_rewriter.py @@ -40,7 +40,7 @@ class UrlRewriter(object): if self.rewrite_opts.get('punycode_links'): self.wburl._do_percent_encode = False - def rewrite(self, url, mod=None): + def rewrite(self, url, mod=None, force_abs=False): # if special protocol, no rewriting at all if url.startswith(self.NO_REWRITE_URI_PREFIX): return url @@ -63,7 +63,7 @@ class UrlRewriter(object): if url.startswith(self.REL_SCHEME): is_abs = True scheme_rel = True - elif (not is_abs and + elif (not force_abs and not is_abs and not url.startswith(self.REL_PATH) and self.PARENT_PATH not in url): return url @@ -165,7 +165,7 @@ class IdentityUrlRewriter(UrlRewriter): """ No rewriting 
performed, return original url """ - def rewrite(self, url, mod=None): + def rewrite(self, url, mod=None, force_abs=False): return url def get_new_url(self, **kwargs): @@ -197,7 +197,7 @@ class SchemeOnlyUrlRewriter(IdentityUrlRewriter): else: self.opposite_scheme = 'https' - def rewrite(self, url, mod=None): + def rewrite(self, url, mod=None, force_abs=False): if url.startswith(self.opposite_scheme + '://'): url = self.url_scheme + url[len(self.opposite_scheme):] diff --git a/pywb/static/wombat.js b/pywb/static/wombat.js index 336fcf5b..86622931 100644 --- a/pywb/static/wombat.js +++ b/pywb/static/wombat.js @@ -153,6 +153,27 @@ var _WBWombat = function($wbwindow, wbinfo) { return mod; } + function removeWBOSRC(elem) { + if (elem.tagName === 'SCRIPT' && !elem.__$removedWBOSRC$__) { + if (elem.hasAttribute('__wb_orig_src')) { + elem.removeAttribute('__wb_orig_src'); + } + elem.__$removedWBOSRC$__ = true; + } + } + + function retrieveWBOSRC(elem) { + if (elem.tagName === 'SCRIPT' && !elem.__$removedWBOSRC$__) { + var maybeWBOSRC; + if (wb_getAttribute) { + maybeWBOSRC = wb_getAttribute.call(elem, '__wb_orig_src'); + } else { + maybeWBOSRC = elem.getAttribute('__wb_orig_src'); + } + return maybeWBOSRC; + } + } + //============================================ function is_host_url(str) { // Good guess that's its a hostname @@ -703,7 +724,7 @@ var _WBWombat = function($wbwindow, wbinfo) { this.reload = function() { return this._orig_loc.reload(); } - + this.orig_getter = function(prop) { return this._orig_loc[prop]; } @@ -713,7 +734,7 @@ var _WBWombat = function($wbwindow, wbinfo) { } init_loc_override(this, this.orig_setter, this.orig_getter); - + set_loc(this, orig_loc.href); this.toString = function() { @@ -1089,6 +1110,7 @@ var _WBWombat = function($wbwindow, wbinfo) { } //============================================ + function init_setAttribute_override() { if (!$wbwindow.Element || @@ -1110,6 +1132,7 @@ var _WBWombat = function($wbwindow, wbinfo) { value = 
rewrite_inline_style(value); } else if (should_rewrite_attr(this.tagName, lowername)) { + removeWBOSRC(this); if (!this._no_rewrite) { var mod = rwModForElement(this, lowername); value = rewrite_url(value, false, mod); @@ -1129,7 +1152,7 @@ var _WBWombat = function($wbwindow, wbinfo) { { if (!$wbwindow.Element || !$wbwindow.Element.prototype || - !$wbwindow.Element.prototype.setAttribute) { + !$wbwindow.Element.prototype.getAttribute) { return; } @@ -1140,6 +1163,10 @@ var _WBWombat = function($wbwindow, wbinfo) { var result = orig_getAttribute.call(this, name); if (should_rewrite_attr(this.tagName, name)) { + var maybeWBOSRC = retrieveWBOSRC(this); + if (maybeWBOSRC) { + return maybeWBOSRC; + } result = extract_orig(result); } else if (starts_with(name, "data-") && starts_with(result, VALID_PREFIXES)) { result = extract_orig(result); @@ -1409,6 +1436,7 @@ var _WBWombat = function($wbwindow, wbinfo) { } if (new_value != value) { + removeWBOSRC(elem); wb_setAttribute.call(elem, name, new_value); return true; } @@ -1785,6 +1813,7 @@ var _WBWombat = function($wbwindow, wbinfo) { } val = rewrite_url(orig, false, mod); } else { + removeWBOSRC(this); val = rewrite_url(orig, false, mod); } @@ -1864,7 +1893,7 @@ var _WBWombat = function($wbwindow, wbinfo) { override_attr($wbwindow.HTMLMetaElement.prototype, "content", "mp_"); override_attr($wbwindow.HTMLFormElement.prototype, "action", "mp_"); - + override_anchor_elem(); var style_proto = $wbwindow.CSSStyleDeclaration.prototype; @@ -1913,7 +1942,7 @@ var _WBWombat = function($wbwindow, wbinfo) { for (var i = 0; i < URL_PROPS.length; i++) { save_prop(URL_PROPS[i]); - } + } var anchor_setter = function(prop, value) { var func = anchor_orig["set_" + prop]; @@ -2046,7 +2075,7 @@ var _WBWombat = function($wbwindow, wbinfo) { //} text = rewrite_html(text); } - + return orig_insertAdjacentHTML.call(this, position, text); } @@ -2286,7 +2315,7 @@ var _WBWombat = function($wbwindow, wbinfo) { } var orig = $wbwindow.postMessage; - + 
$wbwindow.__orig_postMessage = orig; // use this_obj.__WB_source not window to fix google calendar embeds, pm_origin sets this.__WB_source