mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
rewrite: fix html rewriting; if forcing end </script>, </style>, don't actually output it, to preserve the original.
wombat: copy over all Location settings.
wburl: convert :/ -> :// if 2nd slash missing; only check for <scheme>:/ and ignore subsequent slashes.
This commit is contained in:
parent
541c076b77
commit
584d826f05
@ -196,7 +196,10 @@ class HTMLRewriter(HTMLParser):
|
||||
# HTMLParser overrides below
|
||||
def close(self):
|
||||
if (self._wb_parse_context):
|
||||
result = self.rewrite('</' + self._wb_parse_context + '>')
|
||||
end_tag = '</' + self._wb_parse_context + '>'
|
||||
result = self.rewrite(end_tag)
|
||||
if result.endswith(end_tag):
|
||||
result = result[:-len(end_tag)]
|
||||
self._wb_parse_context = None
|
||||
else:
|
||||
result = ''
|
||||
|
@ -53,9 +53,9 @@ r"""
|
||||
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</script>')
|
||||
<script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</script>
|
||||
|
||||
# Unterminated script tag auto-terminate
|
||||
# Unterminated script tag, handle but don't auto-terminate
|
||||
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</sc>')
|
||||
<script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</sc></script>
|
||||
<script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</sc>
|
||||
|
||||
>>> parse('<script>/*<![CDATA[*/window.location = "http://example.com/a/b/c.html;/*]]>*/"</script>')
|
||||
<script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html;/*]]>*/"</script>
|
||||
@ -66,9 +66,9 @@ r"""
|
||||
>>> parse('<style>@import "styles.css" .a { font-face: url(\'myfont.ttf\') }</style>')
|
||||
<style>@import "/web/20131226101010/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010/http://example.com/some/path/myfont.ttf') }</style>
|
||||
|
||||
# Unterminated style tag auto-terminate
|
||||
# Unterminated style tag, handle but don't auto-terminate
|
||||
>>> parse('<style>@import url(styles.css)')
|
||||
<style>@import url(/web/20131226101010/http://example.com/some/path/styles.css)</style>
|
||||
<style>@import url(/web/20131226101010/http://example.com/some/path/styles.css)
|
||||
|
||||
# Head Insertion
|
||||
>>> parse('<html><head><script src="other.js"></script></head><body>Test</body></html>', head_insert = '<script src="cool.js"></script>')
|
||||
|
@ -57,7 +57,7 @@ class UrlRewriter:
|
||||
|
||||
NO_REWRITE_URI_PREFIX = ['#', 'javascript:', 'data:', 'mailto:', 'about:']
|
||||
|
||||
PROTOCOLS = ['http://', 'https://', '//', 'ftp://', 'mms://', 'rtsp://', 'wais://']
|
||||
PROTOCOLS = ['http:/', 'https:/', '//', 'ftp:/', 'mms:/', 'rtsp:/', 'wais:/']
|
||||
|
||||
def __init__(self, wburl, prefix):
|
||||
self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl)
|
||||
|
@ -71,6 +71,9 @@ class WbUrl(BaseWbUrl):
|
||||
>>> repr(WbUrl('20130102im_/https://example.com'))
|
||||
"('replay', '20130102', 'im_', 'https://example.com', '20130102im_/https://example.com')"
|
||||
|
||||
>>> repr(WbUrl('20130102im_/https:/example.com'))
|
||||
"('replay', '20130102', 'im_', 'https://example.com', '20130102im_/https://example.com')"
|
||||
|
||||
# Protocol agnostic convert to http
|
||||
>>> repr(WbUrl('20130102im_///example.com'))
|
||||
"('replay', '20130102', 'im_', 'http://example.com', '20130102im_/http://example.com')"
|
||||
@ -81,6 +84,9 @@ class WbUrl(BaseWbUrl):
|
||||
>>> repr(WbUrl('https://example.com/xyz'))
|
||||
"('latest_replay', '', '', 'https://example.com/xyz', 'https://example.com/xyz')"
|
||||
|
||||
>>> repr(WbUrl('https:/example.com/xyz'))
|
||||
"('latest_replay', '', '', 'https://example.com/xyz', 'https://example.com/xyz')"
|
||||
|
||||
>>> repr(WbUrl('https://example.com/xyz?a=%2f&b=%2E'))
|
||||
"('latest_replay', '', '', 'https://example.com/xyz?a=%2f&b=%2E', 'https://example.com/xyz?a=%2f&b=%2E')"
|
||||
|
||||
@ -125,6 +131,11 @@ class WbUrl(BaseWbUrl):
|
||||
>>> x = WbUrl('/http://example.com:abc/')
|
||||
Traceback (most recent call last):
|
||||
Exception: Bad Request Url: http://example.com:abc/
|
||||
|
||||
# considered blank
|
||||
>>> x = WbUrl('https:/')
|
||||
>>> x = WbUrl('https:///')
|
||||
>>> x = WbUrl('http://')
|
||||
"""
|
||||
|
||||
# Regexs
|
||||
@ -148,11 +159,14 @@ class WbUrl(BaseWbUrl):
|
||||
raise Exception('Invalid WbUrl: ', url)
|
||||
|
||||
# protocol agnostic url -> http://
|
||||
#if self.url.startswith('//'):
|
||||
# self.url = self.DEFAULT_SCHEME + self.url[2:]
|
||||
# no protocol -> http://
|
||||
if not '://' in self.url:
|
||||
inx = self.url.find(':/')
|
||||
if inx < 0:
|
||||
self.url = self.DEFAULT_SCHEME + self.url
|
||||
else:
|
||||
inx += 2
|
||||
if inx < len(self.url) and self.url[inx] != '/':
|
||||
self.url = self.url[:inx] + '/' + self.url[inx:]
|
||||
|
||||
# BUG?: adding upper() because rfc3987 lib rejects lower case %-encoding
|
||||
# %2F is fine, but %2f -- standard supports either
|
||||
|
@ -138,7 +138,26 @@ function WB_CopyLocationObj(loc)
|
||||
newLoc.replace = function(url) { this._origLoc.replace(WB_RewriteUrl(url)); }
|
||||
newLoc.assign = function(url) { this._origLoc.assign(WB_RewriteUrl(url)); }
|
||||
newLoc.reload = loc.reload;
|
||||
newLoc.href = WB_ExtractOrig(newLoc._origHref);
|
||||
|
||||
// Adapted from:
|
||||
// https://gist.github.com/jlong/2428561
|
||||
var parser = document.createElement('a');
|
||||
parser.href = WB_ExtractOrig(newLoc._origHref);
|
||||
|
||||
newLoc.hash = parser.hash;
|
||||
newLoc.host = parser.host;
|
||||
newLoc.hostname = parser.hostname;
|
||||
newLoc.href = parser.href;
|
||||
|
||||
if (newLoc.origin) {
|
||||
newLoc.origin = parser.origin;
|
||||
}
|
||||
|
||||
newLoc.pathname = parser.pathname;
|
||||
newLoc.port = parser.port
|
||||
newLoc.protocol = parser.protocol;
|
||||
newLoc.search = parser.search;
|
||||
|
||||
newLoc.toString = function() { return this.href; }
|
||||
|
||||
return newLoc;
|
||||
@ -148,7 +167,8 @@ function WB_wombat_updateLoc(reqHref, origHref, location)
|
||||
{
|
||||
if (reqHref && (WB_ExtractOrig(origHref) != WB_ExtractOrig(reqHref))) {
|
||||
var finalHref = WB_RewriteUrl(reqHref);
|
||||
|
||||
|
||||
console.log("Rewrite: " + reqHref + " => " + origHref);
|
||||
location.href = finalHref;
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user