mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
wbhtml: add script and style doctests
Override close() to handle unterminated <script> and <style> tags by feeding a forced end tag before closing; otherwise the parser does not process the remainder of the content.
This commit is contained in:
parent
6050ea1ffa
commit
787dfc136e
@ -19,7 +19,8 @@ class RemoteCDXServer:
|
||||
'statuscode': '200',
|
||||
'timestamp': '20020120142510',
|
||||
'urlkey': 'com,example)/'}
|
||||
"""
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, serverUrl):
|
||||
self.serverUrl = serverUrl
|
||||
|
@ -148,6 +148,9 @@ class CSSRewriter(RegexRewriter):
|
||||
|
||||
"""
|
||||
|
||||
CSS_URL_REGEX = "url\\s*\\(\\s*[\\\\\"']*([^'\"]+)[\\\\\"']*\\s*\\)"
|
||||
CSS_IMPORT_NO_URL_REGEX = "@import\\s+(?!url)\\(?\\s*['\"]?(?!url[\\s\\(])([\w.:/\\\\-]+)"
|
||||
|
||||
def __init__(self, rewriter):
|
||||
rules = self._createRules(rewriter)
|
||||
|
||||
@ -156,8 +159,8 @@ class CSSRewriter(RegexRewriter):
|
||||
|
||||
def _createRules(self, rewriter):
|
||||
return [
|
||||
("url\\s*\\(\\s*[\\\\\"']*([^'\"]+)[\\\\\"']*\\s*\\)", RegexRewriter.archivalRewrite(rewriter), 1),
|
||||
("@import\\s+(?!url)\\(?\\s*['\"]?(?!url[\\s\\(])([\w.:/\\\\-]+)", RegexRewriter.archivalRewrite(rewriter), 1),
|
||||
(CSSRewriter.CSS_URL_REGEX, RegexRewriter.archivalRewrite(rewriter), 1),
|
||||
(CSSRewriter.CSS_IMPORT_NO_URL_REGEX, RegexRewriter.archivalRewrite(rewriter), 1),
|
||||
]
|
||||
|
||||
|
||||
|
@ -10,30 +10,38 @@ from regexmatch import JSRewriter, CSSRewriter
|
||||
#=================================================================
|
||||
class WBHtml(HTMLParser):
|
||||
r"""
|
||||
>>> WBHtml(rewriter).feed('<HTML><A Href="page.html">Text</a></hTmL>')
|
||||
>>> parse('<HTML><A Href="page.html">Text</a></hTmL>')
|
||||
<HTML><a href="/web/20131226101010/http://example.com/some/path/page.html">Text</a></html>
|
||||
|
||||
>>> WBHtml(rewriter).feed('<body x="y"><img src="../img.gif"/><br/></body>')
|
||||
>>> parse('<body x="y"><img src="../img.gif"/><br/></body>')
|
||||
<body x="y"><img src="/web/20131226101010im_/http://example.com/some/img.gif"/><br/></body>
|
||||
|
||||
>>> WBHtml(rewriter).feed('<body x="y"><img src="/img.gif"/><br/></body>')
|
||||
>>> parse('<body x="y"><img src="/img.gif"/><br/></body>')
|
||||
<body x="y"><img src="/web/20131226101010im_/http://example.com/img.gif"/><br/></body>
|
||||
|
||||
>>> WBHtml(rewriter).feed('<META http-equiv="refresh" content="10; URL=/abc/def.html">')
|
||||
>>> parse('<META http-equiv="refresh" content="10; URL=/abc/def.html">')
|
||||
<meta http-equiv="refresh" content="10; URL=/web/20131226101010/http://example.com/abc/def.html">
|
||||
|
||||
>>> WBHtml(rewriter).feed('<script>window.location = "http://example.com/a/b/c.html"</script>')
|
||||
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</script>')
|
||||
<script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</script>
|
||||
|
||||
>>> WBHtml(rewriter).feed('<script>/*<![CDATA[*/window.location = "http://example.com/a/b/c.html;/*]]>*/"</script>')
|
||||
# Unterminated script tag auto-terminate
|
||||
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</sc>')
|
||||
<script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</sc></script>
|
||||
|
||||
>>> parse('<script>/*<![CDATA[*/window.location = "http://example.com/a/b/c.html;/*]]>*/"</script>')
|
||||
<script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html;/*]]>*/"</script>
|
||||
|
||||
>>> WBHtml(rewriter).feed('<div style="background: url(\'abc.html\')" onclick="location = \'redirect.html\'"></div>')
|
||||
>>> parse('<div style="background: url(\'abc.html\')" onclick="location = \'redirect.html\'"></div>')
|
||||
<div style="background: url('/web/20131226101010/http://example.com/some/path/abc.html')" onclick="WB_wombat_location = 'redirect.html'"></div>
|
||||
|
||||
>>> WBHtml(rewriter).feed('<style>@import "styles.css" .a { font-face: url(\'myfont.ttf\') }</style>')
|
||||
>>> parse('<style>@import "styles.css" .a { font-face: url(\'myfont.ttf\') }</style>')
|
||||
<style>@import "/web/20131226101010/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010/http://example.com/some/path/myfont.ttf') }</style>
|
||||
"""
|
||||
|
||||
# Unterminated style tag auto-terminate
|
||||
>>> parse('<style>@import url(styles.css)')
|
||||
<style>@import url(/web/20131226101010/http://example.com/some/path/styles.css)</style>
|
||||
"""
|
||||
|
||||
REWRITE_TAGS = {
|
||||
'a': {'href': ''},
|
||||
@ -77,6 +85,14 @@ class WBHtml(HTMLParser):
|
||||
self.cssRewriter = CSSRewriter(rewriter)
|
||||
|
||||
|
||||
def close(self):
|
||||
if (self._wbParseContext):
|
||||
self.feed('</' + self._wbParseContext + '>')
|
||||
self._wbParseContext = None
|
||||
|
||||
HTMLParser.close(self)
|
||||
|
||||
|
||||
# ===========================
|
||||
META_REFRESH_REGEX = re.compile('^[\\d.]+\\s*;\\s*url\\s*=\\s*(.+?)\\s*$', re.IGNORECASE | re.MULTILINE)
|
||||
|
||||
@ -217,4 +233,9 @@ if __name__ == "__main__":
|
||||
|
||||
rewriter = ArchivalUrlRewriter('/20131226101010/http://example.com/some/path/index.html', '/web/')
|
||||
|
||||
def parse(data):
|
||||
parser = WBHtml(rewriter)
|
||||
parser.feed(data)
|
||||
parser.close()
|
||||
|
||||
doctest.testmod()
|
||||
|
@ -41,7 +41,7 @@ class ArchivalUrlRewriter:
|
||||
|
||||
NO_REWRITE_URI_PREFIX = ['javascript:', 'data:', 'mailto:', 'about:']
|
||||
|
||||
PROTOCOLS = ['http://', 'https://', '//', 'mms://', 'rtsp://', 'wais://']
|
||||
PROTOCOLS = ['http://', 'https://', '//', 'ftp://', 'mms://', 'rtsp://', 'wais://']
|
||||
|
||||
def __init__(self, wburl_str, prefix):
|
||||
self.wburl = ArchivalUrl(wburl_str)
|
||||
|
Loading…
x
Reference in New Issue
Block a user