1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

wbhtml: add script and style doctests

Override close() to handle unclosed <script> and <style> tags by forcing an end tag;
otherwise the parser does not process the remainder.
This commit is contained in:
Ilya Kreymer 2013-12-24 22:51:33 -08:00
parent 6050ea1ffa
commit 787dfc136e
4 changed files with 38 additions and 13 deletions

View File

@ -19,7 +19,8 @@ class RemoteCDXServer:
'statuscode': '200',
'timestamp': '20020120142510',
'urlkey': 'com,example)/'}
"""
"""
def __init__(self, serverUrl):
self.serverUrl = serverUrl

View File

@ -148,6 +148,9 @@ class CSSRewriter(RegexRewriter):
"""
CSS_URL_REGEX = "url\\s*\\(\\s*[\\\\\"']*([^'\"]+)[\\\\\"']*\\s*\\)"
CSS_IMPORT_NO_URL_REGEX = "@import\\s+(?!url)\\(?\\s*['\"]?(?!url[\\s\\(])([\w.:/\\\\-]+)"
def __init__(self, rewriter):
rules = self._createRules(rewriter)
@ -156,8 +159,8 @@ class CSSRewriter(RegexRewriter):
def _createRules(self, rewriter):
return [
("url\\s*\\(\\s*[\\\\\"']*([^'\"]+)[\\\\\"']*\\s*\\)", RegexRewriter.archivalRewrite(rewriter), 1),
("@import\\s+(?!url)\\(?\\s*['\"]?(?!url[\\s\\(])([\w.:/\\\\-]+)", RegexRewriter.archivalRewrite(rewriter), 1),
(CSSRewriter.CSS_URL_REGEX, RegexRewriter.archivalRewrite(rewriter), 1),
(CSSRewriter.CSS_IMPORT_NO_URL_REGEX, RegexRewriter.archivalRewrite(rewriter), 1),
]

View File

@ -10,30 +10,38 @@ from regexmatch import JSRewriter, CSSRewriter
#=================================================================
class WBHtml(HTMLParser):
r"""
>>> WBHtml(rewriter).feed('<HTML><A Href="page.html">Text</a></hTmL>')
>>> parse('<HTML><A Href="page.html">Text</a></hTmL>')
<HTML><a href="/web/20131226101010/http://example.com/some/path/page.html">Text</a></html>
>>> WBHtml(rewriter).feed('<body x="y"><img src="../img.gif"/><br/></body>')
>>> parse('<body x="y"><img src="../img.gif"/><br/></body>')
<body x="y"><img src="/web/20131226101010im_/http://example.com/some/img.gif"/><br/></body>
>>> WBHtml(rewriter).feed('<body x="y"><img src="/img.gif"/><br/></body>')
>>> parse('<body x="y"><img src="/img.gif"/><br/></body>')
<body x="y"><img src="/web/20131226101010im_/http://example.com/img.gif"/><br/></body>
>>> WBHtml(rewriter).feed('<META http-equiv="refresh" content="10; URL=/abc/def.html">')
>>> parse('<META http-equiv="refresh" content="10; URL=/abc/def.html">')
<meta http-equiv="refresh" content="10; URL=/web/20131226101010/http://example.com/abc/def.html">
>>> WBHtml(rewriter).feed('<script>window.location = "http://example.com/a/b/c.html"</script>')
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</script>')
<script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</script>
>>> WBHtml(rewriter).feed('<script>/*<![CDATA[*/window.location = "http://example.com/a/b/c.html;/*]]>*/"</script>')
# Unterminated script tag auto-terminate
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</sc>')
<script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</sc></script>
>>> parse('<script>/*<![CDATA[*/window.location = "http://example.com/a/b/c.html;/*]]>*/"</script>')
<script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html;/*]]>*/"</script>
>>> WBHtml(rewriter).feed('<div style="background: url(\'abc.html\')" onclick="location = \'redirect.html\'"></div>')
>>> parse('<div style="background: url(\'abc.html\')" onclick="location = \'redirect.html\'"></div>')
<div style="background: url('/web/20131226101010/http://example.com/some/path/abc.html')" onclick="WB_wombat_location = 'redirect.html'"></div>
>>> WBHtml(rewriter).feed('<style>@import "styles.css" .a { font-face: url(\'myfont.ttf\') }</style>')
>>> parse('<style>@import "styles.css" .a { font-face: url(\'myfont.ttf\') }</style>')
<style>@import "/web/20131226101010/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010/http://example.com/some/path/myfont.ttf') }</style>
"""
# Unterminated style tag auto-terminate
>>> parse('<style>@import url(styles.css)')
<style>@import url(/web/20131226101010/http://example.com/some/path/styles.css)</style>
"""
REWRITE_TAGS = {
'a': {'href': ''},
@ -77,6 +85,14 @@ class WBHtml(HTMLParser):
self.cssRewriter = CSSRewriter(rewriter)
# Finish parsing, first forcing an end tag for any parse context still open.
# When self._wbParseContext is truthy, a synthetic '</name>' end tag built
# from it is fed to the parser before closing; per the accompanying commit
# message this covers unclosed <script> and <style> tags, whose remainder
# the parser would otherwise not process.
# NOTE(review): indentation is lost in this view -- the context reset below
# presumably belongs inside the `if`; confirm against the real file.
def close(self):
if (self._wbParseContext):
self.feed('</' + self._wbParseContext + '>')
self._wbParseContext = None
# Delegate to the base-class close() once any open context has been ended.
HTMLParser.close(self)
# ===========================
META_REFRESH_REGEX = re.compile('^[\\d.]+\\s*;\\s*url\\s*=\\s*(.+?)\\s*$', re.IGNORECASE | re.MULTILINE)
@ -217,4 +233,9 @@ if __name__ == "__main__":
rewriter = ArchivalUrlRewriter('/20131226101010/http://example.com/some/path/index.html', '/web/')
def parse(data):
    """Doctest helper: push *data* through a fresh WBHtml parser, then close it.

    Calling close() after feed() lets the doctests exercise the handling of
    input that ends with a tag still open.
    """
    html_parser = WBHtml(rewriter)
    html_parser.feed(data)
    html_parser.close()
doctest.testmod()

View File

@ -41,7 +41,7 @@ class ArchivalUrlRewriter:
NO_REWRITE_URI_PREFIX = ['javascript:', 'data:', 'mailto:', 'about:']
PROTOCOLS = ['http://', 'https://', '//', 'mms://', 'rtsp://', 'wais://']
PROTOCOLS = ['http://', 'https://', '//', 'ftp://', 'mms://', 'rtsp://', 'wais://']
def __init__(self, wburl_str, prefix):
self.wburl = ArchivalUrl(wburl_str)