wbhtml: add script and style doctests

Override close() to handle unterminated <script> and <style> tags by forcing an end tag; otherwise the parser does not process the remainder.
2025-03-15 00:03:28 +01:00 · 2013-12-24 22:51:33 -08:00 · 2013-12-24 22:51:33 -08:00 · 787dfc136e
commit 787dfc136e
parent 6050ea1ffa
4 changed files with 38 additions and 13 deletions
--- a/pywb/indexreader.py
+++ b/pywb/indexreader.py
@ -19,7 +19,8 @@ class RemoteCDXServer:
     'statuscode': '200',
     'timestamp': '20020120142510',
     'urlkey': 'com,example)/'}
-    """
+
+   """

    def __init__(self, serverUrl):
        self.serverUrl = serverUrl
--- a/pywb/regexmatch.py
+++ b/pywb/regexmatch.py
@ -148,6 +148,9 @@ class CSSRewriter(RegexRewriter):

    """

+    CSS_URL_REGEX = "url\\s*\\(\\s*[\\\\\"']*([^'\"]+)[\\\\\"']*\\s*\\)"
+    CSS_IMPORT_NO_URL_REGEX = "@import\\s+(?!url)\\(?\\s*['\"]?(?!url[\\s\\(])([\w.:/\\\\-]+)"
+
    def __init__(self, rewriter):
        rules = self._createRules(rewriter)

@ -156,8 +159,8 @@ class CSSRewriter(RegexRewriter):

    def _createRules(self, rewriter):
        return [
-             ("url\\s*\\(\\s*[\\\\\"']*([^'\"]+)[\\\\\"']*\\s*\\)", RegexRewriter.archivalRewrite(rewriter), 1),
-             ("@import\\s+(?!url)\\(?\\s*['\"]?(?!url[\\s\\(])([\w.:/\\\\-]+)", RegexRewriter.archivalRewrite(rewriter), 1),
+             (CSSRewriter.CSS_URL_REGEX, RegexRewriter.archivalRewrite(rewriter), 1),
+             (CSSRewriter.CSS_IMPORT_NO_URL_REGEX, RegexRewriter.archivalRewrite(rewriter), 1),
        ]


--- a/pywb/wbhtml.py
+++ b/pywb/wbhtml.py
@ -10,30 +10,38 @@ from regexmatch import JSRewriter, CSSRewriter
 #=================================================================
 class WBHtml(HTMLParser):
    r"""
-    >>> WBHtml(rewriter).feed('<HTML><A Href="page.html">Text</a></hTmL>')
+    >>> parse('<HTML><A Href="page.html">Text</a></hTmL>')
    <HTML><a href="/web/20131226101010/http://example.com/some/path/page.html">Text</a></html>

-    >>> WBHtml(rewriter).feed('<body x="y"><img src="../img.gif"/><br/></body>')
+    >>> parse('<body x="y"><img src="../img.gif"/><br/></body>')
    <body x="y"><img src="/web/20131226101010im_/http://example.com/some/img.gif"/><br/></body>

-    >>> WBHtml(rewriter).feed('<body x="y"><img src="/img.gif"/><br/></body>')
+    >>> parse('<body x="y"><img src="/img.gif"/><br/></body>')
    <body x="y"><img src="/web/20131226101010im_/http://example.com/img.gif"/><br/></body>

-    >>> WBHtml(rewriter).feed('<META http-equiv="refresh" content="10; URL=/abc/def.html">')
+    >>> parse('<META http-equiv="refresh" content="10; URL=/abc/def.html">')
    <meta http-equiv="refresh" content="10; URL=/web/20131226101010/http://example.com/abc/def.html">

-    >>> WBHtml(rewriter).feed('<script>window.location = "http://example.com/a/b/c.html"</script>')
+    >>> parse('<script>window.location = "http://example.com/a/b/c.html"</script>')
    <script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</script>

-    >>> WBHtml(rewriter).feed('<script>/*<![CDATA[*/window.location = "http://example.com/a/b/c.html;/*]]>*/"</script>')
+    # Unterminated script tag auto-terminate
+    >>> parse('<script>window.location = "http://example.com/a/b/c.html"</sc>')
+    <script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</sc></script>
+
+    >>> parse('<script>/*<![CDATA[*/window.location = "http://example.com/a/b/c.html;/*]]>*/"</script>')
    <script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html;/*]]>*/"</script>

-    >>> WBHtml(rewriter).feed('<div style="background: url(\'abc.html\')" onclick="location = \'redirect.html\'"></div>')
+    >>> parse('<div style="background: url(\'abc.html\')" onclick="location = \'redirect.html\'"></div>')
    <div style="background: url('/web/20131226101010/http://example.com/some/path/abc.html')" onclick="WB_wombat_location = 'redirect.html'"></div>

-    >>> WBHtml(rewriter).feed('<style>@import "styles.css" .a { font-face: url(\'myfont.ttf\') }</style>')
+    >>> parse('<style>@import "styles.css" .a { font-face: url(\'myfont.ttf\') }</style>')
    <style>@import "/web/20131226101010/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010/http://example.com/some/path/myfont.ttf') }</style>
-    """
+
+    # Unterminated style tag auto-terminate
+    >>> parse('<style>@import url(styles.css)')
+    <style>@import url(/web/20131226101010/http://example.com/some/path/styles.css)</style>
+     """

    REWRITE_TAGS = {
        'a':       {'href': ''},
@ -77,6 +85,14 @@ class WBHtml(HTMLParser):
        self.cssRewriter = CSSRewriter(rewriter)


+    def close(self):
+        if (self._wbParseContext):
+            self.feed('</' + self._wbParseContext + '>')
+            self._wbParseContext = None
+
+        HTMLParser.close(self)
+
+
    # ===========================
    META_REFRESH_REGEX = re.compile('^[\\d.]+\\s*;\\s*url\\s*=\\s*(.+?)\\s*$', re.IGNORECASE | re.MULTILINE)

@ -217,4 +233,9 @@ if __name__ == "__main__":

    rewriter = ArchivalUrlRewriter('/20131226101010/http://example.com/some/path/index.html', '/web/')

+    def parse(data):
+        parser = WBHtml(rewriter)
+        parser.feed(data)
+        parser.close()
+
    doctest.testmod()
--- a/pywb/wburlrewriter.py
+++ b/pywb/wburlrewriter.py
@ -41,7 +41,7 @@ class ArchivalUrlRewriter:

    NO_REWRITE_URI_PREFIX = ['javascript:', 'data:', 'mailto:', 'about:']

-    PROTOCOLS = ['http://', 'https://', '//', 'mms://', 'rtsp://', 'wais://']
+    PROTOCOLS = ['http://', 'https://', '//', 'ftp://', 'mms://', 'rtsp://', 'wais://']

    def __init__(self, wburl_str, prefix):
        self.wburl = ArchivalUrl(wburl_str)