Standard JS and CSS rewriting now works, using a generic regex rewriter

which supports extensions (extra custom rules)!
2025-03-15 00:03:28 +01:00 · 2013-12-23 23:57:13 -08:00 · 2013-12-23 23:57:13 -08:00 · 6050ea1ffa
commit 6050ea1ffa
parent 3a896f7cd3
3 changed files with 159 additions and 181 deletions
--- a/pywb/regexmatch.py
+++ b/pywb/regexmatch.py
@ -1,156 +1,14 @@
 import re
 import sys
+import itertools
+
 from wburlrewriter import ArchivalUrlRewriter

-class RegexMatchReplacer:
-    def __init__(self, regexStr):
-        self.regex = re.compile(regexStr)
-
-    def replaceAll(self, string):
-        last = 0
-        result = ''
-        for m in self.regex.finditer(string):
-            start = m.start(1)
-            end = m.end(1)
-            result += string[last:start]
-            result += self.replace(string[start:end], m)
-            last = end
-
-        result += string[last:]
-        return result
-
-    def replace(self, string, m):
-        return string
-
-
-class HttpMatchReplacer(RegexMatchReplacer):
-    HTTP_REGEX = "(https?:\\\\?/\\\\?/[A-Za-z0-9:_@.-]+)"
-
-    def __init__(self, rewriter):
-        RegexMatchReplacer.__init__(self, HttpMatchReplacer.HTTP_REGEX)
-        self.rewriter = rewriter
-
-    def replace(self, string, m):
-        return self.rewriter.rewrite(string)
-
-class CustomMatchReplacer(RegexMatchReplacer):
-    def __init__(self, matchRegex, replaceStr):
-        RegexMatchReplacer.__init__(self, matchRegex)
-        self.replaceStr = replaceStr
-
-    def replace(self, string, m):
-        return self.replaceStr
-
-class Replacers:
+class RegexRewriter:
    """
-    >>> replacer.replaceAll('location = "http://example.com/abc.html"')
-    'WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"'
-
-    >>> replacer.replaceAll('cool_Location = "http://example.com/abc.html"')
-    'cool_Location = "/web/20131010im_/http://example.com/abc.html"'
-
-    >>> replacer.replaceAll('window.location = "http://example.com/abc.html"')
-    'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"'
-    """
-
-    def __init__(self, replacers):
-        self.replacers = replacers
-
-    def replaceAll(self, string):
-        for x in self.replacers:
-            string = x.replaceAll(string)
-
-        return string
-
-replacer = Replacers([HttpMatchReplacer(ArchivalUrlRewriter('/20131010im_/http://abc.com/XYZ/', '/web/')), CustomMatchReplacer('[^\w]?(location|domain)', 'WB_wombat_location')])
-
-# =================================
-arw = ArchivalUrlRewriter('/20131010im_/http://abc.com/XYZ/', '/web/')
-
-
-
-class MultiRegexReplacer:
-    """
-    >>> MultiRegexReplacer().replaceAll('location = "http://example.com/abc.html"', arw)
-    'WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"'
-
-    >>> MultiRegexReplacer().replaceAll('cool_Location = "http://example.com/abc.html"', arw)
-    'cool_Location = "/web/20131010im_/http://example.com/abc.html"'
-
-    >>> MultiRegexReplacer().replaceAll('window.location = "http://example.com/abc.html" document.domain = "anotherdomain.com"', arw)
-    'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"'
-
-    """
-
-    DEFAULT_RULES = [
-     ('https?:\\\\?/\\\\?/[A-Za-z0-9:_@.-]+', ArchivalUrlRewriter.rewrite),
-     ('location', 'WB_wombat_location'),
-     ('domain', 'WB_wombat_domain'),
-     ('some_func\(\)', '/* \\1 */')
-     ]
-
-    def __init__(self, rules = None):
-        if not rules:
-            rules = MultiRegexReplacer.DEFAULT_RULES
-
-        # Build regexstr, concatenating regex list
-        regexStr = '|'.join(['(' + rx + ')' for rx, op in rules])
-
-        # ensure it's not middle of a word, wrap in non-capture group
-        regexStr = '(?<!\w)(?:' + regexStr + ')'
-
-        self.regex = re.compile(regexStr)
-        self.rules = rules
-
-    def replaceAll(self, string, rewriter):
-        last = 0
-        result = ''
-
-        for m in self.regex.finditer(string):
-
-            groups = m.groups()
-
-            numGroups = len(groups)
-
-            for g, i in zip(groups, range(numGroups)):
-                if g:
-                    break
-
-            # Add 1 as group 0 is always entire match
-            start = m.start(i + 1)
-            end = m.end(i + 1)
-
-            result += string[last:start]
-
-            # i-th rule, 1st index of tuple
-            op = self.rules[i][1]
-
-            if hasattr(op, '__call__'):
-                result += op(rewriter, string[start:end])
-            else:
-                result += str(op)
-
-            last = end
-
-        result += string[last:]
-        return result
-
-
-
-class RxRep:
-    """
-    >>> test_repl('location = "http://example.com/abc.html"')
-    'WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"'
-
-    >>> test_repl('cool_Location = "http://example.com/abc.html"')
-    'cool_Location = "/web/20131010im_/http://example.com/abc.html"'
-
-    >>> test_repl('window.location = "http://example.com/abc.html" document.domain = "anotherdomain.com"')
-    'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"'
-
-    >>> test_repl('window.location = "http://example.com/abc.html"; some_func(); ')
-    'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"; /*some_func()*/; '
-
+    # Test https->http converter (other tests below in subclasses)
+    >>> RegexRewriter([(RegexRewriter.HTTPX_MATCH_REGEX, RegexRewriter.removeHttps, 0)]).replaceAll('a = https://example.com; b = http://example.com; c = https://some-url/path/https://embedded.example.com')
+    'a = http://example.com; b = http://example.com; c = http://some-url/path/http://embedded.example.com'
    """

    @staticmethod
@ -165,7 +23,11 @@ class RxRep:
    def addPrefix(prefix):
        return lambda string: prefix + string

-    HTTP_MATCH_REGEX = 'https?:\\\\?/\\\\?/[A-Za-z0-9:_@.-]+'
+    @staticmethod
+    def archivalRewrite(rewriter):
+        return lambda x: rewriter.rewrite(x)
+
+    HTTPX_MATCH_REGEX = 'https?:\\\\?/\\\\?/[A-Za-z0-9:_@.-]+'

    DEFAULT_OP = addPrefix

@ -174,55 +36,145 @@ class RxRep:
        #rules = self.createRules(httpPrefix)

        # Build regexstr, concatenating regex list
-        regexStr = '|'.join(['(' + rx + ')' for rx, op in rules])
+        regexStr = '|'.join(['(' + rx + ')' for rx, op, count in rules])

        # ensure it's not middle of a word, wrap in non-capture group
        regexStr = '(?<!\w)(?:' + regexStr + ')'

-        self.regex = re.compile(regexStr)
+        self.regex = re.compile(regexStr, re.M)
        self.rules = rules

    def replaceAll(self, string):
        return self.regex.sub(lambda x: self.replace(x), string)

    def replace(self, m):
-        for group, (_, op) in zip(m.groups(), self.rules):
-            if group:
-                # Custom func
-                if not hasattr(op, '__call__'):
-                    op = RxRep.DEFAULT_OP(op)
+        i = 0
+        for _, op, count in self.rules:
+            i += 1

-                return op(group)
+            fullM = i
+            while count > 0:
+                i += 1
+                count -= 1

-        raise re.error('No Match Found for replacement')
+            if not m.group(i):
+                continue
+
+            # Custom func
+            if not hasattr(op, '__call__'):
+                op = RegexRewriter.DEFAULT_OP(op)
+
+            result = op(m.group(i))
+
+            # if extracting partial match
+            if i != fullM:
+                result = m.string[m.start(fullM):m.start(i)] + result + m.string[m.end(i):m.end(fullM)]
+
+            return result


-class JSRewriter(RxRep):
+
+class JSRewriter(RegexRewriter):
+    """
+    >>> test_js('location = "http://example.com/abc.html"')
+    'WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"'
+
+    >>> test_js('cool_Location = "http://example.com/abc.html"')
+    'cool_Location = "/web/20131010im_/http://example.com/abc.html"'
+
+    >>> test_js('window.location = "http://example.com/abc.html" document.domain = "anotherdomain.com"')
+    'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"'
+
+    # custom rules added
+    >>> test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.commentOut, 0)])
+    'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"; /*some_func(); */'
+
+    """
+
    def __init__(self, httpPrefix, extra = []):
        rules = self._createRules(httpPrefix)
        rules.extend(extra)
- 
-        RxRep.__init__(self, rules)
+
+        RegexRewriter.__init__(self, rules)


    def _createRules(self, httpPrefix):
        return [
-             (RxRep.HTTP_MATCH_REGEX, httpPrefix),
-             ('location', 'WB_wombat_'),
-             ('domain', 'WB_wombat_'),
+             (RegexRewriter.HTTPX_MATCH_REGEX, httpPrefix, 0),
+             ('location|domain', 'WB_wombat_', 0),
        ]


+class CSSRewriter(RegexRewriter):
+    r"""
+    >>> test_css("background: url('/some/path.html')")
+    "background: url('/web/20131010im_/http://example.com/some/path.html')"
+
+    >>> test_css("background: url('../path.html')")
+    "background: url('/web/20131010im_/http://example.com/path.html')"
+
+    >>> test_css("background: url(\"http://domain.com/path.html\")")
+    'background: url("/web/20131010im_/http://domain.com/path.html")'
+
+    >>> test_css("background: url(file.jpeg)")
+    'background: url(/web/20131010im_/http://example.com/file.jpeg)'
+
+    >>> test_css("background: url('')")
+    "background: url('')"
+
+    >>> test_css("background: url (\"weirdpath\')")
+    'background: url ("/web/20131010im_/http://example.com/weirdpath\')'
+
+    >>> test_css("@import   url ('path.css')")
+    "@import   url ('/web/20131010im_/http://example.com/path.css')"
+
+    >>> test_css("@import url('path.css')")
+    "@import url('/web/20131010im_/http://example.com/path.css')"
+
+    >>> test_css("@import ( 'path.css')")
+    "@import ( '/web/20131010im_/http://example.com/path.css')"
+
+    >>> test_css("@import  \"path.css\"")
+    '@import  "/web/20131010im_/http://example.com/path.css"'
+
+    >>> test_css("@import ('../path.css\"")
+    '@import (\'/web/20131010im_/http://example.com/path.css"'
+
+    >>> test_css("@import ('../url.css\"")
+    '@import (\'/web/20131010im_/http://example.com/url.css"'
+
+    >>> test_css("@import (\"url.css\")")
+    '@import ("/web/20131010im_/http://example.com/url.css")'
+
+    """
+
+    def __init__(self, rewriter):
+        rules = self._createRules(rewriter)
+
+        RegexRewriter.__init__(self, rules)
+
+
+    def _createRules(self, rewriter):
+        return [
+             ("url\\s*\\(\\s*[\\\\\"']*([^'\"]+)[\\\\\"']*\\s*\\)", RegexRewriter.archivalRewrite(rewriter), 1),
+             ("@import\\s+(?!url)\\(?\\s*['\"]?(?!url[\\s\\(])([\w.:/\\\\-]+)", RegexRewriter.archivalRewrite(rewriter), 1),
+        ]
+

 if __name__ == "__main__":
    import doctest

-    extra = [('some_func\(\)', RxRep.commentOut)]
+    rwPrefix = '/web/20131010im_/'
+
+    arcrw = ArchivalUrlRewriter('/20131010im_/http://example.com/', '/web/')
+
+    def test_js(string, extra = []):
+        return JSRewriter(rwPrefix, extra).replaceAll(string)
+
+    def test_css(string):
+        return CSSRewriter(arcrw).replaceAll(string)

-    rxrep = JSRewriter('/web/20131010im_/', extra)

-    def test_repl(string):
-        return rxrep.replaceAll(string)

    doctest.testmod()

--- a/pywb/wbhtml.py
+++ b/pywb/wbhtml.py
@ -3,12 +3,13 @@ import re

 from HTMLParser import HTMLParser
 from wburlrewriter import ArchivalUrlRewriter
+from regexmatch import JSRewriter, CSSRewriter

 #=================================================================
 # WBHtml --html parser for custom rewriting, also handlers for script and css
 #=================================================================
 class WBHtml(HTMLParser):
-    """
+    r"""
    >>> WBHtml(rewriter).feed('<HTML><A Href="page.html">Text</a></hTmL>')
    <HTML><a href="/web/20131226101010/http://example.com/some/path/page.html">Text</a></html>

@ -20,6 +21,18 @@ class WBHtml(HTMLParser):

    >>> WBHtml(rewriter).feed('<META http-equiv="refresh" content="10; URL=/abc/def.html">')
    <meta http-equiv="refresh" content="10; URL=/web/20131226101010/http://example.com/abc/def.html">
+
+    >>> WBHtml(rewriter).feed('<script>window.location = "http://example.com/a/b/c.html"</script>')
+    <script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</script>
+
+    >>> WBHtml(rewriter).feed('<script>/*<![CDATA[*/window.location = "http://example.com/a/b/c.html;/*]]>*/"</script>')
+    <script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html;/*]]>*/"</script>
+
+    >>> WBHtml(rewriter).feed('<div style="background: url(\'abc.html\')" onclick="location = \'redirect.html\'"></div>')
+    <div style="background: url('/web/20131226101010/http://example.com/some/path/abc.html')" onclick="WB_wombat_location = 'redirect.html'"></div>
+
+    >>> WBHtml(rewriter).feed('<style>@import "styles.css" .a { font-face: url(\'myfont.ttf\') }</style>')
+    <style>@import "/web/20131226101010/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010/http://example.com/some/path/myfont.ttf') }</style>
    """

    REWRITE_TAGS = {
@ -50,7 +63,7 @@ class WBHtml(HTMLParser):
                    'data-uri' : ''},
    }

-    STATE_TAGS = ['head', 'body', 'script', 'style']
+    STATE_TAGS = ['script', 'style']


    def __init__(self, rewriter, outstream = None):
@ -60,6 +73,9 @@ class WBHtml(HTMLParser):
        self._wbParseContext = None
        self.out = outstream if outstream else sys.stdout

+        self.jsRewriter = JSRewriter(rewriter.getAbsUrl())
+        self.cssRewriter = CSSRewriter(rewriter)
+

    # ===========================
    META_REFRESH_REGEX = re.compile('^[\\d.]+\\s*;\\s*url\\s*=\\s*(.+?)\\s*$', re.IGNORECASE | re.MULTILINE)
@ -82,10 +98,10 @@ class WBHtml(HTMLParser):


    def _rewriteCSS(self, cssContent):
-        return cssContent
+        return self.cssRewriter.replaceAll(cssContent)

    def _rewriteScript(self, scriptContent):
-        return scriptContent
+        return self.jsRewriter.replaceAll(scriptContent)

    def hasAttr(self, tagAttrs, attr):
        name, value = attr
@ -95,13 +111,6 @@ class WBHtml(HTMLParser):
        return False

    def rewriteTagAttrs(self, tag, tagAttrs, isStartEnd):
-        handler = WBHtml.REWRITE_TAGS.get(tag)
-        if not handler:
-            handler = WBHtml.REWRITE_TAGS.get('')
-
-        if not handler:
-            return False
-
        # special case: base tag
        if (tag == 'base'):
            newBase = tagAttrs.get('href')
@ -109,9 +118,17 @@ class WBHtml(HTMLParser):
                self.rewriter.setBaseUrl(newBase[1])

        # special case: script or style parse context
-        elif ((tag == 'script') or (tag == 'style')) and (self._wbParseContext == None):
+        elif (tag in WBHtml.STATE_TAGS) and (self._wbParseContext == None):
            self._wbParseContext = tag

+        # attr rewriting
+        handler = WBHtml.REWRITE_TAGS.get(tag)
+        if not handler:
+            handler = WBHtml.REWRITE_TAGS.get('')
+
+        if not handler:
+            return False
+
        self.out.write('<' + tag)

        for attr in tagAttrs:
--- a/pywb/wburlrewriter.py
+++ b/pywb/wburlrewriter.py
@ -31,7 +31,13 @@ class ArchivalUrlRewriter:

    >>> test_rewrite('../../other.html', '/2020/http://example.com/index.html', '')
    '/2020/http://example.com/other.html'
-      """
+
+    >>> test_rewrite('', '/20131010010203/http://example.com/file.html', '/web/')
+    '/web/20131010010203/http://example.com/file.html'
+
+    >>> ArchivalUrlRewriter('/19960708im_/http://domain.example.com/path.txt', '/abc/').getAbsUrl()
+    '/abc/19960708im_/'
+    """

    NO_REWRITE_URI_PREFIX = ['javascript:', 'data:', 'mailto:', 'about:']

@ -72,6 +78,9 @@ class ArchivalUrlRewriter:

        return finalUrl

+    def getAbsUrl(self, url = ''):
+        return self.prefix + ArchivalUrl.to_str(self.wburl.type, self.wburl.mod, self.wburl.timestamp, url)
+

    def setBaseUrl(self, newUrl):
        self.wburl.url = newUrl