Move no-rewrite prefixes down to ArchivalUrlRewriter (they were in the HTML parser)

Add new general regex match work (several attempts, though the last one is the simplest/best!)
2025-03-24 06:59:52 +01:00 · 2013-12-23 15:52:33 -08:00 · 2013-12-23 15:52:33 -08:00 · 3a896f7cd3
commit 3a896f7cd3
parent 37e57f7013
3 changed files with 260 additions and 20 deletions
--- a/pywb/regexmatch.py
+++ b/pywb/regexmatch.py
@ -0,0 +1,230 @@
 import re
 import sys
 from wburlrewriter import ArchivalUrlRewriter
 class RegexMatchReplacer:
    """
    Base class that rewrites the text captured by group 1 of every regex match.

    Subclasses override replace() to decide what each captured span becomes;
    the base implementation returns the span unchanged.
    """
    def __init__(self, regexStr):
        # regexStr: pattern source. It must define capture group 1, because
        # only that group's span is ever replaced.
        self.regex = re.compile(regexStr)
    def replaceAll(self, string):
        """
        Return a copy of string with group 1 of each match passed through replace().

        Text outside group 1 (including the rest of each match) is copied verbatim.
        """
        last = 0
        pieces = []
        for m in self.regex.finditer(string):
            start, end = m.span(1)
            # span(1) is (-1, -1) when group 1 took no part in the match (for
            # example an optional group). Slicing with -1 would splice the
            # replacement in at the wrong offset, so leave such matches alone.
            if start < 0:
                continue
            pieces.append(string[last:start])
            pieces.append(self.replace(string[start:end], m))
            last = end
        pieces.append(string[last:])
        return ''.join(pieces)
    def replace(self, string, m):
        """
        Return the replacement for one captured span.

        string is the text of group 1 and m is the match object it came from.
        The default is the identity; subclasses supply real rewriting.
        """
        return string
 class HttpMatchReplacer(RegexMatchReplacer):
    """
    Replacer that passes each matched http(s) scheme-and-host prefix to a rewriter.
    """
    # 'http' or 'https', a colon, two slashes (each optionally preceded by a
    # backslash), then one or more host-like characters; captured as group 1.
    HTTP_REGEX = "(https?:\\\\?/\\\\?/[A-Za-z0-9:_@.-]+)"
    def __init__(self, rewriter):
        # rewriter: any object exposing rewrite(text); it is used by replace()
        self.rewriter = rewriter
        RegexMatchReplacer.__init__(self, HttpMatchReplacer.HTTP_REGEX)
    def replace(self, string, m):
        # The match object is not needed: the captured text alone is rewritten
        rewritten = self.rewriter.rewrite(string)
        return rewritten
 class CustomMatchReplacer(RegexMatchReplacer):
    """
    Replacer that swaps group 1 of every match for one fixed string.
    """
    def __init__(self, matchRegex, replaceStr):
        # Keep the constant replacement first, then let the base class compile the pattern
        self.replaceStr = replaceStr
        RegexMatchReplacer.__init__(self, matchRegex)
    def replace(self, string, m):
        # Both the captured text and the match object are ignored on purpose:
        # the replacement does not depend on what was matched
        fixed = self.replaceStr
        return fixed
 class Replacers:
    """
    Chain of replacers applied one after another, each to the previous one's output.

    >>> replacer.replaceAll('location = "http://example.com/abc.html"')
    'WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"'
    >>> replacer.replaceAll('cool_Location = "http://example.com/abc.html"')
    'cool_Location = "/web/20131010im_/http://example.com/abc.html"'
    >>> replacer.replaceAll('window.location = "http://example.com/abc.html"')
    'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"'
    """
    def __init__(self, replacers):
        # replacers: iterable of objects exposing replaceAll(text); order matters
        self.replacers = replacers
    def replaceAll(self, string):
        # Thread the text through every replacer in sequence
        current = string
        for stage in self.replacers:
            current = stage.replaceAll(current)
        return current
 # Shared fixture used by the Replacers doctests: URL prefixes are rewritten
 # first, then the fixed-string replacer runs on the result.
 # NOTE(review): the second replacer substitutes 'WB_wombat_location' for both
 # 'location' and 'domain' -- confirm that is intended.
 replacer = Replacers([
     HttpMatchReplacer(ArchivalUrlRewriter('/20131010im_/http://abc.com/XYZ/', '/web/')),
     CustomMatchReplacer('[^\w]?(location|domain)', 'WB_wombat_location'),
 ])
 # =================================
 # Rewriter instance passed to replaceAll() in the MultiRegexReplacer doctests
 arw = ArchivalUrlRewriter('/20131010im_/http://abc.com/XYZ/', '/web/')
 class MultiRegexReplacer:
    """
    Single-pass replacer built from an ordered list of (regex, op) rules.

    Every rule regex becomes one capture group of a combined alternation. The
    op of the matching rule is called as op(rewriter, text) when it is callable;
    otherwise str(op) is used as the replacement.

    >>> MultiRegexReplacer().replaceAll('location = "http://example.com/abc.html"', arw)
    'WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"'
    >>> MultiRegexReplacer().replaceAll('cool_Location = "http://example.com/abc.html"', arw)
    'cool_Location = "/web/20131010im_/http://example.com/abc.html"'
    >>> MultiRegexReplacer().replaceAll('window.location = "http://example.com/abc.html" document.domain = "anotherdomain.com"', arw)
    'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"'
    """
    # Rules used when the constructor receives none
    DEFAULT_RULES = [
     ('https?:\\\\?/\\\\?/[A-Za-z0-9:_@.-]+', ArchivalUrlRewriter.rewrite),
     ('location', 'WB_wombat_location'),
     ('domain', 'WB_wombat_domain'),
     ('some_func\(\)', '/* \\1 */')
     ]
    def __init__(self, rules = None):
        # A missing or empty rule list falls back to the class defaults
        chosen = rules if rules else MultiRegexReplacer.DEFAULT_RULES
        # One capture group per rule, joined into a single alternation
        alternation = '|'.join('(' + rx + ')' for rx, _ in chosen)
        # The lookbehind stops a match from starting in the middle of a word;
        # the alternation itself is wrapped in a non-capturing group
        self.regex = re.compile('(?<!\w)(?:' + alternation + ')')
        self.rules = chosen
    def replaceAll(self, string, rewriter):
        """
        Return string with every rule match replaced; rewriter is passed to callable ops.
        """
        pieces = []
        cursor = 0
        for match in self.regex.finditer(string):
            # The first non-empty group identifies the rule that matched
            # (the index stays on the last rule if no group is non-empty)
            for idx, captured in enumerate(match.groups()):
                if captured:
                    break
            # Group numbers are 1-based because group 0 is the entire match
            begin, finish = match.span(idx + 1)
            pieces.append(string[cursor:begin])
            # Second element of the idx-th rule tuple is its op
            op = self.rules[idx][1]
            if hasattr(op, '__call__'):
                pieces.append(op(rewriter, string[begin:finish]))
            else:
                pieces.append(str(op))
            cursor = finish
        pieces.append(string[cursor:])
        return ''.join(pieces)
 class RxRep:
    """
    Rule-driven regex replacer: one compiled alternation, one op per rule.

    Each rule is a (regex, op) pair. A callable op receives the matched text
    and returns its replacement; any other op is treated as a prefix that is
    put in front of the matched text (see DEFAULT_OP).

    >>> test_repl('location = "http://example.com/abc.html"')
    'WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"'
    >>> test_repl('cool_Location = "http://example.com/abc.html"')
    'cool_Location = "/web/20131010im_/http://example.com/abc.html"'
    >>> test_repl('window.location = "http://example.com/abc.html" document.domain = "anotherdomain.com"')
    'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"'
    >>> test_repl('window.location = "http://example.com/abc.html"; some_func(); ')
    'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"; /*some_func()*/; '
    """
    @staticmethod
    def commentOut(string):
        """Return string wrapped in a /* ... */ comment."""
        return '/*' + string + '*/'
    @staticmethod
    def removeHttps(string):
        """Return string with a leading 'https' scheme downgraded to 'http'."""
        # Only the scheme at the very start is rewritten: a blanket
        # str.replace() would also mangle 'https' inside the host, path or query.
        if string.startswith('https'):
            return 'http' + string[len('https'):]
        return string
    @staticmethod
    def addPrefix(prefix):
        """Return an op that prepends prefix to the matched text."""
        return lambda string: prefix + string
    # 'http' or 'https', a colon, two slashes (each optionally preceded by a
    # backslash), then one or more host-like characters
    HTTP_MATCH_REGEX = 'https?:\\\\?/\\\\?/[A-Za-z0-9:_@.-]+'
    # Factory applied to non-callable ops: turns the op into function(matched_text)
    DEFAULT_OP = addPrefix
    def __init__(self, rules):
        """
        Compile rules, a list of (regex, op) pairs, into one combined pattern.

        Rule order matters: the i-th rule becomes the i-th capture group, so a
        rule regex should not add capture groups of its own.
        """
        # One capture group per rule, joined into a single alternation
        regexStr = '|'.join('(' + rx + ')' for rx, _ in rules)
        # The lookbehind stops a match from starting in the middle of a word;
        # the alternation itself is wrapped in a non-capturing group
        regexStr = r'(?<!\w)(?:' + regexStr + ')'
        self.regex = re.compile(regexStr)
        self.rules = rules
    def replaceAll(self, string):
        """Return string with every rule match replaced by its rule's op result."""
        return self.regex.sub(self.replace, string)
    def replace(self, m):
        """
        Return the replacement for match m, using the op of the rule that matched.

        Raises re.error when no rule group captured any text.
        """
        for group, (_, op) in zip(m.groups(), self.rules):
            if group:
                # Non-callable ops are turned into a function via DEFAULT_OP
                if not callable(op):
                    op = RxRep.DEFAULT_OP(op)
                return op(group)
        raise re.error('No Match Found for replacement')
 class JSRewriter(RxRep):
    """
    RxRep preloaded with rules for URL prefixes and the 'location' / 'domain' names.
    """
    def __init__(self, httpPrefix, extra = None):
        """
        httpPrefix: string prepended to every text matched by RxRep.HTTP_MATCH_REGEX.
        extra: optional list of additional (regex, op) rules appended after the
               built-in ones.
        """
        rules = self._createRules(httpPrefix)
        # None is the sentinel for "no extra rules"; it replaces a mutable
        # default list that would be shared across calls
        if extra is not None:
            rules.extend(extra)
        RxRep.__init__(self, rules)
    def _createRules(self, httpPrefix):
        # Non-callable ops are prefixes (see RxRep.DEFAULT_OP), so 'location'
        # becomes 'WB_wombat_location' and 'domain' becomes 'WB_wombat_domain'
        return [
             (RxRep.HTTP_MATCH_REGEX, httpPrefix),
             ('location', 'WB_wombat_'),
             ('domain', 'WB_wombat_'),
        ]
 if __name__ == "__main__":
    import doctest
    # Helper called by the RxRep doctests; rxrep is looked up when the helper
    # runs, so it can be defined before rxrep itself
    def test_repl(string):
        return rxrep.replaceAll(string)
    # Extra rule exercised by the last RxRep doctest: comment out some_func() calls
    extra = [('some_func\(\)', RxRep.commentOut)]
    rxrep = JSRewriter('/web/20131010im_/', extra)
    # Run every doctest in this module
    doctest.testmod()
--- a/pywb/wbhtml.py
+++ b/pywb/wbhtml.py
@ -4,9 +4,9 @@ import re
 from HTMLParser import HTMLParser
 from wburlrewriter import ArchivalUrlRewriter
-
+#=================================================================
-
+# WBHtml --html parser for custom rewriting, also handlers for script and css
-# create a subclass and override the handler methods
+#=================================================================
 class WBHtml(HTMLParser):
    """
    >>> WBHtml(rewriter).feed('<HTML><A Href="page.html">Text</a></hTmL>')
@ -77,12 +77,7 @@ class WBHtml(HTMLParser):
        return metaRefresh
    # ===========================
    NO_REWRITE_PREFIX = ['javascript:', 'data:', 'mailto:']
    def _rewriteURL(self, value, mod = None):
        if any (value.startswith(x) for x in WBHtml.NO_REWRITE_PREFIX):
            return value
        return self.rewriter.rewrite(value, mod)
--- a/pywb/wburlrewriter.py
+++ b/pywb/wburlrewriter.py
@ -33,6 +33,10 @@ class ArchivalUrlRewriter:
    '/2020/http://example.com/other.html'
      """
    NO_REWRITE_URI_PREFIX = ['javascript:', 'data:', 'mailto:', 'about:']
    PROTOCOLS = ['http://', 'https://', '//', 'mms://', 'rtsp://', 'wais://']
    def __init__(self, wburl_str, prefix):
        self.wburl = ArchivalUrl(wburl_str)
        self.prefix = prefix
@ -40,23 +44,34 @@ class ArchivalUrlRewriter:
        if self.prefix.endswith('/'):
            self.prefix = self.prefix[:-1]
-    def rewrite(self, rel_url, mod = None):
+    def rewrite(self, url, mod = None):
        # if special protocol, no rewriting at all
        if any (url.startswith(x) for x in ArchivalUrlRewriter.NO_REWRITE_URI_PREFIX):
            return url
        wburl = self.wburl
-        # Disable optimization, doesn't work for external links
+        isAbs = any (url.startswith(x) for x in ArchivalUrlRewriter.PROTOCOLS)
        # if relative path or different mod, create rewrite from split up ArchivalUrl
        #if rel_url.startswith('/') or ('../' in rel_url) or mod:
        newUrl = urlparse.urljoin(wburl.url, rel_url).replace('../', '')
-        if mod is None:
+        # Optimized rewriter for
-            mod = wburl.mod
+        # -rel urls that don't start with / and  don't contain ../ and no special mod
        if not (isAbs or mod or url.startswith('/') or ('../' in url)):
            finalUrl = urlparse.urljoin(self.prefix + wburl.original_url, url)
-        final_url = self.prefix + ArchivalUrl.to_str(wburl.type, mod, wburl.timestamp, newUrl)
+        else:
-        # otherwise, optimize, and join directly with full url
+            # optimize: join if not absolute url, otherwise just use that
-        #else:
+            if not isAbs:
-        #    final_url = urlparse.urljoin(self.prefix + wburl.original_url, rel_url)
+                newUrl = urlparse.urljoin(wburl.url, url).replace('../', '')
            else:
                newUrl = url
            if mod is None:
                mod = wburl.mod
            finalUrl = self.prefix + ArchivalUrl.to_str(wburl.type, mod, wburl.timestamp, newUrl)
        return finalUrl
        return final_url
    def setBaseUrl(self, newUrl):
        self.wburl.url = newUrl