From 3a896f7cd3b9a69d407a8728ab3e725cbceb57a0 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer <ilya@archive.org>
Date: Mon, 23 Dec 2013 15:52:33 -0800
Subject: [PATCH] move norewrite prefixes down to ArchivalUrlRewriter (was in
 html parser). Add new general regex match work (several attempts, though last
 one is simplest/best!)

---
 pywb/regexmatch.py    | 230 ++++++++++++++++++++++++++++++++++++++++++
 pywb/wbhtml.py        |  11 +-
 pywb/wburlrewriter.py |  39 ++++---
 3 files changed, 260 insertions(+), 20 deletions(-)
 create mode 100644 pywb/regexmatch.py

diff --git a/pywb/regexmatch.py b/pywb/regexmatch.py
new file mode 100644
index 00000000..7c0147f3
--- /dev/null
+++ b/pywb/regexmatch.py
@@ -0,0 +1,230 @@
+import re
+import sys
+from wburlrewriter import ArchivalUrlRewriter
+
+class RegexMatchReplacer:
+    def __init__(self, regexStr):
+        self.regex = re.compile(regexStr)
+
+    def replaceAll(self, string):
+        last = 0
+        result = ''
+        for m in self.regex.finditer(string):
+            start = m.start(1)
+            end = m.end(1)
+            result += string[last:start]
+            result += self.replace(string[start:end], m)
+            last = end
+
+        result += string[last:]
+        return result
+
+    def replace(self, string, m):
+        return string
+
+
+class HttpMatchReplacer(RegexMatchReplacer):
+    HTTP_REGEX = "(https?:\\\\?/\\\\?/[A-Za-z0-9:_@.-]+)"
+
+    def __init__(self, rewriter):
+        RegexMatchReplacer.__init__(self, HttpMatchReplacer.HTTP_REGEX)
+        self.rewriter = rewriter
+
+    def replace(self, string, m):
+        return self.rewriter.rewrite(string)
+
+class CustomMatchReplacer(RegexMatchReplacer):
+    def __init__(self, matchRegex, replaceStr):
+        RegexMatchReplacer.__init__(self, matchRegex)
+        self.replaceStr = replaceStr
+
+    def replace(self, string, m):
+        return self.replaceStr
+
+class Replacers:
+    """
+    >>> replacer.replaceAll('location = "http://example.com/abc.html"')
+    'WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"'
+
+    >>> replacer.replaceAll('cool_Location = "http://example.com/abc.html"')
+    'cool_Location = "/web/20131010im_/http://example.com/abc.html"'
+
+    >>> replacer.replaceAll('window.location = "http://example.com/abc.html"')
+    'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"'
+    """
+
+    def __init__(self, replacers):
+        self.replacers = replacers
+
+    def replaceAll(self, string):
+        for x in self.replacers:
+            string = x.replaceAll(string)
+
+        return string
+
+replacer = Replacers([HttpMatchReplacer(ArchivalUrlRewriter('/20131010im_/http://abc.com/XYZ/', '/web/')), CustomMatchReplacer('[^\w]?(location|domain)', 'WB_wombat_location')])
+
+# =================================
+arw = ArchivalUrlRewriter('/20131010im_/http://abc.com/XYZ/', '/web/')
+
+
+
+class MultiRegexReplacer:
+    """
+    >>> MultiRegexReplacer().replaceAll('location = "http://example.com/abc.html"', arw)
+    'WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"'
+
+    >>> MultiRegexReplacer().replaceAll('cool_Location = "http://example.com/abc.html"', arw)
+    'cool_Location = "/web/20131010im_/http://example.com/abc.html"'
+
+    >>> MultiRegexReplacer().replaceAll('window.location = "http://example.com/abc.html" document.domain = "anotherdomain.com"', arw)
+    'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"'
+
+    """
+
+    DEFAULT_RULES = [
+     ('https?:\\\\?/\\\\?/[A-Za-z0-9:_@.-]+', ArchivalUrlRewriter.rewrite),
+     ('location', 'WB_wombat_location'),
+     ('domain', 'WB_wombat_domain'),
+     ('some_func\(\)', '/* \\1 */')
+     ]
+
+    def __init__(self, rules = None):
+        if not rules:
+            rules = MultiRegexReplacer.DEFAULT_RULES
+
+        # Build regexstr, concatenating regex list
+        regexStr = '|'.join(['(' + rx + ')' for rx, op in rules])
+
+        # ensure it's not middle of a word, wrap in non-capture group
+        regexStr = '(?<!\w)(?:' + regexStr + ')'
+
+        self.regex = re.compile(regexStr)
+        self.rules = rules
+
+    def replaceAll(self, string, rewriter):
+        last = 0
+        result = ''
+
+        for m in self.regex.finditer(string):
+
+            groups = m.groups()
+
+            numGroups = len(groups)
+
+            for g, i in zip(groups, range(numGroups)):
+                if g:
+                    break
+
+            # Add 1 as group 0 is always entire match
+            start = m.start(i + 1)
+            end = m.end(i + 1)
+
+            result += string[last:start]
+
+            # i-th rule; element [1] of the (regex, op) tuple is the replacement op
+            op = self.rules[i][1]
+
+            if hasattr(op, '__call__'):
+                result += op(rewriter, string[start:end])
+            else:
+                result += str(op)
+
+            last = end
+
+        result += string[last:]
+        return result
+
+
+
+class RxRep:
+    """
+    >>> test_repl('location = "http://example.com/abc.html"')
+    'WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"'
+
+    >>> test_repl('cool_Location = "http://example.com/abc.html"')
+    'cool_Location = "/web/20131010im_/http://example.com/abc.html"'
+
+    >>> test_repl('window.location = "http://example.com/abc.html" document.domain = "anotherdomain.com"')
+    'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"'
+
+    >>> test_repl('window.location = "http://example.com/abc.html"; some_func(); ')
+    'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"; /*some_func()*/; '
+
+    """
+
+    @staticmethod
+    def commentOut(string):
+        return '/*' + string + '*/'
+
+    @staticmethod
+    def removeHttps(string):
+        return string.replace("https", "http")
+
+    @staticmethod
+    def addPrefix(prefix):
+        return lambda string: prefix + string
+
+    HTTP_MATCH_REGEX = 'https?:\\\\?/\\\\?/[A-Za-z0-9:_@.-]+'
+
+    DEFAULT_OP = addPrefix
+
+
+    def __init__(self, rules):
+        #rules = self.createRules(httpPrefix)
+
+        # Build regexstr, concatenating regex list
+        regexStr = '|'.join(['(' + rx + ')' for rx, op in rules])
+
+        # ensure it's not middle of a word, wrap in non-capture group
+        regexStr = '(?<!\w)(?:' + regexStr + ')'
+
+        self.regex = re.compile(regexStr)
+        self.rules = rules
+
+    def replaceAll(self, string):
+        return self.regex.sub(lambda x: self.replace(x), string)
+
+    def replace(self, m):
+        for group, (_, op) in zip(m.groups(), self.rules):
+            if group:
+                # Not callable: treat op as a prefix string and wrap it with DEFAULT_OP
+                if not hasattr(op, '__call__'):
+                    op = RxRep.DEFAULT_OP(op)
+
+                return op(group)
+
+        raise re.error('No Match Found for replacement')
+
+
+class JSRewriter(RxRep):
+    def __init__(self, httpPrefix, extra = []):
+        rules = self._createRules(httpPrefix)
+        rules.extend(extra)
+ 
+        RxRep.__init__(self, rules)
+
+
+    def _createRules(self, httpPrefix):
+        return [
+             (RxRep.HTTP_MATCH_REGEX, httpPrefix),
+             ('location', 'WB_wombat_'),
+             ('domain', 'WB_wombat_'),
+        ]
+
+
+
+if __name__ == "__main__":
+    import doctest
+
+    extra = [('some_func\(\)', RxRep.commentOut)]
+
+    rxrep = JSRewriter('/web/20131010im_/', extra)
+
+    def test_repl(string):
+        return rxrep.replaceAll(string)
+
+    doctest.testmod()
+
+
+
diff --git a/pywb/wbhtml.py b/pywb/wbhtml.py
index d707eef3..0a965d74 100644
--- a/pywb/wbhtml.py
+++ b/pywb/wbhtml.py
@@ -4,9 +4,9 @@ import re
 from HTMLParser import HTMLParser
 from wburlrewriter import ArchivalUrlRewriter
 
-
-
-# create a subclass and override the handler methods
+#=================================================================
+# WBHtml -- html parser for custom rewriting, also has handlers for script and css
+#=================================================================
 class WBHtml(HTMLParser):
     """
     >>> WBHtml(rewriter).feed('<HTML><A Href="page.html">Text</a></hTmL>')
@@ -77,12 +77,7 @@ class WBHtml(HTMLParser):
         return metaRefresh
     # ===========================
 
-    NO_REWRITE_PREFIX = ['javascript:', 'data:', 'mailto:']
-
     def _rewriteURL(self, value, mod = None):
-        if any (value.startswith(x) for x in WBHtml.NO_REWRITE_PREFIX):
-            return value
-
         return self.rewriter.rewrite(value, mod)
 
 
diff --git a/pywb/wburlrewriter.py b/pywb/wburlrewriter.py
index 041dc753..5a2bc260 100644
--- a/pywb/wburlrewriter.py
+++ b/pywb/wburlrewriter.py
@@ -33,6 +33,10 @@ class ArchivalUrlRewriter:
     '/2020/http://example.com/other.html'
       """
 
+    NO_REWRITE_URI_PREFIX = ['javascript:', 'data:', 'mailto:', 'about:']
+
+    PROTOCOLS = ['http://', 'https://', '//', 'mms://', 'rtsp://', 'wais://']
+
     def __init__(self, wburl_str, prefix):
         self.wburl = ArchivalUrl(wburl_str)
         self.prefix = prefix
@@ -40,23 +44,34 @@ class ArchivalUrlRewriter:
         if self.prefix.endswith('/'):
             self.prefix = self.prefix[:-1]
 
-    def rewrite(self, rel_url, mod = None):
+    def rewrite(self, url, mod = None):
+        # if special protocol, no rewriting at all
+        if any (url.startswith(x) for x in ArchivalUrlRewriter.NO_REWRITE_URI_PREFIX):
+            return url
+
         wburl = self.wburl
 
-        # Disable optimization, doesn't work for external links
-        # if relative path or different mod, create rewrite from split up ArchivalUrl
-        #if rel_url.startswith('/') or ('../' in rel_url) or mod:
-        newUrl = urlparse.urljoin(wburl.url, rel_url).replace('../', '')
+        isAbs = any (url.startswith(x) for x in ArchivalUrlRewriter.PROTOCOLS)
 
-        if mod is None:
-            mod = wburl.mod
+        # Optimized path for relative urls that don't start with '/',
+        # don't contain '../' and have no special mod: join directly with the full url
+        if not (isAbs or mod or url.startswith('/') or ('../' in url)):
+            finalUrl = urlparse.urljoin(self.prefix + wburl.original_url, url)
 
-        final_url = self.prefix + ArchivalUrl.to_str(wburl.type, mod, wburl.timestamp, newUrl)
-        # otherwise, optimize, and join directly with full url
-        #else:
-        #    final_url = urlparse.urljoin(self.prefix + wburl.original_url, rel_url)
+        else:
+            # optimize: join if not absolute url, otherwise just use that
+            if not isAbs:
+                newUrl = urlparse.urljoin(wburl.url, url).replace('../', '')
+            else:
+                newUrl = url
+
+            if mod is None:
+                mod = wburl.mod
+
+            finalUrl = self.prefix + ArchivalUrl.to_str(wburl.type, mod, wburl.timestamp, newUrl)
+
+        return finalUrl
 
-        return final_url
 
     def setBaseUrl(self, newUrl):
         self.wburl.url = newUrl