From 3a896f7cd3b9a69d407a8728ab3e725cbceb57a0 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 23 Dec 2013 15:52:33 -0800 Subject: [PATCH] move norewrite prefixs down to ArchivalUrlRewriter (was in html parser) Add new general regex match work, (several attempts, though last one is simplest/best!) --- pywb/regexmatch.py | 230 ++++++++++++++++++++++++++++++++++++++++++ pywb/wbhtml.py | 11 +- pywb/wburlrewriter.py | 39 ++++--- 3 files changed, 260 insertions(+), 20 deletions(-) create mode 100644 pywb/regexmatch.py diff --git a/pywb/regexmatch.py b/pywb/regexmatch.py new file mode 100644 index 00000000..7c0147f3 --- /dev/null +++ b/pywb/regexmatch.py @@ -0,0 +1,230 @@ +import re +import sys +from wburlrewriter import ArchivalUrlRewriter + +class RegexMatchReplacer: + def __init__(self, regexStr): + self.regex = re.compile(regexStr) + + def replaceAll(self, string): + last = 0 + result = '' + for m in self.regex.finditer(string): + start = m.start(1) + end = m.end(1) + result += string[last:start] + result += self.replace(string[start:end], m) + last = end + + result += string[last:] + return result + + def replace(self, string, m): + return string + + +class HttpMatchReplacer(RegexMatchReplacer): + HTTP_REGEX = "(https?:\\\\?/\\\\?/[A-Za-z0-9:_@.-]+)" + + def __init__(self, rewriter): + RegexMatchReplacer.__init__(self, HttpMatchReplacer.HTTP_REGEX) + self.rewriter = rewriter + + def replace(self, string, m): + return self.rewriter.rewrite(string) + +class CustomMatchReplacer(RegexMatchReplacer): + def __init__(self, matchRegex, replaceStr): + RegexMatchReplacer.__init__(self, matchRegex) + self.replaceStr = replaceStr + + def replace(self, string, m): + return self.replaceStr + +class Replacers: + """ + >>> replacer.replaceAll('location = "http://example.com/abc.html"') + 'WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"' + + >>> replacer.replaceAll('cool_Location = "http://example.com/abc.html"') + 'cool_Location = "/web/20131010im_/http://example.com/abc.html"' + + >>> replacer.replaceAll('window.location = "http://example.com/abc.html"') + 'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"' + """ + + def __init__(self, replacers): + self.replacers = replacers + + def replaceAll(self, string): + for x in self.replacers: + string = x.replaceAll(string) + + return string + +replacer = Replacers([HttpMatchReplacer(ArchivalUrlRewriter('/20131010im_/http://abc.com/XYZ/', '/web/')), CustomMatchReplacer('[^\w]?(location|domain)', 'WB_wombat_location')]) + +# ================================= +arw = ArchivalUrlRewriter('/20131010im_/http://abc.com/XYZ/', '/web/') + + + +class MultiRegexReplacer: + """ + >>> MultiRegexReplacer().replaceAll('location = "http://example.com/abc.html"', arw) + 'WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"' + + >>> MultiRegexReplacer().replaceAll('cool_Location = "http://example.com/abc.html"', arw) + 'cool_Location = "/web/20131010im_/http://example.com/abc.html"' + + >>> MultiRegexReplacer().replaceAll('window.location = "http://example.com/abc.html" document.domain = "anotherdomain.com"', arw) + 'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"' + + """ + + DEFAULT_RULES = [ + ('https?:\\\\?/\\\\?/[A-Za-z0-9:_@.-]+', ArchivalUrlRewriter.rewrite), + ('location', 'WB_wombat_location'), + ('domain', 'WB_wombat_domain'), + ('some_func\(\)', '/* \\1 */') + ] + + def __init__(self, rules = None): + if not rules: + rules = MultiRegexReplacer.DEFAULT_RULES + + # Build regexstr, concatenating regex list + regexStr = '|'.join(['(' + rx + ')' for rx, op in rules]) + + # ensure it's not middle of a word, wrap in non-capture group + regexStr = '(?>> test_repl('location = "http://example.com/abc.html"') + 'WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"' + + >>> test_repl('cool_Location = "http://example.com/abc.html"') + 'cool_Location = "/web/20131010im_/http://example.com/abc.html"' + + >>> test_repl('window.location = "http://example.com/abc.html" document.domain = "anotherdomain.com"') + 'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"' + + >>> test_repl('window.location = "http://example.com/abc.html"; some_func(); ') + 'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"; /*some_func()*/; ' + + """ + + @staticmethod + def commentOut(string): + return '/*' + string + '*/' + + @staticmethod + def removeHttps(string): + return string.replace("https", "http") + + @staticmethod + def addPrefix(prefix): + return lambda string: prefix + string + + HTTP_MATCH_REGEX = 'https?:\\\\?/\\\\?/[A-Za-z0-9:_@.-]+' + + DEFAULT_OP = addPrefix + + + def __init__(self, rules): + #rules = self.createRules(httpPrefix) + + # Build regexstr, concatenating regex list + regexStr = '|'.join(['(' + rx + ')' for rx, op in rules]) + + # ensure it's not middle of a word, wrap in non-capture group + regexStr = '(?>> WBHtml(rewriter).feed('Text') @@ -77,12 +77,7 @@ class WBHtml(HTMLParser): return metaRefresh # =========================== - NO_REWRITE_PREFIX = ['javascript:', 'data:', 'mailto:'] - def _rewriteURL(self, value, mod = None): - if any (value.startswith(x) for x in WBHtml.NO_REWRITE_PREFIX): - return value - return self.rewriter.rewrite(value, mod) diff --git a/pywb/wburlrewriter.py b/pywb/wburlrewriter.py index 041dc753..5a2bc260 100644 --- a/pywb/wburlrewriter.py +++ b/pywb/wburlrewriter.py @@ -33,6 +33,10 @@ class ArchivalUrlRewriter: '/2020/http://example.com/other.html' """ + NO_REWRITE_URI_PREFIX = ['javascript:', 'data:', 'mailto:', 'about:'] + + PROTOCOLS = ['http://', 'https://', '//', 'mms://', 'rtsp://', 'wais://'] + def __init__(self, wburl_str, prefix): self.wburl = ArchivalUrl(wburl_str) self.prefix = prefix @@ -40,23 +44,34 @@ class ArchivalUrlRewriter: if self.prefix.endswith('/'): self.prefix = self.prefix[:-1] - def rewrite(self, rel_url, mod = None): + def rewrite(self, url, mod = None): + # if special protocol, no rewriting at all + if any (url.startswith(x) for x in ArchivalUrlRewriter.NO_REWRITE_URI_PREFIX): + return url + wburl = self.wburl - # Disable optimization, doesn't work for external links - # if relative path or different mod, create rewrite from split up ArchivalUrl - #if rel_url.startswith('/') or ('../' in rel_url) or mod: - newUrl = urlparse.urljoin(wburl.url, rel_url).replace('../', '') + isAbs = any (url.startswith(x) for x in ArchivalUrlRewriter.PROTOCOLS) - if mod is None: - mod = wburl.mod + # Optimized rewriter for + # -rel urls that don't start with / and don't contain ../ and no special mod + if not (isAbs or mod or url.startswith('/') or ('../' in url)): + finalUrl = urlparse.urljoin(self.prefix + wburl.original_url, url) - final_url = self.prefix + ArchivalUrl.to_str(wburl.type, mod, wburl.timestamp, newUrl) - # otherwise, optimize, and join directly with full url - #else: - # final_url = urlparse.urljoin(self.prefix + wburl.original_url, rel_url) + else: + # optimize: join if not absolute url, otherwise just use that + if not isAbs: + newUrl = urlparse.urljoin(wburl.url, url).replace('../', '') + else: + newUrl = url + + if mod is None: + mod = wburl.mod + + finalUrl = self.prefix + ArchivalUrl.to_str(wburl.type, mod, wburl.timestamp, newUrl) + + return finalUrl - return final_url def setBaseUrl(self, newUrl): self.wburl.url = newUrl