1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

move no-rewrite prefixes down to ArchivalUrlRewriter (was in html parser)

Add new general regex matching work (several attempts, though the last one is the simplest/best!)
This commit is contained in:
Ilya Kreymer 2013-12-23 15:52:33 -08:00
parent 37e57f7013
commit 3a896f7cd3
3 changed files with 260 additions and 20 deletions

230
pywb/regexmatch.py Normal file
View File

@ -0,0 +1,230 @@
import re
import sys
from wburlrewriter import ArchivalUrlRewriter
class RegexMatchReplacer:
    """Scan a string with a compiled regex and rewrite capture group 1 of each match.

    Subclasses override replace() to supply the actual substitution; the base
    implementation is the identity, so the input string is returned unchanged.
    """

    def __init__(self, regexStr):
        # Compile once up front; subclasses pass in their own pattern.
        self.regex = re.compile(regexStr)

    def replaceAll(self, string):
        """Return *string* with every group-1 span passed through replace()."""
        pieces = []
        cursor = 0
        for match in self.regex.finditer(string):
            begin, finish = match.start(1), match.end(1)
            pieces.append(string[cursor:begin])
            pieces.append(self.replace(string[begin:finish], match))
            cursor = finish
        pieces.append(string[cursor:])
        return ''.join(pieces)

    def replace(self, string, m):
        # Identity by default; hook point for subclasses.
        return string
class HttpMatchReplacer(RegexMatchReplacer):
    """Rewrite http(s) URLs (slashes possibly backslash-escaped) via an ArchivalUrlRewriter."""

    # Matches http:// or https://, allowing a backslash before each slash
    # (as seen in JS string literals), followed by host-ish characters.
    HTTP_REGEX = "(https?:\\\\?/\\\\?/[A-Za-z0-9:_@.-]+)"

    def __init__(self, rewriter):
        self.rewriter = rewriter
        RegexMatchReplacer.__init__(self, HttpMatchReplacer.HTTP_REGEX)

    def replace(self, string, m):
        # Each matched URL is delegated to the archival rewriter.
        return self.rewriter.rewrite(string)
class CustomMatchReplacer(RegexMatchReplacer):
    """Replace every match of a caller-supplied regex with one fixed string."""

    def __init__(self, matchRegex, replaceStr):
        self.replaceStr = replaceStr
        RegexMatchReplacer.__init__(self, matchRegex)

    def replace(self, string, m):
        # Constant substitution, independent of what actually matched.
        return self.replaceStr
class Replacers:
    """
    >>> replacer.replaceAll('location = "http://example.com/abc.html"')
    'WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"'
    >>> replacer.replaceAll('cool_Location = "http://example.com/abc.html"')
    'cool_Location = "/web/20131010im_/http://example.com/abc.html"'
    >>> replacer.replaceAll('window.location = "http://example.com/abc.html"')
    'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"'
    """

    def __init__(self, replacers):
        # Applied in list order; later replacers see earlier output.
        self.replacers = replacers

    def replaceAll(self, string):
        """Pipe *string* through every replacer in sequence."""
        current = string
        for repl in self.replacers:
            current = repl.replaceAll(current)
        return current
# Module-level fixtures referenced by the doctests above: a Replacers chain
# that first archival-prefixes http(s) URLs, then swaps location/domain
# references for 'WB_wombat_location'.
replacer = Replacers([HttpMatchReplacer(ArchivalUrlRewriter('/20131010im_/http://abc.com/XYZ/', '/web/')), CustomMatchReplacer('[^\w]?(location|domain)', 'WB_wombat_location')])
# =================================
# Standalone rewriter fixture used by the MultiRegexReplacer doctests below.
arw = ArchivalUrlRewriter('/20131010im_/http://abc.com/XYZ/', '/web/')
class MultiRegexReplacer:
    """
    >>> MultiRegexReplacer().replaceAll('location = "http://example.com/abc.html"', arw)
    'WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"'
    >>> MultiRegexReplacer().replaceAll('cool_Location = "http://example.com/abc.html"', arw)
    'cool_Location = "/web/20131010im_/http://example.com/abc.html"'
    >>> MultiRegexReplacer().replaceAll('window.location = "http://example.com/abc.html" document.domain = "anotherdomain.com"', arw)
    'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"'
    """

    # (pattern, op) pairs: a callable op is invoked as op(rewriter, text),
    # anything else is substituted verbatim via str().
    DEFAULT_RULES = [
        ('https?:\\\\?/\\\\?/[A-Za-z0-9:_@.-]+', ArchivalUrlRewriter.rewrite),
        ('location', 'WB_wombat_location'),
        ('domain', 'WB_wombat_domain'),
        ('some_func\(\)', '/* \\1 */')
    ]

    def __init__(self, rules=None):
        if not rules:
            rules = MultiRegexReplacer.DEFAULT_RULES
        # One capture group per rule, joined into a single alternation;
        # the lookbehind prevents matches starting mid-word.
        alternation = '|'.join('(' + rx + ')' for rx, op in rules)
        self.regex = re.compile('(?<!\\w)(?:' + alternation + ')')
        self.rules = rules

    def replaceAll(self, string, rewriter):
        """Apply every rule to *string*, routing callables through *rewriter*."""
        out = []
        cursor = 0
        for m in self.regex.finditer(string):
            # Find which alternative fired: first non-empty group.
            idx = 0
            for idx, grp in enumerate(m.groups()):
                if grp:
                    break
            # Capture groups are 1-based (group 0 is the whole match).
            begin, finish = m.start(idx + 1), m.end(idx + 1)
            out.append(string[cursor:begin])
            op = self.rules[idx][1]
            if hasattr(op, '__call__'):
                out.append(op(rewriter, string[begin:finish]))
            else:
                out.append(str(op))
            cursor = finish
        out.append(string[cursor:])
        return ''.join(out)
class RxRep:
    """
    >>> test_repl('location = "http://example.com/abc.html"')
    'WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"'
    >>> test_repl('cool_Location = "http://example.com/abc.html"')
    'cool_Location = "/web/20131010im_/http://example.com/abc.html"'
    >>> test_repl('window.location = "http://example.com/abc.html" document.domain = "anotherdomain.com"')
    'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"'
    >>> test_repl('window.location = "http://example.com/abc.html"; some_func(); ')
    'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"; /*some_func()*/; '
    """

    @staticmethod
    def commentOut(string):
        # Wrap the matched text in a JS block comment.
        return '/*' + string + '*/'

    @staticmethod
    def removeHttps(string):
        # Downgrade https to http in the matched text.
        return string.replace("https", "http")

    @staticmethod
    def addPrefix(prefix):
        # Build an op that prepends *prefix* to whatever matched.
        return lambda string: prefix + string

    HTTP_MATCH_REGEX = 'https?:\\\\?/\\\\?/[A-Za-z0-9:_@.-]+'

    # Non-callable rule values are turned into prefix ops via addPrefix.
    DEFAULT_OP = addPrefix

    def __init__(self, rules):
        # One capture group per rule; the lookbehind blocks mid-word matches.
        alternation = '|'.join('(' + rx + ')' for rx, op in rules)
        self.regex = re.compile('(?<!\\w)(?:' + alternation + ')')
        self.rules = rules

    def replaceAll(self, string):
        """Substitute every rule match in *string*."""
        return self.regex.sub(self.replace, string)

    def replace(self, m):
        # Pair each group with its rule; the non-empty group is the one that fired.
        for group, (_, op) in zip(m.groups(), self.rules):
            if not group:
                continue
            if not hasattr(op, '__call__'):
                op = RxRep.DEFAULT_OP(op)
            return op(group)
        raise re.error('No Match Found for replacement')
class JSRewriter(RxRep):
    """Rule-driven JS rewriter: archival-prefixes URLs and shims location/domain."""

    def __init__(self, httpPrefix, extra = []):
        # NOTE: the [] default is safe here — it is never mutated.
        rules = self._createRules(httpPrefix) + list(extra)
        RxRep.__init__(self, rules)

    def _createRules(self, httpPrefix):
        # URL matches gain the archival prefix; location/domain references
        # are prefixed with WB_wombat_ so the wombat shim intercepts them.
        return [
            (RxRep.HTTP_MATCH_REGEX, httpPrefix),
            ('location', 'WB_wombat_'),
            ('domain', 'WB_wombat_'),
        ]
if __name__ == "__main__":
    import doctest

    # Build the fixtures the RxRep doctests reference: a JSRewriter with an
    # extra rule that comments out some_func() calls.
    extra = [('some_func\\(\\)', RxRep.commentOut)]
    rxrep = JSRewriter('/web/20131010im_/', extra)

    def test_repl(string):
        return rxrep.replaceAll(string)

    doctest.testmod()

View File

@ -4,9 +4,9 @@ import re
from HTMLParser import HTMLParser
from wburlrewriter import ArchivalUrlRewriter
# create a subclass and override the handler methods
#=================================================================
# WBHtml --html parser for custom rewriting, also handlers for script and css
#=================================================================
class WBHtml(HTMLParser):
"""
>>> WBHtml(rewriter).feed('<HTML><A Href="page.html">Text</a></hTmL>')
@ -77,12 +77,7 @@ class WBHtml(HTMLParser):
return metaRefresh
# ===========================
NO_REWRITE_PREFIX = ['javascript:', 'data:', 'mailto:']
def _rewriteURL(self, value, mod = None):
    # Leave non-archivable schemes (javascript:, data:, mailto:) untouched;
    # everything else is delegated to the shared ArchivalUrlRewriter.
    # NOTE(review): per this commit, scheme filtering also moved into the
    # rewriter itself, so this check looks redundant — confirm before removing.
    if any (value.startswith(x) for x in WBHtml.NO_REWRITE_PREFIX):
        return value
    return self.rewriter.rewrite(value, mod)

View File

@ -33,6 +33,10 @@ class ArchivalUrlRewriter:
'/2020/http://example.com/other.html'
"""
NO_REWRITE_URI_PREFIX = ['javascript:', 'data:', 'mailto:', 'about:']
PROTOCOLS = ['http://', 'https://', '//', 'mms://', 'rtsp://', 'wais://']
def __init__(self, wburl_str, prefix):
self.wburl = ArchivalUrl(wburl_str)
self.prefix = prefix
@ -40,23 +44,34 @@ class ArchivalUrlRewriter:
if self.prefix.endswith('/'):
self.prefix = self.prefix[:-1]
def rewrite(self, rel_url, mod = None):
def rewrite(self, url, mod = None):
# if special protocol, no rewriting at all
if any (url.startswith(x) for x in ArchivalUrlRewriter.NO_REWRITE_URI_PREFIX):
return url
wburl = self.wburl
# Disable optimization, doesn't work for external links
# if relative path or different mod, create rewrite from split up ArchivalUrl
#if rel_url.startswith('/') or ('../' in rel_url) or mod:
newUrl = urlparse.urljoin(wburl.url, rel_url).replace('../', '')
isAbs = any (url.startswith(x) for x in ArchivalUrlRewriter.PROTOCOLS)
if mod is None:
mod = wburl.mod
# Optimized rewriter for
# -rel urls that don't start with / and don't contain ../ and no special mod
if not (isAbs or mod or url.startswith('/') or ('../' in url)):
finalUrl = urlparse.urljoin(self.prefix + wburl.original_url, url)
final_url = self.prefix + ArchivalUrl.to_str(wburl.type, mod, wburl.timestamp, newUrl)
# otherwise, optimize, and join directly with full url
#else:
# final_url = urlparse.urljoin(self.prefix + wburl.original_url, rel_url)
else:
# optimize: join if not absolute url, otherwise just use that
if not isAbs:
newUrl = urlparse.urljoin(wburl.url, url).replace('../', '')
else:
newUrl = url
if mod is None:
mod = wburl.mod
finalUrl = self.prefix + ArchivalUrl.to_str(wburl.type, mod, wburl.timestamp, newUrl)
return finalUrl
return final_url
def setBaseUrl(self, newUrl):
    # Repoint the base URL that relative links resolve against.
    # NOTE(review): presumably called when an HTML <base href> is seen — confirm against caller.
    self.wburl.url = newUrl