mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
Move no-rewrite prefixes down to ArchivalUrlRewriter (was in the HTML parser)
Add new general regex match work (several attempts, though the last one is the simplest/best!)
This commit is contained in:
parent
37e57f7013
commit
3a896f7cd3
230
pywb/regexmatch.py
Normal file
230
pywb/regexmatch.py
Normal file
@ -0,0 +1,230 @@
|
|||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from wburlrewriter import ArchivalUrlRewriter
|
||||||
|
|
||||||
|
class RegexMatchReplacer:
    """Base replacer: rewrites capture group 1 of every match of a regex.

    Subclasses override ``replace`` to decide what the captured text
    becomes; the default implementation leaves it untouched.
    """

    def __init__(self, regexStr):
        # Only the span of capture group 1 is ever replaced, so the
        # pattern is expected to define that group.
        self.regex = re.compile(regexStr)

    def replaceAll(self, string):
        """Return ``string`` with group 1 of each match run through ``replace``."""
        pieces = []
        cursor = 0

        for match in self.regex.finditer(string):
            begin, finish = match.span(1)
            # Text between the previous match and this one is copied verbatim.
            pieces.append(string[cursor:begin])
            pieces.append(self.replace(string[begin:finish], match))
            cursor = finish

        # Trailing text after the final match.
        pieces.append(string[cursor:])
        return ''.join(pieces)

    def replace(self, string, m):
        """Return the replacement for captured text ``string`` of match ``m``."""
        return string
|
||||||
|
|
||||||
|
|
||||||
|
class HttpMatchReplacer(RegexMatchReplacer):
    """Replacer that passes each matched http(s) URL prefix to a rewriter."""

    # "http" or "https", a colon, two slashes (each optionally preceded by a
    # backslash), then a run of characters from [A-Za-z0-9:_@.-].
    HTTP_REGEX = "(https?:\\\\?/\\\\?/[A-Za-z0-9:_@.-]+)"

    def __init__(self, rewriter):
        # rewriter: any object exposing rewrite(string)
        self.rewriter = rewriter
        RegexMatchReplacer.__init__(self, HttpMatchReplacer.HTTP_REGEX)

    def replace(self, string, m):
        """Return the rewriter's output for the matched URL text."""
        rewritten = self.rewriter.rewrite(string)
        return rewritten
|
||||||
|
|
||||||
|
class CustomMatchReplacer(RegexMatchReplacer):
    """Replacer that swaps every group-1 capture for one fixed string."""

    def __init__(self, matchRegex, replaceStr):
        self.replaceStr = replaceStr
        RegexMatchReplacer.__init__(self, matchRegex)

    def replace(self, string, m):
        """Ignore the captured text and return the configured replacement."""
        fixed = self.replaceStr
        return fixed
|
||||||
|
|
||||||
|
class Replacers:
    """
    Pipeline of replacers, each applied to the output of the previous one.

    >>> replacer.replaceAll('location = "http://example.com/abc.html"')
    'WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"'

    >>> replacer.replaceAll('cool_Location = "http://example.com/abc.html"')
    'cool_Location = "/web/20131010im_/http://example.com/abc.html"'

    >>> replacer.replaceAll('window.location = "http://example.com/abc.html"')
    'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"'
    """

    def __init__(self, replacers):
        # Iterable of objects exposing replaceAll(string); order matters.
        self.replacers = replacers

    def replaceAll(self, string):
        """Feed ``string`` through every replacer in turn and return the result."""
        current = string
        for stage in self.replacers:
            current = stage.replaceAll(current)
        return current
|
||||||
|
|
||||||
|
# Shared fixture used by the doctests in the Replacers docstring: URLs are
# rewritten first, then every captured 'location' or 'domain' token is swapped
# for the fixed string 'WB_wombat_location'.
# NOTE(review): 'domain' is also replaced by 'WB_wombat_location' -- confirm
# whether that is intended.
replacer = Replacers([HttpMatchReplacer(ArchivalUrlRewriter('/20131010im_/http://abc.com/XYZ/', '/web/')), CustomMatchReplacer('[^\w]?(location|domain)', 'WB_wombat_location')])
|
||||||
|
|
||||||
|
# =================================
|
||||||
|
# Rewriter instance passed as the second argument to
# MultiRegexReplacer.replaceAll in that class's doctests.
arw = ArchivalUrlRewriter('/20131010im_/http://abc.com/XYZ/', '/web/')
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class MultiRegexReplacer:
    """
    Apply several ``(regex, op)`` rules to a string in a single regex pass.

    Every rule regex is wrapped in its own capture group and the groups are
    joined into one alternation, so the index of the group that matched
    identifies the rule to apply.  Rule regexes must therefore not contain
    capture groups of their own, or the group/rule indexes drift apart.

    >>> MultiRegexReplacer().replaceAll('location = "http://example.com/abc.html"', arw)
    'WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"'

    >>> MultiRegexReplacer().replaceAll('cool_Location = "http://example.com/abc.html"', arw)
    'cool_Location = "/web/20131010im_/http://example.com/abc.html"'

    >>> MultiRegexReplacer().replaceAll('window.location = "http://example.com/abc.html" document.domain = "anotherdomain.com"', arw)
    'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"'

    """

    DEFAULT_RULES = [
        ('https?:\\\\?/\\\\?/[A-Za-z0-9:_@.-]+', ArchivalUrlRewriter.rewrite),
        ('location', 'WB_wombat_location'),
        ('domain', 'WB_wombat_domain'),
        # NOTE(review): replaceAll emits string ops verbatim, so the '\1'
        # here is written out literally rather than expanded as a
        # backreference -- confirm the intended behaviour.
        (r'some_func\(\)', '/* \\1 */')
    ]

    def __init__(self, rules = None):
        """Compile ``rules`` (a list of ``(regex, op)`` pairs) into one pattern.

        A missing or empty ``rules`` falls back to ``DEFAULT_RULES``.
        """
        if not rules:
            rules = MultiRegexReplacer.DEFAULT_RULES

        # Build regexstr, concatenating regex list
        regexStr = '|'.join('(' + rx + ')' for rx, op in rules)

        # ensure it's not middle of a word, wrap in non-capture group
        regexStr = r'(?<!\w)(?:' + regexStr + ')'

        self.regex = re.compile(regexStr)
        self.rules = rules

    def replaceAll(self, string, rewriter):
        """Return ``string`` with every rule match replaced.

        A callable op is invoked as ``op(rewriter, matched_text)``; any other
        op is converted with ``str`` and inserted as-is.
        """
        last = 0
        parts = []

        for m in self.regex.finditer(string):
            # Locate the alternative that took part in this match.  Groups
            # that did not participate are None; an empty-string capture is
            # still a real match, so test identity rather than truthiness.
            for i, g in enumerate(m.groups()):
                if g is not None:
                    break

            # Add 1 as group 0 is always entire match
            start, end = m.span(i + 1)

            parts.append(string[last:start])

            # i-th rule, 1st index of tuple
            op = self.rules[i][1]

            if callable(op):
                parts.append(op(rewriter, string[start:end]))
            else:
                parts.append(str(op))

            last = end

        parts.append(string[last:])
        return ''.join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class RxRep:
    """
    Single-pass, multi-rule regex rewriter built on ``re.sub``.

    ``rules`` is a sequence of ``(regex, op)`` pairs.  Each regex is wrapped
    in its own capture group and all groups are joined into one alternation,
    so rule regexes must not contain capture groups of their own.  A callable
    ``op`` receives the matched text and returns its replacement; any other
    ``op`` is treated as a prefix to put in front of the matched text.

    >>> test_repl('location = "http://example.com/abc.html"')
    'WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"'

    >>> test_repl('cool_Location = "http://example.com/abc.html"')
    'cool_Location = "/web/20131010im_/http://example.com/abc.html"'

    >>> test_repl('window.location = "http://example.com/abc.html" document.domain = "anotherdomain.com"')
    'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"'

    >>> test_repl('window.location = "http://example.com/abc.html"; some_func(); ')
    'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"; /*some_func()*/; '

    """

    @staticmethod
    def commentOut(string):
        """Wrap ``string`` in a ``/* ... */`` comment."""
        return '/*' + string + '*/'

    @staticmethod
    def removeHttps(string):
        """Replace every occurrence of "https" in ``string`` with "http"."""
        return string.replace("https", "http")

    @staticmethod
    def addPrefix(prefix):
        """Return a function that prepends ``prefix`` to its argument."""
        return lambda string: prefix + string

    # "http" or "https", a colon, two slashes (each optionally preceded by a
    # backslash), then a run of characters from [A-Za-z0-9:_@.-].
    HTTP_MATCH_REGEX = 'https?:\\\\?/\\\\?/[A-Za-z0-9:_@.-]+'

    # Factory applied to non-callable ops to turn them into callables.
    DEFAULT_OP = addPrefix

    def __init__(self, rules):
        """Compile ``rules`` (``(regex, op)`` pairs) into one combined pattern."""
        # Build regexstr, concatenating regex list
        regexStr = '|'.join('(' + rx + ')' for rx, op in rules)

        # ensure it's not middle of a word, wrap in non-capture group
        regexStr = r'(?<!\w)(?:' + regexStr + ')'

        self.regex = re.compile(regexStr)
        self.rules = rules

    def replaceAll(self, string):
        """Return ``string`` with every rule match replaced by its op's output."""
        return self.regex.sub(self.replace, string)

    def replace(self, m):
        """Return the replacement text for match object ``m``.

        Raises ``re.error`` if no rule group took part in the match.
        """
        for group, (_, op) in zip(m.groups(), self.rules):
            # Non-participating groups are None.  An empty-string capture
            # (e.g. from a zero-width rule) is still a match, so compare
            # against None rather than relying on truthiness.
            if group is not None:
                # Non-callable ops are prefixes: wrap them via DEFAULT_OP.
                if not callable(op):
                    op = RxRep.DEFAULT_OP(op)

                return op(group)

        raise re.error('No Match Found for replacement')
|
||||||
|
|
||||||
|
|
||||||
|
class JSRewriter(RxRep):
    """RxRep preloaded with rules for http(s) URLs, ``location`` and ``domain``."""

    def __init__(self, httpPrefix, extra = None):
        """Build the rule list and compile it.

        httpPrefix -- string placed in front of every matched http(s) URL
        extra      -- optional iterable of additional ``(regex, op)`` rules,
                      appended after the built-in ones
        """
        rules = self._createRules(httpPrefix)

        # None instead of a mutable [] default, so no list object is shared
        # between calls.
        if extra is not None:
            rules.extend(extra)

        RxRep.__init__(self, rules)

    def _createRules(self, httpPrefix):
        """Return a fresh list of the built-in ``(regex, prefix)`` rules."""
        return [
            (RxRep.HTTP_MATCH_REGEX, httpPrefix),
            ('location', 'WB_wombat_'),
            ('domain', 'WB_wombat_'),
        ]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    import doctest

    def test_repl(string):
        # Helper called by the doctests in the RxRep docstring; rxrep is
        # looked up when the helper is called, after it is bound below.
        return rxrep.replaceAll(string)

    # Extra rule on top of the JSRewriter built-ins: comment out some_func().
    extra = [('some_func\(\)', RxRep.commentOut)]
    rxrep = JSRewriter('/web/20131010im_/', extra)

    doctest.testmod()
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -4,9 +4,9 @@ import re
|
|||||||
from HTMLParser import HTMLParser
|
from HTMLParser import HTMLParser
|
||||||
from wburlrewriter import ArchivalUrlRewriter
|
from wburlrewriter import ArchivalUrlRewriter
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
# WBHtml --html parser for custom rewriting, also handlers for script and css
|
||||||
# create a subclass and override the handler methods
|
#=================================================================
|
||||||
class WBHtml(HTMLParser):
|
class WBHtml(HTMLParser):
|
||||||
"""
|
"""
|
||||||
>>> WBHtml(rewriter).feed('<HTML><A Href="page.html">Text</a></hTmL>')
|
>>> WBHtml(rewriter).feed('<HTML><A Href="page.html">Text</a></hTmL>')
|
||||||
@ -77,12 +77,7 @@ class WBHtml(HTMLParser):
|
|||||||
return metaRefresh
|
return metaRefresh
|
||||||
# ===========================
|
# ===========================
|
||||||
|
|
||||||
NO_REWRITE_PREFIX = ['javascript:', 'data:', 'mailto:']
|
|
||||||
|
|
||||||
def _rewriteURL(self, value, mod = None):
|
def _rewriteURL(self, value, mod = None):
|
||||||
if any (value.startswith(x) for x in WBHtml.NO_REWRITE_PREFIX):
|
|
||||||
return value
|
|
||||||
|
|
||||||
return self.rewriter.rewrite(value, mod)
|
return self.rewriter.rewrite(value, mod)
|
||||||
|
|
||||||
|
|
||||||
|
@ -33,6 +33,10 @@ class ArchivalUrlRewriter:
|
|||||||
'/2020/http://example.com/other.html'
|
'/2020/http://example.com/other.html'
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
NO_REWRITE_URI_PREFIX = ['javascript:', 'data:', 'mailto:', 'about:']
|
||||||
|
|
||||||
|
PROTOCOLS = ['http://', 'https://', '//', 'mms://', 'rtsp://', 'wais://']
|
||||||
|
|
||||||
def __init__(self, wburl_str, prefix):
|
def __init__(self, wburl_str, prefix):
|
||||||
self.wburl = ArchivalUrl(wburl_str)
|
self.wburl = ArchivalUrl(wburl_str)
|
||||||
self.prefix = prefix
|
self.prefix = prefix
|
||||||
@ -40,23 +44,34 @@ class ArchivalUrlRewriter:
|
|||||||
if self.prefix.endswith('/'):
|
if self.prefix.endswith('/'):
|
||||||
self.prefix = self.prefix[:-1]
|
self.prefix = self.prefix[:-1]
|
||||||
|
|
||||||
def rewrite(self, rel_url, mod = None):
|
def rewrite(self, url, mod = None):
|
||||||
|
# if special protocol, no rewriting at all
|
||||||
|
if any (url.startswith(x) for x in ArchivalUrlRewriter.NO_REWRITE_URI_PREFIX):
|
||||||
|
return url
|
||||||
|
|
||||||
wburl = self.wburl
|
wburl = self.wburl
|
||||||
|
|
||||||
# Disable optimization, doesn't work for external links
|
isAbs = any (url.startswith(x) for x in ArchivalUrlRewriter.PROTOCOLS)
|
||||||
# if relative path or different mod, create rewrite from split up ArchivalUrl
|
|
||||||
#if rel_url.startswith('/') or ('../' in rel_url) or mod:
|
|
||||||
newUrl = urlparse.urljoin(wburl.url, rel_url).replace('../', '')
|
|
||||||
|
|
||||||
if mod is None:
|
# Optimized rewriter for
|
||||||
mod = wburl.mod
|
# -rel urls that don't start with / and don't contain ../ and no special mod
|
||||||
|
if not (isAbs or mod or url.startswith('/') or ('../' in url)):
|
||||||
|
finalUrl = urlparse.urljoin(self.prefix + wburl.original_url, url)
|
||||||
|
|
||||||
final_url = self.prefix + ArchivalUrl.to_str(wburl.type, mod, wburl.timestamp, newUrl)
|
else:
|
||||||
# otherwise, optimize, and join directly with full url
|
# optimize: join if not absolute url, otherwise just use that
|
||||||
#else:
|
if not isAbs:
|
||||||
# final_url = urlparse.urljoin(self.prefix + wburl.original_url, rel_url)
|
newUrl = urlparse.urljoin(wburl.url, url).replace('../', '')
|
||||||
|
else:
|
||||||
|
newUrl = url
|
||||||
|
|
||||||
|
if mod is None:
|
||||||
|
mod = wburl.mod
|
||||||
|
|
||||||
|
finalUrl = self.prefix + ArchivalUrl.to_str(wburl.type, mod, wburl.timestamp, newUrl)
|
||||||
|
|
||||||
|
return finalUrl
|
||||||
|
|
||||||
return final_url
|
|
||||||
|
|
||||||
def setBaseUrl(self, newUrl):
|
def setBaseUrl(self, newUrl):
|
||||||
self.wburl.url = newUrl
|
self.wburl.url = newUrl
|
||||||
|
Loading…
x
Reference in New Issue
Block a user