mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
move no-rewrite prefixes down to ArchivalUrlRewriter (was in html parser)
Add new general regex match work (several attempts, though the last one is simplest/best!)
This commit is contained in:
parent
37e57f7013
commit
3a896f7cd3
230
pywb/regexmatch.py
Normal file
230
pywb/regexmatch.py
Normal file
@ -0,0 +1,230 @@
|
||||
import re
|
||||
import sys
|
||||
from wburlrewriter import ArchivalUrlRewriter
|
||||
|
||||
class RegexMatchReplacer:
    """Scan a string with a regex and rewrite each capture-group-1 match.

    Subclasses override replace() to decide what each matched span
    becomes; the base implementation leaves the text unchanged.
    """

    def __init__(self, regexStr):
        self.regex = re.compile(regexStr)

    def replaceAll(self, string):
        """Return string with every group-1 match run through replace()."""
        pieces = []
        pos = 0
        for match in self.regex.finditer(string):
            begin = match.start(1)
            stop = match.end(1)
            pieces.append(string[pos:begin])
            pieces.append(self.replace(string[begin:stop], match))
            pos = stop
        pieces.append(string[pos:])
        return ''.join(pieces)

    def replace(self, string, m):
        # Identity replacement; subclasses supply real behavior.
        return string
|
||||
|
||||
|
||||
class HttpMatchReplacer(RegexMatchReplacer):
    """Rewrite http(s) URLs found in a string via an ArchivalUrlRewriter."""

    # Matches http:// or https://, tolerating a backslash before each
    # slash (as seen inside JS string literals), then host characters.
    HTTP_REGEX = "(https?:\\\\?/\\\\?/[A-Za-z0-9:_@.-]+)"

    def __init__(self, rewriter):
        self.rewriter = rewriter
        RegexMatchReplacer.__init__(self, HttpMatchReplacer.HTTP_REGEX)

    def replace(self, string, m):
        # Delegate each matched URL to the archival rewriter.
        return self.rewriter.rewrite(string)
|
||||
|
||||
class CustomMatchReplacer(RegexMatchReplacer):
    """Replace every match of a caller-supplied regex with a fixed string."""

    def __init__(self, matchRegex, replaceStr):
        self.replaceStr = replaceStr
        RegexMatchReplacer.__init__(self, matchRegex)

    def replace(self, string, m):
        # The matched text itself is ignored; always emit the fixed string.
        return self.replaceStr
|
||||
|
||||
class Replacers:
    """
    >>> replacer.replaceAll('location = "http://example.com/abc.html"')
    'WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"'

    >>> replacer.replaceAll('cool_Location = "http://example.com/abc.html"')
    'cool_Location = "/web/20131010im_/http://example.com/abc.html"'

    >>> replacer.replaceAll('window.location = "http://example.com/abc.html"')
    'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"'
    """

    def __init__(self, replacers):
        # Replacers run in list order; later ones see earlier output.
        self.replacers = replacers

    def replaceAll(self, string):
        """Pipe string through every replacer, feeding each the last result."""
        result = string
        for rep in self.replacers:
            result = rep.replaceAll(result)
        return result
|
||||
|
||||
# Module-level fixture for the Replacers doctests above: rewrites http(s)
# URLs into the archival prefix and renames location/domain references.
replacer = Replacers([HttpMatchReplacer(ArchivalUrlRewriter('/20131010im_/http://abc.com/XYZ/', '/web/')), CustomMatchReplacer('[^\w]?(location|domain)', 'WB_wombat_location')])

# =================================
# Shared rewriter fixture used by the MultiRegexReplacer doctests below.
arw = ArchivalUrlRewriter('/20131010im_/http://abc.com/XYZ/', '/web/')
|
||||
|
||||
|
||||
|
||||
class MultiRegexReplacer:
    """
    >>> MultiRegexReplacer().replaceAll('location = "http://example.com/abc.html"', arw)
    'WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"'

    >>> MultiRegexReplacer().replaceAll('cool_Location = "http://example.com/abc.html"', arw)
    'cool_Location = "/web/20131010im_/http://example.com/abc.html"'

    >>> MultiRegexReplacer().replaceAll('window.location = "http://example.com/abc.html" document.domain = "anotherdomain.com"', arw)
    'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"'

    """

    # (regex, op) pairs: op is either a literal replacement string or a
    # callable invoked as op(rewriter, matched_text).
    DEFAULT_RULES = [
        ('https?:\\\\?/\\\\?/[A-Za-z0-9:_@.-]+', ArchivalUrlRewriter.rewrite),
        ('location', 'WB_wombat_location'),
        ('domain', 'WB_wombat_domain'),
        ('some_func\(\)', '/* \\1 */')
    ]

    def __init__(self, rules = None):
        if not rules:
            rules = MultiRegexReplacer.DEFAULT_RULES

        # Build regexstr, concatenating regex list; one capture group per
        # rule so replaceAll() can tell which rule matched.
        regexStr = '|'.join(['(' + rx + ')' for rx, op in rules])

        # ensure it's not middle of a word, wrap in non-capture group
        regexStr = '(?<!\w)(?:' + regexStr + ')'

        self.regex = re.compile(regexStr)
        self.rules = rules

    def replaceAll(self, string, rewriter):
        """Apply the first matching rule's op to each match in string.

        rewriter is forwarded to callable ops (e.g. ArchivalUrlRewriter
        bound via its unbound rewrite method in DEFAULT_RULES).
        """
        last = 0
        # Collect pieces and join once -- avoids quadratic += concatenation.
        result = []

        for m in self.regex.finditer(string):
            # Find which rule's capture group participated in this match.
            # "is not None" (rather than truthiness) so a zero-length group
            # match still identifies the correct rule.
            for i, g in enumerate(m.groups()):
                if g is not None:
                    break

            # Add 1 as group 0 is always entire match
            start = m.start(i + 1)
            end = m.end(i + 1)

            result.append(string[last:start])

            # i-th rule, 1st index of tuple
            op = self.rules[i][1]

            if hasattr(op, '__call__'):
                result.append(op(rewriter, string[start:end]))
            else:
                result.append(str(op))

            last = end

        result.append(string[last:])
        return ''.join(result)
|
||||
|
||||
|
||||
|
||||
class RxRep:
    """
    >>> test_repl('location = "http://example.com/abc.html"')
    'WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"'

    >>> test_repl('cool_Location = "http://example.com/abc.html"')
    'cool_Location = "/web/20131010im_/http://example.com/abc.html"'

    >>> test_repl('window.location = "http://example.com/abc.html" document.domain = "anotherdomain.com"')
    'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"'

    >>> test_repl('window.location = "http://example.com/abc.html"; some_func(); ')
    'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"; /*some_func()*/; '

    """

    @staticmethod
    def commentOut(string):
        # Wrap the matched text in a JS block comment.
        return '/*' + string + '*/'

    @staticmethod
    def removeHttps(string):
        # Swap https for http throughout the matched text.
        return string.replace("https", "http")

    @staticmethod
    def addPrefix(prefix):
        # Build an op that prepends a fixed prefix to the matched text.
        return lambda string: prefix + string

    HTTP_MATCH_REGEX = 'https?:\\\\?/\\\\?/[A-Za-z0-9:_@.-]+'

    # Plain-string ops are promoted to prefix-prepending callables.
    DEFAULT_OP = addPrefix


    def __init__(self, rules):
        # Concatenate the rule regexes into one alternation; one capture
        # group per rule lets replace() recover which rule matched.
        grouped = ['(' + rx + ')' for rx, op in rules]
        regexStr = '|'.join(grouped)

        # ensure it's not middle of a word, wrap in non-capture group
        regexStr = '(?<!\w)(?:' + regexStr + ')'

        self.regex = re.compile(regexStr)
        self.rules = rules

    def replaceAll(self, string):
        """Rewrite every rule match in string in a single regex pass."""
        return self.regex.sub(self.replace, string)

    def replace(self, m):
        for group, (_, op) in zip(m.groups(), self.rules):
            if not group:
                continue
            # Custom func runs as-is; string ops become prefix functions.
            if not hasattr(op, '__call__'):
                op = RxRep.DEFAULT_OP(op)
            return op(group)

        raise re.error('No Match Found for replacement')
|
||||
|
||||
|
||||
class JSRewriter(RxRep):
    """RxRep preconfigured for JavaScript: archival-prefixes URLs and
    renames location/domain to their WB_wombat_ shadows; callers may
    append extra (regex, op) rules."""

    def __init__(self, httpPrefix, extra = []):
        rules = self._createRules(httpPrefix) + list(extra)
        RxRep.__init__(self, rules)


    def _createRules(self, httpPrefix):
        # String ops act as prefixes (see RxRep.DEFAULT_OP), so these
        # rules prepend rather than wholesale-replace the matched text.
        return [
            (RxRep.HTTP_MATCH_REGEX, httpPrefix),
            ('location', 'WB_wombat_'),
            ('domain', 'WB_wombat_'),
        ]
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import doctest

    # Extra rule: comment out calls to some_func() entirely.
    extra = [('some_func\(\)', RxRep.commentOut)]

    # JS rewriter fixture exercised by the RxRep doctests.
    rxrep = JSRewriter('/web/20131010im_/', extra)

    # Helper referenced from the doctest examples in RxRep's docstring.
    def test_repl(string):
        return rxrep.replaceAll(string)

    doctest.testmod()
|
||||
|
||||
|
||||
|
@ -4,9 +4,9 @@ import re
|
||||
from HTMLParser import HTMLParser
|
||||
from wburlrewriter import ArchivalUrlRewriter
|
||||
|
||||
|
||||
|
||||
# create a subclass and override the handler methods
|
||||
#=================================================================
|
||||
# WBHtml --html parser for custom rewriting, also handlers for script and css
|
||||
#=================================================================
|
||||
class WBHtml(HTMLParser):
|
||||
"""
|
||||
>>> WBHtml(rewriter).feed('<HTML><A Href="page.html">Text</a></hTmL>')
|
||||
@ -77,12 +77,7 @@ class WBHtml(HTMLParser):
|
||||
return metaRefresh
|
||||
# ===========================
|
||||
|
||||
NO_REWRITE_PREFIX = ['javascript:', 'data:', 'mailto:']
|
||||
|
||||
def _rewriteURL(self, value, mod = None):
|
||||
if any (value.startswith(x) for x in WBHtml.NO_REWRITE_PREFIX):
|
||||
return value
|
||||
|
||||
return self.rewriter.rewrite(value, mod)
|
||||
|
||||
|
||||
|
@ -33,6 +33,10 @@ class ArchivalUrlRewriter:
|
||||
'/2020/http://example.com/other.html'
|
||||
"""
|
||||
|
||||
NO_REWRITE_URI_PREFIX = ['javascript:', 'data:', 'mailto:', 'about:']
|
||||
|
||||
PROTOCOLS = ['http://', 'https://', '//', 'mms://', 'rtsp://', 'wais://']
|
||||
|
||||
def __init__(self, wburl_str, prefix):
|
||||
self.wburl = ArchivalUrl(wburl_str)
|
||||
self.prefix = prefix
|
||||
@ -40,23 +44,34 @@ class ArchivalUrlRewriter:
|
||||
if self.prefix.endswith('/'):
|
||||
self.prefix = self.prefix[:-1]
|
||||
|
||||
def rewrite(self, rel_url, mod = None):
|
||||
def rewrite(self, url, mod = None):
|
||||
# if special protocol, no rewriting at all
|
||||
if any (url.startswith(x) for x in ArchivalUrlRewriter.NO_REWRITE_URI_PREFIX):
|
||||
return url
|
||||
|
||||
wburl = self.wburl
|
||||
|
||||
# Disable optimization, doesn't work for external links
|
||||
# if relative path or different mod, create rewrite from split up ArchivalUrl
|
||||
#if rel_url.startswith('/') or ('../' in rel_url) or mod:
|
||||
newUrl = urlparse.urljoin(wburl.url, rel_url).replace('../', '')
|
||||
isAbs = any (url.startswith(x) for x in ArchivalUrlRewriter.PROTOCOLS)
|
||||
|
||||
if mod is None:
|
||||
mod = wburl.mod
|
||||
# Optimized rewriter for
|
||||
# -rel urls that don't start with / and don't contain ../ and no special mod
|
||||
if not (isAbs or mod or url.startswith('/') or ('../' in url)):
|
||||
finalUrl = urlparse.urljoin(self.prefix + wburl.original_url, url)
|
||||
|
||||
final_url = self.prefix + ArchivalUrl.to_str(wburl.type, mod, wburl.timestamp, newUrl)
|
||||
# otherwise, optimize, and join directly with full url
|
||||
#else:
|
||||
# final_url = urlparse.urljoin(self.prefix + wburl.original_url, rel_url)
|
||||
else:
|
||||
# optimize: join if not absolute url, otherwise just use that
|
||||
if not isAbs:
|
||||
newUrl = urlparse.urljoin(wburl.url, url).replace('../', '')
|
||||
else:
|
||||
newUrl = url
|
||||
|
||||
if mod is None:
|
||||
mod = wburl.mod
|
||||
|
||||
finalUrl = self.prefix + ArchivalUrl.to_str(wburl.type, mod, wburl.timestamp, newUrl)
|
||||
|
||||
return finalUrl
|
||||
|
||||
return final_url
|
||||
|
||||
def setBaseUrl(self, newUrl):
|
||||
self.wburl.url = newUrl
|
||||
|
Loading…
x
Reference in New Issue
Block a user