mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
Standard JS and CSS rewriting now works, using a generic regex rewriter
that can be extended with additional custom rules.
This commit is contained in:
parent
3a896f7cd3
commit
6050ea1ffa
@ -1,156 +1,14 @@
|
|||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
|
import itertools
|
||||||
|
|
||||||
from wburlrewriter import ArchivalUrlRewriter
|
from wburlrewriter import ArchivalUrlRewriter
|
||||||
|
|
||||||
class RegexMatchReplacer:
|
class RegexRewriter:
|
||||||
def __init__(self, regexStr):
|
|
||||||
self.regex = re.compile(regexStr)
|
|
||||||
|
|
||||||
def replaceAll(self, string):
|
|
||||||
last = 0
|
|
||||||
result = ''
|
|
||||||
for m in self.regex.finditer(string):
|
|
||||||
start = m.start(1)
|
|
||||||
end = m.end(1)
|
|
||||||
result += string[last:start]
|
|
||||||
result += self.replace(string[start:end], m)
|
|
||||||
last = end
|
|
||||||
|
|
||||||
result += string[last:]
|
|
||||||
return result
|
|
||||||
|
|
||||||
def replace(self, string, m):
|
|
||||||
return string
|
|
||||||
|
|
||||||
|
|
||||||
class HttpMatchReplacer(RegexMatchReplacer):
|
|
||||||
HTTP_REGEX = "(https?:\\\\?/\\\\?/[A-Za-z0-9:_@.-]+)"
|
|
||||||
|
|
||||||
def __init__(self, rewriter):
|
|
||||||
RegexMatchReplacer.__init__(self, HttpMatchReplacer.HTTP_REGEX)
|
|
||||||
self.rewriter = rewriter
|
|
||||||
|
|
||||||
def replace(self, string, m):
|
|
||||||
return self.rewriter.rewrite(string)
|
|
||||||
|
|
||||||
class CustomMatchReplacer(RegexMatchReplacer):
|
|
||||||
def __init__(self, matchRegex, replaceStr):
|
|
||||||
RegexMatchReplacer.__init__(self, matchRegex)
|
|
||||||
self.replaceStr = replaceStr
|
|
||||||
|
|
||||||
def replace(self, string, m):
|
|
||||||
return self.replaceStr
|
|
||||||
|
|
||||||
class Replacers:
|
|
||||||
"""
|
"""
|
||||||
>>> replacer.replaceAll('location = "http://example.com/abc.html"')
|
# Test https->http converter (other tests below in subclasses)
|
||||||
'WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"'
|
>>> RegexRewriter([(RegexRewriter.HTTPX_MATCH_REGEX, RegexRewriter.removeHttps, 0)]).replaceAll('a = https://example.com; b = http://example.com; c = https://some-url/path/https://embedded.example.com')
|
||||||
|
'a = http://example.com; b = http://example.com; c = http://some-url/path/http://embedded.example.com'
|
||||||
>>> replacer.replaceAll('cool_Location = "http://example.com/abc.html"')
|
|
||||||
'cool_Location = "/web/20131010im_/http://example.com/abc.html"'
|
|
||||||
|
|
||||||
>>> replacer.replaceAll('window.location = "http://example.com/abc.html"')
|
|
||||||
'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"'
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, replacers):
|
|
||||||
self.replacers = replacers
|
|
||||||
|
|
||||||
def replaceAll(self, string):
|
|
||||||
for x in self.replacers:
|
|
||||||
string = x.replaceAll(string)
|
|
||||||
|
|
||||||
return string
|
|
||||||
|
|
||||||
replacer = Replacers([HttpMatchReplacer(ArchivalUrlRewriter('/20131010im_/http://abc.com/XYZ/', '/web/')), CustomMatchReplacer('[^\w]?(location|domain)', 'WB_wombat_location')])
|
|
||||||
|
|
||||||
# =================================
|
|
||||||
arw = ArchivalUrlRewriter('/20131010im_/http://abc.com/XYZ/', '/web/')
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class MultiRegexReplacer:
|
|
||||||
"""
|
|
||||||
>>> MultiRegexReplacer().replaceAll('location = "http://example.com/abc.html"', arw)
|
|
||||||
'WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"'
|
|
||||||
|
|
||||||
>>> MultiRegexReplacer().replaceAll('cool_Location = "http://example.com/abc.html"', arw)
|
|
||||||
'cool_Location = "/web/20131010im_/http://example.com/abc.html"'
|
|
||||||
|
|
||||||
>>> MultiRegexReplacer().replaceAll('window.location = "http://example.com/abc.html" document.domain = "anotherdomain.com"', arw)
|
|
||||||
'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"'
|
|
||||||
|
|
||||||
"""
|
|
||||||
|
|
||||||
DEFAULT_RULES = [
|
|
||||||
('https?:\\\\?/\\\\?/[A-Za-z0-9:_@.-]+', ArchivalUrlRewriter.rewrite),
|
|
||||||
('location', 'WB_wombat_location'),
|
|
||||||
('domain', 'WB_wombat_domain'),
|
|
||||||
('some_func\(\)', '/* \\1 */')
|
|
||||||
]
|
|
||||||
|
|
||||||
def __init__(self, rules = None):
|
|
||||||
if not rules:
|
|
||||||
rules = MultiRegexReplacer.DEFAULT_RULES
|
|
||||||
|
|
||||||
# Build regexstr, concatenating regex list
|
|
||||||
regexStr = '|'.join(['(' + rx + ')' for rx, op in rules])
|
|
||||||
|
|
||||||
# ensure it's not middle of a word, wrap in non-capture group
|
|
||||||
regexStr = '(?<!\w)(?:' + regexStr + ')'
|
|
||||||
|
|
||||||
self.regex = re.compile(regexStr)
|
|
||||||
self.rules = rules
|
|
||||||
|
|
||||||
def replaceAll(self, string, rewriter):
|
|
||||||
last = 0
|
|
||||||
result = ''
|
|
||||||
|
|
||||||
for m in self.regex.finditer(string):
|
|
||||||
|
|
||||||
groups = m.groups()
|
|
||||||
|
|
||||||
numGroups = len(groups)
|
|
||||||
|
|
||||||
for g, i in zip(groups, range(numGroups)):
|
|
||||||
if g:
|
|
||||||
break
|
|
||||||
|
|
||||||
# Add 1 as group 0 is always entire match
|
|
||||||
start = m.start(i + 1)
|
|
||||||
end = m.end(i + 1)
|
|
||||||
|
|
||||||
result += string[last:start]
|
|
||||||
|
|
||||||
# i-th rule, 1st index of tuple
|
|
||||||
op = self.rules[i][1]
|
|
||||||
|
|
||||||
if hasattr(op, '__call__'):
|
|
||||||
result += op(rewriter, string[start:end])
|
|
||||||
else:
|
|
||||||
result += str(op)
|
|
||||||
|
|
||||||
last = end
|
|
||||||
|
|
||||||
result += string[last:]
|
|
||||||
return result
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class RxRep:
|
|
||||||
"""
|
|
||||||
>>> test_repl('location = "http://example.com/abc.html"')
|
|
||||||
'WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"'
|
|
||||||
|
|
||||||
>>> test_repl('cool_Location = "http://example.com/abc.html"')
|
|
||||||
'cool_Location = "/web/20131010im_/http://example.com/abc.html"'
|
|
||||||
|
|
||||||
>>> test_repl('window.location = "http://example.com/abc.html" document.domain = "anotherdomain.com"')
|
|
||||||
'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"'
|
|
||||||
|
|
||||||
>>> test_repl('window.location = "http://example.com/abc.html"; some_func(); ')
|
|
||||||
'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"; /*some_func()*/; '
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@ -165,7 +23,11 @@ class RxRep:
|
|||||||
def addPrefix(prefix):
|
def addPrefix(prefix):
|
||||||
return lambda string: prefix + string
|
return lambda string: prefix + string
|
||||||
|
|
||||||
HTTP_MATCH_REGEX = 'https?:\\\\?/\\\\?/[A-Za-z0-9:_@.-]+'
|
@staticmethod
|
||||||
|
def archivalRewrite(rewriter):
|
||||||
|
return lambda x: rewriter.rewrite(x)
|
||||||
|
|
||||||
|
HTTPX_MATCH_REGEX = 'https?:\\\\?/\\\\?/[A-Za-z0-9:_@.-]+'
|
||||||
|
|
||||||
DEFAULT_OP = addPrefix
|
DEFAULT_OP = addPrefix
|
||||||
|
|
||||||
@ -174,55 +36,145 @@ class RxRep:
|
|||||||
#rules = self.createRules(httpPrefix)
|
#rules = self.createRules(httpPrefix)
|
||||||
|
|
||||||
# Build regexstr, concatenating regex list
|
# Build regexstr, concatenating regex list
|
||||||
regexStr = '|'.join(['(' + rx + ')' for rx, op in rules])
|
regexStr = '|'.join(['(' + rx + ')' for rx, op, count in rules])
|
||||||
|
|
||||||
# ensure it's not middle of a word, wrap in non-capture group
|
# ensure it's not middle of a word, wrap in non-capture group
|
||||||
regexStr = '(?<!\w)(?:' + regexStr + ')'
|
regexStr = '(?<!\w)(?:' + regexStr + ')'
|
||||||
|
|
||||||
self.regex = re.compile(regexStr)
|
self.regex = re.compile(regexStr, re.M)
|
||||||
self.rules = rules
|
self.rules = rules
|
||||||
|
|
||||||
def replaceAll(self, string):
|
def replaceAll(self, string):
|
||||||
return self.regex.sub(lambda x: self.replace(x), string)
|
return self.regex.sub(lambda x: self.replace(x), string)
|
||||||
|
|
||||||
def replace(self, m):
|
def replace(self, m):
|
||||||
for group, (_, op) in zip(m.groups(), self.rules):
|
i = 0
|
||||||
if group:
|
for _, op, count in self.rules:
|
||||||
# Custom func
|
i += 1
|
||||||
if not hasattr(op, '__call__'):
|
|
||||||
op = RxRep.DEFAULT_OP(op)
|
|
||||||
|
|
||||||
return op(group)
|
fullM = i
|
||||||
|
while count > 0:
|
||||||
|
i += 1
|
||||||
|
count -= 1
|
||||||
|
|
||||||
raise re.error('No Match Found for replacement')
|
if not m.group(i):
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Custom func
|
||||||
|
if not hasattr(op, '__call__'):
|
||||||
|
op = RegexRewriter.DEFAULT_OP(op)
|
||||||
|
|
||||||
|
result = op(m.group(i))
|
||||||
|
|
||||||
|
# if extracting partial match
|
||||||
|
if i != fullM:
|
||||||
|
result = m.string[m.start(fullM):m.start(i)] + result + m.string[m.end(i):m.end(fullM)]
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
class JSRewriter(RxRep):
|
|
||||||
|
class JSRewriter(RegexRewriter):
|
||||||
|
"""
|
||||||
|
>>> test_js('location = "http://example.com/abc.html"')
|
||||||
|
'WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"'
|
||||||
|
|
||||||
|
>>> test_js('cool_Location = "http://example.com/abc.html"')
|
||||||
|
'cool_Location = "/web/20131010im_/http://example.com/abc.html"'
|
||||||
|
|
||||||
|
>>> test_js('window.location = "http://example.com/abc.html" document.domain = "anotherdomain.com"')
|
||||||
|
'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"'
|
||||||
|
|
||||||
|
# custom rules added
|
||||||
|
>>> test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.commentOut, 0)])
|
||||||
|
'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"; /*some_func(); */'
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(self, httpPrefix, extra = []):
|
def __init__(self, httpPrefix, extra = []):
|
||||||
rules = self._createRules(httpPrefix)
|
rules = self._createRules(httpPrefix)
|
||||||
rules.extend(extra)
|
rules.extend(extra)
|
||||||
|
|
||||||
RxRep.__init__(self, rules)
|
RegexRewriter.__init__(self, rules)
|
||||||
|
|
||||||
|
|
||||||
def _createRules(self, httpPrefix):
|
def _createRules(self, httpPrefix):
|
||||||
return [
|
return [
|
||||||
(RxRep.HTTP_MATCH_REGEX, httpPrefix),
|
(RegexRewriter.HTTPX_MATCH_REGEX, httpPrefix, 0),
|
||||||
('location', 'WB_wombat_'),
|
('location|domain', 'WB_wombat_', 0),
|
||||||
('domain', 'WB_wombat_'),
|
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
class CSSRewriter(RegexRewriter):
|
||||||
|
r"""
|
||||||
|
>>> test_css("background: url('/some/path.html')")
|
||||||
|
"background: url('/web/20131010im_/http://example.com/some/path.html')"
|
||||||
|
|
||||||
|
>>> test_css("background: url('../path.html')")
|
||||||
|
"background: url('/web/20131010im_/http://example.com/path.html')"
|
||||||
|
|
||||||
|
>>> test_css("background: url(\"http://domain.com/path.html\")")
|
||||||
|
'background: url("/web/20131010im_/http://domain.com/path.html")'
|
||||||
|
|
||||||
|
>>> test_css("background: url(file.jpeg)")
|
||||||
|
'background: url(/web/20131010im_/http://example.com/file.jpeg)'
|
||||||
|
|
||||||
|
>>> test_css("background: url('')")
|
||||||
|
"background: url('')"
|
||||||
|
|
||||||
|
>>> test_css("background: url (\"weirdpath\')")
|
||||||
|
'background: url ("/web/20131010im_/http://example.com/weirdpath\')'
|
||||||
|
|
||||||
|
>>> test_css("@import url ('path.css')")
|
||||||
|
"@import url ('/web/20131010im_/http://example.com/path.css')"
|
||||||
|
|
||||||
|
>>> test_css("@import url('path.css')")
|
||||||
|
"@import url('/web/20131010im_/http://example.com/path.css')"
|
||||||
|
|
||||||
|
>>> test_css("@import ( 'path.css')")
|
||||||
|
"@import ( '/web/20131010im_/http://example.com/path.css')"
|
||||||
|
|
||||||
|
>>> test_css("@import \"path.css\"")
|
||||||
|
'@import "/web/20131010im_/http://example.com/path.css"'
|
||||||
|
|
||||||
|
>>> test_css("@import ('../path.css\"")
|
||||||
|
'@import (\'/web/20131010im_/http://example.com/path.css"'
|
||||||
|
|
||||||
|
>>> test_css("@import ('../url.css\"")
|
||||||
|
'@import (\'/web/20131010im_/http://example.com/url.css"'
|
||||||
|
|
||||||
|
>>> test_css("@import (\"url.css\")")
|
||||||
|
'@import ("/web/20131010im_/http://example.com/url.css")'
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, rewriter):
|
||||||
|
rules = self._createRules(rewriter)
|
||||||
|
|
||||||
|
RegexRewriter.__init__(self, rules)
|
||||||
|
|
||||||
|
|
||||||
|
def _createRules(self, rewriter):
|
||||||
|
return [
|
||||||
|
("url\\s*\\(\\s*[\\\\\"']*([^'\"]+)[\\\\\"']*\\s*\\)", RegexRewriter.archivalRewrite(rewriter), 1),
|
||||||
|
("@import\\s+(?!url)\\(?\\s*['\"]?(?!url[\\s\\(])([\w.:/\\\\-]+)", RegexRewriter.archivalRewrite(rewriter), 1),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
import doctest
|
import doctest
|
||||||
|
|
||||||
extra = [('some_func\(\)', RxRep.commentOut)]
|
rwPrefix = '/web/20131010im_/'
|
||||||
|
|
||||||
|
arcrw = ArchivalUrlRewriter('/20131010im_/http://example.com/', '/web/')
|
||||||
|
|
||||||
|
def test_js(string, extra = []):
|
||||||
|
return JSRewriter(rwPrefix, extra).replaceAll(string)
|
||||||
|
|
||||||
|
def test_css(string):
|
||||||
|
return CSSRewriter(arcrw).replaceAll(string)
|
||||||
|
|
||||||
rxrep = JSRewriter('/web/20131010im_/', extra)
|
|
||||||
|
|
||||||
def test_repl(string):
|
|
||||||
return rxrep.replaceAll(string)
|
|
||||||
|
|
||||||
doctest.testmod()
|
doctest.testmod()
|
||||||
|
|
||||||
|
@ -3,12 +3,13 @@ import re
|
|||||||
|
|
||||||
from HTMLParser import HTMLParser
|
from HTMLParser import HTMLParser
|
||||||
from wburlrewriter import ArchivalUrlRewriter
|
from wburlrewriter import ArchivalUrlRewriter
|
||||||
|
from regexmatch import JSRewriter, CSSRewriter
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
# WBHtml --html parser for custom rewriting, also handlers for script and css
|
# WBHtml --html parser for custom rewriting, also handlers for script and css
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class WBHtml(HTMLParser):
|
class WBHtml(HTMLParser):
|
||||||
"""
|
r"""
|
||||||
>>> WBHtml(rewriter).feed('<HTML><A Href="page.html">Text</a></hTmL>')
|
>>> WBHtml(rewriter).feed('<HTML><A Href="page.html">Text</a></hTmL>')
|
||||||
<HTML><a href="/web/20131226101010/http://example.com/some/path/page.html">Text</a></html>
|
<HTML><a href="/web/20131226101010/http://example.com/some/path/page.html">Text</a></html>
|
||||||
|
|
||||||
@ -20,6 +21,18 @@ class WBHtml(HTMLParser):
|
|||||||
|
|
||||||
>>> WBHtml(rewriter).feed('<META http-equiv="refresh" content="10; URL=/abc/def.html">')
|
>>> WBHtml(rewriter).feed('<META http-equiv="refresh" content="10; URL=/abc/def.html">')
|
||||||
<meta http-equiv="refresh" content="10; URL=/web/20131226101010/http://example.com/abc/def.html">
|
<meta http-equiv="refresh" content="10; URL=/web/20131226101010/http://example.com/abc/def.html">
|
||||||
|
|
||||||
|
>>> WBHtml(rewriter).feed('<script>window.location = "http://example.com/a/b/c.html"</script>')
|
||||||
|
<script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</script>
|
||||||
|
|
||||||
|
>>> WBHtml(rewriter).feed('<script>/*<![CDATA[*/window.location = "http://example.com/a/b/c.html;/*]]>*/"</script>')
|
||||||
|
<script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html;/*]]>*/"</script>
|
||||||
|
|
||||||
|
>>> WBHtml(rewriter).feed('<div style="background: url(\'abc.html\')" onclick="location = \'redirect.html\'"></div>')
|
||||||
|
<div style="background: url('/web/20131226101010/http://example.com/some/path/abc.html')" onclick="WB_wombat_location = 'redirect.html'"></div>
|
||||||
|
|
||||||
|
>>> WBHtml(rewriter).feed('<style>@import "styles.css" .a { font-face: url(\'myfont.ttf\') }</style>')
|
||||||
|
<style>@import "/web/20131226101010/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010/http://example.com/some/path/myfont.ttf') }</style>
|
||||||
"""
|
"""
|
||||||
|
|
||||||
REWRITE_TAGS = {
|
REWRITE_TAGS = {
|
||||||
@ -50,7 +63,7 @@ class WBHtml(HTMLParser):
|
|||||||
'data-uri' : ''},
|
'data-uri' : ''},
|
||||||
}
|
}
|
||||||
|
|
||||||
STATE_TAGS = ['head', 'body', 'script', 'style']
|
STATE_TAGS = ['script', 'style']
|
||||||
|
|
||||||
|
|
||||||
def __init__(self, rewriter, outstream = None):
|
def __init__(self, rewriter, outstream = None):
|
||||||
@ -60,6 +73,9 @@ class WBHtml(HTMLParser):
|
|||||||
self._wbParseContext = None
|
self._wbParseContext = None
|
||||||
self.out = outstream if outstream else sys.stdout
|
self.out = outstream if outstream else sys.stdout
|
||||||
|
|
||||||
|
self.jsRewriter = JSRewriter(rewriter.getAbsUrl())
|
||||||
|
self.cssRewriter = CSSRewriter(rewriter)
|
||||||
|
|
||||||
|
|
||||||
# ===========================
|
# ===========================
|
||||||
META_REFRESH_REGEX = re.compile('^[\\d.]+\\s*;\\s*url\\s*=\\s*(.+?)\\s*$', re.IGNORECASE | re.MULTILINE)
|
META_REFRESH_REGEX = re.compile('^[\\d.]+\\s*;\\s*url\\s*=\\s*(.+?)\\s*$', re.IGNORECASE | re.MULTILINE)
|
||||||
@ -82,10 +98,10 @@ class WBHtml(HTMLParser):
|
|||||||
|
|
||||||
|
|
||||||
def _rewriteCSS(self, cssContent):
|
def _rewriteCSS(self, cssContent):
|
||||||
return cssContent
|
return self.cssRewriter.replaceAll(cssContent)
|
||||||
|
|
||||||
def _rewriteScript(self, scriptContent):
|
def _rewriteScript(self, scriptContent):
|
||||||
return scriptContent
|
return self.jsRewriter.replaceAll(scriptContent)
|
||||||
|
|
||||||
def hasAttr(self, tagAttrs, attr):
|
def hasAttr(self, tagAttrs, attr):
|
||||||
name, value = attr
|
name, value = attr
|
||||||
@ -95,13 +111,6 @@ class WBHtml(HTMLParser):
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
def rewriteTagAttrs(self, tag, tagAttrs, isStartEnd):
|
def rewriteTagAttrs(self, tag, tagAttrs, isStartEnd):
|
||||||
handler = WBHtml.REWRITE_TAGS.get(tag)
|
|
||||||
if not handler:
|
|
||||||
handler = WBHtml.REWRITE_TAGS.get('')
|
|
||||||
|
|
||||||
if not handler:
|
|
||||||
return False
|
|
||||||
|
|
||||||
# special case: base tag
|
# special case: base tag
|
||||||
if (tag == 'base'):
|
if (tag == 'base'):
|
||||||
newBase = tagAttrs.get('href')
|
newBase = tagAttrs.get('href')
|
||||||
@ -109,9 +118,17 @@ class WBHtml(HTMLParser):
|
|||||||
self.rewriter.setBaseUrl(newBase[1])
|
self.rewriter.setBaseUrl(newBase[1])
|
||||||
|
|
||||||
# special case: script or style parse context
|
# special case: script or style parse context
|
||||||
elif ((tag == 'script') or (tag == 'style')) and (self._wbParseContext == None):
|
elif (tag in WBHtml.STATE_TAGS) and (self._wbParseContext == None):
|
||||||
self._wbParseContext = tag
|
self._wbParseContext = tag
|
||||||
|
|
||||||
|
# attr rewriting
|
||||||
|
handler = WBHtml.REWRITE_TAGS.get(tag)
|
||||||
|
if not handler:
|
||||||
|
handler = WBHtml.REWRITE_TAGS.get('')
|
||||||
|
|
||||||
|
if not handler:
|
||||||
|
return False
|
||||||
|
|
||||||
self.out.write('<' + tag)
|
self.out.write('<' + tag)
|
||||||
|
|
||||||
for attr in tagAttrs:
|
for attr in tagAttrs:
|
||||||
|
@ -31,7 +31,13 @@ class ArchivalUrlRewriter:
|
|||||||
|
|
||||||
>>> test_rewrite('../../other.html', '/2020/http://example.com/index.html', '')
|
>>> test_rewrite('../../other.html', '/2020/http://example.com/index.html', '')
|
||||||
'/2020/http://example.com/other.html'
|
'/2020/http://example.com/other.html'
|
||||||
"""
|
|
||||||
|
>>> test_rewrite('', '/20131010010203/http://example.com/file.html', '/web/')
|
||||||
|
'/web/20131010010203/http://example.com/file.html'
|
||||||
|
|
||||||
|
>>> ArchivalUrlRewriter('/19960708im_/http://domain.example.com/path.txt', '/abc/').getAbsUrl()
|
||||||
|
'/abc/19960708im_/'
|
||||||
|
"""
|
||||||
|
|
||||||
NO_REWRITE_URI_PREFIX = ['javascript:', 'data:', 'mailto:', 'about:']
|
NO_REWRITE_URI_PREFIX = ['javascript:', 'data:', 'mailto:', 'about:']
|
||||||
|
|
||||||
@ -72,6 +78,9 @@ class ArchivalUrlRewriter:
|
|||||||
|
|
||||||
return finalUrl
|
return finalUrl
|
||||||
|
|
||||||
|
def getAbsUrl(self, url = ''):
|
||||||
|
return self.prefix + ArchivalUrl.to_str(self.wburl.type, self.wburl.mod, self.wburl.timestamp, url)
|
||||||
|
|
||||||
|
|
||||||
def setBaseUrl(self, newUrl):
|
def setBaseUrl(self, newUrl):
|
||||||
self.wburl.url = newUrl
|
self.wburl.url = newUrl
|
||||||
|
Loading…
x
Reference in New Issue
Block a user