mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-28 00:25:21 +01:00
244 lines
8.1 KiB
Python
244 lines
8.1 KiB
Python
import re
|
|
import sys
|
|
import itertools
|
|
|
|
from url_rewriter import UrlRewriter
|
|
|
|
#=================================================================
|
|
class RegexRewriter:
    """
    Rewrite text by applying a list of regex rules in a single pass.

    Each rule is a ``(regex, op, count)`` tuple:

    * ``regex`` -- pattern string; every rule is wrapped in its own capture
      group and all rules are joined into one alternation.
    * ``op`` -- a callable that receives the matched text and returns its
      replacement, or a non-callable value (e.g. a string) that is turned
      into a "prepend this prefix" operation via ``DEFAULT_OP``.
    * ``count`` -- number of capture groups the regex itself contains.
      When non-zero, only the last of those groups is handed to ``op``;
      the remainder of the rule's match is kept around the result.

    A match is only attempted where the preceding character is not a word
    character (negative look-behind added in ``__init__``).

    # Test https->http converter (other tests below in subclasses)
    >>> RegexRewriter([(RegexRewriter.HTTPX_MATCH_STR, RegexRewriter.remove_https, 0)]).rewrite('a = https://example.com; b = http://example.com; c = https://some-url/path/https://embedded.example.com')
    'a = http://example.com; b = http://example.com; c = http://some-url/path/http://embedded.example.com'

    # A match whose target sub-group is empty is left untouched
    >>> RegexRewriter([('foo(bar)?', 'X-', 1)]).rewrite('a foo b foobar')
    'a foo b fooX-bar'
    """

    @staticmethod
    def comment_out(string):
        """Return ``string`` wrapped in a ``/* ... */`` comment."""
        return '/*' + string + '*/'

    @staticmethod
    def remove_https(string):
        """Return ``string`` with every "https" replaced by "http"."""
        return string.replace("https", "http")

    @staticmethod
    def add_prefix(prefix):
        """Return an op that prepends ``prefix`` to the matched text."""
        return lambda string: prefix + string

    @staticmethod
    def archival_rewrite(rewriter):
        """Return an op that delegates to ``rewriter.rewrite``."""
        return lambda x: rewriter.rewrite(x)

    @staticmethod
    def replacer(string):
        """Return an op that ignores the match and always yields ``string``."""
        return lambda x: string

    # Matches an http/https scheme + host, allowing an optional backslash
    # before each slash.
    HTTPX_MATCH_STR = 'https?:\\\\?/\\\\?/[A-Za-z0-9:_@.-]+'

    # Factory applied to a rule's ``op`` when that op is not callable.
    DEFAULT_OP = add_prefix

    def __init__(self, rules):
        """
        Compile ``rules`` into a single regex.

        ``rules`` is an iterable of ``(regex, op, count)`` tuples; it is
        iterated again on every replacement, so it must be re-iterable
        (e.g. a list).
        """
        # Build regexstr, concatenating regex list; each rule gets its own
        # outer capture group so replace() can tell which rule matched.
        regex_str = '|'.join('(' + rx + ')' for rx, op, count in rules)

        # ensure it's not middle of a word, wrap in non-capture group
        regex_str = r'(?<!\w)(?:' + regex_str + ')'

        self.regex = re.compile(regex_str, re.M)
        self.rules = rules

    def filter(self, m):
        """
        Hook deciding whether match ``m`` should be rewritten.

        The base implementation accepts every match; subclasses may
        override it to skip some.
        """
        return True

    def rewrite(self, string):
        """Return ``string`` with every rule match replaced."""
        return self.regex.sub(self.replace, string)

    def close(self):
        """Return an empty string."""
        return ''

    def replace(self, m):
        """
        Compute the replacement text for match object ``m``.

        Walks the rules in order, tracking the capture-group index each
        rule occupies in the combined regex, and applies the op of the
        first rule whose target group matched non-empty text. If the
        filter rejects the match, or no rule has a non-empty target
        group, the matched text is returned unchanged.
        """
        i = 0
        for _, op, count in self.rules:
            # Outer group wrapping the whole rule.
            i += 1
            full_m = i

            # Step over the rule's own groups; the last one is the target.
            if count > 0:
                i += count

            if not m.group(i):
                continue

            # Optional filter to skip matches
            if not self.filter(m):
                return m.group(0)

            # Non-callable op (e.g. a prefix string) -> default operation
            if not callable(op):
                op = RegexRewriter.DEFAULT_OP(op)

            result = op(m.group(i))

            # if extracting partial match, re-attach the text of the rule's
            # match that surrounds the target group
            if i != full_m:
                head = m.string[m.start(full_m):m.start(i)]
                tail = m.string[m.end(i):m.end(full_m)]
                result = head + result + tail

            return result

        # No rule produced a replacement: keep the text as-is. (Returning
        # None here would make re.sub drop the matched text.)
        return m.group(0)
|
|
|
|
|
|
|
|
#=================================================================
|
|
class JSRewriter(RegexRewriter):
    r"""
    Regex rewriter for JavaScript text.

    Prefixes absolute http(s) URLs with the rewriter's absolute url prefix
    and renames ``location`` and ``document.domain`` references by
    prepending ``WB_wombat_``. Additional rules may be supplied by the
    caller and are appended after the built-in ones.

    >>> test_js('location = "http://example.com/abc.html"')
    'WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"'

    >>> test_js('cool_Location = "http://example.com/abc.html"')
    'cool_Location = "/web/20131010im_/http://example.com/abc.html"'

    >>> test_js('window.location = "http://example.com/abc.html" document.domain = "anotherdomain.com"')
    'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"'

    >>> test_js('document_domain = "anotherdomain.com"; window.document.domain = "example.com"')
    'document_domain = "anotherdomain.com"; window.document.WB_wombat_domain = "example.com"'

    # custom rules added
    >>> test_js('window.location = "http://example.com/abc.html"; some_func(); ', [(r'some_func\(\).*', RegexRewriter.comment_out, 0)])
    'window.WB_wombat_location = "/web/20131010im_/http://example.com/abc.html"; /*some_func(); */'

    """

    def __init__(self, rewriter, extra=None):
        """
        Build the JS rule set.

        ``rewriter`` must provide ``get_abs_url()``, whose result is used
        as the prefix for absolute URLs. ``extra`` is an optional iterable
        of additional ``(regex, op, count)`` rules.
        """
        rules = self._create_rules(rewriter.get_abs_url())
        # None (not a shared list) is the default; only extend when given.
        if extra:
            rules.extend(extra)

        RegexRewriter.__init__(self, rules)

    def _create_rules(self, http_prefix):
        """Return the built-in rules, prefixing URLs with ``http_prefix``."""
        return [
            (RegexRewriter.HTTPX_MATCH_STR, http_prefix, 0),
            ('location', 'WB_wombat_', 0),
            # only rename "domain" when directly preceded by "document."
            (r'(?<=document\.)domain', 'WB_wombat_', 0),
        ]
|
|
|
|
|
|
#=================================================================
|
|
class XMLRewriter(RegexRewriter):
    """
    Regex rewriter for XML text.

    Prefixes absolute http(s) URLs with the rewriter's absolute url prefix,
    except where the matched text begins with an ``xmlns`` attribute.

    >>> test_xml('<tag xmlns="http://www.example.com/ns" attr="http://example.com"></tag>')
    '<tag xmlns="http://www.example.com/ns" attr="/web/20131010im_/http://example.com"></tag>'

    >>> test_xml('<tag xmlns:xsi="http://www.example.com/ns" attr=" http://example.com"></tag>')
    '<tag xmlns:xsi="http://www.example.com/ns" attr=" /web/20131010im_/http://example.com"></tag>'

    >>> test_xml('<tag> http://example.com<other>abchttp://example.com</other></tag>')
    '<tag> /web/20131010im_/http://example.com<other>abchttp://example.com</other></tag>'

    >>> test_xml('<main> http://www.example.com/blah</tag> <other xmlns:abcdef= " http://example.com"/> http://example.com </main>')
    '<main> /web/20131010im_/http://www.example.com/blah</tag> <other xmlns:abcdef= " http://example.com"/> /web/20131010im_/http://example.com </main>'

    """

    def __init__(self, rewriter, extra=None):
        """
        Build the XML rule set.

        ``rewriter`` must provide ``get_abs_url()``, whose result is used
        as the URL prefix.

        NOTE(review): ``extra`` is accepted but not used by this class --
        confirm whether custom rules were meant to be appended here.
        """
        rules = self._create_rules(rewriter.get_abs_url())

        RegexRewriter.__init__(self, rules)

    # custom filter to reject 'xmlns' attr
    def filter(self, m):
        """
        Return False for matches that start with an ``xmlns`` attribute.

        Group 1 is the full text matched by the (single) XML rule, which
        begins with the attribute name when one is present.
        """
        attr = m.group(1)
        if attr and attr.startswith('xmlns'):
            return False

        return True

    def _create_rules(self, http_prefix):
        """
        Return the single XML rule.

        The regex has two inner groups (optional ``name=`` part, then the
        URL); the count of 2 makes the URL the group that gets prefixed.
        """
        return [
            ('([A-Za-z:]+[\\s=]+)?["\'\\s]*(' + RegexRewriter.HTTPX_MATCH_STR + ')', http_prefix, 2),
        ]
|
|
|
|
#=================================================================
|
|
class CSSRewriter(RegexRewriter):
    r"""
    Regex rewriter for CSS text.

    Passes the target of ``url(...)`` expressions and of ``@import``
    statements written without ``url`` to the supplied rewriter's
    ``rewrite`` method.

    >>> test_css("background: url('/some/path.html')")
    "background: url('/web/20131010im_/http://example.com/some/path.html')"

    >>> test_css("background: url('../path.html')")
    "background: url('/web/20131010im_/http://example.com/path.html')"

    >>> test_css("background: url(\"http://domain.com/path.html\")")
    'background: url("/web/20131010im_/http://domain.com/path.html")'

    >>> test_css("background: url(file.jpeg)")
    'background: url(/web/20131010im_/http://example.com/file.jpeg)'

    >>> test_css("background: url('')")
    "background: url('')"

    >>> test_css("background: url (\"weirdpath\')")
    'background: url ("/web/20131010im_/http://example.com/weirdpath\')'

    >>> test_css("@import url ('path.css')")
    "@import url ('/web/20131010im_/http://example.com/path.css')"

    >>> test_css("@import url('path.css')")
    "@import url('/web/20131010im_/http://example.com/path.css')"

    >>> test_css("@import ( 'path.css')")
    "@import ( '/web/20131010im_/http://example.com/path.css')"

    >>> test_css("@import \"path.css\"")
    '@import "/web/20131010im_/http://example.com/path.css"'

    >>> test_css("@import ('../path.css\"")
    '@import (\'/web/20131010im_/http://example.com/path.css"'

    >>> test_css("@import ('../url.css\"")
    '@import (\'/web/20131010im_/http://example.com/url.css"'

    >>> test_css("@import (\"url.css\")")
    '@import ("/web/20131010im_/http://example.com/url.css")'

    >>> test_css("@import url(/url.css)\n@import url(/anotherurl.css)\n @import url(/and_a_third.css)")
    '@import url(/web/20131010im_/http://example.com/url.css)\n@import url(/web/20131010im_/http://example.com/anotherurl.css)\n @import url(/web/20131010im_/http://example.com/and_a_third.css)'

    """

    # url( ... ) with optional quotes/backslashes around the captured target
    CSS_URL_REGEX = "url\\s*\\(\\s*[\\\\\"']*([^)'\"]+)[\\\\\"']*\\s*\\)"

    # @import not followed by url, with optional paren/quote before the
    # captured target
    CSS_IMPORT_NO_URL_REGEX = "@import\\s+(?!url)\\(?\\s*['\"]?(?!url[\\s\\(])([\\w.:/\\\\-]+)"

    def __init__(self, rewriter):
        """Build the CSS rule set; ``rewriter`` must provide ``rewrite()``."""
        rules = self._create_rules(rewriter)

        RegexRewriter.__init__(self, rules)

    def _create_rules(self, rewriter):
        """
        Return the two CSS rules.

        Each regex has one inner group (the url/path), so count is 1 and
        only that group is handed to ``rewriter.rewrite``.
        """
        return [
            (CSSRewriter.CSS_URL_REGEX, RegexRewriter.archival_rewrite(rewriter), 1),
            (CSSRewriter.CSS_IMPORT_NO_URL_REGEX, RegexRewriter.archival_rewrite(rewriter), 1),
        ]
|
|
|
|
import utils
|
|
# Doctest support: the helpers below are the names used by the doctests in
# the rewriter class docstrings above.
if __name__ == "__main__" or utils.enable_doctests():
    # Url rewriter shared by all doctest helpers
    arcrw = UrlRewriter('/20131010im_/http://example.com/', '/web/')

    def test_js(string, extra=()):
        """Rewrite ``string`` with a JSRewriter, adding any ``extra`` rules."""
        return JSRewriter(arcrw, extra).rewrite(string)

    def test_xml(string):
        """Rewrite ``string`` with an XMLRewriter."""
        return XMLRewriter(arcrw).rewrite(string)

    def test_css(string):
        """Rewrite ``string`` with a CSSRewriter."""
        return CSSRewriter(arcrw).rewrite(string)

    import doctest
    doctest.testmod()
|
|
|
|
|
|
|