1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

rewrite: refactor JS rewriters into separate mixins, allowing for

link only, location only, and link + location JS rewriters.
location-only rewriter is new
js_rewrite_location options: all, location, urls (for now)
This commit is contained in:
Ilya Kreymer 2014-12-07 21:09:37 -08:00
parent b951b304b6
commit 0a6838ac2b
7 changed files with 65 additions and 27 deletions

View File

@ -35,7 +35,7 @@ class RegexRewriter(object):
#DEFAULT_OP = add_prefix
def __init__(self, rules):
def __init__(self, rewriter, rules):
#rules = self.create_rules(http_prefix)
# Build regexstr, concatenating regex list
@ -106,7 +106,7 @@ class RegexRewriter(object):
#=================================================================
class JSLinkOnlyRewriter(RegexRewriter):
class JSLinkRewriterMixin(object):
"""
JS Rewriter which rewrites absolute http://, https:// and // urls
at the beginning of a string
@ -118,13 +118,14 @@ class JSLinkOnlyRewriter(RegexRewriter):
rules = rules + [
(self.JS_HTTPX, RegexRewriter.archival_rewrite(rewriter), 0)
]
super(JSLinkOnlyRewriter, self).__init__(rules)
super(JSLinkRewriterMixin, self).__init__(rewriter, rules)
#=================================================================
class JSLinkAndLocationRewriter(JSLinkOnlyRewriter):
class JSLocationRewriterMixin(object):
#class JSLinkAndLocationRewriter(JSLinkOnlyRewriter):
"""
JS Rewriter which also rewrites location and domain to the
JS Rewriter mixin which rewrites location and domain to the
specified prefix (default: 'WB_wombat_')
"""
@ -148,7 +149,23 @@ class JSLinkAndLocationRewriter(JSLinkOnlyRewriter):
#(r'\b(?:self|window)\b[!=\W]+\b(top)\b',
#RegexRewriter.add_prefix(prefix), 1),
]
super(JSLinkAndLocationRewriter, self).__init__(rewriter, rules)
super(JSLocationRewriterMixin, self).__init__(rewriter, rules)
#=================================================================
# Concrete JS rewriter built from JSLocationRewriterMixin on top of
# RegexRewriter; it adds no rules of its own, so only the rules contributed
# by the location mixin are applied (no link/URL mixin is included).
class JSLocationOnlyRewriter(JSLocationRewriterMixin, RegexRewriter):
    pass
#=================================================================
# Concrete JS rewriter built from JSLinkRewriterMixin on top of
# RegexRewriter; it adds no rules of its own, so only the rules contributed
# by the link mixin are applied (no location mixin is included).
class JSLinkOnlyRewriter(JSLinkRewriterMixin, RegexRewriter):
    pass
#=================================================================
# Concrete JS rewriter combining both mixins on top of RegexRewriter.
# Base order matters for the cooperative super().__init__ chain: the
# location mixin is first in the MRO, then the link mixin, then
# RegexRewriter.
class JSLinkAndLocationRewriter(JSLocationRewriterMixin,
                                JSLinkRewriterMixin,
                                RegexRewriter):
    pass
#=================================================================
@ -161,7 +178,7 @@ class XMLRewriter(RegexRewriter):
def __init__(self, rewriter, extra=[]):
rules = self._create_rules(rewriter)
super(XMLRewriter, self).__init__(rules)
super(XMLRewriter, self).__init__(rewriter, rules)
# custom filter to reject 'xmlns' attr
def filter(self, m):
@ -189,7 +206,7 @@ class CSSRewriter(RegexRewriter):
def __init__(self, rewriter):
rules = self._create_rules(rewriter)
super(CSSRewriter, self).__init__(rules)
super(CSSRewriter, self).__init__(rewriter, rules)
def _create_rules(self, rewriter):
return [

View File

@ -1,7 +1,7 @@
from pywb.utils.dsrules import BaseRule
from regex_rewriters import RegexRewriter, CSSRewriter, XMLRewriter
from regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter
from regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter, JSLocationOnlyRewriter
from header_rewriter import HeaderRewriter
from html_rewriter import HTMLRewriter
@ -27,12 +27,13 @@ class RewriteRules(BaseRule):
self.parse_comments = config.get('parse_comments', False)
# Custom handling for js rewriting, often the most complex
self.js_rewrite_location = config.get('js_rewrite_location', True)
self.js_rewrite_location = bool(self.js_rewrite_location)
self.js_rewrite_location = config.get('js_rewrite_location', 'all')
# ability to toggle rewriting
if self.js_rewrite_location:
if self.js_rewrite_location == 'all':
js_default_class = JSLinkAndLocationRewriter
elif self.js_rewrite_location == 'location':
js_default_class = JSLocationOnlyRewriter
else:
js_default_class = JSLinkOnlyRewriter

View File

@ -3,7 +3,7 @@ r"""
# Custom Regex
#=================================================================
# Test https->http converter (other tests below in subclasses)
>>> RegexRewriter([(RegexRewriter.HTTPX_MATCH_STR, RegexRewriter.remove_https, 0)]).rewrite('a = https://example.com; b = http://example.com; c = https://some-url/path/https://embedded.example.com')
>>> RegexRewriter(urlrewriter, [(RegexRewriter.HTTPX_MATCH_STR, RegexRewriter.remove_https, 0)]).rewrite('a = https://example.com; b = http://example.com; c = https://some-url/path/https://embedded.example.com')
'a = http://example.com; b = http://example.com; c = http://some-url/path/http://embedded.example.com'

View File

@ -13,7 +13,7 @@ urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.htm
bn_urlrewriter = UrlRewriter('20131226101010bn_/http://example.com/some/path/index.html', '/pywb/')
def head_insert_func(rule, cdx):
if rule.js_rewrite_location == True:
if rule.js_rewrite_location != 'urls':
return '<script src="/static/default/wombat.js"> </script>'
else:
return ''
@ -26,10 +26,10 @@ def test_local_1():
'com,example,test)/')
# wombat insert added
assert '<head><script src="/static/default/wombat.js"> </script>' in buff
assert '<head><script src="/static/default/wombat.js"> </script>' in buff, buff
# location rewritten
assert 'window.WB_wombat_location = "/other.html"' in buff
# JS location and JS link rewritten
assert 'window.WB_wombat_location = "/pywb/20131226101010/http:\/\/example.com/dynamic_page.html"' in buff
# link rewritten
assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff
@ -65,7 +65,7 @@ def test_local_no_head_banner_only():
# link NOT rewritten
assert '"another.html"' in buff
def test_local_banner_only():
def test_local_banner_only_no_rewrite():
status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
bn_urlrewriter,
head_insert_func,
@ -74,13 +74,13 @@ def test_local_banner_only():
# wombat insert added
assert '<head><script src="/static/default/wombat.js"> </script>' in buff
# location NOT rewritten
assert 'window.location = "/other.html"' in buff
# JS location NOT rewritten, JS link NOT rewritten
assert 'window.location = "http:\/\/example.com/dynamic_page.html"' in buff, buff
# link NOT rewritten
assert '"another.html"' in buff
def test_local_2_no_js_location_rewrite():
def test_local_2_link_only_rewrite():
status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
urlrewriter,
head_insert_func,
@ -89,13 +89,28 @@ def test_local_2_no_js_location_rewrite():
# no wombat insert
assert '<head><script src="/static/default/wombat.js"> </script>' not in buff
# no location rewrite
assert 'window.location = "/other.html"' in buff
# JS location NOT rewritten, JS link rewritten
assert 'window.location = "/pywb/20131226101010/http:\/\/example.com/dynamic_page.html"' in buff
# still link rewrite
assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff
def test_local_2_js_loc_only_rewrite():
    """Rewrite the sample HTML page under the 'loconly' test rule key.

    Expects the wombat script to be inserted, the JS ``location`` reference
    to gain the ``WB_wombat_`` prefix while the URL inside the JS string is
    left untouched, and the plain HTML link to still be rewritten.
    """
    # NOTE(review): presumably the 'example,example,test,loconly)/' key selects
    # a rule with js_rewrite_location set to 'location' -- confirm in the rules config.
    status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
                                         urlrewriter,
                                         head_insert_func,
                                         'example,example,test,loconly)/')
    # wombat insert added
    assert '<script src="/static/default/wombat.js"> </script>' in buff
    # JS location rewritten, JS link NOT rewritten
    assert 'window.WB_wombat_location = "http:\/\/example.com/dynamic_page.html"' in buff
    # still link rewrite in HTML
    assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff
def test_example_1():
status_headers, buff = get_rewritten('http://example.com/', urlrewriter, req_headers={'Connection': 'close'})

View File

@ -120,6 +120,11 @@ rules:
# testing rules -- not for valid domain
#=================================================================
# this rule block is a non-existent prefix merely for testing
- url_prefix: 'example,example,test,loconly)/'
rewrite:
js_rewrite_location: location
- url_prefix: 'example,example,test)/'
canonicalize:
@ -131,7 +136,7 @@ rules:
- id
rewrite:
js_rewrite_location: False
js_rewrite_location: urls
# all domain rules -- fallback to this dataset

View File

@ -1,5 +1,5 @@
<!-- WB Insert -->
{% if rule.js_rewrite_location and include_wombat %}
{% if rule.js_rewrite_location != 'urls' and include_wombat %}
<script src='{{ wbrequest.host_prefix }}/{{ static_path }}/wombat.js'> </script>
<script>
{% set urlsplit = cdx.original | urlsplit %}

View File

@ -6,7 +6,7 @@
<script>
var some_val = false;
if (some_val) {
window.location = "/other.html";
window.location = "http:\/\/example.com/dynamic_page.html";
}
</script>
Test Content