1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

rewrite: refactor JS rewriters into separate mixins, allowing for

link only, location only, and link + location JS rewriters.
location-only rewriter is new
js_rewrite_location options: all, location, urls (for now)
This commit is contained in:
Ilya Kreymer 2014-12-07 21:09:37 -08:00
parent b951b304b6
commit 0a6838ac2b
7 changed files with 65 additions and 27 deletions

View File

@ -35,7 +35,7 @@ class RegexRewriter(object):
#DEFAULT_OP = add_prefix #DEFAULT_OP = add_prefix
def __init__(self, rules): def __init__(self, rewriter, rules):
#rules = self.create_rules(http_prefix) #rules = self.create_rules(http_prefix)
# Build regexstr, concatenating regex list # Build regexstr, concatenating regex list
@ -106,7 +106,7 @@ class RegexRewriter(object):
#================================================================= #=================================================================
class JSLinkOnlyRewriter(RegexRewriter): class JSLinkRewriterMixin(object):
""" """
JS Rewriter which rewrites absolute http://, https:// and // urls JS Rewriter which rewrites absolute http://, https:// and // urls
at the beginning of a string at the beginning of a string
@ -118,13 +118,14 @@ class JSLinkOnlyRewriter(RegexRewriter):
rules = rules + [ rules = rules + [
(self.JS_HTTPX, RegexRewriter.archival_rewrite(rewriter), 0) (self.JS_HTTPX, RegexRewriter.archival_rewrite(rewriter), 0)
] ]
super(JSLinkOnlyRewriter, self).__init__(rules) super(JSLinkRewriterMixin, self).__init__(rewriter, rules)
#================================================================= #=================================================================
class JSLinkAndLocationRewriter(JSLinkOnlyRewriter): class JSLocationRewriterMixin(object):
#class JSLinkAndLocationRewriter(JSLinkOnlyRewriter):
""" """
JS Rewriter which also rewrites location and domain to the JS Rewriter mixin which rewrites location and domain to the
specified prefix (default: 'WB_wombat_') specified prefix (default: 'WB_wombat_')
""" """
@ -148,7 +149,23 @@ class JSLinkAndLocationRewriter(JSLinkOnlyRewriter):
#(r'\b(?:self|window)\b[!=\W]+\b(top)\b', #(r'\b(?:self|window)\b[!=\W]+\b(top)\b',
#RegexRewriter.add_prefix(prefix), 1), #RegexRewriter.add_prefix(prefix), 1),
] ]
super(JSLinkAndLocationRewriter, self).__init__(rewriter, rules) super(JSLocationRewriterMixin, self).__init__(rewriter, rules)
#=================================================================
class JSLocationOnlyRewriter(JSLocationRewriterMixin, RegexRewriter):
pass
#=================================================================
class JSLinkOnlyRewriter(JSLinkRewriterMixin, RegexRewriter):
pass
#=================================================================
class JSLinkAndLocationRewriter(JSLocationRewriterMixin,
JSLinkRewriterMixin,
RegexRewriter):
pass
#================================================================= #=================================================================
@ -161,7 +178,7 @@ class XMLRewriter(RegexRewriter):
def __init__(self, rewriter, extra=[]): def __init__(self, rewriter, extra=[]):
rules = self._create_rules(rewriter) rules = self._create_rules(rewriter)
super(XMLRewriter, self).__init__(rules) super(XMLRewriter, self).__init__(rewriter, rules)
# custom filter to reject 'xmlns' attr # custom filter to reject 'xmlns' attr
def filter(self, m): def filter(self, m):
@ -189,7 +206,7 @@ class CSSRewriter(RegexRewriter):
def __init__(self, rewriter): def __init__(self, rewriter):
rules = self._create_rules(rewriter) rules = self._create_rules(rewriter)
super(CSSRewriter, self).__init__(rules) super(CSSRewriter, self).__init__(rewriter, rules)
def _create_rules(self, rewriter): def _create_rules(self, rewriter):
return [ return [

View File

@ -1,7 +1,7 @@
from pywb.utils.dsrules import BaseRule from pywb.utils.dsrules import BaseRule
from regex_rewriters import RegexRewriter, CSSRewriter, XMLRewriter from regex_rewriters import RegexRewriter, CSSRewriter, XMLRewriter
from regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter from regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter, JSLocationOnlyRewriter
from header_rewriter import HeaderRewriter from header_rewriter import HeaderRewriter
from html_rewriter import HTMLRewriter from html_rewriter import HTMLRewriter
@ -27,12 +27,13 @@ class RewriteRules(BaseRule):
self.parse_comments = config.get('parse_comments', False) self.parse_comments = config.get('parse_comments', False)
# Custom handling for js rewriting, often the most complex # Custom handling for js rewriting, often the most complex
self.js_rewrite_location = config.get('js_rewrite_location', True) self.js_rewrite_location = config.get('js_rewrite_location', 'all')
self.js_rewrite_location = bool(self.js_rewrite_location)
# ability to toggle rewriting # ability to toggle rewriting
if self.js_rewrite_location: if self.js_rewrite_location == 'all':
js_default_class = JSLinkAndLocationRewriter js_default_class = JSLinkAndLocationRewriter
elif self.js_rewrite_location == 'location':
js_default_class = JSLocationOnlyRewriter
else: else:
js_default_class = JSLinkOnlyRewriter js_default_class = JSLinkOnlyRewriter

View File

@ -3,7 +3,7 @@ r"""
# Custom Regex # Custom Regex
#================================================================= #=================================================================
# Test https->http converter (other tests below in subclasses) # Test https->http converter (other tests below in subclasses)
>>> RegexRewriter([(RegexRewriter.HTTPX_MATCH_STR, RegexRewriter.remove_https, 0)]).rewrite('a = https://example.com; b = http://example.com; c = https://some-url/path/https://embedded.example.com') >>> RegexRewriter(urlrewriter, [(RegexRewriter.HTTPX_MATCH_STR, RegexRewriter.remove_https, 0)]).rewrite('a = https://example.com; b = http://example.com; c = https://some-url/path/https://embedded.example.com')
'a = http://example.com; b = http://example.com; c = http://some-url/path/http://embedded.example.com' 'a = http://example.com; b = http://example.com; c = http://some-url/path/http://embedded.example.com'

View File

@ -13,7 +13,7 @@ urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.htm
bn_urlrewriter = UrlRewriter('20131226101010bn_/http://example.com/some/path/index.html', '/pywb/') bn_urlrewriter = UrlRewriter('20131226101010bn_/http://example.com/some/path/index.html', '/pywb/')
def head_insert_func(rule, cdx): def head_insert_func(rule, cdx):
if rule.js_rewrite_location == True: if rule.js_rewrite_location != 'urls':
return '<script src="/static/default/wombat.js"> </script>' return '<script src="/static/default/wombat.js"> </script>'
else: else:
return '' return ''
@ -26,10 +26,10 @@ def test_local_1():
'com,example,test)/') 'com,example,test)/')
# wombat insert added # wombat insert added
assert '<head><script src="/static/default/wombat.js"> </script>' in buff assert '<head><script src="/static/default/wombat.js"> </script>' in buff, buff
# location rewritten # JS location and JS link rewritten
assert 'window.WB_wombat_location = "/other.html"' in buff assert 'window.WB_wombat_location = "/pywb/20131226101010/http:\/\/example.com/dynamic_page.html"' in buff
# link rewritten # link rewritten
assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff
@ -65,7 +65,7 @@ def test_local_no_head_banner_only():
# link NOT rewritten # link NOT rewritten
assert '"another.html"' in buff assert '"another.html"' in buff
def test_local_banner_only(): def test_local_banner_only_no_rewrite():
status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html', status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
bn_urlrewriter, bn_urlrewriter,
head_insert_func, head_insert_func,
@ -74,13 +74,13 @@ def test_local_banner_only():
# wombat insert added # wombat insert added
assert '<head><script src="/static/default/wombat.js"> </script>' in buff assert '<head><script src="/static/default/wombat.js"> </script>' in buff
# location NOT rewritten # JS location NOT rewritten, JS link NOT rewritten
assert 'window.location = "/other.html"' in buff assert 'window.location = "http:\/\/example.com/dynamic_page.html"' in buff, buff
# link NOT rewritten # link NOT rewritten
assert '"another.html"' in buff assert '"another.html"' in buff
def test_local_2_no_js_location_rewrite(): def test_local_2_link_only_rewrite():
status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html', status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
urlrewriter, urlrewriter,
head_insert_func, head_insert_func,
@ -89,13 +89,28 @@ def test_local_2_no_js_location_rewrite():
# no wombat insert # no wombat insert
assert '<head><script src="/static/default/wombat.js"> </script>' not in buff assert '<head><script src="/static/default/wombat.js"> </script>' not in buff
# no location rewrite # JS location NOT rewritten, JS link rewritten
assert 'window.location = "/other.html"' in buff assert 'window.location = "/pywb/20131226101010/http:\/\/example.com/dynamic_page.html"' in buff
# still link rewrite # still link rewrite
assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff
def test_local_2_js_loc_only_rewrite():
status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
urlrewriter,
head_insert_func,
'example,example,test,loconly)/')
# wombat insert added
assert '<script src="/static/default/wombat.js"> </script>' in buff
# JS location rewritten, JS link NOT rewritten
assert 'window.WB_wombat_location = "http:\/\/example.com/dynamic_page.html"' in buff
# still link rewrite in HTML
assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff
def test_example_1(): def test_example_1():
status_headers, buff = get_rewritten('http://example.com/', urlrewriter, req_headers={'Connection': 'close'}) status_headers, buff = get_rewritten('http://example.com/', urlrewriter, req_headers={'Connection': 'close'})

View File

@ -120,6 +120,11 @@ rules:
# testing rules -- not for valid domain # testing rules -- not for valid domain
#================================================================= #=================================================================
# this rule block is a non-existent prefix merely for testing # this rule block is a non-existent prefix merely for testing
- url_prefix: 'example,example,test,loconly)/'
rewrite:
js_rewrite_location: location
- url_prefix: 'example,example,test)/' - url_prefix: 'example,example,test)/'
canonicalize: canonicalize:
@ -131,10 +136,10 @@ rules:
- id - id
rewrite: rewrite:
js_rewrite_location: False js_rewrite_location: urls
# all domain rules -- fallback to this dataset # all domain rules -- fallback to this dataset
#================================================================= #=================================================================
# Applies to all urls -- should be last # Applies to all urls -- should be last
- url_prefix: '' - url_prefix: ''

View File

@ -1,5 +1,5 @@
<!-- WB Insert --> <!-- WB Insert -->
{% if rule.js_rewrite_location and include_wombat %} {% if rule.js_rewrite_location != 'urls' and include_wombat %}
<script src='{{ wbrequest.host_prefix }}/{{ static_path }}/wombat.js'> </script> <script src='{{ wbrequest.host_prefix }}/{{ static_path }}/wombat.js'> </script>
<script> <script>
{% set urlsplit = cdx.original | urlsplit %} {% set urlsplit = cdx.original | urlsplit %}

View File

@ -6,7 +6,7 @@
<script> <script>
var some_val = false; var some_val = false;
if (some_val) { if (some_val) {
window.location = "/other.html"; window.location = "http:\/\/example.com/dynamic_page.html";
} }
</script> </script>
Test Content Test Content