From 0a6838ac2b98618d0a0cc7419dd5698b7e7e45ac Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 7 Dec 2014 21:09:37 -0800 Subject: [PATCH] rewrite: refactor JS rewriters into separate mixins, allowing for link only, location only, and link + location JS rewriters. location-only rewriter is new js_rewrite_location options: all, location, urls (for now) --- pywb/rewrite/regex_rewriters.py | 33 +++++++++++++++------ pywb/rewrite/rewriterules.py | 9 +++--- pywb/rewrite/test/test_regex_rewriters.py | 2 +- pywb/rewrite/test/test_rewrite_live.py | 35 ++++++++++++++++------- pywb/rules.yaml | 9 ++++-- pywb/ui/head_insert.html | 2 +- sample_archive/text_content/sample.html | 2 +- 7 files changed, 65 insertions(+), 27 deletions(-) diff --git a/pywb/rewrite/regex_rewriters.py b/pywb/rewrite/regex_rewriters.py index 375bca08..5d680068 100644 --- a/pywb/rewrite/regex_rewriters.py +++ b/pywb/rewrite/regex_rewriters.py @@ -35,7 +35,7 @@ class RegexRewriter(object): #DEFAULT_OP = add_prefix - def __init__(self, rules): + def __init__(self, rewriter, rules): #rules = self.create_rules(http_prefix) # Build regexstr, concatenating regex list @@ -106,7 +106,7 @@ class RegexRewriter(object): #================================================================= -class JSLinkOnlyRewriter(RegexRewriter): +class JSLinkRewriterMixin(object): """ JS Rewriter which rewrites absolute http://, https:// and // urls at the beginning of a string @@ -118,13 +118,14 @@ class JSLinkOnlyRewriter(RegexRewriter): rules = rules + [ (self.JS_HTTPX, RegexRewriter.archival_rewrite(rewriter), 0) ] - super(JSLinkOnlyRewriter, self).__init__(rules) + super(JSLinkRewriterMixin, self).__init__(rewriter, rules) #================================================================= -class JSLinkAndLocationRewriter(JSLinkOnlyRewriter): +class JSLocationRewriterMixin(object): +#class JSLinkAndLocationRewriter(JSLinkOnlyRewriter): """ - JS Rewriter which also rewrites location and domain to the + JS Rewriter mixin 
which rewrites location and domain to the specified prefix (default: 'WB_wombat_') """ @@ -148,7 +149,23 @@ class JSLinkAndLocationRewriter(JSLinkOnlyRewriter): #(r'\b(?:self|window)\b[!=\W]+\b(top)\b', #RegexRewriter.add_prefix(prefix), 1), ] - super(JSLinkAndLocationRewriter, self).__init__(rewriter, rules) + super(JSLocationRewriterMixin, self).__init__(rewriter, rules) + + +#================================================================= +class JSLocationOnlyRewriter(JSLocationRewriterMixin, RegexRewriter): + pass + + +#================================================================= +class JSLinkOnlyRewriter(JSLinkRewriterMixin, RegexRewriter): + pass + +#================================================================= +class JSLinkAndLocationRewriter(JSLocationRewriterMixin, + JSLinkRewriterMixin, + RegexRewriter): + pass #================================================================= @@ -161,7 +178,7 @@ class XMLRewriter(RegexRewriter): def __init__(self, rewriter, extra=[]): rules = self._create_rules(rewriter) - super(XMLRewriter, self).__init__(rules) + super(XMLRewriter, self).__init__(rewriter, rules) # custom filter to reject 'xmlns' attr def filter(self, m): @@ -189,7 +206,7 @@ class CSSRewriter(RegexRewriter): def __init__(self, rewriter): rules = self._create_rules(rewriter) - super(CSSRewriter, self).__init__(rules) + super(CSSRewriter, self).__init__(rewriter, rules) def _create_rules(self, rewriter): return [ diff --git a/pywb/rewrite/rewriterules.py b/pywb/rewrite/rewriterules.py index 5bc99e3a..583115f7 100644 --- a/pywb/rewrite/rewriterules.py +++ b/pywb/rewrite/rewriterules.py @@ -1,7 +1,7 @@ from pywb.utils.dsrules import BaseRule from regex_rewriters import RegexRewriter, CSSRewriter, XMLRewriter -from regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter +from regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter, JSLocationOnlyRewriter from header_rewriter import HeaderRewriter from html_rewriter 
import HTMLRewriter @@ -27,12 +27,13 @@ class RewriteRules(BaseRule): self.parse_comments = config.get('parse_comments', False) # Custom handling for js rewriting, often the most complex - self.js_rewrite_location = config.get('js_rewrite_location', True) - self.js_rewrite_location = bool(self.js_rewrite_location) + self.js_rewrite_location = config.get('js_rewrite_location', 'all') # ability to toggle rewriting - if self.js_rewrite_location: + if self.js_rewrite_location == 'all': js_default_class = JSLinkAndLocationRewriter + elif self.js_rewrite_location == 'location': + js_default_class = JSLocationOnlyRewriter else: js_default_class = JSLinkOnlyRewriter diff --git a/pywb/rewrite/test/test_regex_rewriters.py b/pywb/rewrite/test/test_regex_rewriters.py index 92975a7f..19ea5eb6 100644 --- a/pywb/rewrite/test/test_regex_rewriters.py +++ b/pywb/rewrite/test/test_regex_rewriters.py @@ -3,7 +3,7 @@ r""" # Custom Regex #================================================================= # Test https->http converter (other tests below in subclasses) ->>> RegexRewriter([(RegexRewriter.HTTPX_MATCH_STR, RegexRewriter.remove_https, 0)]).rewrite('a = https://example.com; b = http://example.com; c = https://some-url/path/https://embedded.example.com') +>>> RegexRewriter(urlrewriter, [(RegexRewriter.HTTPX_MATCH_STR, RegexRewriter.remove_https, 0)]).rewrite('a = https://example.com; b = http://example.com; c = https://some-url/path/https://embedded.example.com') 'a = http://example.com; b = http://example.com; c = http://some-url/path/http://embedded.example.com' diff --git a/pywb/rewrite/test/test_rewrite_live.py b/pywb/rewrite/test/test_rewrite_live.py index b54138fa..3ea189a4 100644 --- a/pywb/rewrite/test/test_rewrite_live.py +++ b/pywb/rewrite/test/test_rewrite_live.py @@ -13,7 +13,7 @@ urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.htm bn_urlrewriter = UrlRewriter('20131226101010bn_/http://example.com/some/path/index.html', '/pywb/') def 
head_insert_func(rule, cdx): - if rule.js_rewrite_location == True: + if rule.js_rewrite_location != 'urls': return '' else: return '' @@ -26,10 +26,10 @@ def test_local_1(): 'com,example,test)/') # wombat insert added - assert '' in buff + assert '' in buff, buff - # location rewritten - assert 'window.WB_wombat_location = "/other.html"' in buff + # JS location and JS link rewritten + assert 'window.WB_wombat_location = "/pywb/20131226101010/http:\/\/example.com/dynamic_page.html"' in buff # link rewritten assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff @@ -65,7 +65,7 @@ def test_local_no_head_banner_only(): # link NOT rewritten assert '"another.html"' in buff -def test_local_banner_only(): +def test_local_banner_only_no_rewrite(): status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html', bn_urlrewriter, head_insert_func, @@ -74,13 +74,13 @@ def test_local_banner_only(): # wombat insert added assert '' in buff - # location NOT rewritten - assert 'window.location = "/other.html"' in buff + # JS location NOT rewritten, JS link NOT rewritten + assert 'window.location = "http:\/\/example.com/dynamic_page.html"' in buff, buff # link NOT rewritten assert '"another.html"' in buff -def test_local_2_no_js_location_rewrite(): +def test_local_2_link_only_rewrite(): status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html', urlrewriter, head_insert_func, @@ -89,13 +89,28 @@ def test_local_2_no_js_location_rewrite(): # no wombat insert assert '' not in buff - # no location rewrite - assert 'window.location = "/other.html"' in buff + # JS location NOT rewritten, JS link rewritten + assert 'window.location = "/pywb/20131226101010/http:\/\/example.com/dynamic_page.html"' in buff # still link rewrite assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff +def test_local_2_js_loc_only_rewrite(): + status_headers, buff = get_rewritten(get_test_dir() + 
'text_content/sample.html', + urlrewriter, + head_insert_func, + 'example,example,test,loconly)/') + + # wombat insert added + assert '' in buff + + # JS location rewritten, JS link NOT rewritten + assert 'window.WB_wombat_location = "http:\/\/example.com/dynamic_page.html"' in buff + + # still link rewrite in HTML + assert '"/pywb/20131226101010/http://example.com/some/path/another.html"' in buff + def test_example_1(): status_headers, buff = get_rewritten('http://example.com/', urlrewriter, req_headers={'Connection': 'close'}) diff --git a/pywb/rules.yaml b/pywb/rules.yaml index aa80717e..b55d8278 100644 --- a/pywb/rules.yaml +++ b/pywb/rules.yaml @@ -120,6 +120,11 @@ rules: # testing rules -- not for valid domain #================================================================= # this rule block is a non-existent prefix merely for testing + - url_prefix: 'example,example,test,loconly)/' + + rewrite: + js_rewrite_location: location + - url_prefix: 'example,example,test)/' canonicalize: @@ -131,10 +136,10 @@ rules: - id rewrite: - js_rewrite_location: False + js_rewrite_location: urls - # all domain rules -- fallback to this dataset + # all domain rules -- fallback to this dataset #================================================================= # Applies to all urls -- should be last - url_prefix: '' diff --git a/pywb/ui/head_insert.html b/pywb/ui/head_insert.html index 4e53a5d0..812bcfe0 100644 --- a/pywb/ui/head_insert.html +++ b/pywb/ui/head_insert.html @@ -1,5 +1,5 @@ -{% if rule.js_rewrite_location and include_wombat %} +{% if rule.js_rewrite_location != 'urls' and include_wombat %} Test Content