From 388f31e08fa35930be6ccc02d40b09985a5e8db6 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 11 Nov 2014 15:34:14 -0800 Subject: [PATCH 1/4] rewrite: don't rewrite rel=canonical links, need to make rewriting more configurable (#50) --- pywb/rewrite/html_rewriter.py | 6 ++++++ pywb/rewrite/test/test_html_rewriter.py | 4 ++++ 2 files changed, 10 insertions(+) diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py index f0c904c2..618c5191 100644 --- a/pywb/rewrite/html_rewriter.py +++ b/pywb/rewrite/html_rewriter.py @@ -174,6 +174,12 @@ class HTMLRewriterMixin(object): elif attr_name == 'crossorigin': attr_name = '_crossorigin' + # special case: link don't rewrite canonical + elif tag == 'link' and attr_name == 'href': + if not self.has_attr(tag_attrs, ('rel', 'canonical')): + rw_mod = handler.get(attr_name) + attr_value = self._rewrite_url(attr_value, rw_mod) + # special case: meta tag elif (tag == 'meta') and (attr_name == 'content'): if self.has_attr(tag_attrs, ('http-equiv', 'refresh')): diff --git a/pywb/rewrite/test/test_html_rewriter.py b/pywb/rewrite/test/test_html_rewriter.py index 45df4dfb..710fa338 100644 --- a/pywb/rewrite/test/test_html_rewriter.py +++ b/pywb/rewrite/test/test_html_rewriter.py @@ -102,6 +102,10 @@ ur""" >>> parse('
SomeTest
', head_insert = '')
SomeTest
+# don't rewrite rel=canonical +>>> parse('') + + # doctype >>> parse('') From 20070e95b67e1c21d226762398c4a46f5a87664e Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 13 Nov 2014 09:24:34 -0800 Subject: [PATCH 2/4] cookie_rewriter: add 'exact' cookie rewriter which never changes the path/domain --- pywb/rewrite/cookie_rewriter.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/pywb/rewrite/cookie_rewriter.py b/pywb/rewrite/cookie_rewriter.py index 4724df4c..e9dd80ac 100644 --- a/pywb/rewrite/cookie_rewriter.py +++ b/pywb/rewrite/cookie_rewriter.py @@ -55,6 +55,24 @@ class MinimalScopeCookieRewriter(WbUrlBaseCookieRewriter): return morsel +#================================================================= +class ExactPathCookieRewriter(WbUrlBaseCookieRewriter): + """ + Rewrite cookies only using exact path, useful for live rewrite + without a timestamp and to minimize cookie pollution + + If path or domain present, simply remove + """ + + def rewrite_cookie(self, name, morsel): + if morsel.get('domain'): + del morsel['domain'] + # else set cookie to rewritten path + if morsel.get('path'): + del morsel['path'] + + self._remove_age_opts(morsel) + return morsel #================================================================= class RootScopeCookieRewriter(WbUrlBaseCookieRewriter): """ @@ -79,5 +97,7 @@ class RootScopeCookieRewriter(WbUrlBaseCookieRewriter): def get_cookie_rewriter(cookie_scope): if cookie_scope == 'root': return RootScopeCookieRewriter + elif cookie_scope == 'exact': + return ExactPathCookieRewriter else: return MinimalScopeCookieRewriter From b8b8c30573f2175d2280f80893b1f4043f6bd728 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 13 Nov 2014 09:43:50 -0800 Subject: [PATCH 3/4] cookie_rewriter: add tests for exact cookie rewriter --- pywb/rewrite/test/test_cookie_rewriter.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/pywb/rewrite/test/test_cookie_rewriter.py b/pywb/rewrite/test/test_cookie_rewriter.py index c20f56f9..4f57464f 100644 --- a/pywb/rewrite/test/test_cookie_rewriter.py +++ b/pywb/rewrite/test/test_cookie_rewriter.py @@ -1,4 +1,5 @@ r""" +# Default -- MinimalScopeRewriter # No rewriting >>> rewrite_cookie('a=b; c=d;') [('Set-Cookie', 'a=b'), ('Set-Cookie', 'c=d')] @@ -23,10 +24,17 @@ r""" >>> rewrite_cookie('abc@def=123') [] +# ExactCookieRewriter +>>> rewrite_cookie('some=value; Path=/diff/path/;', urlrewriter, ExactPathCookieRewriter) +[('Set-Cookie', 'some=value')] + +>>> rewrite_cookie('some=value; Domain=.example.com; Path=/diff/path/; Max-Age=1500', urlrewriter, ExactPathCookieRewriter) +[('Set-Cookie', 'some=value')] + """ -from pywb.rewrite.cookie_rewriter import MinimalScopeCookieRewriter +from pywb.rewrite.cookie_rewriter import MinimalScopeCookieRewriter, ExactPathCookieRewriter from pywb.rewrite.url_rewriter import UrlRewriter urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/') @@ -34,6 +42,6 @@ urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.htm urlrewriter2 = UrlRewriter('em_/http://example.com/', '/preview/') -def rewrite_cookie(cookie_str, rewriter=urlrewriter): - return MinimalScopeCookieRewriter(rewriter).rewrite(cookie_str) +def rewrite_cookie(cookie_str, rewriter=urlrewriter, cookie_rewriter=MinimalScopeCookieRewriter): + return cookie_rewriter(rewriter).rewrite(cookie_str) From d7eb40af20aa26e42f3b43e60f2fedd234200099 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 23 Nov 2014 18:56:49 -0800 Subject: [PATCH 4/4] rewrite: properly rewrite scheme relative JS-escaped urls: '\/\/example.com', '\\/\\/example.com/', treat same as '//example.com' adding http: prefix --- pywb/rewrite/test/test_regex_rewriters.py | 10 ++++++++++ pywb/rewrite/test/test_url_rewriter.py | 15 +++++++++++++++ pywb/rewrite/url_rewriter.py | 6 ++++-- 3 files changed, 29 insertions(+), 2 deletions(-) diff --git a/pywb/rewrite/test/test_regex_rewriters.py b/pywb/rewrite/test/test_regex_rewriters.py index 253328e5..92975a7f 100644 --- a/pywb/rewrite/test/test_regex_rewriters.py +++ b/pywb/rewrite/test/test_regex_rewriters.py @@ -45,6 +45,16 @@ r""" >>> _test_js('document_domain = "anotherdomain.com"; window.document.domain = "example.com"') 'document_domain = "anotherdomain.com"; window.document.WB_wombat_domain = "example.com"' +# protocol-rel escapes +>>> _test_js('"//example.com/"') +'"/web/20131010/http://example.com/"' + +>>> _test_js(r'"\/\/example.com/"') +'"/web/20131010/http:\\/\\/example.com/"' + +>>> _test_js(r'"\\/\\/example.com/"') +'"/web/20131010/http:\\\\/\\\\/example.com/"' + # custom rules added >>> _test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.format('/*{0}*/'), 0)]) 'window.WB_wombat_location = "/web/20131010/http://example.com/abc.html"; /*some_func(); */' diff --git a/pywb/rewrite/test/test_url_rewriter.py b/pywb/rewrite/test/test_url_rewriter.py index be0ca7da..3d324069 100644 --- a/pywb/rewrite/test/test_url_rewriter.py +++ b/pywb/rewrite/test/test_url_rewriter.py @@ -50,6 +50,21 @@ >>> do_rewrite(r'http:\/\/some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/') 'localhost:8080/20101226101112/http:\\\\/\\\\/some-other-site.com' +>>> do_rewrite(r'//some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/') +'localhost:8080/20101226101112/http://some-other-site.com' + +>>> do_rewrite(r'\/\/some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/') +'localhost:8080/20101226101112/http:\\\\/\\\\/some-other-site.com' + +>>> do_rewrite(r'\\/\\/some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/') +'localhost:8080/20101226101112/http:\\\\/\\\\/some-other-site.com' + +>>> do_rewrite(r'http:\/\/some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/') +'localhost:8080/20101226101112/http:\\\\/\\\\/some-other-site.com' + +>>> do_rewrite(r'http:\/\/some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/') +'localhost:8080/20101226101112/http:\\\\/\\\\/some-other-site.com' + >>> do_rewrite('../../other.html', '2020/http://example.com/index.html', '/') '/2020/http://example.com/other.html' diff --git a/pywb/rewrite/url_rewriter.py b/pywb/rewrite/url_rewriter.py index 61a48e50..aa87260c 100644 --- a/pywb/rewrite/url_rewriter.py +++ b/pywb/rewrite/url_rewriter.py @@ -17,7 +17,9 @@ class UrlRewriter(object): PROTOCOLS = ['http:', 'https:', 'ftp:', 'mms:', 'rtsp:', 'wais:'] - def __init__(self, wburl, prefix, full_prefix=None, rel_prefix=None, + REL_SCHEME = ('//', r'\/\/', r'\\/\\/') + + def __init__(self, wburl, prefix, full_prefix=None, rel_prefix=None, root_path=None, cookie_scope=None): self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl) self.prefix = prefix @@ -45,7 +47,7 @@ class UrlRewriter(object): is_abs = any(url.startswith(x) for x in self.PROTOCOLS) - if url.startswith('//'): + if url.startswith(self.REL_SCHEME): is_abs = True url = 'http:' + url