diff --git a/pywb/rewrite/cookie_rewriter.py b/pywb/rewrite/cookie_rewriter.py index 4724df4c..e9dd80ac 100644 --- a/pywb/rewrite/cookie_rewriter.py +++ b/pywb/rewrite/cookie_rewriter.py @@ -55,6 +55,24 @@ class MinimalScopeCookieRewriter(WbUrlBaseCookieRewriter): return morsel +#================================================================= +class ExactPathCookieRewriter(WbUrlBaseCookieRewriter): + """ + Rewrite cookies only using exact path, useful for live rewrite + without a timestamp and to minimize cookie pollution + + If path or domain present, simply remove + """ + + def rewrite_cookie(self, name, morsel): + if morsel.get('domain'): + del morsel['domain'] + # else set cookie to rewritten path + if morsel.get('path'): + del morsel['path'] + + self._remove_age_opts(morsel) + return morsel #================================================================= class RootScopeCookieRewriter(WbUrlBaseCookieRewriter): """ @@ -79,5 +97,7 @@ class RootScopeCookieRewriter(WbUrlBaseCookieRewriter): def get_cookie_rewriter(cookie_scope): if cookie_scope == 'root': return RootScopeCookieRewriter + elif cookie_scope == 'exact': + return ExactPathCookieRewriter else: return MinimalScopeCookieRewriter diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py index b3668521..7acc0f3e 100644 --- a/pywb/rewrite/html_rewriter.py +++ b/pywb/rewrite/html_rewriter.py @@ -185,6 +185,12 @@ class HTMLRewriterMixin(object): elif attr_name == 'crossorigin': attr_name = '_crossorigin' + # special case: link don't rewrite canonical + elif tag == 'link' and attr_name == 'href': + if not self.has_attr(tag_attrs, ('rel', 'canonical')): + rw_mod = handler.get(attr_name) + attr_value = self._rewrite_url(attr_value, rw_mod) + # special case: meta tag elif (tag == 'meta') and (attr_name == 'content'): if self.has_attr(tag_attrs, ('http-equiv', 'refresh')): diff --git a/pywb/rewrite/test/test_cookie_rewriter.py b/pywb/rewrite/test/test_cookie_rewriter.py index c20f56f9..4f57464f 100644 --- a/pywb/rewrite/test/test_cookie_rewriter.py +++ b/pywb/rewrite/test/test_cookie_rewriter.py @@ -1,4 +1,5 @@ r""" +# Default -- MinimalScopeRewriter # No rewriting >>> rewrite_cookie('a=b; c=d;') [('Set-Cookie', 'a=b'), ('Set-Cookie', 'c=d')] @@ -23,10 +24,17 @@ r""" >>> rewrite_cookie('abc@def=123') [] +# ExactCookieRewriter +>>> rewrite_cookie('some=value; Path=/diff/path/;', urlrewriter, ExactPathCookieRewriter) +[('Set-Cookie', 'some=value')] + +>>> rewrite_cookie('some=value; Domain=.example.com; Path=/diff/path/; Max-Age=1500', urlrewriter, ExactPathCookieRewriter) +[('Set-Cookie', 'some=value')] + """ -from pywb.rewrite.cookie_rewriter import MinimalScopeCookieRewriter +from pywb.rewrite.cookie_rewriter import MinimalScopeCookieRewriter, ExactPathCookieRewriter from pywb.rewrite.url_rewriter import UrlRewriter urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/') @@ -34,6 +42,6 @@ urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.htm urlrewriter2 = UrlRewriter('em_/http://example.com/', '/preview/') -def rewrite_cookie(cookie_str, rewriter=urlrewriter): - return MinimalScopeCookieRewriter(rewriter).rewrite(cookie_str) +def rewrite_cookie(cookie_str, rewriter=urlrewriter, cookie_rewriter=MinimalScopeCookieRewriter): + return cookie_rewriter(rewriter).rewrite(cookie_str) diff --git a/pywb/rewrite/test/test_html_rewriter.py b/pywb/rewrite/test/test_html_rewriter.py index a7d9337b..46c1773c 100644 --- a/pywb/rewrite/test/test_html_rewriter.py +++ b/pywb/rewrite/test/test_html_rewriter.py @@ -106,6 +106,10 @@ ur""" >>> parse('
SomeTest
', head_insert = '')
SomeTest
+# don't rewrite rel=canonical +>>> parse('') + + # doctype >>> parse('') diff --git a/pywb/rewrite/test/test_regex_rewriters.py b/pywb/rewrite/test/test_regex_rewriters.py index 253328e5..92975a7f 100644 --- a/pywb/rewrite/test/test_regex_rewriters.py +++ b/pywb/rewrite/test/test_regex_rewriters.py @@ -45,6 +45,16 @@ r""" >>> _test_js('document_domain = "anotherdomain.com"; window.document.domain = "example.com"') 'document_domain = "anotherdomain.com"; window.document.WB_wombat_domain = "example.com"' +# protocol-rel escapes +>>> _test_js('"//example.com/"') +'"/web/20131010/http://example.com/"' + +>>> _test_js(r'"\/\/example.com/"') +'"/web/20131010/http:\\/\\/example.com/"' + +>>> _test_js(r'"\\/\\/example.com/"') +'"/web/20131010/http:\\\\/\\\\/example.com/"' + # custom rules added >>> _test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.format('/*{0}*/'), 0)]) 'window.WB_wombat_location = "/web/20131010/http://example.com/abc.html"; /*some_func(); */' diff --git a/pywb/rewrite/test/test_url_rewriter.py b/pywb/rewrite/test/test_url_rewriter.py index be0ca7da..3d324069 100644 --- a/pywb/rewrite/test/test_url_rewriter.py +++ b/pywb/rewrite/test/test_url_rewriter.py @@ -50,6 +50,21 @@ >>> do_rewrite(r'http:\/\/some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/') 'localhost:8080/20101226101112/http:\\\\/\\\\/some-other-site.com' +>>> do_rewrite(r'//some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/') +'localhost:8080/20101226101112/http://some-other-site.com' + +>>> do_rewrite(r'\/\/some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/') +'localhost:8080/20101226101112/http:\\\\/\\\\/some-other-site.com' + +>>> do_rewrite(r'\\/\\/some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/') +'localhost:8080/20101226101112/http:\\\\/\\\\/some-other-site.com' + +>>> do_rewrite(r'http:\/\/some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/') +'localhost:8080/20101226101112/http:\\\\/\\\\/some-other-site.com' + +>>> do_rewrite(r'http:\/\/some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/') +'localhost:8080/20101226101112/http:\\\\/\\\\/some-other-site.com' + >>> do_rewrite('../../other.html', '2020/http://example.com/index.html', '/') '/2020/http://example.com/other.html' diff --git a/pywb/rewrite/url_rewriter.py b/pywb/rewrite/url_rewriter.py index a162f67e..4ef332d4 100644 --- a/pywb/rewrite/url_rewriter.py +++ b/pywb/rewrite/url_rewriter.py @@ -17,7 +17,9 @@ class UrlRewriter(object): PROTOCOLS = ['http:', 'https:', 'ftp:', 'mms:', 'rtsp:', 'wais:'] - def __init__(self, wburl, prefix, full_prefix=None, rel_prefix=None, + REL_SCHEME = ('//', r'\/\/', r'\\/\\/') + + def __init__(self, wburl, prefix, full_prefix=None, rel_prefix=None, root_path=None, cookie_scope=None): self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl) self.prefix = prefix @@ -45,7 +47,7 @@ class UrlRewriter(object): is_abs = any(url.startswith(x) for x in self.PROTOCOLS) - if url.startswith('//'): + if url.startswith(self.REL_SCHEME): is_abs = True url = 'http:' + url