diff --git a/pywb/rewrite/cookie_rewriter.py b/pywb/rewrite/cookie_rewriter.py index 34b8f2c6..9ea80ac7 100644 --- a/pywb/rewrite/cookie_rewriter.py +++ b/pywb/rewrite/cookie_rewriter.py @@ -1,16 +1,20 @@ from six.moves.http_cookies import SimpleCookie, CookieError import six +import re -#================================================================= +#================================================================ class WbUrlBaseCookieRewriter(object): """ Base Cookie rewriter for wburl-based requests. """ + UTC_RX = re.compile('((?:.*)Expires=(?:.*))UTC', re.I) + def __init__(self, url_rewriter): self.url_rewriter = url_rewriter def rewrite(self, cookie_str, header='Set-Cookie'): results = [] + cookie_str = self.UTC_RX.sub('\\1GMT', cookie_str) try: cookie = SimpleCookie(cookie_str) except CookieError: @@ -21,17 +25,27 @@ class WbUrlBaseCookieRewriter(object): for name, morsel in six.iteritems(cookie): morsel = self.rewrite_cookie(name, morsel) - if morsel: - path = morsel.get('path') - if path: - inx = path.find(self.url_rewriter.rel_prefix) - if inx > 0: - morsel['path'] = path[inx:] - - results.append((header, morsel.OutputString())) + self._filter_morsel(morsel) + results.append((header, morsel.OutputString())) return results + def _filter_morsel(self, morsel): + path = morsel.get('path') + if path: + inx = path.find(self.url_rewriter.rel_prefix) + if inx > 0: + morsel['path'] = path[inx:] + + if not self.url_rewriter.full_prefix.startswith('https://'): + # also remove secure to avoid issues when + # proxying over plain http + if morsel.get('secure'): + del morsel['secure'] + + if not self.url_rewriter.rewrite_opts.get('is_live'): + self._remove_age_opts(morsel) + def _remove_age_opts(self, morsel): # remove expires as it refers to archived time if morsel.get('expires'): @@ -41,11 +55,6 @@ class WbUrlBaseCookieRewriter(object): if morsel.get('max-age'): del morsel['max-age'] - # for now, also remove secure to avoid issues when - # proxying over plain http (TODO: detect https?) - if morsel.get('secure'): - del morsel['secure'] - #================================================================= class RemoveAllCookiesRewriter(WbUrlBaseCookieRewriter): @@ -71,7 +80,6 @@ class MinimalScopeCookieRewriter(WbUrlBaseCookieRewriter): elif morsel.get('path'): morsel['path'] = self.url_rewriter.rewrite(morsel['path']) - self._remove_age_opts(morsel) return morsel @@ -96,7 +104,6 @@ class HostScopeCookieRewriter(WbUrlBaseCookieRewriter): elif morsel.get('path'): morsel['path'] = self.url_rewriter.rewrite(morsel['path']) - self._remove_age_opts(morsel) return morsel @@ -116,7 +123,6 @@ class ExactPathCookieRewriter(WbUrlBaseCookieRewriter): if morsel.get('path'): del morsel['path'] - self._remove_age_opts(morsel) return morsel @@ -136,7 +142,6 @@ class RootScopeCookieRewriter(WbUrlBaseCookieRewriter): if morsel.get('domain'): del morsel['domain'] - self._remove_age_opts(morsel) return morsel @@ -154,3 +159,4 @@ def get_cookie_rewriter(cookie_scope): return MinimalScopeCookieRewriter else: return HostScopeCookieRewriter + diff --git a/pywb/rewrite/header_rewriter.py b/pywb/rewrite/header_rewriter.py index 610df546..4c161186 100644 --- a/pywb/rewrite/header_rewriter.py +++ b/pywb/rewrite/header_rewriter.py @@ -166,6 +166,9 @@ class HeaderRewriter(object): else: add_prefixed_header(name, value) + elif lowername.startswith('x-'): + add_header(name, value) + elif urlrewriter: add_prefixed_header(name, value) else: diff --git a/pywb/rewrite/rewrite_content.py b/pywb/rewrite/rewrite_content.py index 9ad1dc0e..5d708103 100644 --- a/pywb/rewrite/rewrite_content.py +++ b/pywb/rewrite/rewrite_content.py @@ -106,8 +106,8 @@ class RewriteContent: stream) return (status_headers, self.stream_to_gen(stream), False) - if wb_url.is_banner_only: - urlrewriter = None + if urlrewriter and cdx and cdx.get('is_live'): + urlrewriter.rewrite_opts['is_live'] = True rule = self.ruleset.get_first_match(urlkey) diff --git a/pywb/rewrite/test/test_cookie_rewriter.py b/pywb/rewrite/test/test_cookie_rewriter.py index e738804e..8dc30544 100644 --- a/pywb/rewrite/test/test_cookie_rewriter.py +++ b/pywb/rewrite/test/test_cookie_rewriter.py @@ -24,6 +24,26 @@ True >>> rewrite_cookie('abc=def; Path=file.html; Expires=Wed, 13 Jan 2021 22:23:01 GMT', urlrewriter, 'coll') [('Set-Cookie', 'abc=def; Path=/pywb/20131226101010/http://example.com/some/path/file.html')] +# keep expires +>>> rewrite_cookie('abc=def; Path=file.html; Expires=Wed, 13 Jan 2021 22:23:01 GMT', urlrewriter2, 'coll') +[('Set-Cookie', 'abc=def; expires=Wed, 13 Jan 2021 22:23:01 GMT; Path=/preview/em_/http://example.com/file.html')] + +# keep expires, UTC->GMT +>>> rewrite_cookie('abc=def; Path=file.html; Expires=Wed, 13 Jan 2021 22:23:01 UTC', urlrewriter2, 'coll') +[('Set-Cookie', 'abc=def; expires=Wed, 13 Jan 2021 22:23:01 GMT; Path=/preview/em_/http://example.com/file.html')] + +# keep Max-Age +>>> rewrite_cookie('abc=def; Path=file.html; Max-Age=1500', urlrewriter2, 'coll') +[('Set-Cookie', 'abc=def; Max-Age=1500; Path=/preview/em_/http://example.com/file.html')] + +# Secure Remove +>>> rewrite_cookie('abc=def; Path=file.html; HttpOnly; Secure', urlrewriter2, 'coll') +[('Set-Cookie', 'abc=def; HttpOnly; Path=/preview/em_/http://example.com/file.html')] + +# Secure Keep +>>> rewrite_cookie('abc=def; Path=file.html; HttpOnly; Secure', urlrewriter3, 'coll') +[('Set-Cookie', 'abc=def; HttpOnly; Path=/preview/em_/http://example.com/file.html; Secure')] + # Cookie with invalid chars, not parsed >>> rewrite_cookie('abc@def=123', urlrewriter, 'coll') [] @@ -67,15 +87,22 @@ True """ -from pywb.rewrite.cookie_rewriter import MinimalScopeCookieRewriter, get_cookie_rewriter +from pywb.rewrite.cookie_rewriter import MinimalScopeCookieRewriter +from pywb.rewrite.cookie_rewriter import get_cookie_rewriter from pywb.rewrite.url_rewriter import UrlRewriter -urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', 'http://localhost:8080/pywb/', rel_prefix='/pywb/') +urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', + 'http://localhost:8080/pywb/', + rel_prefix='/pywb/') urlrewriter2 = UrlRewriter('em_/http://example.com/', '/preview/') +urlrewriter2.rewrite_opts['is_live'] = True + +urlrewriter3 = UrlRewriter('em_/http://example.com/', 'https://localhost:8080/preview/') def rewrite_cookie(cookie_str, rewriter=urlrewriter, scope='default'): cookie_rewriter = get_cookie_rewriter(scope) return cookie_rewriter(rewriter).rewrite(cookie_str) + diff --git a/pywb/rewrite/test/test_header_rewriter.py b/pywb/rewrite/test/test_header_rewriter.py index 6bb40acb..e3a052e6 100644 --- a/pywb/rewrite/test/test_header_rewriter.py +++ b/pywb/rewrite/test/test_header_rewriter.py @@ -31,14 +31,15 @@ HTTP Headers Rewriting 'text_type': 'js'} # Binary -- transfer-encoding rewritten ->>> _test_headers([('Content-Length', '200000'), ('Content-Type', 'image/png'), ('Set-Cookie', 'foo=bar; Path=/;'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')]) +>>> _test_headers([('Content-Length', '200000'), ('Content-Type', 'image/png'), ('Set-Cookie', 'foo=bar; Path=/;'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked'), ('X-Proxy', 'test')]) {'charset': None, 'removed_header_dict': {'transfer-encoding': 'chunked'}, 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('Content-Length', '200000'), ('Content-Type', 'image/png'), ('Set-Cookie', 'foo=bar; Path=/web/20131010/http://example.com/'), ('Content-Encoding', 'gzip'), - ('X-Archive-Orig-Transfer-Encoding', 'chunked')]), + ('X-Archive-Orig-Transfer-Encoding', 'chunked'), + ('X-Proxy', 'test')]), 'text_type': None} """ diff --git a/pywb/rewrite/url_rewriter.py b/pywb/rewrite/url_rewriter.py index 25c04d74..4774bc03 100644 --- a/pywb/rewrite/url_rewriter.py +++ b/pywb/rewrite/url_rewriter.py @@ -20,7 +20,7 @@ class UrlRewriter(object): REL_SCHEME = ('//', r'\/\/', r'\\/\\/') def __init__(self, wburl, prefix='', full_prefix=None, rel_prefix=None, - root_path=None, cookie_scope=None, rewrite_opts={}): + root_path=None, cookie_scope=None, rewrite_opts=None): self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl) self.prefix = prefix self.full_prefix = full_prefix or prefix @@ -32,9 +32,9 @@ class UrlRewriter(object): self.prefix_scheme = None self.prefix_abs = self.prefix and self.prefix.startswith(self.PROTOCOLS) self.cookie_scope = cookie_scope - self.rewrite_opts = rewrite_opts + self.rewrite_opts = rewrite_opts or {} - if rewrite_opts.get('punycode_links'): + if self.rewrite_opts.get('punycode_links'): self.wburl._do_percent_encode = False def rewrite(self, url, mod=None): diff --git a/tests/test_live_proxy.py b/tests/test_live_proxy.py index 14644322..8b4d001b 100644 --- a/tests/test_live_proxy.py +++ b/tests/test_live_proxy.py @@ -116,7 +116,7 @@ class TestProxyLiveRewriter: # equal to returned response (echo) assert self.requestlog[0] == resp.text - assert resp.headers['x-archive-orig-x-proxy'] == 'test' + assert resp.headers['x-proxy'] == 'test' assert resp.text.startswith('GET http://example.com/ HTTP/1.1') assert 'referer: http://other.example.com' in resp.text.lower() @@ -136,7 +136,7 @@ class TestProxyLiveRewriter: # proxied, but without range assert self.requestlog[0] == resp.text - assert resp.headers['x-archive-orig-x-proxy'] == 'test' + assert resp.headers['x-proxy'] == 'test' assert self.requestlog[0].startswith('GET http://example.com/ HTTP/1.1') assert 'range: ' not in self.requestlog[0] @@ -153,7 +153,7 @@ class TestProxyLiveRewriter: assert resp.headers['Accept-Ranges'] == 'bytes' # not from proxy - assert 'x-archive-orig-x-proxy' not in resp.headers + assert 'x-proxy' not in resp.headers # proxy receives a request also, but w/o range assert len(self.requestlog) == 1 @@ -182,7 +182,7 @@ class TestProxyLiveRewriter: assert resp.headers['Accept-Ranges'] == 'bytes' # not from proxy - assert 'x-archive-orig-x-proxy' not in resp.headers + assert 'x-proxy' not in resp.headers # already pinged proxy, no additional requests set to proxy assert len(self.requestlog) == 0