mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
cookie rewrite improvements: #177
- don't remove max-age and expires if in 'live' rewrite mode (flag set on urlrewriter)
- remove secure only if replay prefix is not https
- fix expires UTC->GMT as cookie parsing chokes on UTC
- other rewriting: don't append rewrite prefix to x- headers
tests: add more cookie rewriting tests
This commit is contained in:
parent
61381fcac6
commit
4a60e15577
@ -1,16 +1,20 @@
|
|||||||
from six.moves.http_cookies import SimpleCookie, CookieError
|
from six.moves.http_cookies import SimpleCookie, CookieError
|
||||||
import six
|
import six
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#================================================================
|
||||||
class WbUrlBaseCookieRewriter(object):
|
class WbUrlBaseCookieRewriter(object):
|
||||||
""" Base Cookie rewriter for wburl-based requests.
|
""" Base Cookie rewriter for wburl-based requests.
|
||||||
"""
|
"""
|
||||||
|
UTC_RX = re.compile('((?:.*)Expires=(?:.*))UTC', re.I)
|
||||||
|
|
||||||
def __init__(self, url_rewriter):
|
def __init__(self, url_rewriter):
|
||||||
self.url_rewriter = url_rewriter
|
self.url_rewriter = url_rewriter
|
||||||
|
|
||||||
def rewrite(self, cookie_str, header='Set-Cookie'):
|
def rewrite(self, cookie_str, header='Set-Cookie'):
|
||||||
results = []
|
results = []
|
||||||
|
cookie_str = self.UTC_RX.sub('\\1GMT', cookie_str)
|
||||||
try:
|
try:
|
||||||
cookie = SimpleCookie(cookie_str)
|
cookie = SimpleCookie(cookie_str)
|
||||||
except CookieError:
|
except CookieError:
|
||||||
@ -21,17 +25,27 @@ class WbUrlBaseCookieRewriter(object):
|
|||||||
for name, morsel in six.iteritems(cookie):
|
for name, morsel in six.iteritems(cookie):
|
||||||
morsel = self.rewrite_cookie(name, morsel)
|
morsel = self.rewrite_cookie(name, morsel)
|
||||||
|
|
||||||
if morsel:
|
self._filter_morsel(morsel)
|
||||||
path = morsel.get('path')
|
results.append((header, morsel.OutputString()))
|
||||||
if path:
|
|
||||||
inx = path.find(self.url_rewriter.rel_prefix)
|
|
||||||
if inx > 0:
|
|
||||||
morsel['path'] = path[inx:]
|
|
||||||
|
|
||||||
results.append((header, morsel.OutputString()))
|
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
def _filter_morsel(self, morsel):
|
||||||
|
path = morsel.get('path')
|
||||||
|
if path:
|
||||||
|
inx = path.find(self.url_rewriter.rel_prefix)
|
||||||
|
if inx > 0:
|
||||||
|
morsel['path'] = path[inx:]
|
||||||
|
|
||||||
|
if not self.url_rewriter.full_prefix.startswith('https://'):
|
||||||
|
# also remove secure to avoid issues when
|
||||||
|
# proxying over plain http
|
||||||
|
if morsel.get('secure'):
|
||||||
|
del morsel['secure']
|
||||||
|
|
||||||
|
if not self.url_rewriter.rewrite_opts.get('is_live'):
|
||||||
|
self._remove_age_opts(morsel)
|
||||||
|
|
||||||
def _remove_age_opts(self, morsel):
|
def _remove_age_opts(self, morsel):
|
||||||
# remove expires as it refers to archived time
|
# remove expires as it refers to archived time
|
||||||
if morsel.get('expires'):
|
if morsel.get('expires'):
|
||||||
@ -41,11 +55,6 @@ class WbUrlBaseCookieRewriter(object):
|
|||||||
if morsel.get('max-age'):
|
if morsel.get('max-age'):
|
||||||
del morsel['max-age']
|
del morsel['max-age']
|
||||||
|
|
||||||
# for now, also remove secure to avoid issues when
|
|
||||||
# proxying over plain http (TODO: detect https?)
|
|
||||||
if morsel.get('secure'):
|
|
||||||
del morsel['secure']
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class RemoveAllCookiesRewriter(WbUrlBaseCookieRewriter):
|
class RemoveAllCookiesRewriter(WbUrlBaseCookieRewriter):
|
||||||
@ -71,7 +80,6 @@ class MinimalScopeCookieRewriter(WbUrlBaseCookieRewriter):
|
|||||||
elif morsel.get('path'):
|
elif morsel.get('path'):
|
||||||
morsel['path'] = self.url_rewriter.rewrite(morsel['path'])
|
morsel['path'] = self.url_rewriter.rewrite(morsel['path'])
|
||||||
|
|
||||||
self._remove_age_opts(morsel)
|
|
||||||
return morsel
|
return morsel
|
||||||
|
|
||||||
|
|
||||||
@ -96,7 +104,6 @@ class HostScopeCookieRewriter(WbUrlBaseCookieRewriter):
|
|||||||
elif morsel.get('path'):
|
elif morsel.get('path'):
|
||||||
morsel['path'] = self.url_rewriter.rewrite(morsel['path'])
|
morsel['path'] = self.url_rewriter.rewrite(morsel['path'])
|
||||||
|
|
||||||
self._remove_age_opts(morsel)
|
|
||||||
return morsel
|
return morsel
|
||||||
|
|
||||||
|
|
||||||
@ -116,7 +123,6 @@ class ExactPathCookieRewriter(WbUrlBaseCookieRewriter):
|
|||||||
if morsel.get('path'):
|
if morsel.get('path'):
|
||||||
del morsel['path']
|
del morsel['path']
|
||||||
|
|
||||||
self._remove_age_opts(morsel)
|
|
||||||
return morsel
|
return morsel
|
||||||
|
|
||||||
|
|
||||||
@ -136,7 +142,6 @@ class RootScopeCookieRewriter(WbUrlBaseCookieRewriter):
|
|||||||
if morsel.get('domain'):
|
if morsel.get('domain'):
|
||||||
del morsel['domain']
|
del morsel['domain']
|
||||||
|
|
||||||
self._remove_age_opts(morsel)
|
|
||||||
return morsel
|
return morsel
|
||||||
|
|
||||||
|
|
||||||
@ -154,3 +159,4 @@ def get_cookie_rewriter(cookie_scope):
|
|||||||
return MinimalScopeCookieRewriter
|
return MinimalScopeCookieRewriter
|
||||||
else:
|
else:
|
||||||
return HostScopeCookieRewriter
|
return HostScopeCookieRewriter
|
||||||
|
|
||||||
|
@ -166,6 +166,9 @@ class HeaderRewriter(object):
|
|||||||
else:
|
else:
|
||||||
add_prefixed_header(name, value)
|
add_prefixed_header(name, value)
|
||||||
|
|
||||||
|
elif lowername.startswith('x-'):
|
||||||
|
add_header(name, value)
|
||||||
|
|
||||||
elif urlrewriter:
|
elif urlrewriter:
|
||||||
add_prefixed_header(name, value)
|
add_prefixed_header(name, value)
|
||||||
else:
|
else:
|
||||||
|
@ -106,8 +106,8 @@ class RewriteContent:
|
|||||||
stream)
|
stream)
|
||||||
return (status_headers, self.stream_to_gen(stream), False)
|
return (status_headers, self.stream_to_gen(stream), False)
|
||||||
|
|
||||||
if wb_url.is_banner_only:
|
if urlrewriter and cdx and cdx.get('is_live'):
|
||||||
urlrewriter = None
|
urlrewriter.rewrite_opts['is_live'] = True
|
||||||
|
|
||||||
rule = self.ruleset.get_first_match(urlkey)
|
rule = self.ruleset.get_first_match(urlkey)
|
||||||
|
|
||||||
|
@ -24,6 +24,26 @@ True
|
|||||||
>>> rewrite_cookie('abc=def; Path=file.html; Expires=Wed, 13 Jan 2021 22:23:01 GMT', urlrewriter, 'coll')
|
>>> rewrite_cookie('abc=def; Path=file.html; Expires=Wed, 13 Jan 2021 22:23:01 GMT', urlrewriter, 'coll')
|
||||||
[('Set-Cookie', 'abc=def; Path=/pywb/20131226101010/http://example.com/some/path/file.html')]
|
[('Set-Cookie', 'abc=def; Path=/pywb/20131226101010/http://example.com/some/path/file.html')]
|
||||||
|
|
||||||
|
# keep expires
|
||||||
|
>>> rewrite_cookie('abc=def; Path=file.html; Expires=Wed, 13 Jan 2021 22:23:01 GMT', urlrewriter2, 'coll')
|
||||||
|
[('Set-Cookie', 'abc=def; expires=Wed, 13 Jan 2021 22:23:01 GMT; Path=/preview/em_/http://example.com/file.html')]
|
||||||
|
|
||||||
|
# keep expires, UTC->GMT
|
||||||
|
>>> rewrite_cookie('abc=def; Path=file.html; Expires=Wed, 13 Jan 2021 22:23:01 UTC', urlrewriter2, 'coll')
|
||||||
|
[('Set-Cookie', 'abc=def; expires=Wed, 13 Jan 2021 22:23:01 GMT; Path=/preview/em_/http://example.com/file.html')]
|
||||||
|
|
||||||
|
# keep Max-Age
|
||||||
|
>>> rewrite_cookie('abc=def; Path=file.html; Max-Age=1500', urlrewriter2, 'coll')
|
||||||
|
[('Set-Cookie', 'abc=def; Max-Age=1500; Path=/preview/em_/http://example.com/file.html')]
|
||||||
|
|
||||||
|
# Secure Remove
|
||||||
|
>>> rewrite_cookie('abc=def; Path=file.html; HttpOnly; Secure', urlrewriter2, 'coll')
|
||||||
|
[('Set-Cookie', 'abc=def; HttpOnly; Path=/preview/em_/http://example.com/file.html')]
|
||||||
|
|
||||||
|
# Secure Keep
|
||||||
|
>>> rewrite_cookie('abc=def; Path=file.html; HttpOnly; Secure', urlrewriter3, 'coll')
|
||||||
|
[('Set-Cookie', 'abc=def; HttpOnly; Path=/preview/em_/http://example.com/file.html; Secure')]
|
||||||
|
|
||||||
# Cookie with invalid chars, not parsed
|
# Cookie with invalid chars, not parsed
|
||||||
>>> rewrite_cookie('abc@def=123', urlrewriter, 'coll')
|
>>> rewrite_cookie('abc@def=123', urlrewriter, 'coll')
|
||||||
[]
|
[]
|
||||||
@ -67,15 +87,22 @@ True
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
from pywb.rewrite.cookie_rewriter import MinimalScopeCookieRewriter, get_cookie_rewriter
|
from pywb.rewrite.cookie_rewriter import MinimalScopeCookieRewriter
|
||||||
|
from pywb.rewrite.cookie_rewriter import get_cookie_rewriter
|
||||||
from pywb.rewrite.url_rewriter import UrlRewriter
|
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||||
|
|
||||||
urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', 'http://localhost:8080/pywb/', rel_prefix='/pywb/')
|
urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html',
|
||||||
|
'http://localhost:8080/pywb/',
|
||||||
|
rel_prefix='/pywb/')
|
||||||
|
|
||||||
urlrewriter2 = UrlRewriter('em_/http://example.com/', '/preview/')
|
urlrewriter2 = UrlRewriter('em_/http://example.com/', '/preview/')
|
||||||
|
urlrewriter2.rewrite_opts['is_live'] = True
|
||||||
|
|
||||||
|
urlrewriter3 = UrlRewriter('em_/http://example.com/', 'https://localhost:8080/preview/')
|
||||||
|
|
||||||
|
|
||||||
def rewrite_cookie(cookie_str, rewriter=urlrewriter, scope='default'):
|
def rewrite_cookie(cookie_str, rewriter=urlrewriter, scope='default'):
|
||||||
cookie_rewriter = get_cookie_rewriter(scope)
|
cookie_rewriter = get_cookie_rewriter(scope)
|
||||||
return cookie_rewriter(rewriter).rewrite(cookie_str)
|
return cookie_rewriter(rewriter).rewrite(cookie_str)
|
||||||
|
|
||||||
|
|
||||||
|
@ -31,14 +31,15 @@ HTTP Headers Rewriting
|
|||||||
'text_type': 'js'}
|
'text_type': 'js'}
|
||||||
|
|
||||||
# Binary -- transfer-encoding rewritten
|
# Binary -- transfer-encoding rewritten
|
||||||
>>> _test_headers([('Content-Length', '200000'), ('Content-Type', 'image/png'), ('Set-Cookie', 'foo=bar; Path=/;'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')])
|
>>> _test_headers([('Content-Length', '200000'), ('Content-Type', 'image/png'), ('Set-Cookie', 'foo=bar; Path=/;'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked'), ('X-Proxy', 'test')])
|
||||||
{'charset': None,
|
{'charset': None,
|
||||||
'removed_header_dict': {'transfer-encoding': 'chunked'},
|
'removed_header_dict': {'transfer-encoding': 'chunked'},
|
||||||
'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('Content-Length', '200000'),
|
'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('Content-Length', '200000'),
|
||||||
('Content-Type', 'image/png'),
|
('Content-Type', 'image/png'),
|
||||||
('Set-Cookie', 'foo=bar; Path=/web/20131010/http://example.com/'),
|
('Set-Cookie', 'foo=bar; Path=/web/20131010/http://example.com/'),
|
||||||
('Content-Encoding', 'gzip'),
|
('Content-Encoding', 'gzip'),
|
||||||
('X-Archive-Orig-Transfer-Encoding', 'chunked')]),
|
('X-Archive-Orig-Transfer-Encoding', 'chunked'),
|
||||||
|
('X-Proxy', 'test')]),
|
||||||
'text_type': None}
|
'text_type': None}
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
@ -20,7 +20,7 @@ class UrlRewriter(object):
|
|||||||
REL_SCHEME = ('//', r'\/\/', r'\\/\\/')
|
REL_SCHEME = ('//', r'\/\/', r'\\/\\/')
|
||||||
|
|
||||||
def __init__(self, wburl, prefix='', full_prefix=None, rel_prefix=None,
|
def __init__(self, wburl, prefix='', full_prefix=None, rel_prefix=None,
|
||||||
root_path=None, cookie_scope=None, rewrite_opts={}):
|
root_path=None, cookie_scope=None, rewrite_opts=None):
|
||||||
self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl)
|
self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl)
|
||||||
self.prefix = prefix
|
self.prefix = prefix
|
||||||
self.full_prefix = full_prefix or prefix
|
self.full_prefix = full_prefix or prefix
|
||||||
@ -32,9 +32,9 @@ class UrlRewriter(object):
|
|||||||
self.prefix_scheme = None
|
self.prefix_scheme = None
|
||||||
self.prefix_abs = self.prefix and self.prefix.startswith(self.PROTOCOLS)
|
self.prefix_abs = self.prefix and self.prefix.startswith(self.PROTOCOLS)
|
||||||
self.cookie_scope = cookie_scope
|
self.cookie_scope = cookie_scope
|
||||||
self.rewrite_opts = rewrite_opts
|
self.rewrite_opts = rewrite_opts or {}
|
||||||
|
|
||||||
if rewrite_opts.get('punycode_links'):
|
if self.rewrite_opts.get('punycode_links'):
|
||||||
self.wburl._do_percent_encode = False
|
self.wburl._do_percent_encode = False
|
||||||
|
|
||||||
def rewrite(self, url, mod=None):
|
def rewrite(self, url, mod=None):
|
||||||
|
@ -116,7 +116,7 @@ class TestProxyLiveRewriter:
|
|||||||
|
|
||||||
# equal to returned response (echo)
|
# equal to returned response (echo)
|
||||||
assert self.requestlog[0] == resp.text
|
assert self.requestlog[0] == resp.text
|
||||||
assert resp.headers['x-archive-orig-x-proxy'] == 'test'
|
assert resp.headers['x-proxy'] == 'test'
|
||||||
|
|
||||||
assert resp.text.startswith('GET http://example.com/ HTTP/1.1')
|
assert resp.text.startswith('GET http://example.com/ HTTP/1.1')
|
||||||
assert 'referer: http://other.example.com' in resp.text.lower()
|
assert 'referer: http://other.example.com' in resp.text.lower()
|
||||||
@ -136,7 +136,7 @@ class TestProxyLiveRewriter:
|
|||||||
|
|
||||||
# proxied, but without range
|
# proxied, but without range
|
||||||
assert self.requestlog[0] == resp.text
|
assert self.requestlog[0] == resp.text
|
||||||
assert resp.headers['x-archive-orig-x-proxy'] == 'test'
|
assert resp.headers['x-proxy'] == 'test'
|
||||||
|
|
||||||
assert self.requestlog[0].startswith('GET http://example.com/ HTTP/1.1')
|
assert self.requestlog[0].startswith('GET http://example.com/ HTTP/1.1')
|
||||||
assert 'range: ' not in self.requestlog[0]
|
assert 'range: ' not in self.requestlog[0]
|
||||||
@ -153,7 +153,7 @@ class TestProxyLiveRewriter:
|
|||||||
assert resp.headers['Accept-Ranges'] == 'bytes'
|
assert resp.headers['Accept-Ranges'] == 'bytes'
|
||||||
|
|
||||||
# not from proxy
|
# not from proxy
|
||||||
assert 'x-archive-orig-x-proxy' not in resp.headers
|
assert 'x-proxy' not in resp.headers
|
||||||
|
|
||||||
# proxy receives a request also, but w/o range
|
# proxy receives a request also, but w/o range
|
||||||
assert len(self.requestlog) == 1
|
assert len(self.requestlog) == 1
|
||||||
@ -182,7 +182,7 @@ class TestProxyLiveRewriter:
|
|||||||
assert resp.headers['Accept-Ranges'] == 'bytes'
|
assert resp.headers['Accept-Ranges'] == 'bytes'
|
||||||
|
|
||||||
# not from proxy
|
# not from proxy
|
||||||
assert 'x-archive-orig-x-proxy' not in resp.headers
|
assert 'x-proxy' not in resp.headers
|
||||||
|
|
||||||
# already pinged proxy, no additional requests set to proxy
|
# already pinged proxy, no additional requests set to proxy
|
||||||
assert len(self.requestlog) == 0
|
assert len(self.requestlog) == 0
|
||||||
|
Loading…
x
Reference in New Issue
Block a user