mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
cookie rewrite improvements: #177
- don't remove max-age and expires if in 'live' rewrite mode (flag set on urlrewriter)
- remove secure only if replay prefix is not https
- fix expires UTC->GMT as cookie parsing chokes on UTC
- other rewriting: don't append rewrite prefix to x- headers
tests: add more cookie rewriting tests
This commit is contained in:
parent
61381fcac6
commit
4a60e15577
@ -1,16 +1,20 @@
|
||||
from six.moves.http_cookies import SimpleCookie, CookieError
|
||||
import six
|
||||
import re
|
||||
|
||||
|
||||
#=================================================================
|
||||
#================================================================
|
||||
class WbUrlBaseCookieRewriter(object):
|
||||
""" Base Cookie rewriter for wburl-based requests.
|
||||
"""
|
||||
UTC_RX = re.compile('((?:.*)Expires=(?:.*))UTC', re.I)
|
||||
|
||||
def __init__(self, url_rewriter):
|
||||
self.url_rewriter = url_rewriter
|
||||
|
||||
def rewrite(self, cookie_str, header='Set-Cookie'):
|
||||
results = []
|
||||
cookie_str = self.UTC_RX.sub('\\1GMT', cookie_str)
|
||||
try:
|
||||
cookie = SimpleCookie(cookie_str)
|
||||
except CookieError:
|
||||
@ -21,17 +25,27 @@ class WbUrlBaseCookieRewriter(object):
|
||||
for name, morsel in six.iteritems(cookie):
|
||||
morsel = self.rewrite_cookie(name, morsel)
|
||||
|
||||
if morsel:
|
||||
path = morsel.get('path')
|
||||
if path:
|
||||
inx = path.find(self.url_rewriter.rel_prefix)
|
||||
if inx > 0:
|
||||
morsel['path'] = path[inx:]
|
||||
|
||||
results.append((header, morsel.OutputString()))
|
||||
self._filter_morsel(morsel)
|
||||
results.append((header, morsel.OutputString()))
|
||||
|
||||
return results
|
||||
|
||||
def _filter_morsel(self, morsel):
|
||||
path = morsel.get('path')
|
||||
if path:
|
||||
inx = path.find(self.url_rewriter.rel_prefix)
|
||||
if inx > 0:
|
||||
morsel['path'] = path[inx:]
|
||||
|
||||
if not self.url_rewriter.full_prefix.startswith('https://'):
|
||||
# also remove secure to avoid issues when
|
||||
# proxying over plain http
|
||||
if morsel.get('secure'):
|
||||
del morsel['secure']
|
||||
|
||||
if not self.url_rewriter.rewrite_opts.get('is_live'):
|
||||
self._remove_age_opts(morsel)
|
||||
|
||||
def _remove_age_opts(self, morsel):
|
||||
# remove expires as it refers to archived time
|
||||
if morsel.get('expires'):
|
||||
@ -41,11 +55,6 @@ class WbUrlBaseCookieRewriter(object):
|
||||
if morsel.get('max-age'):
|
||||
del morsel['max-age']
|
||||
|
||||
# for now, also remove secure to avoid issues when
|
||||
# proxying over plain http (TODO: detect https?)
|
||||
if morsel.get('secure'):
|
||||
del morsel['secure']
|
||||
|
||||
|
||||
#=================================================================
|
||||
class RemoveAllCookiesRewriter(WbUrlBaseCookieRewriter):
|
||||
@ -71,7 +80,6 @@ class MinimalScopeCookieRewriter(WbUrlBaseCookieRewriter):
|
||||
elif morsel.get('path'):
|
||||
morsel['path'] = self.url_rewriter.rewrite(morsel['path'])
|
||||
|
||||
self._remove_age_opts(morsel)
|
||||
return morsel
|
||||
|
||||
|
||||
@ -96,7 +104,6 @@ class HostScopeCookieRewriter(WbUrlBaseCookieRewriter):
|
||||
elif morsel.get('path'):
|
||||
morsel['path'] = self.url_rewriter.rewrite(morsel['path'])
|
||||
|
||||
self._remove_age_opts(morsel)
|
||||
return morsel
|
||||
|
||||
|
||||
@ -116,7 +123,6 @@ class ExactPathCookieRewriter(WbUrlBaseCookieRewriter):
|
||||
if morsel.get('path'):
|
||||
del morsel['path']
|
||||
|
||||
self._remove_age_opts(morsel)
|
||||
return morsel
|
||||
|
||||
|
||||
@ -136,7 +142,6 @@ class RootScopeCookieRewriter(WbUrlBaseCookieRewriter):
|
||||
if morsel.get('domain'):
|
||||
del morsel['domain']
|
||||
|
||||
self._remove_age_opts(morsel)
|
||||
return morsel
|
||||
|
||||
|
||||
@ -154,3 +159,4 @@ def get_cookie_rewriter(cookie_scope):
|
||||
return MinimalScopeCookieRewriter
|
||||
else:
|
||||
return HostScopeCookieRewriter
|
||||
|
||||
|
@ -166,6 +166,9 @@ class HeaderRewriter(object):
|
||||
else:
|
||||
add_prefixed_header(name, value)
|
||||
|
||||
elif lowername.startswith('x-'):
|
||||
add_header(name, value)
|
||||
|
||||
elif urlrewriter:
|
||||
add_prefixed_header(name, value)
|
||||
else:
|
||||
|
@ -106,8 +106,8 @@ class RewriteContent:
|
||||
stream)
|
||||
return (status_headers, self.stream_to_gen(stream), False)
|
||||
|
||||
if wb_url.is_banner_only:
|
||||
urlrewriter = None
|
||||
if urlrewriter and cdx and cdx.get('is_live'):
|
||||
urlrewriter.rewrite_opts['is_live'] = True
|
||||
|
||||
rule = self.ruleset.get_first_match(urlkey)
|
||||
|
||||
|
@ -24,6 +24,26 @@ True
|
||||
>>> rewrite_cookie('abc=def; Path=file.html; Expires=Wed, 13 Jan 2021 22:23:01 GMT', urlrewriter, 'coll')
|
||||
[('Set-Cookie', 'abc=def; Path=/pywb/20131226101010/http://example.com/some/path/file.html')]
|
||||
|
||||
# keep expires
|
||||
>>> rewrite_cookie('abc=def; Path=file.html; Expires=Wed, 13 Jan 2021 22:23:01 GMT', urlrewriter2, 'coll')
|
||||
[('Set-Cookie', 'abc=def; expires=Wed, 13 Jan 2021 22:23:01 GMT; Path=/preview/em_/http://example.com/file.html')]
|
||||
|
||||
# keep expires, UTC->GMT
|
||||
>>> rewrite_cookie('abc=def; Path=file.html; Expires=Wed, 13 Jan 2021 22:23:01 UTC', urlrewriter2, 'coll')
|
||||
[('Set-Cookie', 'abc=def; expires=Wed, 13 Jan 2021 22:23:01 GMT; Path=/preview/em_/http://example.com/file.html')]
|
||||
|
||||
# keep Max-Age
|
||||
>>> rewrite_cookie('abc=def; Path=file.html; Max-Age=1500', urlrewriter2, 'coll')
|
||||
[('Set-Cookie', 'abc=def; Max-Age=1500; Path=/preview/em_/http://example.com/file.html')]
|
||||
|
||||
# Secure Remove
|
||||
>>> rewrite_cookie('abc=def; Path=file.html; HttpOnly; Secure', urlrewriter2, 'coll')
|
||||
[('Set-Cookie', 'abc=def; HttpOnly; Path=/preview/em_/http://example.com/file.html')]
|
||||
|
||||
# Secure Keep
|
||||
>>> rewrite_cookie('abc=def; Path=file.html; HttpOnly; Secure', urlrewriter3, 'coll')
|
||||
[('Set-Cookie', 'abc=def; HttpOnly; Path=/preview/em_/http://example.com/file.html; Secure')]
|
||||
|
||||
# Cookie with invalid chars, not parsed
|
||||
>>> rewrite_cookie('abc@def=123', urlrewriter, 'coll')
|
||||
[]
|
||||
@ -67,15 +87,22 @@ True
|
||||
"""
|
||||
|
||||
|
||||
from pywb.rewrite.cookie_rewriter import MinimalScopeCookieRewriter, get_cookie_rewriter
|
||||
from pywb.rewrite.cookie_rewriter import MinimalScopeCookieRewriter
|
||||
from pywb.rewrite.cookie_rewriter import get_cookie_rewriter
|
||||
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||
|
||||
urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', 'http://localhost:8080/pywb/', rel_prefix='/pywb/')
|
||||
urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html',
|
||||
'http://localhost:8080/pywb/',
|
||||
rel_prefix='/pywb/')
|
||||
|
||||
urlrewriter2 = UrlRewriter('em_/http://example.com/', '/preview/')
|
||||
urlrewriter2.rewrite_opts['is_live'] = True
|
||||
|
||||
urlrewriter3 = UrlRewriter('em_/http://example.com/', 'https://localhost:8080/preview/')
|
||||
|
||||
|
||||
def rewrite_cookie(cookie_str, rewriter=urlrewriter, scope='default'):
|
||||
cookie_rewriter = get_cookie_rewriter(scope)
|
||||
return cookie_rewriter(rewriter).rewrite(cookie_str)
|
||||
|
||||
|
||||
|
@ -31,14 +31,15 @@ HTTP Headers Rewriting
|
||||
'text_type': 'js'}
|
||||
|
||||
# Binary -- transfer-encoding rewritten
|
||||
>>> _test_headers([('Content-Length', '200000'), ('Content-Type', 'image/png'), ('Set-Cookie', 'foo=bar; Path=/;'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')])
|
||||
>>> _test_headers([('Content-Length', '200000'), ('Content-Type', 'image/png'), ('Set-Cookie', 'foo=bar; Path=/;'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked'), ('X-Proxy', 'test')])
|
||||
{'charset': None,
|
||||
'removed_header_dict': {'transfer-encoding': 'chunked'},
|
||||
'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('Content-Length', '200000'),
|
||||
('Content-Type', 'image/png'),
|
||||
('Set-Cookie', 'foo=bar; Path=/web/20131010/http://example.com/'),
|
||||
('Content-Encoding', 'gzip'),
|
||||
('X-Archive-Orig-Transfer-Encoding', 'chunked')]),
|
||||
('X-Archive-Orig-Transfer-Encoding', 'chunked'),
|
||||
('X-Proxy', 'test')]),
|
||||
'text_type': None}
|
||||
|
||||
"""
|
||||
|
@ -20,7 +20,7 @@ class UrlRewriter(object):
|
||||
REL_SCHEME = ('//', r'\/\/', r'\\/\\/')
|
||||
|
||||
def __init__(self, wburl, prefix='', full_prefix=None, rel_prefix=None,
|
||||
root_path=None, cookie_scope=None, rewrite_opts={}):
|
||||
root_path=None, cookie_scope=None, rewrite_opts=None):
|
||||
self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl)
|
||||
self.prefix = prefix
|
||||
self.full_prefix = full_prefix or prefix
|
||||
@ -32,9 +32,9 @@ class UrlRewriter(object):
|
||||
self.prefix_scheme = None
|
||||
self.prefix_abs = self.prefix and self.prefix.startswith(self.PROTOCOLS)
|
||||
self.cookie_scope = cookie_scope
|
||||
self.rewrite_opts = rewrite_opts
|
||||
self.rewrite_opts = rewrite_opts or {}
|
||||
|
||||
if rewrite_opts.get('punycode_links'):
|
||||
if self.rewrite_opts.get('punycode_links'):
|
||||
self.wburl._do_percent_encode = False
|
||||
|
||||
def rewrite(self, url, mod=None):
|
||||
|
@ -116,7 +116,7 @@ class TestProxyLiveRewriter:
|
||||
|
||||
# equal to returned response (echo)
|
||||
assert self.requestlog[0] == resp.text
|
||||
assert resp.headers['x-archive-orig-x-proxy'] == 'test'
|
||||
assert resp.headers['x-proxy'] == 'test'
|
||||
|
||||
assert resp.text.startswith('GET http://example.com/ HTTP/1.1')
|
||||
assert 'referer: http://other.example.com' in resp.text.lower()
|
||||
@ -136,7 +136,7 @@ class TestProxyLiveRewriter:
|
||||
|
||||
# proxied, but without range
|
||||
assert self.requestlog[0] == resp.text
|
||||
assert resp.headers['x-archive-orig-x-proxy'] == 'test'
|
||||
assert resp.headers['x-proxy'] == 'test'
|
||||
|
||||
assert self.requestlog[0].startswith('GET http://example.com/ HTTP/1.1')
|
||||
assert 'range: ' not in self.requestlog[0]
|
||||
@ -153,7 +153,7 @@ class TestProxyLiveRewriter:
|
||||
assert resp.headers['Accept-Ranges'] == 'bytes'
|
||||
|
||||
# not from proxy
|
||||
assert 'x-archive-orig-x-proxy' not in resp.headers
|
||||
assert 'x-proxy' not in resp.headers
|
||||
|
||||
# proxy receives a request also, but w/o range
|
||||
assert len(self.requestlog) == 1
|
||||
@ -182,7 +182,7 @@ class TestProxyLiveRewriter:
|
||||
assert resp.headers['Accept-Ranges'] == 'bytes'
|
||||
|
||||
# not from proxy
|
||||
assert 'x-archive-orig-x-proxy' not in resp.headers
|
||||
assert 'x-proxy' not in resp.headers
|
||||
|
||||
# already pinged proxy, no additional requests set to proxy
|
||||
assert len(self.requestlog) == 0
|
||||
|
Loading…
x
Reference in New Issue
Block a user