1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

cookie rewrite improvements: #177

- don't remove max-age and expires when in 'live' rewrite mode (the 'is_live' flag set in the urlrewriter's rewrite_opts)
- remove secure only if the replay prefix is not https
- fix expires: replace a 'UTC' timezone with 'GMT' before parsing, as cookie parsing chokes on UTC
- other rewriting: don't prepend the X-Archive-Orig- rewrite prefix to x- headers
tests: add more cookie rewriting tests
This commit is contained in:
Ilya Kreymer 2016-04-26 09:45:23 -07:00
parent 61381fcac6
commit 4a60e15577
7 changed files with 68 additions and 31 deletions

View File

@ -1,16 +1,20 @@
from six.moves.http_cookies import SimpleCookie, CookieError
import six
import re
#=================================================================
#================================================================
class WbUrlBaseCookieRewriter(object):
""" Base Cookie rewriter for wburl-based requests.
"""
UTC_RX = re.compile('((?:.*)Expires=(?:.*))UTC', re.I)
def __init__(self, url_rewriter):
self.url_rewriter = url_rewriter
def rewrite(self, cookie_str, header='Set-Cookie'):
results = []
cookie_str = self.UTC_RX.sub('\\1GMT', cookie_str)
try:
cookie = SimpleCookie(cookie_str)
except CookieError:
@ -21,17 +25,27 @@ class WbUrlBaseCookieRewriter(object):
for name, morsel in six.iteritems(cookie):
morsel = self.rewrite_cookie(name, morsel)
if morsel:
path = morsel.get('path')
if path:
inx = path.find(self.url_rewriter.rel_prefix)
if inx > 0:
morsel['path'] = path[inx:]
results.append((header, morsel.OutputString()))
self._filter_morsel(morsel)
results.append((header, morsel.OutputString()))
return results
def _filter_morsel(self, morsel):
path = morsel.get('path')
if path:
inx = path.find(self.url_rewriter.rel_prefix)
if inx > 0:
morsel['path'] = path[inx:]
if not self.url_rewriter.full_prefix.startswith('https://'):
# also remove secure to avoid issues when
# proxying over plain http
if morsel.get('secure'):
del morsel['secure']
if not self.url_rewriter.rewrite_opts.get('is_live'):
self._remove_age_opts(morsel)
def _remove_age_opts(self, morsel):
# remove expires as it refers to archived time
if morsel.get('expires'):
@ -41,11 +55,6 @@ class WbUrlBaseCookieRewriter(object):
if morsel.get('max-age'):
del morsel['max-age']
# for now, also remove secure to avoid issues when
# proxying over plain http (TODO: detect https?)
if morsel.get('secure'):
del morsel['secure']
#=================================================================
class RemoveAllCookiesRewriter(WbUrlBaseCookieRewriter):
@ -71,7 +80,6 @@ class MinimalScopeCookieRewriter(WbUrlBaseCookieRewriter):
elif morsel.get('path'):
morsel['path'] = self.url_rewriter.rewrite(morsel['path'])
self._remove_age_opts(morsel)
return morsel
@ -96,7 +104,6 @@ class HostScopeCookieRewriter(WbUrlBaseCookieRewriter):
elif morsel.get('path'):
morsel['path'] = self.url_rewriter.rewrite(morsel['path'])
self._remove_age_opts(morsel)
return morsel
@ -116,7 +123,6 @@ class ExactPathCookieRewriter(WbUrlBaseCookieRewriter):
if morsel.get('path'):
del morsel['path']
self._remove_age_opts(morsel)
return morsel
@ -136,7 +142,6 @@ class RootScopeCookieRewriter(WbUrlBaseCookieRewriter):
if morsel.get('domain'):
del morsel['domain']
self._remove_age_opts(morsel)
return morsel
@ -154,3 +159,4 @@ def get_cookie_rewriter(cookie_scope):
return MinimalScopeCookieRewriter
else:
return HostScopeCookieRewriter

View File

@ -166,6 +166,9 @@ class HeaderRewriter(object):
else:
add_prefixed_header(name, value)
elif lowername.startswith('x-'):
add_header(name, value)
elif urlrewriter:
add_prefixed_header(name, value)
else:

View File

@ -106,8 +106,8 @@ class RewriteContent:
stream)
return (status_headers, self.stream_to_gen(stream), False)
if wb_url.is_banner_only:
urlrewriter = None
if urlrewriter and cdx and cdx.get('is_live'):
urlrewriter.rewrite_opts['is_live'] = True
rule = self.ruleset.get_first_match(urlkey)

View File

@ -24,6 +24,26 @@ True
>>> rewrite_cookie('abc=def; Path=file.html; Expires=Wed, 13 Jan 2021 22:23:01 GMT', urlrewriter, 'coll')
[('Set-Cookie', 'abc=def; Path=/pywb/20131226101010/http://example.com/some/path/file.html')]
# keep expires
>>> rewrite_cookie('abc=def; Path=file.html; Expires=Wed, 13 Jan 2021 22:23:01 GMT', urlrewriter2, 'coll')
[('Set-Cookie', 'abc=def; expires=Wed, 13 Jan 2021 22:23:01 GMT; Path=/preview/em_/http://example.com/file.html')]
# keep expires, UTC->GMT
>>> rewrite_cookie('abc=def; Path=file.html; Expires=Wed, 13 Jan 2021 22:23:01 UTC', urlrewriter2, 'coll')
[('Set-Cookie', 'abc=def; expires=Wed, 13 Jan 2021 22:23:01 GMT; Path=/preview/em_/http://example.com/file.html')]
# keep Max-Age
>>> rewrite_cookie('abc=def; Path=file.html; Max-Age=1500', urlrewriter2, 'coll')
[('Set-Cookie', 'abc=def; Max-Age=1500; Path=/preview/em_/http://example.com/file.html')]
# Secure Remove
>>> rewrite_cookie('abc=def; Path=file.html; HttpOnly; Secure', urlrewriter2, 'coll')
[('Set-Cookie', 'abc=def; HttpOnly; Path=/preview/em_/http://example.com/file.html')]
# Secure Keep
>>> rewrite_cookie('abc=def; Path=file.html; HttpOnly; Secure', urlrewriter3, 'coll')
[('Set-Cookie', 'abc=def; HttpOnly; Path=/preview/em_/http://example.com/file.html; Secure')]
# Cookie with invalid chars, not parsed
>>> rewrite_cookie('abc@def=123', urlrewriter, 'coll')
[]
@ -67,15 +87,22 @@ True
"""
from pywb.rewrite.cookie_rewriter import MinimalScopeCookieRewriter, get_cookie_rewriter
from pywb.rewrite.cookie_rewriter import MinimalScopeCookieRewriter
from pywb.rewrite.cookie_rewriter import get_cookie_rewriter
from pywb.rewrite.url_rewriter import UrlRewriter
urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', 'http://localhost:8080/pywb/', rel_prefix='/pywb/')
urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html',
'http://localhost:8080/pywb/',
rel_prefix='/pywb/')
urlrewriter2 = UrlRewriter('em_/http://example.com/', '/preview/')
urlrewriter2.rewrite_opts['is_live'] = True
urlrewriter3 = UrlRewriter('em_/http://example.com/', 'https://localhost:8080/preview/')
def rewrite_cookie(cookie_str, rewriter=urlrewriter, scope='default'):
    """ Rewrite a cookie header value using the rewriter class for ``scope``.

    :param cookie_str: raw cookie string to rewrite
    :param rewriter: url rewriter handed to the cookie rewriter class
    :param scope: cookie scope passed to ``get_cookie_rewriter``
    :return: whatever the selected cookie rewriter's ``rewrite`` returns
    """
    rewriter_cls = get_cookie_rewriter(scope)
    scoped_rewriter = rewriter_cls(rewriter)
    return scoped_rewriter.rewrite(cookie_str)

View File

@ -31,14 +31,15 @@ HTTP Headers Rewriting
'text_type': 'js'}
# Binary -- transfer-encoding rewritten
>>> _test_headers([('Content-Length', '200000'), ('Content-Type', 'image/png'), ('Set-Cookie', 'foo=bar; Path=/;'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')])
>>> _test_headers([('Content-Length', '200000'), ('Content-Type', 'image/png'), ('Set-Cookie', 'foo=bar; Path=/;'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked'), ('X-Proxy', 'test')])
{'charset': None,
'removed_header_dict': {'transfer-encoding': 'chunked'},
'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('Content-Length', '200000'),
('Content-Type', 'image/png'),
('Set-Cookie', 'foo=bar; Path=/web/20131010/http://example.com/'),
('Content-Encoding', 'gzip'),
('X-Archive-Orig-Transfer-Encoding', 'chunked')]),
('X-Archive-Orig-Transfer-Encoding', 'chunked'),
('X-Proxy', 'test')]),
'text_type': None}
"""

View File

@ -20,7 +20,7 @@ class UrlRewriter(object):
REL_SCHEME = ('//', r'\/\/', r'\\/\\/')
def __init__(self, wburl, prefix='', full_prefix=None, rel_prefix=None,
root_path=None, cookie_scope=None, rewrite_opts={}):
root_path=None, cookie_scope=None, rewrite_opts=None):
self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl)
self.prefix = prefix
self.full_prefix = full_prefix or prefix
@ -32,9 +32,9 @@ class UrlRewriter(object):
self.prefix_scheme = None
self.prefix_abs = self.prefix and self.prefix.startswith(self.PROTOCOLS)
self.cookie_scope = cookie_scope
self.rewrite_opts = rewrite_opts
self.rewrite_opts = rewrite_opts or {}
if rewrite_opts.get('punycode_links'):
if self.rewrite_opts.get('punycode_links'):
self.wburl._do_percent_encode = False
def rewrite(self, url, mod=None):

View File

@ -116,7 +116,7 @@ class TestProxyLiveRewriter:
# equal to returned response (echo)
assert self.requestlog[0] == resp.text
assert resp.headers['x-archive-orig-x-proxy'] == 'test'
assert resp.headers['x-proxy'] == 'test'
assert resp.text.startswith('GET http://example.com/ HTTP/1.1')
assert 'referer: http://other.example.com' in resp.text.lower()
@ -136,7 +136,7 @@ class TestProxyLiveRewriter:
# proxied, but without range
assert self.requestlog[0] == resp.text
assert resp.headers['x-archive-orig-x-proxy'] == 'test'
assert resp.headers['x-proxy'] == 'test'
assert self.requestlog[0].startswith('GET http://example.com/ HTTP/1.1')
assert 'range: ' not in self.requestlog[0]
@ -153,7 +153,7 @@ class TestProxyLiveRewriter:
assert resp.headers['Accept-Ranges'] == 'bytes'
# not from proxy
assert 'x-archive-orig-x-proxy' not in resp.headers
assert 'x-proxy' not in resp.headers
# proxy receives a request also, but w/o range
assert len(self.requestlog) == 1
@ -182,7 +182,7 @@ class TestProxyLiveRewriter:
assert resp.headers['Accept-Ranges'] == 'bytes'
# not from proxy
assert 'x-archive-orig-x-proxy' not in resp.headers
assert 'x-proxy' not in resp.headers
# already pinged proxy, no additional requests set to proxy
assert len(self.requestlog) == 0