1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Support/Improvements to Domain Cookie Cache (#491)

* domain cookie fix:
- don't set cookies for service worker modifiers if response is not 200
- don't add existing cookies to Cookie or Set-Cookie headers
- add sw_/, wkrf_/ modifiers to generate paths
- enable domain cookie caching by default with fakeredis for live index and record mode, keyed by collection
- reqs: add fakeredis, tldextract, update warcio
- tests: add initial tests for domain cookie rewriting
This commit is contained in:
Ilya Kreymer 2019-07-31 14:58:15 -07:00 committed by GitHub
parent 837894a07f
commit ffca45c855
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 84 additions and 19 deletions

View File

@ -1,6 +1,8 @@
from io import BytesIO
import requests
from fakeredis import FakeStrictRedis
from six.moves.urllib.parse import urlencode, urlsplit, urlunsplit
from warcio.bufferedreaders import BufferedReader
from warcio.recordloader import ArcWarcRecordLoader
@ -13,6 +15,7 @@ from pywb.rewrite.rewriteinputreq import RewriteInputRequest
from pywb.rewrite.templateview import BaseInsertView, HeadInsertView, JinjaEnv, TopFrameView
from pywb.rewrite.url_rewriter import IdentityUrlRewriter, UrlRewriter
from pywb.rewrite.wburl import WbUrl
from pywb.rewrite.cookies import CookieTracker
from pywb.utils.canonicalize import canonicalize
from pywb.utils.io import BUFF_SIZE, OffsetLimitReader, no_except_close
from pywb.utils.memento import MementoUtils
@ -81,7 +84,7 @@ class RewriterApp(object):
self.use_js_obj_proxy = config.get('use_js_obj_proxy', True)
self.cookie_tracker = None
self.cookie_tracker = self._init_cookie_tracker()
self.enable_memento = self.config.get('enable_memento')
@ -94,6 +97,9 @@ class RewriterApp(object):
# deprecated: Use X-Forwarded-Proto header instead!
self.force_scheme = config.get('force_scheme')
def _init_cookie_tracker(self):
    """Create the cookie tracker for this app.

    The tracker is constructed around a fresh ``FakeStrictRedis`` instance.
    Subclasses may override this to supply a different tracker.
    """
    redis_backend = FakeStrictRedis()
    tracker = CookieTracker(redis_backend)
    return tracker
def add_csp_header(self, wb_url, status_headers):
if self.csp_header and wb_url.mod == self.replay_mod:
status_headers.headers.append(self.csp_header)
@ -267,10 +273,15 @@ class RewriterApp(object):
range_start, range_end, skip_record = self._check_range(inputreq, wb_url)
setcookie_headers = None
cookie_key = None
if self.cookie_tracker:
cookie_key = self.get_cookie_key(kwargs)
res = self.cookie_tracker.get_cookie_headers(wb_url.url, urlrewriter, cookie_key)
inputreq.extra_cookie, setcookie_headers = res
if cookie_key:
res = self.cookie_tracker.get_cookie_headers(wb_url.url,
urlrewriter,
cookie_key,
environ.get('HTTP_COOKIE', ''))
inputreq.extra_cookie, setcookie_headers = res
r = self._do_req(inputreq, wb_url, kwargs, skip_record)
@ -366,7 +377,12 @@ class RewriterApp(object):
config=self.config))
cookie_rewriter = None
if self.cookie_tracker:
if self.cookie_tracker and cookie_key:
# skip add cookie if service worker is not 200 -- sw will not be loaded by browser
# so don't update any cookies for it
if wb_url.mod == 'sw_' and record.http_headers.get_statuscode() != '200':
cookie_key = None
cookie_rewriter = self.cookie_tracker.get_rewriter(urlrewriter,
cookie_key)
@ -637,7 +653,12 @@ class RewriterApp(object):
return base_url
def get_cookie_key(self, kwargs):
    """Return the domain-cookie cache key for a request, or None.

    A key is only produced when the request targets the live index
    (``kwargs['index'] == '$live'``) or a recording collection
    (``kwargs['type'] == 'record'``); any other request gets ``None``,
    which callers treat as "no cookie caching".

    :param dict kwargs: per-request routing arguments; ``coll`` must be
        present when a key is produced.
    :return: ``'cookie:<coll>'`` or ``None``
    """
    # note: currently this is per-collection, so enabled only for live or recording
    # to support multiple users recording/live, would need per user cookie
    if kwargs.get('index') == '$live' or kwargs.get('type') == 'record':
        return 'cookie:' + kwargs['coll']

    return None
def _add_custom_params(self, cdx, headers, kwargs, record):
pass

View File

@ -41,10 +41,10 @@ class WbUrlBaseCookieRewriter(object):
then assume its meant to be a prefix, and likely needed for
other content.
Set cookie with same prefix but for all common modifiers:
(mp_, js_, cs_, oe_, if_)
(mp_, js_, cs_, oe_, if_, sw_, wkrf_)
"""
curr_mod = self.url_rewriter.wburl.mod
if curr_mod not in ('mp_', 'if_'):
if curr_mod not in ('mp_', 'if_', 'sw_'):
return False
if not morsel.get('httponly'):
@ -54,7 +54,7 @@ class WbUrlBaseCookieRewriter(object):
if not path or not path.endswith('/'):
return False
for mod in ('mp_', 'cs_', 'js_', 'im_', 'oe_', 'if_'):
for mod in ('mp_', 'cs_', 'js_', 'im_', 'oe_', 'if_', 'sw_', 'wkrf_'):
new_path = path.replace(curr_mod + '/', mod + '/')
morsel['path'] = new_path
results.append((header, morsel.OutputString()))

View File

@ -19,7 +19,8 @@ class CookieTracker(object):
def get_rewriter(self, url_rewriter, cookie_key):
    """Return a ``DomainCacheCookieRewriter`` tied to this tracker.

    :param url_rewriter: url rewriter passed through to the cookie rewriter
    :param cookie_key: cookie cache key passed through to the cookie rewriter
    """
    rewriter = DomainCacheCookieRewriter(url_rewriter, self, cookie_key)
    return rewriter
def get_cookie_headers(self, url, url_rewriter, cookie_key):
def get_cookie_headers(self, url, url_rewriter, cookie_key, existing_cookie):
existing_cookie = existing_cookie or ''
subds = self.get_subdomains(url)
host_cookie_rewriter = HostScopeNoFilterCookieRewriter(url_rewriter)
@ -46,7 +47,14 @@ class CookieTracker(object):
n = n.decode('utf-8')
v = v.decode('utf-8')
full = n + '=' + v
n += '='
# if cookie already in existing cookie, don't add duplicate
# also, don't add to set-cookie again (to avoid exceeding cookie size)
if n in existing_cookie:
continue
full = n + v
cookies.append(full.split(';')[0])
full += '; Max-Age=' + str(self.expire_time)
@ -108,7 +116,7 @@ class DomainCacheCookieRewriter(WbUrlBaseCookieRewriter):
# if domain set, no choice but to expand cookie path to root
domain = morsel.pop('domain', '')
if domain:
if domain and self.cookie_key:
#if morsel.get('max-age'):
# morsel['max-age'] = int(morsel['max-age'])

View File

@ -282,12 +282,12 @@ class TestContentRewriter(object):
headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701mp_')
mods = set()
assert len(headers.headers) == 6
assert len(headers.headers) == 8
for name, value in headers.headers:
assert name == 'Set-Cookie'
mods.add(re.search('Path=/prefix/201701([^/]+)', value).group(1))
assert mods == {'mp_', 'cs_', 'js_', 'im_', 'oe_', 'if_'}
assert mods == {'mp_', 'cs_', 'js_', 'im_', 'oe_', 'if_', 'sw_', 'wkrf_'}
assert is_rw == False
def test_rewrite_http_cookie_no_all_mods_no_slash(self):

File diff suppressed because one or more lines are too long

View File

@ -1,5 +1,5 @@
six
warcio>=1.5.2
warcio>=1.7.1
chardet
requests
redis<3.0
@ -13,3 +13,5 @@ gevent==1.4.0
webassets==0.12.1
portalocker
wsgiprox>=1.5.1
fakeredis<1.0
tldextract

View File

@ -27,17 +27,33 @@ def header_test_server(environ, start_response):
return [body]
# ============================================================================
def cookie_test_server(environ, start_response):
    """WSGI test app that echoes the request's ``Cookie`` header.

    The response body is ``'cookie value: '`` followed by the raw
    ``HTTP_COOKIE`` value (empty if absent). Unless ``testcookie`` already
    appears in that body, a ``testcookie`` cookie scoped to
    ``.example.com`` is set on the response.

    :param dict environ: WSGI environ; only ``HTTP_COOKIE`` is read
    :param start_response: WSGI ``start_response`` callable
    :return: single-element list holding the UTF-8 encoded body
    """
    body = 'cookie value: ' + environ.get('HTTP_COOKIE', '')
    body = body.encode('utf-8')

    headers = [('Content-Length', str(len(body))),
               ('Content-Type', 'text/plain')]

    if b'testcookie' not in body:
        headers.append(('Set-Cookie', 'testcookie=cookie-val; Path=/; Domain=.example.com'))

    # WSGI callables must be invoked positionally: servers are free to
    # name the second parameter anything (e.g. ``response_headers``)
    start_response('200 OK', headers)
    return [body]
# ============================================================================
class TestLiveRewriter(HttpBinLiveTests, BaseConfigTest):
@classmethod
def setup_class(cls):
# Class-level fixture: configure the app from 'config_test.yaml' and create
# the GeventServer instances used by the tests in this class.
cls.lint_app = False
super(TestLiveRewriter, cls).setup_class('config_test.yaml')
# NOTE(review): test_serv and header_test_serv both wrap header_test_server --
# presumably residue of renaming test_serv to header_test_serv; confirm
# before removing either, since both names are still referenced elsewhere.
cls.test_serv = GeventServer(header_test_server)
cls.header_test_serv = GeventServer(header_test_server)
# server wrapping cookie_test_server, used by the domain cookie test
cls.cookie_test_serv = GeventServer(cookie_test_server)
@classmethod
def teardown_class(cls):
# Stop every server created in setup_class, then run the base-class teardown.
# NOTE(review): test_serv looks like the pre-rename name of header_test_serv;
# confirm whether both are still needed.
cls.test_serv.stop()
cls.header_test_serv.stop()
cls.cookie_test_serv.stop()
super(TestLiveRewriter, cls).teardown_class()
def test_live_live_1(self, fmod_sl):
@ -94,9 +110,27 @@ class TestLiveRewriter(HttpBinLiveTests, BaseConfigTest):
if six.PY3:
value = value.decode('latin-1')
resp = self.get('/live/{0}http://localhost:%s/unicode' % self.test_serv.port, fmod_sl)
resp = self.get('/live/{0}http://localhost:%s/unicode' % self.header_test_serv.port, fmod_sl)
assert resp.headers['x-utf-8'] == value
def test_domain_cookie(self, fmod_sl):
    """Domain cookie set by the upstream is replayed on later requests.

    Issues three requests, in order, against the cookie test server:
    the first receives the Set-Cookie, the second and third (the latter
    with a ``sub.example.com`` Host) must have the cookie sent upstream.
    """
    port = self.cookie_test_serv.port
    live_url = '/live/{0}http://localhost:%s/' % port

    # 1) no cookie yet: upstream echoes nothing and the cookie is set
    first = self.get(live_url, fmod_sl, headers={'Host': 'example.com'})
    expected_set_cookie = 'testcookie=cookie-val; Path=/live/{0}http://localhost:{1}/'.format(fmod_sl, port)

    assert first.headers['Set-Cookie'] == expected_set_cookie
    assert first.text == 'cookie value: '

    # 2) same host again: the cookie is now sent to the upstream
    second = self.get(live_url, fmod_sl, headers={'Host': 'example.com'})

    assert second.text == 'cookie value: testcookie=cookie-val'

    # 3) subdomain host: cookie still sent upstream, and not set again
    third = self.get(live_url, fmod_sl, headers={'Host': 'sub.example.com'})

    assert 'Set-Cookie' not in third.headers
    assert third.text == 'cookie value: testcookie=cookie-val'
def test_live_live_frame(self):
resp = self.testapp.get('/live/http://example.com/')
assert resp.status_int == 200

2
wombat

@ -1 +1 @@
Subproject commit 20061139c3e08c14ea3361a79a3edd495d0e8f19
Subproject commit acfb37a74bee00c4c483befd7f756551b45b9333