1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Support/Improvements to Domain Cookie Cache (#491)

* domain cookie fix:
- don't set cookies for service worker modifiers if response is not 200
- don't add existing cookies to Cookie or Set-Cookie headers
- add sw_/, wkrf_/ modifiers to generate paths
- enable domain cookie caching by default with fakeredis for live index and record mode, keyed by collection
- reqs: add fakeredis, tldextract, update warcio
- tests: add initial tests for domain cookie rewriting
This commit is contained in:
Ilya Kreymer 2019-07-31 14:58:15 -07:00 committed by GitHub
parent 837894a07f
commit ffca45c855
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 84 additions and 19 deletions

View File

@ -1,6 +1,8 @@
from io import BytesIO from io import BytesIO
import requests import requests
from fakeredis import FakeStrictRedis
from six.moves.urllib.parse import urlencode, urlsplit, urlunsplit from six.moves.urllib.parse import urlencode, urlsplit, urlunsplit
from warcio.bufferedreaders import BufferedReader from warcio.bufferedreaders import BufferedReader
from warcio.recordloader import ArcWarcRecordLoader from warcio.recordloader import ArcWarcRecordLoader
@ -13,6 +15,7 @@ from pywb.rewrite.rewriteinputreq import RewriteInputRequest
from pywb.rewrite.templateview import BaseInsertView, HeadInsertView, JinjaEnv, TopFrameView from pywb.rewrite.templateview import BaseInsertView, HeadInsertView, JinjaEnv, TopFrameView
from pywb.rewrite.url_rewriter import IdentityUrlRewriter, UrlRewriter from pywb.rewrite.url_rewriter import IdentityUrlRewriter, UrlRewriter
from pywb.rewrite.wburl import WbUrl from pywb.rewrite.wburl import WbUrl
from pywb.rewrite.cookies import CookieTracker
from pywb.utils.canonicalize import canonicalize from pywb.utils.canonicalize import canonicalize
from pywb.utils.io import BUFF_SIZE, OffsetLimitReader, no_except_close from pywb.utils.io import BUFF_SIZE, OffsetLimitReader, no_except_close
from pywb.utils.memento import MementoUtils from pywb.utils.memento import MementoUtils
@ -81,7 +84,7 @@ class RewriterApp(object):
self.use_js_obj_proxy = config.get('use_js_obj_proxy', True) self.use_js_obj_proxy = config.get('use_js_obj_proxy', True)
self.cookie_tracker = None self.cookie_tracker = self._init_cookie_tracker()
self.enable_memento = self.config.get('enable_memento') self.enable_memento = self.config.get('enable_memento')
@ -94,6 +97,9 @@ class RewriterApp(object):
# deprecated: Use X-Forwarded-Proto header instead! # deprecated: Use X-Forwarded-Proto header instead!
self.force_scheme = config.get('force_scheme') self.force_scheme = config.get('force_scheme')
def _init_cookie_tracker(self):
    """Build the cookie tracker for this app.

    :return: a ``CookieTracker`` constructed with a new
        ``FakeStrictRedis`` instance
    """
    redis = FakeStrictRedis()
    return CookieTracker(redis)
def add_csp_header(self, wb_url, status_headers): def add_csp_header(self, wb_url, status_headers):
if self.csp_header and wb_url.mod == self.replay_mod: if self.csp_header and wb_url.mod == self.replay_mod:
status_headers.headers.append(self.csp_header) status_headers.headers.append(self.csp_header)
@ -267,10 +273,15 @@ class RewriterApp(object):
range_start, range_end, skip_record = self._check_range(inputreq, wb_url) range_start, range_end, skip_record = self._check_range(inputreq, wb_url)
setcookie_headers = None setcookie_headers = None
cookie_key = None
if self.cookie_tracker: if self.cookie_tracker:
cookie_key = self.get_cookie_key(kwargs) cookie_key = self.get_cookie_key(kwargs)
res = self.cookie_tracker.get_cookie_headers(wb_url.url, urlrewriter, cookie_key) if cookie_key:
inputreq.extra_cookie, setcookie_headers = res res = self.cookie_tracker.get_cookie_headers(wb_url.url,
urlrewriter,
cookie_key,
environ.get('HTTP_COOKIE', ''))
inputreq.extra_cookie, setcookie_headers = res
r = self._do_req(inputreq, wb_url, kwargs, skip_record) r = self._do_req(inputreq, wb_url, kwargs, skip_record)
@ -366,7 +377,12 @@ class RewriterApp(object):
config=self.config)) config=self.config))
cookie_rewriter = None cookie_rewriter = None
if self.cookie_tracker: if self.cookie_tracker and cookie_key:
# skip add cookie if service worker is not 200 -- sw will not be loaded by browser
# so don't update any cookies for it
if wb_url.mod == 'sw_' and record.http_headers.get_statuscode() != '200':
cookie_key = None
cookie_rewriter = self.cookie_tracker.get_rewriter(urlrewriter, cookie_rewriter = self.cookie_tracker.get_rewriter(urlrewriter,
cookie_key) cookie_key)
@ -637,7 +653,12 @@ class RewriterApp(object):
return base_url return base_url
def get_cookie_key(self, kwargs): def get_cookie_key(self, kwargs):
raise NotImplemented() # note: currently this is per-collection, so enabled only for live or recording
# to support multiple users recording/live, would need per user cookie
if kwargs.get('index') == '$live' or kwargs.get('type') == 'record':
return 'cookie:' + kwargs['coll']
else:
return None
def _add_custom_params(self, cdx, headers, kwargs, record): def _add_custom_params(self, cdx, headers, kwargs, record):
pass pass

View File

@ -41,10 +41,10 @@ class WbUrlBaseCookieRewriter(object):
then assume its meant to be a prefix, and likely needed for then assume its meant to be a prefix, and likely needed for
other content. other content.
Set cookie with same prefix but for all common modifiers: Set cookie with same prefix but for all common modifiers:
(mp_, js_, cs_, oe_, if_) (mp_, js_, cs_, oe_, if_, sw_, wkrf_)
""" """
curr_mod = self.url_rewriter.wburl.mod curr_mod = self.url_rewriter.wburl.mod
if curr_mod not in ('mp_', 'if_'): if curr_mod not in ('mp_', 'if_', 'sw_'):
return False return False
if not morsel.get('httponly'): if not morsel.get('httponly'):
@ -54,7 +54,7 @@ class WbUrlBaseCookieRewriter(object):
if not path or not path.endswith('/'): if not path or not path.endswith('/'):
return False return False
for mod in ('mp_', 'cs_', 'js_', 'im_', 'oe_', 'if_'): for mod in ('mp_', 'cs_', 'js_', 'im_', 'oe_', 'if_', 'sw_', 'wkrf_'):
new_path = path.replace(curr_mod + '/', mod + '/') new_path = path.replace(curr_mod + '/', mod + '/')
morsel['path'] = new_path morsel['path'] = new_path
results.append((header, morsel.OutputString())) results.append((header, morsel.OutputString()))

View File

@ -19,7 +19,8 @@ class CookieTracker(object):
def get_rewriter(self, url_rewriter, cookie_key): def get_rewriter(self, url_rewriter, cookie_key):
return DomainCacheCookieRewriter(url_rewriter, self, cookie_key) return DomainCacheCookieRewriter(url_rewriter, self, cookie_key)
def get_cookie_headers(self, url, url_rewriter, cookie_key): def get_cookie_headers(self, url, url_rewriter, cookie_key, existing_cookie):
existing_cookie = existing_cookie or ''
subds = self.get_subdomains(url) subds = self.get_subdomains(url)
host_cookie_rewriter = HostScopeNoFilterCookieRewriter(url_rewriter) host_cookie_rewriter = HostScopeNoFilterCookieRewriter(url_rewriter)
@ -46,7 +47,14 @@ class CookieTracker(object):
n = n.decode('utf-8') n = n.decode('utf-8')
v = v.decode('utf-8') v = v.decode('utf-8')
full = n + '=' + v n += '='
# if cookie already in existing cookie, don't add duplicate
# also, don't add to set-cookie again (to avoid exceeding cookie size)
if n in existing_cookie:
continue
full = n + v
cookies.append(full.split(';')[0]) cookies.append(full.split(';')[0])
full += '; Max-Age=' + str(self.expire_time) full += '; Max-Age=' + str(self.expire_time)
@ -108,7 +116,7 @@ class DomainCacheCookieRewriter(WbUrlBaseCookieRewriter):
# if domain set, no choice but to expand cookie path to root # if domain set, no choice but to expand cookie path to root
domain = morsel.pop('domain', '') domain = morsel.pop('domain', '')
if domain: if domain and self.cookie_key:
#if morsel.get('max-age'): #if morsel.get('max-age'):
# morsel['max-age'] = int(morsel['max-age']) # morsel['max-age'] = int(morsel['max-age'])

View File

@ -282,12 +282,12 @@ class TestContentRewriter(object):
headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701mp_') headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701mp_')
mods = set() mods = set()
assert len(headers.headers) == 6 assert len(headers.headers) == 8
for name, value in headers.headers: for name, value in headers.headers:
assert name == 'Set-Cookie' assert name == 'Set-Cookie'
mods.add(re.search('Path=/prefix/201701([^/]+)', value).group(1)) mods.add(re.search('Path=/prefix/201701([^/]+)', value).group(1))
assert mods == {'mp_', 'cs_', 'js_', 'im_', 'oe_', 'if_'} assert mods == {'mp_', 'cs_', 'js_', 'im_', 'oe_', 'if_', 'sw_', 'wkrf_'}
assert is_rw == False assert is_rw == False
def test_rewrite_http_cookie_no_all_mods_no_slash(self): def test_rewrite_http_cookie_no_all_mods_no_slash(self):

File diff suppressed because one or more lines are too long

View File

@ -1,5 +1,5 @@
six six
warcio>=1.5.2 warcio>=1.7.1
chardet chardet
requests requests
redis<3.0 redis<3.0
@ -13,3 +13,5 @@ gevent==1.4.0
webassets==0.12.1 webassets==0.12.1
portalocker portalocker
wsgiprox>=1.5.1 wsgiprox>=1.5.1
fakeredis<1.0
tldextract

View File

@ -27,17 +27,33 @@ def header_test_server(environ, start_response):
return [body] return [body]
# ============================================================================
def cookie_test_server(environ, start_response):
    """WSGI test app that echoes the request's ``Cookie`` header.

    The response body is ``cookie value: `` followed by the value of
    ``HTTP_COOKIE`` from ``environ`` (empty string if absent), encoded as
    UTF-8. If ``testcookie`` does not occur in the body, a ``Set-Cookie``
    header with ``Domain=.example.com`` is added to the response.

    :param environ: WSGI environ dict
    :param start_response: WSGI ``start_response`` callable
    :return: a single-element list containing the encoded body
    """
    body = 'cookie value: ' + environ.get('HTTP_COOKIE', '')
    body = body.encode('utf-8')

    headers = [('Content-Length', str(len(body))),
               ('Content-Type', 'text/plain')]

    # only hand out the cookie if the client did not already send it
    if b'testcookie' not in body:
        headers.append(('Set-Cookie', 'testcookie=cookie-val; Path=/; Domain=.example.com'))

    # PEP 3333: start_response must be invoked with positional arguments;
    # a keyword 'headers=' breaks on servers naming it 'response_headers'
    start_response('200 OK', headers)
    return [body]
# ============================================================================ # ============================================================================
class TestLiveRewriter(HttpBinLiveTests, BaseConfigTest): class TestLiveRewriter(HttpBinLiveTests, BaseConfigTest):
@classmethod @classmethod
def setup_class(cls): def setup_class(cls):
cls.lint_app = False cls.lint_app = False
super(TestLiveRewriter, cls).setup_class('config_test.yaml') super(TestLiveRewriter, cls).setup_class('config_test.yaml')
cls.test_serv = GeventServer(header_test_server) cls.header_test_serv = GeventServer(header_test_server)
cls.cookie_test_serv = GeventServer(cookie_test_server)
@classmethod @classmethod
def teardown_class(cls): def teardown_class(cls):
cls.test_serv.stop() cls.header_test_serv.stop()
cls.cookie_test_serv.stop()
super(TestLiveRewriter, cls).teardown_class() super(TestLiveRewriter, cls).teardown_class()
def test_live_live_1(self, fmod_sl): def test_live_live_1(self, fmod_sl):
@ -94,9 +110,27 @@ class TestLiveRewriter(HttpBinLiveTests, BaseConfigTest):
if six.PY3: if six.PY3:
value = value.decode('latin-1') value = value.decode('latin-1')
resp = self.get('/live/{0}http://localhost:%s/unicode' % self.test_serv.port, fmod_sl) resp = self.get('/live/{0}http://localhost:%s/unicode' % self.header_test_serv.port, fmod_sl)
assert resp.headers['x-utf-8'] == value assert resp.headers['x-utf-8'] == value
def test_domain_cookie(self, fmod_sl):
    """Check that a domain cookie set by the test server is sent back
    on later requests, including one made with a subdomain ``Host``.
    """
    port = self.cookie_test_serv.port
    # '{0}' is left in place for self.get, which also receives fmod_sl
    # (presumably substituting it there -- confirm against the base class)
    live_url = '/live/{0}http://localhost:%s/' % port

    # first request: no cookie sent, so the server responds with Set-Cookie
    first = self.get(live_url, fmod_sl, headers={'Host': 'example.com'})
    expected_cookie = 'testcookie=cookie-val; Path=/live/{0}http://localhost:{1}/'.format(fmod_sl, port)
    assert first.headers['Set-Cookie'] == expected_cookie
    assert first.text == 'cookie value: '

    # second request, same host: the cookie is now echoed back in the body
    second = self.get(live_url, fmod_sl, headers={'Host': 'example.com'})
    assert second.text == 'cookie value: testcookie=cookie-val'

    # third request, subdomain host: cookie still sent, and not set again
    third = self.get(live_url, fmod_sl, headers={'Host': 'sub.example.com'})
    assert 'Set-Cookie' not in third.headers
    assert third.text == 'cookie value: testcookie=cookie-val'
def test_live_live_frame(self): def test_live_live_frame(self):
resp = self.testapp.get('/live/http://example.com/') resp = self.testapp.get('/live/http://example.com/')
assert resp.status_int == 200 assert resp.status_int == 200

2
wombat

@ -1 +1 @@
Subproject commit 20061139c3e08c14ea3361a79a3edd495d0e8f19 Subproject commit acfb37a74bee00c4c483befd7f756551b45b9333