mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
Support/Improvements to Domain Cookie Cache (#491)
* domain cookie fix:
  - don't set cookies for service worker modifiers if response is not 200
  - don't add existing cookies to Cookie or Set-Cookie headers
  - add sw_/, wkrf_/ modifiers to generate paths
  - enable domain cookie caching by default with fakeredis for live index and record mode, keyed by collection
  - reqs: add fakeredis, tldextract, update warcio
  - tests: add initial tests for domain cookie rewriting
This commit is contained in:
parent
837894a07f
commit
ffca45c855
@ -1,6 +1,8 @@
|
|||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
from fakeredis import FakeStrictRedis
|
||||||
|
|
||||||
from six.moves.urllib.parse import urlencode, urlsplit, urlunsplit
|
from six.moves.urllib.parse import urlencode, urlsplit, urlunsplit
|
||||||
from warcio.bufferedreaders import BufferedReader
|
from warcio.bufferedreaders import BufferedReader
|
||||||
from warcio.recordloader import ArcWarcRecordLoader
|
from warcio.recordloader import ArcWarcRecordLoader
|
||||||
@ -13,6 +15,7 @@ from pywb.rewrite.rewriteinputreq import RewriteInputRequest
|
|||||||
from pywb.rewrite.templateview import BaseInsertView, HeadInsertView, JinjaEnv, TopFrameView
|
from pywb.rewrite.templateview import BaseInsertView, HeadInsertView, JinjaEnv, TopFrameView
|
||||||
from pywb.rewrite.url_rewriter import IdentityUrlRewriter, UrlRewriter
|
from pywb.rewrite.url_rewriter import IdentityUrlRewriter, UrlRewriter
|
||||||
from pywb.rewrite.wburl import WbUrl
|
from pywb.rewrite.wburl import WbUrl
|
||||||
|
from pywb.rewrite.cookies import CookieTracker
|
||||||
from pywb.utils.canonicalize import canonicalize
|
from pywb.utils.canonicalize import canonicalize
|
||||||
from pywb.utils.io import BUFF_SIZE, OffsetLimitReader, no_except_close
|
from pywb.utils.io import BUFF_SIZE, OffsetLimitReader, no_except_close
|
||||||
from pywb.utils.memento import MementoUtils
|
from pywb.utils.memento import MementoUtils
|
||||||
@ -81,7 +84,7 @@ class RewriterApp(object):
|
|||||||
|
|
||||||
self.use_js_obj_proxy = config.get('use_js_obj_proxy', True)
|
self.use_js_obj_proxy = config.get('use_js_obj_proxy', True)
|
||||||
|
|
||||||
self.cookie_tracker = None
|
self.cookie_tracker = self._init_cookie_tracker()
|
||||||
|
|
||||||
self.enable_memento = self.config.get('enable_memento')
|
self.enable_memento = self.config.get('enable_memento')
|
||||||
|
|
||||||
@ -94,6 +97,9 @@ class RewriterApp(object):
|
|||||||
# deprecated: Use X-Forwarded-Proto header instead!
|
# deprecated: Use X-Forwarded-Proto header instead!
|
||||||
self.force_scheme = config.get('force_scheme')
|
self.force_scheme = config.get('force_scheme')
|
||||||
|
|
||||||
|
def _init_cookie_tracker(self):
|
||||||
|
return CookieTracker(FakeStrictRedis())
|
||||||
|
|
||||||
def add_csp_header(self, wb_url, status_headers):
|
def add_csp_header(self, wb_url, status_headers):
|
||||||
if self.csp_header and wb_url.mod == self.replay_mod:
|
if self.csp_header and wb_url.mod == self.replay_mod:
|
||||||
status_headers.headers.append(self.csp_header)
|
status_headers.headers.append(self.csp_header)
|
||||||
@ -267,10 +273,15 @@ class RewriterApp(object):
|
|||||||
range_start, range_end, skip_record = self._check_range(inputreq, wb_url)
|
range_start, range_end, skip_record = self._check_range(inputreq, wb_url)
|
||||||
|
|
||||||
setcookie_headers = None
|
setcookie_headers = None
|
||||||
|
cookie_key = None
|
||||||
if self.cookie_tracker:
|
if self.cookie_tracker:
|
||||||
cookie_key = self.get_cookie_key(kwargs)
|
cookie_key = self.get_cookie_key(kwargs)
|
||||||
res = self.cookie_tracker.get_cookie_headers(wb_url.url, urlrewriter, cookie_key)
|
if cookie_key:
|
||||||
inputreq.extra_cookie, setcookie_headers = res
|
res = self.cookie_tracker.get_cookie_headers(wb_url.url,
|
||||||
|
urlrewriter,
|
||||||
|
cookie_key,
|
||||||
|
environ.get('HTTP_COOKIE', ''))
|
||||||
|
inputreq.extra_cookie, setcookie_headers = res
|
||||||
|
|
||||||
r = self._do_req(inputreq, wb_url, kwargs, skip_record)
|
r = self._do_req(inputreq, wb_url, kwargs, skip_record)
|
||||||
|
|
||||||
@ -366,7 +377,12 @@ class RewriterApp(object):
|
|||||||
config=self.config))
|
config=self.config))
|
||||||
|
|
||||||
cookie_rewriter = None
|
cookie_rewriter = None
|
||||||
if self.cookie_tracker:
|
if self.cookie_tracker and cookie_key:
|
||||||
|
# skip add cookie if service worker is not 200 -- sw will not be loaded by browser
|
||||||
|
# so don't update any cookies for it
|
||||||
|
if wb_url.mod == 'sw_' and record.http_headers.get_statuscode() != '200':
|
||||||
|
cookie_key = None
|
||||||
|
|
||||||
cookie_rewriter = self.cookie_tracker.get_rewriter(urlrewriter,
|
cookie_rewriter = self.cookie_tracker.get_rewriter(urlrewriter,
|
||||||
cookie_key)
|
cookie_key)
|
||||||
|
|
||||||
@ -637,7 +653,12 @@ class RewriterApp(object):
|
|||||||
return base_url
|
return base_url
|
||||||
|
|
||||||
def get_cookie_key(self, kwargs):
|
def get_cookie_key(self, kwargs):
|
||||||
raise NotImplemented()
|
# note: currently this is per-collection, so enabled only for live or recording
|
||||||
|
# to support multiple users recording/live, would need per user cookie
|
||||||
|
if kwargs.get('index') == '$live' or kwargs.get('type') == 'record':
|
||||||
|
return 'cookie:' + kwargs['coll']
|
||||||
|
else:
|
||||||
|
return None
|
||||||
|
|
||||||
def _add_custom_params(self, cdx, headers, kwargs, record):
|
def _add_custom_params(self, cdx, headers, kwargs, record):
|
||||||
pass
|
pass
|
||||||
|
@ -41,10 +41,10 @@ class WbUrlBaseCookieRewriter(object):
|
|||||||
then assume its meant to be a prefix, and likely needed for
|
then assume its meant to be a prefix, and likely needed for
|
||||||
other content.
|
other content.
|
||||||
Set cookie with same prefix but for all common modifiers:
|
Set cookie with same prefix but for all common modifiers:
|
||||||
(mp_, js_, cs_, oe_, if_)
|
(mp_, js_, cs_, oe_, if_, sw_, wkrf_)
|
||||||
"""
|
"""
|
||||||
curr_mod = self.url_rewriter.wburl.mod
|
curr_mod = self.url_rewriter.wburl.mod
|
||||||
if curr_mod not in ('mp_', 'if_'):
|
if curr_mod not in ('mp_', 'if_', 'sw_'):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
if not morsel.get('httponly'):
|
if not morsel.get('httponly'):
|
||||||
@ -54,7 +54,7 @@ class WbUrlBaseCookieRewriter(object):
|
|||||||
if not path or not path.endswith('/'):
|
if not path or not path.endswith('/'):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
for mod in ('mp_', 'cs_', 'js_', 'im_', 'oe_', 'if_'):
|
for mod in ('mp_', 'cs_', 'js_', 'im_', 'oe_', 'if_', 'sw_', 'wkrf_'):
|
||||||
new_path = path.replace(curr_mod + '/', mod + '/')
|
new_path = path.replace(curr_mod + '/', mod + '/')
|
||||||
morsel['path'] = new_path
|
morsel['path'] = new_path
|
||||||
results.append((header, morsel.OutputString()))
|
results.append((header, morsel.OutputString()))
|
||||||
|
@ -19,7 +19,8 @@ class CookieTracker(object):
|
|||||||
def get_rewriter(self, url_rewriter, cookie_key):
|
def get_rewriter(self, url_rewriter, cookie_key):
|
||||||
return DomainCacheCookieRewriter(url_rewriter, self, cookie_key)
|
return DomainCacheCookieRewriter(url_rewriter, self, cookie_key)
|
||||||
|
|
||||||
def get_cookie_headers(self, url, url_rewriter, cookie_key):
|
def get_cookie_headers(self, url, url_rewriter, cookie_key, existing_cookie):
|
||||||
|
existing_cookie = existing_cookie or ''
|
||||||
subds = self.get_subdomains(url)
|
subds = self.get_subdomains(url)
|
||||||
host_cookie_rewriter = HostScopeNoFilterCookieRewriter(url_rewriter)
|
host_cookie_rewriter = HostScopeNoFilterCookieRewriter(url_rewriter)
|
||||||
|
|
||||||
@ -46,7 +47,14 @@ class CookieTracker(object):
|
|||||||
n = n.decode('utf-8')
|
n = n.decode('utf-8')
|
||||||
v = v.decode('utf-8')
|
v = v.decode('utf-8')
|
||||||
|
|
||||||
full = n + '=' + v
|
n += '='
|
||||||
|
|
||||||
|
# if cookie already in existing cookie, don't add duplicate
|
||||||
|
# also, don't add to set-cookie again (to avoid exceeding cookie size)
|
||||||
|
if n in existing_cookie:
|
||||||
|
continue
|
||||||
|
|
||||||
|
full = n + v
|
||||||
cookies.append(full.split(';')[0])
|
cookies.append(full.split(';')[0])
|
||||||
|
|
||||||
full += '; Max-Age=' + str(self.expire_time)
|
full += '; Max-Age=' + str(self.expire_time)
|
||||||
@ -108,7 +116,7 @@ class DomainCacheCookieRewriter(WbUrlBaseCookieRewriter):
|
|||||||
# if domain set, no choice but to expand cookie path to root
|
# if domain set, no choice but to expand cookie path to root
|
||||||
domain = morsel.pop('domain', '')
|
domain = morsel.pop('domain', '')
|
||||||
|
|
||||||
if domain:
|
if domain and self.cookie_key:
|
||||||
#if morsel.get('max-age'):
|
#if morsel.get('max-age'):
|
||||||
# morsel['max-age'] = int(morsel['max-age'])
|
# morsel['max-age'] = int(morsel['max-age'])
|
||||||
|
|
||||||
|
@ -282,12 +282,12 @@ class TestContentRewriter(object):
|
|||||||
headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701mp_')
|
headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701mp_')
|
||||||
|
|
||||||
mods = set()
|
mods = set()
|
||||||
assert len(headers.headers) == 6
|
assert len(headers.headers) == 8
|
||||||
for name, value in headers.headers:
|
for name, value in headers.headers:
|
||||||
assert name == 'Set-Cookie'
|
assert name == 'Set-Cookie'
|
||||||
mods.add(re.search('Path=/prefix/201701([^/]+)', value).group(1))
|
mods.add(re.search('Path=/prefix/201701([^/]+)', value).group(1))
|
||||||
|
|
||||||
assert mods == {'mp_', 'cs_', 'js_', 'im_', 'oe_', 'if_'}
|
assert mods == {'mp_', 'cs_', 'js_', 'im_', 'oe_', 'if_', 'sw_', 'wkrf_'}
|
||||||
assert is_rw == False
|
assert is_rw == False
|
||||||
|
|
||||||
def test_rewrite_http_cookie_no_all_mods_no_slash(self):
|
def test_rewrite_http_cookie_no_all_mods_no_slash(self):
|
||||||
|
File diff suppressed because one or more lines are too long
@ -1,5 +1,5 @@
|
|||||||
six
|
six
|
||||||
warcio>=1.5.2
|
warcio>=1.7.1
|
||||||
chardet
|
chardet
|
||||||
requests
|
requests
|
||||||
redis<3.0
|
redis<3.0
|
||||||
@ -13,3 +13,5 @@ gevent==1.4.0
|
|||||||
webassets==0.12.1
|
webassets==0.12.1
|
||||||
portalocker
|
portalocker
|
||||||
wsgiprox>=1.5.1
|
wsgiprox>=1.5.1
|
||||||
|
fakeredis<1.0
|
||||||
|
tldextract
|
||||||
|
@ -27,17 +27,33 @@ def header_test_server(environ, start_response):
|
|||||||
return [body]
|
return [body]
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
def cookie_test_server(environ, start_response):
|
||||||
|
body = 'cookie value: ' + environ.get('HTTP_COOKIE', '')
|
||||||
|
body = body.encode('utf-8')
|
||||||
|
headers = [('Content-Length', str(len(body))),
|
||||||
|
('Content-Type', 'text/plain')]
|
||||||
|
|
||||||
|
if b'testcookie' not in body:
|
||||||
|
headers.append(('Set-Cookie', 'testcookie=cookie-val; Path=/; Domain=.example.com'))
|
||||||
|
|
||||||
|
start_response('200 OK', headers=headers)
|
||||||
|
return [body]
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
class TestLiveRewriter(HttpBinLiveTests, BaseConfigTest):
|
class TestLiveRewriter(HttpBinLiveTests, BaseConfigTest):
|
||||||
@classmethod
|
@classmethod
|
||||||
def setup_class(cls):
|
def setup_class(cls):
|
||||||
cls.lint_app = False
|
cls.lint_app = False
|
||||||
super(TestLiveRewriter, cls).setup_class('config_test.yaml')
|
super(TestLiveRewriter, cls).setup_class('config_test.yaml')
|
||||||
cls.test_serv = GeventServer(header_test_server)
|
cls.header_test_serv = GeventServer(header_test_server)
|
||||||
|
cls.cookie_test_serv = GeventServer(cookie_test_server)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def teardown_class(cls):
|
def teardown_class(cls):
|
||||||
cls.test_serv.stop()
|
cls.header_test_serv.stop()
|
||||||
|
cls.cookie_test_serv.stop()
|
||||||
super(TestLiveRewriter, cls).teardown_class()
|
super(TestLiveRewriter, cls).teardown_class()
|
||||||
|
|
||||||
def test_live_live_1(self, fmod_sl):
|
def test_live_live_1(self, fmod_sl):
|
||||||
@ -94,9 +110,27 @@ class TestLiveRewriter(HttpBinLiveTests, BaseConfigTest):
|
|||||||
if six.PY3:
|
if six.PY3:
|
||||||
value = value.decode('latin-1')
|
value = value.decode('latin-1')
|
||||||
|
|
||||||
resp = self.get('/live/{0}http://localhost:%s/unicode' % self.test_serv.port, fmod_sl)
|
resp = self.get('/live/{0}http://localhost:%s/unicode' % self.header_test_serv.port, fmod_sl)
|
||||||
assert resp.headers['x-utf-8'] == value
|
assert resp.headers['x-utf-8'] == value
|
||||||
|
|
||||||
|
def test_domain_cookie(self, fmod_sl):
|
||||||
|
resp = self.get('/live/{0}http://localhost:%s/' % self.cookie_test_serv.port, fmod_sl,
|
||||||
|
headers={'Host': 'example.com'})
|
||||||
|
|
||||||
|
assert resp.headers['Set-Cookie'] == 'testcookie=cookie-val; Path=/live/{0}http://localhost:{1}/'.format(fmod_sl, self.cookie_test_serv.port)
|
||||||
|
assert resp.text == 'cookie value: '
|
||||||
|
|
||||||
|
resp = self.get('/live/{0}http://localhost:%s/' % self.cookie_test_serv.port, fmod_sl,
|
||||||
|
headers={'Host': 'example.com'})
|
||||||
|
|
||||||
|
assert resp.text == 'cookie value: testcookie=cookie-val'
|
||||||
|
|
||||||
|
resp = self.get('/live/{0}http://localhost:%s/' % self.cookie_test_serv.port, fmod_sl,
|
||||||
|
headers={'Host': 'sub.example.com'})
|
||||||
|
|
||||||
|
assert 'Set-Cookie' not in resp.headers
|
||||||
|
assert resp.text == 'cookie value: testcookie=cookie-val'
|
||||||
|
|
||||||
def test_live_live_frame(self):
|
def test_live_live_frame(self):
|
||||||
resp = self.testapp.get('/live/http://example.com/')
|
resp = self.testapp.get('/live/http://example.com/')
|
||||||
assert resp.status_int == 200
|
assert resp.status_int == 200
|
||||||
|
2
wombat
2
wombat
@ -1 +1 @@
|
|||||||
Subproject commit 20061139c3e08c14ea3361a79a3edd495d0e8f19
|
Subproject commit acfb37a74bee00c4c483befd7f756551b45b9333
|
Loading…
x
Reference in New Issue
Block a user