mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
cookie_tracker: add support for a redis-based subdomain cookie tracker, which temporarily caches cookies that have Domain= set in redis and passes them upstream
when rewriting. Addresses webrecorder/webrecorder#79
This commit is contained in:
parent
228ca58c5b
commit
ab3af90df2
139
urlrewrite/cookies.py
Normal file
139
urlrewrite/cookies.py
Normal file
@ -0,0 +1,139 @@
|
|||||||
|
from pywb.rewrite.cookie_rewriter import WbUrlBaseCookieRewriter
|
||||||
|
from pywb.utils.timeutils import datetime_to_http_date
|
||||||
|
from six.moves.http_cookiejar import CookieJar, DefaultCookiePolicy
|
||||||
|
|
||||||
|
import redis
|
||||||
|
|
||||||
|
import tldextract
|
||||||
|
import time
|
||||||
|
import datetime
|
||||||
|
import six
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
class CookieTracker(object):
    """Looks up cookies cached in redis for the parent domains of a url.

    Cookies are read from redis hashes named ``<cookie_key>.<domain>``,
    one hash per parent domain of the requested url. Each hash maps a
    cookie name to its value plus any attributes, joined with ``'; '``.
    """

    def __init__(self, redis):
        """Store the redis client used for all lookups.

        :param redis: redis client object; it must provide ``pipeline()``.
        """
        self.redis = redis

    def get_rewriter(self, url_rewriter, cookie_key):
        """Return a ``DomainCacheCookieRewriter`` sharing this redis client.

        :param url_rewriter: url rewriter passed through to the rewriter.
        :param cookie_key: redis key prefix passed through to the rewriter.
        """
        return DomainCacheCookieRewriter(url_rewriter,
                                         self.redis,
                                         cookie_key)

    def get_cookie_headers(self, url, cookie_key):
        """Collect the cached cookies that apply to ``url``.

        :param url: url being requested.
        :param cookie_key: redis key prefix; hashes are read from
            ``cookie_key + '.' + <parent domain>``.
        :return: ``(cookies, set_cookies)``. ``cookies`` is a
            ``;``-joined string of ``name=value`` pairs and ``set_cookies``
            is a list of ``('Set-Cookie', ...)`` tuples, each with
            ``Max-Age=120`` appended. Returns ``(None, None)`` when the url
            has no subdomain.
        """
        subds = self.get_subdomains(url)
        if not subds:
            return None, None

        # Queue one HGETALL per parent domain and run them in a single
        # round trip. The pipeline is executed exactly once, here.
        pi = self.redis.pipeline()
        for subd in subds:
            pi.hgetall(cookie_key + '.' + subd)

        all_res = pi.execute()

        cookies = []
        set_cookies = []

        for res in all_res:
            if not res:
                continue

            for name, value in res.items():
                full = self._to_text(name) + '=' + self._to_text(value)
                # Only name=value goes upstream; attributes such as Path
                # stay in the Set-Cookie header.
                cookies.append(full.split(';')[0])
                set_cookies.append(('Set-Cookie', full + '; Max-Age=120'))

        return ';'.join(cookies), set_cookies

    @staticmethod
    def get_subdomains(url):
        """List the parent domains of the host in ``url``.

        For ``a.b.example.com`` this yields
        ``['b.example.com', 'example.com']``; the full host itself is not
        included.

        :param url: url (or host) to split with ``tldextract``.
        :return: list of parent domains, or ``None`` if the host has no
            subdomain part.
        """
        tld = tldextract.extract(url)

        if not tld.subdomain:
            return None

        # Skip an empty suffix so that hosts such as "sub.localhost" do not
        # produce a trailing-dot domain ("localhost.").
        main = '.'.join(part for part in (tld.domain, tld.suffix) if part)
        full = tld.subdomain + '.' + main

        return CookieTracker._parent_domains(main, full)

    @staticmethod
    def _parent_domains(main, full):
        """Strip leading labels from ``full`` until ``main`` is reached."""
        doms = []
        # The '.' guard stops the loop if ``full`` is not below ``main``.
        while full != main and '.' in full:
            full = full.split('.', 1)[1]
            doms.append(full)

        return doms

    @staticmethod
    def _to_text(value):
        """Decode ``bytes`` as UTF-8; return any other value unchanged."""
        if isinstance(value, bytes):
            return value.decode('utf-8')

        return value
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
class DomainCacheCookieRewriter(WbUrlBaseCookieRewriter):
    """Cookie rewriter that caches ``Domain=`` cookies in redis.

    A cookie carrying a ``Domain`` attribute has that attribute removed
    and a copy stored in the redis hash ``<cookie_key>.<domain>`` with a
    120 second expiry. ``CookieTracker.get_cookie_headers`` reads the same
    key layout.
    """

    def __init__(self, url_rewriter, redis, cookie_key):
        """
        :param url_rewriter: url rewriter, passed to the base class and
            used to rewrite cookie paths.
        :param redis: redis client object used for caching.
        :param cookie_key: redis key prefix for the per-domain hashes.
        """
        super(DomainCacheCookieRewriter, self).__init__(url_rewriter)
        self.redis = redis
        self.cookie_key = cookie_key

    def rewrite_cookie(self, name, morsel):
        """Cache a domain cookie in redis and rewrite the cookie path.

        :param name: cookie name (unused; ``morsel.key`` is used instead).
        :param morsel: cookie morsel; it is modified in place.
        :return: the same morsel, without ``domain`` and with a rewritten
            ``path`` when one was set.
        """
        # The Domain attribute is always dropped from the morsel.
        domain = morsel.pop('domain', '')

        if domain:
            string = morsel.value
            if morsel.get('path'):
                string += '; Path=' + morsel.get('path')

            if morsel.get('httponly'):
                string += '; HttpOnly'

            if morsel.get('secure'):
                string += '; Secure'

            # "Domain=example.com" and "Domain=.example.com" must share a
            # key, because the lookup side always reads
            # cookie_key + '.' + domain.
            key = self.cookie_key + '.' + domain.lstrip('.')

            with redis.utils.pipeline(self.redis) as pi:
                pi.hset(key, morsel.key, string)
                pi.expire(key, 120)

        # With or without a domain, point the path at the rewritten url.
        if morsel.get('path'):
            morsel['path'] = self.url_rewriter.rewrite(morsel['path'])

        return morsel

    def get_expire_sec(self, morsel):
        """Return the remaining lifetime of a cookie in seconds.

        ``max-age`` takes precedence over ``expires``.

        :param morsel: cookie morsel to inspect.
        :return: ``int(max-age)`` if set; otherwise the number of seconds
            (float, may be negative) until ``expires``; ``None`` if
            neither is set or the value cannot be parsed.
        """
        import calendar

        max_age = morsel.get('max-age')
        if max_age:
            try:
                return int(max_age)
            except ValueError:
                return None

        expires = morsel.get('expires')
        if not expires:
            return None

        expires = expires.replace(' UTC', ' GMT')

        # Accept both the dashed and the space-separated date forms.
        formats = ('%a, %d-%b-%Y %H:%M:%S GMT',
                   '%a, %d %b %Y %H:%M:%S GMT')

        for fmt in formats:
            try:
                parsed = time.strptime(expires, fmt)
            except ValueError:
                continue

            # The timestamp is GMT, so convert with timegm. Using
            # mktime() minus time.timezone is off by an hour during DST.
            return calendar.timegm(parsed) - time.time()

        return None
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
|
@ -17,6 +17,7 @@ class RewriteInputRequest(DirectWSGIInputRequest):
|
|||||||
self.urlkey = urlkey
|
self.urlkey = urlkey
|
||||||
self.url = url
|
self.url = url
|
||||||
self.rewriter = rewriter
|
self.rewriter = rewriter
|
||||||
|
self.extra_cookie = None
|
||||||
|
|
||||||
self.splits = urlsplit(self.url)
|
self.splits = urlsplit(self.url)
|
||||||
|
|
||||||
@ -76,6 +77,10 @@ class RewriteInputRequest(DirectWSGIInputRequest):
|
|||||||
if value:
|
if value:
|
||||||
headers['Cookie'] = value
|
headers['Cookie'] = value
|
||||||
|
|
||||||
|
if self.extra_cookie:
|
||||||
|
headers['Cookie'] = self.extra_cookie + ';' + headers.get('Cookie', '')
|
||||||
|
print('Cookie', headers['Cookie'])
|
||||||
|
|
||||||
return headers
|
return headers
|
||||||
|
|
||||||
def _req_cookie_rewrite(self, value):
|
def _req_cookie_rewrite(self, value):
|
||||||
|
@ -17,6 +17,7 @@ from urlrewrite.rewriteinputreq import RewriteInputRequest
|
|||||||
from urlrewrite.templateview import JinjaEnv, HeadInsertView, TopFrameView, BaseInsertView
|
from urlrewrite.templateview import JinjaEnv, HeadInsertView, TopFrameView, BaseInsertView
|
||||||
|
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
|
|
||||||
import gevent
|
import gevent
|
||||||
import json
|
import json
|
||||||
|
|
||||||
@ -53,6 +54,8 @@ class RewriterApp(object):
|
|||||||
self.error_view = BaseInsertView(self.jinja_env, 'error.html')
|
self.error_view = BaseInsertView(self.jinja_env, 'error.html')
|
||||||
self.query_view = BaseInsertView(self.jinja_env, config.get('query_html', 'query.html'))
|
self.query_view = BaseInsertView(self.jinja_env, config.get('query_html', 'query.html'))
|
||||||
|
|
||||||
|
self.cookie_tracker = None
|
||||||
|
|
||||||
def call_with_params(self, **kwargs):
|
def call_with_params(self, **kwargs):
|
||||||
def run_app(environ, start_response):
|
def run_app(environ, start_response):
|
||||||
environ['pywb.kwargs'] = kwargs
|
environ['pywb.kwargs'] = kwargs
|
||||||
@ -123,8 +126,15 @@ class RewriterApp(object):
|
|||||||
else:
|
else:
|
||||||
async_record_url = mod_url
|
async_record_url = mod_url
|
||||||
|
|
||||||
r = self._do_req(inputreq, url, wb_url, kwargs,
|
skip = async_record_url is not None
|
||||||
async_record_url is not None)
|
|
||||||
|
setcookie_headers = None
|
||||||
|
if self.cookie_tracker:
|
||||||
|
cookie_key = self.get_cookie_key(kwargs)
|
||||||
|
res = self.cookie_tracker.get_cookie_headers(url, cookie_key)
|
||||||
|
inputreq.extra_cookie, setcookie_headers = res
|
||||||
|
|
||||||
|
r = self._do_req(inputreq, url, wb_url, kwargs, skip)
|
||||||
|
|
||||||
if r.status_code >= 400:
|
if r.status_code >= 400:
|
||||||
error = None
|
error = None
|
||||||
@ -143,7 +153,6 @@ class RewriterApp(object):
|
|||||||
raise UpstreamException(r.status_code, url=url, details=details)
|
raise UpstreamException(r.status_code, url=url, details=details)
|
||||||
|
|
||||||
if async_record_url:
|
if async_record_url:
|
||||||
#print('ASYNC REC', async_record_url)
|
|
||||||
environ.pop('HTTP_RANGE', '')
|
environ.pop('HTTP_RANGE', '')
|
||||||
gevent.spawn(self._do_async_req,
|
gevent.spawn(self._do_async_req,
|
||||||
inputreq,
|
inputreq,
|
||||||
@ -183,14 +192,24 @@ class RewriterApp(object):
|
|||||||
environ,
|
environ,
|
||||||
self.framed_replay))
|
self.framed_replay))
|
||||||
|
|
||||||
|
cookie_rewriter = None
|
||||||
|
if self.cookie_tracker:
|
||||||
|
cookie_rewriter = self.cookie_tracker.get_rewriter(urlrewriter,
|
||||||
|
cookie_key)
|
||||||
|
|
||||||
result = self.content_rewriter.rewrite_content(urlrewriter,
|
result = self.content_rewriter.rewrite_content(urlrewriter,
|
||||||
record.status_headers,
|
record.status_headers,
|
||||||
record.stream,
|
record.stream,
|
||||||
head_insert_func,
|
head_insert_func,
|
||||||
urlkey,
|
urlkey,
|
||||||
cdx)
|
cdx,
|
||||||
|
cookie_rewriter)
|
||||||
|
|
||||||
status_headers, gen, is_rw = result
|
status_headers, gen, is_rw = result
|
||||||
|
|
||||||
|
if setcookie_headers:
|
||||||
|
status_headers.headers.extend(setcookie_headers)
|
||||||
|
|
||||||
return WbResponse(status_headers, gen)
|
return WbResponse(status_headers, gen)
|
||||||
|
|
||||||
def get_top_url(self, full_prefix, wb_url, cdx, kwargs):
|
def get_top_url(self, full_prefix, wb_url, cdx, kwargs):
|
||||||
@ -343,6 +362,9 @@ class RewriterApp(object):
|
|||||||
def get_upstream_url(self, url, wb_url, closest, kwargs):
|
def get_upstream_url(self, url, wb_url, closest, kwargs):
|
||||||
raise NotImplemented()
|
raise NotImplemented()
|
||||||
|
|
||||||
|
def get_cookie_key(self, kwargs):
|
||||||
|
raise NotImplemented()
|
||||||
|
|
||||||
def _add_custom_params(self, cdx, headers, kwargs):
|
def _add_custom_params(self, cdx, headers, kwargs):
|
||||||
cdx['is_live'] = 'true'
|
cdx['is_live'] = 'true'
|
||||||
pass
|
pass
|
||||||
|
@ -1,29 +1,39 @@
|
|||||||
from gevent.monkey import patch_all; patch_all()
|
from gevent.monkey import patch_all; patch_all()
|
||||||
|
|
||||||
from bottle import run, Bottle, request, response
|
from bottle import run, Bottle, request, response, debug
|
||||||
|
|
||||||
from six.moves.urllib.parse import quote
|
from six.moves.urllib.parse import quote
|
||||||
|
|
||||||
from pywb.utils.loaders import LocalFileLoader
|
from pywb.utils.loaders import LocalFileLoader
|
||||||
|
|
||||||
import mimetypes
|
import mimetypes
|
||||||
|
import redis
|
||||||
|
|
||||||
from urlrewrite.rewriterapp import RewriterApp
|
from urlrewrite.rewriterapp import RewriterApp
|
||||||
|
from urlrewrite.cookies import CookieTracker
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
class RWApp(RewriterApp):
|
class RWApp(RewriterApp):
|
||||||
def __init__(self, upstream_urls):
|
def __init__(self, upstream_urls, cookie_key_templ, redis):
|
||||||
self.upstream_urls = upstream_urls
|
self.upstream_urls = upstream_urls
|
||||||
|
self.cookie_key_templ = cookie_key_templ
|
||||||
self.app = Bottle()
|
self.app = Bottle()
|
||||||
self.block_loader = LocalFileLoader()
|
self.block_loader = LocalFileLoader()
|
||||||
self.init_routes()
|
self.init_routes()
|
||||||
|
|
||||||
super(RWApp, self).__init__(True)
|
super(RWApp, self).__init__(True)
|
||||||
|
|
||||||
|
self.cookie_tracker = CookieTracker(redis)
|
||||||
|
|
||||||
def get_upstream_url(self, url, wb_url, closest, kwargs):
|
def get_upstream_url(self, url, wb_url, closest, kwargs):
|
||||||
type = kwargs.get('type')
|
type = kwargs.get('type')
|
||||||
return self.upstream_urls[type].format(url=quote(url),
|
return self.upstream_urls[type].format(url=quote(url),
|
||||||
closest=closest)
|
closest=closest)
|
||||||
|
|
||||||
|
def get_cookie_key(self, kwargs):
|
||||||
|
return self.cookie_key_templ.format(**kwargs)
|
||||||
|
|
||||||
def init_routes(self):
|
def init_routes(self):
|
||||||
@self.app.get('/static/__pywb/<filepath:path>')
|
@self.app.get('/static/__pywb/<filepath:path>')
|
||||||
def server_static(filepath):
|
def server_static(filepath):
|
||||||
@ -45,7 +55,8 @@ class RWApp(RewriterApp):
|
|||||||
'replay': 'http://localhost:%s/replay/resource/postreq?url={url}&closest={closest}' % replay_port,
|
'replay': 'http://localhost:%s/replay/resource/postreq?url={url}&closest={closest}' % replay_port,
|
||||||
}
|
}
|
||||||
|
|
||||||
rwapp = RWApp(upstream_urls)
|
r = redis.StrictRedis.from_url('redis://localhost/2')
|
||||||
|
rwapp = RWApp(upstream_urls, 'cookies:', r)
|
||||||
return rwapp
|
return rwapp
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,13 +1,14 @@
|
|||||||
|
|
||||||
from webagg.test.testutils import LiveServerTests, BaseTestClass
|
from webagg.test.testutils import LiveServerTests, BaseTestClass
|
||||||
|
from webagg.test.testutils import FakeRedisTests
|
||||||
|
|
||||||
from .simpleapp import RWApp
|
from .simpleapp import RWApp, debug
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import webtest
|
import webtest
|
||||||
|
|
||||||
|
|
||||||
class TestRewriter(LiveServerTests, BaseTestClass):
|
class TestRewriter(LiveServerTests, FakeRedisTests, BaseTestClass):
|
||||||
@classmethod
|
@classmethod
|
||||||
def setup_class(cls):
|
def setup_class(cls):
|
||||||
super(TestRewriter, cls).setup_class()
|
super(TestRewriter, cls).setup_class()
|
||||||
@ -17,6 +18,7 @@ class TestRewriter(LiveServerTests, BaseTestClass):
|
|||||||
|
|
||||||
cls.app = RWApp.create_app(replay_port=cls.server.port)
|
cls.app = RWApp.create_app(replay_port=cls.server.port)
|
||||||
cls.testapp = webtest.TestApp(cls.app.app)
|
cls.testapp = webtest.TestApp(cls.app.app)
|
||||||
|
debug(True)
|
||||||
|
|
||||||
def test_replay(self):
|
def test_replay(self):
|
||||||
resp = self.testapp.get('/live/mp_/http://example.com/')
|
resp = self.testapp.get('/live/mp_/http://example.com/')
|
||||||
@ -34,7 +36,8 @@ class TestRewriter(LiveServerTests, BaseTestClass):
|
|||||||
|
|
||||||
assert 'wbinfo.capture_url = "http://example.com/"' in resp.text
|
assert 'wbinfo.capture_url = "http://example.com/"' in resp.text
|
||||||
|
|
||||||
|
def test_cookie_track_1(self):
|
||||||
|
resp = self.testapp.get('/live/mp_/https://twitter.com/')
|
||||||
|
|
||||||
|
assert resp.headers['set-cookie'] != None
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user