mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 08:04:49 +01:00
cookie_tracker: add support for redis-based subdomain cookie tracker, which temp caches cookies with Domain= set in redis and passes them upstream
when rewriting. addresses webrecorder/webrecorder#79
This commit is contained in:
parent
228ca58c5b
commit
ab3af90df2
139
urlrewrite/cookies.py
Normal file
139
urlrewrite/cookies.py
Normal file
@ -0,0 +1,139 @@
|
||||
from pywb.rewrite.cookie_rewriter import WbUrlBaseCookieRewriter
|
||||
from pywb.utils.timeutils import datetime_to_http_date
|
||||
from six.moves.http_cookiejar import CookieJar, DefaultCookiePolicy
|
||||
|
||||
import redis
|
||||
|
||||
import tldextract
|
||||
import time
|
||||
import datetime
|
||||
import six
|
||||
|
||||
|
||||
# =============================================================================
|
||||
class CookieTracker(object):
    """Look up cookies that were cached in redis per cookie domain.

    ``DomainCacheCookieRewriter`` stores every cookie that carries an explicit
    ``Domain=`` attribute in a redis hash keyed by ``cookie_key + domain``
    (hash field = cookie name, hash value = cookie value plus attributes).
    This class reads those hashes back for all parent domains of a requested
    url so the cookies can be sent upstream and re-issued to the client.
    """

    def __init__(self, redis):
        """
        :param redis: redis client used to read (and, through the rewriter,
                      write) the cached cookie hashes
        """
        self.redis = redis

    def get_rewriter(self, url_rewriter, cookie_key):
        """Return a cookie rewriter that caches ``Domain=`` cookies in redis.

        :param url_rewriter: url rewriter handed through to the cookie rewriter
        :param cookie_key: key prefix under which cookies are cached
        """
        return DomainCacheCookieRewriter(url_rewriter,
                                         self.redis,
                                         cookie_key)

    def get_cookie_headers(self, url, cookie_key):
        """Collect the cached cookies that apply to ``url``.

        :param url: url being requested
        :param cookie_key: key prefix under which cookies are cached
        :return: ``(cookies, set_cookies)`` where ``cookies`` is a
                 ``;``-joined string of ``name=value`` pairs and
                 ``set_cookies`` is a list of ``('Set-Cookie', value)``
                 tuples, or ``(None, None)`` if the host of ``url`` has no
                 parent domains to look up
        """
        subds = self.get_subdomains(url)
        if not subds:
            return None, None

        with redis.utils.pipeline(self.redis) as pi:
            for domain in subds:
                pi.hgetall(cookie_key + '.' + domain)

            # the results are needed, so execute explicitly instead of relying
            # on the execute performed when the context manager exits
            all_res = pi.execute()

        cookies = []
        set_cookies = []

        for res in all_res:
            if not res:
                continue

            for name, value in res.items():
                full = self._to_text(name) + '=' + self._to_text(value)

                # upstream only gets name=value, without the cookie attributes
                cookies.append(full.split(';')[0])

                # same lifetime as the expiry set on the redis hash
                set_cookies.append(('Set-Cookie', full + '; Max-Age=120'))

        return ';'.join(cookies), set_cookies

    @staticmethod
    def get_subdomains(url):
        """Return the parent domains of the host of ``url``.

        The list runs from the closest parent down to the registered domain
        and does not include the full host itself, e.g. a host
        ``a.b.example.com`` yields ``['b.example.com', 'example.com']``.

        :param url: url (or host) to split with ``tldextract``
        :return: list of parent domains, or ``None`` if the host has no
                 subdomain part
        """
        tld = tldextract.extract(url)

        if not tld.subdomain:
            return None

        # only append the suffix when there is one, otherwise the domain (and
        # every redis key derived from it) would end in a stray '.'
        main = tld.domain
        if tld.suffix:
            main += '.' + tld.suffix

        full = tld.subdomain + '.' + main

        return CookieTracker._parent_domains(main, full)

    @staticmethod
    def _parent_domains(main, full):
        """Strip leading labels from ``full`` one at a time until ``main``."""
        doms = []

        # the '.' check guards against looping past the last label if
        # ``full`` does not actually end in ``main``
        while full != main and '.' in full:
            full = full.split('.', 1)[1]
            doms.append(full)

        return doms

    @staticmethod
    def _to_text(value):
        """Decode bytes as UTF-8; return already-decoded text unchanged."""
        if isinstance(value, bytes):
            return value.decode('utf-8')

        return value
|
||||
|
||||
|
||||
# =============================================================================
|
||||
class DomainCacheCookieRewriter(WbUrlBaseCookieRewriter):
    """Cookie rewriter that also caches ``Domain=`` cookies in redis.

    Every cookie with a ``Domain`` attribute is stored in the redis hash
    ``cookie_key + '.' + domain`` (field = cookie name), which is the key
    layout ``CookieTracker.get_cookie_headers`` reads back.
    """

    # date layouts accepted for the Expires attribute, tried in order
    _EXPIRES_FORMATS = ('%a, %d-%b-%Y %H:%M:%S GMT',
                        '%a, %d %b %Y %H:%M:%S GMT')

    def __init__(self, url_rewriter, redis, cookie_key):
        """
        :param url_rewriter: url rewriter used to rewrite cookie paths
        :param redis: redis client the cookies are cached in
        :param cookie_key: key prefix for the per-domain redis hashes
        """
        super(DomainCacheCookieRewriter, self).__init__(url_rewriter)
        self.redis = redis
        self.cookie_key = cookie_key

    def rewrite_cookie(self, name, morsel):
        """Cache a ``Domain=`` cookie in redis and rewrite the cookie path.

        The ``Domain`` attribute is always removed from ``morsel``. If it was
        set, the cookie value plus its Path/HttpOnly/Secure attributes are
        stored in redis for 120 seconds.

        :param name: cookie name (unused, ``morsel.key`` is used instead)
        :param morsel: cookie morsel, modified in place
        :return: the modified ``morsel``
        """
        domain = morsel.pop('domain', '')

        # Domain=example.com and Domain=.example.com must end up under the
        # same key, and the reader always looks up '.' + domain
        domain = domain.lstrip('.')

        if domain:
            string = morsel.value
            if morsel.get('path'):
                string += '; Path=' + morsel.get('path')

            if morsel.get('httponly'):
                string += '; HttpOnly'

            if morsel.get('secure'):
                string += '; Secure'

            key = self.cookie_key + '.' + domain

            with redis.utils.pipeline(self.redis) as pi:
                pi.hset(key, morsel.key, string)
                pi.expire(key, 120)

        # the cookie returned to the client gets its path rewritten
        if morsel.get('path'):
            morsel['path'] = self.url_rewriter.rewrite(morsel['path'])

        return morsel

    def get_expire_sec(self, morsel):
        """Return the remaining lifetime of the cookie in seconds.

        ``Max-Age`` takes precedence over ``Expires``.

        :param morsel: cookie morsel to inspect
        :return: seconds until expiry (an int for ``Max-Age``, a float for
                 ``Expires``), or ``None`` if neither attribute is present
                 or parseable
        """
        import calendar

        max_age = morsel.get('max-age')
        if max_age:
            try:
                return int(max_age)
            except (TypeError, ValueError):
                # malformed Max-Age: fall back to Expires
                pass

        expires = morsel.get('expires')
        if not expires:
            return None

        expires = expires.replace(' UTC', ' GMT')

        for fmt in self._EXPIRES_FORMATS:
            try:
                parsed = time.strptime(expires, fmt)
            except ValueError:
                continue

            # the parsed time is GMT, so convert with timegm rather than the
            # local-time (and DST dependent) mktime
            return calendar.timegm(parsed) - time.time()

        return None
|
||||
|
||||
|
||||
# ============================================================================
|
||||
|
@ -17,6 +17,7 @@ class RewriteInputRequest(DirectWSGIInputRequest):
|
||||
self.urlkey = urlkey
|
||||
self.url = url
|
||||
self.rewriter = rewriter
|
||||
self.extra_cookie = None
|
||||
|
||||
self.splits = urlsplit(self.url)
|
||||
|
||||
@ -76,6 +77,10 @@ class RewriteInputRequest(DirectWSGIInputRequest):
|
||||
if value:
|
||||
headers['Cookie'] = value
|
||||
|
||||
if self.extra_cookie:
|
||||
headers['Cookie'] = self.extra_cookie + ';' + headers.get('Cookie', '')
|
||||
print('Cookie', headers['Cookie'])
|
||||
|
||||
return headers
|
||||
|
||||
def _req_cookie_rewrite(self, value):
|
||||
|
@ -17,6 +17,7 @@ from urlrewrite.rewriteinputreq import RewriteInputRequest
|
||||
from urlrewrite.templateview import JinjaEnv, HeadInsertView, TopFrameView, BaseInsertView
|
||||
|
||||
from io import BytesIO
|
||||
|
||||
import gevent
|
||||
import json
|
||||
|
||||
@ -53,6 +54,8 @@ class RewriterApp(object):
|
||||
self.error_view = BaseInsertView(self.jinja_env, 'error.html')
|
||||
self.query_view = BaseInsertView(self.jinja_env, config.get('query_html', 'query.html'))
|
||||
|
||||
self.cookie_tracker = None
|
||||
|
||||
def call_with_params(self, **kwargs):
|
||||
def run_app(environ, start_response):
|
||||
environ['pywb.kwargs'] = kwargs
|
||||
@ -123,8 +126,15 @@ class RewriterApp(object):
|
||||
else:
|
||||
async_record_url = mod_url
|
||||
|
||||
r = self._do_req(inputreq, url, wb_url, kwargs,
|
||||
async_record_url is not None)
|
||||
skip = async_record_url is not None
|
||||
|
||||
setcookie_headers = None
|
||||
if self.cookie_tracker:
|
||||
cookie_key = self.get_cookie_key(kwargs)
|
||||
res = self.cookie_tracker.get_cookie_headers(url, cookie_key)
|
||||
inputreq.extra_cookie, setcookie_headers = res
|
||||
|
||||
r = self._do_req(inputreq, url, wb_url, kwargs, skip)
|
||||
|
||||
if r.status_code >= 400:
|
||||
error = None
|
||||
@ -143,7 +153,6 @@ class RewriterApp(object):
|
||||
raise UpstreamException(r.status_code, url=url, details=details)
|
||||
|
||||
if async_record_url:
|
||||
#print('ASYNC REC', async_record_url)
|
||||
environ.pop('HTTP_RANGE', '')
|
||||
gevent.spawn(self._do_async_req,
|
||||
inputreq,
|
||||
@ -183,14 +192,24 @@ class RewriterApp(object):
|
||||
environ,
|
||||
self.framed_replay))
|
||||
|
||||
cookie_rewriter = None
|
||||
if self.cookie_tracker:
|
||||
cookie_rewriter = self.cookie_tracker.get_rewriter(urlrewriter,
|
||||
cookie_key)
|
||||
|
||||
result = self.content_rewriter.rewrite_content(urlrewriter,
|
||||
record.status_headers,
|
||||
record.stream,
|
||||
head_insert_func,
|
||||
urlkey,
|
||||
cdx)
|
||||
cdx,
|
||||
cookie_rewriter)
|
||||
|
||||
status_headers, gen, is_rw = result
|
||||
|
||||
if setcookie_headers:
|
||||
status_headers.headers.extend(setcookie_headers)
|
||||
|
||||
return WbResponse(status_headers, gen)
|
||||
|
||||
def get_top_url(self, full_prefix, wb_url, cdx, kwargs):
|
||||
@ -343,6 +362,9 @@ class RewriterApp(object):
|
||||
def get_upstream_url(self, url, wb_url, closest, kwargs):
    """Return the upstream url to fetch for ``url``.

    Must be overridden by subclasses.

    :raises NotImplementedError: always, in this base implementation
    """
    # NotImplemented is a comparison sentinel, not an exception, and is not
    # callable; NotImplementedError is the exception for abstract methods
    raise NotImplementedError('get_upstream_url() must be overridden')
|
||||
|
||||
def get_cookie_key(self, kwargs):
    """Return the key prefix under which cookies for this request are cached.

    Only called when a cookie tracker is configured; must be overridden by
    subclasses that set one.

    :param kwargs: routing parameters of the current request
    :raises NotImplementedError: always, in this base implementation
    """
    # NotImplemented is a comparison sentinel, not an exception, and is not
    # callable; NotImplementedError is the exception for abstract methods
    raise NotImplementedError('get_cookie_key() must be overridden')
|
||||
|
||||
def _add_custom_params(self, cdx, headers, kwargs):
|
||||
cdx['is_live'] = 'true'
|
||||
pass
|
||||
|
@ -1,29 +1,39 @@
|
||||
from gevent.monkey import patch_all; patch_all()
|
||||
|
||||
from bottle import run, Bottle, request, response
|
||||
from bottle import run, Bottle, request, response, debug
|
||||
|
||||
from six.moves.urllib.parse import quote
|
||||
|
||||
from pywb.utils.loaders import LocalFileLoader
|
||||
|
||||
import mimetypes
|
||||
import redis
|
||||
|
||||
from urlrewrite.rewriterapp import RewriterApp
|
||||
from urlrewrite.cookies import CookieTracker
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class RWApp(RewriterApp):
|
||||
def __init__(self, upstream_urls):
|
||||
def __init__(self, upstream_urls, cookie_key_templ, redis):
|
||||
self.upstream_urls = upstream_urls
|
||||
self.cookie_key_templ = cookie_key_templ
|
||||
self.app = Bottle()
|
||||
self.block_loader = LocalFileLoader()
|
||||
self.init_routes()
|
||||
|
||||
super(RWApp, self).__init__(True)
|
||||
|
||||
self.cookie_tracker = CookieTracker(redis)
|
||||
|
||||
def get_upstream_url(self, url, wb_url, closest, kwargs):
    """Build the upstream url for ``url``.

    The template is picked from ``self.upstream_urls`` by ``kwargs['type']``
    and filled with the quoted ``url`` and ``closest``. ``wb_url`` is unused.
    """
    upstream_type = kwargs.get('type')
    template = self.upstream_urls[upstream_type]
    return template.format(url=quote(url), closest=closest)
|
||||
|
||||
def get_cookie_key(self, kwargs):
    """Fill the cookie key template with the request's routing parameters."""
    template = self.cookie_key_templ
    return template.format(**kwargs)
|
||||
|
||||
def init_routes(self):
|
||||
@self.app.get('/static/__pywb/<filepath:path>')
|
||||
def server_static(filepath):
|
||||
@ -45,7 +55,8 @@ class RWApp(RewriterApp):
|
||||
'replay': 'http://localhost:%s/replay/resource/postreq?url={url}&closest={closest}' % replay_port,
|
||||
}
|
||||
|
||||
rwapp = RWApp(upstream_urls)
|
||||
r = redis.StrictRedis.from_url('redis://localhost/2')
|
||||
rwapp = RWApp(upstream_urls, 'cookies:', r)
|
||||
return rwapp
|
||||
|
||||
|
||||
|
@ -1,13 +1,14 @@
|
||||
|
||||
from webagg.test.testutils import LiveServerTests, BaseTestClass
|
||||
from webagg.test.testutils import FakeRedisTests
|
||||
|
||||
from .simpleapp import RWApp
|
||||
from .simpleapp import RWApp, debug
|
||||
|
||||
import os
|
||||
import webtest
|
||||
|
||||
|
||||
class TestRewriter(LiveServerTests, BaseTestClass):
|
||||
class TestRewriter(LiveServerTests, FakeRedisTests, BaseTestClass):
|
||||
@classmethod
|
||||
def setup_class(cls):
|
||||
super(TestRewriter, cls).setup_class()
|
||||
@ -17,6 +18,7 @@ class TestRewriter(LiveServerTests, BaseTestClass):
|
||||
|
||||
cls.app = RWApp.create_app(replay_port=cls.server.port)
|
||||
cls.testapp = webtest.TestApp(cls.app.app)
|
||||
debug(True)
|
||||
|
||||
def test_replay(self):
|
||||
resp = self.testapp.get('/live/mp_/http://example.com/')
|
||||
@ -34,7 +36,8 @@ class TestRewriter(LiveServerTests, BaseTestClass):
|
||||
|
||||
assert 'wbinfo.capture_url = "http://example.com/"' in resp.text
|
||||
|
||||
def test_cookie_track_1(self):
|
||||
resp = self.testapp.get('/live/mp_/https://twitter.com/')
|
||||
|
||||
|
||||
|
||||
assert resp.headers['set-cookie'] != None
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user