1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 08:04:49 +01:00

cookie_tracker: add support for redis-based subdomain cookie tracker, which temp caches cookies with Domain= set in redis and passes them upstream

when rewriting. addresses webrecorder/webrecorder#79
This commit is contained in:
Ilya Kreymer 2016-05-04 16:39:47 -07:00
parent 228ca58c5b
commit ab3af90df2
5 changed files with 191 additions and 11 deletions

139
urlrewrite/cookies.py Normal file
View File

@ -0,0 +1,139 @@
import calendar
import datetime
import time

import redis
import six
from six.moves.http_cookiejar import CookieJar, DefaultCookiePolicy
import tldextract

from pywb.rewrite.cookie_rewriter import WbUrlBaseCookieRewriter
from pywb.utils.timeutils import datetime_to_http_date
# =============================================================================
class CookieTracker(object):
    """Look up cookies cached in redis for the parent domains of a url.

    Each parent domain has one redis hash, stored under the key
    ``cookie_key + '.' + domain``.  A hash maps a cookie name to its value,
    optionally followed by ``; Attr`` suffixes (e.g. ``abc; Path=/``).
    """

    def __init__(self, redis):
        """Keep the redis client used for all lookups.

        :param redis: client object exposing ``pipeline()``.
        """
        self.redis = redis

    def get_rewriter(self, url_rewriter, cookie_key):
        """Return a ``DomainCacheCookieRewriter`` bound to this redis client.

        :param url_rewriter: passed through to the rewriter unchanged.
        :param cookie_key: key prefix the rewriter stores cookies under.
        """
        return DomainCacheCookieRewriter(url_rewriter,
                                         self.redis,
                                         cookie_key)

    def get_cookie_headers(self, url, cookie_key):
        """Collect the cached cookies of every parent domain of ``url``.

        :param url: url whose parent domains are looked up.
        :param cookie_key: key prefix; ``'.' + domain`` is appended to it.
        :returns: ``(cookies, set_cookies)``.  ``cookies`` is the
            ``name=value`` pairs joined with ``';'``; ``set_cookies`` is a
            list of ``('Set-Cookie', value)`` tuples, each with
            ``'; Max-Age=120'`` appended.  ``(None, None)`` is returned
            when ``get_subdomains()`` yields nothing.
        """
        subds = self.get_subdomains(url)
        if not subds:
            return None, None

        # queue one HGETALL per parent domain and run them in a single
        # round trip; execute() returns the results in queueing order
        pi = self.redis.pipeline()
        for subd in subds:
            pi.hgetall(cookie_key + '.' + subd)
        all_res = pi.execute()

        cookies = []
        set_cookies = []

        for res in all_res:
            if not res:
                continue

            for name, value in res.items():
                full = self._to_text(name) + '=' + self._to_text(value)
                # only the leading name=value part goes upstream, the
                # attributes are kept for the Set-Cookie header
                cookies.append(full.split(';')[0])
                set_cookies.append(('Set-Cookie', full + '; Max-Age=120'))

        return ';'.join(cookies), set_cookies

    @staticmethod
    def _to_text(data):
        """Decode utf-8 bytes to text; return anything else unchanged.

        A redis client may hand back either bytes or already decoded
        strings, depending on how it was configured.
        """
        if isinstance(data, bytes):
            return data.decode('utf-8')
        return data

    @staticmethod
    def get_subdomains(url):
        """List the parent domains of the host in ``url``.

        The host itself is not included; the list runs from the most
        specific parent down to the registered domain, e.g.
        ``a.b.example.com`` gives ``['b.example.com', 'example.com']``.

        :returns: list of domains, or ``None`` if the host has no subdomain.
        """
        tld = tldextract.extract(url)

        if not tld.subdomain:
            return None

        # skip an empty suffix so suffix-less hosts get no trailing dot
        main = '.'.join(part for part in (tld.domain, tld.suffix) if part)
        full = tld.subdomain + '.' + main

        doms = []
        while full != main:
            # strip the left-most label
            full = full.split('.', 1)[1]
            doms.append(full)

        return doms
# =============================================================================
class DomainCacheCookieRewriter(WbUrlBaseCookieRewriter):
    """Cookie rewriter that caches cookies carrying ``Domain=`` in redis.

    A cookie with a domain attribute is written to the redis hash
    ``cookie_key + '.' + domain`` (cookie name -> value plus attributes)
    with a short expiry, then returned with the domain removed and its
    path rewritten.
    """

    # seconds a cached per-domain cookie hash is kept in redis
    DOMAIN_COOKIE_TTL = 120

    # Expires date layouts understood by get_expire_sec()
    EXPIRES_FORMATS = ('%a, %d-%b-%Y %H:%M:%S GMT',
                       '%a, %d %b %Y %H:%M:%S GMT')

    def __init__(self, url_rewriter, redis, cookie_key):
        """
        :param url_rewriter: rewriter used for cookie paths; also handed to
            the base class.
        :param redis: client object exposing ``pipeline()``.
        :param cookie_key: key prefix for the per-domain redis hashes.
        """
        super(DomainCacheCookieRewriter, self).__init__(url_rewriter)
        self.redis = redis
        self.cookie_key = cookie_key

    def rewrite_cookie(self, name, morsel):
        """Cache a domain cookie in redis and rewrite its path.

        :param name: cookie name (unused here; ``morsel.key`` is stored).
        :param morsel: cookie morsel, presumably ``http.cookies.Morsel``;
            it is modified in place (domain removed, path rewritten).
        :returns: the same ``morsel``.
        """
        # the domain attribute is always dropped from the returned cookie
        domain = morsel.pop('domain', '')

        if domain:
            string = morsel.value

            path = morsel.get('path')
            if path:
                string += '; Path=' + path

            if morsel.get('httponly'):
                string += '; HttpOnly'

            if morsel.get('secure'):
                string += '; Secure'

            # "Domain=example.com" and "Domain=.example.com" mean the same
            # thing, so always store under exactly one leading dot
            key = self.cookie_key + '.' + domain.lstrip('.')

            pi = self.redis.pipeline()
            pi.hset(key, morsel.key, string)
            pi.expire(key, self.DOMAIN_COOKIE_TTL)
            pi.execute()

        # with or without a domain, point the path at the rewritten url
        if morsel.get('path'):
            morsel['path'] = self.url_rewriter.rewrite(morsel['path'])

        return morsel

    def get_expire_sec(self, morsel):
        """Return the number of seconds until the cookie expires.

        ``max-age`` wins over ``expires``.  A malformed ``max-age`` is
        ignored and ``expires`` is tried instead.

        :param morsel: cookie morsel, read via ``morsel.get()``.
        :returns: ``int`` from ``max-age``, or ``float`` seconds from now
            computed from ``expires`` (negative if already past), or
            ``None`` if neither is set or ``expires`` cannot be parsed.
        """
        max_age = morsel.get('max-age')
        if max_age:
            try:
                return int(max_age)
            except (TypeError, ValueError):
                pass

        expires = morsel.get('expires')
        if not expires:
            return None

        expires = expires.replace(' UTC', ' GMT')

        for fmt in self.EXPIRES_FORMATS:
            try:
                parsed = time.strptime(expires, fmt)
            except ValueError:
                continue

            # the date is GMT: timegm() converts it without going through
            # local time, so the result is not skewed by DST
            return calendar.timegm(parsed) - time.time()

        return None
# ============================================================================

View File

@ -17,6 +17,7 @@ class RewriteInputRequest(DirectWSGIInputRequest):
self.urlkey = urlkey
self.url = url
self.rewriter = rewriter
self.extra_cookie = None
self.splits = urlsplit(self.url)
@ -76,6 +77,10 @@ class RewriteInputRequest(DirectWSGIInputRequest):
if value:
headers['Cookie'] = value
if self.extra_cookie:
headers['Cookie'] = self.extra_cookie + ';' + headers.get('Cookie', '')
print('Cookie', headers['Cookie'])
return headers
def _req_cookie_rewrite(self, value):

View File

@ -17,6 +17,7 @@ from urlrewrite.rewriteinputreq import RewriteInputRequest
from urlrewrite.templateview import JinjaEnv, HeadInsertView, TopFrameView, BaseInsertView
from io import BytesIO
import gevent
import json
@ -53,6 +54,8 @@ class RewriterApp(object):
self.error_view = BaseInsertView(self.jinja_env, 'error.html')
self.query_view = BaseInsertView(self.jinja_env, config.get('query_html', 'query.html'))
self.cookie_tracker = None
def call_with_params(self, **kwargs):
def run_app(environ, start_response):
environ['pywb.kwargs'] = kwargs
@ -123,8 +126,15 @@ class RewriterApp(object):
else:
async_record_url = mod_url
r = self._do_req(inputreq, url, wb_url, kwargs,
async_record_url is not None)
skip = async_record_url is not None
setcookie_headers = None
if self.cookie_tracker:
cookie_key = self.get_cookie_key(kwargs)
res = self.cookie_tracker.get_cookie_headers(url, cookie_key)
inputreq.extra_cookie, setcookie_headers = res
r = self._do_req(inputreq, url, wb_url, kwargs, skip)
if r.status_code >= 400:
error = None
@ -143,7 +153,6 @@ class RewriterApp(object):
raise UpstreamException(r.status_code, url=url, details=details)
if async_record_url:
#print('ASYNC REC', async_record_url)
environ.pop('HTTP_RANGE', '')
gevent.spawn(self._do_async_req,
inputreq,
@ -183,14 +192,24 @@ class RewriterApp(object):
environ,
self.framed_replay))
cookie_rewriter = None
if self.cookie_tracker:
cookie_rewriter = self.cookie_tracker.get_rewriter(urlrewriter,
cookie_key)
result = self.content_rewriter.rewrite_content(urlrewriter,
record.status_headers,
record.stream,
head_insert_func,
urlkey,
cdx)
cdx,
cookie_rewriter)
status_headers, gen, is_rw = result
if setcookie_headers:
status_headers.headers.extend(setcookie_headers)
return WbResponse(status_headers, gen)
def get_top_url(self, full_prefix, wb_url, cdx, kwargs):
@ -343,6 +362,9 @@ class RewriterApp(object):
def get_upstream_url(self, url, wb_url, closest, kwargs):
    """Hook for subclasses: build the upstream url for this request.

    :raises NotImplementedError: always, in this base implementation.
    """
    # NotImplemented is a comparison sentinel, not an exception class;
    # calling it raised TypeError instead of the intended error
    raise NotImplementedError()
def get_cookie_key(self, kwargs):
    """Hook for subclasses: build the cookie key prefix from ``kwargs``.

    :raises NotImplementedError: always, in this base implementation.
    """
    # NotImplemented is a comparison sentinel, not an exception class;
    # calling it raised TypeError instead of the intended error
    raise NotImplementedError()
def _add_custom_params(self, cdx, headers, kwargs):
cdx['is_live'] = 'true'
pass

View File

@ -1,29 +1,39 @@
from gevent.monkey import patch_all; patch_all()
from bottle import run, Bottle, request, response
from bottle import run, Bottle, request, response, debug
from six.moves.urllib.parse import quote
from pywb.utils.loaders import LocalFileLoader
import mimetypes
import redis
from urlrewrite.rewriterapp import RewriterApp
from urlrewrite.cookies import CookieTracker
# ============================================================================
class RWApp(RewriterApp):
def __init__(self, upstream_urls):
def __init__(self, upstream_urls, cookie_key_templ, redis):
self.upstream_urls = upstream_urls
self.cookie_key_templ = cookie_key_templ
self.app = Bottle()
self.block_loader = LocalFileLoader()
self.init_routes()
super(RWApp, self).__init__(True)
self.cookie_tracker = CookieTracker(redis)
def get_upstream_url(self, url, wb_url, closest, kwargs):
type = kwargs.get('type')
return self.upstream_urls[type].format(url=quote(url),
closest=closest)
def get_cookie_key(self, kwargs):
return self.cookie_key_templ.format(**kwargs)
def init_routes(self):
@self.app.get('/static/__pywb/<filepath:path>')
def server_static(filepath):
@ -45,7 +55,8 @@ class RWApp(RewriterApp):
'replay': 'http://localhost:%s/replay/resource/postreq?url={url}&closest={closest}' % replay_port,
}
rwapp = RWApp(upstream_urls)
r = redis.StrictRedis.from_url('redis://localhost/2')
rwapp = RWApp(upstream_urls, 'cookies:', r)
return rwapp

View File

@ -1,13 +1,14 @@
from webagg.test.testutils import LiveServerTests, BaseTestClass
from webagg.test.testutils import FakeRedisTests
from .simpleapp import RWApp
from .simpleapp import RWApp, debug
import os
import webtest
class TestRewriter(LiveServerTests, BaseTestClass):
class TestRewriter(LiveServerTests, FakeRedisTests, BaseTestClass):
@classmethod
def setup_class(cls):
super(TestRewriter, cls).setup_class()
@ -17,6 +18,7 @@ class TestRewriter(LiveServerTests, BaseTestClass):
cls.app = RWApp.create_app(replay_port=cls.server.port)
cls.testapp = webtest.TestApp(cls.app.app)
debug(True)
def test_replay(self):
resp = self.testapp.get('/live/mp_/http://example.com/')
@ -34,7 +36,8 @@ class TestRewriter(LiveServerTests, BaseTestClass):
assert 'wbinfo.capture_url = "http://example.com/"' in resp.text
def test_cookie_track_1(self):
resp = self.testapp.get('/live/mp_/https://twitter.com/')
assert resp.headers['set-cookie'] != None