From ab3af90df22e85f9494c610becdd6b03a2446b51 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 4 May 2016 16:39:47 -0700 Subject: [PATCH] cookie_tracker: add support for redis-based subdomain cookie tracker, which temp caches cookies with Domain= set in redis and passes them upstream when rewriting. addresses webrecorder/webrecorder#79 --- urlrewrite/cookies.py | 139 +++++++++++++++++++++++++++++++ urlrewrite/rewriteinputreq.py | 5 ++ urlrewrite/rewriterapp.py | 30 ++++++- urlrewrite/test/simpleapp.py | 17 +++- urlrewrite/test/test_rewriter.py | 11 ++- 5 files changed, 191 insertions(+), 11 deletions(-) create mode 100644 urlrewrite/cookies.py diff --git a/urlrewrite/cookies.py b/urlrewrite/cookies.py new file mode 100644 index 00000000..5ebe1144 --- /dev/null +++ b/urlrewrite/cookies.py @@ -0,0 +1,139 @@ +from pywb.rewrite.cookie_rewriter import WbUrlBaseCookieRewriter +from pywb.utils.timeutils import datetime_to_http_date +from six.moves.http_cookiejar import CookieJar, DefaultCookiePolicy + +import redis + +import tldextract +import time +import datetime +import six + + +# ============================================================================= +class CookieTracker(object): + def __init__(self, redis): + self.redis = redis + + def get_rewriter(self, url_rewriter, cookie_key): + return DomainCacheCookieRewriter(url_rewriter, + self.redis, + cookie_key) + + def get_cookie_headers(self, url, cookie_key): + subds = self.get_subdomains(url) + if not subds: + return None, None + + with redis.utils.pipeline(self.redis) as pi: + for x in subds: + pi.hgetall(cookie_key + '.' + x) + + all_res = pi.execute() + + cookies = [] + set_cookies = [] + + for res in all_res: + if not res: + continue + + for n, v in six.iteritems(res): + n = n.decode('utf-8') + v = v.decode('utf-8') + full = n + '=' + v + cookies.append(full.split(';')[0]) + set_cookies.append(('Set-Cookie', full + '; Max-Age=120')) + + cookies = ';'.join(cookies) + return cookies, set_cookies + + @staticmethod + def get_subdomains(url): + tld = tldextract.extract(url) + + if not tld.subdomain: + return None + + main = tld.domain + '.' + tld.suffix + full = tld.subdomain + '.' + main + + def get_all_subdomains(main, full): + doms = [] + while main != full: + full = full.split('.', 1)[1] + doms.append(full) + + return doms + + all_subs = get_all_subdomains(main, full) + return all_subs + + +# ============================================================================= +class DomainCacheCookieRewriter(WbUrlBaseCookieRewriter): + def __init__(self, url_rewriter, redis, cookie_key): + super(DomainCacheCookieRewriter, self).__init__(url_rewriter) + self.redis = redis + self.cookie_key = cookie_key + + def rewrite_cookie(self, name, morsel): + # if domain set, no choice but to expand cookie path to root + domain = morsel.pop('domain', '') + + if domain: + #if morsel.get('max-age'): + # morsel['max-age'] = int(morsel['max-age']) + + #self.cookiejar.set_cookie(self.morsel_to_cookie(morsel)) + #print(morsel, self.cookie_key + domain) + + string = morsel.value + if morsel.get('path'): + string += '; Path=' + morsel.get('path') + + if morsel.get('httponly'): + string += '; HttpOnly' + + if morsel.get('secure'): + string += '; Secure' + + with redis.utils.pipeline(self.redis) as pi: + pi.hset(self.cookie_key + domain, morsel.key, string) + pi.expire(self.cookie_key + domain, 120) + + # else set cookie to rewritten path + if morsel.get('path'): + morsel['path'] = self.url_rewriter.rewrite(morsel['path']) + + return morsel + + def get_expire_sec(self, morsel): + expires = None + + if morsel.get('max-age'): + return int(morsel['max-age']) + + expires = morsel.get('expires') + if not expires: + return None + + expires = expires.replace(' UTC', ' GMT') + + try: + expires = time.strptime(expires, '%a, %d-%b-%Y %H:%M:%S GMT') + except: + pass + + try: + expires = time.strptime(expires, '%a, %d %b %Y %H:%M:%S GMT') + except: + pass + + expires = time.mktime(expires) + expires = expires - time.timezone - time.time() + return expires + + +# ============================================================================ + diff --git a/urlrewrite/rewriteinputreq.py b/urlrewrite/rewriteinputreq.py index fec5797b..18d84905 100644 --- a/urlrewrite/rewriteinputreq.py +++ b/urlrewrite/rewriteinputreq.py @@ -17,6 +17,7 @@ class RewriteInputRequest(DirectWSGIInputRequest): self.urlkey = urlkey self.url = url self.rewriter = rewriter + self.extra_cookie = None self.splits = urlsplit(self.url) @@ -76,6 +77,10 @@ class RewriteInputRequest(DirectWSGIInputRequest): if value: headers['Cookie'] = value + if self.extra_cookie: + headers['Cookie'] = self.extra_cookie + ';' + headers.get('Cookie', '') + print('Cookie', headers['Cookie']) + return headers def _req_cookie_rewrite(self, value): diff --git a/urlrewrite/rewriterapp.py b/urlrewrite/rewriterapp.py index 9ccc13ad..a8f5122a 100644 --- a/urlrewrite/rewriterapp.py +++ b/urlrewrite/rewriterapp.py @@ -17,6 +17,7 @@ from urlrewrite.rewriteinputreq import RewriteInputRequest from urlrewrite.templateview import JinjaEnv, HeadInsertView, TopFrameView, BaseInsertView from io import BytesIO + import gevent import json @@ -53,6 +54,8 @@ class RewriterApp(object): self.error_view = BaseInsertView(self.jinja_env, 'error.html') self.query_view = BaseInsertView(self.jinja_env, config.get('query_html', 'query.html')) + self.cookie_tracker = None + def call_with_params(self, **kwargs): def run_app(environ, start_response): environ['pywb.kwargs'] = kwargs @@ -123,8 +126,15 @@ class RewriterApp(object): else: async_record_url = mod_url - r = self._do_req(inputreq, url, wb_url, kwargs, - async_record_url is not None) + skip = async_record_url is not None + + setcookie_headers = None + if self.cookie_tracker: + cookie_key = self.get_cookie_key(kwargs) + res = self.cookie_tracker.get_cookie_headers(url, cookie_key) + inputreq.extra_cookie, setcookie_headers = res + + r = self._do_req(inputreq, url, wb_url, kwargs, skip) if r.status_code >= 400: error = None @@ -143,7 +153,6 @@ class RewriterApp(object): raise UpstreamException(r.status_code, url=url, details=details) if async_record_url: - #print('ASYNC REC', async_record_url) environ.pop('HTTP_RANGE', '') gevent.spawn(self._do_async_req, inputreq, @@ -183,14 +192,24 @@ class RewriterApp(object): environ, self.framed_replay)) + cookie_rewriter = None + if self.cookie_tracker: + cookie_rewriter = self.cookie_tracker.get_rewriter(urlrewriter, + cookie_key) + result = self.content_rewriter.rewrite_content(urlrewriter, record.status_headers, record.stream, head_insert_func, urlkey, - cdx) + cdx, + cookie_rewriter) status_headers, gen, is_rw = result + + if setcookie_headers: + status_headers.headers.extend(setcookie_headers) + return WbResponse(status_headers, gen) def get_top_url(self, full_prefix, wb_url, cdx, kwargs): @@ -343,6 +362,9 @@ class RewriterApp(object): def get_upstream_url(self, url, wb_url, closest, kwargs): raise NotImplemented() + def get_cookie_key(self, kwargs): + raise NotImplemented() + def _add_custom_params(self, cdx, headers, kwargs): cdx['is_live'] = 'true' pass diff --git a/urlrewrite/test/simpleapp.py b/urlrewrite/test/simpleapp.py index ee620de3..d7f05181 100644 --- a/urlrewrite/test/simpleapp.py +++ b/urlrewrite/test/simpleapp.py @@ -1,29 +1,39 @@ from gevent.monkey import patch_all; patch_all() -from bottle import run, Bottle, request, response +from bottle import run, Bottle, request, response, debug from six.moves.urllib.parse import quote from pywb.utils.loaders import LocalFileLoader + import mimetypes +import redis from urlrewrite.rewriterapp import RewriterApp +from urlrewrite.cookies import CookieTracker # ============================================================================ class RWApp(RewriterApp): - def __init__(self, upstream_urls): + def __init__(self, upstream_urls, cookie_key_templ, redis): self.upstream_urls = upstream_urls + self.cookie_key_templ = cookie_key_templ self.app = Bottle() self.block_loader = LocalFileLoader() self.init_routes() + super(RWApp, self).__init__(True) + self.cookie_tracker = CookieTracker(redis) + def get_upstream_url(self, url, wb_url, closest, kwargs): type = kwargs.get('type') return self.upstream_urls[type].format(url=quote(url), closest=closest) + def get_cookie_key(self, kwargs): + return self.cookie_key_templ.format(**kwargs) + def init_routes(self): @self.app.get('/static/__pywb/') def server_static(filepath): @@ -45,7 +55,8 @@ class RWApp(RewriterApp): 'replay': 'http://localhost:%s/replay/resource/postreq?url={url}&closest={closest}' % replay_port, } - rwapp = RWApp(upstream_urls) + r = redis.StrictRedis.from_url('redis://localhost/2') + rwapp = RWApp(upstream_urls, 'cookies:', r) return rwapp diff --git a/urlrewrite/test/test_rewriter.py b/urlrewrite/test/test_rewriter.py index 7f10a280..4fdaff48 100644 --- a/urlrewrite/test/test_rewriter.py +++ b/urlrewrite/test/test_rewriter.py @@ -1,13 +1,14 @@ from webagg.test.testutils import LiveServerTests, BaseTestClass +from webagg.test.testutils import FakeRedisTests -from .simpleapp import RWApp +from .simpleapp import RWApp, debug import os import webtest -class TestRewriter(LiveServerTests, BaseTestClass): +class TestRewriter(LiveServerTests, FakeRedisTests, BaseTestClass): @classmethod def setup_class(cls): super(TestRewriter, cls).setup_class() @@ -17,6 +18,7 @@ class TestRewriter(LiveServerTests, BaseTestClass): cls.app = RWApp.create_app(replay_port=cls.server.port) cls.testapp = webtest.TestApp(cls.app.app) + debug(True) def test_replay(self): resp = self.testapp.get('/live/mp_/http://example.com/') @@ -34,7 +36,8 @@ class TestRewriter(LiveServerTests, BaseTestClass): assert 'wbinfo.capture_url = "http://example.com/"' in resp.text + def test_cookie_track_1(self): + resp = self.testapp.get('/live/mp_/https://twitter.com/') - - + assert resp.headers['set-cookie'] != None