From 607ea1ccf0990266fb96cf136924675a5955e6e9 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 29 Jul 2014 12:23:41 -0700 Subject: [PATCH] proxy resolver: cookie resolver uses session cookies proxy static handler: handled via proxy to support http/https use 'pywb.proxy' prefix for custom env settings --- pywb/framework/proxy.py | 42 +++++++++-- pywb/framework/proxy_resolvers.py | 104 ++++++++++++++++++++++------ pywb/framework/wbrequestresponse.py | 10 ++- pywb/framework/wsgi_wrappers.py | 12 ++-- 4 files changed, 132 insertions(+), 36 deletions(-) diff --git a/pywb/framework/proxy.py b/pywb/framework/proxy.py index a9cf6a66..82218e20 100644 --- a/pywb/framework/proxy.py +++ b/pywb/framework/proxy.py @@ -14,7 +14,7 @@ from pywb.utils.bufferedreaders import BufferedReader from certauth import CertificateAuthority -from proxy_resolvers import ProxyAuthResolver +from proxy_resolvers import ProxyAuthResolver, CookieResolver #================================================================= @@ -68,6 +68,8 @@ class ProxyRouter(object): self.resolver = ProxyAuthResolver(routes, proxy_options) #self.resolver = CookieResolver(routes, proxy_options) + self.magic_name = proxy_options.get('magic_name', 'pywb-proxy.com') + self.unaltered = proxy_options.get('unaltered_replay', False) self.proxy_pac_path = proxy_options.get('pac_path', self.PAC_PATH) @@ -100,7 +102,12 @@ class ProxyRouter(object): if not url.startswith(('http://', 'https://')): return None - env['pywb.proxy_scheme'] = 'https' if is_https else 'http' + env['pywb.proxy_scheme'] = 'http' + + route = None + coll = None + matcher = None + response = None # check resolver, for pre connect resolve if self.resolver.pre_connect: @@ -115,6 +122,21 @@ class ProxyRouter(object): return response url = env['REL_REQUEST_URI'] + else: + parts = urlparse.urlsplit(env['REL_REQUEST_URI']) + hostport = parts.netloc.split(':', 1) + env['pywb.proxy_host'] = hostport[0] + env['pywb.proxy_port'] = hostport[1] if len(hostport) == 2 else '' + env['pywb.proxy_req_uri'] = parts.path + if parts.query: + env['pywb.proxy_req_uri'] += '?' + parts.query + + # static + static_prefix = 'static.' + self.magic_name + + if env['pywb.proxy_host'] == static_prefix: + env['REL_REQUEST_URI'] = env['pywb.proxy_req_uri'] + return None # check resolver, post connect if not self.resolver.pre_connect: @@ -122,11 +144,14 @@ class ProxyRouter(object): if response: return response + host_prefix = env['pywb.proxy_scheme'] + '://' + static_prefix + wbrequest = route.request_class(env, request_uri=url, wb_url_str=url, coll=coll, - host_prefix=self.hostpaths[0], + # host_prefix=self.hostpaths[0], + host_prefix=host_prefix, wburl_class=route.handler.get_wburl_type(), urlrewriter_class=HttpsUrlRewriter, use_abs_prefix=False, @@ -136,7 +161,8 @@ class ProxyRouter(object): route.apply_filters(wbrequest, matcher) if self.unaltered: - wbrequest.wb_url.mod = 'id_' + #wbrequest.wb_url.mod = 'id_' + wbrequest.wb_url.mod = 'bn_' return route.handler(wbrequest) @@ -201,14 +227,16 @@ class ProxyRouter(object): env['SERVER_PROTOCOL'] = statusparts[2].strip() - env['SERVER_NAME'] = hostname - env['SERVER_PORT'] = port + env['pywb.proxy_scheme'] = 'https' + + env['pywb.proxy_host'] = hostname + env['pywb.proxy_port'] = port + env['pywb.proxy_req_uri'] = statusparts[1] queryparts = env['REL_REQUEST_URI'].split('?', 1) env['PATH_INFO'] = queryparts[0] env['QUERY_STRING'] = queryparts[1] if len(queryparts) > 1 else '' - env['wsgi.url_scheme'] = 'https' while True: line = buffreader.readline() diff --git a/pywb/framework/proxy_resolvers.py b/pywb/framework/proxy_resolvers.py index b4bfe840..35c84c8a 100644 --- a/pywb/framework/proxy_resolvers.py +++ b/pywb/framework/proxy_resolvers.py @@ -2,6 +2,25 @@ from wbrequestresponse import WbResponse, WbRequest from pywb.utils.statusandheaders import StatusAndHeaders import urlparse import base64 +import os + +try: + import uwsgi + uwsgi_cache = True +except ImportError: + uwsgi_cache = False + + +#================================================================= +class UwsgiCache(object): + def __setitem__(self, item, value): + uwsgi.cache_update(item, value) + + def __getitem__(self, item): + return uwsgi.cache_get(item) + + def __contains__(self, item): + return uwsgi.cache_exists(item) #================================================================= @@ -104,9 +123,15 @@ class CookieResolver(BaseCollResolver): # pragma: no cover self.cookie_name = config.get('cookie_name', '__pywb_coll') self.proxy_select_view = config.get('proxy_select_view') + if uwsgi_cache: + print 'UWSGI CACHE' + self.cache = UwsgiCache() + else: + self.cache = {} + def get_proxy_coll(self, env): - cookie = self.extract_client_cookie(env, self.cookie_name) - return cookie + coll, sesh_id = self.get_coll(env) + return coll def select_coll_response(self, env): return self.make_magic_response('auto', @@ -114,14 +139,15 @@ class CookieResolver(BaseCollResolver): # pragma: no cover env) def resolve(self, env): - url = env['REL_REQUEST_URI'] + server_name = env['pywb.proxy_host'] - if ('.' + self.magic_name) in url: - return None, None, None, self.handle_magic_page(url, env) + if ('.' + self.magic_name) in server_name: + return None, None, None, self.handle_magic_page(env) return super(CookieResolver, self).resolve(env) - def handle_magic_page(self, url, env): + def handle_magic_page(self, env): + url = env['REL_REQUEST_URI'] parts = urlparse.urlsplit(url) path_url = parts.path[1:] @@ -129,58 +155,77 @@ class CookieResolver(BaseCollResolver): # pragma: no cover path_url += '?' + parts.query if parts.netloc.startswith('auto'): - coll = self.extract_client_cookie(env, self.cookie_name) + coll, sesh_id = self.get_coll(env) if coll: - return self.make_sethost_cookie_response(coll, path_url, env) + return self.make_sethost_cookie_response(sesh_id, path_url, env) else: return self.make_magic_response('select', path_url, env) elif '.set.' in parts.netloc: - coll = parts.netloc.split('.', 1)[0] - headers = self.make_cookie_headers(coll, self.magic_name) + old_sesh_id = self.extract_client_cookie(env, self.cookie_name) + sesh_id = self.create_renew_sesh_id(old_sesh_id) - return self.make_sethost_cookie_response(coll, path_url, env, + if sesh_id != old_sesh_id: + headers = self.make_cookie_headers(sesh_id, self.magic_name) + else: + headers = None + + value, name, _ = parts.netloc.split('.', 2) + + # set sesh value + self.cache[sesh_id] = value + + return self.make_sethost_cookie_response(sesh_id, path_url, env, headers=headers) elif '.sethost.' in parts.netloc: host_parts = parts.netloc.split('.', 1) - coll = host_parts[0] + sesh_id = host_parts[0] inx = parts.netloc.find('.' + self.magic_name + '.') domain = parts.netloc[inx + len(self.magic_name) + 2:] - headers = self.make_cookie_headers(coll, domain) + headers = self.make_cookie_headers(sesh_id, domain) full_url = env['pywb.proxy_scheme'] + '://' + domain full_url += '/' + path_url return WbResponse.redir_response(full_url, headers=headers) - elif self.proxy_select_view: - route_temp = env['pywb.proxy_scheme'] + '://%s.set.' + elif 'select.' in parts.netloc: + if not self.proxy_select_view: + return WbResponse.text_response('select text for ' + path_url) + + coll, sesh_id = self.get_coll(env) + + route_temp = env['pywb.proxy_scheme'] + '://%s.coll.set.' route_temp += self.magic_name + '/' + path_url return (self.proxy_select_view. render_response(routes=self.routes, route_temp=route_temp, + coll=coll, url=path_url)) - else: - return WbResponse.text_response('select text for ' + path_url) - def make_cookie_headers(self, coll, domain): + #else: + # msg = 'Invalid Magic Path: ' + url + # print msg + # return WbResponse.text_response(msg, status='404 Not Found') + + def make_cookie_headers(self, sesh_id, domain): cookie_val = '{0}={1}; Path=/; Domain=.{2}; HttpOnly' - cookie_val = cookie_val.format(self.cookie_name, coll, domain) + cookie_val = cookie_val.format(self.cookie_name, sesh_id, domain) headers = [('Set-Cookie', cookie_val)] return headers - def make_sethost_cookie_response(self, coll, path_url, env, headers=None): + def make_sethost_cookie_response(self, sesh_id, path_url, env, headers=None): path_parts = urlparse.urlsplit(path_url) new_url = path_parts.path[1:] if path_parts.query: new_url += '?' + path_parts.query - return self.make_magic_response(coll + '.sethost', new_url, env, + return self.make_magic_response(sesh_id + '.sethost', new_url, env, suffix=path_parts.netloc, headers=headers) @@ -194,6 +239,23 @@ class CookieResolver(BaseCollResolver): # pragma: no cover full_url += '/' + url return WbResponse.redir_response(full_url, headers=headers) + def get_coll(self, env): + sesh_id = self.extract_client_cookie(env, self.cookie_name) + + coll = None + if sesh_id: + coll = self.cache[sesh_id] + + return coll, sesh_id + + def create_renew_sesh_id(self, sesh_id, force=False): + #if sesh_id in self.cache and not force: + if sesh_id and (sesh_id in self.cache) and not force: + return sesh_id + + sesh_id = base64.b32encode(os.urandom(5)).lower() + return sesh_id + @staticmethod def extract_client_cookie(env, cookie_name): cookie_header = env.get('HTTP_COOKIE') diff --git a/pywb/framework/wbrequestresponse.py b/pywb/framework/wbrequestresponse.py index b17b3575..da456474 100644 --- a/pywb/framework/wbrequestresponse.py +++ b/pywb/framework/wbrequestresponse.py @@ -152,9 +152,13 @@ class WbResponse(object): pass @staticmethod - def text_stream(stream, status='200 OK', content_type='text/plain'): - status_headers = StatusAndHeaders(status, - [('Content-Type', content_type)]) + def text_stream(stream, status='200 OK', content_type='text/plain', + headers=None): + def_headers = [('Content-Type', content_type)] + if headers: + def_headers += headers + + status_headers = StatusAndHeaders(status, def_headers) return WbResponse(status_headers, value=stream) diff --git a/pywb/framework/wsgi_wrappers.py b/pywb/framework/wsgi_wrappers.py index c8e7c86a..2babc83f 100644 --- a/pywb/framework/wsgi_wrappers.py +++ b/pywb/framework/wsgi_wrappers.py @@ -77,8 +77,8 @@ class WSGIApp(object): ssl_sock.write('\r\n') for obj in resp_iter: - ssl_sock.write(obj) - + if obj: + ssl_sock.write(obj) ssl_sock.close() start_response(env['pywb.proxy_statusline'], []) @@ -125,22 +125,24 @@ class WSGIApp(object): else: err_url = None + err_msg = exc.message.encode('utf-8') + if print_trace: import traceback err_details = traceback.format_exc(exc) print err_details else: - logging.info(str(exc)) + logging.info(err_msg) err_details = None if error_view: return error_view.render_response(exc_type=type(exc).__name__, - err_msg=str(exc), + err_msg=err_msg, err_details=err_details, status=status, err_url=err_url) else: - return WbResponse.text_response(status + ' Error: ' + str(exc), + return WbResponse.text_response(status + ' Error: ' + err_msg, status=status) #=================================================================