From 6234d795dcd242654362a46a18a410007942b0ee Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 28 Jul 2014 11:52:54 -0700 Subject: [PATCH] proxy improvements: refactor coll selector into BaseCollSelector, supporting either proxy auth or cookie-based selection (in progress) https proxy: support POST requests, properly read http header and wrap remainder in wsgi.input https proxy: properly update wsgi for wrapped request wbrequestresponse: add content-length 0 to redir_response --- pywb/framework/proxy.py | 296 +++++++++++++++--- pywb/framework/test/test_wbrequestresponse.py | 2 +- pywb/framework/wbrequestresponse.py | 11 +- pywb/framework/wsgi_wrappers.py | 2 +- pywb/webapp/pywb_init.py | 6 +- 5 files changed, 260 insertions(+), 57 deletions(-) diff --git a/pywb/framework/proxy.py b/pywb/framework/proxy.py index fdfb8ac1..386927ca 100644 --- a/pywb/framework/proxy.py +++ b/pywb/framework/proxy.py @@ -12,6 +12,8 @@ from pywb.rewrite.url_rewriter import HttpsUrlRewriter from pywb.utils.statusandheaders import StatusAndHeaders from pywb.utils.wbexception import BadRequestException +from pywb.utils.bufferedreaders import BufferedReader + from certauth import CertificateAuthority @@ -51,8 +53,10 @@ class ProxyRouter(object): for more details. """ + PAC_PATH = '/proxy.pac' + BLOCK_SIZE = 4096 + def __init__(self, routes, **kwargs): - self.routes = routes self.hostpaths = kwargs.get('hostpaths') self.error_view = kwargs.get('error_view') @@ -61,13 +65,14 @@ class ProxyRouter(object): if proxy_options: proxy_options = proxy_options.get('proxy_options', {}) - self.auth_msg = proxy_options.get('auth_msg', - 'Please enter name of a collection to use for proxy mode') - - self.use_default_coll = proxy_options.get('use_default_coll', True) + self.resolver = ProxyAuthResolver(routes, proxy_options) + #self.resolver = CookieResolver(routes, proxy_options) self.unaltered = proxy_options.get('unaltered_replay', False) + self.proxy_pac_path = proxy_options.get('pac_path', self.PAC_PATH) + + if proxy_options.get('enable_https_proxy'): ca_file = proxy_options.get('root_ca_file') @@ -85,48 +90,23 @@ class ProxyRouter(object): def __call__(self, env): is_https = (env['REQUEST_METHOD'] == 'CONNECT') + # for non-https requests, check pac path and non-proxy urls if not is_https: url = env['REL_REQUEST_URI'] - if url.endswith('/proxy.pac'): + if url == self.proxy_pac_path: return self.make_pac_response(env) if not url.startswith(('http://', 'https://')): return None - proxy_auth = env.get('HTTP_PROXY_AUTHORIZATION') + env['pywb.proxy_scheme'] = 'https' if is_https else 'http' - route = None - coll = None - matcher = None - - if proxy_auth: - proxy_coll = self.read_basic_auth_coll(proxy_auth) - - if not proxy_coll: - return self.proxy_auth_coll_response() - - proxy_coll = '/' + proxy_coll + '/' - - for r in self.routes: - matcher, c = r.is_handling(proxy_coll) - if matcher: - route = r - coll = c - break - - if not route: - return self.proxy_auth_coll_response() - - # if 'use_default_coll' or only one collection, use that - # for proxy mode - elif self.use_default_coll or len(self.routes) == 1: - route = self.routes[0] - coll = self.routes[0].regex.pattern - - # otherwise, require proxy auth 407 to select collection - else: - return self.proxy_auth_coll_response() + # check resolver, for pre connect resolve + if self.resolver.pre_connect: + route, coll, matcher, response = self.resolver.resolve(env) + if response: + return response # do connect, then get updated url if is_https: @@ -136,6 +116,12 @@ class ProxyRouter(object): url = env['REL_REQUEST_URI'] + # check resolver, post connect + if not self.resolver.pre_connect: + route, coll, matcher, response = self.resolver.resolve(env) + if response: + return response + wbrequest = route.request_class(env, request_uri=url, wb_url_str=url, @@ -189,20 +175,18 @@ class ProxyRouter(object): sock.send('Server: pywb proxy\r\n') sock.send('\r\n') - hostname = env['REL_REQUEST_URI'].split(':')[0] + hostname, port = env['REL_REQUEST_URI'].split(':') created, certfile = self.ca.get_cert_for_host(hostname) ssl_sock = ssl.wrap_socket(sock, server_side=True, - certfile=certfile) - #ssl_version=ssl.PROTOCOL_SSLv23) + certfile=certfile, + ciphers="ALL", + ssl_version=ssl.PROTOCOL_SSLv23) env['pywb.proxy_ssl_sock'] = ssl_sock - #todo: better reading of all headers - buff = ssl_sock.recv(4096) - - buffreader = BytesIO(buff) + buffreader = BufferedReader(ssl_sock, block_size=self.BLOCK_SIZE) statusline = buffreader.readline() statusparts = statusline.split(' ') @@ -217,23 +201,44 @@ class ProxyRouter(object): env['SERVER_PROTOCOL'] = statusparts[2].strip() + env['SERVER_NAME'] = hostname + env['SERVER_PORT'] = port + queryparts = env['REL_REQUEST_URI'].split('?', 1) env['PATH_INFO'] = queryparts[0] env['QUERY_STRING'] = queryparts[1] if len(queryparts) > 1 else '' - env['wsgi.input'] = socket._fileobject(ssl_sock, mode='r') + env['wsgi.url_scheme'] = 'https' while True: line = buffreader.readline() + if line: + line = line.rstrip() + if not line: break - parts = line.split(':') + parts = line.split(':', 1) if len(parts) < 2: continue - name = 'HTTP_' + parts[0].replace('-', '_').upper() - env[name] = parts[1] + name = parts[0].strip() + value = parts[1].strip() + + name = name.replace('-', '_').upper() + + if not name in ('CONTENT_LENGTH', 'CONTENT_TYPE'): + name = 'HTTP_' + name + + env[name] = value + + remain = buffreader.rem_length() + if remain > 0: + remainder = buffreader.read(self.BLOCK_SIZE) + input_ = socket._fileobject(ssl_sock, mode='r') + env['wsgi.input'] = BufferedReader(input_, + block_size=self.BLOCK_SIZE, + starting_data=remainder) # Proxy Auto-Config (PAC) script for the proxy def make_pac_response(self, env): @@ -263,7 +268,73 @@ class ProxyRouter(object): return WbResponse.text_response(buff, content_type=content_type) - def proxy_auth_coll_response(self): + +#================================================================= +class BaseCollResolver(object): + def __init__(self, routes, config): + self.routes = routes + self.pre_connect = config.get('pre_connect', False) + self.use_default_coll = config.get('use_default_coll', True) + + def resolve(self, env): + route = None + coll = None + matcher = None + + proxy_coll = self.get_proxy_coll(env) + + # invalid parsing + if proxy_coll == '': + return None, None, None, self.select_coll_response(env) + + if proxy_coll is None and isinstance(self.use_default_coll, str): + proxy_coll = self.use_default_coll + + if proxy_coll: + proxy_coll = '/' + proxy_coll + '/' + + for r in self.routes: + matcher, c = r.is_handling(proxy_coll) + if matcher: + route = r + coll = c + break + + # if no match, return coll selection response + if not route: + return None, None, None, self.select_coll_response(env) + + # if 'use_default_coll' + elif self.use_default_coll == True or len(self.routes) == 1: + route = self.routes[0] + coll = self.routes[0].path + + # otherwise, return the appropriate coll selection response + else: + return None, None, None, self.select_coll_response(env) + + return route, coll, matcher, None + + +#================================================================= +class ProxyAuthResolver(BaseCollResolver): + DEFAULT_MSG = 'Please enter name of a collection to use with proxy mode' + + def __init__(self, routes, config): + config['pre_connect'] = True + super(ProxyAuthResolver, self).__init__(routes, config) + self.auth_msg = config.get('auth_msg', self.DEFAULT_MSG) + + def get_proxy_coll(self, env): + proxy_auth = env.get('HTTP_PROXY_AUTHORIZATION') + + if not proxy_auth: + return None + + proxy_coll = self.read_basic_auth_coll(proxy_auth) + return proxy_coll + + def select_coll_response(self, env): proxy_msg = 'Basic realm="{0}"'.format(self.auth_msg) headers = [('Content-Type', 'text/plain'), @@ -286,3 +357,128 @@ class ProxyRouter(object): user_pass = base64.b64decode(parts[1]) return user_pass.split(':')[0] + + +#================================================================= +class CookieResolver(BaseCollResolver): + def __init__(self, routes, config): + config['pre_connect'] = False + super(CookieResolver, self).__init__(routes, config) + self.magic_name = config.get('magic_name', 'pywb-proxy.com') + self.cookie_name = config.get('cookie_name', '__pywb_coll') + self.proxy_select_view = config.get('proxy_select_view') + + def get_proxy_coll(self, env): + cookie = self.extract_client_cookie(env, self.cookie_name) + return cookie + + def select_coll_response(self, env): + return self.make_magic_response('auto', + env['REL_REQUEST_URI'], + env) + + def resolve(self, env): + url = env['REL_REQUEST_URI'] + + if ('.' + self.magic_name) in url: + return None, None, None, self.handle_magic_page(url, env) + + return super(CookieResolver, self).resolve(env) + + def handle_magic_page(self, url, env): + parts = urlparse.urlsplit(url) + + path_url = parts.path[1:] + if parts.query: + path_url += '?' + parts.query + + if parts.netloc.startswith('auto'): + coll = self.extract_client_cookie(env, self.cookie_name) + + if coll: + return self.make_sethost_cookie_response(coll, path_url, env) + else: + return self.make_magic_response('select', path_url, env) + + elif '.set.' in parts.netloc: + coll = parts.netloc.split('.', 1)[0] + headers = self.make_cookie_headers(coll, self.magic_name) + + return self.make_sethost_cookie_response(coll, path_url, env, + headers=headers) + + elif '.sethost.' in parts.netloc: + host_parts = parts.netloc.split('.', 1) + coll = host_parts[0] + + inx = parts.netloc.find('.' + self.magic_name + '.') + domain = parts.netloc[inx + len(self.magic_name) + 2:] + + headers = self.make_cookie_headers(coll, domain) + + full_url = env['pywb.proxy_scheme'] + '://' + domain + full_url += '/' + path_url + return WbResponse.redir_response(full_url, headers=headers) + + elif self.proxy_select_view: + route_temp = env['pywb.proxy_scheme'] + '://%s.set.' + route_temp += self.magic_name + '/' + path_url + + return (self.proxy_select_view. + render_response(routes=self.routes, + route_temp=route_temp, + url=path_url)) + else: + return WbResponse.text_response('select text for ' + path_url) + + def make_cookie_headers(self, coll, domain): + cookie_val = '{0}={1}; Path=/; Domain=.{2}; HttpOnly' + cookie_val = cookie_val.format(self.cookie_name, coll, domain) + headers = [('Set-Cookie', cookie_val)] + return headers + + def make_sethost_cookie_response(self, coll, path_url, env, headers=None): + path_parts = urlparse.urlsplit(path_url) + + new_url = path_parts.path[1:] + if path_parts.query: + new_url += '?' + path_parts.query + + return self.make_magic_response(coll + '.sethost', new_url, env, + suffix=path_parts.netloc, + headers=headers) + + + def make_magic_response(self, prefix, url, env, + suffix=None, headers=None): + full_url = env['pywb.proxy_scheme'] + '://' + prefix + '.' + full_url += self.magic_name + if suffix: + full_url += '.' + suffix + full_url += '/' + url + return WbResponse.redir_response(full_url, headers=headers) + + @staticmethod + def extract_client_cookie(env, cookie_name): + cookie_header = env.get('HTTP_COOKIE') + if not cookie_header: + return None + + # attempt to extract cookie_name only + inx = cookie_header.find(cookie_name) + if inx < 0: + return None + + end_inx = cookie_header.find(';', inx) + if end_inx > 0: + value = cookie_header[inx:end_inx] + else: + value = cookie_header[inx:] + + value = value.split('=') + if len(value) < 2: + return None + + value = value[1].strip() + return value + diff --git a/pywb/framework/test/test_wbrequestresponse.py b/pywb/framework/test/test_wbrequestresponse.py index 65940e4a..5bbb65b8 100644 --- a/pywb/framework/test/test_wbrequestresponse.py +++ b/pywb/framework/test/test_wbrequestresponse.py @@ -46,7 +46,7 @@ {'body': ['Test', 'Another'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '404', headers = [('Content-Type', 'text/plain')])} >>> WbResponse.redir_response('http://example.com/otherfile') -{'body': [], 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [('Location', 'http://example.com/otherfile')])} +{'body': [], 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [('Location', 'http://example.com/otherfile'), ('Content-Length', '0')])} """ diff --git a/pywb/framework/wbrequestresponse.py b/pywb/framework/wbrequestresponse.py index f2c63f9c..b17b3575 100644 --- a/pywb/framework/wbrequestresponse.py +++ b/pywb/framework/wbrequestresponse.py @@ -125,7 +125,7 @@ class WbRequest(object): if not self.wb_url: return - mime = self.env.get('CONTENT_TYPE') + mime = self.env.get('CONTENT_TYPE').split(';')[0] length = self.env.get('CONTENT_LENGTH') stream = self.env['wsgi.input'] @@ -167,9 +167,12 @@ class WbResponse(object): return WbResponse(status_headers, value=[text]) @staticmethod - def redir_response(location, status='302 Redirect'): - return WbResponse(StatusAndHeaders(status, - [('Location', location)])) + def redir_response(location, status='302 Redirect', headers=None): + redir_headers = [('Location', location), ('Content-Length', '0')] + if headers: + redir_headers += headers + + return WbResponse(StatusAndHeaders(status, redir_headers)) def __call__(self, env, start_response): diff --git a/pywb/framework/wsgi_wrappers.py b/pywb/framework/wsgi_wrappers.py index 1e1100e4..c8e7c86a 100644 --- a/pywb/framework/wsgi_wrappers.py +++ b/pywb/framework/wsgi_wrappers.py @@ -64,7 +64,7 @@ class WSGIApp(object): env['pywb.proxy_statusline'] = statusline - ssl_sock.write('HTTP/1.0 ' + statusline + '\r\n') + ssl_sock.write('HTTP/1.1 ' + statusline + '\r\n') for name, value in headers: ssl_sock.write(name + ': ' + value + '\r\n') diff --git a/pywb/webapp/pywb_init.py b/pywb/webapp/pywb_init.py index 2fd02377..6de8fafa 100644 --- a/pywb/webapp/pywb_init.py +++ b/pywb/webapp/pywb_init.py @@ -215,13 +215,17 @@ def create_wb_router(passed_config={}): if hasattr(route.handler, 'resolve_refs'): route.handler.resolve_refs(handler_dict) - # Check for new proxy mode! if config.get('enable_http_proxy', False): router = ProxyArchivalRouter else: router = ArchivalRouter + if config.get('proxy_select_html'): + temp = J2TemplateView.create_template(config.get('proxy_select_html'), + 'Proxy Coll Selector') + config.get('proxy_options')['proxy_select_view'] = temp + # Finally, create wb router return router( routes,