1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

proxy improvements: refactor coll selector into BaseCollResolver,

supporting either proxy auth or cookie-based selection (in progress)
https proxy: support POST requests, properly read http header and wrap remainder
in wsgi.input
https proxy: properly update wsgi for wrapped request
wbrequestresponse: add content-length 0 to redir_response
This commit is contained in:
Ilya Kreymer 2014-07-28 11:52:54 -07:00
parent b6fb0e510e
commit 6234d795dc
5 changed files with 260 additions and 57 deletions

View File

@ -12,6 +12,8 @@ from pywb.rewrite.url_rewriter import HttpsUrlRewriter
from pywb.utils.statusandheaders import StatusAndHeaders from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.wbexception import BadRequestException from pywb.utils.wbexception import BadRequestException
from pywb.utils.bufferedreaders import BufferedReader
from certauth import CertificateAuthority from certauth import CertificateAuthority
@ -51,8 +53,10 @@ class ProxyRouter(object):
for more details. for more details.
""" """
PAC_PATH = '/proxy.pac'
BLOCK_SIZE = 4096
def __init__(self, routes, **kwargs): def __init__(self, routes, **kwargs):
self.routes = routes
self.hostpaths = kwargs.get('hostpaths') self.hostpaths = kwargs.get('hostpaths')
self.error_view = kwargs.get('error_view') self.error_view = kwargs.get('error_view')
@ -61,13 +65,14 @@ class ProxyRouter(object):
if proxy_options: if proxy_options:
proxy_options = proxy_options.get('proxy_options', {}) proxy_options = proxy_options.get('proxy_options', {})
self.auth_msg = proxy_options.get('auth_msg', self.resolver = ProxyAuthResolver(routes, proxy_options)
'Please enter name of a collection to use for proxy mode') #self.resolver = CookieResolver(routes, proxy_options)
self.use_default_coll = proxy_options.get('use_default_coll', True)
self.unaltered = proxy_options.get('unaltered_replay', False) self.unaltered = proxy_options.get('unaltered_replay', False)
self.proxy_pac_path = proxy_options.get('pac_path', self.PAC_PATH)
if proxy_options.get('enable_https_proxy'): if proxy_options.get('enable_https_proxy'):
ca_file = proxy_options.get('root_ca_file') ca_file = proxy_options.get('root_ca_file')
@ -85,48 +90,23 @@ class ProxyRouter(object):
def __call__(self, env): def __call__(self, env):
is_https = (env['REQUEST_METHOD'] == 'CONNECT') is_https = (env['REQUEST_METHOD'] == 'CONNECT')
# for non-https requests, check pac path and non-proxy urls
if not is_https: if not is_https:
url = env['REL_REQUEST_URI'] url = env['REL_REQUEST_URI']
if url.endswith('/proxy.pac'): if url == self.proxy_pac_path:
return self.make_pac_response(env) return self.make_pac_response(env)
if not url.startswith(('http://', 'https://')): if not url.startswith(('http://', 'https://')):
return None return None
proxy_auth = env.get('HTTP_PROXY_AUTHORIZATION') env['pywb.proxy_scheme'] = 'https' if is_https else 'http'
route = None # check resolver, for pre connect resolve
coll = None if self.resolver.pre_connect:
matcher = None route, coll, matcher, response = self.resolver.resolve(env)
if response:
if proxy_auth: return response
proxy_coll = self.read_basic_auth_coll(proxy_auth)
if not proxy_coll:
return self.proxy_auth_coll_response()
proxy_coll = '/' + proxy_coll + '/'
for r in self.routes:
matcher, c = r.is_handling(proxy_coll)
if matcher:
route = r
coll = c
break
if not route:
return self.proxy_auth_coll_response()
# if 'use_default_coll' or only one collection, use that
# for proxy mode
elif self.use_default_coll or len(self.routes) == 1:
route = self.routes[0]
coll = self.routes[0].regex.pattern
# otherwise, require proxy auth 407 to select collection
else:
return self.proxy_auth_coll_response()
# do connect, then get updated url # do connect, then get updated url
if is_https: if is_https:
@ -136,6 +116,12 @@ class ProxyRouter(object):
url = env['REL_REQUEST_URI'] url = env['REL_REQUEST_URI']
# check resolver, post connect
if not self.resolver.pre_connect:
route, coll, matcher, response = self.resolver.resolve(env)
if response:
return response
wbrequest = route.request_class(env, wbrequest = route.request_class(env,
request_uri=url, request_uri=url,
wb_url_str=url, wb_url_str=url,
@ -189,20 +175,18 @@ class ProxyRouter(object):
sock.send('Server: pywb proxy\r\n') sock.send('Server: pywb proxy\r\n')
sock.send('\r\n') sock.send('\r\n')
hostname = env['REL_REQUEST_URI'].split(':')[0] hostname, port = env['REL_REQUEST_URI'].split(':')
created, certfile = self.ca.get_cert_for_host(hostname) created, certfile = self.ca.get_cert_for_host(hostname)
ssl_sock = ssl.wrap_socket(sock, ssl_sock = ssl.wrap_socket(sock,
server_side=True, server_side=True,
certfile=certfile) certfile=certfile,
#ssl_version=ssl.PROTOCOL_SSLv23) ciphers="ALL",
ssl_version=ssl.PROTOCOL_SSLv23)
env['pywb.proxy_ssl_sock'] = ssl_sock env['pywb.proxy_ssl_sock'] = ssl_sock
#todo: better reading of all headers buffreader = BufferedReader(ssl_sock, block_size=self.BLOCK_SIZE)
buff = ssl_sock.recv(4096)
buffreader = BytesIO(buff)
statusline = buffreader.readline() statusline = buffreader.readline()
statusparts = statusline.split(' ') statusparts = statusline.split(' ')
@ -217,23 +201,44 @@ class ProxyRouter(object):
env['SERVER_PROTOCOL'] = statusparts[2].strip() env['SERVER_PROTOCOL'] = statusparts[2].strip()
env['SERVER_NAME'] = hostname
env['SERVER_PORT'] = port
queryparts = env['REL_REQUEST_URI'].split('?', 1) queryparts = env['REL_REQUEST_URI'].split('?', 1)
env['PATH_INFO'] = queryparts[0] env['PATH_INFO'] = queryparts[0]
env['QUERY_STRING'] = queryparts[1] if len(queryparts) > 1 else '' env['QUERY_STRING'] = queryparts[1] if len(queryparts) > 1 else ''
env['wsgi.input'] = socket._fileobject(ssl_sock, mode='r') env['wsgi.url_scheme'] = 'https'
while True: while True:
line = buffreader.readline() line = buffreader.readline()
if line:
line = line.rstrip()
if not line: if not line:
break break
parts = line.split(':') parts = line.split(':', 1)
if len(parts) < 2: if len(parts) < 2:
continue continue
name = 'HTTP_' + parts[0].replace('-', '_').upper() name = parts[0].strip()
env[name] = parts[1] value = parts[1].strip()
name = name.replace('-', '_').upper()
if not name in ('CONTENT_LENGTH', 'CONTENT_TYPE'):
name = 'HTTP_' + name
env[name] = value
remain = buffreader.rem_length()
if remain > 0:
remainder = buffreader.read(self.BLOCK_SIZE)
input_ = socket._fileobject(ssl_sock, mode='r')
env['wsgi.input'] = BufferedReader(input_,
block_size=self.BLOCK_SIZE,
starting_data=remainder)
# Proxy Auto-Config (PAC) script for the proxy # Proxy Auto-Config (PAC) script for the proxy
def make_pac_response(self, env): def make_pac_response(self, env):
@ -263,7 +268,73 @@ class ProxyRouter(object):
return WbResponse.text_response(buff, content_type=content_type) return WbResponse.text_response(buff, content_type=content_type)
def proxy_auth_coll_response(self):
#=================================================================
class BaseCollResolver(object):
    """Base class for choosing which route/collection serves a proxy request.

    Subclasses supply two hooks:

    * ``get_proxy_coll(env)`` -- return the collection name requested by the
      client, ``None`` if the client did not specify one, or ``''`` if the
      client's selection could not be parsed.
    * ``select_coll_response(env)`` -- return the response that asks the
      client to pick a collection.

    Config keys read here:

    * ``pre_connect`` (default ``False``) -- stored as ``self.pre_connect``.
    * ``use_default_coll`` (default ``True``) -- ``True`` falls back to the
      first route, a string names the collection to fall back to, anything
      else disables the fallback (unless there is exactly one route).
    """

    def __init__(self, routes, config):
        self.routes = routes
        self.pre_connect = config.get('pre_connect', False)
        self.use_default_coll = config.get('use_default_coll', True)

    def get_proxy_coll(self, env):
        """Return the client-selected collection name (subclass hook)."""
        raise NotImplementedError('get_proxy_coll() must be overridden')

    def select_coll_response(self, env):
        """Return the "please select a collection" response (subclass hook)."""
        raise NotImplementedError('select_coll_response() must be overridden')

    def resolve(self, env):
        """Resolve the route and collection for the request in ``env``.

        Returns a 4-tuple ``(route, coll, matcher, response)``.  When a
        route was resolved, ``response`` is ``None``.  When no route could
        be resolved, the first three items are ``None`` and ``response`` is
        the value of ``select_coll_response(env)``.
        """
        route = None
        coll = None
        matcher = None

        proxy_coll = self.get_proxy_coll(env)

        # an empty string signals a selection that failed to parse
        if proxy_coll == '':
            return None, None, None, self.select_coll_response(env)

        # no selection from the client: use the configured default name
        if proxy_coll is None and isinstance(self.use_default_coll, str):
            proxy_coll = self.use_default_coll

        if proxy_coll:
            proxy_coll = '/' + proxy_coll + '/'

            for candidate in self.routes:
                matcher, matched_coll = candidate.is_handling(proxy_coll)
                if matcher:
                    route = candidate
                    coll = matched_coll
                    break

            # no route handles the requested collection
            if route is None:
                return None, None, None, self.select_coll_response(env)

        # default to the first route if enabled or if it is the only one.
        # '== True' (not 'is True') is kept so truthy config values such as 1
        # behave as before; the emptiness check avoids an IndexError when
        # no routes are configured.
        elif ((self.use_default_coll == True and len(self.routes) > 0) or
              len(self.routes) == 1):
            route = self.routes[0]
            coll = route.path

        # otherwise, ask the client to select a collection
        else:
            return None, None, None, self.select_coll_response(env)

        return route, coll, matcher, None
#=================================================================
class ProxyAuthResolver(BaseCollResolver):
DEFAULT_MSG = 'Please enter name of a collection to use with proxy mode'
def __init__(self, routes, config):
config['pre_connect'] = True
super(ProxyAuthResolver, self).__init__(routes, config)
self.auth_msg = config.get('auth_msg', self.DEFAULT_MSG)
def get_proxy_coll(self, env):
proxy_auth = env.get('HTTP_PROXY_AUTHORIZATION')
if not proxy_auth:
return None
proxy_coll = self.read_basic_auth_coll(proxy_auth)
return proxy_coll
def select_coll_response(self, env):
proxy_msg = 'Basic realm="{0}"'.format(self.auth_msg) proxy_msg = 'Basic realm="{0}"'.format(self.auth_msg)
headers = [('Content-Type', 'text/plain'), headers = [('Content-Type', 'text/plain'),
@ -286,3 +357,128 @@ class ProxyRouter(object):
user_pass = base64.b64decode(parts[1]) user_pass = base64.b64decode(parts[1])
return user_pass.split(':')[0] return user_pass.split(':')[0]
#=================================================================
class CookieResolver(BaseCollResolver):
    """Select the proxy collection from a client cookie.

    The collection is chosen through redirects to hosts under a "magic"
    domain (``magic_name``, default ``pywb-proxy.com``):

    * ``auto.<magic>`` -- use the collection cookie if present, otherwise
      redirect to the ``select`` page.
    * ``<coll>.set.<magic>`` -- set the cookie for the magic domain, then
      continue to the ``sethost`` step.
    * ``<coll>.sethost.<magic>.<domain>`` -- set the cookie for ``<domain>``
      and redirect to the originally requested url.
    * any other magic host -- render the collection selection page.

    Config keys read here: ``magic_name``, ``cookie_name`` (default
    ``__pywb_coll``) and ``proxy_select_view``.  ``pre_connect`` is forced
    to ``False`` in the passed-in config.
    """

    def __init__(self, routes, config):
        config['pre_connect'] = False
        super(CookieResolver, self).__init__(routes, config)
        self.magic_name = config.get('magic_name', 'pywb-proxy.com')
        self.cookie_name = config.get('cookie_name', '__pywb_coll')
        self.proxy_select_view = config.get('proxy_select_view')

    def get_proxy_coll(self, env):
        """Return the collection stored in the client cookie, or None."""
        return self.extract_client_cookie(env, self.cookie_name)

    def select_coll_response(self, env):
        """Redirect to the ``auto`` magic host, carrying the requested url."""
        return self.make_magic_response('auto',
                                        env['REL_REQUEST_URI'],
                                        env)

    def resolve(self, env):
        """Handle magic-domain urls here; defer everything else to the base.

        Returns the same ``(route, coll, matcher, response)`` tuple as
        ``BaseCollResolver.resolve``.
        """
        url = env['REL_REQUEST_URI']

        if ('.' + self.magic_name) in url:
            return None, None, None, self.handle_magic_page(url, env)

        return super(CookieResolver, self).resolve(env)

    def handle_magic_page(self, url, env):
        """Return the response for a request to a magic-domain host."""
        parts = urlparse.urlsplit(url)

        # the url being proxied is carried in the path (plus query)
        path_url = parts.path[1:]
        if parts.query:
            path_url += '?' + parts.query

        if parts.netloc.startswith('auto'):
            coll = self.extract_client_cookie(env, self.cookie_name)
            if coll:
                return self.make_sethost_cookie_response(coll, path_url, env)
            else:
                return self.make_magic_response('select', path_url, env)

        elif '.set.' in parts.netloc:
            # first host label is the chosen collection
            coll = parts.netloc.split('.', 1)[0]
            headers = self.make_cookie_headers(coll, self.magic_name)

            return self.make_sethost_cookie_response(coll, path_url, env,
                                                     headers=headers)

        elif '.sethost.' in parts.netloc:
            host_parts = parts.netloc.split('.', 1)
            coll = host_parts[0]

            # target domain is whatever follows '.<magic_name>.'
            # NOTE(review): if that marker is absent, find() returns -1 and
            # the slice below yields a bogus domain -- confirm whether this
            # case can be reached.
            inx = parts.netloc.find('.' + self.magic_name + '.')
            domain = parts.netloc[inx + len(self.magic_name) + 2:]

            headers = self.make_cookie_headers(coll, domain)

            full_url = env['pywb.proxy_scheme'] + '://' + domain
            full_url += '/' + path_url
            return WbResponse.redir_response(full_url, headers=headers)

        elif self.proxy_select_view:
            # '%s' is left in place for the view to fill in
            route_temp = env['pywb.proxy_scheme'] + '://%s.set.'
            route_temp += self.magic_name + '/' + path_url

            return (self.proxy_select_view.
                    render_response(routes=self.routes,
                                    route_temp=route_temp,
                                    url=path_url))
        else:
            return WbResponse.text_response('select text for ' + path_url)

    def make_cookie_headers(self, coll, domain):
        """Return a Set-Cookie header list storing ``coll`` for ``domain``."""
        cookie_val = '{0}={1}; Path=/; Domain=.{2}; HttpOnly'
        cookie_val = cookie_val.format(self.cookie_name, coll, domain)
        headers = [('Set-Cookie', cookie_val)]
        return headers

    def make_sethost_cookie_response(self, coll, path_url, env, headers=None):
        """Redirect to ``<coll>.sethost.<magic>.<host of path_url>``."""
        path_parts = urlparse.urlsplit(path_url)

        new_url = path_parts.path[1:]
        if path_parts.query:
            new_url += '?' + path_parts.query

        return self.make_magic_response(coll + '.sethost', new_url, env,
                                        suffix=path_parts.netloc,
                                        headers=headers)

    def make_magic_response(self, prefix, url, env,
                            suffix=None, headers=None):
        """Redirect to ``<scheme>://<prefix>.<magic>[.<suffix>]/<url>``."""
        full_url = env['pywb.proxy_scheme'] + '://' + prefix + '.'
        full_url += self.magic_name
        if suffix:
            full_url += '.' + suffix
        full_url += '/' + url
        return WbResponse.redir_response(full_url, headers=headers)

    @staticmethod
    def extract_client_cookie(env, cookie_name):
        """Return the value of ``cookie_name`` from ``HTTP_COOKIE``, or None.

        The header is split into ``;``-separated ``name=value`` pairs and
        the name is compared exactly, so a cookie whose name or value merely
        contains ``cookie_name`` is not mistaken for it.  Only the first
        ``=`` separates name from value, so values containing ``=`` are
        returned intact.
        """
        cookie_header = env.get('HTTP_COOKIE')
        if not cookie_header:
            return None

        for pair in cookie_header.split(';'):
            name, sep, value = pair.partition('=')
            # skip attribute-less tokens and other cookies
            if sep and name.strip() == cookie_name:
                return value.strip()

        return None

View File

@ -46,7 +46,7 @@
{'body': ['Test', 'Another'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '404', headers = [('Content-Type', 'text/plain')])} {'body': ['Test', 'Another'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '404', headers = [('Content-Type', 'text/plain')])}
>>> WbResponse.redir_response('http://example.com/otherfile') >>> WbResponse.redir_response('http://example.com/otherfile')
{'body': [], 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [('Location', 'http://example.com/otherfile')])} {'body': [], 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [('Location', 'http://example.com/otherfile'), ('Content-Length', '0')])}
""" """

View File

@ -125,7 +125,7 @@ class WbRequest(object):
if not self.wb_url: if not self.wb_url:
return return
mime = self.env.get('CONTENT_TYPE') mime = self.env.get('CONTENT_TYPE').split(';')[0]
length = self.env.get('CONTENT_LENGTH') length = self.env.get('CONTENT_LENGTH')
stream = self.env['wsgi.input'] stream = self.env['wsgi.input']
@ -167,9 +167,12 @@ class WbResponse(object):
return WbResponse(status_headers, value=[text]) return WbResponse(status_headers, value=[text])
@staticmethod @staticmethod
def redir_response(location, status='302 Redirect'): def redir_response(location, status='302 Redirect', headers=None):
return WbResponse(StatusAndHeaders(status, redir_headers = [('Location', location), ('Content-Length', '0')]
[('Location', location)])) if headers:
redir_headers += headers
return WbResponse(StatusAndHeaders(status, redir_headers))
def __call__(self, env, start_response): def __call__(self, env, start_response):

View File

@ -64,7 +64,7 @@ class WSGIApp(object):
env['pywb.proxy_statusline'] = statusline env['pywb.proxy_statusline'] = statusline
ssl_sock.write('HTTP/1.0 ' + statusline + '\r\n') ssl_sock.write('HTTP/1.1 ' + statusline + '\r\n')
for name, value in headers: for name, value in headers:
ssl_sock.write(name + ': ' + value + '\r\n') ssl_sock.write(name + ': ' + value + '\r\n')

View File

@ -215,13 +215,17 @@ def create_wb_router(passed_config={}):
if hasattr(route.handler, 'resolve_refs'): if hasattr(route.handler, 'resolve_refs'):
route.handler.resolve_refs(handler_dict) route.handler.resolve_refs(handler_dict)
# Check for new proxy mode! # Check for new proxy mode!
if config.get('enable_http_proxy', False): if config.get('enable_http_proxy', False):
router = ProxyArchivalRouter router = ProxyArchivalRouter
else: else:
router = ArchivalRouter router = ArchivalRouter
if config.get('proxy_select_html'):
temp = J2TemplateView.create_template(config.get('proxy_select_html'),
'Proxy Coll Selector')
config.get('proxy_options')['proxy_select_view'] = temp
# Finally, create wb router # Finally, create wb router
return router( return router(
routes, routes,