1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 08:04:49 +01:00

proxy improvements: refactor coll selector into BaseCollResolver,

supporting either proxy auth or cookie-based selection (in progress)
https proxy: support POST requests, properly read http header and wrap remainder
in wsgi.input
https proxy: properly update wsgi for wrapped request
wbrequestresponse: add content-length 0 to redir_response
This commit is contained in:
Ilya Kreymer 2014-07-28 11:52:54 -07:00
parent b6fb0e510e
commit 6234d795dc
5 changed files with 260 additions and 57 deletions

View File

@ -12,6 +12,8 @@ from pywb.rewrite.url_rewriter import HttpsUrlRewriter
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.wbexception import BadRequestException
from pywb.utils.bufferedreaders import BufferedReader
from certauth import CertificateAuthority
@ -51,8 +53,10 @@ class ProxyRouter(object):
for more details.
"""
PAC_PATH = '/proxy.pac'
BLOCK_SIZE = 4096
def __init__(self, routes, **kwargs):
self.routes = routes
self.hostpaths = kwargs.get('hostpaths')
self.error_view = kwargs.get('error_view')
@ -61,13 +65,14 @@ class ProxyRouter(object):
if proxy_options:
proxy_options = proxy_options.get('proxy_options', {})
self.auth_msg = proxy_options.get('auth_msg',
'Please enter name of a collection to use for proxy mode')
self.use_default_coll = proxy_options.get('use_default_coll', True)
self.resolver = ProxyAuthResolver(routes, proxy_options)
#self.resolver = CookieResolver(routes, proxy_options)
self.unaltered = proxy_options.get('unaltered_replay', False)
self.proxy_pac_path = proxy_options.get('pac_path', self.PAC_PATH)
if proxy_options.get('enable_https_proxy'):
ca_file = proxy_options.get('root_ca_file')
@ -85,48 +90,23 @@ class ProxyRouter(object):
def __call__(self, env):
is_https = (env['REQUEST_METHOD'] == 'CONNECT')
# for non-https requests, check pac path and non-proxy urls
if not is_https:
url = env['REL_REQUEST_URI']
if url.endswith('/proxy.pac'):
if url == self.proxy_pac_path:
return self.make_pac_response(env)
if not url.startswith(('http://', 'https://')):
return None
proxy_auth = env.get('HTTP_PROXY_AUTHORIZATION')
env['pywb.proxy_scheme'] = 'https' if is_https else 'http'
route = None
coll = None
matcher = None
if proxy_auth:
proxy_coll = self.read_basic_auth_coll(proxy_auth)
if not proxy_coll:
return self.proxy_auth_coll_response()
proxy_coll = '/' + proxy_coll + '/'
for r in self.routes:
matcher, c = r.is_handling(proxy_coll)
if matcher:
route = r
coll = c
break
if not route:
return self.proxy_auth_coll_response()
# if 'use_default_coll' or only one collection, use that
# for proxy mode
elif self.use_default_coll or len(self.routes) == 1:
route = self.routes[0]
coll = self.routes[0].regex.pattern
# otherwise, require proxy auth 407 to select collection
else:
return self.proxy_auth_coll_response()
# check resolver, for pre connect resolve
if self.resolver.pre_connect:
route, coll, matcher, response = self.resolver.resolve(env)
if response:
return response
# do connect, then get updated url
if is_https:
@ -136,6 +116,12 @@ class ProxyRouter(object):
url = env['REL_REQUEST_URI']
# check resolver, post connect
if not self.resolver.pre_connect:
route, coll, matcher, response = self.resolver.resolve(env)
if response:
return response
wbrequest = route.request_class(env,
request_uri=url,
wb_url_str=url,
@ -189,20 +175,18 @@ class ProxyRouter(object):
sock.send('Server: pywb proxy\r\n')
sock.send('\r\n')
hostname = env['REL_REQUEST_URI'].split(':')[0]
hostname, port = env['REL_REQUEST_URI'].split(':')
created, certfile = self.ca.get_cert_for_host(hostname)
ssl_sock = ssl.wrap_socket(sock,
server_side=True,
certfile=certfile)
#ssl_version=ssl.PROTOCOL_SSLv23)
certfile=certfile,
ciphers="ALL",
ssl_version=ssl.PROTOCOL_SSLv23)
env['pywb.proxy_ssl_sock'] = ssl_sock
#todo: better reading of all headers
buff = ssl_sock.recv(4096)
buffreader = BytesIO(buff)
buffreader = BufferedReader(ssl_sock, block_size=self.BLOCK_SIZE)
statusline = buffreader.readline()
statusparts = statusline.split(' ')
@ -217,23 +201,44 @@ class ProxyRouter(object):
env['SERVER_PROTOCOL'] = statusparts[2].strip()
env['SERVER_NAME'] = hostname
env['SERVER_PORT'] = port
queryparts = env['REL_REQUEST_URI'].split('?', 1)
env['PATH_INFO'] = queryparts[0]
env['QUERY_STRING'] = queryparts[1] if len(queryparts) > 1 else ''
env['wsgi.input'] = socket._fileobject(ssl_sock, mode='r')
env['wsgi.url_scheme'] = 'https'
while True:
line = buffreader.readline()
if line:
line = line.rstrip()
if not line:
break
parts = line.split(':')
parts = line.split(':', 1)
if len(parts) < 2:
continue
name = 'HTTP_' + parts[0].replace('-', '_').upper()
env[name] = parts[1]
name = parts[0].strip()
value = parts[1].strip()
name = name.replace('-', '_').upper()
if not name in ('CONTENT_LENGTH', 'CONTENT_TYPE'):
name = 'HTTP_' + name
env[name] = value
remain = buffreader.rem_length()
if remain > 0:
remainder = buffreader.read(self.BLOCK_SIZE)
input_ = socket._fileobject(ssl_sock, mode='r')
env['wsgi.input'] = BufferedReader(input_,
block_size=self.BLOCK_SIZE,
starting_data=remainder)
# Proxy Auto-Config (PAC) script for the proxy
def make_pac_response(self, env):
@ -263,7 +268,73 @@ class ProxyRouter(object):
return WbResponse.text_response(buff, content_type=content_type)
def proxy_auth_coll_response(self):
#=================================================================
class BaseCollResolver(object):
    """Base class for choosing which route (collection) serves a proxy request.

    Subclasses supply two hooks:

    * ``get_proxy_coll(env)`` -- return the collection name requested by the
      client, ``None`` if the client expressed no choice, or ``''`` if the
      client's selection could not be parsed.
    * ``select_coll_response(env)`` -- return the response that asks the
      client to pick a collection.

    Config keys read here:

    * ``pre_connect`` (default ``False``) -- stored as ``self.pre_connect``.
    * ``use_default_coll`` (default ``True``) -- either ``True`` (fall back
      to the first route) or a collection name string to use when the
      client made no choice.
    """

    def __init__(self, routes, config):
        self.routes = routes
        self.pre_connect = config.get('pre_connect', False)
        self.use_default_coll = config.get('use_default_coll', True)

    def get_proxy_coll(self, env):
        """Hook: return the client-selected collection name, None, or ''."""
        raise NotImplementedError('subclasses must implement get_proxy_coll')

    def select_coll_response(self, env):
        """Hook: return the response prompting the client to pick a coll."""
        raise NotImplementedError(
            'subclasses must implement select_coll_response')

    def resolve(self, env):
        """Resolve the route for this request.

        Returns a 4-tuple ``(route, coll, matcher, response)``.  On success
        ``response`` is ``None``; otherwise the first three items are
        ``None`` and ``response`` is the collection-selection response.
        ``matcher`` is ``None`` when the first route is used as the default.
        """
        proxy_coll = self.get_proxy_coll(env)

        # an empty string signals that the client's selection was unparsable
        if proxy_coll == '':
            return None, None, None, self.select_coll_response(env)

        # no selection from the client: use the configured default name
        if proxy_coll is None and isinstance(self.use_default_coll, str):
            proxy_coll = self.use_default_coll

        if proxy_coll:
            coll_path = '/' + proxy_coll + '/'
            for route in self.routes:
                matcher, coll = route.is_handling(coll_path)
                if matcher:
                    return route, coll, matcher, None

            # if no match, return coll selection response
            return None, None, None, self.select_coll_response(env)

        # Explicit comparison with True is deliberate: a non-bool value
        # (e.g. a name that was not applied above) must not enable the
        # first-route fallback.  An empty route list has no default.
        if self.routes and (self.use_default_coll == True or
                            len(self.routes) == 1):
            default_route = self.routes[0]
            return default_route, default_route.path, None, None

        # otherwise, return the appropriate coll selection response
        return None, None, None, self.select_coll_response(env)
#=================================================================
class ProxyAuthResolver(BaseCollResolver):
DEFAULT_MSG = 'Please enter name of a collection to use with proxy mode'
def __init__(self, routes, config):
    """Set up the resolver; forces ``pre_connect`` on in ``config``.

    ``config`` is updated in place before the base class reads it, and
    the prompt text is taken from ``auth_msg`` (falling back to
    ``DEFAULT_MSG``).
    """
    config.update(pre_connect=True)
    super(ProxyAuthResolver, self).__init__(routes, config)
    self.auth_msg = config.get('auth_msg', self.DEFAULT_MSG)
def get_proxy_coll(self, env):
    """Return the collection named in the Proxy-Authorization header.

    Yields ``None`` when the ``HTTP_PROXY_AUTHORIZATION`` entry is missing
    or empty; otherwise returns whatever ``read_basic_auth_coll`` extracts
    from the header value.
    """
    auth_header = env.get('HTTP_PROXY_AUTHORIZATION')
    return self.read_basic_auth_coll(auth_header) if auth_header else None
def select_coll_response(self, env):
proxy_msg = 'Basic realm="{0}"'.format(self.auth_msg)
headers = [('Content-Type', 'text/plain'),
@ -286,3 +357,128 @@ class ProxyRouter(object):
user_pass = base64.b64decode(parts[1])
return user_pass.split(':')[0]
#=================================================================
class CookieResolver(BaseCollResolver):
    """Collection resolver that keeps the selected collection in a cookie.

    Selection is driven by redirects to "magic" hosts of the form
    ``<prefix>.<magic_name>[.<suffix>]``; ``handle_magic_page`` lists the
    prefixes that are recognised.

    Config keys read here: ``magic_name`` (default ``'pywb-proxy.com'``),
    ``cookie_name`` (default ``'__pywb_coll'``) and ``proxy_select_view``
    (optional view used to render the selection page).
    """

    def __init__(self, routes, config):
        # NOTE: config is modified in place; pre_connect is forced off
        # before the base class reads it.
        config['pre_connect'] = False
        super(CookieResolver, self).__init__(routes, config)
        self.magic_name = config.get('magic_name', 'pywb-proxy.com')
        self.cookie_name = config.get('cookie_name', '__pywb_coll')
        self.proxy_select_view = config.get('proxy_select_view')

    def get_proxy_coll(self, env):
        """Return the collection stored in the client's cookie, or None."""
        return self.extract_client_cookie(env, self.cookie_name)

    def select_coll_response(self, env):
        """Redirect to the ``auto`` magic host, carrying the requested url."""
        return self.make_magic_response('auto',
                                        env['REL_REQUEST_URI'],
                                        env)

    def resolve(self, env):
        """Handle magic-host urls directly; otherwise resolve via the base.

        Any url containing ``'.' + magic_name`` is answered by
        ``handle_magic_page`` and returned as the response element of the
        ``(route, coll, matcher, response)`` tuple.
        """
        url = env['REL_REQUEST_URI']

        if ('.' + self.magic_name) in url:
            return None, None, None, self.handle_magic_page(url, env)

        return super(CookieResolver, self).resolve(env)

    def handle_magic_page(self, url, env):
        """Build the response for a request to a magic host.

        The host (netloc) of ``url`` selects the action:

        * starts with ``auto`` -- if the cookie is present, redirect to the
          ``<coll>.sethost`` magic host; otherwise redirect to the
          ``select`` magic host.
        * contains ``.set.`` -- the first host label is the collection; set
          the cookie for the magic domain and redirect to ``sethost``.
        * contains ``.sethost.`` -- set the cookie for the domain following
          ``.<magic_name>.`` and redirect to the target url on that domain.
        * anything else -- render ``proxy_select_view`` if configured,
          else return a plain text placeholder.
        """
        parts = urlparse.urlsplit(url)
        netloc = parts.netloc

        # the target url is carried in the path (plus query) of the magic url
        path_url = parts.path[1:]
        if parts.query:
            path_url += '?' + parts.query

        if netloc.startswith('auto'):
            coll = self.extract_client_cookie(env, self.cookie_name)
            if coll:
                return self.make_sethost_cookie_response(coll, path_url, env)
            return self.make_magic_response('select', path_url, env)

        if '.set.' in netloc:
            coll = netloc.split('.', 1)[0]
            headers = self.make_cookie_headers(coll, self.magic_name)
            return self.make_sethost_cookie_response(coll, path_url, env,
                                                     headers=headers)

        if '.sethost.' in netloc:
            coll = netloc.split('.', 1)[0]

            # skip past '.<magic_name>.' to get the target domain
            inx = netloc.find('.' + self.magic_name + '.')
            domain = netloc[inx + len(self.magic_name) + 2:]

            headers = self.make_cookie_headers(coll, domain)

            full_url = env['pywb.proxy_scheme'] + '://' + domain
            full_url += '/' + path_url
            return WbResponse.redir_response(full_url, headers=headers)

        if self.proxy_select_view:
            # '%s' is left in the template for the view to fill in
            route_temp = env['pywb.proxy_scheme'] + '://%s.set.'
            route_temp += self.magic_name + '/' + path_url

            return (self.proxy_select_view.
                    render_response(routes=self.routes,
                                    route_temp=route_temp,
                                    url=path_url))

        return WbResponse.text_response('select text for ' + path_url)

    def make_cookie_headers(self, coll, domain):
        """Return a one-item header list setting the coll cookie on domain."""
        cookie_val = '{0}={1}; Path=/; Domain=.{2}; HttpOnly'
        cookie_val = cookie_val.format(self.cookie_name, coll, domain)
        return [('Set-Cookie', cookie_val)]

    def make_sethost_cookie_response(self, coll, path_url, env, headers=None):
        """Redirect to ``<coll>.sethost.<magic_name>.<target host>``.

        ``path_url`` is split as a url: its host becomes the suffix of the
        magic host and its path (without the leading slash) plus query
        becomes the url carried by the redirect.
        """
        path_parts = urlparse.urlsplit(path_url)

        new_url = path_parts.path[1:]
        if path_parts.query:
            new_url += '?' + path_parts.query

        return self.make_magic_response(coll + '.sethost', new_url, env,
                                        suffix=path_parts.netloc,
                                        headers=headers)

    def make_magic_response(self, prefix, url, env,
                            suffix=None, headers=None):
        """Redirect to ``scheme://<prefix>.<magic_name>[.<suffix>]/<url>``.

        The scheme is taken from ``env['pywb.proxy_scheme']``.
        """
        full_url = env['pywb.proxy_scheme'] + '://' + prefix + '.'
        full_url += self.magic_name
        if suffix:
            full_url += '.' + suffix
        full_url += '/' + url
        return WbResponse.redir_response(full_url, headers=headers)

    @staticmethod
    def extract_client_cookie(env, cookie_name):
        """Return the value of ``cookie_name`` from ``env['HTTP_COOKIE']``.

        Returns ``None`` when there is no cookie header or no cookie with
        exactly that name.  The header is split into ``name=value`` pairs
        so that a name which merely ends with ``cookie_name``, or a value
        which happens to contain it, is never mistaken for the cookie;
        values containing ``=`` are returned whole.
        """
        cookie_header = env.get('HTTP_COOKIE')
        if not cookie_header:
            return None

        for pair in cookie_header.split(';'):
            name, sep, value = pair.partition('=')
            if sep and name.strip() == cookie_name:
                return value.strip()

        return None

View File

@ -46,7 +46,7 @@
{'body': ['Test', 'Another'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '404', headers = [('Content-Type', 'text/plain')])}
>>> WbResponse.redir_response('http://example.com/otherfile')
{'body': [], 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [('Location', 'http://example.com/otherfile')])}
{'body': [], 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [('Location', 'http://example.com/otherfile'), ('Content-Length', '0')])}
"""

View File

@ -125,7 +125,7 @@ class WbRequest(object):
if not self.wb_url:
return
mime = self.env.get('CONTENT_TYPE')
mime = self.env.get('CONTENT_TYPE').split(';')[0]
length = self.env.get('CONTENT_LENGTH')
stream = self.env['wsgi.input']
@ -167,9 +167,12 @@ class WbResponse(object):
return WbResponse(status_headers, value=[text])
@staticmethod
def redir_response(location, status='302 Redirect'):
return WbResponse(StatusAndHeaders(status,
[('Location', location)]))
def redir_response(location, status='302 Redirect', headers=None):
    """Build a redirect response to ``location`` with an empty body.

    ``Location`` and ``Content-Length: 0`` are always sent; any extra
    ``headers`` (a list of ``(name, value)`` pairs) are appended after them.
    """
    redir_headers = [('Location', location), ('Content-Length', '0')]
    redir_headers.extend(headers or [])
    return WbResponse(StatusAndHeaders(status, redir_headers))
def __call__(self, env, start_response):

View File

@ -64,7 +64,7 @@ class WSGIApp(object):
env['pywb.proxy_statusline'] = statusline
ssl_sock.write('HTTP/1.0 ' + statusline + '\r\n')
ssl_sock.write('HTTP/1.1 ' + statusline + '\r\n')
for name, value in headers:
ssl_sock.write(name + ': ' + value + '\r\n')

View File

@ -215,13 +215,17 @@ def create_wb_router(passed_config={}):
if hasattr(route.handler, 'resolve_refs'):
route.handler.resolve_refs(handler_dict)
# Check for new proxy mode!
if config.get('enable_http_proxy', False):
router = ProxyArchivalRouter
else:
router = ArchivalRouter
if config.get('proxy_select_html'):
temp = J2TemplateView.create_template(config.get('proxy_select_html'),
'Proxy Coll Selector')
config.get('proxy_options')['proxy_select_view'] = temp
# Finally, create wb router
return router(
routes,