From 739f23da9e2ab0beb865c0284eec8577f00c93b2 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 26 Jul 2014 09:48:44 -0700 Subject: [PATCH 01/26] https proxy support, CONNECT verb handling (uwsgi only) --- README.rst | 2 +- pywb/framework/proxy.py | 72 ++++++++++++++++++++++++++++++++- pywb/framework/wsgi_wrappers.py | 33 +++++++++++++++ setup.py | 2 +- 4 files changed, 106 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index 9c4b380d..6aa256ac 100644 --- a/README.rst +++ b/README.rst @@ -1,4 +1,4 @@ -PyWb 0.5.1 +PyWb 0.5.2 ========== .. image:: https://travis-ci.org/ikreymer/pywb.png?branch=develop diff --git a/pywb/framework/proxy.py b/pywb/framework/proxy.py index 62bc06b0..6754ecd7 100644 --- a/pywb/framework/proxy.py +++ b/pywb/framework/proxy.py @@ -62,12 +62,16 @@ class ProxyRouter(object): self.unaltered = proxy_options.get('unaltered_replay', False) def __call__(self, env): + if env['REQUEST_METHOD'] == 'CONNECT': + if not self.handle_connect(env): + return None + url = env['REL_REQUEST_URI'] if url.endswith('/proxy.pac'): return self.make_pac_response(env) - if not url.startswith('http://'): + if not url.startswith(('http://', 'https://')): return None proxy_auth = env.get('HTTP_PROXY_AUTHORIZATION') @@ -122,6 +126,72 @@ class ProxyRouter(object): return route.handler(wbrequest) + def handle_connect(self, env): + import uwsgi + import socket + import ssl + from io import BytesIO + + fd = uwsgi.connection_fd() + conn = socket.fromfd(fd, socket.AF_INET, socket.SOCK_STREAM) + sock = socket.socket(_sock=conn) + + if (self.use_default_coll or + len(self.routes) == 1 or + env.get('HTTP_PROXY_AUTHORIZATION') is not None): + + sock.send('HTTP/1.0 200 Connection Established\r\n') + sock.send('Server: pywb proxy\r\n') + sock.send('\r\n') + else: + env['pywb.proxy_statusline'] = '407 Proxy Auth Required' + sock.send('HTTP/1.0 407 Proxy Auth Required\r\n') + sock.send('Server: pywb proxy\r\n') + sock.send('\r\n') + return False + + ssl_sock = 
ssl.wrap_socket(sock, server_side=True, + certfile='/tmp/testcert.pem', + ssl_version=ssl.PROTOCOL_SSLv23) + + env['pywb.proxy_ssl_sock'] = ssl_sock + + buff = ssl_sock.recv(4096) + + buffreader = BytesIO(buff) + + statusline = buffreader.readline() + statusparts = statusline.split(' ') + + if len(statusparts) < 3: + return + + env['REQUEST_METHOD'] = statusparts[0] + env['REL_REQUEST_URI'] = ('https://' + + env['REL_REQUEST_URI'].replace(':443', '') + + statusparts[1]) + + env['SERVER_PROTOCOL'] = statusparts[2].strip() + + queryparts = env['REL_REQUEST_URI'].split('?', 1) + env['PATH_INFO'] = queryparts[0] + env['QUERY_STRING'] = queryparts[1] if len(queryparts) > 1 else '' + + while True: + line = buffreader.readline() + if not line: + break + + parts = line.split(':') + if len(parts) < 2: + continue + + name = 'HTTP_' + parts[0].replace('-', '_').upper() + env[name] = parts[1] + + return True + + # Proxy Auto-Config (PAC) script for the proxy def make_pac_response(self, env): import os diff --git a/pywb/framework/wsgi_wrappers.py b/pywb/framework/wsgi_wrappers.py index 3729a660..b40b5678 100644 --- a/pywb/framework/wsgi_wrappers.py +++ b/pywb/framework/wsgi_wrappers.py @@ -50,6 +50,39 @@ class WSGIApp(object): # Top-level wsgi application def __call__(self, env, start_response): + if env['REQUEST_METHOD'] == 'CONNECT': + return self.handle_connect(env, start_response) + else: + return self.handle_methods(env, start_response) + + def handle_connect(self, env, start_response): + def ssl_start_response(statusline, headers): + ssl_sock = env.get('pywb.proxy_ssl_sock') + if not ssl_sock: + return + + env['pywb.proxy_statusline'] = statusline + + ssl_sock.write('HTTP/1.1 ' + statusline + '\r\n') + for name, value in headers: + ssl_sock.write(name + ': ' + value + '\r\n') + + resp_iter = self.handle_methods(env, ssl_start_response) + + ssl_sock = env.get('pywb.proxy_ssl_sock') + if ssl_sock: + ssl_sock.write('\r\n') + + for obj in resp_iter: + ssl_sock.write(obj) + + 
ssl_sock.close() + + start_response(env['pywb.proxy_statusline'], []) + + return [] + + def handle_methods(self, env, start_response): if env.get('SCRIPT_NAME') or not env.get('REQUEST_URI'): env['REL_REQUEST_URI'] = rel_request_uri(env) else: diff --git a/setup.py b/setup.py index 3e89abed..a6e9c885 100755 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ class PyTest(TestCommand): setup( name='pywb', - version='0.5.1', + version='0.5.2', url='https://github.com/ikreymer/pywb', author='Ilya Kreymer', author_email='ikreymer@gmail.com', From eca3cf5fbf096b123c760d196f9c3067bce49eb0 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 26 Jul 2014 13:24:53 -0700 Subject: [PATCH 02/26] https proxy: add ca generator! support uwsgi, gunicorn and ref better handling of 407, other error responses in response to CONNECT --- config.yaml | 3 ++ pywb/framework/certa.py | 87 ++++++++++++++++++++++++++++++ pywb/framework/proxy.py | 95 ++++++++++++++++++++------------- pywb/framework/wsgi_wrappers.py | 19 ++++--- 4 files changed, 160 insertions(+), 44 deletions(-) create mode 100644 pywb/framework/certa.py diff --git a/config.yaml b/config.yaml index 937b4545..fc2290ba 100644 --- a/config.yaml +++ b/config.yaml @@ -109,3 +109,6 @@ enable_memento: true # Replay content in an iframe framed_replay: true + +debug_echo_env: True + diff --git a/pywb/framework/certa.py b/pywb/framework/certa.py new file mode 100644 index 00000000..844ad497 --- /dev/null +++ b/pywb/framework/certa.py @@ -0,0 +1,87 @@ +import logging +import os +import OpenSSL +import random + + +class CertificateAuthority(object): + logger = logging.getLogger('pywb.CertificateAuthority') + + def __init__(self, ca_file='pywb-ca.pem', certs_dir='./pywb-ca'): + self.ca_file = ca_file + self.certs_dir = certs_dir + + if not os.path.exists(ca_file): + self._generate_ca() + else: + self._read_ca(ca_file) + + if not os.path.exists(certs_dir): + self.logger.info("directory for generated certs {} doesn't exist, creating 
it".format(certs_dir)) + os.mkdir(certs_dir) + + + def _generate_ca(self): + # Generate key + self.key = OpenSSL.crypto.PKey() + self.key.generate_key(OpenSSL.crypto.TYPE_RSA, 2048) + + # Generate certificate + self.cert = OpenSSL.crypto.X509() + self.cert.set_version(3) + # avoid sec_error_reused_issuer_and_serial + self.cert.set_serial_number(random.randint(0,2**64-1)) + self.cert.get_subject().CN = 'pywb CA on {}'.format('') + self.cert.gmtime_adj_notBefore(0) # now + self.cert.gmtime_adj_notAfter(100*365*24*60*60) # 100 yrs in future + self.cert.set_issuer(self.cert.get_subject()) + self.cert.set_pubkey(self.key) + self.cert.add_extensions([ + OpenSSL.crypto.X509Extension(b"basicConstraints", True, b"CA:TRUE, pathlen:0"), + OpenSSL.crypto.X509Extension(b"keyUsage", True, b"keyCertSign, cRLSign"), + OpenSSL.crypto.X509Extension(b"subjectKeyIdentifier", False, b"hash", subject=self.cert), + ]) + self.cert.sign(self.key, "sha1") + + with open(self.ca_file, 'wb+') as f: + f.write(OpenSSL.crypto.dump_privatekey(OpenSSL.SSL.FILETYPE_PEM, self.key)) + f.write(OpenSSL.crypto.dump_certificate(OpenSSL.SSL.FILETYPE_PEM, self.cert)) + + self.logger.info('generated CA key+cert and wrote to {}'.format(self.ca_file)) + + + def _read_ca(self, filename): + self.cert = OpenSSL.crypto.load_certificate(OpenSSL.SSL.FILETYPE_PEM, open(filename).read()) + self.key = OpenSSL.crypto.load_privatekey(OpenSSL.SSL.FILETYPE_PEM, open(filename).read()) + self.logger.info('read CA key+cert from {}'.format(self.ca_file)) + + def __getitem__(self, cn): + cnp = os.path.sep.join([self.certs_dir, '%s.pem' % cn]) + if not os.path.exists(cnp): + # create certificate + key = OpenSSL.crypto.PKey() + key.generate_key(OpenSSL.crypto.TYPE_RSA, 2048) + + # Generate CSR + req = OpenSSL.crypto.X509Req() + req.get_subject().CN = cn + req.set_pubkey(key) + req.sign(key, 'sha1') + + # Sign CSR + cert = OpenSSL.crypto.X509() + cert.set_subject(req.get_subject()) + 
cert.set_serial_number(random.randint(0,2**64-1)) + cert.gmtime_adj_notBefore(0) + cert.gmtime_adj_notAfter(10*365*24*60*60) + cert.set_issuer(self.cert.get_subject()) + cert.set_pubkey(req.get_pubkey()) + cert.sign(self.key, 'sha1') + + with open(cnp, 'wb+') as f: + f.write(OpenSSL.crypto.dump_privatekey(OpenSSL.SSL.FILETYPE_PEM, key)) + f.write(OpenSSL.crypto.dump_certificate(OpenSSL.SSL.FILETYPE_PEM, cert)) + + self.logger.info('wrote generated key+cert to {}'.format(cnp)) + + return cnp diff --git a/pywb/framework/proxy.py b/pywb/framework/proxy.py index 6754ecd7..202e4f3b 100644 --- a/pywb/framework/proxy.py +++ b/pywb/framework/proxy.py @@ -4,8 +4,15 @@ from archivalrouter import ArchivalRouter import urlparse import base64 +import socket +import ssl +from io import BytesIO + from pywb.rewrite.url_rewriter import HttpsUrlRewriter from pywb.utils.statusandheaders import StatusAndHeaders +from pywb.utils.wbexception import BadRequestException + +from certa import CertificateAuthority #================================================================= @@ -61,19 +68,21 @@ class ProxyRouter(object): self.unaltered = proxy_options.get('unaltered_replay', False) + self.ca = CertificateAuthority() + + def __call__(self, env): - if env['REQUEST_METHOD'] == 'CONNECT': - if not self.handle_connect(env): + is_https = (env['REQUEST_METHOD'] == 'CONNECT') + + if not is_https: + url = env['REL_REQUEST_URI'] + + if url.endswith('/proxy.pac'): + return self.make_pac_response(env) + + if not url.startswith(('http://', 'https://')): return None - url = env['REL_REQUEST_URI'] - - if url.endswith('/proxy.pac'): - return self.make_pac_response(env) - - if not url.startswith(('http://', 'https://')): - return None - proxy_auth = env.get('HTTP_PROXY_AUTHORIZATION') route = None @@ -108,6 +117,12 @@ class ProxyRouter(object): else: return self.proxy_auth_coll_response() + # do connect, then get updated url + if is_https: + self.handle_connect(env) + + url = env['REL_REQUEST_URI'] + 
wbrequest = route.request_class(env, request_uri=url, wb_url_str=url, @@ -126,36 +141,41 @@ class ProxyRouter(object): return route.handler(wbrequest) - def handle_connect(self, env): - import uwsgi - import socket - import ssl - from io import BytesIO - - fd = uwsgi.connection_fd() - conn = socket.fromfd(fd, socket.AF_INET, socket.SOCK_STREAM) - sock = socket.socket(_sock=conn) - - if (self.use_default_coll or - len(self.routes) == 1 or - env.get('HTTP_PROXY_AUTHORIZATION') is not None): - - sock.send('HTTP/1.0 200 Connection Established\r\n') - sock.send('Server: pywb proxy\r\n') - sock.send('\r\n') + def get_request_socket(self, env): + if env.get('uwsgi.version'): + import uwsgi + fd = uwsgi.connection_fd() + conn = socket.fromfd(fd, socket.AF_INET, socket.SOCK_STREAM) + sock = socket.socket(_sock=conn) + elif env.get('gunicorn.socket'): + sock = env['gunicorn.socket'] else: - env['pywb.proxy_statusline'] = '407 Proxy Auth Required' - sock.send('HTTP/1.0 407 Proxy Auth Required\r\n') - sock.send('Server: pywb proxy\r\n') - sock.send('\r\n') - return False + # attempt to find socket from wsgi.input + input_ = env.get('wsgi.input') + if input_ and hasattr(input_, '_sock'): + sock = socket.socket(_sock=input_._sock) + + return sock + + def handle_connect(self, env): + sock = self.get_request_socket(env) + if not sock: + return WbResponse.text_response('HTTPS Proxy Not Supported', + '405 HTTPS Proxy Not Supported') + + sock.send('HTTP/1.0 200 Connection Established\r\n') + sock.send('Server: pywb proxy\r\n') + sock.send('\r\n') + + hostname = env['REL_REQUEST_URI'].split(':')[0] ssl_sock = ssl.wrap_socket(sock, server_side=True, - certfile='/tmp/testcert.pem', - ssl_version=ssl.PROTOCOL_SSLv23) + certfile=self.ca[hostname]) + #ssl_version=ssl.PROTOCOL_SSLv23) env['pywb.proxy_ssl_sock'] = ssl_sock + #todo: better reading of all headers buff = ssl_sock.recv(4096) buffreader = BytesIO(buff) @@ -164,7 +184,7 @@ class ProxyRouter(object): statusparts = 
statusline.split(' ') if len(statusparts) < 3: - return + raise BadRequestException('Invalid Proxy Request') env['REQUEST_METHOD'] = statusparts[0] env['REL_REQUEST_URI'] = ('https://' + @@ -177,6 +197,8 @@ class ProxyRouter(object): env['PATH_INFO'] = queryparts[0] env['QUERY_STRING'] = queryparts[1] if len(queryparts) > 1 else '' + env['wsgi.input'] = socket._fileobject(ssl_sock, mode='r') + while True: line = buffreader.readline() if not line: @@ -189,9 +211,6 @@ class ProxyRouter(object): name = 'HTTP_' + parts[0].replace('-', '_').upper() env[name] = parts[1] - return True - - # Proxy Auto-Config (PAC) script for the proxy def make_pac_response(self, env): import os diff --git a/pywb/framework/wsgi_wrappers.py b/pywb/framework/wsgi_wrappers.py index b40b5678..1e1100e4 100644 --- a/pywb/framework/wsgi_wrappers.py +++ b/pywb/framework/wsgi_wrappers.py @@ -59,24 +59,27 @@ class WSGIApp(object): def ssl_start_response(statusline, headers): ssl_sock = env.get('pywb.proxy_ssl_sock') if not ssl_sock: + start_response(statusline, headers) return env['pywb.proxy_statusline'] = statusline - ssl_sock.write('HTTP/1.1 ' + statusline + '\r\n') + ssl_sock.write('HTTP/1.0 ' + statusline + '\r\n') for name, value in headers: ssl_sock.write(name + ': ' + value + '\r\n') resp_iter = self.handle_methods(env, ssl_start_response) ssl_sock = env.get('pywb.proxy_ssl_sock') - if ssl_sock: - ssl_sock.write('\r\n') + if not ssl_sock: + return resp_iter - for obj in resp_iter: - ssl_sock.write(obj) + ssl_sock.write('\r\n') - ssl_sock.close() + for obj in resp_iter: + ssl_sock.write(obj) + + ssl_sock.close() start_response(env['pywb.proxy_statusline'], []) @@ -178,6 +181,10 @@ def init_app(init_func, load_yaml=True, config_file=None, config={}): def start_wsgi_server(the_app, name, default_port=None): # pragma: no cover from wsgiref.simple_server import make_server + # disable is_hop_by_hop restrictions + import wsgiref.handlers + wsgiref.handlers.is_hop_by_hop = lambda x: False + port = 
the_app.port if not port: From 1464e89c419c88a8986b8d351e359273912b5c8f Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 26 Jul 2014 14:24:28 -0700 Subject: [PATCH 03/26] wbresponse: always include Content-Length for text_response --- pywb/framework/certa.py | 7 +++++-- pywb/framework/wbrequestresponse.py | 3 ++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pywb/framework/certa.py b/pywb/framework/certa.py index 844ad497..d957dbad 100644 --- a/pywb/framework/certa.py +++ b/pywb/framework/certa.py @@ -7,7 +7,10 @@ import random class CertificateAuthority(object): logger = logging.getLogger('pywb.CertificateAuthority') - def __init__(self, ca_file='pywb-ca.pem', certs_dir='./pywb-ca'): + def __init__(self, ca_file='pywb-ca.pem', + certs_dir='./pywb-ca', + certname='pywb CA'): + self.ca_file = ca_file self.certs_dir = certs_dir @@ -31,7 +34,7 @@ class CertificateAuthority(object): self.cert.set_version(3) # avoid sec_error_reused_issuer_and_serial self.cert.set_serial_number(random.randint(0,2**64-1)) - self.cert.get_subject().CN = 'pywb CA on {}'.format('') + self.cert.get_subject().CN = certname self.cert.gmtime_adj_notBefore(0) # now self.cert.gmtime_adj_notAfter(100*365*24*60*60) # 100 yrs in future self.cert.set_issuer(self.cert.get_subject()) diff --git a/pywb/framework/wbrequestresponse.py b/pywb/framework/wbrequestresponse.py index 0f1a9f32..f2c63f9c 100644 --- a/pywb/framework/wbrequestresponse.py +++ b/pywb/framework/wbrequestresponse.py @@ -161,7 +161,8 @@ class WbResponse(object): @staticmethod def text_response(text, status='200 OK', content_type='text/plain'): status_headers = StatusAndHeaders(status, - [('Content-Type', content_type)]) + [('Content-Type', content_type), + ('Content-Length', str(len(text)))]) return WbResponse(status_headers, value=[text]) From 5beb831ae9c76727f6a67ae0078f250288e35448 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 26 Jul 2014 14:27:31 -0700 Subject: [PATCH 04/26] wbrequestresponse: update 
doctest --- pywb/framework/test/test_wbrequestresponse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pywb/framework/test/test_wbrequestresponse.py b/pywb/framework/test/test_wbrequestresponse.py index e066d4d1..65940e4a 100644 --- a/pywb/framework/test/test_wbrequestresponse.py +++ b/pywb/framework/test/test_wbrequestresponse.py @@ -40,7 +40,7 @@ # WbResponse Tests # ================= >>> WbResponse.text_response('Test') -{'body': ['Test'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [('Content-Type', 'text/plain')])} +{'body': ['Test'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [('Content-Type', 'text/plain'), ('Content-Length', '4')])} >>> WbResponse.text_stream(['Test', 'Another'], '404') {'body': ['Test', 'Another'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '404', headers = [('Content-Type', 'text/plain')])} From e58a63a9feaba6f54fb98d5c5960c8a50b7546e2 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 26 Jul 2014 14:35:52 -0700 Subject: [PATCH 05/26] setup: add openssl as a req --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index a6e9c885..45349981 100755 --- a/setup.py +++ b/setup.py @@ -70,6 +70,7 @@ setup( 'jinja2', 'surt', 'pyyaml', + 'pyopenssl', ], tests_require=[ 'pytest', From ae35d92dded125bd95813fd48b87366e4ff66a2d Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 26 Jul 2014 15:27:02 -0700 Subject: [PATCH 06/26] fix typo in certauth --- pywb/framework/certa.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pywb/framework/certa.py b/pywb/framework/certa.py index d957dbad..21cf8770 100644 --- a/pywb/framework/certa.py +++ b/pywb/framework/certa.py @@ -13,6 +13,7 @@ class CertificateAuthority(object): self.ca_file = ca_file self.certs_dir = certs_dir + self.certname = certname if not os.path.exists(ca_file): self._generate_ca() @@ -34,7 +35,7 @@ class 
CertificateAuthority(object): self.cert.set_version(3) # avoid sec_error_reused_issuer_and_serial self.cert.set_serial_number(random.randint(0,2**64-1)) - self.cert.get_subject().CN = certname + self.cert.get_subject().CN = self.certname self.cert.gmtime_adj_notBefore(0) # now self.cert.gmtime_adj_notAfter(100*365*24*60*60) # 100 yrs in future self.cert.set_issuer(self.cert.get_subject()) From 2a9197137e94b532a8a00aa8de269178bbb22cb9 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 26 Jul 2014 21:06:28 -0700 Subject: [PATCH 07/26] certauth: some cleanup for pep8, 2.6 compat --- pywb/framework/certa.py | 44 +++++++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/pywb/framework/certa.py b/pywb/framework/certa.py index 21cf8770..b7b1e5bf 100644 --- a/pywb/framework/certa.py +++ b/pywb/framework/certa.py @@ -4,6 +4,7 @@ import OpenSSL import random +#================================================================= class CertificateAuthority(object): logger = logging.getLogger('pywb.CertificateAuthority') @@ -21,7 +22,6 @@ class CertificateAuthority(object): self._read_ca(ca_file) if not os.path.exists(certs_dir): - self.logger.info("directory for generated certs {} doesn't exist, creating it".format(certs_dir)) os.mkdir(certs_dir) @@ -41,23 +41,37 @@ class CertificateAuthority(object): self.cert.set_issuer(self.cert.get_subject()) self.cert.set_pubkey(self.key) self.cert.add_extensions([ - OpenSSL.crypto.X509Extension(b"basicConstraints", True, b"CA:TRUE, pathlen:0"), - OpenSSL.crypto.X509Extension(b"keyUsage", True, b"keyCertSign, cRLSign"), - OpenSSL.crypto.X509Extension(b"subjectKeyIdentifier", False, b"hash", subject=self.cert), + OpenSSL.crypto.X509Extension(b"basicConstraints", + True, + b"CA:TRUE, pathlen:0"), + + OpenSSL.crypto.X509Extension(b"keyUsage", + True, + b"keyCertSign, cRLSign"), + + OpenSSL.crypto.X509Extension(b"subjectKeyIdentifier", + False, + b"hash", + subject=self.cert), ]) 
self.cert.sign(self.key, "sha1") with open(self.ca_file, 'wb+') as f: - f.write(OpenSSL.crypto.dump_privatekey(OpenSSL.SSL.FILETYPE_PEM, self.key)) - f.write(OpenSSL.crypto.dump_certificate(OpenSSL.SSL.FILETYPE_PEM, self.cert)) - - self.logger.info('generated CA key+cert and wrote to {}'.format(self.ca_file)) + f.write(OpenSSL.crypto.dump_privatekey(OpenSSL.SSL.FILETYPE_PEM, + self.key)) + f.write(OpenSSL.crypto.dump_certificate(OpenSSL.SSL.FILETYPE_PEM, + self.cert)) def _read_ca(self, filename): - self.cert = OpenSSL.crypto.load_certificate(OpenSSL.SSL.FILETYPE_PEM, open(filename).read()) - self.key = OpenSSL.crypto.load_privatekey(OpenSSL.SSL.FILETYPE_PEM, open(filename).read()) - self.logger.info('read CA key+cert from {}'.format(self.ca_file)) + with open(filename) as cert_fh: + self.cert = OpenSSL.crypto.load_certificate( + OpenSSL.SSL.FILETYPE_PEM, cert_fh.read()) + + cert_fh.seek(0) + + self.key = OpenSSL.crypto.load_privatekey( + OpenSSL.SSL.FILETYPE_PEM, cert_fh.read()) def __getitem__(self, cn): cnp = os.path.sep.join([self.certs_dir, '%s.pem' % cn]) @@ -83,9 +97,9 @@ class CertificateAuthority(object): cert.sign(self.key, 'sha1') with open(cnp, 'wb+') as f: - f.write(OpenSSL.crypto.dump_privatekey(OpenSSL.SSL.FILETYPE_PEM, key)) - f.write(OpenSSL.crypto.dump_certificate(OpenSSL.SSL.FILETYPE_PEM, cert)) - - self.logger.info('wrote generated key+cert to {}'.format(cnp)) + f.write(OpenSSL.crypto.dump_privatekey( + OpenSSL.SSL.FILETYPE_PEM, key)) + f.write(OpenSSL.crypto.dump_certificate( + OpenSSL.SSL.FILETYPE_PEM, cert)) return cnp From b6fb0e510e2582626e0fa5a92167850f987cfd64 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 27 Jul 2014 19:35:16 -0700 Subject: [PATCH 08/26] certauth: clean up CertificatAuthority, add cli interface for creating root cert and host certs CertificateAuthority instance creates per-host certs, assume root cert exists static method generate_ca_root() used to create root cert once add proxy_options to enable https support 
--- pywb/framework/certa.py | 105 ------------------- pywb/framework/certauth.py | 201 +++++++++++++++++++++++++++++++++++++ pywb/framework/proxy.py | 44 ++++++-- 3 files changed, 235 insertions(+), 115 deletions(-) delete mode 100644 pywb/framework/certa.py create mode 100644 pywb/framework/certauth.py diff --git a/pywb/framework/certa.py b/pywb/framework/certa.py deleted file mode 100644 index b7b1e5bf..00000000 --- a/pywb/framework/certa.py +++ /dev/null @@ -1,105 +0,0 @@ -import logging -import os -import OpenSSL -import random - - -#================================================================= -class CertificateAuthority(object): - logger = logging.getLogger('pywb.CertificateAuthority') - - def __init__(self, ca_file='pywb-ca.pem', - certs_dir='./pywb-ca', - certname='pywb CA'): - - self.ca_file = ca_file - self.certs_dir = certs_dir - self.certname = certname - - if not os.path.exists(ca_file): - self._generate_ca() - else: - self._read_ca(ca_file) - - if not os.path.exists(certs_dir): - os.mkdir(certs_dir) - - - def _generate_ca(self): - # Generate key - self.key = OpenSSL.crypto.PKey() - self.key.generate_key(OpenSSL.crypto.TYPE_RSA, 2048) - - # Generate certificate - self.cert = OpenSSL.crypto.X509() - self.cert.set_version(3) - # avoid sec_error_reused_issuer_and_serial - self.cert.set_serial_number(random.randint(0,2**64-1)) - self.cert.get_subject().CN = self.certname - self.cert.gmtime_adj_notBefore(0) # now - self.cert.gmtime_adj_notAfter(100*365*24*60*60) # 100 yrs in future - self.cert.set_issuer(self.cert.get_subject()) - self.cert.set_pubkey(self.key) - self.cert.add_extensions([ - OpenSSL.crypto.X509Extension(b"basicConstraints", - True, - b"CA:TRUE, pathlen:0"), - - OpenSSL.crypto.X509Extension(b"keyUsage", - True, - b"keyCertSign, cRLSign"), - - OpenSSL.crypto.X509Extension(b"subjectKeyIdentifier", - False, - b"hash", - subject=self.cert), - ]) - self.cert.sign(self.key, "sha1") - - with open(self.ca_file, 'wb+') as f: - 
f.write(OpenSSL.crypto.dump_privatekey(OpenSSL.SSL.FILETYPE_PEM, - self.key)) - - f.write(OpenSSL.crypto.dump_certificate(OpenSSL.SSL.FILETYPE_PEM, - self.cert)) - - def _read_ca(self, filename): - with open(filename) as cert_fh: - self.cert = OpenSSL.crypto.load_certificate( - OpenSSL.SSL.FILETYPE_PEM, cert_fh.read()) - - cert_fh.seek(0) - - self.key = OpenSSL.crypto.load_privatekey( - OpenSSL.SSL.FILETYPE_PEM, cert_fh.read()) - - def __getitem__(self, cn): - cnp = os.path.sep.join([self.certs_dir, '%s.pem' % cn]) - if not os.path.exists(cnp): - # create certificate - key = OpenSSL.crypto.PKey() - key.generate_key(OpenSSL.crypto.TYPE_RSA, 2048) - - # Generate CSR - req = OpenSSL.crypto.X509Req() - req.get_subject().CN = cn - req.set_pubkey(key) - req.sign(key, 'sha1') - - # Sign CSR - cert = OpenSSL.crypto.X509() - cert.set_subject(req.get_subject()) - cert.set_serial_number(random.randint(0,2**64-1)) - cert.gmtime_adj_notBefore(0) - cert.gmtime_adj_notAfter(10*365*24*60*60) - cert.set_issuer(self.cert.get_subject()) - cert.set_pubkey(req.get_pubkey()) - cert.sign(self.key, 'sha1') - - with open(cnp, 'wb+') as f: - f.write(OpenSSL.crypto.dump_privatekey( - OpenSSL.SSL.FILETYPE_PEM, key)) - f.write(OpenSSL.crypto.dump_certificate( - OpenSSL.SSL.FILETYPE_PEM, cert)) - - return cnp diff --git a/pywb/framework/certauth.py b/pywb/framework/certauth.py new file mode 100644 index 00000000..0ce7cec3 --- /dev/null +++ b/pywb/framework/certauth.py @@ -0,0 +1,201 @@ +import logging +import os +from OpenSSL import crypto +from OpenSSL.SSL import FILETYPE_PEM +import random +from argparse import ArgumentParser + + +#================================================================= +# Duration of 100 years +CERT_DURATION = 100 * 365 * 24 * 60 * 60 + +CERTS_DIR = './pywb-certs/' + +CERT_NAME = 'pywb https proxy replay CA' + +CERT_CA_FILE = './pywb-ca.pem' + + +#================================================================= +class CertificateAuthority(object): + """ + Utility 
class for signing individual certificate + with a root cert. + + Static generate_ca_root() method for creating the root cert + + All certs saved on filesystem. Individual certs are stored + in specified certs_dir and reused if previously created. + """ + + def __init__(self, ca_file, certs_dir): + if not ca_file: + ca_file = CERT_CA_FILE + + if not certs_dir: + certs_dir = CERTS_DIR + + self.ca_file = ca_file + self.certs_dir = certs_dir + + # read previously created root cert + self.cert, self.key = self.read_pem(ca_file) + + if not os.path.exists(certs_dir): + os.mkdir(certs_dir) + + def get_cert_for_host(self, host, overwrite=False): + host_filename = os.path.sep.join([self.certs_dir, '%s.pem' % host]) + + if not overwrite and os.path.exists(host_filename): + return False, host_filename + + self.generate_host_cert(host, self.cert, self.key, host_filename) + return True, host_filename + + @staticmethod + def _make_cert(certname): + cert = crypto.X509() + cert.set_version(3) + cert.set_serial_number(random.randint(0, 2 ** 64 - 1)) + cert.get_subject().CN = certname + + cert.gmtime_adj_notBefore(0) + cert.gmtime_adj_notAfter(CERT_DURATION) + return cert + + @staticmethod + def generate_ca_root(ca_file, certname=None, overwrite=False): + if not certname: + certname = CERT_NAME + + if not ca_file: + ca_file = CERT_CA_FILE + + if not overwrite and os.path.exists(ca_file): + cert, key = CertificateAuthority.read_pem(ca_file) + return False, cert, key + + # Generate key + key = crypto.PKey() + key.generate_key(crypto.TYPE_RSA, 2048) + + # Generate cert + cert = CertificateAuthority._make_cert(certname) + + cert.set_issuer(cert.get_subject()) + cert.set_pubkey(key) + cert.add_extensions([ + crypto.X509Extension(b"basicConstraints", + True, + b"CA:TRUE, pathlen:0"), + + crypto.X509Extension(b"keyUsage", + True, + b"keyCertSign, cRLSign"), + + crypto.X509Extension(b"subjectKeyIdentifier", + False, + b"hash", + subject=cert), + ]) + cert.sign(key, "sha1") + + # Write cert + 
key + CertificateAuthority.write_pem(ca_file, cert, key) + return True, cert, key + + @staticmethod + def generate_host_cert(host, root_cert, root_key, host_filename): + # Generate key + key = crypto.PKey() + key.generate_key(crypto.TYPE_RSA, 2048) + + # Generate CSR + req = crypto.X509Req() + req.get_subject().CN = host + req.set_pubkey(key) + req.sign(key, 'sha1') + + # Generate Cert + cert = CertificateAuthority._make_cert(host) + + cert.set_issuer(root_cert.get_subject()) + cert.set_pubkey(req.get_pubkey()) + cert.sign(root_key, 'sha1') + + # Write cert + key + CertificateAuthority.write_pem(host_filename, cert, key) + return cert, key + + @staticmethod + def write_pem(filename, cert, key): + with open(filename, 'wb+') as f: + f.write(crypto.dump_privatekey(FILETYPE_PEM, key)) + + f.write(crypto.dump_certificate(FILETYPE_PEM, cert)) + + @staticmethod + def read_pem(filename): + with open(filename, 'r') as f: + cert = crypto.load_certificate(FILETYPE_PEM, f.read()) + f.seek(0) + key = crypto.load_privatekey(FILETYPE_PEM, f.read()) + + return cert, key + + +#================================================================= +def main(): + parser = ArgumentParser(description='Cert Auth Cert Maker') + + parser.add_argument('output_file', help='path to certificate file') + + parser.add_argument('-r', '--use-root', + help='use specified root cert to create signed cert') + + parser.add_argument('-n', '--name', action='store', default=CERT_NAME, + help='name for root certificate') + + parser.add_argument('-d', '--certs-dir', default=CERTS_DIR) + + parser.add_argument('-f', '--force', action='store_true') + + result = parser.parse_args() + + overwrite = result.force + + # Create a new signed certificate using specified root + if result.use_root: + certs_dir = result.certs_dir + ca = CertificateAuthority(ca_file=result.use_root, + certs_dir=result.certs_dir, + certname=result.name) + + created, host_filename = ca.get_cert_for_host(result.output_file, + overwrite) + + if 
created: + print ('Created new cert "' + host_filename + + '" signed by root cert ' + + result.use_root) + else: + print ('Cert "' + host_filename + '" already exists,' + + ' use -f to overwrite') + + # Create new root certificate + else: + created, c, k = (CertificateAuthority. + generate_ca_root(result.output_file, + result.name, + overwrite)) + + if created: + print 'Created new root cert: "' + result.output_file + '"' + else: + print ('Root cert "' + result.output_file + '" already exists,' + + ' use -f to overwrite') + +if __name__ == "__main__": + main() diff --git a/pywb/framework/proxy.py b/pywb/framework/proxy.py index 202e4f3b..fdfb8ac1 100644 --- a/pywb/framework/proxy.py +++ b/pywb/framework/proxy.py @@ -12,7 +12,7 @@ from pywb.rewrite.url_rewriter import HttpsUrlRewriter from pywb.utils.statusandheaders import StatusAndHeaders from pywb.utils.wbexception import BadRequestException -from certa import CertificateAuthority +from certauth import CertificateAuthority #================================================================= @@ -68,8 +68,19 @@ class ProxyRouter(object): self.unaltered = proxy_options.get('unaltered_replay', False) - self.ca = CertificateAuthority() + if proxy_options.get('enable_https_proxy'): + ca_file = proxy_options.get('root_ca_file') + # attempt to create the root_ca_file if doesn't exist + # (generally recommended to create this seperately) + certname = proxy_options.get('root_ca_name') + CertificateAuthority.generate_ca_root(certname, ca_file) + + certs_dir = proxy_options.get('certs_dir') + self.ca = CertificateAuthority(ca_file=ca_file, + certs_dir=certs_dir) + else: + self.ca = None def __call__(self, env): is_https = (env['REQUEST_METHOD'] == 'CONNECT') @@ -119,7 +130,9 @@ class ProxyRouter(object): # do connect, then get updated url if is_https: - self.handle_connect(env) + response = self.handle_connect(env) + if response: + return response url = env['REL_REQUEST_URI'] @@ -142,14 +155,23 @@ class ProxyRouter(object): 
return route.handler(wbrequest) def get_request_socket(self, env): + if not self.ca: + return None + + sock = None + if env.get('uwsgi.version'): - import uwsgi - fd = uwsgi.connection_fd() - conn = socket.fromfd(fd, socket.AF_INET, socket.SOCK_STREAM) - sock = socket.socket(_sock=conn) + try: + import uwsgi + fd = uwsgi.connection_fd() + conn = socket.fromfd(fd, socket.AF_INET, socket.SOCK_STREAM) + sock = socket.socket(_sock=conn) + except Exception: + pass elif env.get('gunicorn.socket'): sock = env['gunicorn.socket'] - else: + + if not sock: # attempt to find socket from wsgi.input input_ = env.get('wsgi.input') if input_ and hasattr(input_, '_sock'): @@ -168,9 +190,11 @@ class ProxyRouter(object): sock.send('\r\n') hostname = env['REL_REQUEST_URI'].split(':')[0] + created, certfile = self.ca.get_cert_for_host(hostname) - ssl_sock = ssl.wrap_socket(sock, server_side=True, - certfile=self.ca[hostname]) + ssl_sock = ssl.wrap_socket(sock, + server_side=True, + certfile=certfile) #ssl_version=ssl.PROTOCOL_SSLv23) env['pywb.proxy_ssl_sock'] = ssl_sock From 6234d795dcd242654362a46a18a410007942b0ee Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 28 Jul 2014 11:52:54 -0700 Subject: [PATCH 09/26] proxy improvements: refactor coll selector into BaseCollSelector, supporting either proxy auth or cookie-based selection (in progress) https proxy: support POST requests, properly read http header and wrap remainder in wsgi.input https proxy: properly update wsgi for wrapped request wbrequestresponse: add content-length 0 to redir_response --- pywb/framework/proxy.py | 296 +++++++++++++++--- pywb/framework/test/test_wbrequestresponse.py | 2 +- pywb/framework/wbrequestresponse.py | 11 +- pywb/framework/wsgi_wrappers.py | 2 +- pywb/webapp/pywb_init.py | 6 +- 5 files changed, 260 insertions(+), 57 deletions(-) diff --git a/pywb/framework/proxy.py b/pywb/framework/proxy.py index fdfb8ac1..386927ca 100644 --- a/pywb/framework/proxy.py +++ b/pywb/framework/proxy.py @@ -12,6 
+12,8 @@ from pywb.rewrite.url_rewriter import HttpsUrlRewriter from pywb.utils.statusandheaders import StatusAndHeaders from pywb.utils.wbexception import BadRequestException +from pywb.utils.bufferedreaders import BufferedReader + from certauth import CertificateAuthority @@ -51,8 +53,10 @@ class ProxyRouter(object): for more details. """ + PAC_PATH = '/proxy.pac' + BLOCK_SIZE = 4096 + def __init__(self, routes, **kwargs): - self.routes = routes self.hostpaths = kwargs.get('hostpaths') self.error_view = kwargs.get('error_view') @@ -61,13 +65,14 @@ class ProxyRouter(object): if proxy_options: proxy_options = proxy_options.get('proxy_options', {}) - self.auth_msg = proxy_options.get('auth_msg', - 'Please enter name of a collection to use for proxy mode') - - self.use_default_coll = proxy_options.get('use_default_coll', True) + self.resolver = ProxyAuthResolver(routes, proxy_options) + #self.resolver = CookieResolver(routes, proxy_options) self.unaltered = proxy_options.get('unaltered_replay', False) + self.proxy_pac_path = proxy_options.get('pac_path', self.PAC_PATH) + + if proxy_options.get('enable_https_proxy'): ca_file = proxy_options.get('root_ca_file') @@ -85,48 +90,23 @@ class ProxyRouter(object): def __call__(self, env): is_https = (env['REQUEST_METHOD'] == 'CONNECT') + # for non-https requests, check pac path and non-proxy urls if not is_https: url = env['REL_REQUEST_URI'] - if url.endswith('/proxy.pac'): + if url == self.proxy_pac_path: return self.make_pac_response(env) if not url.startswith(('http://', 'https://')): return None - proxy_auth = env.get('HTTP_PROXY_AUTHORIZATION') + env['pywb.proxy_scheme'] = 'https' if is_https else 'http' - route = None - coll = None - matcher = None - - if proxy_auth: - proxy_coll = self.read_basic_auth_coll(proxy_auth) - - if not proxy_coll: - return self.proxy_auth_coll_response() - - proxy_coll = '/' + proxy_coll + '/' - - for r in self.routes: - matcher, c = r.is_handling(proxy_coll) - if matcher: - route = r - coll 
= c - break - - if not route: - return self.proxy_auth_coll_response() - - # if 'use_default_coll' or only one collection, use that - # for proxy mode - elif self.use_default_coll or len(self.routes) == 1: - route = self.routes[0] - coll = self.routes[0].regex.pattern - - # otherwise, require proxy auth 407 to select collection - else: - return self.proxy_auth_coll_response() + # check resolver, for pre connect resolve + if self.resolver.pre_connect: + route, coll, matcher, response = self.resolver.resolve(env) + if response: + return response # do connect, then get updated url if is_https: @@ -136,6 +116,12 @@ class ProxyRouter(object): url = env['REL_REQUEST_URI'] + # check resolver, post connect + if not self.resolver.pre_connect: + route, coll, matcher, response = self.resolver.resolve(env) + if response: + return response + wbrequest = route.request_class(env, request_uri=url, wb_url_str=url, @@ -189,20 +175,18 @@ class ProxyRouter(object): sock.send('Server: pywb proxy\r\n') sock.send('\r\n') - hostname = env['REL_REQUEST_URI'].split(':')[0] + hostname, port = env['REL_REQUEST_URI'].split(':') created, certfile = self.ca.get_cert_for_host(hostname) ssl_sock = ssl.wrap_socket(sock, server_side=True, - certfile=certfile) - #ssl_version=ssl.PROTOCOL_SSLv23) + certfile=certfile, + ciphers="ALL", + ssl_version=ssl.PROTOCOL_SSLv23) env['pywb.proxy_ssl_sock'] = ssl_sock - #todo: better reading of all headers - buff = ssl_sock.recv(4096) - - buffreader = BytesIO(buff) + buffreader = BufferedReader(ssl_sock, block_size=self.BLOCK_SIZE) statusline = buffreader.readline() statusparts = statusline.split(' ') @@ -217,23 +201,44 @@ class ProxyRouter(object): env['SERVER_PROTOCOL'] = statusparts[2].strip() + env['SERVER_NAME'] = hostname + env['SERVER_PORT'] = port + queryparts = env['REL_REQUEST_URI'].split('?', 1) env['PATH_INFO'] = queryparts[0] env['QUERY_STRING'] = queryparts[1] if len(queryparts) > 1 else '' - env['wsgi.input'] = socket._fileobject(ssl_sock, mode='r') 
+ env['wsgi.url_scheme'] = 'https' while True: line = buffreader.readline() + if line: + line = line.rstrip() + if not line: break - parts = line.split(':') + parts = line.split(':', 1) if len(parts) < 2: continue - name = 'HTTP_' + parts[0].replace('-', '_').upper() - env[name] = parts[1] + name = parts[0].strip() + value = parts[1].strip() + + name = name.replace('-', '_').upper() + + if not name in ('CONTENT_LENGTH', 'CONTENT_TYPE'): + name = 'HTTP_' + name + + env[name] = value + + remain = buffreader.rem_length() + if remain > 0: + remainder = buffreader.read(self.BLOCK_SIZE) + input_ = socket._fileobject(ssl_sock, mode='r') + env['wsgi.input'] = BufferedReader(input_, + block_size=self.BLOCK_SIZE, + starting_data=remainder) # Proxy Auto-Config (PAC) script for the proxy def make_pac_response(self, env): @@ -263,7 +268,73 @@ class ProxyRouter(object): return WbResponse.text_response(buff, content_type=content_type) - def proxy_auth_coll_response(self): + +#================================================================= +class BaseCollResolver(object): + def __init__(self, routes, config): + self.routes = routes + self.pre_connect = config.get('pre_connect', False) + self.use_default_coll = config.get('use_default_coll', True) + + def resolve(self, env): + route = None + coll = None + matcher = None + + proxy_coll = self.get_proxy_coll(env) + + # invalid parsing + if proxy_coll == '': + return None, None, None, self.select_coll_response(env) + + if proxy_coll is None and isinstance(self.use_default_coll, str): + proxy_coll = self.use_default_coll + + if proxy_coll: + proxy_coll = '/' + proxy_coll + '/' + + for r in self.routes: + matcher, c = r.is_handling(proxy_coll) + if matcher: + route = r + coll = c + break + + # if no match, return coll selection response + if not route: + return None, None, None, self.select_coll_response(env) + + # if 'use_default_coll' + elif self.use_default_coll == True or len(self.routes) == 1: + route = self.routes[0] + coll = 
self.routes[0].path + + # otherwise, return the appropriate coll selection response + else: + return None, None, None, self.select_coll_response(env) + + return route, coll, matcher, None + + +#================================================================= +class ProxyAuthResolver(BaseCollResolver): + DEFAULT_MSG = 'Please enter name of a collection to use with proxy mode' + + def __init__(self, routes, config): + config['pre_connect'] = True + super(ProxyAuthResolver, self).__init__(routes, config) + self.auth_msg = config.get('auth_msg', self.DEFAULT_MSG) + + def get_proxy_coll(self, env): + proxy_auth = env.get('HTTP_PROXY_AUTHORIZATION') + + if not proxy_auth: + return None + + proxy_coll = self.read_basic_auth_coll(proxy_auth) + return proxy_coll + + def select_coll_response(self, env): proxy_msg = 'Basic realm="{0}"'.format(self.auth_msg) headers = [('Content-Type', 'text/plain'), @@ -286,3 +357,128 @@ class ProxyRouter(object): user_pass = base64.b64decode(parts[1]) return user_pass.split(':')[0] + + +#================================================================= +class CookieResolver(BaseCollResolver): + def __init__(self, routes, config): + config['pre_connect'] = False + super(CookieResolver, self).__init__(routes, config) + self.magic_name = config.get('magic_name', 'pywb-proxy.com') + self.cookie_name = config.get('cookie_name', '__pywb_coll') + self.proxy_select_view = config.get('proxy_select_view') + + def get_proxy_coll(self, env): + cookie = self.extract_client_cookie(env, self.cookie_name) + return cookie + + def select_coll_response(self, env): + return self.make_magic_response('auto', + env['REL_REQUEST_URI'], + env) + + def resolve(self, env): + url = env['REL_REQUEST_URI'] + + if ('.' 
+ self.magic_name) in url: + return None, None, None, self.handle_magic_page(url, env) + + return super(CookieResolver, self).resolve(env) + + def handle_magic_page(self, url, env): + parts = urlparse.urlsplit(url) + + path_url = parts.path[1:] + if parts.query: + path_url += '?' + parts.query + + if parts.netloc.startswith('auto'): + coll = self.extract_client_cookie(env, self.cookie_name) + + if coll: + return self.make_sethost_cookie_response(coll, path_url, env) + else: + return self.make_magic_response('select', path_url, env) + + elif '.set.' in parts.netloc: + coll = parts.netloc.split('.', 1)[0] + headers = self.make_cookie_headers(coll, self.magic_name) + + return self.make_sethost_cookie_response(coll, path_url, env, + headers=headers) + + elif '.sethost.' in parts.netloc: + host_parts = parts.netloc.split('.', 1) + coll = host_parts[0] + + inx = parts.netloc.find('.' + self.magic_name + '.') + domain = parts.netloc[inx + len(self.magic_name) + 2:] + + headers = self.make_cookie_headers(coll, domain) + + full_url = env['pywb.proxy_scheme'] + '://' + domain + full_url += '/' + path_url + return WbResponse.redir_response(full_url, headers=headers) + + elif self.proxy_select_view: + route_temp = env['pywb.proxy_scheme'] + '://%s.set.' + route_temp += self.magic_name + '/' + path_url + + return (self.proxy_select_view. + render_response(routes=self.routes, + route_temp=route_temp, + url=path_url)) + else: + return WbResponse.text_response('select text for ' + path_url) + + def make_cookie_headers(self, coll, domain): + cookie_val = '{0}={1}; Path=/; Domain=.{2}; HttpOnly' + cookie_val = cookie_val.format(self.cookie_name, coll, domain) + headers = [('Set-Cookie', cookie_val)] + return headers + + def make_sethost_cookie_response(self, coll, path_url, env, headers=None): + path_parts = urlparse.urlsplit(path_url) + + new_url = path_parts.path[1:] + if path_parts.query: + new_url += '?' 
+ path_parts.query + + return self.make_magic_response(coll + '.sethost', new_url, env, + suffix=path_parts.netloc, + headers=headers) + + + def make_magic_response(self, prefix, url, env, + suffix=None, headers=None): + full_url = env['pywb.proxy_scheme'] + '://' + prefix + '.' + full_url += self.magic_name + if suffix: + full_url += '.' + suffix + full_url += '/' + url + return WbResponse.redir_response(full_url, headers=headers) + + @staticmethod + def extract_client_cookie(env, cookie_name): + cookie_header = env.get('HTTP_COOKIE') + if not cookie_header: + return None + + # attempt to extract cookie_name only + inx = cookie_header.find(cookie_name) + if inx < 0: + return None + + end_inx = cookie_header.find(';', inx) + if end_inx > 0: + value = cookie_header[inx:end_inx] + else: + value = cookie_header[inx:] + + value = value.split('=') + if len(value) < 2: + return None + + value = value[1].strip() + return value + diff --git a/pywb/framework/test/test_wbrequestresponse.py b/pywb/framework/test/test_wbrequestresponse.py index 65940e4a..5bbb65b8 100644 --- a/pywb/framework/test/test_wbrequestresponse.py +++ b/pywb/framework/test/test_wbrequestresponse.py @@ -46,7 +46,7 @@ {'body': ['Test', 'Another'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '404', headers = [('Content-Type', 'text/plain')])} >>> WbResponse.redir_response('http://example.com/otherfile') -{'body': [], 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [('Location', 'http://example.com/otherfile')])} +{'body': [], 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [('Location', 'http://example.com/otherfile'), ('Content-Length', '0')])} """ diff --git a/pywb/framework/wbrequestresponse.py b/pywb/framework/wbrequestresponse.py index f2c63f9c..b17b3575 100644 --- a/pywb/framework/wbrequestresponse.py +++ b/pywb/framework/wbrequestresponse.py @@ -125,7 +125,7 @@ class WbRequest(object): if not 
self.wb_url: return - mime = self.env.get('CONTENT_TYPE') + mime = self.env.get('CONTENT_TYPE').split(';')[0] length = self.env.get('CONTENT_LENGTH') stream = self.env['wsgi.input'] @@ -167,9 +167,12 @@ class WbResponse(object): return WbResponse(status_headers, value=[text]) @staticmethod - def redir_response(location, status='302 Redirect'): - return WbResponse(StatusAndHeaders(status, - [('Location', location)])) + def redir_response(location, status='302 Redirect', headers=None): + redir_headers = [('Location', location), ('Content-Length', '0')] + if headers: + redir_headers += headers + + return WbResponse(StatusAndHeaders(status, redir_headers)) def __call__(self, env, start_response): diff --git a/pywb/framework/wsgi_wrappers.py b/pywb/framework/wsgi_wrappers.py index 1e1100e4..c8e7c86a 100644 --- a/pywb/framework/wsgi_wrappers.py +++ b/pywb/framework/wsgi_wrappers.py @@ -64,7 +64,7 @@ class WSGIApp(object): env['pywb.proxy_statusline'] = statusline - ssl_sock.write('HTTP/1.0 ' + statusline + '\r\n') + ssl_sock.write('HTTP/1.1 ' + statusline + '\r\n') for name, value in headers: ssl_sock.write(name + ': ' + value + '\r\n') diff --git a/pywb/webapp/pywb_init.py b/pywb/webapp/pywb_init.py index 2fd02377..6de8fafa 100644 --- a/pywb/webapp/pywb_init.py +++ b/pywb/webapp/pywb_init.py @@ -215,13 +215,17 @@ def create_wb_router(passed_config={}): if hasattr(route.handler, 'resolve_refs'): route.handler.resolve_refs(handler_dict) - # Check for new proxy mode! 
if config.get('enable_http_proxy', False): router = ProxyArchivalRouter else: router = ArchivalRouter + if config.get('proxy_select_html'): + temp = J2TemplateView.create_template(config.get('proxy_select_html'), + 'Proxy Coll Selector') + config.get('proxy_options')['proxy_select_view'] = temp + # Finally, create wb router return router( routes, From ba61f23e40a7ecdf690a40a2dc3a4345533bc6a1 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 28 Jul 2014 15:22:22 -0700 Subject: [PATCH 10/26] proxy_resolvers: move resolvers to seperate file, default to ProxyAuthResolver (CookieResolver still work-in-progress) --- pywb/framework/proxy.py | 219 +----------------------------- pywb/framework/proxy_resolvers.py | 219 ++++++++++++++++++++++++++++++ 2 files changed, 221 insertions(+), 217 deletions(-) create mode 100644 pywb/framework/proxy_resolvers.py diff --git a/pywb/framework/proxy.py b/pywb/framework/proxy.py index 386927ca..a9cf6a66 100644 --- a/pywb/framework/proxy.py +++ b/pywb/framework/proxy.py @@ -6,16 +6,16 @@ import base64 import socket import ssl -from io import BytesIO from pywb.rewrite.url_rewriter import HttpsUrlRewriter -from pywb.utils.statusandheaders import StatusAndHeaders from pywb.utils.wbexception import BadRequestException from pywb.utils.bufferedreaders import BufferedReader from certauth import CertificateAuthority +from proxy_resolvers import ProxyAuthResolver + #================================================================= class ProxyArchivalRouter(ArchivalRouter): @@ -267,218 +267,3 @@ class ProxyRouter(object): content_type = 'application/x-ns-proxy-autoconfig' return WbResponse.text_response(buff, content_type=content_type) - - -#================================================================= -class BaseCollResolver(object): - def __init__(self, routes, config): - self.routes = routes - self.pre_connect = config.get('pre_connect', False) - self.use_default_coll = config.get('use_default_coll', True) - - def resolve(self, env): - 
route = None - coll = None - matcher = None - - proxy_coll = self.get_proxy_coll(env) - - # invalid parsing - if proxy_coll == '': - return None, None, None, self.select_coll_response(env) - - if proxy_coll is None and isinstance(self.use_default_coll, str): - proxy_coll = self.use_default_coll - - if proxy_coll: - proxy_coll = '/' + proxy_coll + '/' - - for r in self.routes: - matcher, c = r.is_handling(proxy_coll) - if matcher: - route = r - coll = c - break - - # if no match, return coll selection response - if not route: - return None, None, None, self.select_coll_response(env) - - # if 'use_default_coll' - elif self.use_default_coll == True or len(self.routes) == 1: - route = self.routes[0] - coll = self.routes[0].path - - # otherwise, return the appropriate coll selection response - else: - return None, None, None, self.select_coll_response(env) - - return route, coll, matcher, None - - -#================================================================= -class ProxyAuthResolver(BaseCollResolver): - DEFAULT_MSG = 'Please enter name of a collection to use with proxy mode' - - def __init__(self, routes, config): - config['pre_connect'] = True - super(ProxyAuthResolver, self).__init__(routes, config) - self.auth_msg = config.get('auth_msg', self.DEFAULT_MSG) - - def get_proxy_coll(self, env): - proxy_auth = env.get('HTTP_PROXY_AUTHORIZATION') - - if not proxy_auth: - return None - - proxy_coll = self.read_basic_auth_coll(proxy_auth) - return proxy_coll - - def select_coll_response(self, env): - proxy_msg = 'Basic realm="{0}"'.format(self.auth_msg) - - headers = [('Content-Type', 'text/plain'), - ('Proxy-Authenticate', proxy_msg)] - - status_headers = StatusAndHeaders('407 Proxy Authentication', headers) - - value = self.auth_msg - - return WbResponse(status_headers, value=[value]) - - @staticmethod - def read_basic_auth_coll(value): - parts = value.split(' ') - if parts[0].lower() != 'basic': - return '' - - if len(parts) != 2: - return '' - - user_pass = 
base64.b64decode(parts[1]) - return user_pass.split(':')[0] - - -#================================================================= -class CookieResolver(BaseCollResolver): - def __init__(self, routes, config): - config['pre_connect'] = False - super(CookieResolver, self).__init__(routes, config) - self.magic_name = config.get('magic_name', 'pywb-proxy.com') - self.cookie_name = config.get('cookie_name', '__pywb_coll') - self.proxy_select_view = config.get('proxy_select_view') - - def get_proxy_coll(self, env): - cookie = self.extract_client_cookie(env, self.cookie_name) - return cookie - - def select_coll_response(self, env): - return self.make_magic_response('auto', - env['REL_REQUEST_URI'], - env) - - def resolve(self, env): - url = env['REL_REQUEST_URI'] - - if ('.' + self.magic_name) in url: - return None, None, None, self.handle_magic_page(url, env) - - return super(CookieResolver, self).resolve(env) - - def handle_magic_page(self, url, env): - parts = urlparse.urlsplit(url) - - path_url = parts.path[1:] - if parts.query: - path_url += '?' + parts.query - - if parts.netloc.startswith('auto'): - coll = self.extract_client_cookie(env, self.cookie_name) - - if coll: - return self.make_sethost_cookie_response(coll, path_url, env) - else: - return self.make_magic_response('select', path_url, env) - - elif '.set.' in parts.netloc: - coll = parts.netloc.split('.', 1)[0] - headers = self.make_cookie_headers(coll, self.magic_name) - - return self.make_sethost_cookie_response(coll, path_url, env, - headers=headers) - - elif '.sethost.' in parts.netloc: - host_parts = parts.netloc.split('.', 1) - coll = host_parts[0] - - inx = parts.netloc.find('.' 
+ self.magic_name + '.') - domain = parts.netloc[inx + len(self.magic_name) + 2:] - - headers = self.make_cookie_headers(coll, domain) - - full_url = env['pywb.proxy_scheme'] + '://' + domain - full_url += '/' + path_url - return WbResponse.redir_response(full_url, headers=headers) - - elif self.proxy_select_view: - route_temp = env['pywb.proxy_scheme'] + '://%s.set.' - route_temp += self.magic_name + '/' + path_url - - return (self.proxy_select_view. - render_response(routes=self.routes, - route_temp=route_temp, - url=path_url)) - else: - return WbResponse.text_response('select text for ' + path_url) - - def make_cookie_headers(self, coll, domain): - cookie_val = '{0}={1}; Path=/; Domain=.{2}; HttpOnly' - cookie_val = cookie_val.format(self.cookie_name, coll, domain) - headers = [('Set-Cookie', cookie_val)] - return headers - - def make_sethost_cookie_response(self, coll, path_url, env, headers=None): - path_parts = urlparse.urlsplit(path_url) - - new_url = path_parts.path[1:] - if path_parts.query: - new_url += '?' + path_parts.query - - return self.make_magic_response(coll + '.sethost', new_url, env, - suffix=path_parts.netloc, - headers=headers) - - - def make_magic_response(self, prefix, url, env, - suffix=None, headers=None): - full_url = env['pywb.proxy_scheme'] + '://' + prefix + '.' - full_url += self.magic_name - if suffix: - full_url += '.' 
+ suffix - full_url += '/' + url - return WbResponse.redir_response(full_url, headers=headers) - - @staticmethod - def extract_client_cookie(env, cookie_name): - cookie_header = env.get('HTTP_COOKIE') - if not cookie_header: - return None - - # attempt to extract cookie_name only - inx = cookie_header.find(cookie_name) - if inx < 0: - return None - - end_inx = cookie_header.find(';', inx) - if end_inx > 0: - value = cookie_header[inx:end_inx] - else: - value = cookie_header[inx:] - - value = value.split('=') - if len(value) < 2: - return None - - value = value[1].strip() - return value - diff --git a/pywb/framework/proxy_resolvers.py b/pywb/framework/proxy_resolvers.py new file mode 100644 index 00000000..b4bfe840 --- /dev/null +++ b/pywb/framework/proxy_resolvers.py @@ -0,0 +1,219 @@ +from wbrequestresponse import WbResponse, WbRequest +from pywb.utils.statusandheaders import StatusAndHeaders +import urlparse +import base64 + + +#================================================================= +class BaseCollResolver(object): + def __init__(self, routes, config): + self.routes = routes + self.pre_connect = config.get('pre_connect', False) + self.use_default_coll = config.get('use_default_coll', True) + + def resolve(self, env): + route = None + coll = None + matcher = None + + proxy_coll = self.get_proxy_coll(env) + + # invalid parsing + if proxy_coll == '': + return None, None, None, self.select_coll_response(env) + + if proxy_coll is None and isinstance(self.use_default_coll, str): + proxy_coll = self.use_default_coll + + if proxy_coll: + proxy_coll = '/' + proxy_coll + '/' + + for r in self.routes: + matcher, c = r.is_handling(proxy_coll) + if matcher: + route = r + coll = c + break + + # if no match, return coll selection response + if not route: + return None, None, None, self.select_coll_response(env) + + # if 'use_default_coll' + elif self.use_default_coll == True or len(self.routes) == 1: + route = self.routes[0] + coll = self.routes[0].path + + # 
otherwise, return the appropriate coll selection response + else: + return None, None, None, self.select_coll_response(env) + + return route, coll, matcher, None + + +#================================================================= +class ProxyAuthResolver(BaseCollResolver): + DEFAULT_MSG = 'Please enter name of a collection to use with proxy mode' + + def __init__(self, routes, config): + config['pre_connect'] = True + super(ProxyAuthResolver, self).__init__(routes, config) + self.auth_msg = config.get('auth_msg', self.DEFAULT_MSG) + + def get_proxy_coll(self, env): + proxy_auth = env.get('HTTP_PROXY_AUTHORIZATION') + + if not proxy_auth: + return None + + proxy_coll = self.read_basic_auth_coll(proxy_auth) + return proxy_coll + + def select_coll_response(self, env): + proxy_msg = 'Basic realm="{0}"'.format(self.auth_msg) + + headers = [('Content-Type', 'text/plain'), + ('Proxy-Authenticate', proxy_msg)] + + status_headers = StatusAndHeaders('407 Proxy Authentication', headers) + + value = self.auth_msg + + return WbResponse(status_headers, value=[value]) + + @staticmethod + def read_basic_auth_coll(value): + parts = value.split(' ') + if parts[0].lower() != 'basic': + return '' + + if len(parts) != 2: + return '' + + user_pass = base64.b64decode(parts[1]) + return user_pass.split(':')[0] + + +#================================================================= +# Experimental CookieResolver +class CookieResolver(BaseCollResolver): # pragma: no cover + def __init__(self, routes, config): + config['pre_connect'] = False + super(CookieResolver, self).__init__(routes, config) + self.magic_name = config.get('magic_name', 'pywb-proxy.com') + self.cookie_name = config.get('cookie_name', '__pywb_coll') + self.proxy_select_view = config.get('proxy_select_view') + + def get_proxy_coll(self, env): + cookie = self.extract_client_cookie(env, self.cookie_name) + return cookie + + def select_coll_response(self, env): + return self.make_magic_response('auto', + 
env['REL_REQUEST_URI'], + env) + + def resolve(self, env): + url = env['REL_REQUEST_URI'] + + if ('.' + self.magic_name) in url: + return None, None, None, self.handle_magic_page(url, env) + + return super(CookieResolver, self).resolve(env) + + def handle_magic_page(self, url, env): + parts = urlparse.urlsplit(url) + + path_url = parts.path[1:] + if parts.query: + path_url += '?' + parts.query + + if parts.netloc.startswith('auto'): + coll = self.extract_client_cookie(env, self.cookie_name) + + if coll: + return self.make_sethost_cookie_response(coll, path_url, env) + else: + return self.make_magic_response('select', path_url, env) + + elif '.set.' in parts.netloc: + coll = parts.netloc.split('.', 1)[0] + headers = self.make_cookie_headers(coll, self.magic_name) + + return self.make_sethost_cookie_response(coll, path_url, env, + headers=headers) + + elif '.sethost.' in parts.netloc: + host_parts = parts.netloc.split('.', 1) + coll = host_parts[0] + + inx = parts.netloc.find('.' + self.magic_name + '.') + domain = parts.netloc[inx + len(self.magic_name) + 2:] + + headers = self.make_cookie_headers(coll, domain) + + full_url = env['pywb.proxy_scheme'] + '://' + domain + full_url += '/' + path_url + return WbResponse.redir_response(full_url, headers=headers) + + elif self.proxy_select_view: + route_temp = env['pywb.proxy_scheme'] + '://%s.set.' + route_temp += self.magic_name + '/' + path_url + + return (self.proxy_select_view. 
+ render_response(routes=self.routes, + route_temp=route_temp, + url=path_url)) + else: + return WbResponse.text_response('select text for ' + path_url) + + def make_cookie_headers(self, coll, domain): + cookie_val = '{0}={1}; Path=/; Domain=.{2}; HttpOnly' + cookie_val = cookie_val.format(self.cookie_name, coll, domain) + headers = [('Set-Cookie', cookie_val)] + return headers + + def make_sethost_cookie_response(self, coll, path_url, env, headers=None): + path_parts = urlparse.urlsplit(path_url) + + new_url = path_parts.path[1:] + if path_parts.query: + new_url += '?' + path_parts.query + + return self.make_magic_response(coll + '.sethost', new_url, env, + suffix=path_parts.netloc, + headers=headers) + + + def make_magic_response(self, prefix, url, env, + suffix=None, headers=None): + full_url = env['pywb.proxy_scheme'] + '://' + prefix + '.' + full_url += self.magic_name + if suffix: + full_url += '.' + suffix + full_url += '/' + url + return WbResponse.redir_response(full_url, headers=headers) + + @staticmethod + def extract_client_cookie(env, cookie_name): + cookie_header = env.get('HTTP_COOKIE') + if not cookie_header: + return None + + # attempt to extract cookie_name only + inx = cookie_header.find(cookie_name) + if inx < 0: + return None + + end_inx = cookie_header.find(';', inx) + if end_inx > 0: + value = cookie_header[inx:end_inx] + else: + value = cookie_header[inx:] + + value = value.split('=') + if len(value) < 2: + return None + + value = value[1].strip() + return value From 9c960269041e5edebba19cfc9735308e1ed91b9b Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 28 Jul 2014 16:06:01 -0700 Subject: [PATCH 11/26] proxy-cert-auth: add cli hook for 'proxy-cert-auth' for creating root certs, tweak help --- pywb/framework/certauth.py | 13 +++++++------ setup.py | 1 + 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/pywb/framework/certauth.py b/pywb/framework/certauth.py index 0ce7cec3..023754af 100644 --- a/pywb/framework/certauth.py 
+++ b/pywb/framework/certauth.py @@ -150,10 +150,11 @@ class CertificateAuthority(object): def main(): parser = ArgumentParser(description='Cert Auth Cert Maker') - parser.add_argument('output_file', help='path to certificate file') + parser.add_argument('output_pem_file', help='path to cert .pem file') parser.add_argument('-r', '--use-root', - help='use specified root cert to create signed cert') + help=('use specified root cert (.pem file) ' + + 'to create signed cert')) parser.add_argument('-n', '--name', action='store', default=CERT_NAME, help='name for root certificate') @@ -173,7 +174,7 @@ def main(): certs_dir=result.certs_dir, certname=result.name) - created, host_filename = ca.get_cert_for_host(result.output_file, + created, host_filename = ca.get_cert_for_host(result.output_pem_file, overwrite) if created: @@ -187,14 +188,14 @@ def main(): # Create new root certificate else: created, c, k = (CertificateAuthority. - generate_ca_root(result.output_file, + generate_ca_root(result.output_pem_file, result.name, overwrite)) if created: - print 'Created new root cert: "' + result.output_file + '"' + print 'Created new root cert: "' + result.output_pem_file + '"' else: - print ('Root cert "' + result.output_file + '" already exists,' + + print ('Root cert "' + result.output_pem_file + '" already exists,' + ' use -f to overwrite') if __name__ == "__main__": diff --git a/setup.py b/setup.py index 45349981..2881d1e5 100755 --- a/setup.py +++ b/setup.py @@ -87,6 +87,7 @@ setup( cdx-server = pywb.apps.cdx_server:main cdx-indexer = pywb.warc.cdxindexer:main live-rewrite-server = pywb.apps.live_rewrite_server:main + proxy-cert-auth = pywb.framework.certauth:main """, zip_safe=False, classifiers=[ From 607ea1ccf0990266fb96cf136924675a5955e6e9 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 29 Jul 2014 12:23:41 -0700 Subject: [PATCH 12/26] proxy resolver: cookie resolver uses session cookies proxy static handler: handled via proxy to support http/https use 
'pywb.proxy' prefix for custom env settings --- pywb/framework/proxy.py | 42 +++++++++-- pywb/framework/proxy_resolvers.py | 104 ++++++++++++++++++++++------ pywb/framework/wbrequestresponse.py | 10 ++- pywb/framework/wsgi_wrappers.py | 12 ++-- 4 files changed, 132 insertions(+), 36 deletions(-) diff --git a/pywb/framework/proxy.py b/pywb/framework/proxy.py index a9cf6a66..82218e20 100644 --- a/pywb/framework/proxy.py +++ b/pywb/framework/proxy.py @@ -14,7 +14,7 @@ from pywb.utils.bufferedreaders import BufferedReader from certauth import CertificateAuthority -from proxy_resolvers import ProxyAuthResolver +from proxy_resolvers import ProxyAuthResolver, CookieResolver #================================================================= @@ -68,6 +68,8 @@ class ProxyRouter(object): self.resolver = ProxyAuthResolver(routes, proxy_options) #self.resolver = CookieResolver(routes, proxy_options) + self.magic_name = proxy_options.get('magic_name', 'pywb-proxy.com') + self.unaltered = proxy_options.get('unaltered_replay', False) self.proxy_pac_path = proxy_options.get('pac_path', self.PAC_PATH) @@ -100,7 +102,12 @@ class ProxyRouter(object): if not url.startswith(('http://', 'https://')): return None - env['pywb.proxy_scheme'] = 'https' if is_https else 'http' + env['pywb.proxy_scheme'] = 'http' + + route = None + coll = None + matcher = None + response = None # check resolver, for pre connect resolve if self.resolver.pre_connect: @@ -115,6 +122,21 @@ class ProxyRouter(object): return response url = env['REL_REQUEST_URI'] + else: + parts = urlparse.urlsplit(env['REL_REQUEST_URI']) + hostport = parts.netloc.split(':', 1) + env['pywb.proxy_host'] = hostport[0] + env['pywb.proxy_port'] = hostport[1] if len(hostport) == 2 else '' + env['pywb.proxy_req_uri'] = parts.path + if parts.query: + env['pywb.proxy_req_uri'] += '?' + parts.query + + # static + static_prefix = 'static.' 
+ self.magic_name + + if env['pywb.proxy_host'] == static_prefix: + env['REL_REQUEST_URI'] = env['pywb.proxy_req_uri'] + return None # check resolver, post connect if not self.resolver.pre_connect: @@ -122,11 +144,14 @@ class ProxyRouter(object): if response: return response + host_prefix = env['pywb.proxy_scheme'] + '://' + static_prefix + wbrequest = route.request_class(env, request_uri=url, wb_url_str=url, coll=coll, - host_prefix=self.hostpaths[0], + # host_prefix=self.hostpaths[0], + host_prefix=host_prefix, wburl_class=route.handler.get_wburl_type(), urlrewriter_class=HttpsUrlRewriter, use_abs_prefix=False, @@ -136,7 +161,8 @@ class ProxyRouter(object): route.apply_filters(wbrequest, matcher) if self.unaltered: - wbrequest.wb_url.mod = 'id_' + #wbrequest.wb_url.mod = 'id_' + wbrequest.wb_url.mod = 'bn_' return route.handler(wbrequest) @@ -201,14 +227,16 @@ class ProxyRouter(object): env['SERVER_PROTOCOL'] = statusparts[2].strip() - env['SERVER_NAME'] = hostname - env['SERVER_PORT'] = port + env['pywb.proxy_scheme'] = 'https' + + env['pywb.proxy_host'] = hostname + env['pywb.proxy_port'] = port + env['pywb.proxy_req_uri'] = statusparts[1] queryparts = env['REL_REQUEST_URI'].split('?', 1) env['PATH_INFO'] = queryparts[0] env['QUERY_STRING'] = queryparts[1] if len(queryparts) > 1 else '' - env['wsgi.url_scheme'] = 'https' while True: line = buffreader.readline() diff --git a/pywb/framework/proxy_resolvers.py b/pywb/framework/proxy_resolvers.py index b4bfe840..35c84c8a 100644 --- a/pywb/framework/proxy_resolvers.py +++ b/pywb/framework/proxy_resolvers.py @@ -2,6 +2,25 @@ from wbrequestresponse import WbResponse, WbRequest from pywb.utils.statusandheaders import StatusAndHeaders import urlparse import base64 +import os + +try: + import uwsgi + uwsgi_cache = True +except ImportError: + uwsgi_cache = False + + +#================================================================= +class UwsgiCache(object): + def __setitem__(self, item, value): + 
uwsgi.cache_update(item, value) + + def __getitem__(self, item): + return uwsgi.cache_get(item) + + def __contains__(self, item): + return uwsgi.cache_exists(item) #================================================================= @@ -104,9 +123,15 @@ class CookieResolver(BaseCollResolver): # pragma: no cover self.cookie_name = config.get('cookie_name', '__pywb_coll') self.proxy_select_view = config.get('proxy_select_view') + if uwsgi_cache: + print 'UWSGI CACHE' + self.cache = UwsgiCache() + else: + self.cache = {} + def get_proxy_coll(self, env): - cookie = self.extract_client_cookie(env, self.cookie_name) - return cookie + coll, sesh_id = self.get_coll(env) + return coll def select_coll_response(self, env): return self.make_magic_response('auto', @@ -114,14 +139,15 @@ class CookieResolver(BaseCollResolver): # pragma: no cover env) def resolve(self, env): - url = env['REL_REQUEST_URI'] + server_name = env['pywb.proxy_host'] - if ('.' + self.magic_name) in url: - return None, None, None, self.handle_magic_page(url, env) + if ('.' + self.magic_name) in server_name: + return None, None, None, self.handle_magic_page(env) return super(CookieResolver, self).resolve(env) - def handle_magic_page(self, url, env): + def handle_magic_page(self, env): + url = env['REL_REQUEST_URI'] parts = urlparse.urlsplit(url) path_url = parts.path[1:] @@ -129,58 +155,77 @@ class CookieResolver(BaseCollResolver): # pragma: no cover path_url += '?' + parts.query if parts.netloc.startswith('auto'): - coll = self.extract_client_cookie(env, self.cookie_name) + coll, sesh_id = self.get_coll(env) if coll: - return self.make_sethost_cookie_response(coll, path_url, env) + return self.make_sethost_cookie_response(sesh_id, path_url, env) else: return self.make_magic_response('select', path_url, env) elif '.set.' 
in parts.netloc: - coll = parts.netloc.split('.', 1)[0] - headers = self.make_cookie_headers(coll, self.magic_name) + old_sesh_id = self.extract_client_cookie(env, self.cookie_name) + sesh_id = self.create_renew_sesh_id(old_sesh_id) - return self.make_sethost_cookie_response(coll, path_url, env, + if sesh_id != old_sesh_id: + headers = self.make_cookie_headers(sesh_id, self.magic_name) + else: + headers = None + + value, name, _ = parts.netloc.split('.', 2) + + # set sesh value + self.cache[sesh_id] = value + + return self.make_sethost_cookie_response(sesh_id, path_url, env, headers=headers) elif '.sethost.' in parts.netloc: host_parts = parts.netloc.split('.', 1) - coll = host_parts[0] + sesh_id = host_parts[0] inx = parts.netloc.find('.' + self.magic_name + '.') domain = parts.netloc[inx + len(self.magic_name) + 2:] - headers = self.make_cookie_headers(coll, domain) + headers = self.make_cookie_headers(sesh_id, domain) full_url = env['pywb.proxy_scheme'] + '://' + domain full_url += '/' + path_url return WbResponse.redir_response(full_url, headers=headers) - elif self.proxy_select_view: - route_temp = env['pywb.proxy_scheme'] + '://%s.set.' + elif 'select.' in parts.netloc: + if not self.proxy_select_view: + return WbResponse.text_response('select text for ' + path_url) + + coll, sesh_id = self.get_coll(env) + + route_temp = env['pywb.proxy_scheme'] + '://%s.coll.set.' route_temp += self.magic_name + '/' + path_url return (self.proxy_select_view. 
render_response(routes=self.routes, route_temp=route_temp, + coll=coll, url=path_url)) - else: - return WbResponse.text_response('select text for ' + path_url) - def make_cookie_headers(self, coll, domain): + #else: + # msg = 'Invalid Magic Path: ' + url + # print msg + # return WbResponse.text_response(msg, status='404 Not Found') + + def make_cookie_headers(self, sesh_id, domain): cookie_val = '{0}={1}; Path=/; Domain=.{2}; HttpOnly' - cookie_val = cookie_val.format(self.cookie_name, coll, domain) + cookie_val = cookie_val.format(self.cookie_name, sesh_id, domain) headers = [('Set-Cookie', cookie_val)] return headers - def make_sethost_cookie_response(self, coll, path_url, env, headers=None): + def make_sethost_cookie_response(self, sesh_id, path_url, env, headers=None): path_parts = urlparse.urlsplit(path_url) new_url = path_parts.path[1:] if path_parts.query: new_url += '?' + path_parts.query - return self.make_magic_response(coll + '.sethost', new_url, env, + return self.make_magic_response(sesh_id + '.sethost', new_url, env, suffix=path_parts.netloc, headers=headers) @@ -194,6 +239,23 @@ class CookieResolver(BaseCollResolver): # pragma: no cover full_url += '/' + url return WbResponse.redir_response(full_url, headers=headers) + def get_coll(self, env): + sesh_id = self.extract_client_cookie(env, self.cookie_name) + + coll = None + if sesh_id: + coll = self.cache[sesh_id] + + return coll, sesh_id + + def create_renew_sesh_id(self, sesh_id, force=False): + #if sesh_id in self.cache and not force: + if sesh_id and (sesh_id in self.cache) and not force: + return sesh_id + + sesh_id = base64.b32encode(os.urandom(5)).lower() + return sesh_id + @staticmethod def extract_client_cookie(env, cookie_name): cookie_header = env.get('HTTP_COOKIE') diff --git a/pywb/framework/wbrequestresponse.py b/pywb/framework/wbrequestresponse.py index b17b3575..da456474 100644 --- a/pywb/framework/wbrequestresponse.py +++ b/pywb/framework/wbrequestresponse.py @@ -152,9 +152,13 @@ class 
WbResponse(object): pass @staticmethod - def text_stream(stream, status='200 OK', content_type='text/plain'): - status_headers = StatusAndHeaders(status, - [('Content-Type', content_type)]) + def text_stream(stream, status='200 OK', content_type='text/plain', + headers=None): + def_headers = [('Content-Type', content_type)] + if headers: + def_headers += headers + + status_headers = StatusAndHeaders(status, def_headers) return WbResponse(status_headers, value=stream) diff --git a/pywb/framework/wsgi_wrappers.py b/pywb/framework/wsgi_wrappers.py index c8e7c86a..2babc83f 100644 --- a/pywb/framework/wsgi_wrappers.py +++ b/pywb/framework/wsgi_wrappers.py @@ -77,8 +77,8 @@ class WSGIApp(object): ssl_sock.write('\r\n') for obj in resp_iter: - ssl_sock.write(obj) - + if obj: + ssl_sock.write(obj) ssl_sock.close() start_response(env['pywb.proxy_statusline'], []) @@ -125,22 +125,24 @@ class WSGIApp(object): else: err_url = None + err_msg = exc.message.encode('utf-8') + if print_trace: import traceback err_details = traceback.format_exc(exc) print err_details else: - logging.info(str(exc)) + logging.info(err_msg) err_details = None if error_view: return error_view.render_response(exc_type=type(exc).__name__, - err_msg=str(exc), + err_msg=err_msg, err_details=err_details, status=status, err_url=err_url) else: - return WbResponse.text_response(status + ' Error: ' + str(exc), + return WbResponse.text_response(status + ' Error: ' + err_msg, status=status) #================================================================= From 96d9f4dcad9f93d032e8b007d6aa1497dce1e38b Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 30 Jul 2014 10:38:13 -0700 Subject: [PATCH 13/26] proxy mode: cookie based selector using session to coll ui: add proxy_selector html, add switch link to error and banner --- pywb/framework/proxy.py | 4 ++-- pywb/framework/proxy_resolvers.py | 1 - pywb/framework/wsgi_wrappers.py | 3 +++ pywb/static/wb.js | 4 ++++ pywb/ui/error.html | 7 +++++++ 
pywb/ui/head_insert.html | 3 ++- pywb/ui/proxy_select.html | 25 +++++++++++++++++++++++++ pywb/webapp/pywb_init.py | 17 +++++++++++------ 8 files changed, 54 insertions(+), 10 deletions(-) create mode 100644 pywb/ui/proxy_select.html diff --git a/pywb/framework/proxy.py b/pywb/framework/proxy.py index 82218e20..e387bf4b 100644 --- a/pywb/framework/proxy.py +++ b/pywb/framework/proxy.py @@ -65,8 +65,8 @@ class ProxyRouter(object): if proxy_options: proxy_options = proxy_options.get('proxy_options', {}) - self.resolver = ProxyAuthResolver(routes, proxy_options) - #self.resolver = CookieResolver(routes, proxy_options) + #self.resolver = ProxyAuthResolver(routes, proxy_options) + self.resolver = CookieResolver(routes, proxy_options) self.magic_name = proxy_options.get('magic_name', 'pywb-proxy.com') diff --git a/pywb/framework/proxy_resolvers.py b/pywb/framework/proxy_resolvers.py index 35c84c8a..1b33be01 100644 --- a/pywb/framework/proxy_resolvers.py +++ b/pywb/framework/proxy_resolvers.py @@ -124,7 +124,6 @@ class CookieResolver(BaseCollResolver): # pragma: no cover self.proxy_select_view = config.get('proxy_select_view') if uwsgi_cache: - print 'UWSGI CACHE' self.cache = UwsgiCache() else: self.cache = {} diff --git a/pywb/framework/wsgi_wrappers.py b/pywb/framework/wsgi_wrappers.py index 2babc83f..85e23aaa 100644 --- a/pywb/framework/wsgi_wrappers.py +++ b/pywb/framework/wsgi_wrappers.py @@ -135,11 +135,14 @@ class WSGIApp(object): logging.info(err_msg) err_details = None + is_proxy_mode = env.get('pywb.proxy_host') is not None + if error_view: return error_view.render_response(exc_type=type(exc).__name__, err_msg=err_msg, err_details=err_details, status=status, + is_proxy_mode=is_proxy_mode, err_url=err_url) else: return WbResponse.text_response(status + ' Error: ' + err_msg, diff --git a/pywb/static/wb.js b/pywb/static/wb.js index 0244cde8..4a23b03c 100644 --- a/pywb/static/wb.js +++ b/pywb/static/wb.js @@ -69,6 +69,10 @@ function init_banner() { var capture_str 
= (wbinfo ? wbinfo.capture_str : ""); text += "" + capture_str + ""; + + if (wbinfo.is_proxy_mode && wbinfo.url) { + text += '
Switch Collection'; + } banner.innerHTML = text; diff --git a/pywb/ui/error.html b/pywb/ui/error.html index b3a8c478..d7231893 100644 --- a/pywb/ui/error.html +++ b/pywb/ui/error.html @@ -9,3 +9,10 @@

{% endif %} + +{% if is_proxy_mode and err_url and status == '404 Not Found' %} +

+Try Different Collections +

+{% endif %} + diff --git a/pywb/ui/head_insert.html b/pywb/ui/head_insert.html index b1ff4a26..d9e1207b 100644 --- a/pywb/ui/head_insert.html +++ b/pywb/ui/head_insert.html @@ -2,7 +2,7 @@ {% if rule.js_rewrite_location and include_wombat %} From 522ea87637e99b4762ddf941b8d188d685dc9bd7 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 31 Jul 2014 11:12:50 -0700 Subject: [PATCH 16/26] proxy: timestamp selection support! certauth: wildcard support, use *.host wildcard for proxy certs whenever possible ui: add coll info/switch and calendar links to banner --- pywb/framework/certauth.py | 30 ++++++-- pywb/framework/proxy.py | 52 +++++++++----- pywb/framework/proxy_resolvers.py | 116 +++++++++++++++++++++--------- pywb/framework/wsgi_wrappers.py | 6 +- pywb/rewrite/url_rewriter.py | 2 +- pywb/static/wb.js | 10 ++- pywb/ui/error.html | 4 +- pywb/ui/head_insert.html | 3 +- pywb/webapp/pywb_init.py | 2 +- 9 files changed, 160 insertions(+), 65 deletions(-) diff --git a/pywb/framework/certauth.py b/pywb/framework/certauth.py index 023754af..73b0d0e4 100644 --- a/pywb/framework/certauth.py +++ b/pywb/framework/certauth.py @@ -45,13 +45,15 @@ class CertificateAuthority(object): if not os.path.exists(certs_dir): os.mkdir(certs_dir) - def get_cert_for_host(self, host, overwrite=False): - host_filename = os.path.sep.join([self.certs_dir, '%s.pem' % host]) + def get_cert_for_host(self, host, overwrite=False, wildcard=False): + host_filename = os.path.join(self.certs_dir, host) + '.pem' if not overwrite and os.path.exists(host_filename): return False, host_filename - self.generate_host_cert(host, self.cert, self.key, host_filename) + self.generate_host_cert(host, self.cert, self.key, host_filename, + wildcard) + return True, host_filename @staticmethod @@ -107,7 +109,8 @@ class CertificateAuthority(object): return True, cert, key @staticmethod - def generate_host_cert(host, root_cert, root_key, host_filename): + def generate_host_cert(host, root_cert, root_key, 
host_filename, + wildcard=False): # Generate key key = crypto.PKey() key.generate_key(crypto.TYPE_RSA, 2048) @@ -123,6 +126,19 @@ class CertificateAuthority(object): cert.set_issuer(root_cert.get_subject()) cert.set_pubkey(req.get_pubkey()) + + if wildcard: + DNS = 'DNS:' + alt_hosts = [DNS + host, + DNS + '*.' + host] + + alt_hosts = ', '.join(alt_hosts) + + cert.add_extensions([ + crypto.X509Extension('subjectAltName', + False, + alt_hosts)]) + cert.sign(root_key, 'sha1') # Write cert + key @@ -163,6 +179,9 @@ def main(): parser.add_argument('-f', '--force', action='store_true') + parser.add_argument('-w', '--wildcard_cert', action='store_true', + help='add wildcard SAN to host: *., ') + result = parser.parse_args() overwrite = result.force @@ -170,12 +189,13 @@ def main(): # Create a new signed certificate using specified root if result.use_root: certs_dir = result.certs_dir + wildcard = result.wildcard ca = CertificateAuthority(ca_file=result.use_root, certs_dir=result.certs_dir, certname=result.name) created, host_filename = ca.get_cert_for_host(result.output_pem_file, - overwrite) + overwrite, wildcard) if created: print ('Created new cert "' + host_filename + diff --git a/pywb/framework/proxy.py b/pywb/framework/proxy.py index ba6d3266..693e7bd0 100644 --- a/pywb/framework/proxy.py +++ b/pywb/framework/proxy.py @@ -76,7 +76,6 @@ class ProxyRouter(object): else: self.resolver = ProxyAuthResolver(routes, proxy_options) - self.insert_banner = proxy_options.get('banner_only_replay', False) self.unaltered = proxy_options.get('unaltered_replay', False) self.proxy_pac_path = proxy_options.get('pac_path', self.PAC_PATH) @@ -115,10 +114,11 @@ class ProxyRouter(object): coll = None matcher = None response = None + ts = None # check resolver, for pre connect resolve if self.resolver.pre_connect: - route, coll, matcher, response = self.resolver.resolve(env) + route, coll, matcher, response, ts = self.resolver.resolve(env) if response: return response @@ -138,26 +138,36 
@@ class ProxyRouter(object): if parts.query: env['pywb.proxy_req_uri'] += '?' + parts.query - # select prefix - env['pywb_proxy_select'] = 'select.' + self.magic_name + env['pywb_proxy_magic'] = self.magic_name + # route (static) and other resources to archival replay if env['pywb.proxy_host'] == self.magic_name: env['REL_REQUEST_URI'] = env['pywb.proxy_req_uri'] return None # check resolver, post connect if not self.resolver.pre_connect: - route, coll, matcher, response = self.resolver.resolve(env) + route, coll, matcher, ts, response = self.resolver.resolve(env) if response: return response host_prefix = env['pywb.proxy_scheme'] + '://' + self.magic_name + rel_prefix = '' + + # special case for proxy calendar + if (env['pywb.proxy_host'] == 'query.' + self.magic_name): + url = env['pywb.proxy_req_uri'][1:] + rel_prefix = '/' + + if ts is not None: + url = ts + '/' + url wbrequest = route.request_class(env, request_uri=url, wb_url_str=url, coll=coll, host_prefix=host_prefix, + rel_prefix=rel_prefix, wburl_class=route.handler.get_wburl_type(), urlrewriter_class=HttpsUrlRewriter, use_abs_prefix=False, @@ -166,10 +176,10 @@ class ProxyRouter(object): if matcher: route.apply_filters(wbrequest, matcher) - if self.insert_banner: - wbrequest.wb_url.mod = 'bn_' - elif self.unaltered: + if self.unaltered: wbrequest.wb_url.mod = 'id_' + elif is_https: + wbrequest.wb_url.mod = 'bn_' return route.handler(wbrequest) @@ -209,13 +219,23 @@ class ProxyRouter(object): sock.send('\r\n') hostname, port = env['REL_REQUEST_URI'].split(':') - created, certfile = self.ca.get_cert_for_host(hostname) + cert_host = hostname - ssl_sock = ssl.wrap_socket(sock, - server_side=True, - certfile=certfile, - ciphers="ALL", - ssl_version=ssl.PROTOCOL_SSLv23) + host_parts = hostname.split('.', 1) + if len(host_parts) == 2 and '.' 
in host_parts[1]: + cert_host = host_parts[1] + + created, certfile = self.ca.get_cert_for_host(cert_host, + wildcard=True) + + try: + ssl_sock = ssl.wrap_socket(sock, + server_side=True, + certfile=certfile, + ciphers="ALL", + ssl_version=ssl.PROTOCOL_SSLv23) + except Exception as se: + raise BadRequestException(se.message) env['pywb.proxy_ssl_sock'] = ssl_sock @@ -244,7 +264,6 @@ class ProxyRouter(object): env['PATH_INFO'] = queryparts[0] env['QUERY_STRING'] = queryparts[1] if len(queryparts) > 1 else '' - while True: line = buffreader.readline() if line: @@ -270,8 +289,7 @@ class ProxyRouter(object): remain = buffreader.rem_length() if remain > 0: remainder = buffreader.read(self.BLOCK_SIZE) - input_ = socket._fileobject(ssl_sock, mode='r') - env['wsgi.input'] = BufferedReader(input_, + env['wsgi.input'] = BufferedReader(ssl_sock, block_size=self.BLOCK_SIZE, starting_data=remainder) diff --git a/pywb/framework/proxy_resolvers.py b/pywb/framework/proxy_resolvers.py index 9062bafd..8fb65b73 100644 --- a/pywb/framework/proxy_resolvers.py +++ b/pywb/framework/proxy_resolvers.py @@ -1,5 +1,7 @@ from wbrequestresponse import WbResponse, WbRequest from pywb.utils.statusandheaders import StatusAndHeaders +from pywb.rewrite.wburl import WbUrl + import urlparse import base64 import os @@ -22,6 +24,9 @@ class UwsgiCache(object): def __contains__(self, item): return uwsgi.cache_exists(item) + def __delitem__(self, item): + uwsgi.cache_del(item) + #================================================================= class BaseCollResolver(object): @@ -34,12 +39,13 @@ class BaseCollResolver(object): route = None coll = None matcher = None + ts = None - proxy_coll = self.get_proxy_coll(env) + proxy_coll, ts = self.get_proxy_coll_ts(env) # invalid parsing if proxy_coll == '': - return None, None, None, self.select_coll_response(env) + return None, None, None, None, self.select_coll_response(env) if proxy_coll is None and isinstance(self.use_default_coll, str): proxy_coll = 
self.use_default_coll @@ -56,7 +62,7 @@ class BaseCollResolver(object): # if no match, return coll selection response if not route: - return None, None, None, self.select_coll_response(env) + return None, None, None, None, self.select_coll_response(env) # if 'use_default_coll' elif self.use_default_coll == True or len(self.routes) == 1: @@ -65,9 +71,9 @@ class BaseCollResolver(object): # otherwise, return the appropriate coll selection response else: - return None, None, None, self.select_coll_response(env) + return None, None, None, None, self.select_coll_response(env) - return route, coll, matcher, None + return route, coll, matcher, ts, None #================================================================= @@ -79,14 +85,14 @@ class ProxyAuthResolver(BaseCollResolver): super(ProxyAuthResolver, self).__init__(routes, config) self.auth_msg = config.get('auth_msg', self.DEFAULT_MSG) - def get_proxy_coll(self, env): + def get_proxy_coll_ts(self, env): proxy_auth = env.get('HTTP_PROXY_AUTHORIZATION') if not proxy_auth: - return None + return None, None proxy_coll = self.read_basic_auth_coll(proxy_auth) - return proxy_coll + return proxy_coll, None def select_coll_response(self, env): proxy_msg = 'Basic realm="{0}"'.format(self.auth_msg) @@ -120,6 +126,9 @@ class CookieResolver(BaseCollResolver): # pragma: no cover config['pre_connect'] = False super(CookieResolver, self).__init__(routes, config) self.magic_name = config['magic_name'] + self.sethost_prefix = '-sethost.' + self.magic_name + '.' + self.set_prefix = '-set.' 
+ self.magic_name + self.cookie_name = config.get('cookie_name', '__pywb_coll') self.proxy_select_view = config.get('proxy_select_view') @@ -128,9 +137,9 @@ class CookieResolver(BaseCollResolver): # pragma: no cover else: self.cache = {} - def get_proxy_coll(self, env): - coll, sesh_id = self.get_coll(env) - return coll + def get_proxy_coll_ts(self, env): + coll, ts, sesh_id = self.get_coll(env) + return coll, ts def select_coll_response(self, env): return self.make_magic_response('auto', @@ -141,27 +150,44 @@ class CookieResolver(BaseCollResolver): # pragma: no cover server_name = env['pywb.proxy_host'] if ('.' + self.magic_name) in server_name: - return None, None, None, self.handle_magic_page(env) + response = self.handle_magic_page(env) + if response: + return None, None, None, None, response return super(CookieResolver, self).resolve(env) def handle_magic_page(self, env): - url = env['REL_REQUEST_URI'] - parts = urlparse.urlsplit(url) + request_url = env['REL_REQUEST_URI'] + parts = urlparse.urlsplit(request_url) + server_name = env['pywb.proxy_host'] path_url = parts.path[1:] if parts.query: path_url += '?' + parts.query - if parts.netloc.startswith('auto'): - coll, sesh_id = self.get_coll(env) + if server_name.startswith('auto'): + coll, ts, sesh_id = self.get_coll(env) if coll: return self.make_sethost_cookie_response(sesh_id, path_url, env) else: return self.make_magic_response('select', path_url, env) - elif '.set.' 
in parts.netloc: + elif server_name.startswith('query.'): + wb_url = WbUrl(path_url) + + # only dealing with specific timestamp setting + if wb_url.is_query(): + return None + + coll, ts, sesh_id = self.get_coll(env) + if not coll: + return self.make_magic_response('select', path_url, env) + + self.set_ts(sesh_id, wb_url.timestamp) + return self.make_redir_response(wb_url.url) + + elif server_name.endswith(self.set_prefix): old_sesh_id = self.extract_client_cookie(env, self.cookie_name) sesh_id = self.create_renew_sesh_id(old_sesh_id) @@ -170,34 +196,33 @@ class CookieResolver(BaseCollResolver): # pragma: no cover else: headers = None - value, name, _ = parts.netloc.split('.', 2) + coll = server_name[:-len(self.set_prefix)] # set sesh value - self.cache[sesh_id] = value + self.set_coll(sesh_id, coll) return self.make_sethost_cookie_response(sesh_id, path_url, env, headers=headers) - elif '.sethost.' in parts.netloc: - host_parts = parts.netloc.split('.', 1) - sesh_id = host_parts[0] + elif self.sethost_prefix in server_name: + inx = server_name.find(self.sethost_prefix) + sesh_id = server_name[:inx] - inx = parts.netloc.find('.' + self.magic_name + '.') - domain = parts.netloc[inx + len(self.magic_name) + 2:] + domain = server_name[inx + len(self.sethost_prefix):] headers = self.make_cookie_headers(sesh_id, domain) full_url = env['pywb.proxy_scheme'] + '://' + domain full_url += '/' + path_url - return WbResponse.redir_response(full_url, headers=headers) + return self.make_redir_response(full_url, headers=headers) - elif 'select.' in parts.netloc: + elif 'select.' in server_name: if not self.proxy_select_view: return WbResponse.text_response('select text for ' + path_url) - coll, sesh_id = self.get_coll(env) + coll, ts, sesh_id = self.get_coll(env) - route_temp = env['pywb.proxy_scheme'] + '://%s.coll.set.' + route_temp = env['pywb.proxy_scheme'] + '://%s-set.' route_temp += self.magic_name + '/' + path_url return (self.proxy_select_view. 
@@ -217,14 +242,18 @@ class CookieResolver(BaseCollResolver): # pragma: no cover headers = [('Set-Cookie', cookie_val)] return headers - def make_sethost_cookie_response(self, sesh_id, path_url, env, headers=None): + def make_sethost_cookie_response(self, sesh_id, path_url, + env, headers=None): + if '://' not in path_url: + path_url = 'http://' + path_url + path_parts = urlparse.urlsplit(path_url) new_url = path_parts.path[1:] if path_parts.query: new_url += '?' + path_parts.query - return self.make_magic_response(sesh_id + '.sethost', new_url, env, + return self.make_magic_response(sesh_id + '-sethost', new_url, env, suffix=path_parts.netloc, headers=headers) @@ -236,25 +265,44 @@ class CookieResolver(BaseCollResolver): # pragma: no cover if suffix: full_url += '.' + suffix full_url += '/' + url - return WbResponse.redir_response(full_url, headers=headers) + return self.make_redir_response(full_url, headers=headers) + + def set_coll(self, sesh_id, coll): + self.cache[sesh_id + ':c'] = coll + + def set_ts(self, sesh_id, ts): + if ts: + self.cache[sesh_id + ':t'] = ts + # this ensures that omitting timestamp will reset to latest + # capture by deleting the cache entry + else: + del self.cache[sesh_id + ':t'] def get_coll(self, env): sesh_id = self.extract_client_cookie(env, self.cookie_name) coll = None + ts = None if sesh_id: - coll = self.cache[sesh_id] + coll = self.cache[sesh_id + ':c'] + try: + ts = self.cache[sesh_id + ':t'] + except KeyError: + pass - return coll, sesh_id + return coll, ts, sesh_id def create_renew_sesh_id(self, sesh_id, force=False): #if sesh_id in self.cache and not force: - if sesh_id and (sesh_id in self.cache) and not force: + if sesh_id and ((sesh_id + ':c') in self.cache) and not force: return sesh_id sesh_id = base64.b32encode(os.urandom(5)).lower() return sesh_id + def make_redir_response(self, url, headers=None): + return WbResponse.redir_response(url, headers=headers) + @staticmethod def extract_client_cookie(env, cookie_name): 
cookie_header = env.get('HTTP_COOKIE') diff --git a/pywb/framework/wsgi_wrappers.py b/pywb/framework/wsgi_wrappers.py index d1a4f772..3498c819 100644 --- a/pywb/framework/wsgi_wrappers.py +++ b/pywb/framework/wsgi_wrappers.py @@ -125,7 +125,11 @@ class WSGIApp(object): else: err_url = None - err_msg = exc.message.encode('utf-8') + try: + err_msg = exc.message.encode('utf-8') + except Exception: + err_msg = exc.message + err_url = '' if print_trace: import traceback diff --git a/pywb/rewrite/url_rewriter.py b/pywb/rewrite/url_rewriter.py index d5593a22..2679b4dc 100644 --- a/pywb/rewrite/url_rewriter.py +++ b/pywb/rewrite/url_rewriter.py @@ -144,7 +144,7 @@ class HttpsUrlRewriter(object): else: return url - def get_timestamp_url(self, timestamp, url): + def get_timestamp_url(self, timestamp, url=''): return url def get_abs_url(self, url=''): diff --git a/pywb/static/wb.js b/pywb/static/wb.js index fb2c3ac3..f4267b8e 100644 --- a/pywb/static/wb.js +++ b/pywb/static/wb.js @@ -70,9 +70,13 @@ function init_banner() { text += "" + capture_str + ""; - if (wbinfo.proxy_select && wbinfo.url) { - full_url = wbinfo.proxy_select + "/" + wbinfo.url; - text += '
Switch Collection'; + if (wbinfo.proxy_magic && wbinfo.url) { + var select_url = wbinfo.proxy_magic + "/" + wbinfo.url; + var query_url = wbinfo.proxy_magic + "/*/" + wbinfo.url; + text += '
' + text += 'From ' + wbinfo.coll + ' [Switch]'; + text += '  '; + text += 'View All Captures'; } banner.innerHTML = text; diff --git a/pywb/ui/error.html b/pywb/ui/error.html index 6453e987..b122fc38 100644 --- a/pywb/ui/error.html +++ b/pywb/ui/error.html @@ -10,9 +10,9 @@

{% endif %} -{% if env.pywb_proxy_select and err_url and status == '404 Not Found' %} +{% if env.pywb_proxy_magic and err_url and status == '404 Not Found' %}

-Try Different Collections +Try Different Collection

{% endif %} diff --git a/pywb/ui/head_insert.html b/pywb/ui/head_insert.html index 98330da9..f22ef55a 100644 --- a/pywb/ui/head_insert.html +++ b/pywb/ui/head_insert.html @@ -20,7 +20,8 @@ wbinfo.is_frame_mp = {{"true" if wbrequest.wb_url.mod == 'mp_' else "false"}}; wbinfo.canon_url = "{{ canon_url }}"; wbinfo.is_live = {{ "true" if cdx.is_live else "false" }}; - wbinfo.proxy_select = "{{ wbrequest.env.pywb_proxy_select }}"; + wbinfo.coll = "{{ wbrequest.coll }}"; + wbinfo.proxy_magic = "{{ wbrequest.env.pywb_proxy_magic }}"; diff --git a/pywb/webapp/pywb_init.py b/pywb/webapp/pywb_init.py index 7cd62a79..3b3a3cc6 100644 --- a/pywb/webapp/pywb_init.py +++ b/pywb/webapp/pywb_init.py @@ -78,7 +78,7 @@ def create_live_handler(config): #================================================================= def init_route_config(value, config): - if isinstance(value, str): + if isinstance(value, str) or isinstance(value, list): value = dict(index_paths=value) route_config = DictChain(value, config) From cfe11a5ad383052e7224dca689b7e4040656b778 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 31 Jul 2014 11:56:43 -0700 Subject: [PATCH 17/26] fix typo param ordering --- pywb/framework/proxy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pywb/framework/proxy.py b/pywb/framework/proxy.py index 693e7bd0..90ff2fd0 100644 --- a/pywb/framework/proxy.py +++ b/pywb/framework/proxy.py @@ -118,7 +118,7 @@ class ProxyRouter(object): # check resolver, for pre connect resolve if self.resolver.pre_connect: - route, coll, matcher, response, ts = self.resolver.resolve(env) + route, coll, matcher, ts, response = self.resolver.resolve(env) if response: return response From 407da7528ba70565da46dc6363745e962c1e9983 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 31 Jul 2014 17:02:26 -0700 Subject: [PATCH 18/26] proxy/rewrite: don't rewrite headers banner_only --- pywb/framework/proxy.py | 7 ++++++- pywb/rewrite/header_rewriter.py | 16 +++++++++++----- 
pywb/rewrite/rewrite_content.py | 4 ++++ 3 files changed, 21 insertions(+), 6 deletions(-) diff --git a/pywb/framework/proxy.py b/pywb/framework/proxy.py index 90ff2fd0..ab322374 100644 --- a/pywb/framework/proxy.py +++ b/pywb/framework/proxy.py @@ -181,7 +181,12 @@ class ProxyRouter(object): elif is_https: wbrequest.wb_url.mod = 'bn_' - return route.handler(wbrequest) + response = route.handler(wbrequest) + + if wbrequest.wb_url and wbrequest.wb_url.is_replay(): + response.status_headers.replace_header('Cache-Control', 'no-cache') + + return response def get_request_socket(self, env): if not self.ca: diff --git a/pywb/rewrite/header_rewriter.py b/pywb/rewrite/header_rewriter.py index 2dfc824d..fd41eba8 100644 --- a/pywb/rewrite/header_rewriter.py +++ b/pywb/rewrite/header_rewriter.py @@ -37,7 +37,7 @@ class HeaderRewriter: ENCODING_HEADERS = ['content-encoding'] - REMOVE_HEADERS = ['transfer-encoding'] + REMOVE_HEADERS = ['transfer-encoding', 'content-security-policy'] PROXY_NO_REWRITE_HEADERS = ['content-length'] @@ -90,7 +90,10 @@ class HeaderRewriter: new_headers = [] removed_header_dict = {} - cookie_rewriter = urlrewriter.get_cookie_rewriter() + if urlrewriter: + cookie_rewriter = urlrewriter.get_cookie_rewriter() + else: + cookie_rewriter = None for (name, value) in headers: @@ -99,7 +102,7 @@ class HeaderRewriter: if lowername in self.PROXY_HEADERS: new_headers.append((name, value)) - elif lowername in self.URL_REWRITE_HEADERS: + elif urlrewriter and lowername in self.URL_REWRITE_HEADERS: new_headers.append((name, urlrewriter.rewrite(value))) elif lowername in self.ENCODING_HEADERS: @@ -109,7 +112,8 @@ class HeaderRewriter: new_headers.append((name, value)) elif lowername in self.REMOVE_HEADERS: - removed_header_dict[lowername] = value + removed_header_dict[lowername] = value + new_headers.append((self.header_prefix + name, value)) elif (lowername in self.PROXY_NO_REWRITE_HEADERS and not content_rewritten): @@ -120,7 +124,9 @@ class HeaderRewriter: 
cookie_list = cookie_rewriter.rewrite(value) new_headers.extend(cookie_list) - else: + elif urlrewriter: new_headers.append((self.header_prefix + name, value)) + else: + new_headers.append((name, value)) return (new_headers, removed_header_dict) diff --git a/pywb/rewrite/rewrite_content.py b/pywb/rewrite/rewrite_content.py index 93ec396b..e81fdf9a 100644 --- a/pywb/rewrite/rewrite_content.py +++ b/pywb/rewrite/rewrite_content.py @@ -63,6 +63,10 @@ class RewriteContent: status_headers, stream = self.sanitize_content(headers, stream) return (status_headers, self.stream_to_gen(stream), False) + + if wb_url.is_banner_only: + urlrewriter = None + (rewritten_headers, stream) = self.rewrite_headers(urlrewriter, headers, stream) From f5c27d7b068d9a61a8379eb9593353ce988e397d Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 31 Jul 2014 17:33:43 -0700 Subject: [PATCH 19/26] rewrite: fix header rewrite test proxy_pac: use http host header if available for proxy host --- pywb/framework/proxy.py | 4 +--- pywb/rewrite/test/test_header_rewriter.py | 8 +++++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pywb/framework/proxy.py b/pywb/framework/proxy.py index ab322374..76cd8843 100644 --- a/pywb/framework/proxy.py +++ b/pywb/framework/proxy.py @@ -300,8 +300,7 @@ class ProxyRouter(object): # Proxy Auto-Config (PAC) script for the proxy def make_pac_response(self, env): - import os - hostname = os.environ.get('PYWB_HOST_NAME') + hostname = env.get('HTTP_HOST') if not hostname: server_hostport = env['SERVER_NAME'] + ':' + env['SERVER_PORT'] hostonly = env['SERVER_NAME'] @@ -319,7 +318,6 @@ class ProxyRouter(object): buff += direct.format(hostonly) - #buff += '\n return "PROXY {0}";\n}}\n'.format(self.hostpaths[0]) buff += '\n return "PROXY {0}";\n}}\n'.format(server_hostport) content_type = 'application/x-ns-proxy-autoconfig' diff --git a/pywb/rewrite/test/test_header_rewriter.py b/pywb/rewrite/test/test_header_rewriter.py index 1a2b2cea..0b22d533 100644 --- 
a/pywb/rewrite/test/test_header_rewriter.py +++ b/pywb/rewrite/test/test_header_rewriter.py @@ -40,17 +40,19 @@ HTTP Headers Rewriting 'removed_header_dict': {'content-encoding': 'gzip', 'transfer-encoding': 'chunked'}, 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Content-Length', '199999'), - ('Content-Type', 'text/javascript')]), + ('Content-Type', 'text/javascript'), + ('X-Archive-Orig-Transfer-Encoding', 'chunked')]), 'text_type': 'js'} -# Binary -- transfer-encoding removed +# Binary -- transfer-encoding rewritten >>> _test_headers([('Content-Length', '200000'), ('Content-Type', 'image/png'), ('Set-Cookie', 'foo=bar; Path=/;'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')]) {'charset': None, 'removed_header_dict': {'transfer-encoding': 'chunked'}, 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('Content-Length', '200000'), ('Content-Type', 'image/png'), ('Set-Cookie', 'foo=bar; Path=/web/20131010/http://example.com/'), - ('Content-Encoding', 'gzip')]), + ('Content-Encoding', 'gzip'), + ('X-Archive-Orig-Transfer-Encoding', 'chunked')]), 'text_type': None} """ From 2ca4757599a358ec6b06d8eb29cb2c145e2de81f Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 31 Jul 2014 18:03:18 -0700 Subject: [PATCH 20/26] fix integration test for proxy_pac --- tests/test_integration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_integration.py b/tests/test_integration.py index 456d50f8..a3bd6f3b 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -385,7 +385,7 @@ class TestWb: assert resp.status_int == 407 def test_proxy_pac(self): - resp = self.testapp.get('/proxy.pac', extra_environ = dict(SERVER_NAME='pywb-proxy', SERVER_PORT='8080')) + resp = self.testapp.get('/proxy.pac', headers = [('Host', 'pywb-proxy:8080')]) assert resp.content_type == 'application/x-ns-proxy-autoconfig' assert '"PROXY 
pywb-proxy:8080"' in resp.body assert '"localhost"' in resp.body From 92daad3b2bbced9b377d5a41f7ef3fcb38a840c1 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 31 Jul 2014 18:56:35 -0700 Subject: [PATCH 21/26] ui: tweak head insert text for proxy --- pywb/static/wb.js | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pywb/static/wb.js b/pywb/static/wb.js index f4267b8e..19d292c3 100644 --- a/pywb/static/wb.js +++ b/pywb/static/wb.js @@ -73,10 +73,9 @@ function init_banner() { if (wbinfo.proxy_magic && wbinfo.url) { var select_url = wbinfo.proxy_magic + "/" + wbinfo.url; var query_url = wbinfo.proxy_magic + "/*/" + wbinfo.url; + text += ' All Capture Times'; text += '
' - text += 'From ' + wbinfo.coll + ' [Switch]'; - text += '  '; - text += 'View All Captures'; + text += 'From collection ' + wbinfo.coll + ' All Collections'; } banner.innerHTML = text; From 37fd75f744234903ea5835c310cc2bc79cd01fe4 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 31 Jul 2014 21:17:07 -0700 Subject: [PATCH 22/26] update version to 0.6.0, update CHANGELIST add quotes around "coll" in header --- CHANGES.rst | 8 ++++++++ README.rst | 9 +++++---- pywb/rewrite/header_rewriter.py | 3 ++- pywb/static/wb.js | 2 +- setup.py | 2 +- 5 files changed, 17 insertions(+), 7 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index a7848d64..0ab917fb 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,3 +1,11 @@ +pywb 0.6.0 changelist +~~~~~~~~~~~~~~~~~~~~~ + +* HTTPS Proxy Support! + +* Revamped HTTP/S system: proxy collection and capture time switching via cookie! + + pywb 0.5.1 changelist ~~~~~~~~~~~~~~~~~~~~~ minor fixes: diff --git a/README.rst b/README.rst index 6aa256ac..3640c69d 100644 --- a/README.rst +++ b/README.rst @@ -1,11 +1,11 @@ -PyWb 0.5.2 +PyWb 0.6.0 ========== -.. image:: https://travis-ci.org/ikreymer/pywb.png?branch=develop +.. image:: https://travis-ci.org/ikreymer/pywb.png?branch=https-proxy :target: https://travis-ci.org/ikreymer/pywb -.. image:: https://coveralls.io/repos/ikreymer/pywb/badge.png?branch=develop - :target: https://coveralls.io/r/ikreymer/pywb?branch=develop +.. image:: https://coveralls.io/repos/ikreymer/pywb/badge.png?branch=https-proxy + :target: https://coveralls.io/r/ikreymer/pywb?branch=https-proxy pywb is a python implementation of web archival replay tools, sometimes also known as 'Wayback Machine'. @@ -21,6 +21,7 @@ This README contains a basic overview of using pywb. After reading this intro, c * `pywb-samples `_ provides additional archive samples with difficult-to-replay content. 
+* `pywb-proxy-demo `_ showcases the revamped HTTP/S proxy replay system (available from pywb 0.6.0) The following deployed applications use pywb: diff --git a/pywb/rewrite/header_rewriter.py b/pywb/rewrite/header_rewriter.py index fd41eba8..2d505e88 100644 --- a/pywb/rewrite/header_rewriter.py +++ b/pywb/rewrite/header_rewriter.py @@ -37,7 +37,8 @@ class HeaderRewriter: ENCODING_HEADERS = ['content-encoding'] - REMOVE_HEADERS = ['transfer-encoding', 'content-security-policy'] + REMOVE_HEADERS = ['transfer-encoding', 'content-security-policy', + 'strict-transport-security'] PROXY_NO_REWRITE_HEADERS = ['content-length'] diff --git a/pywb/static/wb.js b/pywb/static/wb.js index 19d292c3..3ef6471e 100644 --- a/pywb/static/wb.js +++ b/pywb/static/wb.js @@ -75,7 +75,7 @@ function init_banner() { var query_url = wbinfo.proxy_magic + "/*/" + wbinfo.url; text += ' All Capture Times'; text += '
' - text += 'From collection ' + wbinfo.coll + ' All Collections'; + text += 'From collection "' + wbinfo.coll + '" All Collections'; } banner.innerHTML = text; diff --git a/setup.py b/setup.py index 2881d1e5..6b5482bf 100755 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ class PyTest(TestCommand): setup( name='pywb', - version='0.5.2', + version='0.6.0', url='https://github.com/ikreymer/pywb', author='Ilya Kreymer', author_email='ikreymer@gmail.com', From 48b1c7891772a3326ca114ac0d824dff32a627e2 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 31 Jul 2014 21:27:30 -0700 Subject: [PATCH 23/26] proxy: more banner tweaks --- pywb/static/wb.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pywb/static/wb.js b/pywb/static/wb.js index 3ef6471e..d4db630e 100644 --- a/pywb/static/wb.js +++ b/pywb/static/wb.js @@ -73,9 +73,9 @@ function init_banner() { if (wbinfo.proxy_magic && wbinfo.url) { var select_url = wbinfo.proxy_magic + "/" + wbinfo.url; var query_url = wbinfo.proxy_magic + "/*/" + wbinfo.url; - text += ' All Capture Times'; + text += ' All Capture Times'; text += '
' - text += 'From collection "' + wbinfo.coll + '" All Collections'; + text += 'From collection "' + wbinfo.coll + '" All Collections'; } banner.innerHTML = text; From aeb246466b0bec2a28284dd4f0504b9b277bea8a Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 1 Aug 2014 12:35:19 -0700 Subject: [PATCH 24/26] proxy: SSL version is 0-based not 1-based, set_version(2) for version 3! --- pywb/framework/certauth.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pywb/framework/certauth.py b/pywb/framework/certauth.py index 73b0d0e4..ef47b380 100644 --- a/pywb/framework/certauth.py +++ b/pywb/framework/certauth.py @@ -59,7 +59,7 @@ class CertificateAuthority(object): @staticmethod def _make_cert(certname): cert = crypto.X509() - cert.set_version(3) + cert.set_version(2) cert.set_serial_number(random.randint(0, 2 ** 64 - 1)) cert.get_subject().CN = certname From 4efd2d514c4cafed5e5d22007ee1c1e9f1819332 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 1 Aug 2014 17:15:49 -0700 Subject: [PATCH 25/26] proxy: add proxy_cert download page at root http://pywb.proxy, serving .pem and .p12 (windows only) certs for auto installation --- pywb/framework/certauth.py | 6 +++ pywb/framework/proxy.py | 76 +++++++++++++++++++++++++++----- pywb/ui/proxy_cert_download.html | 14 ++++++ pywb/ui/proxy_select.html | 2 +- pywb/webapp/pywb_init.py | 12 ++++- 5 files changed, 96 insertions(+), 14 deletions(-) create mode 100644 pywb/ui/proxy_cert_download.html diff --git a/pywb/framework/certauth.py b/pywb/framework/certauth.py index ef47b380..260f5bdc 100644 --- a/pywb/framework/certauth.py +++ b/pywb/framework/certauth.py @@ -56,6 +56,12 @@ class CertificateAuthority(object): return True, host_filename + def get_root_PKCS12(self): + p12 = crypto.PKCS12() + p12.set_certificate(self.cert) + p12.set_privatekey(self.key) + return p12.export() + @staticmethod def _make_cert(certname): cert = crypto.X509() diff --git a/pywb/framework/proxy.py b/pywb/framework/proxy.py index 
76cd8843..fe8e3ec8 100644 --- a/pywb/framework/proxy.py +++ b/pywb/framework/proxy.py @@ -57,6 +57,9 @@ class ProxyRouter(object): BLOCK_SIZE = 4096 DEF_MAGIC_NAME = 'pywb.proxy' + CERT_DL_PEM = '/pywb-ca.pem' + CERT_DL_P12 = '/pywb-ca.p12' + def __init__(self, routes, **kwargs): self.hostpaths = kwargs.get('hostpaths') @@ -81,19 +84,24 @@ class ProxyRouter(object): self.proxy_pac_path = proxy_options.get('pac_path', self.PAC_PATH) - if proxy_options.get('enable_https_proxy'): - ca_file = proxy_options.get('root_ca_file') - - # attempt to create the root_ca_file if doesn't exist - # (generally recommended to create this seperately) - certname = proxy_options.get('root_ca_name') - CertificateAuthority.generate_ca_root(certname, ca_file) - - certs_dir = proxy_options.get('certs_dir') - self.ca = CertificateAuthority(ca_file=ca_file, - certs_dir=certs_dir) - else: + if not proxy_options.get('enable_https_proxy'): self.ca = None + self.proxy_cert_dl_view = None + return + + # HTTPS Only Options + ca_file = proxy_options.get('root_ca_file') + + # attempt to create the root_ca_file if doesn't exist + # (generally recommended to create this seperately) + certname = proxy_options.get('root_ca_name') + CertificateAuthority.generate_ca_root(certname, ca_file) + + certs_dir = proxy_options.get('certs_dir') + self.ca = CertificateAuthority(ca_file=ca_file, + certs_dir=certs_dir) + + self.proxy_cert_dl_view = proxy_options.get('proxy_cert_download_view') def __call__(self, env): is_https = (env['REQUEST_METHOD'] == 'CONNECT') @@ -143,6 +151,12 @@ class ProxyRouter(object): # route (static) and other resources to archival replay if env['pywb.proxy_host'] == self.magic_name: env['REL_REQUEST_URI'] = env['pywb.proxy_req_uri'] + + # special case for proxy install + response = self.handle_cert_install(env) + if response: + return response + return None # check resolver, post connect @@ -298,6 +312,44 @@ class ProxyRouter(object): block_size=self.BLOCK_SIZE, starting_data=remainder) 
+ def handle_cert_install(self, env): + if env['pywb.proxy_req_uri'] in ('/', '/index.html', '/index.html'): + available = (self.ca is not None) + + if self.proxy_cert_dl_view: + return (self.proxy_cert_dl_view. + render_response(available=available, + pem_path=self.CERT_DL_PEM, + p12_path=self.CERT_DL_P12)) + else: + return None + + elif env['pywb.proxy_req_uri'] == self.CERT_DL_PEM: + if not self.ca: + return None + + buff = '' + with open(self.ca.ca_file) as fh: + buff = fh.read() + + content_type = 'application/x-x509-ca-cert' + + return WbResponse.text_response(buff, + content_type=content_type) + + elif env['pywb.proxy_req_uri'] == self.CERT_DL_P12: + if not self.ca: + return None + + buff = self.ca.get_root_PKCS12() + + content_type = 'application/x-pkcs12' + + return WbResponse.text_response(buff, + content_type=content_type) + else: + return None + # Proxy Auto-Config (PAC) script for the proxy def make_pac_response(self, env): hostname = env.get('HTTP_HOST') diff --git a/pywb/ui/proxy_cert_download.html b/pywb/ui/proxy_cert_download.html new file mode 100644 index 00000000..71255e3a --- /dev/null +++ b/pywb/ui/proxy_cert_download.html @@ -0,0 +1,14 @@ +

HTTPS Certificate For PyWb Web Archive Replay

+{% if not available %} +

Sorry, HTTPS support is not configured for this proxy. However, the proxy should work in HTTP mode.

+{% else %} +

Download for all platforms (except Windows):

+

Download Certificate (All except Windows)

+ +

(If you see the Already Installed message, then no further action is necessary and you may start browsing!)

+{% endif %} + +

Download for Windows platforms:

+

Download Certificate (Windows Only)

+ + diff --git a/pywb/ui/proxy_select.html b/pywb/ui/proxy_select.html index a5164ff2..ff9afc00 100644 --- a/pywb/ui/proxy_select.html +++ b/pywb/ui/proxy_select.html @@ -14,7 +14,7 @@ Current collection is: {{ coll }}
    {% for route in routes %} -{% if route | is_wb_handler %} +{% if route.path and route | is_wb_handler %}
  • {{ route.path }}
  • {% endif %} {% endfor %} diff --git a/pywb/webapp/pywb_init.py b/pywb/webapp/pywb_init.py index 3b3a3cc6..4c503be6 100644 --- a/pywb/webapp/pywb_init.py +++ b/pywb/webapp/pywb_init.py @@ -33,7 +33,9 @@ DEFAULTS = { 'search_html': 'ui/search.html', 'home_html': 'ui/index.html', 'error_html': 'ui/error.html', + 'proxy_select_html': 'ui/proxy_select.html', + 'proxy_cert_download_html': 'ui/proxy_cert_download.html', 'template_globals': {'static_path': 'static/default'}, @@ -227,7 +229,15 @@ def create_wb_router(passed_config={}): if not 'proxy_options' in passed_config: passed_config['proxy_options'] = {} - passed_config['proxy_options']['proxy_select_view'] = view + if view: + passed_config['proxy_options']['proxy_select_view'] = view + + view = J2TemplateView.create_template( + config.get('proxy_cert_download_html'), + 'Proxy Cert Download') + + if view: + passed_config['proxy_options']['proxy_cert_download_view'] = view else: router = ArchivalRouter From 92726309fc956393f5979ee90a7eecf41aa33118 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 2 Aug 2014 04:27:51 -0700 Subject: [PATCH 26/26] proxy: add 'extra_headers' to be added to proxy responses, customizable via proxy_options defaults include no-cache and p3p policy (needed for IE default settings) fix link generation for proxy_select page, better exception handling of ssl errors --- pywb/framework/proxy.py | 25 ++++++++++++++++++------- pywb/framework/proxy_resolvers.py | 26 +++++++++++++++++++------- pywb/ui/proxy_select.html | 2 +- pywb/utils/statusandheaders.py | 20 +++++++++++++++++++- 4 files changed, 57 insertions(+), 16 deletions(-) diff --git a/pywb/framework/proxy.py b/pywb/framework/proxy.py index fe8e3ec8..57dd5088 100644 --- a/pywb/framework/proxy.py +++ b/pywb/framework/proxy.py @@ -60,6 +60,9 @@ class ProxyRouter(object): CERT_DL_PEM = '/pywb-ca.pem' CERT_DL_P12 = '/pywb-ca.p12' + EXTRA_HEADERS = {'cache-control': 'no-cache', + 'p3p': 'CP="NOI ADM DEV COM NAV OUR STP"'} + def 
__init__(self, routes, **kwargs): self.hostpaths = kwargs.get('hostpaths') @@ -74,6 +77,11 @@ class ProxyRouter(object): self.magic_name = self.DEF_MAGIC_NAME proxy_options['magic_name'] = self.magic_name + self.extra_headers = proxy_options.get('extra_headers') + if not self.extra_headers: + self.extra_headers = self.EXTRA_HEADERS + proxy_options['extra_headers'] = self.extra_headers + if proxy_options.get('cookie_resolver'): self.resolver = CookieResolver(routes, proxy_options) else: @@ -198,7 +206,7 @@ class ProxyRouter(object): response = route.handler(wbrequest) if wbrequest.wb_url and wbrequest.wb_url.is_replay(): - response.status_headers.replace_header('Cache-Control', 'no-cache') + response.status_headers.replace_headers(self.extra_headers) return response @@ -252,19 +260,22 @@ class ProxyRouter(object): server_side=True, certfile=certfile, ciphers="ALL", + suppress_ragged_eofs=False, + #ssl_version=ssl.PROTOCOL_TLSv1) ssl_version=ssl.PROTOCOL_SSLv23) + env['pywb.proxy_ssl_sock'] = ssl_sock + + buffreader = BufferedReader(ssl_sock, block_size=self.BLOCK_SIZE) + + statusline = buffreader.readline().rstrip() + except Exception as se: raise BadRequestException(se.message) - env['pywb.proxy_ssl_sock'] = ssl_sock - - buffreader = BufferedReader(ssl_sock, block_size=self.BLOCK_SIZE) - - statusline = buffreader.readline() statusparts = statusline.split(' ') if len(statusparts) < 3: - raise BadRequestException('Invalid Proxy Request') + raise BadRequestException('Invalid Proxy Request: ' + statusline) env['REQUEST_METHOD'] = statusparts[0] env['REL_REQUEST_URI'] = ('https://' + diff --git a/pywb/framework/proxy_resolvers.py b/pywb/framework/proxy_resolvers.py index 8fb65b73..dc7b22fe 100644 --- a/pywb/framework/proxy_resolvers.py +++ b/pywb/framework/proxy_resolvers.py @@ -132,6 +132,8 @@ class CookieResolver(BaseCollResolver): # pragma: no cover self.cookie_name = config.get('cookie_name', '__pywb_coll') self.proxy_select_view = config.get('proxy_select_view') + 
self.extra_headers = config.get('extra_headers') + if uwsgi_cache: self.cache = UwsgiCache() else: @@ -222,14 +224,17 @@ class CookieResolver(BaseCollResolver): # pragma: no cover coll, ts, sesh_id = self.get_coll(env) - route_temp = env['pywb.proxy_scheme'] + '://%s-set.' - route_temp += self.magic_name + '/' + path_url + #scheme = env['pywb.proxy_scheme'] + '://' + route_temp = '-set.' + self.magic_name + '/' + path_url - return (self.proxy_select_view. - render_response(routes=self.routes, - route_temp=route_temp, - coll=coll, - url=path_url)) + try: + return (self.proxy_select_view. + render_response(routes=self.routes, + route_temp=route_temp, + coll=coll, + url=path_url)) + except Exception as exc: + raise #else: # msg = 'Invalid Magic Path: ' + url @@ -301,6 +306,13 @@ class CookieResolver(BaseCollResolver): # pragma: no cover return sesh_id def make_redir_response(self, url, headers=None): + if not headers: + headers = [] + + if self.extra_headers: + for name, value in self.extra_headers.iteritems(): + headers.append((name, value)) + return WbResponse.redir_response(url, headers=headers) @staticmethod diff --git a/pywb/ui/proxy_select.html b/pywb/ui/proxy_select.html index ff9afc00..b06f68a2 100644 --- a/pywb/ui/proxy_select.html +++ b/pywb/ui/proxy_select.html @@ -15,7 +15,7 @@ Current collection is: {{ coll }} diff --git a/pywb/utils/statusandheaders.py b/pywb/utils/statusandheaders.py index ae3fc261..70ba850c 100644 --- a/pywb/utils/statusandheaders.py +++ b/pywb/utils/statusandheaders.py @@ -3,6 +3,7 @@ Representation and parsing of HTTP-style status + headers """ import pprint +from copy import copy #================================================================= @@ -44,9 +45,26 @@ class StatusAndHeaders(object): self.headers.append((name, value)) return None + def replace_headers(self, header_dict): + """ + replace all headers in header_dict that already exist + add any remaining headers + """ + header_dict = copy(header_dict) + + for index in 
xrange(len(self.headers) - 1, -1, -1): + curr_name, curr_value = self.headers[index] + name_lower = curr_name.lower() + if name_lower in header_dict: + self.headers[index] = (curr_name, header_dict[name_lower]) + del header_dict[name_lower] + + for name, value in header_dict.iteritems(): + self.headers.append((name, value)) + def remove_header(self, name): """ - remove header (case-insensitive) + Remove header (case-insensitive) return True if header removed, False otherwise """ name_lower = name.lower()