From 739f23da9e2ab0beb865c0284eec8577f00c93b2 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Sat, 26 Jul 2014 09:48:44 -0700
Subject: [PATCH 01/26] https proxy support, CONNECT verb handling (uwsgi only)
---
README.rst | 2 +-
pywb/framework/proxy.py | 72 ++++++++++++++++++++++++++++++++-
pywb/framework/wsgi_wrappers.py | 33 +++++++++++++++
setup.py | 2 +-
4 files changed, 106 insertions(+), 3 deletions(-)
diff --git a/README.rst b/README.rst
index 9c4b380d..6aa256ac 100644
--- a/README.rst
+++ b/README.rst
@@ -1,4 +1,4 @@
-PyWb 0.5.1
+PyWb 0.5.2
==========
.. image:: https://travis-ci.org/ikreymer/pywb.png?branch=develop
diff --git a/pywb/framework/proxy.py b/pywb/framework/proxy.py
index 62bc06b0..6754ecd7 100644
--- a/pywb/framework/proxy.py
+++ b/pywb/framework/proxy.py
@@ -62,12 +62,16 @@ class ProxyRouter(object):
self.unaltered = proxy_options.get('unaltered_replay', False)
def __call__(self, env):
+ if env['REQUEST_METHOD'] == 'CONNECT':
+ if not self.handle_connect(env):
+ return None
+
url = env['REL_REQUEST_URI']
if url.endswith('/proxy.pac'):
return self.make_pac_response(env)
- if not url.startswith('http://'):
+ if not url.startswith(('http://', 'https://')):
return None
proxy_auth = env.get('HTTP_PROXY_AUTHORIZATION')
@@ -122,6 +126,72 @@ class ProxyRouter(object):
return route.handler(wbrequest)
+ def handle_connect(self, env):
+ import uwsgi
+ import socket
+ import ssl
+ from io import BytesIO
+
+ fd = uwsgi.connection_fd()
+ conn = socket.fromfd(fd, socket.AF_INET, socket.SOCK_STREAM)
+ sock = socket.socket(_sock=conn)
+
+ if (self.use_default_coll or
+ len(self.routes) == 1 or
+ env.get('HTTP_PROXY_AUTHORIZATION') is not None):
+
+ sock.send('HTTP/1.0 200 Connection Established\r\n')
+ sock.send('Server: pywb proxy\r\n')
+ sock.send('\r\n')
+ else:
+ env['pywb.proxy_statusline'] = '407 Proxy Auth Required'
+ sock.send('HTTP/1.0 407 Proxy Auth Required\r\n')
+ sock.send('Server: pywb proxy\r\n')
+ sock.send('\r\n')
+ return False
+
+ ssl_sock = ssl.wrap_socket(sock, server_side=True,
+ certfile='/tmp/testcert.pem',
+ ssl_version=ssl.PROTOCOL_SSLv23)
+
+ env['pywb.proxy_ssl_sock'] = ssl_sock
+
+ buff = ssl_sock.recv(4096)
+
+ buffreader = BytesIO(buff)
+
+ statusline = buffreader.readline()
+ statusparts = statusline.split(' ')
+
+ if len(statusparts) < 3:
+ return
+
+ env['REQUEST_METHOD'] = statusparts[0]
+ env['REL_REQUEST_URI'] = ('https://' +
+ env['REL_REQUEST_URI'].replace(':443', '') +
+ statusparts[1])
+
+ env['SERVER_PROTOCOL'] = statusparts[2].strip()
+
+ queryparts = env['REL_REQUEST_URI'].split('?', 1)
+ env['PATH_INFO'] = queryparts[0]
+ env['QUERY_STRING'] = queryparts[1] if len(queryparts) > 1 else ''
+
+ while True:
+ line = buffreader.readline()
+ if not line:
+ break
+
+ parts = line.split(':')
+ if len(parts) < 2:
+ continue
+
+ name = 'HTTP_' + parts[0].replace('-', '_').upper()
+ env[name] = parts[1]
+
+ return True
+
+
# Proxy Auto-Config (PAC) script for the proxy
def make_pac_response(self, env):
import os
diff --git a/pywb/framework/wsgi_wrappers.py b/pywb/framework/wsgi_wrappers.py
index 3729a660..b40b5678 100644
--- a/pywb/framework/wsgi_wrappers.py
+++ b/pywb/framework/wsgi_wrappers.py
@@ -50,6 +50,39 @@ class WSGIApp(object):
# Top-level wsgi application
def __call__(self, env, start_response):
+ if env['REQUEST_METHOD'] == 'CONNECT':
+ return self.handle_connect(env, start_response)
+ else:
+ return self.handle_methods(env, start_response)
+
+ def handle_connect(self, env, start_response):
+ def ssl_start_response(statusline, headers):
+ ssl_sock = env.get('pywb.proxy_ssl_sock')
+ if not ssl_sock:
+ return
+
+ env['pywb.proxy_statusline'] = statusline
+
+ ssl_sock.write('HTTP/1.1 ' + statusline + '\r\n')
+ for name, value in headers:
+ ssl_sock.write(name + ': ' + value + '\r\n')
+
+ resp_iter = self.handle_methods(env, ssl_start_response)
+
+ ssl_sock = env.get('pywb.proxy_ssl_sock')
+ if ssl_sock:
+ ssl_sock.write('\r\n')
+
+ for obj in resp_iter:
+ ssl_sock.write(obj)
+
+ ssl_sock.close()
+
+ start_response(env['pywb.proxy_statusline'], [])
+
+ return []
+
+ def handle_methods(self, env, start_response):
if env.get('SCRIPT_NAME') or not env.get('REQUEST_URI'):
env['REL_REQUEST_URI'] = rel_request_uri(env)
else:
diff --git a/setup.py b/setup.py
index 3e89abed..a6e9c885 100755
--- a/setup.py
+++ b/setup.py
@@ -34,7 +34,7 @@ class PyTest(TestCommand):
setup(
name='pywb',
- version='0.5.1',
+ version='0.5.2',
url='https://github.com/ikreymer/pywb',
author='Ilya Kreymer',
author_email='ikreymer@gmail.com',
From eca3cf5fbf096b123c760d196f9c3067bce49eb0 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Sat, 26 Jul 2014 13:24:53 -0700
Subject: [PATCH 02/26] https proxy: add ca generator! support uwsgi, gunicorn
and ref better handling of 407, other error responses in response to CONNECT
---
config.yaml | 3 ++
pywb/framework/certa.py | 87 ++++++++++++++++++++++++++++++
pywb/framework/proxy.py | 95 ++++++++++++++++++++-------------
pywb/framework/wsgi_wrappers.py | 19 ++++---
4 files changed, 160 insertions(+), 44 deletions(-)
create mode 100644 pywb/framework/certa.py
diff --git a/config.yaml b/config.yaml
index 937b4545..fc2290ba 100644
--- a/config.yaml
+++ b/config.yaml
@@ -109,3 +109,6 @@ enable_memento: true
# Replay content in an iframe
framed_replay: true
+
+debug_echo_env: True
+
diff --git a/pywb/framework/certa.py b/pywb/framework/certa.py
new file mode 100644
index 00000000..844ad497
--- /dev/null
+++ b/pywb/framework/certa.py
@@ -0,0 +1,87 @@
+import logging
+import os
+import OpenSSL
+import random
+
+
+class CertificateAuthority(object):
+ logger = logging.getLogger('pywb.CertificateAuthority')
+
+ def __init__(self, ca_file='pywb-ca.pem', certs_dir='./pywb-ca'):
+ self.ca_file = ca_file
+ self.certs_dir = certs_dir
+
+ if not os.path.exists(ca_file):
+ self._generate_ca()
+ else:
+ self._read_ca(ca_file)
+
+ if not os.path.exists(certs_dir):
+ self.logger.info("directory for generated certs {} doesn't exist, creating it".format(certs_dir))
+ os.mkdir(certs_dir)
+
+
+ def _generate_ca(self):
+ # Generate key
+ self.key = OpenSSL.crypto.PKey()
+ self.key.generate_key(OpenSSL.crypto.TYPE_RSA, 2048)
+
+ # Generate certificate
+ self.cert = OpenSSL.crypto.X509()
+ self.cert.set_version(3)
+ # avoid sec_error_reused_issuer_and_serial
+ self.cert.set_serial_number(random.randint(0,2**64-1))
+ self.cert.get_subject().CN = 'pywb CA on {}'.format('')
+ self.cert.gmtime_adj_notBefore(0) # now
+ self.cert.gmtime_adj_notAfter(100*365*24*60*60) # 100 yrs in future
+ self.cert.set_issuer(self.cert.get_subject())
+ self.cert.set_pubkey(self.key)
+ self.cert.add_extensions([
+ OpenSSL.crypto.X509Extension(b"basicConstraints", True, b"CA:TRUE, pathlen:0"),
+ OpenSSL.crypto.X509Extension(b"keyUsage", True, b"keyCertSign, cRLSign"),
+ OpenSSL.crypto.X509Extension(b"subjectKeyIdentifier", False, b"hash", subject=self.cert),
+ ])
+ self.cert.sign(self.key, "sha1")
+
+ with open(self.ca_file, 'wb+') as f:
+ f.write(OpenSSL.crypto.dump_privatekey(OpenSSL.SSL.FILETYPE_PEM, self.key))
+ f.write(OpenSSL.crypto.dump_certificate(OpenSSL.SSL.FILETYPE_PEM, self.cert))
+
+ self.logger.info('generated CA key+cert and wrote to {}'.format(self.ca_file))
+
+
+ def _read_ca(self, filename):
+ self.cert = OpenSSL.crypto.load_certificate(OpenSSL.SSL.FILETYPE_PEM, open(filename).read())
+ self.key = OpenSSL.crypto.load_privatekey(OpenSSL.SSL.FILETYPE_PEM, open(filename).read())
+ self.logger.info('read CA key+cert from {}'.format(self.ca_file))
+
+ def __getitem__(self, cn):
+ cnp = os.path.sep.join([self.certs_dir, '%s.pem' % cn])
+ if not os.path.exists(cnp):
+ # create certificate
+ key = OpenSSL.crypto.PKey()
+ key.generate_key(OpenSSL.crypto.TYPE_RSA, 2048)
+
+ # Generate CSR
+ req = OpenSSL.crypto.X509Req()
+ req.get_subject().CN = cn
+ req.set_pubkey(key)
+ req.sign(key, 'sha1')
+
+ # Sign CSR
+ cert = OpenSSL.crypto.X509()
+ cert.set_subject(req.get_subject())
+ cert.set_serial_number(random.randint(0,2**64-1))
+ cert.gmtime_adj_notBefore(0)
+ cert.gmtime_adj_notAfter(10*365*24*60*60)
+ cert.set_issuer(self.cert.get_subject())
+ cert.set_pubkey(req.get_pubkey())
+ cert.sign(self.key, 'sha1')
+
+ with open(cnp, 'wb+') as f:
+ f.write(OpenSSL.crypto.dump_privatekey(OpenSSL.SSL.FILETYPE_PEM, key))
+ f.write(OpenSSL.crypto.dump_certificate(OpenSSL.SSL.FILETYPE_PEM, cert))
+
+ self.logger.info('wrote generated key+cert to {}'.format(cnp))
+
+ return cnp
diff --git a/pywb/framework/proxy.py b/pywb/framework/proxy.py
index 6754ecd7..202e4f3b 100644
--- a/pywb/framework/proxy.py
+++ b/pywb/framework/proxy.py
@@ -4,8 +4,15 @@ from archivalrouter import ArchivalRouter
import urlparse
import base64
+import socket
+import ssl
+from io import BytesIO
+
from pywb.rewrite.url_rewriter import HttpsUrlRewriter
from pywb.utils.statusandheaders import StatusAndHeaders
+from pywb.utils.wbexception import BadRequestException
+
+from certa import CertificateAuthority
#=================================================================
@@ -61,19 +68,21 @@ class ProxyRouter(object):
self.unaltered = proxy_options.get('unaltered_replay', False)
+ self.ca = CertificateAuthority()
+
+
def __call__(self, env):
- if env['REQUEST_METHOD'] == 'CONNECT':
- if not self.handle_connect(env):
+ is_https = (env['REQUEST_METHOD'] == 'CONNECT')
+
+ if not is_https:
+ url = env['REL_REQUEST_URI']
+
+ if url.endswith('/proxy.pac'):
+ return self.make_pac_response(env)
+
+ if not url.startswith(('http://', 'https://')):
return None
- url = env['REL_REQUEST_URI']
-
- if url.endswith('/proxy.pac'):
- return self.make_pac_response(env)
-
- if not url.startswith(('http://', 'https://')):
- return None
-
proxy_auth = env.get('HTTP_PROXY_AUTHORIZATION')
route = None
@@ -108,6 +117,12 @@ class ProxyRouter(object):
else:
return self.proxy_auth_coll_response()
+ # do connect, then get updated url
+ if is_https:
+ self.handle_connect(env)
+
+ url = env['REL_REQUEST_URI']
+
wbrequest = route.request_class(env,
request_uri=url,
wb_url_str=url,
@@ -126,36 +141,41 @@ class ProxyRouter(object):
return route.handler(wbrequest)
- def handle_connect(self, env):
- import uwsgi
- import socket
- import ssl
- from io import BytesIO
-
- fd = uwsgi.connection_fd()
- conn = socket.fromfd(fd, socket.AF_INET, socket.SOCK_STREAM)
- sock = socket.socket(_sock=conn)
-
- if (self.use_default_coll or
- len(self.routes) == 1 or
- env.get('HTTP_PROXY_AUTHORIZATION') is not None):
-
- sock.send('HTTP/1.0 200 Connection Established\r\n')
- sock.send('Server: pywb proxy\r\n')
- sock.send('\r\n')
+ def get_request_socket(self, env):
+ if env.get('uwsgi.version'):
+ import uwsgi
+ fd = uwsgi.connection_fd()
+ conn = socket.fromfd(fd, socket.AF_INET, socket.SOCK_STREAM)
+ sock = socket.socket(_sock=conn)
+ elif env.get('gunicorn.socket'):
+ sock = env['gunicorn.socket']
else:
- env['pywb.proxy_statusline'] = '407 Proxy Auth Required'
- sock.send('HTTP/1.0 407 Proxy Auth Required\r\n')
- sock.send('Server: pywb proxy\r\n')
- sock.send('\r\n')
- return False
+ # attempt to find socket from wsgi.input
+ input_ = env.get('wsgi.input')
+ if input_ and hasattr(input_, '_sock'):
+ sock = socket.socket(_sock=input_._sock)
+
+ return sock
+
+ def handle_connect(self, env):
+ sock = self.get_request_socket(env)
+ if not sock:
+ return WbResponse.text_response('HTTPS Proxy Not Supported',
+ '405 HTTPS Proxy Not Supported')
+
+ sock.send('HTTP/1.0 200 Connection Established\r\n')
+ sock.send('Server: pywb proxy\r\n')
+ sock.send('\r\n')
+
+ hostname = env['REL_REQUEST_URI'].split(':')[0]
ssl_sock = ssl.wrap_socket(sock, server_side=True,
- certfile='/tmp/testcert.pem',
- ssl_version=ssl.PROTOCOL_SSLv23)
+ certfile=self.ca[hostname])
+ #ssl_version=ssl.PROTOCOL_SSLv23)
env['pywb.proxy_ssl_sock'] = ssl_sock
+ #todo: better reading of all headers
buff = ssl_sock.recv(4096)
buffreader = BytesIO(buff)
@@ -164,7 +184,7 @@ class ProxyRouter(object):
statusparts = statusline.split(' ')
if len(statusparts) < 3:
- return
+ raise BadRequestException('Invalid Proxy Request')
env['REQUEST_METHOD'] = statusparts[0]
env['REL_REQUEST_URI'] = ('https://' +
@@ -177,6 +197,8 @@ class ProxyRouter(object):
env['PATH_INFO'] = queryparts[0]
env['QUERY_STRING'] = queryparts[1] if len(queryparts) > 1 else ''
+ env['wsgi.input'] = socket._fileobject(ssl_sock, mode='r')
+
while True:
line = buffreader.readline()
if not line:
@@ -189,9 +211,6 @@ class ProxyRouter(object):
name = 'HTTP_' + parts[0].replace('-', '_').upper()
env[name] = parts[1]
- return True
-
-
# Proxy Auto-Config (PAC) script for the proxy
def make_pac_response(self, env):
import os
diff --git a/pywb/framework/wsgi_wrappers.py b/pywb/framework/wsgi_wrappers.py
index b40b5678..1e1100e4 100644
--- a/pywb/framework/wsgi_wrappers.py
+++ b/pywb/framework/wsgi_wrappers.py
@@ -59,24 +59,27 @@ class WSGIApp(object):
def ssl_start_response(statusline, headers):
ssl_sock = env.get('pywb.proxy_ssl_sock')
if not ssl_sock:
+ start_response(statusline, headers)
return
env['pywb.proxy_statusline'] = statusline
- ssl_sock.write('HTTP/1.1 ' + statusline + '\r\n')
+ ssl_sock.write('HTTP/1.0 ' + statusline + '\r\n')
for name, value in headers:
ssl_sock.write(name + ': ' + value + '\r\n')
resp_iter = self.handle_methods(env, ssl_start_response)
ssl_sock = env.get('pywb.proxy_ssl_sock')
- if ssl_sock:
- ssl_sock.write('\r\n')
+ if not ssl_sock:
+ return resp_iter
- for obj in resp_iter:
- ssl_sock.write(obj)
+ ssl_sock.write('\r\n')
- ssl_sock.close()
+ for obj in resp_iter:
+ ssl_sock.write(obj)
+
+ ssl_sock.close()
start_response(env['pywb.proxy_statusline'], [])
@@ -178,6 +181,10 @@ def init_app(init_func, load_yaml=True, config_file=None, config={}):
def start_wsgi_server(the_app, name, default_port=None): # pragma: no cover
from wsgiref.simple_server import make_server
+ # disable is_hop_by_hop restrictions
+ import wsgiref.handlers
+ wsgiref.handlers.is_hop_by_hop = lambda x: False
+
port = the_app.port
if not port:
From 1464e89c419c88a8986b8d351e359273912b5c8f Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Sat, 26 Jul 2014 14:24:28 -0700
Subject: [PATCH 03/26] wbresponse: always include Content-Length for
text_response
---
pywb/framework/certa.py | 7 +++++--
pywb/framework/wbrequestresponse.py | 3 ++-
2 files changed, 7 insertions(+), 3 deletions(-)
diff --git a/pywb/framework/certa.py b/pywb/framework/certa.py
index 844ad497..d957dbad 100644
--- a/pywb/framework/certa.py
+++ b/pywb/framework/certa.py
@@ -7,7 +7,10 @@ import random
class CertificateAuthority(object):
logger = logging.getLogger('pywb.CertificateAuthority')
- def __init__(self, ca_file='pywb-ca.pem', certs_dir='./pywb-ca'):
+ def __init__(self, ca_file='pywb-ca.pem',
+ certs_dir='./pywb-ca',
+ certname='pywb CA'):
+
self.ca_file = ca_file
self.certs_dir = certs_dir
@@ -31,7 +34,7 @@ class CertificateAuthority(object):
self.cert.set_version(3)
# avoid sec_error_reused_issuer_and_serial
self.cert.set_serial_number(random.randint(0,2**64-1))
- self.cert.get_subject().CN = 'pywb CA on {}'.format('')
+ self.cert.get_subject().CN = certname
self.cert.gmtime_adj_notBefore(0) # now
self.cert.gmtime_adj_notAfter(100*365*24*60*60) # 100 yrs in future
self.cert.set_issuer(self.cert.get_subject())
diff --git a/pywb/framework/wbrequestresponse.py b/pywb/framework/wbrequestresponse.py
index 0f1a9f32..f2c63f9c 100644
--- a/pywb/framework/wbrequestresponse.py
+++ b/pywb/framework/wbrequestresponse.py
@@ -161,7 +161,8 @@ class WbResponse(object):
@staticmethod
def text_response(text, status='200 OK', content_type='text/plain'):
status_headers = StatusAndHeaders(status,
- [('Content-Type', content_type)])
+ [('Content-Type', content_type),
+ ('Content-Length', str(len(text)))])
return WbResponse(status_headers, value=[text])
From 5beb831ae9c76727f6a67ae0078f250288e35448 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Sat, 26 Jul 2014 14:27:31 -0700
Subject: [PATCH 04/26] wbrequestresponse: update doctest
---
pywb/framework/test/test_wbrequestresponse.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pywb/framework/test/test_wbrequestresponse.py b/pywb/framework/test/test_wbrequestresponse.py
index e066d4d1..65940e4a 100644
--- a/pywb/framework/test/test_wbrequestresponse.py
+++ b/pywb/framework/test/test_wbrequestresponse.py
@@ -40,7 +40,7 @@
# WbResponse Tests
# =================
>>> WbResponse.text_response('Test')
-{'body': ['Test'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [('Content-Type', 'text/plain')])}
+{'body': ['Test'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [('Content-Type', 'text/plain'), ('Content-Length', '4')])}
>>> WbResponse.text_stream(['Test', 'Another'], '404')
{'body': ['Test', 'Another'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '404', headers = [('Content-Type', 'text/plain')])}
From e58a63a9feaba6f54fb98d5c5960c8a50b7546e2 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Sat, 26 Jul 2014 14:35:52 -0700
Subject: [PATCH 05/26] setup: add openssl as a req
---
setup.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/setup.py b/setup.py
index a6e9c885..45349981 100755
--- a/setup.py
+++ b/setup.py
@@ -70,6 +70,7 @@ setup(
'jinja2',
'surt',
'pyyaml',
+ 'pyopenssl',
],
tests_require=[
'pytest',
From ae35d92dded125bd95813fd48b87366e4ff66a2d Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Sat, 26 Jul 2014 15:27:02 -0700
Subject: [PATCH 06/26] fix typo in certauth
---
pywb/framework/certa.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/pywb/framework/certa.py b/pywb/framework/certa.py
index d957dbad..21cf8770 100644
--- a/pywb/framework/certa.py
+++ b/pywb/framework/certa.py
@@ -13,6 +13,7 @@ class CertificateAuthority(object):
self.ca_file = ca_file
self.certs_dir = certs_dir
+ self.certname = certname
if not os.path.exists(ca_file):
self._generate_ca()
@@ -34,7 +35,7 @@ class CertificateAuthority(object):
self.cert.set_version(3)
# avoid sec_error_reused_issuer_and_serial
self.cert.set_serial_number(random.randint(0,2**64-1))
- self.cert.get_subject().CN = certname
+ self.cert.get_subject().CN = self.certname
self.cert.gmtime_adj_notBefore(0) # now
self.cert.gmtime_adj_notAfter(100*365*24*60*60) # 100 yrs in future
self.cert.set_issuer(self.cert.get_subject())
From 2a9197137e94b532a8a00aa8de269178bbb22cb9 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Sat, 26 Jul 2014 21:06:28 -0700
Subject: [PATCH 07/26] certauth: some cleanup for pep8, 2.6 compat
---
pywb/framework/certa.py | 44 +++++++++++++++++++++++++++--------------
1 file changed, 29 insertions(+), 15 deletions(-)
diff --git a/pywb/framework/certa.py b/pywb/framework/certa.py
index 21cf8770..b7b1e5bf 100644
--- a/pywb/framework/certa.py
+++ b/pywb/framework/certa.py
@@ -4,6 +4,7 @@ import OpenSSL
import random
+#=================================================================
class CertificateAuthority(object):
logger = logging.getLogger('pywb.CertificateAuthority')
@@ -21,7 +22,6 @@ class CertificateAuthority(object):
self._read_ca(ca_file)
if not os.path.exists(certs_dir):
- self.logger.info("directory for generated certs {} doesn't exist, creating it".format(certs_dir))
os.mkdir(certs_dir)
@@ -41,23 +41,37 @@ class CertificateAuthority(object):
self.cert.set_issuer(self.cert.get_subject())
self.cert.set_pubkey(self.key)
self.cert.add_extensions([
- OpenSSL.crypto.X509Extension(b"basicConstraints", True, b"CA:TRUE, pathlen:0"),
- OpenSSL.crypto.X509Extension(b"keyUsage", True, b"keyCertSign, cRLSign"),
- OpenSSL.crypto.X509Extension(b"subjectKeyIdentifier", False, b"hash", subject=self.cert),
+ OpenSSL.crypto.X509Extension(b"basicConstraints",
+ True,
+ b"CA:TRUE, pathlen:0"),
+
+ OpenSSL.crypto.X509Extension(b"keyUsage",
+ True,
+ b"keyCertSign, cRLSign"),
+
+ OpenSSL.crypto.X509Extension(b"subjectKeyIdentifier",
+ False,
+ b"hash",
+ subject=self.cert),
])
self.cert.sign(self.key, "sha1")
with open(self.ca_file, 'wb+') as f:
- f.write(OpenSSL.crypto.dump_privatekey(OpenSSL.SSL.FILETYPE_PEM, self.key))
- f.write(OpenSSL.crypto.dump_certificate(OpenSSL.SSL.FILETYPE_PEM, self.cert))
-
- self.logger.info('generated CA key+cert and wrote to {}'.format(self.ca_file))
+ f.write(OpenSSL.crypto.dump_privatekey(OpenSSL.SSL.FILETYPE_PEM,
+ self.key))
+ f.write(OpenSSL.crypto.dump_certificate(OpenSSL.SSL.FILETYPE_PEM,
+ self.cert))
def _read_ca(self, filename):
- self.cert = OpenSSL.crypto.load_certificate(OpenSSL.SSL.FILETYPE_PEM, open(filename).read())
- self.key = OpenSSL.crypto.load_privatekey(OpenSSL.SSL.FILETYPE_PEM, open(filename).read())
- self.logger.info('read CA key+cert from {}'.format(self.ca_file))
+ with open(filename) as cert_fh:
+ self.cert = OpenSSL.crypto.load_certificate(
+ OpenSSL.SSL.FILETYPE_PEM, cert_fh.read())
+
+ cert_fh.seek(0)
+
+ self.key = OpenSSL.crypto.load_privatekey(
+ OpenSSL.SSL.FILETYPE_PEM, cert_fh.read())
def __getitem__(self, cn):
cnp = os.path.sep.join([self.certs_dir, '%s.pem' % cn])
@@ -83,9 +97,9 @@ class CertificateAuthority(object):
cert.sign(self.key, 'sha1')
with open(cnp, 'wb+') as f:
- f.write(OpenSSL.crypto.dump_privatekey(OpenSSL.SSL.FILETYPE_PEM, key))
- f.write(OpenSSL.crypto.dump_certificate(OpenSSL.SSL.FILETYPE_PEM, cert))
-
- self.logger.info('wrote generated key+cert to {}'.format(cnp))
+ f.write(OpenSSL.crypto.dump_privatekey(
+ OpenSSL.SSL.FILETYPE_PEM, key))
+ f.write(OpenSSL.crypto.dump_certificate(
+ OpenSSL.SSL.FILETYPE_PEM, cert))
return cnp
From b6fb0e510e2582626e0fa5a92167850f987cfd64 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Sun, 27 Jul 2014 19:35:16 -0700
Subject: [PATCH 08/26] certauth: clean up CertificateAuthority, add cli
 interface for creating root cert and host certs; CertificateAuthority instance
 creates per-host certs, assumes root cert exists; static method
 generate_ca_root() used to create root cert once; add proxy_options to enable
 https support
---
pywb/framework/certa.py | 105 -------------------
pywb/framework/certauth.py | 201 +++++++++++++++++++++++++++++++++++++
pywb/framework/proxy.py | 44 ++++++--
3 files changed, 235 insertions(+), 115 deletions(-)
delete mode 100644 pywb/framework/certa.py
create mode 100644 pywb/framework/certauth.py
diff --git a/pywb/framework/certa.py b/pywb/framework/certa.py
deleted file mode 100644
index b7b1e5bf..00000000
--- a/pywb/framework/certa.py
+++ /dev/null
@@ -1,105 +0,0 @@
-import logging
-import os
-import OpenSSL
-import random
-
-
-#=================================================================
-class CertificateAuthority(object):
- logger = logging.getLogger('pywb.CertificateAuthority')
-
- def __init__(self, ca_file='pywb-ca.pem',
- certs_dir='./pywb-ca',
- certname='pywb CA'):
-
- self.ca_file = ca_file
- self.certs_dir = certs_dir
- self.certname = certname
-
- if not os.path.exists(ca_file):
- self._generate_ca()
- else:
- self._read_ca(ca_file)
-
- if not os.path.exists(certs_dir):
- os.mkdir(certs_dir)
-
-
- def _generate_ca(self):
- # Generate key
- self.key = OpenSSL.crypto.PKey()
- self.key.generate_key(OpenSSL.crypto.TYPE_RSA, 2048)
-
- # Generate certificate
- self.cert = OpenSSL.crypto.X509()
- self.cert.set_version(3)
- # avoid sec_error_reused_issuer_and_serial
- self.cert.set_serial_number(random.randint(0,2**64-1))
- self.cert.get_subject().CN = self.certname
- self.cert.gmtime_adj_notBefore(0) # now
- self.cert.gmtime_adj_notAfter(100*365*24*60*60) # 100 yrs in future
- self.cert.set_issuer(self.cert.get_subject())
- self.cert.set_pubkey(self.key)
- self.cert.add_extensions([
- OpenSSL.crypto.X509Extension(b"basicConstraints",
- True,
- b"CA:TRUE, pathlen:0"),
-
- OpenSSL.crypto.X509Extension(b"keyUsage",
- True,
- b"keyCertSign, cRLSign"),
-
- OpenSSL.crypto.X509Extension(b"subjectKeyIdentifier",
- False,
- b"hash",
- subject=self.cert),
- ])
- self.cert.sign(self.key, "sha1")
-
- with open(self.ca_file, 'wb+') as f:
- f.write(OpenSSL.crypto.dump_privatekey(OpenSSL.SSL.FILETYPE_PEM,
- self.key))
-
- f.write(OpenSSL.crypto.dump_certificate(OpenSSL.SSL.FILETYPE_PEM,
- self.cert))
-
- def _read_ca(self, filename):
- with open(filename) as cert_fh:
- self.cert = OpenSSL.crypto.load_certificate(
- OpenSSL.SSL.FILETYPE_PEM, cert_fh.read())
-
- cert_fh.seek(0)
-
- self.key = OpenSSL.crypto.load_privatekey(
- OpenSSL.SSL.FILETYPE_PEM, cert_fh.read())
-
- def __getitem__(self, cn):
- cnp = os.path.sep.join([self.certs_dir, '%s.pem' % cn])
- if not os.path.exists(cnp):
- # create certificate
- key = OpenSSL.crypto.PKey()
- key.generate_key(OpenSSL.crypto.TYPE_RSA, 2048)
-
- # Generate CSR
- req = OpenSSL.crypto.X509Req()
- req.get_subject().CN = cn
- req.set_pubkey(key)
- req.sign(key, 'sha1')
-
- # Sign CSR
- cert = OpenSSL.crypto.X509()
- cert.set_subject(req.get_subject())
- cert.set_serial_number(random.randint(0,2**64-1))
- cert.gmtime_adj_notBefore(0)
- cert.gmtime_adj_notAfter(10*365*24*60*60)
- cert.set_issuer(self.cert.get_subject())
- cert.set_pubkey(req.get_pubkey())
- cert.sign(self.key, 'sha1')
-
- with open(cnp, 'wb+') as f:
- f.write(OpenSSL.crypto.dump_privatekey(
- OpenSSL.SSL.FILETYPE_PEM, key))
- f.write(OpenSSL.crypto.dump_certificate(
- OpenSSL.SSL.FILETYPE_PEM, cert))
-
- return cnp
diff --git a/pywb/framework/certauth.py b/pywb/framework/certauth.py
new file mode 100644
index 00000000..0ce7cec3
--- /dev/null
+++ b/pywb/framework/certauth.py
@@ -0,0 +1,201 @@
+import logging
+import os
+from OpenSSL import crypto
+from OpenSSL.SSL import FILETYPE_PEM
+import random
+from argparse import ArgumentParser
+
+
+#=================================================================
+# Duration of 100 years
+CERT_DURATION = 100 * 365 * 24 * 60 * 60
+
+CERTS_DIR = './pywb-certs/'
+
+CERT_NAME = 'pywb https proxy replay CA'
+
+CERT_CA_FILE = './pywb-ca.pem'
+
+
+#=================================================================
+class CertificateAuthority(object):
+ """
+ Utility class for signing individual certificate
+ with a root cert.
+
+ Static generate_ca_root() method for creating the root cert
+
+ All certs saved on filesystem. Individual certs are stored
+ in specified certs_dir and reused if previously created.
+ """
+
+ def __init__(self, ca_file, certs_dir):
+ if not ca_file:
+ ca_file = CERT_CA_FILE
+
+ if not certs_dir:
+ certs_dir = CERTS_DIR
+
+ self.ca_file = ca_file
+ self.certs_dir = certs_dir
+
+ # read previously created root cert
+ self.cert, self.key = self.read_pem(ca_file)
+
+ if not os.path.exists(certs_dir):
+ os.mkdir(certs_dir)
+
+ def get_cert_for_host(self, host, overwrite=False):
+ host_filename = os.path.sep.join([self.certs_dir, '%s.pem' % host])
+
+ if not overwrite and os.path.exists(host_filename):
+ return False, host_filename
+
+ self.generate_host_cert(host, self.cert, self.key, host_filename)
+ return True, host_filename
+
+ @staticmethod
+ def _make_cert(certname):
+ cert = crypto.X509()
+ cert.set_version(3)
+ cert.set_serial_number(random.randint(0, 2 ** 64 - 1))
+ cert.get_subject().CN = certname
+
+ cert.gmtime_adj_notBefore(0)
+ cert.gmtime_adj_notAfter(CERT_DURATION)
+ return cert
+
+ @staticmethod
+ def generate_ca_root(ca_file, certname=None, overwrite=False):
+ if not certname:
+ certname = CERT_NAME
+
+ if not ca_file:
+ ca_file = CERT_CA_FILE
+
+ if not overwrite and os.path.exists(ca_file):
+ cert, key = CertificateAuthority.read_pem(ca_file)
+ return False, cert, key
+
+ # Generate key
+ key = crypto.PKey()
+ key.generate_key(crypto.TYPE_RSA, 2048)
+
+ # Generate cert
+ cert = CertificateAuthority._make_cert(certname)
+
+ cert.set_issuer(cert.get_subject())
+ cert.set_pubkey(key)
+ cert.add_extensions([
+ crypto.X509Extension(b"basicConstraints",
+ True,
+ b"CA:TRUE, pathlen:0"),
+
+ crypto.X509Extension(b"keyUsage",
+ True,
+ b"keyCertSign, cRLSign"),
+
+ crypto.X509Extension(b"subjectKeyIdentifier",
+ False,
+ b"hash",
+ subject=cert),
+ ])
+ cert.sign(key, "sha1")
+
+ # Write cert + key
+ CertificateAuthority.write_pem(ca_file, cert, key)
+ return True, cert, key
+
+ @staticmethod
+ def generate_host_cert(host, root_cert, root_key, host_filename):
+ # Generate key
+ key = crypto.PKey()
+ key.generate_key(crypto.TYPE_RSA, 2048)
+
+ # Generate CSR
+ req = crypto.X509Req()
+ req.get_subject().CN = host
+ req.set_pubkey(key)
+ req.sign(key, 'sha1')
+
+ # Generate Cert
+ cert = CertificateAuthority._make_cert(host)
+
+ cert.set_issuer(root_cert.get_subject())
+ cert.set_pubkey(req.get_pubkey())
+ cert.sign(root_key, 'sha1')
+
+ # Write cert + key
+ CertificateAuthority.write_pem(host_filename, cert, key)
+ return cert, key
+
+ @staticmethod
+ def write_pem(filename, cert, key):
+ with open(filename, 'wb+') as f:
+ f.write(crypto.dump_privatekey(FILETYPE_PEM, key))
+
+ f.write(crypto.dump_certificate(FILETYPE_PEM, cert))
+
+ @staticmethod
+ def read_pem(filename):
+ with open(filename, 'r') as f:
+ cert = crypto.load_certificate(FILETYPE_PEM, f.read())
+ f.seek(0)
+ key = crypto.load_privatekey(FILETYPE_PEM, f.read())
+
+ return cert, key
+
+
+#=================================================================
+def main():
+ parser = ArgumentParser(description='Cert Auth Cert Maker')
+
+ parser.add_argument('output_file', help='path to certificate file')
+
+ parser.add_argument('-r', '--use-root',
+ help='use specified root cert to create signed cert')
+
+ parser.add_argument('-n', '--name', action='store', default=CERT_NAME,
+ help='name for root certificate')
+
+ parser.add_argument('-d', '--certs-dir', default=CERTS_DIR)
+
+ parser.add_argument('-f', '--force', action='store_true')
+
+ result = parser.parse_args()
+
+ overwrite = result.force
+
+ # Create a new signed certificate using specified root
+ if result.use_root:
+ certs_dir = result.certs_dir
+ ca = CertificateAuthority(ca_file=result.use_root,
+ certs_dir=result.certs_dir,
+ certname=result.name)
+
+ created, host_filename = ca.get_cert_for_host(result.output_file,
+ overwrite)
+
+ if created:
+ print ('Created new cert "' + host_filename +
+ '" signed by root cert ' +
+ result.use_root)
+ else:
+ print ('Cert "' + host_filename + '" already exists,' +
+ ' use -f to overwrite')
+
+ # Create new root certificate
+ else:
+ created, c, k = (CertificateAuthority.
+ generate_ca_root(result.output_file,
+ result.name,
+ overwrite))
+
+ if created:
+ print 'Created new root cert: "' + result.output_file + '"'
+ else:
+ print ('Root cert "' + result.output_file + '" already exists,' +
+ ' use -f to overwrite')
+
+if __name__ == "__main__":
+ main()
diff --git a/pywb/framework/proxy.py b/pywb/framework/proxy.py
index 202e4f3b..fdfb8ac1 100644
--- a/pywb/framework/proxy.py
+++ b/pywb/framework/proxy.py
@@ -12,7 +12,7 @@ from pywb.rewrite.url_rewriter import HttpsUrlRewriter
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.wbexception import BadRequestException
-from certa import CertificateAuthority
+from certauth import CertificateAuthority
#=================================================================
@@ -68,8 +68,19 @@ class ProxyRouter(object):
self.unaltered = proxy_options.get('unaltered_replay', False)
- self.ca = CertificateAuthority()
+ if proxy_options.get('enable_https_proxy'):
+ ca_file = proxy_options.get('root_ca_file')
+ # attempt to create the root_ca_file if doesn't exist
+ # (generally recommended to create this seperately)
+ certname = proxy_options.get('root_ca_name')
+ CertificateAuthority.generate_ca_root(certname, ca_file)
+
+ certs_dir = proxy_options.get('certs_dir')
+ self.ca = CertificateAuthority(ca_file=ca_file,
+ certs_dir=certs_dir)
+ else:
+ self.ca = None
def __call__(self, env):
is_https = (env['REQUEST_METHOD'] == 'CONNECT')
@@ -119,7 +130,9 @@ class ProxyRouter(object):
# do connect, then get updated url
if is_https:
- self.handle_connect(env)
+ response = self.handle_connect(env)
+ if response:
+ return response
url = env['REL_REQUEST_URI']
@@ -142,14 +155,23 @@ class ProxyRouter(object):
return route.handler(wbrequest)
def get_request_socket(self, env):
+ if not self.ca:
+ return None
+
+ sock = None
+
if env.get('uwsgi.version'):
- import uwsgi
- fd = uwsgi.connection_fd()
- conn = socket.fromfd(fd, socket.AF_INET, socket.SOCK_STREAM)
- sock = socket.socket(_sock=conn)
+ try:
+ import uwsgi
+ fd = uwsgi.connection_fd()
+ conn = socket.fromfd(fd, socket.AF_INET, socket.SOCK_STREAM)
+ sock = socket.socket(_sock=conn)
+ except Exception:
+ pass
elif env.get('gunicorn.socket'):
sock = env['gunicorn.socket']
- else:
+
+ if not sock:
# attempt to find socket from wsgi.input
input_ = env.get('wsgi.input')
if input_ and hasattr(input_, '_sock'):
@@ -168,9 +190,11 @@ class ProxyRouter(object):
sock.send('\r\n')
hostname = env['REL_REQUEST_URI'].split(':')[0]
+ created, certfile = self.ca.get_cert_for_host(hostname)
- ssl_sock = ssl.wrap_socket(sock, server_side=True,
- certfile=self.ca[hostname])
+ ssl_sock = ssl.wrap_socket(sock,
+ server_side=True,
+ certfile=certfile)
#ssl_version=ssl.PROTOCOL_SSLv23)
env['pywb.proxy_ssl_sock'] = ssl_sock
From 6234d795dcd242654362a46a18a410007942b0ee Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Mon, 28 Jul 2014 11:52:54 -0700
Subject: [PATCH 09/26] proxy improvements: refactor coll selector into
BaseCollSelector, supporting either proxy auth or cookie-based selection (in
progress) https proxy: support POST requests, properly read http header and
wrap remainder in wsgi.input https proxy: properly update wsgi for wrapped
request wbrequestresponse: add content-length 0 to redir_response
---
pywb/framework/proxy.py | 296 +++++++++++++++---
pywb/framework/test/test_wbrequestresponse.py | 2 +-
pywb/framework/wbrequestresponse.py | 11 +-
pywb/framework/wsgi_wrappers.py | 2 +-
pywb/webapp/pywb_init.py | 6 +-
5 files changed, 260 insertions(+), 57 deletions(-)
diff --git a/pywb/framework/proxy.py b/pywb/framework/proxy.py
index fdfb8ac1..386927ca 100644
--- a/pywb/framework/proxy.py
+++ b/pywb/framework/proxy.py
@@ -12,6 +12,8 @@ from pywb.rewrite.url_rewriter import HttpsUrlRewriter
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.wbexception import BadRequestException
+from pywb.utils.bufferedreaders import BufferedReader
+
from certauth import CertificateAuthority
@@ -51,8 +53,10 @@ class ProxyRouter(object):
for more details.
"""
+ PAC_PATH = '/proxy.pac'
+ BLOCK_SIZE = 4096
+
def __init__(self, routes, **kwargs):
- self.routes = routes
self.hostpaths = kwargs.get('hostpaths')
self.error_view = kwargs.get('error_view')
@@ -61,13 +65,14 @@ class ProxyRouter(object):
if proxy_options:
proxy_options = proxy_options.get('proxy_options', {})
- self.auth_msg = proxy_options.get('auth_msg',
- 'Please enter name of a collection to use for proxy mode')
-
- self.use_default_coll = proxy_options.get('use_default_coll', True)
+ self.resolver = ProxyAuthResolver(routes, proxy_options)
+ #self.resolver = CookieResolver(routes, proxy_options)
self.unaltered = proxy_options.get('unaltered_replay', False)
+ self.proxy_pac_path = proxy_options.get('pac_path', self.PAC_PATH)
+
+
if proxy_options.get('enable_https_proxy'):
ca_file = proxy_options.get('root_ca_file')
@@ -85,48 +90,23 @@ class ProxyRouter(object):
def __call__(self, env):
is_https = (env['REQUEST_METHOD'] == 'CONNECT')
+ # for non-https requests, check pac path and non-proxy urls
if not is_https:
url = env['REL_REQUEST_URI']
- if url.endswith('/proxy.pac'):
+ if url == self.proxy_pac_path:
return self.make_pac_response(env)
if not url.startswith(('http://', 'https://')):
return None
- proxy_auth = env.get('HTTP_PROXY_AUTHORIZATION')
+ env['pywb.proxy_scheme'] = 'https' if is_https else 'http'
- route = None
- coll = None
- matcher = None
-
- if proxy_auth:
- proxy_coll = self.read_basic_auth_coll(proxy_auth)
-
- if not proxy_coll:
- return self.proxy_auth_coll_response()
-
- proxy_coll = '/' + proxy_coll + '/'
-
- for r in self.routes:
- matcher, c = r.is_handling(proxy_coll)
- if matcher:
- route = r
- coll = c
- break
-
- if not route:
- return self.proxy_auth_coll_response()
-
- # if 'use_default_coll' or only one collection, use that
- # for proxy mode
- elif self.use_default_coll or len(self.routes) == 1:
- route = self.routes[0]
- coll = self.routes[0].regex.pattern
-
- # otherwise, require proxy auth 407 to select collection
- else:
- return self.proxy_auth_coll_response()
+ # check resolver, for pre connect resolve
+ if self.resolver.pre_connect:
+ route, coll, matcher, response = self.resolver.resolve(env)
+ if response:
+ return response
# do connect, then get updated url
if is_https:
@@ -136,6 +116,12 @@ class ProxyRouter(object):
url = env['REL_REQUEST_URI']
+ # check resolver, post connect
+ if not self.resolver.pre_connect:
+ route, coll, matcher, response = self.resolver.resolve(env)
+ if response:
+ return response
+
wbrequest = route.request_class(env,
request_uri=url,
wb_url_str=url,
@@ -189,20 +175,18 @@ class ProxyRouter(object):
sock.send('Server: pywb proxy\r\n')
sock.send('\r\n')
- hostname = env['REL_REQUEST_URI'].split(':')[0]
+ hostname, port = env['REL_REQUEST_URI'].split(':')
created, certfile = self.ca.get_cert_for_host(hostname)
ssl_sock = ssl.wrap_socket(sock,
server_side=True,
- certfile=certfile)
- #ssl_version=ssl.PROTOCOL_SSLv23)
+ certfile=certfile,
+ ciphers="ALL",
+ ssl_version=ssl.PROTOCOL_SSLv23)
env['pywb.proxy_ssl_sock'] = ssl_sock
- #todo: better reading of all headers
- buff = ssl_sock.recv(4096)
-
- buffreader = BytesIO(buff)
+ buffreader = BufferedReader(ssl_sock, block_size=self.BLOCK_SIZE)
statusline = buffreader.readline()
statusparts = statusline.split(' ')
@@ -217,23 +201,44 @@ class ProxyRouter(object):
env['SERVER_PROTOCOL'] = statusparts[2].strip()
+ env['SERVER_NAME'] = hostname
+ env['SERVER_PORT'] = port
+
queryparts = env['REL_REQUEST_URI'].split('?', 1)
env['PATH_INFO'] = queryparts[0]
env['QUERY_STRING'] = queryparts[1] if len(queryparts) > 1 else ''
- env['wsgi.input'] = socket._fileobject(ssl_sock, mode='r')
+ env['wsgi.url_scheme'] = 'https'
while True:
line = buffreader.readline()
+ if line:
+ line = line.rstrip()
+
if not line:
break
- parts = line.split(':')
+ parts = line.split(':', 1)
if len(parts) < 2:
continue
- name = 'HTTP_' + parts[0].replace('-', '_').upper()
- env[name] = parts[1]
+ name = parts[0].strip()
+ value = parts[1].strip()
+
+ name = name.replace('-', '_').upper()
+
+ if not name in ('CONTENT_LENGTH', 'CONTENT_TYPE'):
+ name = 'HTTP_' + name
+
+ env[name] = value
+
+ remain = buffreader.rem_length()
+ if remain > 0:
+ remainder = buffreader.read(self.BLOCK_SIZE)
+ input_ = socket._fileobject(ssl_sock, mode='r')
+ env['wsgi.input'] = BufferedReader(input_,
+ block_size=self.BLOCK_SIZE,
+ starting_data=remainder)
# Proxy Auto-Config (PAC) script for the proxy
def make_pac_response(self, env):
@@ -263,7 +268,73 @@ class ProxyRouter(object):
return WbResponse.text_response(buff, content_type=content_type)
- def proxy_auth_coll_response(self):
+
+#=================================================================
+class BaseCollResolver(object):
+ def __init__(self, routes, config):
+ self.routes = routes
+ self.pre_connect = config.get('pre_connect', False)
+ self.use_default_coll = config.get('use_default_coll', True)
+
+ def resolve(self, env):
+ route = None
+ coll = None
+ matcher = None
+
+ proxy_coll = self.get_proxy_coll(env)
+
+ # invalid parsing
+ if proxy_coll == '':
+ return None, None, None, self.select_coll_response(env)
+
+ if proxy_coll is None and isinstance(self.use_default_coll, str):
+ proxy_coll = self.use_default_coll
+
+ if proxy_coll:
+ proxy_coll = '/' + proxy_coll + '/'
+
+ for r in self.routes:
+ matcher, c = r.is_handling(proxy_coll)
+ if matcher:
+ route = r
+ coll = c
+ break
+
+ # if no match, return coll selection response
+ if not route:
+ return None, None, None, self.select_coll_response(env)
+
+ # if 'use_default_coll'
+ elif self.use_default_coll == True or len(self.routes) == 1:
+ route = self.routes[0]
+ coll = self.routes[0].path
+
+ # otherwise, return the appropriate coll selection response
+ else:
+ return None, None, None, self.select_coll_response(env)
+
+ return route, coll, matcher, None
+
+
+#=================================================================
+class ProxyAuthResolver(BaseCollResolver):
+ DEFAULT_MSG = 'Please enter name of a collection to use with proxy mode'
+
+ def __init__(self, routes, config):
+ config['pre_connect'] = True
+ super(ProxyAuthResolver, self).__init__(routes, config)
+ self.auth_msg = config.get('auth_msg', self.DEFAULT_MSG)
+
+ def get_proxy_coll(self, env):
+ proxy_auth = env.get('HTTP_PROXY_AUTHORIZATION')
+
+ if not proxy_auth:
+ return None
+
+ proxy_coll = self.read_basic_auth_coll(proxy_auth)
+ return proxy_coll
+
+ def select_coll_response(self, env):
proxy_msg = 'Basic realm="{0}"'.format(self.auth_msg)
headers = [('Content-Type', 'text/plain'),
@@ -286,3 +357,128 @@ class ProxyRouter(object):
user_pass = base64.b64decode(parts[1])
return user_pass.split(':')[0]
+
+
+#=================================================================
+class CookieResolver(BaseCollResolver):
+ def __init__(self, routes, config):
+ config['pre_connect'] = False
+ super(CookieResolver, self).__init__(routes, config)
+ self.magic_name = config.get('magic_name', 'pywb-proxy.com')
+ self.cookie_name = config.get('cookie_name', '__pywb_coll')
+ self.proxy_select_view = config.get('proxy_select_view')
+
+ def get_proxy_coll(self, env):
+ cookie = self.extract_client_cookie(env, self.cookie_name)
+ return cookie
+
+ def select_coll_response(self, env):
+ return self.make_magic_response('auto',
+ env['REL_REQUEST_URI'],
+ env)
+
+ def resolve(self, env):
+ url = env['REL_REQUEST_URI']
+
+ if ('.' + self.magic_name) in url:
+ return None, None, None, self.handle_magic_page(url, env)
+
+ return super(CookieResolver, self).resolve(env)
+
+ def handle_magic_page(self, url, env):
+ parts = urlparse.urlsplit(url)
+
+ path_url = parts.path[1:]
+ if parts.query:
+ path_url += '?' + parts.query
+
+ if parts.netloc.startswith('auto'):
+ coll = self.extract_client_cookie(env, self.cookie_name)
+
+ if coll:
+ return self.make_sethost_cookie_response(coll, path_url, env)
+ else:
+ return self.make_magic_response('select', path_url, env)
+
+ elif '.set.' in parts.netloc:
+ coll = parts.netloc.split('.', 1)[0]
+ headers = self.make_cookie_headers(coll, self.magic_name)
+
+ return self.make_sethost_cookie_response(coll, path_url, env,
+ headers=headers)
+
+ elif '.sethost.' in parts.netloc:
+ host_parts = parts.netloc.split('.', 1)
+ coll = host_parts[0]
+
+ inx = parts.netloc.find('.' + self.magic_name + '.')
+ domain = parts.netloc[inx + len(self.magic_name) + 2:]
+
+ headers = self.make_cookie_headers(coll, domain)
+
+ full_url = env['pywb.proxy_scheme'] + '://' + domain
+ full_url += '/' + path_url
+ return WbResponse.redir_response(full_url, headers=headers)
+
+ elif self.proxy_select_view:
+ route_temp = env['pywb.proxy_scheme'] + '://%s.set.'
+ route_temp += self.magic_name + '/' + path_url
+
+ return (self.proxy_select_view.
+ render_response(routes=self.routes,
+ route_temp=route_temp,
+ url=path_url))
+ else:
+ return WbResponse.text_response('select text for ' + path_url)
+
+ def make_cookie_headers(self, coll, domain):
+ cookie_val = '{0}={1}; Path=/; Domain=.{2}; HttpOnly'
+ cookie_val = cookie_val.format(self.cookie_name, coll, domain)
+ headers = [('Set-Cookie', cookie_val)]
+ return headers
+
+ def make_sethost_cookie_response(self, coll, path_url, env, headers=None):
+ path_parts = urlparse.urlsplit(path_url)
+
+ new_url = path_parts.path[1:]
+ if path_parts.query:
+ new_url += '?' + path_parts.query
+
+ return self.make_magic_response(coll + '.sethost', new_url, env,
+ suffix=path_parts.netloc,
+ headers=headers)
+
+
+ def make_magic_response(self, prefix, url, env,
+ suffix=None, headers=None):
+ full_url = env['pywb.proxy_scheme'] + '://' + prefix + '.'
+ full_url += self.magic_name
+ if suffix:
+ full_url += '.' + suffix
+ full_url += '/' + url
+ return WbResponse.redir_response(full_url, headers=headers)
+
+ @staticmethod
+ def extract_client_cookie(env, cookie_name):
+ cookie_header = env.get('HTTP_COOKIE')
+ if not cookie_header:
+ return None
+
+ # attempt to extract cookie_name only
+ inx = cookie_header.find(cookie_name)
+ if inx < 0:
+ return None
+
+ end_inx = cookie_header.find(';', inx)
+ if end_inx > 0:
+ value = cookie_header[inx:end_inx]
+ else:
+ value = cookie_header[inx:]
+
+ value = value.split('=')
+ if len(value) < 2:
+ return None
+
+ value = value[1].strip()
+ return value
+
diff --git a/pywb/framework/test/test_wbrequestresponse.py b/pywb/framework/test/test_wbrequestresponse.py
index 65940e4a..5bbb65b8 100644
--- a/pywb/framework/test/test_wbrequestresponse.py
+++ b/pywb/framework/test/test_wbrequestresponse.py
@@ -46,7 +46,7 @@
{'body': ['Test', 'Another'], 'status_headers': StatusAndHeaders(protocol = '', statusline = '404', headers = [('Content-Type', 'text/plain')])}
>>> WbResponse.redir_response('http://example.com/otherfile')
-{'body': [], 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [('Location', 'http://example.com/otherfile')])}
+{'body': [], 'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [('Location', 'http://example.com/otherfile'), ('Content-Length', '0')])}
"""
diff --git a/pywb/framework/wbrequestresponse.py b/pywb/framework/wbrequestresponse.py
index f2c63f9c..b17b3575 100644
--- a/pywb/framework/wbrequestresponse.py
+++ b/pywb/framework/wbrequestresponse.py
@@ -125,7 +125,7 @@ class WbRequest(object):
if not self.wb_url:
return
- mime = self.env.get('CONTENT_TYPE')
+ mime = self.env.get('CONTENT_TYPE').split(';')[0]
length = self.env.get('CONTENT_LENGTH')
stream = self.env['wsgi.input']
@@ -167,9 +167,12 @@ class WbResponse(object):
return WbResponse(status_headers, value=[text])
@staticmethod
- def redir_response(location, status='302 Redirect'):
- return WbResponse(StatusAndHeaders(status,
- [('Location', location)]))
+ def redir_response(location, status='302 Redirect', headers=None):
+ redir_headers = [('Location', location), ('Content-Length', '0')]
+ if headers:
+ redir_headers += headers
+
+ return WbResponse(StatusAndHeaders(status, redir_headers))
def __call__(self, env, start_response):
diff --git a/pywb/framework/wsgi_wrappers.py b/pywb/framework/wsgi_wrappers.py
index 1e1100e4..c8e7c86a 100644
--- a/pywb/framework/wsgi_wrappers.py
+++ b/pywb/framework/wsgi_wrappers.py
@@ -64,7 +64,7 @@ class WSGIApp(object):
env['pywb.proxy_statusline'] = statusline
- ssl_sock.write('HTTP/1.0 ' + statusline + '\r\n')
+ ssl_sock.write('HTTP/1.1 ' + statusline + '\r\n')
for name, value in headers:
ssl_sock.write(name + ': ' + value + '\r\n')
diff --git a/pywb/webapp/pywb_init.py b/pywb/webapp/pywb_init.py
index 2fd02377..6de8fafa 100644
--- a/pywb/webapp/pywb_init.py
+++ b/pywb/webapp/pywb_init.py
@@ -215,13 +215,17 @@ def create_wb_router(passed_config={}):
if hasattr(route.handler, 'resolve_refs'):
route.handler.resolve_refs(handler_dict)
-
# Check for new proxy mode!
if config.get('enable_http_proxy', False):
router = ProxyArchivalRouter
else:
router = ArchivalRouter
+ if config.get('proxy_select_html'):
+ temp = J2TemplateView.create_template(config.get('proxy_select_html'),
+ 'Proxy Coll Selector')
+ config.get('proxy_options')['proxy_select_view'] = temp
+
# Finally, create wb router
return router(
routes,
From ba61f23e40a7ecdf690a40a2dc3a4345533bc6a1 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Mon, 28 Jul 2014 15:22:22 -0700
Subject: [PATCH 10/26] proxy_resolvers: move resolvers to separate file,
default to ProxyAuthResolver (CookieResolver still work-in-progress)
---
pywb/framework/proxy.py | 219 +-----------------------------
pywb/framework/proxy_resolvers.py | 219 ++++++++++++++++++++++++++++++
2 files changed, 221 insertions(+), 217 deletions(-)
create mode 100644 pywb/framework/proxy_resolvers.py
diff --git a/pywb/framework/proxy.py b/pywb/framework/proxy.py
index 386927ca..a9cf6a66 100644
--- a/pywb/framework/proxy.py
+++ b/pywb/framework/proxy.py
@@ -6,16 +6,16 @@ import base64
import socket
import ssl
-from io import BytesIO
from pywb.rewrite.url_rewriter import HttpsUrlRewriter
-from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.wbexception import BadRequestException
from pywb.utils.bufferedreaders import BufferedReader
from certauth import CertificateAuthority
+from proxy_resolvers import ProxyAuthResolver
+
#=================================================================
class ProxyArchivalRouter(ArchivalRouter):
@@ -267,218 +267,3 @@ class ProxyRouter(object):
content_type = 'application/x-ns-proxy-autoconfig'
return WbResponse.text_response(buff, content_type=content_type)
-
-
-#=================================================================
-class BaseCollResolver(object):
- def __init__(self, routes, config):
- self.routes = routes
- self.pre_connect = config.get('pre_connect', False)
- self.use_default_coll = config.get('use_default_coll', True)
-
- def resolve(self, env):
- route = None
- coll = None
- matcher = None
-
- proxy_coll = self.get_proxy_coll(env)
-
- # invalid parsing
- if proxy_coll == '':
- return None, None, None, self.select_coll_response(env)
-
- if proxy_coll is None and isinstance(self.use_default_coll, str):
- proxy_coll = self.use_default_coll
-
- if proxy_coll:
- proxy_coll = '/' + proxy_coll + '/'
-
- for r in self.routes:
- matcher, c = r.is_handling(proxy_coll)
- if matcher:
- route = r
- coll = c
- break
-
- # if no match, return coll selection response
- if not route:
- return None, None, None, self.select_coll_response(env)
-
- # if 'use_default_coll'
- elif self.use_default_coll == True or len(self.routes) == 1:
- route = self.routes[0]
- coll = self.routes[0].path
-
- # otherwise, return the appropriate coll selection response
- else:
- return None, None, None, self.select_coll_response(env)
-
- return route, coll, matcher, None
-
-
-#=================================================================
-class ProxyAuthResolver(BaseCollResolver):
- DEFAULT_MSG = 'Please enter name of a collection to use with proxy mode'
-
- def __init__(self, routes, config):
- config['pre_connect'] = True
- super(ProxyAuthResolver, self).__init__(routes, config)
- self.auth_msg = config.get('auth_msg', self.DEFAULT_MSG)
-
- def get_proxy_coll(self, env):
- proxy_auth = env.get('HTTP_PROXY_AUTHORIZATION')
-
- if not proxy_auth:
- return None
-
- proxy_coll = self.read_basic_auth_coll(proxy_auth)
- return proxy_coll
-
- def select_coll_response(self, env):
- proxy_msg = 'Basic realm="{0}"'.format(self.auth_msg)
-
- headers = [('Content-Type', 'text/plain'),
- ('Proxy-Authenticate', proxy_msg)]
-
- status_headers = StatusAndHeaders('407 Proxy Authentication', headers)
-
- value = self.auth_msg
-
- return WbResponse(status_headers, value=[value])
-
- @staticmethod
- def read_basic_auth_coll(value):
- parts = value.split(' ')
- if parts[0].lower() != 'basic':
- return ''
-
- if len(parts) != 2:
- return ''
-
- user_pass = base64.b64decode(parts[1])
- return user_pass.split(':')[0]
-
-
-#=================================================================
-class CookieResolver(BaseCollResolver):
- def __init__(self, routes, config):
- config['pre_connect'] = False
- super(CookieResolver, self).__init__(routes, config)
- self.magic_name = config.get('magic_name', 'pywb-proxy.com')
- self.cookie_name = config.get('cookie_name', '__pywb_coll')
- self.proxy_select_view = config.get('proxy_select_view')
-
- def get_proxy_coll(self, env):
- cookie = self.extract_client_cookie(env, self.cookie_name)
- return cookie
-
- def select_coll_response(self, env):
- return self.make_magic_response('auto',
- env['REL_REQUEST_URI'],
- env)
-
- def resolve(self, env):
- url = env['REL_REQUEST_URI']
-
- if ('.' + self.magic_name) in url:
- return None, None, None, self.handle_magic_page(url, env)
-
- return super(CookieResolver, self).resolve(env)
-
- def handle_magic_page(self, url, env):
- parts = urlparse.urlsplit(url)
-
- path_url = parts.path[1:]
- if parts.query:
- path_url += '?' + parts.query
-
- if parts.netloc.startswith('auto'):
- coll = self.extract_client_cookie(env, self.cookie_name)
-
- if coll:
- return self.make_sethost_cookie_response(coll, path_url, env)
- else:
- return self.make_magic_response('select', path_url, env)
-
- elif '.set.' in parts.netloc:
- coll = parts.netloc.split('.', 1)[0]
- headers = self.make_cookie_headers(coll, self.magic_name)
-
- return self.make_sethost_cookie_response(coll, path_url, env,
- headers=headers)
-
- elif '.sethost.' in parts.netloc:
- host_parts = parts.netloc.split('.', 1)
- coll = host_parts[0]
-
- inx = parts.netloc.find('.' + self.magic_name + '.')
- domain = parts.netloc[inx + len(self.magic_name) + 2:]
-
- headers = self.make_cookie_headers(coll, domain)
-
- full_url = env['pywb.proxy_scheme'] + '://' + domain
- full_url += '/' + path_url
- return WbResponse.redir_response(full_url, headers=headers)
-
- elif self.proxy_select_view:
- route_temp = env['pywb.proxy_scheme'] + '://%s.set.'
- route_temp += self.magic_name + '/' + path_url
-
- return (self.proxy_select_view.
- render_response(routes=self.routes,
- route_temp=route_temp,
- url=path_url))
- else:
- return WbResponse.text_response('select text for ' + path_url)
-
- def make_cookie_headers(self, coll, domain):
- cookie_val = '{0}={1}; Path=/; Domain=.{2}; HttpOnly'
- cookie_val = cookie_val.format(self.cookie_name, coll, domain)
- headers = [('Set-Cookie', cookie_val)]
- return headers
-
- def make_sethost_cookie_response(self, coll, path_url, env, headers=None):
- path_parts = urlparse.urlsplit(path_url)
-
- new_url = path_parts.path[1:]
- if path_parts.query:
- new_url += '?' + path_parts.query
-
- return self.make_magic_response(coll + '.sethost', new_url, env,
- suffix=path_parts.netloc,
- headers=headers)
-
-
- def make_magic_response(self, prefix, url, env,
- suffix=None, headers=None):
- full_url = env['pywb.proxy_scheme'] + '://' + prefix + '.'
- full_url += self.magic_name
- if suffix:
- full_url += '.' + suffix
- full_url += '/' + url
- return WbResponse.redir_response(full_url, headers=headers)
-
- @staticmethod
- def extract_client_cookie(env, cookie_name):
- cookie_header = env.get('HTTP_COOKIE')
- if not cookie_header:
- return None
-
- # attempt to extract cookie_name only
- inx = cookie_header.find(cookie_name)
- if inx < 0:
- return None
-
- end_inx = cookie_header.find(';', inx)
- if end_inx > 0:
- value = cookie_header[inx:end_inx]
- else:
- value = cookie_header[inx:]
-
- value = value.split('=')
- if len(value) < 2:
- return None
-
- value = value[1].strip()
- return value
-
diff --git a/pywb/framework/proxy_resolvers.py b/pywb/framework/proxy_resolvers.py
new file mode 100644
index 00000000..b4bfe840
--- /dev/null
+++ b/pywb/framework/proxy_resolvers.py
@@ -0,0 +1,219 @@
+from wbrequestresponse import WbResponse, WbRequest
+from pywb.utils.statusandheaders import StatusAndHeaders
+import urlparse
+import base64
+
+
+#=================================================================
+class BaseCollResolver(object):
+ def __init__(self, routes, config):
+ self.routes = routes
+ self.pre_connect = config.get('pre_connect', False)
+ self.use_default_coll = config.get('use_default_coll', True)
+
+ def resolve(self, env):
+ route = None
+ coll = None
+ matcher = None
+
+ proxy_coll = self.get_proxy_coll(env)
+
+ # invalid parsing
+ if proxy_coll == '':
+ return None, None, None, self.select_coll_response(env)
+
+ if proxy_coll is None and isinstance(self.use_default_coll, str):
+ proxy_coll = self.use_default_coll
+
+ if proxy_coll:
+ proxy_coll = '/' + proxy_coll + '/'
+
+ for r in self.routes:
+ matcher, c = r.is_handling(proxy_coll)
+ if matcher:
+ route = r
+ coll = c
+ break
+
+ # if no match, return coll selection response
+ if not route:
+ return None, None, None, self.select_coll_response(env)
+
+ # if 'use_default_coll'
+ elif self.use_default_coll == True or len(self.routes) == 1:
+ route = self.routes[0]
+ coll = self.routes[0].path
+
+ # otherwise, return the appropriate coll selection response
+ else:
+ return None, None, None, self.select_coll_response(env)
+
+ return route, coll, matcher, None
+
+
+#=================================================================
+class ProxyAuthResolver(BaseCollResolver):
+ DEFAULT_MSG = 'Please enter name of a collection to use with proxy mode'
+
+ def __init__(self, routes, config):
+ config['pre_connect'] = True
+ super(ProxyAuthResolver, self).__init__(routes, config)
+ self.auth_msg = config.get('auth_msg', self.DEFAULT_MSG)
+
+ def get_proxy_coll(self, env):
+ proxy_auth = env.get('HTTP_PROXY_AUTHORIZATION')
+
+ if not proxy_auth:
+ return None
+
+ proxy_coll = self.read_basic_auth_coll(proxy_auth)
+ return proxy_coll
+
+ def select_coll_response(self, env):
+ proxy_msg = 'Basic realm="{0}"'.format(self.auth_msg)
+
+ headers = [('Content-Type', 'text/plain'),
+ ('Proxy-Authenticate', proxy_msg)]
+
+ status_headers = StatusAndHeaders('407 Proxy Authentication', headers)
+
+ value = self.auth_msg
+
+ return WbResponse(status_headers, value=[value])
+
+ @staticmethod
+ def read_basic_auth_coll(value):
+ parts = value.split(' ')
+ if parts[0].lower() != 'basic':
+ return ''
+
+ if len(parts) != 2:
+ return ''
+
+ user_pass = base64.b64decode(parts[1])
+ return user_pass.split(':')[0]
+
+
+#=================================================================
+# Experimental CookieResolver
+class CookieResolver(BaseCollResolver): # pragma: no cover
+ def __init__(self, routes, config):
+ config['pre_connect'] = False
+ super(CookieResolver, self).__init__(routes, config)
+ self.magic_name = config.get('magic_name', 'pywb-proxy.com')
+ self.cookie_name = config.get('cookie_name', '__pywb_coll')
+ self.proxy_select_view = config.get('proxy_select_view')
+
+ def get_proxy_coll(self, env):
+ cookie = self.extract_client_cookie(env, self.cookie_name)
+ return cookie
+
+ def select_coll_response(self, env):
+ return self.make_magic_response('auto',
+ env['REL_REQUEST_URI'],
+ env)
+
+ def resolve(self, env):
+ url = env['REL_REQUEST_URI']
+
+ if ('.' + self.magic_name) in url:
+ return None, None, None, self.handle_magic_page(url, env)
+
+ return super(CookieResolver, self).resolve(env)
+
+ def handle_magic_page(self, url, env):
+ parts = urlparse.urlsplit(url)
+
+ path_url = parts.path[1:]
+ if parts.query:
+ path_url += '?' + parts.query
+
+ if parts.netloc.startswith('auto'):
+ coll = self.extract_client_cookie(env, self.cookie_name)
+
+ if coll:
+ return self.make_sethost_cookie_response(coll, path_url, env)
+ else:
+ return self.make_magic_response('select', path_url, env)
+
+ elif '.set.' in parts.netloc:
+ coll = parts.netloc.split('.', 1)[0]
+ headers = self.make_cookie_headers(coll, self.magic_name)
+
+ return self.make_sethost_cookie_response(coll, path_url, env,
+ headers=headers)
+
+ elif '.sethost.' in parts.netloc:
+ host_parts = parts.netloc.split('.', 1)
+ coll = host_parts[0]
+
+ inx = parts.netloc.find('.' + self.magic_name + '.')
+ domain = parts.netloc[inx + len(self.magic_name) + 2:]
+
+ headers = self.make_cookie_headers(coll, domain)
+
+ full_url = env['pywb.proxy_scheme'] + '://' + domain
+ full_url += '/' + path_url
+ return WbResponse.redir_response(full_url, headers=headers)
+
+ elif self.proxy_select_view:
+ route_temp = env['pywb.proxy_scheme'] + '://%s.set.'
+ route_temp += self.magic_name + '/' + path_url
+
+ return (self.proxy_select_view.
+ render_response(routes=self.routes,
+ route_temp=route_temp,
+ url=path_url))
+ else:
+ return WbResponse.text_response('select text for ' + path_url)
+
+ def make_cookie_headers(self, coll, domain):
+ cookie_val = '{0}={1}; Path=/; Domain=.{2}; HttpOnly'
+ cookie_val = cookie_val.format(self.cookie_name, coll, domain)
+ headers = [('Set-Cookie', cookie_val)]
+ return headers
+
+ def make_sethost_cookie_response(self, coll, path_url, env, headers=None):
+ path_parts = urlparse.urlsplit(path_url)
+
+ new_url = path_parts.path[1:]
+ if path_parts.query:
+ new_url += '?' + path_parts.query
+
+ return self.make_magic_response(coll + '.sethost', new_url, env,
+ suffix=path_parts.netloc,
+ headers=headers)
+
+
+ def make_magic_response(self, prefix, url, env,
+ suffix=None, headers=None):
+ full_url = env['pywb.proxy_scheme'] + '://' + prefix + '.'
+ full_url += self.magic_name
+ if suffix:
+ full_url += '.' + suffix
+ full_url += '/' + url
+ return WbResponse.redir_response(full_url, headers=headers)
+
+ @staticmethod
+ def extract_client_cookie(env, cookie_name):
+ cookie_header = env.get('HTTP_COOKIE')
+ if not cookie_header:
+ return None
+
+ # attempt to extract cookie_name only
+ inx = cookie_header.find(cookie_name)
+ if inx < 0:
+ return None
+
+ end_inx = cookie_header.find(';', inx)
+ if end_inx > 0:
+ value = cookie_header[inx:end_inx]
+ else:
+ value = cookie_header[inx:]
+
+ value = value.split('=')
+ if len(value) < 2:
+ return None
+
+ value = value[1].strip()
+ return value
From 9c960269041e5edebba19cfc9735308e1ed91b9b Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Mon, 28 Jul 2014 16:06:01 -0700
Subject: [PATCH 11/26] proxy-cert-auth: add cli hook for 'proxy-cert-auth' for
creating root certs, tweak help
---
pywb/framework/certauth.py | 13 +++++++------
setup.py | 1 +
2 files changed, 8 insertions(+), 6 deletions(-)
diff --git a/pywb/framework/certauth.py b/pywb/framework/certauth.py
index 0ce7cec3..023754af 100644
--- a/pywb/framework/certauth.py
+++ b/pywb/framework/certauth.py
@@ -150,10 +150,11 @@ class CertificateAuthority(object):
def main():
parser = ArgumentParser(description='Cert Auth Cert Maker')
- parser.add_argument('output_file', help='path to certificate file')
+ parser.add_argument('output_pem_file', help='path to cert .pem file')
parser.add_argument('-r', '--use-root',
- help='use specified root cert to create signed cert')
+ help=('use specified root cert (.pem file) ' +
+ 'to create signed cert'))
parser.add_argument('-n', '--name', action='store', default=CERT_NAME,
help='name for root certificate')
@@ -173,7 +174,7 @@ def main():
certs_dir=result.certs_dir,
certname=result.name)
- created, host_filename = ca.get_cert_for_host(result.output_file,
+ created, host_filename = ca.get_cert_for_host(result.output_pem_file,
overwrite)
if created:
@@ -187,14 +188,14 @@ def main():
# Create new root certificate
else:
created, c, k = (CertificateAuthority.
- generate_ca_root(result.output_file,
+ generate_ca_root(result.output_pem_file,
result.name,
overwrite))
if created:
- print 'Created new root cert: "' + result.output_file + '"'
+ print 'Created new root cert: "' + result.output_pem_file + '"'
else:
- print ('Root cert "' + result.output_file + '" already exists,' +
+ print ('Root cert "' + result.output_pem_file + '" already exists,' +
' use -f to overwrite')
if __name__ == "__main__":
diff --git a/setup.py b/setup.py
index 45349981..2881d1e5 100755
--- a/setup.py
+++ b/setup.py
@@ -87,6 +87,7 @@ setup(
cdx-server = pywb.apps.cdx_server:main
cdx-indexer = pywb.warc.cdxindexer:main
live-rewrite-server = pywb.apps.live_rewrite_server:main
+ proxy-cert-auth = pywb.framework.certauth:main
""",
zip_safe=False,
classifiers=[
From 607ea1ccf0990266fb96cf136924675a5955e6e9 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Tue, 29 Jul 2014 12:23:41 -0700
Subject: [PATCH 12/26] proxy resolver: cookie resolver uses session cookies
proxy static handler: handled via proxy to support http/https use
'pywb.proxy' prefix for custom env settings
---
pywb/framework/proxy.py | 42 +++++++++--
pywb/framework/proxy_resolvers.py | 104 ++++++++++++++++++++++------
pywb/framework/wbrequestresponse.py | 10 ++-
pywb/framework/wsgi_wrappers.py | 12 ++--
4 files changed, 132 insertions(+), 36 deletions(-)
diff --git a/pywb/framework/proxy.py b/pywb/framework/proxy.py
index a9cf6a66..82218e20 100644
--- a/pywb/framework/proxy.py
+++ b/pywb/framework/proxy.py
@@ -14,7 +14,7 @@ from pywb.utils.bufferedreaders import BufferedReader
from certauth import CertificateAuthority
-from proxy_resolvers import ProxyAuthResolver
+from proxy_resolvers import ProxyAuthResolver, CookieResolver
#=================================================================
@@ -68,6 +68,8 @@ class ProxyRouter(object):
self.resolver = ProxyAuthResolver(routes, proxy_options)
#self.resolver = CookieResolver(routes, proxy_options)
+ self.magic_name = proxy_options.get('magic_name', 'pywb-proxy.com')
+
self.unaltered = proxy_options.get('unaltered_replay', False)
self.proxy_pac_path = proxy_options.get('pac_path', self.PAC_PATH)
@@ -100,7 +102,12 @@ class ProxyRouter(object):
if not url.startswith(('http://', 'https://')):
return None
- env['pywb.proxy_scheme'] = 'https' if is_https else 'http'
+ env['pywb.proxy_scheme'] = 'http'
+
+ route = None
+ coll = None
+ matcher = None
+ response = None
# check resolver, for pre connect resolve
if self.resolver.pre_connect:
@@ -115,6 +122,21 @@ class ProxyRouter(object):
return response
url = env['REL_REQUEST_URI']
+ else:
+ parts = urlparse.urlsplit(env['REL_REQUEST_URI'])
+ hostport = parts.netloc.split(':', 1)
+ env['pywb.proxy_host'] = hostport[0]
+ env['pywb.proxy_port'] = hostport[1] if len(hostport) == 2 else ''
+ env['pywb.proxy_req_uri'] = parts.path
+ if parts.query:
+ env['pywb.proxy_req_uri'] += '?' + parts.query
+
+ # static
+ static_prefix = 'static.' + self.magic_name
+
+ if env['pywb.proxy_host'] == static_prefix:
+ env['REL_REQUEST_URI'] = env['pywb.proxy_req_uri']
+ return None
# check resolver, post connect
if not self.resolver.pre_connect:
@@ -122,11 +144,14 @@ class ProxyRouter(object):
if response:
return response
+ host_prefix = env['pywb.proxy_scheme'] + '://' + static_prefix
+
wbrequest = route.request_class(env,
request_uri=url,
wb_url_str=url,
coll=coll,
- host_prefix=self.hostpaths[0],
+ # host_prefix=self.hostpaths[0],
+ host_prefix=host_prefix,
wburl_class=route.handler.get_wburl_type(),
urlrewriter_class=HttpsUrlRewriter,
use_abs_prefix=False,
@@ -136,7 +161,8 @@ class ProxyRouter(object):
route.apply_filters(wbrequest, matcher)
if self.unaltered:
- wbrequest.wb_url.mod = 'id_'
+ #wbrequest.wb_url.mod = 'id_'
+ wbrequest.wb_url.mod = 'bn_'
return route.handler(wbrequest)
@@ -201,14 +227,16 @@ class ProxyRouter(object):
env['SERVER_PROTOCOL'] = statusparts[2].strip()
- env['SERVER_NAME'] = hostname
- env['SERVER_PORT'] = port
+ env['pywb.proxy_scheme'] = 'https'
+
+ env['pywb.proxy_host'] = hostname
+ env['pywb.proxy_port'] = port
+ env['pywb.proxy_req_uri'] = statusparts[1]
queryparts = env['REL_REQUEST_URI'].split('?', 1)
env['PATH_INFO'] = queryparts[0]
env['QUERY_STRING'] = queryparts[1] if len(queryparts) > 1 else ''
- env['wsgi.url_scheme'] = 'https'
while True:
line = buffreader.readline()
diff --git a/pywb/framework/proxy_resolvers.py b/pywb/framework/proxy_resolvers.py
index b4bfe840..35c84c8a 100644
--- a/pywb/framework/proxy_resolvers.py
+++ b/pywb/framework/proxy_resolvers.py
@@ -2,6 +2,25 @@ from wbrequestresponse import WbResponse, WbRequest
from pywb.utils.statusandheaders import StatusAndHeaders
import urlparse
import base64
+import os
+
+try:
+ import uwsgi
+ uwsgi_cache = True
+except ImportError:
+ uwsgi_cache = False
+
+
+#=================================================================
+class UwsgiCache(object):
+ def __setitem__(self, item, value):
+ uwsgi.cache_update(item, value)
+
+ def __getitem__(self, item):
+ return uwsgi.cache_get(item)
+
+ def __contains__(self, item):
+ return uwsgi.cache_exists(item)
#=================================================================
@@ -104,9 +123,15 @@ class CookieResolver(BaseCollResolver): # pragma: no cover
self.cookie_name = config.get('cookie_name', '__pywb_coll')
self.proxy_select_view = config.get('proxy_select_view')
+ if uwsgi_cache:
+ print 'UWSGI CACHE'
+ self.cache = UwsgiCache()
+ else:
+ self.cache = {}
+
def get_proxy_coll(self, env):
- cookie = self.extract_client_cookie(env, self.cookie_name)
- return cookie
+ coll, sesh_id = self.get_coll(env)
+ return coll
def select_coll_response(self, env):
return self.make_magic_response('auto',
@@ -114,14 +139,15 @@ class CookieResolver(BaseCollResolver): # pragma: no cover
env)
def resolve(self, env):
- url = env['REL_REQUEST_URI']
+ server_name = env['pywb.proxy_host']
- if ('.' + self.magic_name) in url:
- return None, None, None, self.handle_magic_page(url, env)
+ if ('.' + self.magic_name) in server_name:
+ return None, None, None, self.handle_magic_page(env)
return super(CookieResolver, self).resolve(env)
- def handle_magic_page(self, url, env):
+ def handle_magic_page(self, env):
+ url = env['REL_REQUEST_URI']
parts = urlparse.urlsplit(url)
path_url = parts.path[1:]
@@ -129,58 +155,77 @@ class CookieResolver(BaseCollResolver): # pragma: no cover
path_url += '?' + parts.query
if parts.netloc.startswith('auto'):
- coll = self.extract_client_cookie(env, self.cookie_name)
+ coll, sesh_id = self.get_coll(env)
if coll:
- return self.make_sethost_cookie_response(coll, path_url, env)
+ return self.make_sethost_cookie_response(sesh_id, path_url, env)
else:
return self.make_magic_response('select', path_url, env)
elif '.set.' in parts.netloc:
- coll = parts.netloc.split('.', 1)[0]
- headers = self.make_cookie_headers(coll, self.magic_name)
+ old_sesh_id = self.extract_client_cookie(env, self.cookie_name)
+ sesh_id = self.create_renew_sesh_id(old_sesh_id)
- return self.make_sethost_cookie_response(coll, path_url, env,
+ if sesh_id != old_sesh_id:
+ headers = self.make_cookie_headers(sesh_id, self.magic_name)
+ else:
+ headers = None
+
+ value, name, _ = parts.netloc.split('.', 2)
+
+ # set sesh value
+ self.cache[sesh_id] = value
+
+ return self.make_sethost_cookie_response(sesh_id, path_url, env,
headers=headers)
elif '.sethost.' in parts.netloc:
host_parts = parts.netloc.split('.', 1)
- coll = host_parts[0]
+ sesh_id = host_parts[0]
inx = parts.netloc.find('.' + self.magic_name + '.')
domain = parts.netloc[inx + len(self.magic_name) + 2:]
- headers = self.make_cookie_headers(coll, domain)
+ headers = self.make_cookie_headers(sesh_id, domain)
full_url = env['pywb.proxy_scheme'] + '://' + domain
full_url += '/' + path_url
return WbResponse.redir_response(full_url, headers=headers)
- elif self.proxy_select_view:
- route_temp = env['pywb.proxy_scheme'] + '://%s.set.'
+ elif 'select.' in parts.netloc:
+ if not self.proxy_select_view:
+ return WbResponse.text_response('select text for ' + path_url)
+
+ coll, sesh_id = self.get_coll(env)
+
+ route_temp = env['pywb.proxy_scheme'] + '://%s.coll.set.'
route_temp += self.magic_name + '/' + path_url
return (self.proxy_select_view.
render_response(routes=self.routes,
route_temp=route_temp,
+ coll=coll,
url=path_url))
- else:
- return WbResponse.text_response('select text for ' + path_url)
- def make_cookie_headers(self, coll, domain):
+ #else:
+ # msg = 'Invalid Magic Path: ' + url
+ # print msg
+ # return WbResponse.text_response(msg, status='404 Not Found')
+
+ def make_cookie_headers(self, sesh_id, domain):
cookie_val = '{0}={1}; Path=/; Domain=.{2}; HttpOnly'
- cookie_val = cookie_val.format(self.cookie_name, coll, domain)
+ cookie_val = cookie_val.format(self.cookie_name, sesh_id, domain)
headers = [('Set-Cookie', cookie_val)]
return headers
- def make_sethost_cookie_response(self, coll, path_url, env, headers=None):
+ def make_sethost_cookie_response(self, sesh_id, path_url, env, headers=None):
path_parts = urlparse.urlsplit(path_url)
new_url = path_parts.path[1:]
if path_parts.query:
new_url += '?' + path_parts.query
- return self.make_magic_response(coll + '.sethost', new_url, env,
+ return self.make_magic_response(sesh_id + '.sethost', new_url, env,
suffix=path_parts.netloc,
headers=headers)
@@ -194,6 +239,23 @@ class CookieResolver(BaseCollResolver): # pragma: no cover
full_url += '/' + url
return WbResponse.redir_response(full_url, headers=headers)
+ def get_coll(self, env):
+ sesh_id = self.extract_client_cookie(env, self.cookie_name)
+
+ coll = None
+ if sesh_id:
+ coll = self.cache[sesh_id]
+
+ return coll, sesh_id
+
+ def create_renew_sesh_id(self, sesh_id, force=False):
+ #if sesh_id in self.cache and not force:
+ if sesh_id and (sesh_id in self.cache) and not force:
+ return sesh_id
+
+ sesh_id = base64.b32encode(os.urandom(5)).lower()
+ return sesh_id
+
@staticmethod
def extract_client_cookie(env, cookie_name):
cookie_header = env.get('HTTP_COOKIE')
diff --git a/pywb/framework/wbrequestresponse.py b/pywb/framework/wbrequestresponse.py
index b17b3575..da456474 100644
--- a/pywb/framework/wbrequestresponse.py
+++ b/pywb/framework/wbrequestresponse.py
@@ -152,9 +152,13 @@ class WbResponse(object):
pass
@staticmethod
- def text_stream(stream, status='200 OK', content_type='text/plain'):
- status_headers = StatusAndHeaders(status,
- [('Content-Type', content_type)])
+ def text_stream(stream, status='200 OK', content_type='text/plain',
+ headers=None):
+ def_headers = [('Content-Type', content_type)]
+ if headers:
+ def_headers += headers
+
+ status_headers = StatusAndHeaders(status, def_headers)
return WbResponse(status_headers, value=stream)
diff --git a/pywb/framework/wsgi_wrappers.py b/pywb/framework/wsgi_wrappers.py
index c8e7c86a..2babc83f 100644
--- a/pywb/framework/wsgi_wrappers.py
+++ b/pywb/framework/wsgi_wrappers.py
@@ -77,8 +77,8 @@ class WSGIApp(object):
ssl_sock.write('\r\n')
for obj in resp_iter:
- ssl_sock.write(obj)
-
+ if obj:
+ ssl_sock.write(obj)
ssl_sock.close()
start_response(env['pywb.proxy_statusline'], [])
@@ -125,22 +125,24 @@ class WSGIApp(object):
else:
err_url = None
+ err_msg = exc.message.encode('utf-8')
+
if print_trace:
import traceback
err_details = traceback.format_exc(exc)
print err_details
else:
- logging.info(str(exc))
+ logging.info(err_msg)
err_details = None
if error_view:
return error_view.render_response(exc_type=type(exc).__name__,
- err_msg=str(exc),
+ err_msg=err_msg,
err_details=err_details,
status=status,
err_url=err_url)
else:
- return WbResponse.text_response(status + ' Error: ' + str(exc),
+ return WbResponse.text_response(status + ' Error: ' + err_msg,
status=status)
#=================================================================
From 96d9f4dcad9f93d032e8b007d6aa1497dce1e38b Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Wed, 30 Jul 2014 10:38:13 -0700
Subject: [PATCH 13/26] proxy mode: cookie-based selector using session-to-coll mapping;
ui: add proxy_selector html, add switch link to error and banner
---
pywb/framework/proxy.py | 4 ++--
pywb/framework/proxy_resolvers.py | 1 -
pywb/framework/wsgi_wrappers.py | 3 +++
pywb/static/wb.js | 4 ++++
pywb/ui/error.html | 7 +++++++
pywb/ui/head_insert.html | 3 ++-
pywb/ui/proxy_select.html | 25 +++++++++++++++++++++++++
pywb/webapp/pywb_init.py | 17 +++++++++++------
8 files changed, 54 insertions(+), 10 deletions(-)
create mode 100644 pywb/ui/proxy_select.html
diff --git a/pywb/framework/proxy.py b/pywb/framework/proxy.py
index 82218e20..e387bf4b 100644
--- a/pywb/framework/proxy.py
+++ b/pywb/framework/proxy.py
@@ -65,8 +65,8 @@ class ProxyRouter(object):
if proxy_options:
proxy_options = proxy_options.get('proxy_options', {})
- self.resolver = ProxyAuthResolver(routes, proxy_options)
- #self.resolver = CookieResolver(routes, proxy_options)
+ #self.resolver = ProxyAuthResolver(routes, proxy_options)
+ self.resolver = CookieResolver(routes, proxy_options)
self.magic_name = proxy_options.get('magic_name', 'pywb-proxy.com')
diff --git a/pywb/framework/proxy_resolvers.py b/pywb/framework/proxy_resolvers.py
index 35c84c8a..1b33be01 100644
--- a/pywb/framework/proxy_resolvers.py
+++ b/pywb/framework/proxy_resolvers.py
@@ -124,7 +124,6 @@ class CookieResolver(BaseCollResolver): # pragma: no cover
self.proxy_select_view = config.get('proxy_select_view')
if uwsgi_cache:
- print 'UWSGI CACHE'
self.cache = UwsgiCache()
else:
self.cache = {}
diff --git a/pywb/framework/wsgi_wrappers.py b/pywb/framework/wsgi_wrappers.py
index 2babc83f..85e23aaa 100644
--- a/pywb/framework/wsgi_wrappers.py
+++ b/pywb/framework/wsgi_wrappers.py
@@ -135,11 +135,14 @@ class WSGIApp(object):
logging.info(err_msg)
err_details = None
+ is_proxy_mode = env.get('pywb.proxy_host') is not None
+
if error_view:
return error_view.render_response(exc_type=type(exc).__name__,
err_msg=err_msg,
err_details=err_details,
status=status,
+ is_proxy_mode=is_proxy_mode,
err_url=err_url)
else:
return WbResponse.text_response(status + ' Error: ' + err_msg,
diff --git a/pywb/static/wb.js b/pywb/static/wb.js
index 0244cde8..4a23b03c 100644
--- a/pywb/static/wb.js
+++ b/pywb/static/wb.js
@@ -69,6 +69,10 @@ function init_banner() {
var capture_str = (wbinfo ? wbinfo.capture_str : "");
text += "" + capture_str + "";
+
+ if (wbinfo.is_proxy_mode && wbinfo.url) {
+ text += '
Switch Collection';
+ }
banner.innerHTML = text;
diff --git a/pywb/ui/error.html b/pywb/ui/error.html
index b3a8c478..d7231893 100644
--- a/pywb/ui/error.html
+++ b/pywb/ui/error.html
@@ -9,3 +9,10 @@
{% endif %}
+
+{% if is_proxy_mode and err_url and status == '404 Not Found' %}
+
+Try Different Collections
+
+{% endif %}
+
diff --git a/pywb/ui/head_insert.html b/pywb/ui/head_insert.html
index b1ff4a26..d9e1207b 100644
--- a/pywb/ui/head_insert.html
+++ b/pywb/ui/head_insert.html
@@ -2,7 +2,7 @@
{% if rule.js_rewrite_location and include_wombat %}
From 522ea87637e99b4762ddf941b8d188d685dc9bd7 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Thu, 31 Jul 2014 11:12:50 -0700
Subject: [PATCH 16/26] proxy: timestamp selection support! certauth: wildcard
support, use *.host wildcard for proxy certs whenever possible; ui: add coll
info/switch and calendar links to banner
---
pywb/framework/certauth.py | 30 ++++++--
pywb/framework/proxy.py | 52 +++++++++-----
pywb/framework/proxy_resolvers.py | 116 +++++++++++++++++++++---------
pywb/framework/wsgi_wrappers.py | 6 +-
pywb/rewrite/url_rewriter.py | 2 +-
pywb/static/wb.js | 10 ++-
pywb/ui/error.html | 4 +-
pywb/ui/head_insert.html | 3 +-
pywb/webapp/pywb_init.py | 2 +-
9 files changed, 160 insertions(+), 65 deletions(-)
diff --git a/pywb/framework/certauth.py b/pywb/framework/certauth.py
index 023754af..73b0d0e4 100644
--- a/pywb/framework/certauth.py
+++ b/pywb/framework/certauth.py
@@ -45,13 +45,15 @@ class CertificateAuthority(object):
if not os.path.exists(certs_dir):
os.mkdir(certs_dir)
- def get_cert_for_host(self, host, overwrite=False):
- host_filename = os.path.sep.join([self.certs_dir, '%s.pem' % host])
+ def get_cert_for_host(self, host, overwrite=False, wildcard=False):
+ host_filename = os.path.join(self.certs_dir, host) + '.pem'
if not overwrite and os.path.exists(host_filename):
return False, host_filename
- self.generate_host_cert(host, self.cert, self.key, host_filename)
+ self.generate_host_cert(host, self.cert, self.key, host_filename,
+ wildcard)
+
return True, host_filename
@staticmethod
@@ -107,7 +109,8 @@ class CertificateAuthority(object):
return True, cert, key
@staticmethod
- def generate_host_cert(host, root_cert, root_key, host_filename):
+ def generate_host_cert(host, root_cert, root_key, host_filename,
+ wildcard=False):
# Generate key
key = crypto.PKey()
key.generate_key(crypto.TYPE_RSA, 2048)
@@ -123,6 +126,19 @@ class CertificateAuthority(object):
cert.set_issuer(root_cert.get_subject())
cert.set_pubkey(req.get_pubkey())
+
+ if wildcard:
+ DNS = 'DNS:'
+ alt_hosts = [DNS + host,
+ DNS + '*.' + host]
+
+ alt_hosts = ', '.join(alt_hosts)
+
+ cert.add_extensions([
+ crypto.X509Extension('subjectAltName',
+ False,
+ alt_hosts)])
+
cert.sign(root_key, 'sha1')
# Write cert + key
@@ -163,6 +179,9 @@ def main():
parser.add_argument('-f', '--force', action='store_true')
+ parser.add_argument('-w', '--wildcard_cert', action='store_true',
+ help='add wildcard SAN to host: *., ')
+
result = parser.parse_args()
overwrite = result.force
@@ -170,12 +189,13 @@ def main():
# Create a new signed certificate using specified root
if result.use_root:
certs_dir = result.certs_dir
+ wildcard = result.wildcard
ca = CertificateAuthority(ca_file=result.use_root,
certs_dir=result.certs_dir,
certname=result.name)
created, host_filename = ca.get_cert_for_host(result.output_pem_file,
- overwrite)
+ overwrite, wildcard)
if created:
print ('Created new cert "' + host_filename +
diff --git a/pywb/framework/proxy.py b/pywb/framework/proxy.py
index ba6d3266..693e7bd0 100644
--- a/pywb/framework/proxy.py
+++ b/pywb/framework/proxy.py
@@ -76,7 +76,6 @@ class ProxyRouter(object):
else:
self.resolver = ProxyAuthResolver(routes, proxy_options)
- self.insert_banner = proxy_options.get('banner_only_replay', False)
self.unaltered = proxy_options.get('unaltered_replay', False)
self.proxy_pac_path = proxy_options.get('pac_path', self.PAC_PATH)
@@ -115,10 +114,11 @@ class ProxyRouter(object):
coll = None
matcher = None
response = None
+ ts = None
# check resolver, for pre connect resolve
if self.resolver.pre_connect:
- route, coll, matcher, response = self.resolver.resolve(env)
+ route, coll, matcher, response, ts = self.resolver.resolve(env)
if response:
return response
@@ -138,26 +138,36 @@ class ProxyRouter(object):
if parts.query:
env['pywb.proxy_req_uri'] += '?' + parts.query
- # select prefix
- env['pywb_proxy_select'] = 'select.' + self.magic_name
+ env['pywb_proxy_magic'] = self.magic_name
+ # route (static) and other resources to archival replay
if env['pywb.proxy_host'] == self.magic_name:
env['REL_REQUEST_URI'] = env['pywb.proxy_req_uri']
return None
# check resolver, post connect
if not self.resolver.pre_connect:
- route, coll, matcher, response = self.resolver.resolve(env)
+ route, coll, matcher, ts, response = self.resolver.resolve(env)
if response:
return response
host_prefix = env['pywb.proxy_scheme'] + '://' + self.magic_name
+ rel_prefix = ''
+
+ # special case for proxy calendar
+ if (env['pywb.proxy_host'] == 'query.' + self.magic_name):
+ url = env['pywb.proxy_req_uri'][1:]
+ rel_prefix = '/'
+
+ if ts is not None:
+ url = ts + '/' + url
wbrequest = route.request_class(env,
request_uri=url,
wb_url_str=url,
coll=coll,
host_prefix=host_prefix,
+ rel_prefix=rel_prefix,
wburl_class=route.handler.get_wburl_type(),
urlrewriter_class=HttpsUrlRewriter,
use_abs_prefix=False,
@@ -166,10 +176,10 @@ class ProxyRouter(object):
if matcher:
route.apply_filters(wbrequest, matcher)
- if self.insert_banner:
- wbrequest.wb_url.mod = 'bn_'
- elif self.unaltered:
+ if self.unaltered:
wbrequest.wb_url.mod = 'id_'
+ elif is_https:
+ wbrequest.wb_url.mod = 'bn_'
return route.handler(wbrequest)
@@ -209,13 +219,23 @@ class ProxyRouter(object):
sock.send('\r\n')
hostname, port = env['REL_REQUEST_URI'].split(':')
- created, certfile = self.ca.get_cert_for_host(hostname)
+ cert_host = hostname
- ssl_sock = ssl.wrap_socket(sock,
- server_side=True,
- certfile=certfile,
- ciphers="ALL",
- ssl_version=ssl.PROTOCOL_SSLv23)
+ host_parts = hostname.split('.', 1)
+ if len(host_parts) == 2 and '.' in host_parts[1]:
+ cert_host = host_parts[1]
+
+ created, certfile = self.ca.get_cert_for_host(cert_host,
+ wildcard=True)
+
+ try:
+ ssl_sock = ssl.wrap_socket(sock,
+ server_side=True,
+ certfile=certfile,
+ ciphers="ALL",
+ ssl_version=ssl.PROTOCOL_SSLv23)
+ except Exception as se:
+ raise BadRequestException(se.message)
env['pywb.proxy_ssl_sock'] = ssl_sock
@@ -244,7 +264,6 @@ class ProxyRouter(object):
env['PATH_INFO'] = queryparts[0]
env['QUERY_STRING'] = queryparts[1] if len(queryparts) > 1 else ''
-
while True:
line = buffreader.readline()
if line:
@@ -270,8 +289,7 @@ class ProxyRouter(object):
remain = buffreader.rem_length()
if remain > 0:
remainder = buffreader.read(self.BLOCK_SIZE)
- input_ = socket._fileobject(ssl_sock, mode='r')
- env['wsgi.input'] = BufferedReader(input_,
+ env['wsgi.input'] = BufferedReader(ssl_sock,
block_size=self.BLOCK_SIZE,
starting_data=remainder)
diff --git a/pywb/framework/proxy_resolvers.py b/pywb/framework/proxy_resolvers.py
index 9062bafd..8fb65b73 100644
--- a/pywb/framework/proxy_resolvers.py
+++ b/pywb/framework/proxy_resolvers.py
@@ -1,5 +1,7 @@
from wbrequestresponse import WbResponse, WbRequest
from pywb.utils.statusandheaders import StatusAndHeaders
+from pywb.rewrite.wburl import WbUrl
+
import urlparse
import base64
import os
@@ -22,6 +24,9 @@ class UwsgiCache(object):
def __contains__(self, item):
return uwsgi.cache_exists(item)
+ def __delitem__(self, item):
+ uwsgi.cache_del(item)
+
#=================================================================
class BaseCollResolver(object):
@@ -34,12 +39,13 @@ class BaseCollResolver(object):
route = None
coll = None
matcher = None
+ ts = None
- proxy_coll = self.get_proxy_coll(env)
+ proxy_coll, ts = self.get_proxy_coll_ts(env)
# invalid parsing
if proxy_coll == '':
- return None, None, None, self.select_coll_response(env)
+ return None, None, None, None, self.select_coll_response(env)
if proxy_coll is None and isinstance(self.use_default_coll, str):
proxy_coll = self.use_default_coll
@@ -56,7 +62,7 @@ class BaseCollResolver(object):
# if no match, return coll selection response
if not route:
- return None, None, None, self.select_coll_response(env)
+ return None, None, None, None, self.select_coll_response(env)
# if 'use_default_coll'
elif self.use_default_coll == True or len(self.routes) == 1:
@@ -65,9 +71,9 @@ class BaseCollResolver(object):
# otherwise, return the appropriate coll selection response
else:
- return None, None, None, self.select_coll_response(env)
+ return None, None, None, None, self.select_coll_response(env)
- return route, coll, matcher, None
+ return route, coll, matcher, ts, None
#=================================================================
@@ -79,14 +85,14 @@ class ProxyAuthResolver(BaseCollResolver):
super(ProxyAuthResolver, self).__init__(routes, config)
self.auth_msg = config.get('auth_msg', self.DEFAULT_MSG)
- def get_proxy_coll(self, env):
+ def get_proxy_coll_ts(self, env):
proxy_auth = env.get('HTTP_PROXY_AUTHORIZATION')
if not proxy_auth:
- return None
+ return None, None
proxy_coll = self.read_basic_auth_coll(proxy_auth)
- return proxy_coll
+ return proxy_coll, None
def select_coll_response(self, env):
proxy_msg = 'Basic realm="{0}"'.format(self.auth_msg)
@@ -120,6 +126,9 @@ class CookieResolver(BaseCollResolver): # pragma: no cover
config['pre_connect'] = False
super(CookieResolver, self).__init__(routes, config)
self.magic_name = config['magic_name']
+ self.sethost_prefix = '-sethost.' + self.magic_name + '.'
+ self.set_prefix = '-set.' + self.magic_name
+
self.cookie_name = config.get('cookie_name', '__pywb_coll')
self.proxy_select_view = config.get('proxy_select_view')
@@ -128,9 +137,9 @@ class CookieResolver(BaseCollResolver): # pragma: no cover
else:
self.cache = {}
- def get_proxy_coll(self, env):
- coll, sesh_id = self.get_coll(env)
- return coll
+ def get_proxy_coll_ts(self, env):
+ coll, ts, sesh_id = self.get_coll(env)
+ return coll, ts
def select_coll_response(self, env):
return self.make_magic_response('auto',
@@ -141,27 +150,44 @@ class CookieResolver(BaseCollResolver): # pragma: no cover
server_name = env['pywb.proxy_host']
if ('.' + self.magic_name) in server_name:
- return None, None, None, self.handle_magic_page(env)
+ response = self.handle_magic_page(env)
+ if response:
+ return None, None, None, None, response
return super(CookieResolver, self).resolve(env)
def handle_magic_page(self, env):
- url = env['REL_REQUEST_URI']
- parts = urlparse.urlsplit(url)
+ request_url = env['REL_REQUEST_URI']
+ parts = urlparse.urlsplit(request_url)
+ server_name = env['pywb.proxy_host']
path_url = parts.path[1:]
if parts.query:
path_url += '?' + parts.query
- if parts.netloc.startswith('auto'):
- coll, sesh_id = self.get_coll(env)
+ if server_name.startswith('auto'):
+ coll, ts, sesh_id = self.get_coll(env)
if coll:
return self.make_sethost_cookie_response(sesh_id, path_url, env)
else:
return self.make_magic_response('select', path_url, env)
- elif '.set.' in parts.netloc:
+ elif server_name.startswith('query.'):
+ wb_url = WbUrl(path_url)
+
+ # only dealing with specific timestamp setting
+ if wb_url.is_query():
+ return None
+
+ coll, ts, sesh_id = self.get_coll(env)
+ if not coll:
+ return self.make_magic_response('select', path_url, env)
+
+ self.set_ts(sesh_id, wb_url.timestamp)
+ return self.make_redir_response(wb_url.url)
+
+ elif server_name.endswith(self.set_prefix):
old_sesh_id = self.extract_client_cookie(env, self.cookie_name)
sesh_id = self.create_renew_sesh_id(old_sesh_id)
@@ -170,34 +196,33 @@ class CookieResolver(BaseCollResolver): # pragma: no cover
else:
headers = None
- value, name, _ = parts.netloc.split('.', 2)
+ coll = server_name[:-len(self.set_prefix)]
# set sesh value
- self.cache[sesh_id] = value
+ self.set_coll(sesh_id, coll)
return self.make_sethost_cookie_response(sesh_id, path_url, env,
headers=headers)
- elif '.sethost.' in parts.netloc:
- host_parts = parts.netloc.split('.', 1)
- sesh_id = host_parts[0]
+ elif self.sethost_prefix in server_name:
+ inx = server_name.find(self.sethost_prefix)
+ sesh_id = server_name[:inx]
- inx = parts.netloc.find('.' + self.magic_name + '.')
- domain = parts.netloc[inx + len(self.magic_name) + 2:]
+ domain = server_name[inx + len(self.sethost_prefix):]
headers = self.make_cookie_headers(sesh_id, domain)
full_url = env['pywb.proxy_scheme'] + '://' + domain
full_url += '/' + path_url
- return WbResponse.redir_response(full_url, headers=headers)
+ return self.make_redir_response(full_url, headers=headers)
- elif 'select.' in parts.netloc:
+ elif 'select.' in server_name:
if not self.proxy_select_view:
return WbResponse.text_response('select text for ' + path_url)
- coll, sesh_id = self.get_coll(env)
+ coll, ts, sesh_id = self.get_coll(env)
- route_temp = env['pywb.proxy_scheme'] + '://%s.coll.set.'
+ route_temp = env['pywb.proxy_scheme'] + '://%s-set.'
route_temp += self.magic_name + '/' + path_url
return (self.proxy_select_view.
@@ -217,14 +242,18 @@ class CookieResolver(BaseCollResolver): # pragma: no cover
headers = [('Set-Cookie', cookie_val)]
return headers
- def make_sethost_cookie_response(self, sesh_id, path_url, env, headers=None):
+ def make_sethost_cookie_response(self, sesh_id, path_url,
+ env, headers=None):
+ if '://' not in path_url:
+ path_url = 'http://' + path_url
+
path_parts = urlparse.urlsplit(path_url)
new_url = path_parts.path[1:]
if path_parts.query:
new_url += '?' + path_parts.query
- return self.make_magic_response(sesh_id + '.sethost', new_url, env,
+ return self.make_magic_response(sesh_id + '-sethost', new_url, env,
suffix=path_parts.netloc,
headers=headers)
@@ -236,25 +265,44 @@ class CookieResolver(BaseCollResolver): # pragma: no cover
if suffix:
full_url += '.' + suffix
full_url += '/' + url
- return WbResponse.redir_response(full_url, headers=headers)
+ return self.make_redir_response(full_url, headers=headers)
+
+ def set_coll(self, sesh_id, coll):
+ self.cache[sesh_id + ':c'] = coll
+
+ def set_ts(self, sesh_id, ts):
+ if ts:
+ self.cache[sesh_id + ':t'] = ts
+ # this ensures that omitting timestamp will reset to latest
+ # capture by deleting the cache entry
+ else:
+ del self.cache[sesh_id + ':t']
def get_coll(self, env):
sesh_id = self.extract_client_cookie(env, self.cookie_name)
coll = None
+ ts = None
if sesh_id:
- coll = self.cache[sesh_id]
+ coll = self.cache[sesh_id + ':c']
+ try:
+ ts = self.cache[sesh_id + ':t']
+ except KeyError:
+ pass
- return coll, sesh_id
+ return coll, ts, sesh_id
def create_renew_sesh_id(self, sesh_id, force=False):
#if sesh_id in self.cache and not force:
- if sesh_id and (sesh_id in self.cache) and not force:
+ if sesh_id and ((sesh_id + ':c') in self.cache) and not force:
return sesh_id
sesh_id = base64.b32encode(os.urandom(5)).lower()
return sesh_id
+ def make_redir_response(self, url, headers=None):
+ return WbResponse.redir_response(url, headers=headers)
+
@staticmethod
def extract_client_cookie(env, cookie_name):
cookie_header = env.get('HTTP_COOKIE')
diff --git a/pywb/framework/wsgi_wrappers.py b/pywb/framework/wsgi_wrappers.py
index d1a4f772..3498c819 100644
--- a/pywb/framework/wsgi_wrappers.py
+++ b/pywb/framework/wsgi_wrappers.py
@@ -125,7 +125,11 @@ class WSGIApp(object):
else:
err_url = None
- err_msg = exc.message.encode('utf-8')
+ try:
+ err_msg = exc.message.encode('utf-8')
+ except Exception:
+ err_msg = exc.message
+ err_url = ''
if print_trace:
import traceback
diff --git a/pywb/rewrite/url_rewriter.py b/pywb/rewrite/url_rewriter.py
index d5593a22..2679b4dc 100644
--- a/pywb/rewrite/url_rewriter.py
+++ b/pywb/rewrite/url_rewriter.py
@@ -144,7 +144,7 @@ class HttpsUrlRewriter(object):
else:
return url
- def get_timestamp_url(self, timestamp, url):
+ def get_timestamp_url(self, timestamp, url=''):
return url
def get_abs_url(self, url=''):
diff --git a/pywb/static/wb.js b/pywb/static/wb.js
index fb2c3ac3..f4267b8e 100644
--- a/pywb/static/wb.js
+++ b/pywb/static/wb.js
@@ -70,9 +70,13 @@ function init_banner() {
text += "" + capture_str + "";
- if (wbinfo.proxy_select && wbinfo.url) {
- full_url = wbinfo.proxy_select + "/" + wbinfo.url;
- text += '
Switch Collection';
+ if (wbinfo.proxy_magic && wbinfo.url) {
+ var select_url = wbinfo.proxy_magic + "/" + wbinfo.url;
+ var query_url = wbinfo.proxy_magic + "/*/" + wbinfo.url;
+ text += '
'
+ text += 'From ' + wbinfo.coll + ' [Switch]';
+ text += ' ';
+ text += 'View All Captures';
}
banner.innerHTML = text;
diff --git a/pywb/ui/error.html b/pywb/ui/error.html
index 6453e987..b122fc38 100644
--- a/pywb/ui/error.html
+++ b/pywb/ui/error.html
@@ -10,9 +10,9 @@
{% endif %}
-{% if env.pywb_proxy_select and err_url and status == '404 Not Found' %}
+{% if env.pywb_proxy_magic and err_url and status == '404 Not Found' %}
-Try Different Collections
+Try Different Collection
{% endif %}
diff --git a/pywb/ui/head_insert.html b/pywb/ui/head_insert.html
index 98330da9..f22ef55a 100644
--- a/pywb/ui/head_insert.html
+++ b/pywb/ui/head_insert.html
@@ -20,7 +20,8 @@
wbinfo.is_frame_mp = {{"true" if wbrequest.wb_url.mod == 'mp_' else "false"}};
wbinfo.canon_url = "{{ canon_url }}";
wbinfo.is_live = {{ "true" if cdx.is_live else "false" }};
- wbinfo.proxy_select = "{{ wbrequest.env.pywb_proxy_select }}";
+ wbinfo.coll = "{{ wbrequest.coll }}";
+ wbinfo.proxy_magic = "{{ wbrequest.env.pywb_proxy_magic }}";
diff --git a/pywb/webapp/pywb_init.py b/pywb/webapp/pywb_init.py
index 7cd62a79..3b3a3cc6 100644
--- a/pywb/webapp/pywb_init.py
+++ b/pywb/webapp/pywb_init.py
@@ -78,7 +78,7 @@ def create_live_handler(config):
#=================================================================
def init_route_config(value, config):
- if isinstance(value, str):
+ if isinstance(value, str) or isinstance(value, list):
value = dict(index_paths=value)
route_config = DictChain(value, config)
From cfe11a5ad383052e7224dca689b7e4040656b778 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Thu, 31 Jul 2014 11:56:43 -0700
Subject: [PATCH 17/26] fix typo param ordering
---
pywb/framework/proxy.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pywb/framework/proxy.py b/pywb/framework/proxy.py
index 693e7bd0..90ff2fd0 100644
--- a/pywb/framework/proxy.py
+++ b/pywb/framework/proxy.py
@@ -118,7 +118,7 @@ class ProxyRouter(object):
# check resolver, for pre connect resolve
if self.resolver.pre_connect:
- route, coll, matcher, response, ts = self.resolver.resolve(env)
+ route, coll, matcher, ts, response = self.resolver.resolve(env)
if response:
return response
From 407da7528ba70565da46dc6363745e962c1e9983 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Thu, 31 Jul 2014 17:02:26 -0700
Subject: [PATCH 18/26] proxy/rewrite: don't rewrite headers in banner_only mode
---
pywb/framework/proxy.py | 7 ++++++-
pywb/rewrite/header_rewriter.py | 16 +++++++++++-----
pywb/rewrite/rewrite_content.py | 4 ++++
3 files changed, 21 insertions(+), 6 deletions(-)
diff --git a/pywb/framework/proxy.py b/pywb/framework/proxy.py
index 90ff2fd0..ab322374 100644
--- a/pywb/framework/proxy.py
+++ b/pywb/framework/proxy.py
@@ -181,7 +181,12 @@ class ProxyRouter(object):
elif is_https:
wbrequest.wb_url.mod = 'bn_'
- return route.handler(wbrequest)
+ response = route.handler(wbrequest)
+
+ if wbrequest.wb_url and wbrequest.wb_url.is_replay():
+ response.status_headers.replace_header('Cache-Control', 'no-cache')
+
+ return response
def get_request_socket(self, env):
if not self.ca:
diff --git a/pywb/rewrite/header_rewriter.py b/pywb/rewrite/header_rewriter.py
index 2dfc824d..fd41eba8 100644
--- a/pywb/rewrite/header_rewriter.py
+++ b/pywb/rewrite/header_rewriter.py
@@ -37,7 +37,7 @@ class HeaderRewriter:
ENCODING_HEADERS = ['content-encoding']
- REMOVE_HEADERS = ['transfer-encoding']
+ REMOVE_HEADERS = ['transfer-encoding', 'content-security-policy']
PROXY_NO_REWRITE_HEADERS = ['content-length']
@@ -90,7 +90,10 @@ class HeaderRewriter:
new_headers = []
removed_header_dict = {}
- cookie_rewriter = urlrewriter.get_cookie_rewriter()
+ if urlrewriter:
+ cookie_rewriter = urlrewriter.get_cookie_rewriter()
+ else:
+ cookie_rewriter = None
for (name, value) in headers:
@@ -99,7 +102,7 @@ class HeaderRewriter:
if lowername in self.PROXY_HEADERS:
new_headers.append((name, value))
- elif lowername in self.URL_REWRITE_HEADERS:
+ elif urlrewriter and lowername in self.URL_REWRITE_HEADERS:
new_headers.append((name, urlrewriter.rewrite(value)))
elif lowername in self.ENCODING_HEADERS:
@@ -109,7 +112,8 @@ class HeaderRewriter:
new_headers.append((name, value))
elif lowername in self.REMOVE_HEADERS:
- removed_header_dict[lowername] = value
+ removed_header_dict[lowername] = value
+ new_headers.append((self.header_prefix + name, value))
elif (lowername in self.PROXY_NO_REWRITE_HEADERS and
not content_rewritten):
@@ -120,7 +124,9 @@ class HeaderRewriter:
cookie_list = cookie_rewriter.rewrite(value)
new_headers.extend(cookie_list)
- else:
+ elif urlrewriter:
new_headers.append((self.header_prefix + name, value))
+ else:
+ new_headers.append((name, value))
return (new_headers, removed_header_dict)
diff --git a/pywb/rewrite/rewrite_content.py b/pywb/rewrite/rewrite_content.py
index 93ec396b..e81fdf9a 100644
--- a/pywb/rewrite/rewrite_content.py
+++ b/pywb/rewrite/rewrite_content.py
@@ -63,6 +63,10 @@ class RewriteContent:
status_headers, stream = self.sanitize_content(headers, stream)
return (status_headers, self.stream_to_gen(stream), False)
+
+ if wb_url.is_banner_only:
+ urlrewriter = None
+
(rewritten_headers, stream) = self.rewrite_headers(urlrewriter,
headers,
stream)
From f5c27d7b068d9a61a8379eb9593353ce988e397d Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Thu, 31 Jul 2014 17:33:43 -0700
Subject: [PATCH 19/26] rewrite: fix header rewrite test; proxy_pac: use http
host header if available for proxy host
---
pywb/framework/proxy.py | 4 +---
pywb/rewrite/test/test_header_rewriter.py | 8 +++++---
2 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/pywb/framework/proxy.py b/pywb/framework/proxy.py
index ab322374..76cd8843 100644
--- a/pywb/framework/proxy.py
+++ b/pywb/framework/proxy.py
@@ -300,8 +300,7 @@ class ProxyRouter(object):
# Proxy Auto-Config (PAC) script for the proxy
def make_pac_response(self, env):
- import os
- hostname = os.environ.get('PYWB_HOST_NAME')
+ hostname = env.get('HTTP_HOST')
if not hostname:
server_hostport = env['SERVER_NAME'] + ':' + env['SERVER_PORT']
hostonly = env['SERVER_NAME']
@@ -319,7 +318,6 @@ class ProxyRouter(object):
buff += direct.format(hostonly)
- #buff += '\n return "PROXY {0}";\n}}\n'.format(self.hostpaths[0])
buff += '\n return "PROXY {0}";\n}}\n'.format(server_hostport)
content_type = 'application/x-ns-proxy-autoconfig'
diff --git a/pywb/rewrite/test/test_header_rewriter.py b/pywb/rewrite/test/test_header_rewriter.py
index 1a2b2cea..0b22d533 100644
--- a/pywb/rewrite/test/test_header_rewriter.py
+++ b/pywb/rewrite/test/test_header_rewriter.py
@@ -40,17 +40,19 @@ HTTP Headers Rewriting
'removed_header_dict': {'content-encoding': 'gzip',
'transfer-encoding': 'chunked'},
'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Content-Length', '199999'),
- ('Content-Type', 'text/javascript')]),
+ ('Content-Type', 'text/javascript'),
+ ('X-Archive-Orig-Transfer-Encoding', 'chunked')]),
'text_type': 'js'}
-# Binary -- transfer-encoding removed
+# Binary -- transfer-encoding rewritten
>>> _test_headers([('Content-Length', '200000'), ('Content-Type', 'image/png'), ('Set-Cookie', 'foo=bar; Path=/;'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')])
{'charset': None,
'removed_header_dict': {'transfer-encoding': 'chunked'},
'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('Content-Length', '200000'),
('Content-Type', 'image/png'),
('Set-Cookie', 'foo=bar; Path=/web/20131010/http://example.com/'),
- ('Content-Encoding', 'gzip')]),
+ ('Content-Encoding', 'gzip'),
+ ('X-Archive-Orig-Transfer-Encoding', 'chunked')]),
'text_type': None}
"""
From 2ca4757599a358ec6b06d8eb29cb2c145e2de81f Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Thu, 31 Jul 2014 18:03:18 -0700
Subject: [PATCH 20/26] fix integration test for proxy_pac
---
tests/test_integration.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tests/test_integration.py b/tests/test_integration.py
index 456d50f8..a3bd6f3b 100644
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -385,7 +385,7 @@ class TestWb:
assert resp.status_int == 407
def test_proxy_pac(self):
- resp = self.testapp.get('/proxy.pac', extra_environ = dict(SERVER_NAME='pywb-proxy', SERVER_PORT='8080'))
+ resp = self.testapp.get('/proxy.pac', headers = [('Host', 'pywb-proxy:8080')])
assert resp.content_type == 'application/x-ns-proxy-autoconfig'
assert '"PROXY pywb-proxy:8080"' in resp.body
assert '"localhost"' in resp.body
From 92daad3b2bbced9b377d5a41f7ef3fcb38a840c1 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Thu, 31 Jul 2014 18:56:35 -0700
Subject: [PATCH 21/26] ui: tweak head insert text for proxy
---
pywb/static/wb.js | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/pywb/static/wb.js b/pywb/static/wb.js
index f4267b8e..19d292c3 100644
--- a/pywb/static/wb.js
+++ b/pywb/static/wb.js
@@ -73,10 +73,9 @@ function init_banner() {
if (wbinfo.proxy_magic && wbinfo.url) {
var select_url = wbinfo.proxy_magic + "/" + wbinfo.url;
var query_url = wbinfo.proxy_magic + "/*/" + wbinfo.url;
+ text += ' All Capture Times';
text += '
'
- text += 'From ' + wbinfo.coll + ' [Switch]';
- text += ' ';
- text += 'View All Captures';
+ text += 'From collection ' + wbinfo.coll + ' All Collections';
}
banner.innerHTML = text;
From 37fd75f744234903ea5835c310cc2bc79cd01fe4 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Thu, 31 Jul 2014 21:17:07 -0700
Subject: [PATCH 22/26] update version to 0.6.0, update CHANGELIST, add quotes
around "coll" in header
---
CHANGES.rst | 8 ++++++++
README.rst | 9 +++++----
pywb/rewrite/header_rewriter.py | 3 ++-
pywb/static/wb.js | 2 +-
setup.py | 2 +-
5 files changed, 17 insertions(+), 7 deletions(-)
diff --git a/CHANGES.rst b/CHANGES.rst
index a7848d64..0ab917fb 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -1,3 +1,11 @@
+pywb 0.6.0 changelist
+~~~~~~~~~~~~~~~~~~~~~
+
+* HTTPS Proxy Support!
+
+* Revamped HTTP/S system: proxy collection and capture time switching via cookie!
+
+
pywb 0.5.1 changelist
~~~~~~~~~~~~~~~~~~~~~
minor fixes:
diff --git a/README.rst b/README.rst
index 6aa256ac..3640c69d 100644
--- a/README.rst
+++ b/README.rst
@@ -1,11 +1,11 @@
-PyWb 0.5.2
+PyWb 0.6.0
==========
-.. image:: https://travis-ci.org/ikreymer/pywb.png?branch=develop
+.. image:: https://travis-ci.org/ikreymer/pywb.png?branch=https-proxy
:target: https://travis-ci.org/ikreymer/pywb
-.. image:: https://coveralls.io/repos/ikreymer/pywb/badge.png?branch=develop
- :target: https://coveralls.io/r/ikreymer/pywb?branch=develop
+.. image:: https://coveralls.io/repos/ikreymer/pywb/badge.png?branch=https-proxy
+ :target: https://coveralls.io/r/ikreymer/pywb?branch=https-proxy
pywb is a python implementation of web archival replay tools, sometimes also known as 'Wayback Machine'.
@@ -21,6 +21,7 @@ This README contains a basic overview of using pywb. After reading this intro, c
* `pywb-samples `_ provides additional archive samples with difficult-to-replay content.
+* `pywb-proxy-demo `_ showcases the revamped HTTP/S proxy replay system (available from pywb 0.6.0)
The following deployed applications use pywb:
diff --git a/pywb/rewrite/header_rewriter.py b/pywb/rewrite/header_rewriter.py
index fd41eba8..2d505e88 100644
--- a/pywb/rewrite/header_rewriter.py
+++ b/pywb/rewrite/header_rewriter.py
@@ -37,7 +37,8 @@ class HeaderRewriter:
ENCODING_HEADERS = ['content-encoding']
- REMOVE_HEADERS = ['transfer-encoding', 'content-security-policy']
+ REMOVE_HEADERS = ['transfer-encoding', 'content-security-policy',
+ 'strict-transport-security']
PROXY_NO_REWRITE_HEADERS = ['content-length']
diff --git a/pywb/static/wb.js b/pywb/static/wb.js
index 19d292c3..3ef6471e 100644
--- a/pywb/static/wb.js
+++ b/pywb/static/wb.js
@@ -75,7 +75,7 @@ function init_banner() {
var query_url = wbinfo.proxy_magic + "/*/" + wbinfo.url;
text += ' All Capture Times';
text += '
'
- text += 'From collection ' + wbinfo.coll + ' All Collections';
+ text += 'From collection "' + wbinfo.coll + '" All Collections';
}
banner.innerHTML = text;
diff --git a/setup.py b/setup.py
index 2881d1e5..6b5482bf 100755
--- a/setup.py
+++ b/setup.py
@@ -34,7 +34,7 @@ class PyTest(TestCommand):
setup(
name='pywb',
- version='0.5.2',
+ version='0.6.0',
url='https://github.com/ikreymer/pywb',
author='Ilya Kreymer',
author_email='ikreymer@gmail.com',
From 48b1c7891772a3326ca114ac0d824dff32a627e2 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Thu, 31 Jul 2014 21:27:30 -0700
Subject: [PATCH 23/26] proxy: more banner tweaks
---
pywb/static/wb.js | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/pywb/static/wb.js b/pywb/static/wb.js
index 3ef6471e..d4db630e 100644
--- a/pywb/static/wb.js
+++ b/pywb/static/wb.js
@@ -73,9 +73,9 @@ function init_banner() {
if (wbinfo.proxy_magic && wbinfo.url) {
var select_url = wbinfo.proxy_magic + "/" + wbinfo.url;
var query_url = wbinfo.proxy_magic + "/*/" + wbinfo.url;
- text += ' All Capture Times';
+ text += ' All Capture Times';
text += '
'
- text += 'From collection "' + wbinfo.coll + '" All Collections';
+ text += 'From collection "' + wbinfo.coll + '" All Collections';
}
banner.innerHTML = text;
From aeb246466b0bec2a28284dd4f0504b9b277bea8a Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Fri, 1 Aug 2014 12:35:19 -0700
Subject: [PATCH 24/26] proxy: SSL version is 0-based not 1-based,
set_version(2) for version 3!
---
pywb/framework/certauth.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pywb/framework/certauth.py b/pywb/framework/certauth.py
index 73b0d0e4..ef47b380 100644
--- a/pywb/framework/certauth.py
+++ b/pywb/framework/certauth.py
@@ -59,7 +59,7 @@ class CertificateAuthority(object):
@staticmethod
def _make_cert(certname):
cert = crypto.X509()
- cert.set_version(3)
+ cert.set_version(2)
cert.set_serial_number(random.randint(0, 2 ** 64 - 1))
cert.get_subject().CN = certname
From 4efd2d514c4cafed5e5d22007ee1c1e9f1819332 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Fri, 1 Aug 2014 17:15:49 -0700
Subject: [PATCH 25/26] proxy: add proxy_cert download page at root
http://pywb.proxy, serving .pem and .p12 (windows only) certs for auto
installation
---
pywb/framework/certauth.py | 6 +++
pywb/framework/proxy.py | 76 +++++++++++++++++++++++++++-----
pywb/ui/proxy_cert_download.html | 14 ++++++
pywb/ui/proxy_select.html | 2 +-
pywb/webapp/pywb_init.py | 12 ++++-
5 files changed, 96 insertions(+), 14 deletions(-)
create mode 100644 pywb/ui/proxy_cert_download.html
diff --git a/pywb/framework/certauth.py b/pywb/framework/certauth.py
index ef47b380..260f5bdc 100644
--- a/pywb/framework/certauth.py
+++ b/pywb/framework/certauth.py
@@ -56,6 +56,12 @@ class CertificateAuthority(object):
return True, host_filename
+ def get_root_PKCS12(self):
+ p12 = crypto.PKCS12()
+ p12.set_certificate(self.cert)
+ p12.set_privatekey(self.key)
+ return p12.export()
+
@staticmethod
def _make_cert(certname):
cert = crypto.X509()
diff --git a/pywb/framework/proxy.py b/pywb/framework/proxy.py
index 76cd8843..fe8e3ec8 100644
--- a/pywb/framework/proxy.py
+++ b/pywb/framework/proxy.py
@@ -57,6 +57,9 @@ class ProxyRouter(object):
BLOCK_SIZE = 4096
DEF_MAGIC_NAME = 'pywb.proxy'
+ CERT_DL_PEM = '/pywb-ca.pem'
+ CERT_DL_P12 = '/pywb-ca.p12'
+
def __init__(self, routes, **kwargs):
self.hostpaths = kwargs.get('hostpaths')
@@ -81,19 +84,24 @@ class ProxyRouter(object):
self.proxy_pac_path = proxy_options.get('pac_path', self.PAC_PATH)
- if proxy_options.get('enable_https_proxy'):
- ca_file = proxy_options.get('root_ca_file')
-
- # attempt to create the root_ca_file if doesn't exist
- # (generally recommended to create this seperately)
- certname = proxy_options.get('root_ca_name')
- CertificateAuthority.generate_ca_root(certname, ca_file)
-
- certs_dir = proxy_options.get('certs_dir')
- self.ca = CertificateAuthority(ca_file=ca_file,
- certs_dir=certs_dir)
- else:
+ if not proxy_options.get('enable_https_proxy'):
self.ca = None
+ self.proxy_cert_dl_view = None
+ return
+
+ # HTTPS Only Options
+ ca_file = proxy_options.get('root_ca_file')
+
+ # attempt to create the root_ca_file if it doesn't exist
+ # (generally recommended to create this separately)
+ certname = proxy_options.get('root_ca_name')
+ CertificateAuthority.generate_ca_root(certname, ca_file)
+
+ certs_dir = proxy_options.get('certs_dir')
+ self.ca = CertificateAuthority(ca_file=ca_file,
+ certs_dir=certs_dir)
+
+ self.proxy_cert_dl_view = proxy_options.get('proxy_cert_download_view')
def __call__(self, env):
is_https = (env['REQUEST_METHOD'] == 'CONNECT')
@@ -143,6 +151,12 @@ class ProxyRouter(object):
# route (static) and other resources to archival replay
if env['pywb.proxy_host'] == self.magic_name:
env['REL_REQUEST_URI'] = env['pywb.proxy_req_uri']
+
+ # special case for proxy install
+ response = self.handle_cert_install(env)
+ if response:
+ return response
+
return None
# check resolver, post connect
@@ -298,6 +312,44 @@ class ProxyRouter(object):
block_size=self.BLOCK_SIZE,
starting_data=remainder)
+ def handle_cert_install(self, env):
+ if env['pywb.proxy_req_uri'] in ('/', '/index.html', '/index.html'):
+ available = (self.ca is not None)
+
+ if self.proxy_cert_dl_view:
+ return (self.proxy_cert_dl_view.
+ render_response(available=available,
+ pem_path=self.CERT_DL_PEM,
+ p12_path=self.CERT_DL_P12))
+ else:
+ return None
+
+ elif env['pywb.proxy_req_uri'] == self.CERT_DL_PEM:
+ if not self.ca:
+ return None
+
+ buff = ''
+ with open(self.ca.ca_file) as fh:
+ buff = fh.read()
+
+ content_type = 'application/x-x509-ca-cert'
+
+ return WbResponse.text_response(buff,
+ content_type=content_type)
+
+ elif env['pywb.proxy_req_uri'] == self.CERT_DL_P12:
+ if not self.ca:
+ return None
+
+ buff = self.ca.get_root_PKCS12()
+
+ content_type = 'application/x-pkcs12'
+
+ return WbResponse.text_response(buff,
+ content_type=content_type)
+ else:
+ return None
+
# Proxy Auto-Config (PAC) script for the proxy
def make_pac_response(self, env):
hostname = env.get('HTTP_HOST')
diff --git a/pywb/ui/proxy_cert_download.html b/pywb/ui/proxy_cert_download.html
new file mode 100644
index 00000000..71255e3a
--- /dev/null
+++ b/pywb/ui/proxy_cert_download.html
@@ -0,0 +1,14 @@
+HTTPS Certificate For PyWb Web Archive Replay
+{% if not available %}
+Sorry, HTTPS support is not configured for this proxy. However, the proxy should work in HTTP mode.
+{% else %}
+Download for all platforms (except Windows):
+Download Certificate (All except Windows)
+
+(If you see the Already Installed message, then no further action is necessary and you may start browsing!)
+{% endif %}
+
+Download for Windows platforms:
+Download Certificate (Windows Only)
+
+
diff --git a/pywb/ui/proxy_select.html b/pywb/ui/proxy_select.html
index a5164ff2..ff9afc00 100644
--- a/pywb/ui/proxy_select.html
+++ b/pywb/ui/proxy_select.html
@@ -14,7 +14,7 @@ Current collection is: {{ coll }}
{% for route in routes %}
-{% if route | is_wb_handler %}
+{% if route.path and route | is_wb_handler %}
- {{ route.path }}
{% endif %}
{% endfor %}
diff --git a/pywb/webapp/pywb_init.py b/pywb/webapp/pywb_init.py
index 3b3a3cc6..4c503be6 100644
--- a/pywb/webapp/pywb_init.py
+++ b/pywb/webapp/pywb_init.py
@@ -33,7 +33,9 @@ DEFAULTS = {
'search_html': 'ui/search.html',
'home_html': 'ui/index.html',
'error_html': 'ui/error.html',
+
'proxy_select_html': 'ui/proxy_select.html',
+ 'proxy_cert_download_html': 'ui/proxy_cert_download.html',
'template_globals': {'static_path': 'static/default'},
@@ -227,7 +229,15 @@ def create_wb_router(passed_config={}):
if not 'proxy_options' in passed_config:
passed_config['proxy_options'] = {}
- passed_config['proxy_options']['proxy_select_view'] = view
+ if view:
+ passed_config['proxy_options']['proxy_select_view'] = view
+
+ view = J2TemplateView.create_template(
+ config.get('proxy_cert_download_html'),
+ 'Proxy Cert Download')
+
+ if view:
+ passed_config['proxy_options']['proxy_cert_download_view'] = view
else:
router = ArchivalRouter
From 92726309fc956393f5979ee90a7eecf41aa33118 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Sat, 2 Aug 2014 04:27:51 -0700
Subject: [PATCH 26/26] proxy: add 'extra_headers' to be added to proxy
responses, customizable via proxy_options; defaults include no-cache and p3p
policy (needed for IE default settings); fix link generation for proxy_select
page; better exception handling of ssl errors
---
pywb/framework/proxy.py | 25 ++++++++++++++++++-------
pywb/framework/proxy_resolvers.py | 26 +++++++++++++++++++-------
pywb/ui/proxy_select.html | 2 +-
pywb/utils/statusandheaders.py | 20 +++++++++++++++++++-
4 files changed, 57 insertions(+), 16 deletions(-)
diff --git a/pywb/framework/proxy.py b/pywb/framework/proxy.py
index fe8e3ec8..57dd5088 100644
--- a/pywb/framework/proxy.py
+++ b/pywb/framework/proxy.py
@@ -60,6 +60,9 @@ class ProxyRouter(object):
CERT_DL_PEM = '/pywb-ca.pem'
CERT_DL_P12 = '/pywb-ca.p12'
+ EXTRA_HEADERS = {'cache-control': 'no-cache',
+ 'p3p': 'CP="NOI ADM DEV COM NAV OUR STP"'}
+
def __init__(self, routes, **kwargs):
self.hostpaths = kwargs.get('hostpaths')
@@ -74,6 +77,11 @@ class ProxyRouter(object):
self.magic_name = self.DEF_MAGIC_NAME
proxy_options['magic_name'] = self.magic_name
+ self.extra_headers = proxy_options.get('extra_headers')
+ if not self.extra_headers:
+ self.extra_headers = self.EXTRA_HEADERS
+ proxy_options['extra_headers'] = self.extra_headers
+
if proxy_options.get('cookie_resolver'):
self.resolver = CookieResolver(routes, proxy_options)
else:
@@ -198,7 +206,7 @@ class ProxyRouter(object):
response = route.handler(wbrequest)
if wbrequest.wb_url and wbrequest.wb_url.is_replay():
- response.status_headers.replace_header('Cache-Control', 'no-cache')
+ response.status_headers.replace_headers(self.extra_headers)
return response
@@ -252,19 +260,22 @@ class ProxyRouter(object):
server_side=True,
certfile=certfile,
ciphers="ALL",
+ suppress_ragged_eofs=False,
+ #ssl_version=ssl.PROTOCOL_TLSv1)
ssl_version=ssl.PROTOCOL_SSLv23)
+ env['pywb.proxy_ssl_sock'] = ssl_sock
+
+ buffreader = BufferedReader(ssl_sock, block_size=self.BLOCK_SIZE)
+
+ statusline = buffreader.readline().rstrip()
+
except Exception as se:
raise BadRequestException(se.message)
- env['pywb.proxy_ssl_sock'] = ssl_sock
-
- buffreader = BufferedReader(ssl_sock, block_size=self.BLOCK_SIZE)
-
- statusline = buffreader.readline()
statusparts = statusline.split(' ')
if len(statusparts) < 3:
- raise BadRequestException('Invalid Proxy Request')
+ raise BadRequestException('Invalid Proxy Request: ' + statusline)
env['REQUEST_METHOD'] = statusparts[0]
env['REL_REQUEST_URI'] = ('https://' +
diff --git a/pywb/framework/proxy_resolvers.py b/pywb/framework/proxy_resolvers.py
index 8fb65b73..dc7b22fe 100644
--- a/pywb/framework/proxy_resolvers.py
+++ b/pywb/framework/proxy_resolvers.py
@@ -132,6 +132,8 @@ class CookieResolver(BaseCollResolver): # pragma: no cover
self.cookie_name = config.get('cookie_name', '__pywb_coll')
self.proxy_select_view = config.get('proxy_select_view')
+ self.extra_headers = config.get('extra_headers')
+
if uwsgi_cache:
self.cache = UwsgiCache()
else:
@@ -222,14 +224,17 @@ class CookieResolver(BaseCollResolver): # pragma: no cover
coll, ts, sesh_id = self.get_coll(env)
- route_temp = env['pywb.proxy_scheme'] + '://%s-set.'
- route_temp += self.magic_name + '/' + path_url
+ #scheme = env['pywb.proxy_scheme'] + '://'
+ route_temp = '-set.' + self.magic_name + '/' + path_url
- return (self.proxy_select_view.
- render_response(routes=self.routes,
- route_temp=route_temp,
- coll=coll,
- url=path_url))
+ try:
+ return (self.proxy_select_view.
+ render_response(routes=self.routes,
+ route_temp=route_temp,
+ coll=coll,
+ url=path_url))
+ except Exception as exc:
+ raise
#else:
# msg = 'Invalid Magic Path: ' + url
@@ -301,6 +306,13 @@ class CookieResolver(BaseCollResolver): # pragma: no cover
return sesh_id
def make_redir_response(self, url, headers=None):
+ if not headers:
+ headers = []
+
+ if self.extra_headers:
+ for name, value in self.extra_headers.iteritems():
+ headers.append((name, value))
+
return WbResponse.redir_response(url, headers=headers)
@staticmethod
diff --git a/pywb/ui/proxy_select.html b/pywb/ui/proxy_select.html
index ff9afc00..b06f68a2 100644
--- a/pywb/ui/proxy_select.html
+++ b/pywb/ui/proxy_select.html
@@ -15,7 +15,7 @@ Current collection is: {{ coll }}
diff --git a/pywb/utils/statusandheaders.py b/pywb/utils/statusandheaders.py
index ae3fc261..70ba850c 100644
--- a/pywb/utils/statusandheaders.py
+++ b/pywb/utils/statusandheaders.py
@@ -3,6 +3,7 @@ Representation and parsing of HTTP-style status + headers
"""
import pprint
+from copy import copy
#=================================================================
@@ -44,9 +45,26 @@ class StatusAndHeaders(object):
self.headers.append((name, value))
return None
+ def replace_headers(self, header_dict):
+ """
+ replace all headers in header_dict that already exist
+ add any remaining headers
+ """
+ header_dict = copy(header_dict)
+
+ for index in xrange(len(self.headers) - 1, -1, -1):
+ curr_name, curr_value = self.headers[index]
+ name_lower = curr_name.lower()
+ if name_lower in header_dict:
+ self.headers[index] = (curr_name, header_dict[name_lower])
+ del header_dict[name_lower]
+
+ for name, value in header_dict.iteritems():
+ self.headers.append((name, value))
+
def remove_header(self, name):
"""
- remove header (case-insensitive)
+ Remove header (case-insensitive)
return True if header removed, False otherwise
"""
name_lower = name.lower()