diff --git a/CHANGES.rst b/CHANGES.rst
index cea9f087..2635c87d 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -1,3 +1,11 @@
+pywb 0.6.0 changelist
+~~~~~~~~~~~~~~~~~~~~~
+
+* HTTPS Proxy Support!
+
+* Revamped HTTP/S system: proxy collection and capture time switching via cookie!
+
+
pywb 0.5.3 changelist
~~~~~~~~~~~~~~~~~~~~~
* better framed replay for non-html content -- include live rewrite timestamp via temp 'pywb.timestamp' cookie, updating banner of iframe load. All timestamp formatting moved to client-side for better customization.
@@ -6,6 +14,7 @@ pywb 0.5.3 changelist
* banner-only rewrite mode (via 'bn_' modifier) to support only banner insertion with no rewriting, server-side or client-side.
+
pywb 0.5.1 changelist
~~~~~~~~~~~~~~~~~~~~~
minor fixes:
diff --git a/README.rst b/README.rst
index 078ad24b..3640c69d 100644
--- a/README.rst
+++ b/README.rst
@@ -1,11 +1,11 @@
-PyWb 0.5.3
+PyWb 0.6.0
==========
-.. image:: https://travis-ci.org/ikreymer/pywb.png?branch=develop
+.. image:: https://travis-ci.org/ikreymer/pywb.png?branch=https-proxy
:target: https://travis-ci.org/ikreymer/pywb
-.. image:: https://coveralls.io/repos/ikreymer/pywb/badge.png?branch=develop
- :target: https://coveralls.io/r/ikreymer/pywb?branch=develop
+.. image:: https://coveralls.io/repos/ikreymer/pywb/badge.png?branch=https-proxy
+ :target: https://coveralls.io/r/ikreymer/pywb?branch=https-proxy
pywb is a python implementation of web archival replay tools, sometimes also known as 'Wayback Machine'.
@@ -21,6 +21,7 @@ This README contains a basic overview of using pywb. After reading this intro, c
* `pywb-samples `_ provides additional archive samples with difficult-to-replay content.
+* `pywb-proxy-demo `_ showcases the revamped HTTP/S proxy replay system (available from pywb 0.6.0)
The following deployed applications use pywb:
diff --git a/config.yaml b/config.yaml
index 937b4545..fc2290ba 100644
--- a/config.yaml
+++ b/config.yaml
@@ -109,3 +109,6 @@ enable_memento: true
# Replay content in an iframe
framed_replay: true
+
+debug_echo_env: True
+
diff --git a/pywb/framework/certauth.py b/pywb/framework/certauth.py
new file mode 100644
index 00000000..260f5bdc
--- /dev/null
+++ b/pywb/framework/certauth.py
@@ -0,0 +1,228 @@
+import logging
+import os
+from OpenSSL import crypto
+from OpenSSL.SSL import FILETYPE_PEM
+import random
+from argparse import ArgumentParser
+
+
+#=================================================================
+# Validity period applied to every generated cert, in seconds (100 years);
+# passed to gmtime_adj_notAfter() when a cert is created
+CERT_DURATION = 100 * 365 * 24 * 60 * 60
+
+# Default directory for per-host certs, used when no certs_dir is given
+CERTS_DIR = './pywb-certs/'
+
+# Default subject common name (CN) of a newly generated root cert
+CERT_NAME = 'pywb https proxy replay CA'
+
+# Default path of the root cert .pem file, used when no ca_file is given
+CERT_CA_FILE = './pywb-ca.pem'
+
+
+#=================================================================
+class CertificateAuthority(object):
+    """
+    Utility class for signing individual host certificates
+    with a root (CA) certificate.
+
+    Use the static generate_ca_root() method to create the root cert.
+
+    All certs are saved on the filesystem as PEM files holding both the
+    private key and the certificate. Host certs are stored in the
+    specified certs_dir and reused if previously created.
+    """
+
+    def __init__(self, ca_file, certs_dir):
+        """
+        Load the root cert/key from ca_file and ensure certs_dir exists.
+
+        Empty or None arguments fall back to CERT_CA_FILE / CERTS_DIR.
+        The root cert file must already exist.
+        """
+        if not ca_file:
+            ca_file = CERT_CA_FILE
+
+        if not certs_dir:
+            certs_dir = CERTS_DIR
+
+        self.ca_file = ca_file
+        self.certs_dir = certs_dir
+
+        # read previously created root cert
+        self.cert, self.key = self.read_pem(ca_file)
+
+        # makedirs rather than mkdir, so a nested certs_dir can be created
+        if not os.path.exists(certs_dir):
+            os.makedirs(certs_dir)
+
+    def get_cert_for_host(self, host, overwrite=False, wildcard=False):
+        """
+        Return (created, filename) for the cert of the given host.
+
+        An existing certs_dir/host.pem is reused (created is False) unless
+        overwrite is set; otherwise a new cert signed by the root cert is
+        generated and written there (created is True).
+        """
+        # NOTE(review): host is joined into a filesystem path unchecked;
+        # callers passing client-supplied host names should validate them
+        host_filename = os.path.join(self.certs_dir, host) + '.pem'
+
+        if not overwrite and os.path.exists(host_filename):
+            return False, host_filename
+
+        self.generate_host_cert(host, self.cert, self.key, host_filename,
+                                wildcard)
+
+        return True, host_filename
+
+    def get_root_PKCS12(self):
+        """
+        Return the root cert together with its private key,
+        exported as PKCS12 data.
+        """
+        p12 = crypto.PKCS12()
+        p12.set_certificate(self.cert)
+        p12.set_privatekey(self.key)
+        return p12.export()
+
+    @staticmethod
+    def _make_cert(certname):
+        """
+        Create an unsigned X509 cert with a random serial number,
+        certname as subject CN, valid from now for CERT_DURATION seconds.
+        """
+        cert = crypto.X509()
+        cert.set_version(2)
+        cert.set_serial_number(random.randint(0, 2 ** 64 - 1))
+        cert.get_subject().CN = certname
+
+        cert.gmtime_adj_notBefore(0)
+        cert.gmtime_adj_notAfter(CERT_DURATION)
+        return cert
+
+    @staticmethod
+    def generate_ca_root(ca_file, certname=None, overwrite=False):
+        """
+        Create a self-signed root cert at ca_file, unless it already exists.
+
+        Empty or None ca_file / certname fall back to CERT_CA_FILE /
+        CERT_NAME. Returns (created, cert, key); created is False when an
+        existing file was loaded instead (overwrite not set).
+        """
+        if not certname:
+            certname = CERT_NAME
+
+        if not ca_file:
+            ca_file = CERT_CA_FILE
+
+        if not overwrite and os.path.exists(ca_file):
+            cert, key = CertificateAuthority.read_pem(ca_file)
+            return False, cert, key
+
+        # Generate key
+        key = crypto.PKey()
+        key.generate_key(crypto.TYPE_RSA, 2048)
+
+        # Generate cert
+        cert = CertificateAuthority._make_cert(certname)
+
+        # self-signed: issuer is the cert's own subject
+        cert.set_issuer(cert.get_subject())
+        cert.set_pubkey(key)
+        cert.add_extensions([
+            crypto.X509Extension(b"basicConstraints",
+                                 True,
+                                 b"CA:TRUE, pathlen:0"),
+
+            crypto.X509Extension(b"keyUsage",
+                                 True,
+                                 b"keyCertSign, cRLSign"),
+
+            crypto.X509Extension(b"subjectKeyIdentifier",
+                                 False,
+                                 b"hash",
+                                 subject=cert),
+            ])
+        # sha256 rather than the deprecated sha1 digest
+        cert.sign(key, "sha256")
+
+        # Write cert + key
+        CertificateAuthority.write_pem(ca_file, cert, key)
+        return True, cert, key
+
+    @staticmethod
+    def generate_host_cert(host, root_cert, root_key, host_filename,
+                           wildcard=False):
+        """
+        Create a cert for host signed by root_key and write it, with its
+        new private key, to host_filename. Returns (cert, key).
+
+        With wildcard set, a subjectAltName extension covering both
+        host and *.host is added.
+        """
+        # Generate key
+        key = crypto.PKey()
+        key.generate_key(crypto.TYPE_RSA, 2048)
+
+        # Generate CSR
+        req = crypto.X509Req()
+        req.get_subject().CN = host
+        req.set_pubkey(key)
+        req.sign(key, 'sha256')
+
+        # Generate Cert
+        cert = CertificateAuthority._make_cert(host)
+
+        cert.set_issuer(root_cert.get_subject())
+        cert.set_pubkey(req.get_pubkey())
+
+        if wildcard:
+            DNS = 'DNS:'
+            alt_hosts = [DNS + host,
+                         DNS + '*.' + host]
+
+            alt_hosts = ', '.join(alt_hosts)
+
+            cert.add_extensions([
+                crypto.X509Extension('subjectAltName',
+                                     False,
+                                     alt_hosts)])
+
+        # sha256 rather than the deprecated sha1 digest
+        cert.sign(root_key, 'sha256')
+
+        # Write cert + key
+        CertificateAuthority.write_pem(host_filename, cert, key)
+        return cert, key
+
+    @staticmethod
+    def write_pem(filename, cert, key):
+        """
+        Write the private key followed by the cert to filename as PEM.
+        """
+        # the file holds a private key: when newly created, make it
+        # readable and writable by the owner only
+        flags = os.O_WRONLY | os.O_CREAT | os.O_TRUNC
+        flags |= getattr(os, 'O_BINARY', 0)
+        fd = os.open(filename, flags, 0o600)
+
+        with os.fdopen(fd, 'wb') as f:
+            f.write(crypto.dump_privatekey(FILETYPE_PEM, key))
+
+            f.write(crypto.dump_certificate(FILETYPE_PEM, cert))
+
+    @staticmethod
+    def read_pem(filename):
+        """
+        Load (cert, key) from a PEM file containing both.
+        """
+        # read once; both loaders are given the same full contents
+        with open(filename, 'r') as f:
+            pem = f.read()
+
+        cert = crypto.load_certificate(FILETYPE_PEM, pem)
+        key = crypto.load_privatekey(FILETYPE_PEM, pem)
+
+        return cert, key
+
+
+#=================================================================
+def main():
+    """
+    Command-line entry point for creating certificates.
+
+    By default a new root cert is written to output_pem_file. With
+    -r/--use-root, output_pem_file is instead used as the host name of a
+    new cert created in the certs dir and signed by the given root cert.
+    Existing certs are kept unless -f/--force is given.
+    """
+    parser = ArgumentParser(description='Cert Auth Cert Maker')
+
+    parser.add_argument('output_pem_file', help='path to cert .pem file')
+
+    parser.add_argument('-r', '--use-root',
+                        help=('use specified root cert (.pem file) ' +
+                              'to create signed cert'))
+
+    parser.add_argument('-n', '--name', action='store', default=CERT_NAME,
+                        help='name for root certificate')
+
+    parser.add_argument('-d', '--certs-dir', default=CERTS_DIR)
+
+    parser.add_argument('-f', '--force', action='store_true')
+
+    # argparse derives the attribute name from the long option, so this
+    # flag is read back below as result.wildcard_cert
+    parser.add_argument('-w', '--wildcard_cert', action='store_true',
+                        help='add wildcard SAN to host: *.<host>, <host>')
+
+    result = parser.parse_args()
+
+    overwrite = result.force
+
+    # Create a new signed certificate using specified root
+    if result.use_root:
+        wildcard = result.wildcard_cert
+
+        # the root cert is only loaded here, so --name does not apply
+        ca = CertificateAuthority(ca_file=result.use_root,
+                                  certs_dir=result.certs_dir)
+
+        created, host_filename = ca.get_cert_for_host(result.output_pem_file,
+                                                      overwrite, wildcard)
+
+        if created:
+            print ('Created new cert "' + host_filename +
+                   '" signed by root cert ' +
+                   result.use_root)
+        else:
+            print ('Cert "' + host_filename + '" already exists,' +
+                   ' use -f to overwrite')
+
+    # Create new root certificate
+    else:
+        created, c, k = (CertificateAuthority.
+                         generate_ca_root(result.output_pem_file,
+                                          result.name,
+                                          overwrite))
+
+        if created:
+            print ('Created new root cert: "' + result.output_pem_file + '"')
+        else:
+            print ('Root cert "' + result.output_pem_file +
+                   '" already exists,' + ' use -f to overwrite')
+
+if __name__ == "__main__":
+    main()
diff --git a/pywb/framework/proxy.py b/pywb/framework/proxy.py
index 62bc06b0..57dd5088 100644
--- a/pywb/framework/proxy.py
+++ b/pywb/framework/proxy.py
@@ -4,8 +4,17 @@ from archivalrouter import ArchivalRouter
import urlparse
import base64
+import socket
+import ssl
+
from pywb.rewrite.url_rewriter import HttpsUrlRewriter
-from pywb.utils.statusandheaders import StatusAndHeaders
+from pywb.utils.wbexception import BadRequestException
+
+from pywb.utils.bufferedreaders import BufferedReader
+
+from certauth import CertificateAuthority
+
+from proxy_resolvers import ProxyAuthResolver, CookieResolver
#=================================================================
@@ -44,8 +53,17 @@ class ProxyRouter(object):
for more details.
"""
+ PAC_PATH = '/proxy.pac'
+ BLOCK_SIZE = 4096
+ DEF_MAGIC_NAME = 'pywb.proxy'
+
+ CERT_DL_PEM = '/pywb-ca.pem'
+ CERT_DL_P12 = '/pywb-ca.p12'
+
+ EXTRA_HEADERS = {'cache-control': 'no-cache',
+ 'p3p': 'CP="NOI ADM DEV COM NAV OUR STP"'}
+
def __init__(self, routes, **kwargs):
- self.routes = routes
self.hostpaths = kwargs.get('hostpaths')
self.error_view = kwargs.get('error_view')
@@ -54,61 +72,124 @@ class ProxyRouter(object):
if proxy_options:
proxy_options = proxy_options.get('proxy_options', {})
- self.auth_msg = proxy_options.get('auth_msg',
- 'Please enter name of a collection to use for proxy mode')
+ self.magic_name = proxy_options.get('magic_name')
+ if not self.magic_name:
+ self.magic_name = self.DEF_MAGIC_NAME
+ proxy_options['magic_name'] = self.magic_name
- self.use_default_coll = proxy_options.get('use_default_coll', True)
+ self.extra_headers = proxy_options.get('extra_headers')
+ if not self.extra_headers:
+ self.extra_headers = self.EXTRA_HEADERS
+ proxy_options['extra_headers'] = self.extra_headers
+
+ if proxy_options.get('cookie_resolver'):
+ self.resolver = CookieResolver(routes, proxy_options)
+ else:
+ self.resolver = ProxyAuthResolver(routes, proxy_options)
self.unaltered = proxy_options.get('unaltered_replay', False)
+ self.proxy_pac_path = proxy_options.get('pac_path', self.PAC_PATH)
+
+
+ if not proxy_options.get('enable_https_proxy'):
+ self.ca = None
+ self.proxy_cert_dl_view = None
+ return
+
+        # HTTPS Only Options
+        ca_file = proxy_options.get('root_ca_file')
+
+        # attempt to create the root_ca_file if doesn't exist
+        # (generally recommended to create this separately)
+        # generate_ca_root() takes the file path first, then the cert name
+        certname = proxy_options.get('root_ca_name')
+        CertificateAuthority.generate_ca_root(ca_file, certname)
+
+        certs_dir = proxy_options.get('certs_dir')
+        self.ca = CertificateAuthority(ca_file=ca_file,
+                                       certs_dir=certs_dir)
+
+        self.proxy_cert_dl_view = proxy_options.get('proxy_cert_download_view')
+
def __call__(self, env):
- url = env['REL_REQUEST_URI']
+ is_https = (env['REQUEST_METHOD'] == 'CONNECT')
- if url.endswith('/proxy.pac'):
- return self.make_pac_response(env)
+ # for non-https requests, check pac path and non-proxy urls
+ if not is_https:
+ url = env['REL_REQUEST_URI']
- if not url.startswith('http://'):
- return None
+ if url == self.proxy_pac_path:
+ return self.make_pac_response(env)
- proxy_auth = env.get('HTTP_PROXY_AUTHORIZATION')
+ if not url.startswith(('http://', 'https://')):
+ return None
+
+ env['pywb.proxy_scheme'] = 'http'
route = None
coll = None
matcher = None
+ response = None
+ ts = None
- if proxy_auth:
- proxy_coll = self.read_basic_auth_coll(proxy_auth)
+ # check resolver, for pre connect resolve
+ if self.resolver.pre_connect:
+ route, coll, matcher, ts, response = self.resolver.resolve(env)
+ if response:
+ return response
- if not proxy_coll:
- return self.proxy_auth_coll_response()
+ # do connect, then get updated url
+ if is_https:
+ response = self.handle_connect(env)
+ if response:
+ return response
- proxy_coll = '/' + proxy_coll + '/'
-
- for r in self.routes:
- matcher, c = r.is_handling(proxy_coll)
- if matcher:
- route = r
- coll = c
- break
-
- if not route:
- return self.proxy_auth_coll_response()
-
- # if 'use_default_coll' or only one collection, use that
- # for proxy mode
- elif self.use_default_coll or len(self.routes) == 1:
- route = self.routes[0]
- coll = self.routes[0].regex.pattern
-
- # otherwise, require proxy auth 407 to select collection
+ url = env['REL_REQUEST_URI']
else:
- return self.proxy_auth_coll_response()
+ parts = urlparse.urlsplit(env['REL_REQUEST_URI'])
+ hostport = parts.netloc.split(':', 1)
+ env['pywb.proxy_host'] = hostport[0]
+ env['pywb.proxy_port'] = hostport[1] if len(hostport) == 2 else ''
+ env['pywb.proxy_req_uri'] = parts.path
+ if parts.query:
+ env['pywb.proxy_req_uri'] += '?' + parts.query
+
+ env['pywb_proxy_magic'] = self.magic_name
+
+ # route (static) and other resources to archival replay
+ if env['pywb.proxy_host'] == self.magic_name:
+ env['REL_REQUEST_URI'] = env['pywb.proxy_req_uri']
+
+ # special case for proxy install
+ response = self.handle_cert_install(env)
+ if response:
+ return response
+
+ return None
+
+ # check resolver, post connect
+ if not self.resolver.pre_connect:
+ route, coll, matcher, ts, response = self.resolver.resolve(env)
+ if response:
+ return response
+
+ host_prefix = env['pywb.proxy_scheme'] + '://' + self.magic_name
+ rel_prefix = ''
+
+ # special case for proxy calendar
+ if (env['pywb.proxy_host'] == 'query.' + self.magic_name):
+ url = env['pywb.proxy_req_uri'][1:]
+ rel_prefix = '/'
+
+ if ts is not None:
+ url = ts + '/' + url
wbrequest = route.request_class(env,
request_uri=url,
wb_url_str=url,
coll=coll,
- host_prefix=self.hostpaths[0],
+ host_prefix=host_prefix,
+ rel_prefix=rel_prefix,
wburl_class=route.handler.get_wburl_type(),
urlrewriter_class=HttpsUrlRewriter,
use_abs_prefix=False,
@@ -119,13 +200,170 @@ class ProxyRouter(object):
if self.unaltered:
wbrequest.wb_url.mod = 'id_'
+ elif is_https:
+ wbrequest.wb_url.mod = 'bn_'
- return route.handler(wbrequest)
+ response = route.handler(wbrequest)
+
+ if wbrequest.wb_url and wbrequest.wb_url.is_replay():
+ response.status_headers.replace_headers(self.extra_headers)
+
+ return response
+
+    def get_request_socket(self, env):
+        """
+        Find the socket of the current request, for use as an HTTPS tunnel.
+
+        Returns None when no certificate authority is configured. Otherwise
+        the socket is taken from uwsgi's connection fd or from the
+        'gunicorn.socket' env entry, with the '_sock' attribute of
+        'wsgi.input' as the fallback. Returns None if none is found.
+        """
+        if not self.ca:
+            return None
+
+        request_sock = None
+
+        if env.get('uwsgi.version'):
+            try:
+                import uwsgi
+                raw = socket.fromfd(uwsgi.connection_fd(),
+                                    socket.AF_INET,
+                                    socket.SOCK_STREAM)
+                request_sock = socket.socket(_sock=raw)
+            except Exception:
+                # best effort: fall through to the wsgi.input lookup
+                pass
+        elif env.get('gunicorn.socket'):
+            request_sock = env['gunicorn.socket']
+
+        if request_sock:
+            return request_sock
+
+        # attempt to find socket from wsgi.input
+        wsgi_input = env.get('wsgi.input')
+        if wsgi_input and hasattr(wsgi_input, '_sock'):
+            return socket.socket(_sock=wsgi_input._sock)
+
+        return request_sock
+
+    def handle_connect(self, env):
+        """
+        Serve a proxy CONNECT request by terminating the TLS tunnel here.
+
+        Replies '200 Connection Established' on the raw request socket,
+        wraps it in a server-side SSL socket using a cert for the target
+        host, then reads the tunnelled request line and headers and
+        rewrites env (REQUEST_METHOD, REL_REQUEST_URI, PATH_INFO,
+        QUERY_STRING, SERVER_PROTOCOL, header entries and the
+        pywb.proxy_* keys) to describe that inner https request.
+
+        Returns a WbResponse if no request socket could be obtained,
+        otherwise None. Raises BadRequestException if the SSL handshake
+        fails or the tunnelled request line is invalid.
+        """
+        sock = self.get_request_socket(env)
+        if not sock:
+            return WbResponse.text_response('HTTPS Proxy Not Supported',
+                                            '405 HTTPS Proxy Not Supported')
+
+        # the CONNECT target is 'host:port': split on the last ':' only,
+        # and fall back to port 443 if no numeric port was given
+        connect_target = env['REL_REQUEST_URI']
+        hostname, sep, port = connect_target.rpartition(':')
+        if not sep or not port.isdigit():
+            hostname = connect_target
+            port = '443'
+
+        sock.send('HTTP/1.0 200 Connection Established\r\n')
+        sock.send('Server: pywb proxy\r\n')
+        sock.send('\r\n')
+
+        # for hosts with three or more labels, use a cert for the parent
+        # domain: its wildcard SAN also covers this host
+        cert_host = hostname
+
+        host_parts = hostname.split('.', 1)
+        if len(host_parts) == 2 and '.' in host_parts[1]:
+            cert_host = host_parts[1]
+
+        _, certfile = self.ca.get_cert_for_host(cert_host,
+                                                wildcard=True)
+
+        try:
+            # NOTE(review): ciphers="ALL" also enables weak cipher suites
+            ssl_sock = ssl.wrap_socket(sock,
+                                       server_side=True,
+                                       certfile=certfile,
+                                       ciphers="ALL",
+                                       suppress_ragged_eofs=False,
+                                       ssl_version=ssl.PROTOCOL_SSLv23)
+            env['pywb.proxy_ssl_sock'] = ssl_sock
+
+            buffreader = BufferedReader(ssl_sock, block_size=self.BLOCK_SIZE)
+
+            statusline = buffreader.readline().rstrip()
+
+        except Exception as se:
+            raise BadRequestException(str(se))
+
+        statusparts = statusline.split(' ')
+
+        if len(statusparts) < 3:
+            raise BadRequestException('Invalid Proxy Request: ' + statusline)
+
+        # only the default https port is left out of the rebuilt url
+        if port == '443':
+            netloc = hostname
+        else:
+            netloc = hostname + ':' + port
+
+        env['REQUEST_METHOD'] = statusparts[0]
+        env['REL_REQUEST_URI'] = 'https://' + netloc + statusparts[1]
+
+        env['SERVER_PROTOCOL'] = statusparts[2].strip()
+
+        env['pywb.proxy_scheme'] = 'https'
+
+        env['pywb.proxy_host'] = hostname
+        env['pywb.proxy_port'] = port
+        env['pywb.proxy_req_uri'] = statusparts[1]
+
+        queryparts = env['REL_REQUEST_URI'].split('?', 1)
+        env['PATH_INFO'] = queryparts[0]
+        env['QUERY_STRING'] = queryparts[1] if len(queryparts) > 1 else ''
+
+        # copy the tunnelled request headers into env, CGI-style,
+        # stopping at the first empty line
+        while True:
+            line = buffreader.readline()
+            if line:
+                line = line.rstrip()
+
+            if not line:
+                break
+
+            parts = line.split(':', 1)
+            if len(parts) < 2:
+                continue
+
+            name = parts[0].strip()
+            value = parts[1].strip()
+
+            name = name.replace('-', '_').upper()
+
+            if name not in ('CONTENT_LENGTH', 'CONTENT_TYPE'):
+                name = 'HTTP_' + name
+
+            env[name] = value
+
+        # data already buffered past the headers becomes the start of the
+        # new request input
+        # NOTE(review): wsgi.input is left unchanged when nothing is
+        # buffered -- confirm that is intended for request bodies
+        remain = buffreader.rem_length()
+        if remain > 0:
+            remainder = buffreader.read(self.BLOCK_SIZE)
+            env['wsgi.input'] = BufferedReader(ssl_sock,
+                                               block_size=self.BLOCK_SIZE,
+                                               starting_data=remainder)
+
+    def handle_cert_install(self, env):
+        """
+        Serve the root cert install page and downloads on the magic host.
+
+        Handles, by env['pywb.proxy_req_uri']:
+        - '/' or '/index.html': the cert download view, if one is set
+        - CERT_DL_PEM: the root cert file in PEM format
+        - CERT_DL_P12: the root cert in PKCS12 format
+
+        Returns a response, or None if the path is not handled here, no
+        view is configured or there is no certificate authority.
+        """
+        req_uri = env['pywb.proxy_req_uri']
+
+        if req_uri in ('/', '/index.html'):
+            if not self.proxy_cert_dl_view:
+                return None
+
+            return (self.proxy_cert_dl_view.
+                    render_response(available=(self.ca is not None),
+                                    pem_path=self.CERT_DL_PEM,
+                                    p12_path=self.CERT_DL_P12))
+
+        # NOTE(review): both downloads below include the root private key
+        # (the ca_file holds key + cert, and the PKCS12 export sets the
+        # private key) -- confirm clients are meant to receive it
+        if req_uri == self.CERT_DL_PEM:
+            if not self.ca:
+                return None
+
+            with open(self.ca.ca_file) as fh:
+                buff = fh.read()
+
+            return WbResponse.text_response(
+                buff, content_type='application/x-x509-ca-cert')
+
+        if req_uri == self.CERT_DL_P12:
+            if not self.ca:
+                return None
+
+            buff = self.ca.get_root_PKCS12()
+
+            return WbResponse.text_response(
+                buff, content_type='application/x-pkcs12')
+
+        return None
# Proxy Auto-Config (PAC) script for the proxy
def make_pac_response(self, env):
- import os
- hostname = os.environ.get('PYWB_HOST_NAME')
+ hostname = env.get('HTTP_HOST')
if not hostname:
server_hostport = env['SERVER_NAME'] + ':' + env['SERVER_PORT']
hostonly = env['SERVER_NAME']
@@ -143,33 +381,8 @@ class ProxyRouter(object):
buff += direct.format(hostonly)
- #buff += '\n return "PROXY {0}";\n}}\n'.format(self.hostpaths[0])
buff += '\n return "PROXY {0}";\n}}\n'.format(server_hostport)
content_type = 'application/x-ns-proxy-autoconfig'
return WbResponse.text_response(buff, content_type=content_type)
-
- def proxy_auth_coll_response(self):
- proxy_msg = 'Basic realm="{0}"'.format(self.auth_msg)
-
- headers = [('Content-Type', 'text/plain'),
- ('Proxy-Authenticate', proxy_msg)]
-
- status_headers = StatusAndHeaders('407 Proxy Authentication', headers)
-
- value = self.auth_msg
-
- return WbResponse(status_headers, value=[value])
-
- @staticmethod
- def read_basic_auth_coll(value):
- parts = value.split(' ')
- if parts[0].lower() != 'basic':
- return ''
-
- if len(parts) != 2:
- return ''
-
- user_pass = base64.b64decode(parts[1])
- return user_pass.split(':')[0]
diff --git a/pywb/framework/proxy_resolvers.py b/pywb/framework/proxy_resolvers.py
new file mode 100644
index 00000000..dc7b22fe
--- /dev/null
+++ b/pywb/framework/proxy_resolvers.py
@@ -0,0 +1,340 @@
+from wbrequestresponse import WbResponse, WbRequest
+from pywb.utils.statusandheaders import StatusAndHeaders
+from pywb.rewrite.wburl import WbUrl
+
+import urlparse
+import base64
+import os
+
+try:
+ import uwsgi
+ uwsgi_cache = True
+except ImportError:
+ uwsgi_cache = False
+
+
+#=================================================================
+class UwsgiCache(object):
+    """
+    Dict-style adapter over the uwsgi cache functions, providing the
+    item operations that CookieResolver uses on its cache.
+
+    NOTE(review): a lookup of a missing key returns whatever
+    uwsgi.cache_get() returns, presumably None, rather than raising
+    KeyError as a dict would -- confirm against the uwsgi docs.
+    """
+    # cache[item] = value
+    def __setitem__(self, item, value):
+        uwsgi.cache_update(item, value)
+
+    # cache[item]
+    def __getitem__(self, item):
+        return uwsgi.cache_get(item)
+
+    # item in cache
+    def __contains__(self, item):
+        return uwsgi.cache_exists(item)
+
+    # del cache[item]
+    def __delitem__(self, item):
+        uwsgi.cache_del(item)
+
+
+#=================================================================
+class BaseCollResolver(object):
+    """
+    Base class for choosing the route (collection) of a proxy request.
+
+    Subclasses provide get_proxy_coll_ts(env), returning a
+    (collection name, timestamp) pair, and select_coll_response(env),
+    returning the response to send when no collection can be chosen.
+    """
+
+    def __init__(self, routes, config):
+        self.routes = routes
+        self.pre_connect = config.get('pre_connect', False)
+        self.use_default_coll = config.get('use_default_coll', True)
+
+    def resolve(self, env):
+        """
+        Return a (route, coll, matcher, ts, response) tuple.
+
+        On success response is None; otherwise only response is set,
+        to the collection selection response.
+        """
+        def select_response():
+            return None, None, None, None, self.select_coll_response(env)
+
+        proxy_coll, ts = self.get_proxy_coll_ts(env)
+
+        # invalid parsing
+        if proxy_coll == '':
+            return select_response()
+
+        # a string setting names the collection to use by default
+        default_coll = self.use_default_coll
+        if proxy_coll is None and isinstance(default_coll, str):
+            proxy_coll = default_coll
+
+        if not proxy_coll:
+            # no collection given: use the first route if defaulting is
+            # enabled or there is only one route to choose from
+            if default_coll == True or len(self.routes) == 1:
+                return self.routes[0], self.routes[0].path, None, ts, None
+
+            return select_response()
+
+        coll_path = '/' + proxy_coll + '/'
+
+        route = None
+        for candidate in self.routes:
+            matcher, coll = candidate.is_handling(coll_path)
+            if matcher:
+                route = candidate
+                break
+
+        # if no match, return coll selection response
+        if not route:
+            return select_response()
+
+        return route, coll, matcher, ts, None
+
+
+#=================================================================
+class ProxyAuthResolver(BaseCollResolver):
+    """
+    Resolve the collection from the user name sent in a Basic
+    Proxy-Authorization header, answering with a 407 to request it.
+
+    Resolution happens before any CONNECT handling (pre_connect).
+    """
+    DEFAULT_MSG = 'Please enter name of a collection to use with proxy mode'
+
+    def __init__(self, routes, config):
+        config['pre_connect'] = True
+        super(ProxyAuthResolver, self).__init__(routes, config)
+        self.auth_msg = config.get('auth_msg', self.DEFAULT_MSG)
+
+    def get_proxy_coll_ts(self, env):
+        """
+        Return (collection name, None). The name is None if no proxy auth
+        header was sent and '' if the header could not be parsed.
+        """
+        proxy_auth = env.get('HTTP_PROXY_AUTHORIZATION')
+
+        if not proxy_auth:
+            return None, None
+
+        proxy_coll = self.read_basic_auth_coll(proxy_auth)
+        return proxy_coll, None
+
+    def select_coll_response(self, env):
+        """
+        Return a '407 Proxy Authentication' response with auth_msg as
+        both the Basic realm and the response body.
+        """
+        proxy_msg = 'Basic realm="{0}"'.format(self.auth_msg)
+
+        headers = [('Content-Type', 'text/plain'),
+                   ('Proxy-Authenticate', proxy_msg)]
+
+        status_headers = StatusAndHeaders('407 Proxy Authentication', headers)
+
+        value = self.auth_msg
+
+        return WbResponse(status_headers, value=[value])
+
+    @staticmethod
+    def read_basic_auth_coll(value):
+        """
+        Extract the user name from a 'Basic <base64>' header value.
+
+        Returns '' if the value is not a well-formed Basic credential,
+        including base64 data that cannot be decoded.
+        """
+        parts = value.split(' ')
+        if len(parts) != 2 or parts[0].lower() != 'basic':
+            return ''
+
+        # the header is client-supplied: undecodable data must lead to
+        # the auth prompt rather than an unhandled exception
+        try:
+            user_pass = base64.b64decode(parts[1])
+        except (TypeError, ValueError):
+            return ''
+
+        if not isinstance(user_pass, str):
+            user_pass = user_pass.decode('utf-8', 'replace')
+
+        return user_pass.split(':')[0]
+
+
+#=================================================================
+# Experimental CookieResolver
+class CookieResolver(BaseCollResolver):  # pragma: no cover
+    """
+    Resolve the collection and capture timestamp from a session cookie.
+
+    The cookie holds a session id; the collection and timestamp chosen for
+    that session are kept in a cache (the uwsgi cache if available,
+    otherwise an in-process dict). Requests to special host names
+    containing '.' + magic_name are answered with redirects that select
+    a collection or timestamp and set the cookie per domain.
+    """
+
+    def __init__(self, routes, config):
+        config['pre_connect'] = False
+        super(CookieResolver, self).__init__(routes, config)
+        self.magic_name = config['magic_name']
+        self.sethost_prefix = '-sethost.' + self.magic_name + '.'
+        self.set_prefix = '-set.' + self.magic_name
+
+        self.cookie_name = config.get('cookie_name', '__pywb_coll')
+        self.proxy_select_view = config.get('proxy_select_view')
+
+        self.extra_headers = config.get('extra_headers')
+
+        if uwsgi_cache:
+            self.cache = UwsgiCache()
+        else:
+            self.cache = {}
+
+    def get_proxy_coll_ts(self, env):
+        """Return the (coll, ts) stored for the request's session."""
+        coll, ts, sesh_id = self.get_coll(env)
+        return coll, ts
+
+    def select_coll_response(self, env):
+        """Redirect to the 'auto' magic host, keeping the requested url."""
+        return self.make_magic_response('auto',
+                                        env['REL_REQUEST_URI'],
+                                        env)
+
+    def resolve(self, env):
+        """
+        Answer magic host requests directly, if they produce a response;
+        all other requests are resolved by the base class.
+        """
+        server_name = env['pywb.proxy_host']
+
+        if ('.' + self.magic_name) in server_name:
+            response = self.handle_magic_page(env)
+            if response:
+                return None, None, None, None, response
+
+        return super(CookieResolver, self).resolve(env)
+
+    def handle_magic_page(self, env):
+        """
+        Return the response for a request to a magic host name, or None
+        if the request should be resolved normally.
+        """
+        request_url = env['REL_REQUEST_URI']
+        parts = urlparse.urlsplit(request_url)
+        server_name = env['pywb.proxy_host']
+
+        path_url = parts.path[1:]
+        if parts.query:
+            path_url += '?' + parts.query
+
+        if server_name.startswith('auto'):
+            coll, ts, sesh_id = self.get_coll(env)
+
+            if coll:
+                return self.make_sethost_cookie_response(sesh_id, path_url, env)
+            else:
+                return self.make_magic_response('select', path_url, env)
+
+        elif server_name.startswith('query.'):
+            wb_url = WbUrl(path_url)
+
+            # only dealing with specific timestamp setting
+            if wb_url.is_query():
+                return None
+
+            coll, ts, sesh_id = self.get_coll(env)
+            if not coll:
+                return self.make_magic_response('select', path_url, env)
+
+            self.set_ts(sesh_id, wb_url.timestamp)
+            return self.make_redir_response(wb_url.url)
+
+        elif server_name.endswith(self.set_prefix):
+            old_sesh_id = self.extract_client_cookie(env, self.cookie_name)
+            sesh_id = self.create_renew_sesh_id(old_sesh_id)
+
+            # the cookie only needs to be sent if the session id is new
+            if sesh_id != old_sesh_id:
+                headers = self.make_cookie_headers(sesh_id, self.magic_name)
+            else:
+                headers = None
+
+            # the collection name is the part before the '-set.' prefix
+            coll = server_name[:-len(self.set_prefix)]
+
+            # set sesh value
+            self.set_coll(sesh_id, coll)
+
+            return self.make_sethost_cookie_response(sesh_id, path_url, env,
+                                                     headers=headers)
+
+        elif self.sethost_prefix in server_name:
+            # host is: sesh_id + sethost_prefix + target domain
+            inx = server_name.find(self.sethost_prefix)
+            sesh_id = server_name[:inx]
+
+            domain = server_name[inx + len(self.sethost_prefix):]
+
+            headers = self.make_cookie_headers(sesh_id, domain)
+
+            full_url = env['pywb.proxy_scheme'] + '://' + domain
+            full_url += '/' + path_url
+            return self.make_redir_response(full_url, headers=headers)
+
+        elif 'select.' in server_name:
+            if not self.proxy_select_view:
+                return WbResponse.text_response('select text for ' + path_url)
+
+            coll, ts, sesh_id = self.get_coll(env)
+
+            route_temp = '-set.' + self.magic_name + '/' + path_url
+
+            return (self.proxy_select_view.
+                    render_response(routes=self.routes,
+                                    route_temp=route_temp,
+                                    coll=coll,
+                                    url=path_url))
+
+        # any other magic host name is not handled here
+        return None
+
+    def make_cookie_headers(self, sesh_id, domain):
+        """Return the Set-Cookie header list for sesh_id on domain."""
+        cookie_val = '{0}={1}; Path=/; Domain=.{2}; HttpOnly'
+        cookie_val = cookie_val.format(self.cookie_name, sesh_id, domain)
+        headers = [('Set-Cookie', cookie_val)]
+        return headers
+
+    def make_sethost_cookie_response(self, sesh_id, path_url,
+                                     env, headers=None):
+        """
+        Redirect to the '-sethost' magic host for the host of path_url,
+        so that the session cookie gets set for that host.
+        """
+        if '://' not in path_url:
+            path_url = 'http://' + path_url
+
+        path_parts = urlparse.urlsplit(path_url)
+
+        new_url = path_parts.path[1:]
+        if path_parts.query:
+            new_url += '?' + path_parts.query
+
+        return self.make_magic_response(sesh_id + '-sethost', new_url, env,
+                                        suffix=path_parts.netloc,
+                                        headers=headers)
+
+    def make_magic_response(self, prefix, url, env,
+                            suffix=None, headers=None):
+        """
+        Redirect to url on the host: prefix.magic_name[.suffix],
+        using the scheme of the current request.
+        """
+        full_url = env['pywb.proxy_scheme'] + '://' + prefix + '.'
+        full_url += self.magic_name
+        if suffix:
+            full_url += '.' + suffix
+        full_url += '/' + url
+        return self.make_redir_response(full_url, headers=headers)
+
+    def set_coll(self, sesh_id, coll):
+        """Store coll as the collection of the session."""
+        self.cache[sesh_id + ':c'] = coll
+
+    def set_ts(self, sesh_id, ts):
+        """Store ts as the timestamp of the session, or clear it."""
+        key = sesh_id + ':t'
+        if ts:
+            self.cache[key] = ts
+        # this ensures that omitting timestamp will reset to latest
+        # capture by deleting the cache entry (if there is one)
+        elif key in self.cache:
+            del self.cache[key]
+
+    def get_coll(self, env):
+        """
+        Return (coll, ts, sesh_id) for the request's session cookie.
+
+        coll and ts are None if there is no cookie or nothing is cached
+        for its session id, e.g. the id is no longer known.
+        """
+        sesh_id = self.extract_client_cookie(env, self.cookie_name)
+
+        coll = None
+        ts = None
+        if sesh_id:
+            coll = self._cache_get(sesh_id + ':c')
+            ts = self._cache_get(sesh_id + ':t')
+
+        return coll, ts, sesh_id
+
+    def _cache_get(self, key):
+        """Return the cached value for key, or None if there is none."""
+        try:
+            return self.cache[key]
+        except KeyError:
+            return None
+
+    def create_renew_sesh_id(self, sesh_id, force=False):
+        """
+        Return sesh_id if it has a cached collection and force is not
+        set, otherwise a newly generated random session id.
+        """
+        if sesh_id and ((sesh_id + ':c') in self.cache) and not force:
+            return sesh_id
+
+        sesh_id = base64.b32encode(os.urandom(5)).lower()
+        return sesh_id
+
+    def make_redir_response(self, url, headers=None):
+        """
+        Return a redirect to url with the given headers followed by the
+        configured extra headers.
+        """
+        # copy, so the caller's list is not modified
+        headers = list(headers) if headers else []
+
+        if self.extra_headers:
+            for name, value in self.extra_headers.iteritems():
+                headers.append((name, value))
+
+        return WbResponse.redir_response(url, headers=headers)
+
+    @staticmethod
+    def extract_client_cookie(env, cookie_name):
+        """
+        Return the value of the cookie named exactly cookie_name from the
+        request's Cookie header, or None if it is not present.
+        """
+        cookie_header = env.get('HTTP_COOKIE')
+        if not cookie_header:
+            return None
+
+        # compare whole cookie names, so that a cookie whose name or
+        # value merely contains cookie_name is not picked up
+        for pair in cookie_header.split(';'):
+            name, sep, value = pair.partition('=')
+            if sep and name.strip() == cookie_name:
+                return value.strip()
+
+        return None
diff --git a/pywb/framework/wsgi_wrappers.py b/pywb/framework/wsgi_wrappers.py
index 3729a660..3498c819 100644
--- a/pywb/framework/wsgi_wrappers.py
+++ b/pywb/framework/wsgi_wrappers.py
@@ -50,6 +50,42 @@ class WSGIApp(object):
# Top-level wsgi application
def __call__(self, env, start_response):
+ if env['REQUEST_METHOD'] == 'CONNECT':
+ return self.handle_connect(env, start_response)
+ else:
+ return self.handle_methods(env, start_response)
+
+    def handle_connect(self, env, start_response):
+        """
+        Handle a CONNECT request, sending the response over the SSL
+        socket of the tunnel if one was set up.
+
+        The request is processed by handle_methods(). If that leaves an
+        SSL socket in env['pywb.proxy_ssl_sock'], the status line, headers
+        and body are written to it, it is closed, and an empty response
+        is returned to the server. Otherwise the response is passed
+        through unchanged.
+        """
+        def ssl_start_response(statusline, headers):
+            ssl_sock = env.get('pywb.proxy_ssl_sock')
+            if not ssl_sock:
+                start_response(statusline, headers)
+                return
+
+            # kept for the start_response() call made after the tunnel
+            # response has been written
+            env['pywb.proxy_statusline'] = statusline
+
+            ssl_sock.write('HTTP/1.1 ' + statusline + '\r\n')
+            for name, value in headers:
+                ssl_sock.write(name + ': ' + value + '\r\n')
+
+        resp_iter = self.handle_methods(env, ssl_start_response)
+
+        ssl_sock = env.get('pywb.proxy_ssl_sock')
+        if not ssl_sock:
+            return resp_iter
+
+        try:
+            # blank line ends the headers written by ssl_start_response
+            ssl_sock.write('\r\n')
+
+            for obj in resp_iter:
+                if obj:
+                    ssl_sock.write(obj)
+        finally:
+            # release the tunnel even if producing the body failed
+            ssl_sock.close()
+
+            # the iterable is consumed here and not returned to the
+            # server, so it has to be closed here as well
+            if hasattr(resp_iter, 'close'):
+                resp_iter.close()
+
+        start_response(env['pywb.proxy_statusline'], [])
+
+        return []
+
+ def handle_methods(self, env, start_response):
if env.get('SCRIPT_NAME') or not env.get('REQUEST_URI'):
env['REL_REQUEST_URI'] = rel_request_uri(env)
else:
@@ -89,22 +125,29 @@ class WSGIApp(object):
else:
err_url = None
+ try:
+ err_msg = exc.message.encode('utf-8')
+ except Exception:
+ err_msg = exc.message
+ err_url = ''
+
if print_trace:
import traceback
err_details = traceback.format_exc(exc)
print err_details
else:
- logging.info(str(exc))
+ logging.info(err_msg)
err_details = None
if error_view:
return error_view.render_response(exc_type=type(exc).__name__,
- err_msg=str(exc),
+ err_msg=err_msg,
err_details=err_details,
status=status,
+ env=env,
err_url=err_url)
else:
- return WbResponse.text_response(status + ' Error: ' + str(exc),
+ return WbResponse.text_response(status + ' Error: ' + err_msg,
status=status)
#=================================================================
@@ -145,6 +188,10 @@ def init_app(init_func, load_yaml=True, config_file=None, config={}):
def start_wsgi_server(the_app, name, default_port=None): # pragma: no cover
from wsgiref.simple_server import make_server
+ # disable is_hop_by_hop restrictions
+ import wsgiref.handlers
+ wsgiref.handlers.is_hop_by_hop = lambda x: False
+
port = the_app.port
if not port:
diff --git a/pywb/rewrite/header_rewriter.py b/pywb/rewrite/header_rewriter.py
index 2dfc824d..2d505e88 100644
--- a/pywb/rewrite/header_rewriter.py
+++ b/pywb/rewrite/header_rewriter.py
@@ -37,7 +37,8 @@ class HeaderRewriter:
ENCODING_HEADERS = ['content-encoding']
- REMOVE_HEADERS = ['transfer-encoding']
+ REMOVE_HEADERS = ['transfer-encoding', 'content-security-policy',
+ 'strict-transport-security']
PROXY_NO_REWRITE_HEADERS = ['content-length']
@@ -90,7 +91,10 @@ class HeaderRewriter:
new_headers = []
removed_header_dict = {}
- cookie_rewriter = urlrewriter.get_cookie_rewriter()
+ if urlrewriter:
+ cookie_rewriter = urlrewriter.get_cookie_rewriter()
+ else:
+ cookie_rewriter = None
for (name, value) in headers:
@@ -99,7 +103,7 @@ class HeaderRewriter:
if lowername in self.PROXY_HEADERS:
new_headers.append((name, value))
- elif lowername in self.URL_REWRITE_HEADERS:
+ elif urlrewriter and lowername in self.URL_REWRITE_HEADERS:
new_headers.append((name, urlrewriter.rewrite(value)))
elif lowername in self.ENCODING_HEADERS:
@@ -109,7 +113,8 @@ class HeaderRewriter:
new_headers.append((name, value))
elif lowername in self.REMOVE_HEADERS:
- removed_header_dict[lowername] = value
+ removed_header_dict[lowername] = value
+ new_headers.append((self.header_prefix + name, value))
elif (lowername in self.PROXY_NO_REWRITE_HEADERS and
not content_rewritten):
@@ -120,7 +125,9 @@ class HeaderRewriter:
cookie_list = cookie_rewriter.rewrite(value)
new_headers.extend(cookie_list)
- else:
+ elif urlrewriter:
new_headers.append((self.header_prefix + name, value))
+ else:
+ new_headers.append((name, value))
return (new_headers, removed_header_dict)
diff --git a/pywb/rewrite/rewrite_content.py b/pywb/rewrite/rewrite_content.py
index 207d879e..2225bbaf 100644
--- a/pywb/rewrite/rewrite_content.py
+++ b/pywb/rewrite/rewrite_content.py
@@ -69,6 +69,10 @@ class RewriteContent:
status_headers, stream = self.sanitize_content(headers, stream)
return (status_headers, self.stream_to_gen(stream), False)
+
+ if wb_url.is_banner_only:
+ urlrewriter = None
+
(rewritten_headers, stream) = self.rewrite_headers(urlrewriter,
headers,
stream)
diff --git a/pywb/rewrite/test/test_header_rewriter.py b/pywb/rewrite/test/test_header_rewriter.py
index 1a2b2cea..0b22d533 100644
--- a/pywb/rewrite/test/test_header_rewriter.py
+++ b/pywb/rewrite/test/test_header_rewriter.py
@@ -40,17 +40,19 @@ HTTP Headers Rewriting
'removed_header_dict': {'content-encoding': 'gzip',
'transfer-encoding': 'chunked'},
'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Content-Length', '199999'),
- ('Content-Type', 'text/javascript')]),
+ ('Content-Type', 'text/javascript'),
+ ('X-Archive-Orig-Transfer-Encoding', 'chunked')]),
'text_type': 'js'}
-# Binary -- transfer-encoding removed
+# Binary -- transfer-encoding rewritten
>>> _test_headers([('Content-Length', '200000'), ('Content-Type', 'image/png'), ('Set-Cookie', 'foo=bar; Path=/;'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')])
{'charset': None,
'removed_header_dict': {'transfer-encoding': 'chunked'},
'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('Content-Length', '200000'),
('Content-Type', 'image/png'),
('Set-Cookie', 'foo=bar; Path=/web/20131010/http://example.com/'),
- ('Content-Encoding', 'gzip')]),
+ ('Content-Encoding', 'gzip'),
+ ('X-Archive-Orig-Transfer-Encoding', 'chunked')]),
'text_type': None}
"""
diff --git a/pywb/rewrite/url_rewriter.py b/pywb/rewrite/url_rewriter.py
index 5b2f8e7b..c89e9a21 100644
--- a/pywb/rewrite/url_rewriter.py
+++ b/pywb/rewrite/url_rewriter.py
@@ -142,7 +142,7 @@ class HttpsUrlRewriter(UrlRewriter):
else:
return url
- def get_timestamp_url(self, timestamp, url):
+ def get_timestamp_url(self, timestamp, url=''):
return url
def get_abs_url(self, url=''):
diff --git a/pywb/static/wb.js b/pywb/static/wb.js
index 2d8b2470..0511f983 100644
--- a/pywb/static/wb.js
+++ b/pywb/static/wb.js
@@ -72,6 +72,14 @@ function init_banner() {
}
text += "" + capture_str + "";
+
+ if (wbinfo.proxy_magic && wbinfo.url) {
+ var select_url = wbinfo.proxy_magic + "/" + wbinfo.url;
+ var query_url = wbinfo.proxy_magic + "/*/" + wbinfo.url;
+ text += ' All Capture Times';
+ text += '
'
+ text += 'From collection "' + wbinfo.coll + '" All Collections';
+ }
banner.innerHTML = text;
diff --git a/pywb/ui/error.html b/pywb/ui/error.html
index b3a8c478..b122fc38 100644
--- a/pywb/ui/error.html
+++ b/pywb/ui/error.html
@@ -9,3 +9,10 @@
{% endif %}
+
+{% if env.pywb_proxy_magic and err_url and status == '404 Not Found' %}
+
+Try Different Collection
+
+{% endif %}
+
diff --git a/pywb/ui/head_insert.html b/pywb/ui/head_insert.html
index 2cd94ab5..c97cb86a 100644
--- a/pywb/ui/head_insert.html
+++ b/pywb/ui/head_insert.html
@@ -2,7 +2,7 @@
{% if rule.js_rewrite_location and include_wombat %}
diff --git a/pywb/ui/proxy_cert_download.html b/pywb/ui/proxy_cert_download.html
new file mode 100644
index 00000000..71255e3a
--- /dev/null
+++ b/pywb/ui/proxy_cert_download.html
@@ -0,0 +1,14 @@
+HTTPS Certificate For PyWb Web Archive Replay
+{% if not available %}
+Sorry, HTTPS support is not configured for this proxy. However, the proxy should work in HTTP mode.
+{% else %}
+Download for all platforms (except Windows):
+Download Certificate (All except Windows)
+
+(If you see the Already Installed message, then no further action is necessary and you may start browsing!)
+{% endif %}
+
+Download for Windows platforms:
+Download Certificate (Windows Only)
+
+
diff --git a/pywb/ui/proxy_select.html b/pywb/ui/proxy_select.html
new file mode 100644
index 00000000..b06f68a2
--- /dev/null
+++ b/pywb/ui/proxy_select.html
@@ -0,0 +1,25 @@
+
+
+Pywb Proxy Collection Selector
+{% if coll %}
+
+Current collection is: {{ coll }}
+
+{% else %}
+You have attempted to load the url {{ url }}, but there are multiple collections available.
+{% endif %}
+
+Please select which collection you would like to use (You will be redirected back to {{ url }}):
+
+
+
+{% for route in routes %}
+{% if route.path and route | is_wb_handler %}
+- {{ route.path }}
+{% endif %}
+{% endfor %}
+
+
+(Once selected, you will not be prompted again; however, you can return to this page to switch collections.)
+
+
diff --git a/pywb/utils/statusandheaders.py b/pywb/utils/statusandheaders.py
index ae3fc261..70ba850c 100644
--- a/pywb/utils/statusandheaders.py
+++ b/pywb/utils/statusandheaders.py
@@ -3,6 +3,7 @@ Representation and parsing of HTTP-style status + headers
"""
import pprint
+from copy import copy
#=================================================================
@@ -44,9 +45,26 @@ class StatusAndHeaders(object):
self.headers.append((name, value))
return None
+ def replace_headers(self, header_dict):
+ """
+ Replace the value of each existing header whose lowercased name is a key in header_dict (original name casing is kept; for duplicated names only the last occurrence is replaced),
+ then append any header_dict entries that matched no existing header. Works on a shallow copy, so the caller's header_dict is not modified.
+ """
+ header_dict = copy(header_dict)
+
+ for index in xrange(len(self.headers) - 1, -1, -1):
+ curr_name, curr_value = self.headers[index]
+ name_lower = curr_name.lower()
+ if name_lower in header_dict:
+ self.headers[index] = (curr_name, header_dict[name_lower])
+ del header_dict[name_lower]
+
+ for name, value in header_dict.iteritems():
+ self.headers.append((name, value))
+
def remove_header(self, name):
"""
- remove header (case-insensitive)
+ Remove header (case-insensitive)
return True if header removed, False otherwise
"""
name_lower = name.lower()
diff --git a/pywb/webapp/pywb_init.py b/pywb/webapp/pywb_init.py
index 02efbf89..e9cf0791 100644
--- a/pywb/webapp/pywb_init.py
+++ b/pywb/webapp/pywb_init.py
@@ -34,6 +34,9 @@ DEFAULTS = {
'home_html': 'ui/index.html',
'error_html': 'ui/error.html',
+ 'proxy_select_html': 'ui/proxy_select.html',
+ 'proxy_cert_download_html': 'ui/proxy_cert_download.html',
+
'template_globals': {'static_path': 'static/default'},
'static_routes': {'static/default': 'pywb/static/'},
@@ -80,7 +83,7 @@ def create_live_handler(config):
#=================================================================
def init_route_config(value, config):
- if isinstance(value, str):
+ if isinstance(value, str) or isinstance(value, list):
value = dict(index_paths=value)
route_config = DictChain(value, config)
@@ -226,10 +229,27 @@ def create_wb_router(passed_config={}):
if hasattr(route.handler, 'resolve_refs'):
route.handler.resolve_refs(handler_dict)
-
# Check for new proxy mode!
if config.get('enable_http_proxy', False):
router = ProxyArchivalRouter
+
+ view = J2TemplateView.create_template(
+ config.get('proxy_select_html'),
+ 'Proxy Coll Selector')
+
+ if not 'proxy_options' in passed_config:
+ passed_config['proxy_options'] = {}
+
+ if view:
+ passed_config['proxy_options']['proxy_select_view'] = view
+
+ view = J2TemplateView.create_template(
+ config.get('proxy_cert_download_html'),
+ 'Proxy Cert Download')
+
+ if view:
+ passed_config['proxy_options']['proxy_cert_download_view'] = view
+
else:
router = ArchivalRouter
@@ -250,6 +270,5 @@ def create_wb_router(passed_config={}):
error_view=J2TemplateView.create_template(config.get('error_html'),
'Error Page'),
-
config=config
)
diff --git a/setup.py b/setup.py
index a170b93d..6b5482bf 100755
--- a/setup.py
+++ b/setup.py
@@ -34,7 +34,7 @@ class PyTest(TestCommand):
setup(
name='pywb',
- version='0.5.3',
+ version='0.6.0',
url='https://github.com/ikreymer/pywb',
author='Ilya Kreymer',
author_email='ikreymer@gmail.com',
@@ -70,6 +70,7 @@ setup(
'jinja2',
'surt',
'pyyaml',
+ 'pyopenssl',
],
tests_require=[
'pytest',
@@ -86,6 +87,7 @@ setup(
cdx-server = pywb.apps.cdx_server:main
cdx-indexer = pywb.warc.cdxindexer:main
live-rewrite-server = pywb.apps.live_rewrite_server:main
+ proxy-cert-auth = pywb.framework.certauth:main
""",
zip_safe=False,
classifiers=[
diff --git a/tests/test_integration.py b/tests/test_integration.py
index 8c9ee900..67bf698b 100644
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -389,7 +389,7 @@ class TestWb:
assert resp.status_int == 407
def test_proxy_pac(self):
- resp = self.testapp.get('/proxy.pac', extra_environ = dict(SERVER_NAME='pywb-proxy', SERVER_PORT='8080'))
+ resp = self.testapp.get('/proxy.pac', headers = [('Host', 'pywb-proxy:8080')])
assert resp.content_type == 'application/x-ns-proxy-autoconfig'
assert '"PROXY pywb-proxy:8080"' in resp.body
assert '"localhost"' in resp.body