diff --git a/CHANGES.rst b/CHANGES.rst index cea9f087..2635c87d 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,3 +1,11 @@ +pywb 0.6.0 changelist +~~~~~~~~~~~~~~~~~~~~~ + +* HTTPS Proxy Support! + +* Revamped HTTP/S system: proxy collection and capture time switching via cookie! + + pywb 0.5.3 changelist ~~~~~~~~~~~~~~~~~~~~~ * better framed replay for non-html content -- include live rewrite timestamp via temp 'pywb.timestamp' cookie, updating banner of iframe load. All timestamp formatting moved to client-side for better customization. @@ -6,6 +14,7 @@ pywb 0.5.3 changelist * banner-only rewrite mode (via 'bn_' modifier) to support only banner insertion with no rewriting, server-side or client-side. + pywb 0.5.1 changelist ~~~~~~~~~~~~~~~~~~~~~ minor fixes: diff --git a/README.rst b/README.rst index 078ad24b..3640c69d 100644 --- a/README.rst +++ b/README.rst @@ -1,11 +1,11 @@ -PyWb 0.5.3 +PyWb 0.6.0 ========== -.. image:: https://travis-ci.org/ikreymer/pywb.png?branch=develop +.. image:: https://travis-ci.org/ikreymer/pywb.png?branch=https-proxy :target: https://travis-ci.org/ikreymer/pywb -.. image:: https://coveralls.io/repos/ikreymer/pywb/badge.png?branch=develop - :target: https://coveralls.io/r/ikreymer/pywb?branch=develop +.. image:: https://coveralls.io/repos/ikreymer/pywb/badge.png?branch=https-proxy + :target: https://coveralls.io/r/ikreymer/pywb?branch=https-proxy pywb is a python implementation of web archival replay tools, sometimes also known as 'Wayback Machine'. @@ -21,6 +21,7 @@ This README contains a basic overview of using pywb. After reading this intro, c * `pywb-samples `_ provides additional archive samples with difficult-to-replay content. 
+* `pywb-proxy-demo `_ showcases the revamped HTTP/S proxy replay system (available from pywb 0.6.0) The following deployed applications use pywb: diff --git a/config.yaml b/config.yaml index 937b4545..fc2290ba 100644 --- a/config.yaml +++ b/config.yaml @@ -109,3 +109,6 @@ enable_memento: true # Replay content in an iframe framed_replay: true + +debug_echo_env: True + diff --git a/pywb/framework/certauth.py b/pywb/framework/certauth.py new file mode 100644 index 00000000..260f5bdc --- /dev/null +++ b/pywb/framework/certauth.py @@ -0,0 +1,228 @@ +import logging +import os +from OpenSSL import crypto +from OpenSSL.SSL import FILETYPE_PEM +import random +from argparse import ArgumentParser + + +#================================================================= +# Duration of 100 years +CERT_DURATION = 100 * 365 * 24 * 60 * 60 + +CERTS_DIR = './pywb-certs/' + +CERT_NAME = 'pywb https proxy replay CA' + +CERT_CA_FILE = './pywb-ca.pem' + + +#================================================================= +class CertificateAuthority(object): + """ + Utility class for signing individual certificate + with a root cert. + + Static generate_ca_root() method for creating the root cert + + All certs saved on filesystem. Individual certs are stored + in specified certs_dir and reused if previously created. 
+ """ + + def __init__(self, ca_file, certs_dir): + if not ca_file: + ca_file = CERT_CA_FILE + + if not certs_dir: + certs_dir = CERTS_DIR + + self.ca_file = ca_file + self.certs_dir = certs_dir + + # read previously created root cert + self.cert, self.key = self.read_pem(ca_file) + + if not os.path.exists(certs_dir): + os.mkdir(certs_dir) + + def get_cert_for_host(self, host, overwrite=False, wildcard=False): + host_filename = os.path.join(self.certs_dir, host) + '.pem' + + if not overwrite and os.path.exists(host_filename): + return False, host_filename + + self.generate_host_cert(host, self.cert, self.key, host_filename, + wildcard) + + return True, host_filename + + def get_root_PKCS12(self): + p12 = crypto.PKCS12() + p12.set_certificate(self.cert) + p12.set_privatekey(self.key) + return p12.export() + + @staticmethod + def _make_cert(certname): + cert = crypto.X509() + cert.set_version(2) + cert.set_serial_number(random.randint(0, 2 ** 64 - 1)) + cert.get_subject().CN = certname + + cert.gmtime_adj_notBefore(0) + cert.gmtime_adj_notAfter(CERT_DURATION) + return cert + + @staticmethod + def generate_ca_root(ca_file, certname=None, overwrite=False): + if not certname: + certname = CERT_NAME + + if not ca_file: + ca_file = CERT_CA_FILE + + if not overwrite and os.path.exists(ca_file): + cert, key = CertificateAuthority.read_pem(ca_file) + return False, cert, key + + # Generate key + key = crypto.PKey() + key.generate_key(crypto.TYPE_RSA, 2048) + + # Generate cert + cert = CertificateAuthority._make_cert(certname) + + cert.set_issuer(cert.get_subject()) + cert.set_pubkey(key) + cert.add_extensions([ + crypto.X509Extension(b"basicConstraints", + True, + b"CA:TRUE, pathlen:0"), + + crypto.X509Extension(b"keyUsage", + True, + b"keyCertSign, cRLSign"), + + crypto.X509Extension(b"subjectKeyIdentifier", + False, + b"hash", + subject=cert), + ]) + cert.sign(key, "sha1") + + # Write cert + key + CertificateAuthority.write_pem(ca_file, cert, key) + return True, cert, key + 
def main():
    """Command-line entry point for creating certificates.

    Without ``-r/--use-root`` a root certificate is generated and written to
    the ``output_pem_file`` argument.  With ``-r`` the positional argument is
    handed to CertificateAuthority.get_cert_for_host() and a certificate
    signed by the given root is created in the certs directory.

    Existing files are left alone unless ``-f/--force`` is given.
    """
    parser = ArgumentParser(description='Cert Auth Cert Maker')

    parser.add_argument('output_pem_file', help='path to cert .pem file')

    parser.add_argument('-r', '--use-root',
                        help=('use specified root cert (.pem file) ' +
                              'to create signed cert'))

    parser.add_argument('-n', '--name', action='store', default=CERT_NAME,
                        help='name for root certificate')

    parser.add_argument('-d', '--certs-dir', default=CERTS_DIR)

    parser.add_argument('-f', '--force', action='store_true')

    parser.add_argument('-w', '--wildcard_cert', action='store_true',
                        help='add wildcard SAN to host: *., ')

    result = parser.parse_args()

    overwrite = result.force

    # Create a new signed certificate using specified root
    if result.use_root:
        # argparse derives the attribute name from the long option string,
        # so '--wildcard_cert' is exposed as 'wildcard_cert'
        wildcard = result.wildcard_cert

        # the constructor accepts only ca_file and certs_dir
        ca = CertificateAuthority(ca_file=result.use_root,
                                  certs_dir=result.certs_dir)

        created, host_filename = ca.get_cert_for_host(result.output_pem_file,
                                                      overwrite, wildcard)

        if created:
            print ('Created new cert "' + host_filename +
                   '" signed by root cert ' +
                   result.use_root)
        else:
            print ('Cert "' + host_filename + '" already exists,' +
                   ' use -f to overwrite')

    # Create new root certificate
    else:
        created, c, k = (CertificateAuthority.
                         generate_ca_root(result.output_pem_file,
                                          result.name,
                                          overwrite))

        if created:
            print ('Created new root cert: "' + result.output_pem_file + '"')
        else:
            print ('Root cert "' + result.output_pem_file +
                   '" already exists,' + ' use -f to overwrite')
""" + PAC_PATH = '/proxy.pac' + BLOCK_SIZE = 4096 + DEF_MAGIC_NAME = 'pywb.proxy' + + CERT_DL_PEM = '/pywb-ca.pem' + CERT_DL_P12 = '/pywb-ca.p12' + + EXTRA_HEADERS = {'cache-control': 'no-cache', + 'p3p': 'CP="NOI ADM DEV COM NAV OUR STP"'} + def __init__(self, routes, **kwargs): - self.routes = routes self.hostpaths = kwargs.get('hostpaths') self.error_view = kwargs.get('error_view') @@ -54,61 +72,124 @@ class ProxyRouter(object): if proxy_options: proxy_options = proxy_options.get('proxy_options', {}) - self.auth_msg = proxy_options.get('auth_msg', - 'Please enter name of a collection to use for proxy mode') + self.magic_name = proxy_options.get('magic_name') + if not self.magic_name: + self.magic_name = self.DEF_MAGIC_NAME + proxy_options['magic_name'] = self.magic_name - self.use_default_coll = proxy_options.get('use_default_coll', True) + self.extra_headers = proxy_options.get('extra_headers') + if not self.extra_headers: + self.extra_headers = self.EXTRA_HEADERS + proxy_options['extra_headers'] = self.extra_headers + + if proxy_options.get('cookie_resolver'): + self.resolver = CookieResolver(routes, proxy_options) + else: + self.resolver = ProxyAuthResolver(routes, proxy_options) self.unaltered = proxy_options.get('unaltered_replay', False) + self.proxy_pac_path = proxy_options.get('pac_path', self.PAC_PATH) + + + if not proxy_options.get('enable_https_proxy'): + self.ca = None + self.proxy_cert_dl_view = None + return + + # HTTPS Only Options + ca_file = proxy_options.get('root_ca_file') + + # attempt to create the root_ca_file if doesn't exist + # (generally recommended to create this seperately) + certname = proxy_options.get('root_ca_name') + CertificateAuthority.generate_ca_root(certname, ca_file) + + certs_dir = proxy_options.get('certs_dir') + self.ca = CertificateAuthority(ca_file=ca_file, + certs_dir=certs_dir) + + self.proxy_cert_dl_view = proxy_options.get('proxy_cert_download_view') + def __call__(self, env): - url = env['REL_REQUEST_URI'] + 
is_https = (env['REQUEST_METHOD'] == 'CONNECT') - if url.endswith('/proxy.pac'): - return self.make_pac_response(env) + # for non-https requests, check pac path and non-proxy urls + if not is_https: + url = env['REL_REQUEST_URI'] - if not url.startswith('http://'): - return None + if url == self.proxy_pac_path: + return self.make_pac_response(env) - proxy_auth = env.get('HTTP_PROXY_AUTHORIZATION') + if not url.startswith(('http://', 'https://')): + return None + + env['pywb.proxy_scheme'] = 'http' route = None coll = None matcher = None + response = None + ts = None - if proxy_auth: - proxy_coll = self.read_basic_auth_coll(proxy_auth) + # check resolver, for pre connect resolve + if self.resolver.pre_connect: + route, coll, matcher, ts, response = self.resolver.resolve(env) + if response: + return response - if not proxy_coll: - return self.proxy_auth_coll_response() + # do connect, then get updated url + if is_https: + response = self.handle_connect(env) + if response: + return response - proxy_coll = '/' + proxy_coll + '/' - - for r in self.routes: - matcher, c = r.is_handling(proxy_coll) - if matcher: - route = r - coll = c - break - - if not route: - return self.proxy_auth_coll_response() - - # if 'use_default_coll' or only one collection, use that - # for proxy mode - elif self.use_default_coll or len(self.routes) == 1: - route = self.routes[0] - coll = self.routes[0].regex.pattern - - # otherwise, require proxy auth 407 to select collection + url = env['REL_REQUEST_URI'] else: - return self.proxy_auth_coll_response() + parts = urlparse.urlsplit(env['REL_REQUEST_URI']) + hostport = parts.netloc.split(':', 1) + env['pywb.proxy_host'] = hostport[0] + env['pywb.proxy_port'] = hostport[1] if len(hostport) == 2 else '' + env['pywb.proxy_req_uri'] = parts.path + if parts.query: + env['pywb.proxy_req_uri'] += '?' 
+ parts.query + + env['pywb_proxy_magic'] = self.magic_name + + # route (static) and other resources to archival replay + if env['pywb.proxy_host'] == self.magic_name: + env['REL_REQUEST_URI'] = env['pywb.proxy_req_uri'] + + # special case for proxy install + response = self.handle_cert_install(env) + if response: + return response + + return None + + # check resolver, post connect + if not self.resolver.pre_connect: + route, coll, matcher, ts, response = self.resolver.resolve(env) + if response: + return response + + host_prefix = env['pywb.proxy_scheme'] + '://' + self.magic_name + rel_prefix = '' + + # special case for proxy calendar + if (env['pywb.proxy_host'] == 'query.' + self.magic_name): + url = env['pywb.proxy_req_uri'][1:] + rel_prefix = '/' + + if ts is not None: + url = ts + '/' + url wbrequest = route.request_class(env, request_uri=url, wb_url_str=url, coll=coll, - host_prefix=self.hostpaths[0], + host_prefix=host_prefix, + rel_prefix=rel_prefix, wburl_class=route.handler.get_wburl_type(), urlrewriter_class=HttpsUrlRewriter, use_abs_prefix=False, @@ -119,13 +200,170 @@ class ProxyRouter(object): if self.unaltered: wbrequest.wb_url.mod = 'id_' + elif is_https: + wbrequest.wb_url.mod = 'bn_' - return route.handler(wbrequest) + response = route.handler(wbrequest) + + if wbrequest.wb_url and wbrequest.wb_url.is_replay(): + response.status_headers.replace_headers(self.extra_headers) + + return response + + def get_request_socket(self, env): + if not self.ca: + return None + + sock = None + + if env.get('uwsgi.version'): + try: + import uwsgi + fd = uwsgi.connection_fd() + conn = socket.fromfd(fd, socket.AF_INET, socket.SOCK_STREAM) + sock = socket.socket(_sock=conn) + except Exception: + pass + elif env.get('gunicorn.socket'): + sock = env['gunicorn.socket'] + + if not sock: + # attempt to find socket from wsgi.input + input_ = env.get('wsgi.input') + if input_ and hasattr(input_, '_sock'): + sock = socket.socket(_sock=input_._sock) + + return sock + + def 
handle_connect(self, env): + sock = self.get_request_socket(env) + if not sock: + return WbResponse.text_response('HTTPS Proxy Not Supported', + '405 HTTPS Proxy Not Supported') + + sock.send('HTTP/1.0 200 Connection Established\r\n') + sock.send('Server: pywb proxy\r\n') + sock.send('\r\n') + + hostname, port = env['REL_REQUEST_URI'].split(':') + cert_host = hostname + + host_parts = hostname.split('.', 1) + if len(host_parts) == 2 and '.' in host_parts[1]: + cert_host = host_parts[1] + + created, certfile = self.ca.get_cert_for_host(cert_host, + wildcard=True) + + try: + ssl_sock = ssl.wrap_socket(sock, + server_side=True, + certfile=certfile, + ciphers="ALL", + suppress_ragged_eofs=False, + #ssl_version=ssl.PROTOCOL_TLSv1) + ssl_version=ssl.PROTOCOL_SSLv23) + env['pywb.proxy_ssl_sock'] = ssl_sock + + buffreader = BufferedReader(ssl_sock, block_size=self.BLOCK_SIZE) + + statusline = buffreader.readline().rstrip() + + except Exception as se: + raise BadRequestException(se.message) + + statusparts = statusline.split(' ') + + if len(statusparts) < 3: + raise BadRequestException('Invalid Proxy Request: ' + statusline) + + env['REQUEST_METHOD'] = statusparts[0] + env['REL_REQUEST_URI'] = ('https://' + + env['REL_REQUEST_URI'].replace(':443', '') + + statusparts[1]) + + env['SERVER_PROTOCOL'] = statusparts[2].strip() + + env['pywb.proxy_scheme'] = 'https' + + env['pywb.proxy_host'] = hostname + env['pywb.proxy_port'] = port + env['pywb.proxy_req_uri'] = statusparts[1] + + queryparts = env['REL_REQUEST_URI'].split('?', 1) + env['PATH_INFO'] = queryparts[0] + env['QUERY_STRING'] = queryparts[1] if len(queryparts) > 1 else '' + + while True: + line = buffreader.readline() + if line: + line = line.rstrip() + + if not line: + break + + parts = line.split(':', 1) + if len(parts) < 2: + continue + + name = parts[0].strip() + value = parts[1].strip() + + name = name.replace('-', '_').upper() + + if not name in ('CONTENT_LENGTH', 'CONTENT_TYPE'): + name = 'HTTP_' + name + + 
env[name] = value + + remain = buffreader.rem_length() + if remain > 0: + remainder = buffreader.read(self.BLOCK_SIZE) + env['wsgi.input'] = BufferedReader(ssl_sock, + block_size=self.BLOCK_SIZE, + starting_data=remainder) + + def handle_cert_install(self, env): + if env['pywb.proxy_req_uri'] in ('/', '/index.html', '/index.html'): + available = (self.ca is not None) + + if self.proxy_cert_dl_view: + return (self.proxy_cert_dl_view. + render_response(available=available, + pem_path=self.CERT_DL_PEM, + p12_path=self.CERT_DL_P12)) + else: + return None + + elif env['pywb.proxy_req_uri'] == self.CERT_DL_PEM: + if not self.ca: + return None + + buff = '' + with open(self.ca.ca_file) as fh: + buff = fh.read() + + content_type = 'application/x-x509-ca-cert' + + return WbResponse.text_response(buff, + content_type=content_type) + + elif env['pywb.proxy_req_uri'] == self.CERT_DL_P12: + if not self.ca: + return None + + buff = self.ca.get_root_PKCS12() + + content_type = 'application/x-pkcs12' + + return WbResponse.text_response(buff, + content_type=content_type) + else: + return None # Proxy Auto-Config (PAC) script for the proxy def make_pac_response(self, env): - import os - hostname = os.environ.get('PYWB_HOST_NAME') + hostname = env.get('HTTP_HOST') if not hostname: server_hostport = env['SERVER_NAME'] + ':' + env['SERVER_PORT'] hostonly = env['SERVER_NAME'] @@ -143,33 +381,8 @@ class ProxyRouter(object): buff += direct.format(hostonly) - #buff += '\n return "PROXY {0}";\n}}\n'.format(self.hostpaths[0]) buff += '\n return "PROXY {0}";\n}}\n'.format(server_hostport) content_type = 'application/x-ns-proxy-autoconfig' return WbResponse.text_response(buff, content_type=content_type) - - def proxy_auth_coll_response(self): - proxy_msg = 'Basic realm="{0}"'.format(self.auth_msg) - - headers = [('Content-Type', 'text/plain'), - ('Proxy-Authenticate', proxy_msg)] - - status_headers = StatusAndHeaders('407 Proxy Authentication', headers) - - value = self.auth_msg - - return 
class UwsgiCache(object):
    """Mapping-style wrapper that forwards item access to the uwsgi cache.

    Each operation delegates to the matching ``uwsgi.cache_*`` function, so
    the object can stand in where a plain dict is used as the session cache.
    """

    def __setitem__(self, key, value):
        uwsgi.cache_update(key, value)

    def __getitem__(self, key):
        return uwsgi.cache_get(key)

    def __contains__(self, key):
        return uwsgi.cache_exists(key)

    def __delitem__(self, key):
        uwsgi.cache_del(key)
class ProxyAuthResolver(BaseCollResolver):
    """Select the proxy collection from the Proxy-Authorization header.

    The user name of the HTTP Basic credentials is used as the collection
    name.  When a collection has to be chosen, a 407 response carrying a
    ``Proxy-Authenticate`` challenge is returned.
    """

    DEFAULT_MSG = 'Please enter name of a collection to use with proxy mode'

    def __init__(self, routes, config):
        # proxy auth is resolved before any CONNECT handling
        config['pre_connect'] = True
        super(ProxyAuthResolver, self).__init__(routes, config)
        self.auth_msg = config.get('auth_msg', self.DEFAULT_MSG)

    def get_proxy_coll_ts(self, env):
        """Return ``(collection, None)`` read from the request headers.

        The collection is None when no Proxy-Authorization header was sent
        and ``''`` when the header could not be parsed.
        """
        proxy_auth = env.get('HTTP_PROXY_AUTHORIZATION')

        if not proxy_auth:
            return None, None

        proxy_coll = self.read_basic_auth_coll(proxy_auth)
        return proxy_coll, None

    def select_coll_response(self, env):
        """Build the 407 response asking the client for a collection name."""
        proxy_msg = 'Basic realm="{0}"'.format(self.auth_msg)

        headers = [('Content-Type', 'text/plain'),
                   ('Proxy-Authenticate', proxy_msg)]

        status_headers = StatusAndHeaders('407 Proxy Authentication', headers)

        value = self.auth_msg

        return WbResponse(status_headers, value=[value])

    @staticmethod
    def read_basic_auth_coll(value):
        """Extract the user name from a ``Basic <base64>`` header value.

        Returns ``''`` for anything that is not a well-formed Basic
        credential, including a payload that is not valid base64.
        """
        parts = value.split(' ')
        if parts[0].lower() != 'basic':
            return ''

        if len(parts) != 2:
            return ''

        # The header is client-supplied: a payload that is not valid base64
        # is an invalid credential, not a server error.
        try:
            user_pass = base64.b64decode(parts[1])
        except (TypeError, ValueError):
            return ''

        return user_pass.split(':')[0]
+ self.magic_name + + self.cookie_name = config.get('cookie_name', '__pywb_coll') + self.proxy_select_view = config.get('proxy_select_view') + + self.extra_headers = config.get('extra_headers') + + if uwsgi_cache: + self.cache = UwsgiCache() + else: + self.cache = {} + + def get_proxy_coll_ts(self, env): + coll, ts, sesh_id = self.get_coll(env) + return coll, ts + + def select_coll_response(self, env): + return self.make_magic_response('auto', + env['REL_REQUEST_URI'], + env) + + def resolve(self, env): + server_name = env['pywb.proxy_host'] + + if ('.' + self.magic_name) in server_name: + response = self.handle_magic_page(env) + if response: + return None, None, None, None, response + + return super(CookieResolver, self).resolve(env) + + def handle_magic_page(self, env): + request_url = env['REL_REQUEST_URI'] + parts = urlparse.urlsplit(request_url) + server_name = env['pywb.proxy_host'] + + path_url = parts.path[1:] + if parts.query: + path_url += '?' + parts.query + + if server_name.startswith('auto'): + coll, ts, sesh_id = self.get_coll(env) + + if coll: + return self.make_sethost_cookie_response(sesh_id, path_url, env) + else: + return self.make_magic_response('select', path_url, env) + + elif server_name.startswith('query.'): + wb_url = WbUrl(path_url) + + # only dealing with specific timestamp setting + if wb_url.is_query(): + return None + + coll, ts, sesh_id = self.get_coll(env) + if not coll: + return self.make_magic_response('select', path_url, env) + + self.set_ts(sesh_id, wb_url.timestamp) + return self.make_redir_response(wb_url.url) + + elif server_name.endswith(self.set_prefix): + old_sesh_id = self.extract_client_cookie(env, self.cookie_name) + sesh_id = self.create_renew_sesh_id(old_sesh_id) + + if sesh_id != old_sesh_id: + headers = self.make_cookie_headers(sesh_id, self.magic_name) + else: + headers = None + + coll = server_name[:-len(self.set_prefix)] + + # set sesh value + self.set_coll(sesh_id, coll) + + return 
self.make_sethost_cookie_response(sesh_id, path_url, env, + headers=headers) + + elif self.sethost_prefix in server_name: + inx = server_name.find(self.sethost_prefix) + sesh_id = server_name[:inx] + + domain = server_name[inx + len(self.sethost_prefix):] + + headers = self.make_cookie_headers(sesh_id, domain) + + full_url = env['pywb.proxy_scheme'] + '://' + domain + full_url += '/' + path_url + return self.make_redir_response(full_url, headers=headers) + + elif 'select.' in server_name: + if not self.proxy_select_view: + return WbResponse.text_response('select text for ' + path_url) + + coll, ts, sesh_id = self.get_coll(env) + + #scheme = env['pywb.proxy_scheme'] + '://' + route_temp = '-set.' + self.magic_name + '/' + path_url + + try: + return (self.proxy_select_view. + render_response(routes=self.routes, + route_temp=route_temp, + coll=coll, + url=path_url)) + except Exception as exc: + raise + + #else: + # msg = 'Invalid Magic Path: ' + url + # print msg + # return WbResponse.text_response(msg, status='404 Not Found') + + def make_cookie_headers(self, sesh_id, domain): + cookie_val = '{0}={1}; Path=/; Domain=.{2}; HttpOnly' + cookie_val = cookie_val.format(self.cookie_name, sesh_id, domain) + headers = [('Set-Cookie', cookie_val)] + return headers + + def make_sethost_cookie_response(self, sesh_id, path_url, + env, headers=None): + if '://' not in path_url: + path_url = 'http://' + path_url + + path_parts = urlparse.urlsplit(path_url) + + new_url = path_parts.path[1:] + if path_parts.query: + new_url += '?' + path_parts.query + + return self.make_magic_response(sesh_id + '-sethost', new_url, env, + suffix=path_parts.netloc, + headers=headers) + + + def make_magic_response(self, prefix, url, env, + suffix=None, headers=None): + full_url = env['pywb.proxy_scheme'] + '://' + prefix + '.' + full_url += self.magic_name + if suffix: + full_url += '.' 
+ suffix + full_url += '/' + url + return self.make_redir_response(full_url, headers=headers) + + def set_coll(self, sesh_id, coll): + self.cache[sesh_id + ':c'] = coll + + def set_ts(self, sesh_id, ts): + if ts: + self.cache[sesh_id + ':t'] = ts + # this ensures that omitting timestamp will reset to latest + # capture by deleting the cache entry + else: + del self.cache[sesh_id + ':t'] + + def get_coll(self, env): + sesh_id = self.extract_client_cookie(env, self.cookie_name) + + coll = None + ts = None + if sesh_id: + coll = self.cache[sesh_id + ':c'] + try: + ts = self.cache[sesh_id + ':t'] + except KeyError: + pass + + return coll, ts, sesh_id + + def create_renew_sesh_id(self, sesh_id, force=False): + #if sesh_id in self.cache and not force: + if sesh_id and ((sesh_id + ':c') in self.cache) and not force: + return sesh_id + + sesh_id = base64.b32encode(os.urandom(5)).lower() + return sesh_id + + def make_redir_response(self, url, headers=None): + if not headers: + headers = [] + + if self.extra_headers: + for name, value in self.extra_headers.iteritems(): + headers.append((name, value)) + + return WbResponse.redir_response(url, headers=headers) + + @staticmethod + def extract_client_cookie(env, cookie_name): + cookie_header = env.get('HTTP_COOKIE') + if not cookie_header: + return None + + # attempt to extract cookie_name only + inx = cookie_header.find(cookie_name) + if inx < 0: + return None + + end_inx = cookie_header.find(';', inx) + if end_inx > 0: + value = cookie_header[inx:end_inx] + else: + value = cookie_header[inx:] + + value = value.split('=') + if len(value) < 2: + return None + + value = value[1].strip() + return value diff --git a/pywb/framework/wsgi_wrappers.py b/pywb/framework/wsgi_wrappers.py index 3729a660..3498c819 100644 --- a/pywb/framework/wsgi_wrappers.py +++ b/pywb/framework/wsgi_wrappers.py @@ -50,6 +50,42 @@ class WSGIApp(object): # Top-level wsgi application def __call__(self, env, start_response): + if env['REQUEST_METHOD'] == 
'CONNECT': + return self.handle_connect(env, start_response) + else: + return self.handle_methods(env, start_response) + + def handle_connect(self, env, start_response): + def ssl_start_response(statusline, headers): + ssl_sock = env.get('pywb.proxy_ssl_sock') + if not ssl_sock: + start_response(statusline, headers) + return + + env['pywb.proxy_statusline'] = statusline + + ssl_sock.write('HTTP/1.1 ' + statusline + '\r\n') + for name, value in headers: + ssl_sock.write(name + ': ' + value + '\r\n') + + resp_iter = self.handle_methods(env, ssl_start_response) + + ssl_sock = env.get('pywb.proxy_ssl_sock') + if not ssl_sock: + return resp_iter + + ssl_sock.write('\r\n') + + for obj in resp_iter: + if obj: + ssl_sock.write(obj) + ssl_sock.close() + + start_response(env['pywb.proxy_statusline'], []) + + return [] + + def handle_methods(self, env, start_response): if env.get('SCRIPT_NAME') or not env.get('REQUEST_URI'): env['REL_REQUEST_URI'] = rel_request_uri(env) else: @@ -89,22 +125,29 @@ class WSGIApp(object): else: err_url = None + try: + err_msg = exc.message.encode('utf-8') + except Exception: + err_msg = exc.message + err_url = '' + if print_trace: import traceback err_details = traceback.format_exc(exc) print err_details else: - logging.info(str(exc)) + logging.info(err_msg) err_details = None if error_view: return error_view.render_response(exc_type=type(exc).__name__, - err_msg=str(exc), + err_msg=err_msg, err_details=err_details, status=status, + env=env, err_url=err_url) else: - return WbResponse.text_response(status + ' Error: ' + str(exc), + return WbResponse.text_response(status + ' Error: ' + err_msg, status=status) #================================================================= @@ -145,6 +188,10 @@ def init_app(init_func, load_yaml=True, config_file=None, config={}): def start_wsgi_server(the_app, name, default_port=None): # pragma: no cover from wsgiref.simple_server import make_server + # disable is_hop_by_hop restrictions + import wsgiref.handlers 
+ wsgiref.handlers.is_hop_by_hop = lambda x: False + port = the_app.port if not port: diff --git a/pywb/rewrite/header_rewriter.py b/pywb/rewrite/header_rewriter.py index 2dfc824d..2d505e88 100644 --- a/pywb/rewrite/header_rewriter.py +++ b/pywb/rewrite/header_rewriter.py @@ -37,7 +37,8 @@ class HeaderRewriter: ENCODING_HEADERS = ['content-encoding'] - REMOVE_HEADERS = ['transfer-encoding'] + REMOVE_HEADERS = ['transfer-encoding', 'content-security-policy', + 'strict-transport-security'] PROXY_NO_REWRITE_HEADERS = ['content-length'] @@ -90,7 +91,10 @@ class HeaderRewriter: new_headers = [] removed_header_dict = {} - cookie_rewriter = urlrewriter.get_cookie_rewriter() + if urlrewriter: + cookie_rewriter = urlrewriter.get_cookie_rewriter() + else: + cookie_rewriter = None for (name, value) in headers: @@ -99,7 +103,7 @@ class HeaderRewriter: if lowername in self.PROXY_HEADERS: new_headers.append((name, value)) - elif lowername in self.URL_REWRITE_HEADERS: + elif urlrewriter and lowername in self.URL_REWRITE_HEADERS: new_headers.append((name, urlrewriter.rewrite(value))) elif lowername in self.ENCODING_HEADERS: @@ -109,7 +113,8 @@ class HeaderRewriter: new_headers.append((name, value)) elif lowername in self.REMOVE_HEADERS: - removed_header_dict[lowername] = value + removed_header_dict[lowername] = value + new_headers.append((self.header_prefix + name, value)) elif (lowername in self.PROXY_NO_REWRITE_HEADERS and not content_rewritten): @@ -120,7 +125,9 @@ class HeaderRewriter: cookie_list = cookie_rewriter.rewrite(value) new_headers.extend(cookie_list) - else: + elif urlrewriter: new_headers.append((self.header_prefix + name, value)) + else: + new_headers.append((name, value)) return (new_headers, removed_header_dict) diff --git a/pywb/rewrite/rewrite_content.py b/pywb/rewrite/rewrite_content.py index 207d879e..2225bbaf 100644 --- a/pywb/rewrite/rewrite_content.py +++ b/pywb/rewrite/rewrite_content.py @@ -69,6 +69,10 @@ class RewriteContent: status_headers, stream = 
self.sanitize_content(headers, stream) return (status_headers, self.stream_to_gen(stream), False) + + if wb_url.is_banner_only: + urlrewriter = None + (rewritten_headers, stream) = self.rewrite_headers(urlrewriter, headers, stream) diff --git a/pywb/rewrite/test/test_header_rewriter.py b/pywb/rewrite/test/test_header_rewriter.py index 1a2b2cea..0b22d533 100644 --- a/pywb/rewrite/test/test_header_rewriter.py +++ b/pywb/rewrite/test/test_header_rewriter.py @@ -40,17 +40,19 @@ HTTP Headers Rewriting 'removed_header_dict': {'content-encoding': 'gzip', 'transfer-encoding': 'chunked'}, 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Content-Length', '199999'), - ('Content-Type', 'text/javascript')]), + ('Content-Type', 'text/javascript'), + ('X-Archive-Orig-Transfer-Encoding', 'chunked')]), 'text_type': 'js'} -# Binary -- transfer-encoding removed +# Binary -- transfer-encoding rewritten >>> _test_headers([('Content-Length', '200000'), ('Content-Type', 'image/png'), ('Set-Cookie', 'foo=bar; Path=/;'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')]) {'charset': None, 'removed_header_dict': {'transfer-encoding': 'chunked'}, 'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('Content-Length', '200000'), ('Content-Type', 'image/png'), ('Set-Cookie', 'foo=bar; Path=/web/20131010/http://example.com/'), - ('Content-Encoding', 'gzip')]), + ('Content-Encoding', 'gzip'), + ('X-Archive-Orig-Transfer-Encoding', 'chunked')]), 'text_type': None} """ diff --git a/pywb/rewrite/url_rewriter.py b/pywb/rewrite/url_rewriter.py index 5b2f8e7b..c89e9a21 100644 --- a/pywb/rewrite/url_rewriter.py +++ b/pywb/rewrite/url_rewriter.py @@ -142,7 +142,7 @@ class HttpsUrlRewriter(UrlRewriter): else: return url - def get_timestamp_url(self, timestamp, url): + def get_timestamp_url(self, timestamp, url=''): return url def get_abs_url(self, url=''): diff --git a/pywb/static/wb.js b/pywb/static/wb.js 
index 2d8b2470..0511f983 100644 --- a/pywb/static/wb.js +++ b/pywb/static/wb.js @@ -72,6 +72,14 @@ function init_banner() { } text += "" + capture_str + ""; + + if (wbinfo.proxy_magic && wbinfo.url) { + var select_url = wbinfo.proxy_magic + "/" + wbinfo.url; + var query_url = wbinfo.proxy_magic + "/*/" + wbinfo.url; + text += ' All Capture Times'; + text += '
' + text += 'From collection "' + wbinfo.coll + '" All Collections'; + } banner.innerHTML = text; diff --git a/pywb/ui/error.html b/pywb/ui/error.html index b3a8c478..b122fc38 100644 --- a/pywb/ui/error.html +++ b/pywb/ui/error.html @@ -9,3 +9,10 @@

{% endif %} + +{% if env.pywb_proxy_magic and err_url and status == '404 Not Found' %} +

+Try Different Collection +

+{% endif %} + diff --git a/pywb/ui/head_insert.html b/pywb/ui/head_insert.html index 2cd94ab5..c97cb86a 100644 --- a/pywb/ui/head_insert.html +++ b/pywb/ui/head_insert.html @@ -2,7 +2,7 @@ {% if rule.js_rewrite_location and include_wombat %} diff --git a/pywb/ui/proxy_cert_download.html b/pywb/ui/proxy_cert_download.html new file mode 100644 index 00000000..71255e3a --- /dev/null +++ b/pywb/ui/proxy_cert_download.html @@ -0,0 +1,14 @@ +

HTTPS Certificate For PyWb Web Archive Replay

+{% if not available %} +

Sorry, HTTPS support is not configured for this proxy. However, the proxy should work in HTTP mode.

+{% else %} +

Download for all platforms (except Windows):

+

Download Certificate (All except Windows)

+ +

(If you see the Already Installed message, then no further action is necessary and you may start browsing!)

+{% endif %} + +

Download for Windows platforms:

+

Download Certificate (Windows Only)

+ + diff --git a/pywb/ui/proxy_select.html b/pywb/ui/proxy_select.html new file mode 100644 index 00000000..b06f68a2 --- /dev/null +++ b/pywb/ui/proxy_select.html @@ -0,0 +1,25 @@ + + +

Pywb Proxy Collection Selector

+{% if coll %} +

+Current collection is: {{ coll }} +

+{% else %} +

You have attempted to load the url {{ url }}, but there are multiple collections available.

+{% endif %} + +

Please select which collection you would like to use (You will be redirected back to {{ url }}): +

+ +
    +{% for route in routes %} +{% if route.path and route | is_wb_handler %} +
  • {{ route.path }}
  • +{% endif %} +{% endfor %} +
+ +

(Once selected, you will not be prompted again; however, you can return to this page to switch collections.)

+ + diff --git a/pywb/utils/statusandheaders.py b/pywb/utils/statusandheaders.py index ae3fc261..70ba850c 100644 --- a/pywb/utils/statusandheaders.py +++ b/pywb/utils/statusandheaders.py @@ -3,6 +3,7 @@ Representation and parsing of HTTP-style status + headers """ import pprint +from copy import copy #================================================================= @@ -44,9 +45,26 @@ class StatusAndHeaders(object): self.headers.append((name, value)) return None + def replace_headers(self, header_dict): + """ + replace all headers in header_dict that already exist + add any remaining headers + """ + header_dict = copy(header_dict) + + for index in xrange(len(self.headers) - 1, -1, -1): + curr_name, curr_value = self.headers[index] + name_lower = curr_name.lower() + if name_lower in header_dict: + self.headers[index] = (curr_name, header_dict[name_lower]) + del header_dict[name_lower] + + for name, value in header_dict.iteritems(): + self.headers.append((name, value)) + def remove_header(self, name): """ - remove header (case-insensitive) + Remove header (case-insensitive) return True if header removed, False otherwise """ name_lower = name.lower() diff --git a/pywb/webapp/pywb_init.py b/pywb/webapp/pywb_init.py index 02efbf89..e9cf0791 100644 --- a/pywb/webapp/pywb_init.py +++ b/pywb/webapp/pywb_init.py @@ -34,6 +34,9 @@ DEFAULTS = { 'home_html': 'ui/index.html', 'error_html': 'ui/error.html', + 'proxy_select_html': 'ui/proxy_select.html', + 'proxy_cert_download_html': 'ui/proxy_cert_download.html', + 'template_globals': {'static_path': 'static/default'}, 'static_routes': {'static/default': 'pywb/static/'}, @@ -80,7 +83,7 @@ def create_live_handler(config): #================================================================= def init_route_config(value, config): - if isinstance(value, str): + if isinstance(value, str) or isinstance(value, list): value = dict(index_paths=value) route_config = DictChain(value, config) @@ -226,10 +229,27 @@ def 
create_wb_router(passed_config={}): if hasattr(route.handler, 'resolve_refs'): route.handler.resolve_refs(handler_dict) - # Check for new proxy mode! if config.get('enable_http_proxy', False): router = ProxyArchivalRouter + + view = J2TemplateView.create_template( + config.get('proxy_select_html'), + 'Proxy Coll Selector') + + if not 'proxy_options' in passed_config: + passed_config['proxy_options'] = {} + + if view: + passed_config['proxy_options']['proxy_select_view'] = view + + view = J2TemplateView.create_template( + config.get('proxy_cert_download_html'), + 'Proxy Cert Download') + + if view: + passed_config['proxy_options']['proxy_cert_download_view'] = view + else: router = ArchivalRouter @@ -250,6 +270,5 @@ def create_wb_router(passed_config={}): error_view=J2TemplateView.create_template(config.get('error_html'), 'Error Page'), - config=config ) diff --git a/setup.py b/setup.py index a170b93d..6b5482bf 100755 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ class PyTest(TestCommand): setup( name='pywb', - version='0.5.3', + version='0.6.0', url='https://github.com/ikreymer/pywb', author='Ilya Kreymer', author_email='ikreymer@gmail.com', @@ -70,6 +70,7 @@ setup( 'jinja2', 'surt', 'pyyaml', + 'pyopenssl', ], tests_require=[ 'pytest', @@ -86,6 +87,7 @@ setup( cdx-server = pywb.apps.cdx_server:main cdx-indexer = pywb.warc.cdxindexer:main live-rewrite-server = pywb.apps.live_rewrite_server:main + proxy-cert-auth = pywb.framework.certauth:main """, zip_safe=False, classifiers=[ diff --git a/tests/test_integration.py b/tests/test_integration.py index 8c9ee900..67bf698b 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -389,7 +389,7 @@ class TestWb: assert resp.status_int == 407 def test_proxy_pac(self): - resp = self.testapp.get('/proxy.pac', extra_environ = dict(SERVER_NAME='pywb-proxy', SERVER_PORT='8080')) + resp = self.testapp.get('/proxy.pac', headers = [('Host', 'pywb-proxy:8080')]) assert resp.content_type == 
'application/x-ns-proxy-autoconfig' assert '"PROXY pywb-proxy:8080"' in resp.body assert '"localhost"' in resp.body