1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Merge branch 'https-proxy' into develop

Merge readme and changelist from 0.5.3
This commit is contained in:
Ilya Kreymer 2014-08-04 23:15:57 -07:00
commit c251840141
20 changed files with 1037 additions and 88 deletions

View File

@ -1,3 +1,11 @@
pywb 0.6.0 changelist
~~~~~~~~~~~~~~~~~~~~~
* HTTPS Proxy Support!
* Revamped HTTP/S system: proxy collection and capture time switching via cookie!
pywb 0.5.3 changelist
~~~~~~~~~~~~~~~~~~~~~
* better framed replay for non-html content -- include live rewrite timestamp via temp 'pywb.timestamp' cookie, updating banner of iframe load. All timestamp formatting moved to client-side for better customization.
@ -6,6 +14,7 @@ pywb 0.5.3 changelist
* banner-only rewrite mode (via 'bn_' modifier) to support only banner insertion with no rewriting, server-side or client-side.
pywb 0.5.1 changelist
~~~~~~~~~~~~~~~~~~~~~
minor fixes:

View File

@ -1,11 +1,11 @@
PyWb 0.5.3
PyWb 0.6.0
==========
.. image:: https://travis-ci.org/ikreymer/pywb.png?branch=develop
.. image:: https://travis-ci.org/ikreymer/pywb.png?branch=https-proxy
:target: https://travis-ci.org/ikreymer/pywb
.. image:: https://coveralls.io/repos/ikreymer/pywb/badge.png?branch=develop
:target: https://coveralls.io/r/ikreymer/pywb?branch=develop
.. image:: https://coveralls.io/repos/ikreymer/pywb/badge.png?branch=https-proxy
:target: https://coveralls.io/r/ikreymer/pywb?branch=https-proxy
pywb is a python implementation of web archival replay tools, sometimes also known as 'Wayback Machine'.
@ -21,6 +21,7 @@ This README contains a basic overview of using pywb. After reading this intro, c
* `pywb-samples <https://github.com/ikreymer/pywb-samples>`_ provides additional archive samples with difficult-to-replay content.
* `pywb-proxy-demo <https://github.com/ikreymer/pywb-proxy-demo>`_ showcases the revamped HTTP/S proxy replay system (available from pywb 0.6.0).
The following deployed applications use pywb:

View File

@ -109,3 +109,6 @@ enable_memento: true
# Replay content in an iframe
framed_replay: true
debug_echo_env: True

228
pywb/framework/certauth.py Normal file
View File

@ -0,0 +1,228 @@
import logging
import os
from OpenSSL import crypto
from OpenSSL.SSL import FILETYPE_PEM
import random
from argparse import ArgumentParser
#=================================================================
# Duration of 100 years, in seconds; used as the notAfter offset
# of every certificate created by CertificateAuthority._make_cert()
CERT_DURATION = 100 * 365 * 24 * 60 * 60
# Default directory for per-host certificates (used when no certs_dir is given)
CERTS_DIR = './pywb-certs/'
# Default common name (CN) for a newly generated root certificate
CERT_NAME = 'pywb https proxy replay CA'
# Default path of the root certificate .pem file (used when no ca_file is given)
CERT_CA_FILE = './pywb-ca.pem'
#=================================================================
class CertificateAuthority(object):
    """
    Utility class for signing individual host certificates
    with a root cert.

    Static generate_ca_root() method for creating the root cert

    All certs saved on filesystem. Individual certs are stored
    in specified certs_dir and reused if previously created.
    """

    # Digest used for every signature made by this class.
    # SHA-1 signed certificates are no longer accepted by current TLS clients.
    SIGN_DIGEST = 'sha256'

    def __init__(self, ca_file, certs_dir):
        """
        Load an existing root cert + key and prepare the host cert directory.

        :param ca_file: path to the root .pem file (cert and key);
                        falls back to CERT_CA_FILE when empty
        :param certs_dir: directory for generated host certs;
                          falls back to CERTS_DIR when empty
        """
        if not ca_file:
            ca_file = CERT_CA_FILE

        if not certs_dir:
            certs_dir = CERTS_DIR

        self.ca_file = ca_file
        self.certs_dir = certs_dir

        # read previously created root cert
        self.cert, self.key = self.read_pem(ca_file)

        # makedirs (not mkdir) so that a nested certs_dir can be created
        if not os.path.exists(certs_dir):
            os.makedirs(certs_dir)

    def get_cert_for_host(self, host, overwrite=False, wildcard=False):
        """
        Return the .pem file for host, generating it if needed.

        :param host: hostname, also used as the file name inside certs_dir
        :param overwrite: regenerate even if the file already exists
        :param wildcard: add '<host>' and '*.<host>' subjectAltName entries
        :return: (created, filename) -- created is False when an
                 existing file was reused
        """
        host_filename = os.path.join(self.certs_dir, host) + '.pem'

        if not overwrite and os.path.exists(host_filename):
            return False, host_filename

        self.generate_host_cert(host, self.cert, self.key, host_filename,
                                wildcard)

        return True, host_filename

    def get_root_PKCS12(self):
        """Return the root cert and key exported as a PKCS12 blob."""
        p12 = crypto.PKCS12()
        p12.set_certificate(self.cert)
        p12.set_privatekey(self.key)
        return p12.export()

    @staticmethod
    def _make_cert(certname):
        """
        Create an unsigned X509 cert with subject CN=certname, a random
        serial number and a validity of CERT_DURATION seconds from now.
        """
        cert = crypto.X509()
        cert.set_version(2)
        cert.set_serial_number(random.randint(0, 2 ** 64 - 1))
        cert.get_subject().CN = certname

        cert.gmtime_adj_notBefore(0)
        cert.gmtime_adj_notAfter(CERT_DURATION)
        return cert

    @staticmethod
    def generate_ca_root(ca_file, certname=None, overwrite=False):
        """
        Create a self-signed root cert and write it (with its key) to ca_file.

        :param ca_file: output .pem path; falls back to CERT_CA_FILE
        :param certname: subject CN; falls back to CERT_NAME
        :param overwrite: recreate even if ca_file already exists
        :return: (created, cert, key) -- created is False when the
                 existing file was loaded instead
        """
        if not certname:
            certname = CERT_NAME

        if not ca_file:
            ca_file = CERT_CA_FILE

        if not overwrite and os.path.exists(ca_file):
            cert, key = CertificateAuthority.read_pem(ca_file)
            return False, cert, key

        # Generate key
        key = crypto.PKey()
        key.generate_key(crypto.TYPE_RSA, 2048)

        # Generate cert
        cert = CertificateAuthority._make_cert(certname)

        # self-signed: issuer is the cert's own subject
        cert.set_issuer(cert.get_subject())
        cert.set_pubkey(key)
        cert.add_extensions([
            crypto.X509Extension(b"basicConstraints",
                                 True,
                                 b"CA:TRUE, pathlen:0"),

            crypto.X509Extension(b"keyUsage",
                                 True,
                                 b"keyCertSign, cRLSign"),

            crypto.X509Extension(b"subjectKeyIdentifier",
                                 False,
                                 b"hash",
                                 subject=cert),
        ])
        cert.sign(key, CertificateAuthority.SIGN_DIGEST)

        # Write cert + key
        CertificateAuthority.write_pem(ca_file, cert, key)
        return True, cert, key

    @staticmethod
    def generate_host_cert(host, root_cert, root_key, host_filename,
                           wildcard=False):
        """
        Create a cert for host signed by the given root and write it
        (with its freshly generated key) to host_filename.

        :param wildcard: when set, add a subjectAltName extension
                         covering '<host>' and '*.<host>'
        :return: (cert, key)
        """
        # Generate key
        key = crypto.PKey()
        key.generate_key(crypto.TYPE_RSA, 2048)

        # Generate CSR
        req = crypto.X509Req()
        req.get_subject().CN = host
        req.set_pubkey(key)
        req.sign(key, CertificateAuthority.SIGN_DIGEST)

        # Generate Cert
        cert = CertificateAuthority._make_cert(host)

        cert.set_issuer(root_cert.get_subject())
        cert.set_pubkey(req.get_pubkey())

        if wildcard:
            DNS = 'DNS:'
            alt_hosts = ', '.join([DNS + host,
                                   DNS + '*.' + host])

            cert.add_extensions([
                crypto.X509Extension(b'subjectAltName',
                                     False,
                                     alt_hosts)])

        cert.sign(root_key, CertificateAuthority.SIGN_DIGEST)

        # Write cert + key
        CertificateAuthority.write_pem(host_filename, cert, key)
        return cert, key

    @staticmethod
    def write_pem(filename, cert, key):
        """Write the private key followed by the cert, PEM-encoded."""
        with open(filename, 'wb+') as f:
            f.write(crypto.dump_privatekey(FILETYPE_PEM, key))
            f.write(crypto.dump_certificate(FILETYPE_PEM, cert))

    @staticmethod
    def read_pem(filename):
        """
        Load (cert, key) from a single .pem file containing both.

        The file is read once and the same buffer is handed to both
        loaders, each of which picks out its own PEM section.
        """
        with open(filename, 'r') as f:
            pem_data = f.read()

        cert = crypto.load_certificate(FILETYPE_PEM, pem_data)
        key = crypto.load_privatekey(FILETYPE_PEM, pem_data)

        return cert, key
#=================================================================
def main():
    """
    Command line entry point for creating certificates.

    Without -r/--use-root, creates a new root cert at output_pem_file.
    With -r/--use-root, output_pem_file is treated as a host name and a
    cert for it, signed by the given root, is created in the certs dir.
    """
    parser = ArgumentParser(description='Cert Auth Cert Maker')

    parser.add_argument('output_pem_file', help='path to cert .pem file')

    parser.add_argument('-r', '--use-root',
                        help=('use specified root cert (.pem file) ' +
                              'to create signed cert'))

    parser.add_argument('-n', '--name', action='store', default=CERT_NAME,
                        help='name for root certificate')

    parser.add_argument('-d', '--certs-dir', default=CERTS_DIR)

    parser.add_argument('-f', '--force', action='store_true')

    parser.add_argument('-w', '--wildcard_cert', action='store_true',
                        help='add wildcard SAN to host: *.<host>, <host>')

    result = parser.parse_args()

    overwrite = result.force

    # Create a new signed certificate using specified root
    if result.use_root:
        # argparse stores '--wildcard_cert' under 'wildcard_cert'
        wildcard = result.wildcard_cert

        # CertificateAuthority() only takes ca_file and certs_dir;
        # the root already exists here so no cert name is needed
        ca = CertificateAuthority(ca_file=result.use_root,
                                  certs_dir=result.certs_dir)

        created, host_filename = ca.get_cert_for_host(result.output_pem_file,
                                                      overwrite, wildcard)

        if created:
            print('Created new cert "' + host_filename +
                  '" signed by root cert ' +
                  result.use_root)
        else:
            print('Cert "' + host_filename + '" already exists,' +
                  ' use -f to overwrite')

    # Create new root certificate
    else:
        created, c, k = (CertificateAuthority.
                         generate_ca_root(result.output_pem_file,
                                          result.name,
                                          overwrite))

        if created:
            print('Created new root cert: "' + result.output_pem_file + '"')
        else:
            print('Root cert "' + result.output_pem_file +
                  '" already exists,' + ' use -f to overwrite')
if __name__ == "__main__":
main()

View File

@ -4,8 +4,17 @@ from archivalrouter import ArchivalRouter
import urlparse
import base64
import socket
import ssl
from pywb.rewrite.url_rewriter import HttpsUrlRewriter
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.wbexception import BadRequestException
from pywb.utils.bufferedreaders import BufferedReader
from certauth import CertificateAuthority
from proxy_resolvers import ProxyAuthResolver, CookieResolver
#=================================================================
@ -44,8 +53,17 @@ class ProxyRouter(object):
for more details.
"""
PAC_PATH = '/proxy.pac'
BLOCK_SIZE = 4096
DEF_MAGIC_NAME = 'pywb.proxy'
CERT_DL_PEM = '/pywb-ca.pem'
CERT_DL_P12 = '/pywb-ca.p12'
EXTRA_HEADERS = {'cache-control': 'no-cache',
'p3p': 'CP="NOI ADM DEV COM NAV OUR STP"'}
def __init__(self, routes, **kwargs):
self.routes = routes
self.hostpaths = kwargs.get('hostpaths')
self.error_view = kwargs.get('error_view')
@ -54,61 +72,124 @@ class ProxyRouter(object):
if proxy_options:
proxy_options = proxy_options.get('proxy_options', {})
self.auth_msg = proxy_options.get('auth_msg',
'Please enter name of a collection to use for proxy mode')
self.magic_name = proxy_options.get('magic_name')
if not self.magic_name:
self.magic_name = self.DEF_MAGIC_NAME
proxy_options['magic_name'] = self.magic_name
self.use_default_coll = proxy_options.get('use_default_coll', True)
self.extra_headers = proxy_options.get('extra_headers')
if not self.extra_headers:
self.extra_headers = self.EXTRA_HEADERS
proxy_options['extra_headers'] = self.extra_headers
if proxy_options.get('cookie_resolver'):
self.resolver = CookieResolver(routes, proxy_options)
else:
self.resolver = ProxyAuthResolver(routes, proxy_options)
self.unaltered = proxy_options.get('unaltered_replay', False)
self.proxy_pac_path = proxy_options.get('pac_path', self.PAC_PATH)
if not proxy_options.get('enable_https_proxy'):
self.ca = None
self.proxy_cert_dl_view = None
return
# HTTPS Only Options
ca_file = proxy_options.get('root_ca_file')
# attempt to create the root_ca_file if doesn't exist
# (generally recommended to create this separately)
certname = proxy_options.get('root_ca_name')
CertificateAuthority.generate_ca_root(certname, ca_file)
certs_dir = proxy_options.get('certs_dir')
self.ca = CertificateAuthority(ca_file=ca_file,
certs_dir=certs_dir)
self.proxy_cert_dl_view = proxy_options.get('proxy_cert_download_view')
def __call__(self, env):
url = env['REL_REQUEST_URI']
is_https = (env['REQUEST_METHOD'] == 'CONNECT')
if url.endswith('/proxy.pac'):
return self.make_pac_response(env)
# for non-https requests, check pac path and non-proxy urls
if not is_https:
url = env['REL_REQUEST_URI']
if not url.startswith('http://'):
return None
if url == self.proxy_pac_path:
return self.make_pac_response(env)
proxy_auth = env.get('HTTP_PROXY_AUTHORIZATION')
if not url.startswith(('http://', 'https://')):
return None
env['pywb.proxy_scheme'] = 'http'
route = None
coll = None
matcher = None
response = None
ts = None
if proxy_auth:
proxy_coll = self.read_basic_auth_coll(proxy_auth)
# check resolver, for pre connect resolve
if self.resolver.pre_connect:
route, coll, matcher, ts, response = self.resolver.resolve(env)
if response:
return response
if not proxy_coll:
return self.proxy_auth_coll_response()
# do connect, then get updated url
if is_https:
response = self.handle_connect(env)
if response:
return response
proxy_coll = '/' + proxy_coll + '/'
for r in self.routes:
matcher, c = r.is_handling(proxy_coll)
if matcher:
route = r
coll = c
break
if not route:
return self.proxy_auth_coll_response()
# if 'use_default_coll' or only one collection, use that
# for proxy mode
elif self.use_default_coll or len(self.routes) == 1:
route = self.routes[0]
coll = self.routes[0].regex.pattern
# otherwise, require proxy auth 407 to select collection
url = env['REL_REQUEST_URI']
else:
return self.proxy_auth_coll_response()
parts = urlparse.urlsplit(env['REL_REQUEST_URI'])
hostport = parts.netloc.split(':', 1)
env['pywb.proxy_host'] = hostport[0]
env['pywb.proxy_port'] = hostport[1] if len(hostport) == 2 else ''
env['pywb.proxy_req_uri'] = parts.path
if parts.query:
env['pywb.proxy_req_uri'] += '?' + parts.query
env['pywb_proxy_magic'] = self.magic_name
# route (static) and other resources to archival replay
if env['pywb.proxy_host'] == self.magic_name:
env['REL_REQUEST_URI'] = env['pywb.proxy_req_uri']
# special case for proxy install
response = self.handle_cert_install(env)
if response:
return response
return None
# check resolver, post connect
if not self.resolver.pre_connect:
route, coll, matcher, ts, response = self.resolver.resolve(env)
if response:
return response
host_prefix = env['pywb.proxy_scheme'] + '://' + self.magic_name
rel_prefix = ''
# special case for proxy calendar
if (env['pywb.proxy_host'] == 'query.' + self.magic_name):
url = env['pywb.proxy_req_uri'][1:]
rel_prefix = '/'
if ts is not None:
url = ts + '/' + url
wbrequest = route.request_class(env,
request_uri=url,
wb_url_str=url,
coll=coll,
host_prefix=self.hostpaths[0],
host_prefix=host_prefix,
rel_prefix=rel_prefix,
wburl_class=route.handler.get_wburl_type(),
urlrewriter_class=HttpsUrlRewriter,
use_abs_prefix=False,
@ -119,13 +200,170 @@ class ProxyRouter(object):
if self.unaltered:
wbrequest.wb_url.mod = 'id_'
elif is_https:
wbrequest.wb_url.mod = 'bn_'
return route.handler(wbrequest)
response = route.handler(wbrequest)
if wbrequest.wb_url and wbrequest.wb_url.is_replay():
response.status_headers.replace_headers(self.extra_headers)
return response
def get_request_socket(self, env):
    """
    Locate the raw client socket behind the current WSGI request.

    Returns None when no certificate authority is configured
    (self.ca is unset) or when no socket can be found. Otherwise the
    socket is looked up, in order, via uwsgi, via the
    'gunicorn.socket' environ entry, and finally via a '_sock'
    attribute on 'wsgi.input'.
    """
    if not self.ca:
        return None

    found = None

    if env.get('uwsgi.version'):
        # best effort: any failure here falls through to wsgi.input
        try:
            import uwsgi
            raw = socket.fromfd(uwsgi.connection_fd(),
                                socket.AF_INET,
                                socket.SOCK_STREAM)
            found = socket.socket(_sock=raw)
        except Exception:
            pass
    elif env.get('gunicorn.socket'):
        found = env['gunicorn.socket']

    if found:
        return found

    # attempt to find socket from wsgi.input
    wsgi_in = env.get('wsgi.input')
    if wsgi_in and hasattr(wsgi_in, '_sock'):
        found = socket.socket(_sock=wsgi_in._sock)

    return found
def handle_connect(self, env):
    """
    Handle an HTTPS proxy CONNECT request.

    Confirms the tunnel to the client, wraps the client socket in SSL
    using a (wildcard) cert for the requested host, then reads the
    tunnelled request line and headers and rewrites env so that it
    describes that inner https request.

    :param env: WSGI environ; env['REL_REQUEST_URI'] holds the
                CONNECT target ('host:port')
    :return: an error WbResponse if no client socket is available,
             otherwise None (env has been updated in place)
    :raises BadRequestException: if the SSL handshake or reading the
             request line fails, or the request line is malformed
    """
    sock = self.get_request_socket(env)
    if not sock:
        return WbResponse.text_response('HTTPS Proxy Not Supported',
                                        '405 HTTPS Proxy Not Supported')

    # sendall: a plain send() may transmit only part of the buffer
    sock.sendall('HTTP/1.0 200 Connection Established\r\n'
                 'Server: pywb proxy\r\n'
                 '\r\n')

    # split on the last ':' only; fall back to the default https port
    # rather than failing when the target carries no numeric port
    connect_target = env['REL_REQUEST_URI']
    hostname, sep, port = connect_target.rpartition(':')
    if not sep or not port.isdigit():
        hostname, port = connect_target, '443'

    # for hosts with 3+ labels, issue the cert for the parent domain
    # so a single wildcard cert covers all sibling subdomains
    cert_host = hostname

    host_parts = hostname.split('.', 1)
    if len(host_parts) == 2 and '.' in host_parts[1]:
        cert_host = host_parts[1]

    created, certfile = self.ca.get_cert_for_host(cert_host,
                                                  wildcard=True)

    try:
        ssl_sock = ssl.wrap_socket(sock,
                                   server_side=True,
                                   certfile=certfile,
                                   ciphers="ALL",
                                   suppress_ragged_eofs=False,
                                   ssl_version=ssl.PROTOCOL_SSLv23)
        env['pywb.proxy_ssl_sock'] = ssl_sock

        buffreader = BufferedReader(ssl_sock, block_size=self.BLOCK_SIZE)

        statusline = buffreader.readline().rstrip()

    except Exception as se:
        raise BadRequestException(se.message)

    statusparts = statusline.split(' ')

    if len(statusparts) < 3:
        raise BadRequestException('Invalid Proxy Request: ' + statusline)

    # omit only an exact default port; a substring replace of ':443'
    # would also mangle ports such as ':4430'
    if port == '443':
        host_port = hostname
    else:
        host_port = hostname + ':' + port

    env['REQUEST_METHOD'] = statusparts[0]
    env['REL_REQUEST_URI'] = 'https://' + host_port + statusparts[1]
    env['SERVER_PROTOCOL'] = statusparts[2].strip()

    env['pywb.proxy_scheme'] = 'https'

    env['pywb.proxy_host'] = hostname
    env['pywb.proxy_port'] = port
    env['pywb.proxy_req_uri'] = statusparts[1]

    queryparts = env['REL_REQUEST_URI'].split('?', 1)
    env['PATH_INFO'] = queryparts[0]
    env['QUERY_STRING'] = queryparts[1] if len(queryparts) > 1 else ''

    # copy the tunnelled request headers into env using CGI-style names
    while True:
        line = buffreader.readline()
        if line:
            line = line.rstrip()

        # blank line (or EOF) ends the header section
        if not line:
            break

        parts = line.split(':', 1)
        if len(parts) < 2:
            continue

        name = parts[0].strip()
        value = parts[1].strip()

        name = name.replace('-', '_').upper()

        if name not in ('CONTENT_LENGTH', 'CONTENT_TYPE'):
            name = 'HTTP_' + name

        env[name] = value

    # anything already buffered beyond the headers is the start of the
    # request body: hand it on, followed by the rest of the ssl stream
    remain = buffreader.rem_length()
    if remain > 0:
        remainder = buffreader.read(self.BLOCK_SIZE)
        env['wsgi.input'] = BufferedReader(ssl_sock,
                                           block_size=self.BLOCK_SIZE,
                                           starting_data=remainder)
def handle_cert_install(self, env):
    """
    Serve the root certificate install page and downloads.

    Dispatches on env['pywb.proxy_req_uri']:
      - '/' or '/index.html': render the cert download view, if one
        is configured
      - CERT_DL_PEM: the root cert .pem file contents
      - CERT_DL_P12: the root cert exported as PKCS12

    :return: a response, or None if the path is not handled here,
             no view is configured, or https support (self.ca) is off
    """
    req_uri = env['pywb.proxy_req_uri']

    # NOTE(review): the original tuple listed '/index.html' twice;
    # possibly another alias was intended -- confirm
    if req_uri in ('/', '/index.html'):
        if not self.proxy_cert_dl_view:
            return None

        available = (self.ca is not None)
        return (self.proxy_cert_dl_view.
                render_response(available=available,
                                pem_path=self.CERT_DL_PEM,
                                p12_path=self.CERT_DL_P12))

    if req_uri == self.CERT_DL_PEM:
        if not self.ca:
            return None

        with open(self.ca.ca_file) as fh:
            buff = fh.read()

        content_type = 'application/x-x509-ca-cert'
        return WbResponse.text_response(buff,
                                        content_type=content_type)

    if req_uri == self.CERT_DL_P12:
        if not self.ca:
            return None

        buff = self.ca.get_root_PKCS12()

        content_type = 'application/x-pkcs12'
        return WbResponse.text_response(buff,
                                        content_type=content_type)

    return None
# Proxy Auto-Config (PAC) script for the proxy
def make_pac_response(self, env):
import os
hostname = os.environ.get('PYWB_HOST_NAME')
hostname = env.get('HTTP_HOST')
if not hostname:
server_hostport = env['SERVER_NAME'] + ':' + env['SERVER_PORT']
hostonly = env['SERVER_NAME']
@ -143,33 +381,8 @@ class ProxyRouter(object):
buff += direct.format(hostonly)
#buff += '\n return "PROXY {0}";\n}}\n'.format(self.hostpaths[0])
buff += '\n return "PROXY {0}";\n}}\n'.format(server_hostport)
content_type = 'application/x-ns-proxy-autoconfig'
return WbResponse.text_response(buff, content_type=content_type)
def proxy_auth_coll_response(self):
proxy_msg = 'Basic realm="{0}"'.format(self.auth_msg)
headers = [('Content-Type', 'text/plain'),
('Proxy-Authenticate', proxy_msg)]
status_headers = StatusAndHeaders('407 Proxy Authentication', headers)
value = self.auth_msg
return WbResponse(status_headers, value=[value])
@staticmethod
def read_basic_auth_coll(value):
parts = value.split(' ')
if parts[0].lower() != 'basic':
return ''
if len(parts) != 2:
return ''
user_pass = base64.b64decode(parts[1])
return user_pass.split(':')[0]

View File

@ -0,0 +1,340 @@
from wbrequestresponse import WbResponse, WbRequest
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.rewrite.wburl import WbUrl
import urlparse
import base64
import os
try:
import uwsgi
uwsgi_cache = True
except ImportError:
uwsgi_cache = False
#=================================================================
class UwsgiCache(object):
    """
    Mapping-style wrapper around the uwsgi cache functions, so the
    uwsgi cache can be used with the same [] / in / del syntax as a dict.
    """
    def __contains__(self, key):
        return uwsgi.cache_exists(key)

    def __getitem__(self, key):
        return uwsgi.cache_get(key)

    def __setitem__(self, key, new_value):
        uwsgi.cache_update(key, new_value)

    def __delitem__(self, key):
        uwsgi.cache_del(key)
#=================================================================
class BaseCollResolver(object):
    """
    Base class for choosing which route/collection serves a proxy request.

    Subclasses provide get_proxy_coll_ts(env), returning
    (collection name, timestamp), and select_coll_response(env),
    returning the response that asks the client to pick a collection.
    """
    def __init__(self, routes, config):
        self.routes = routes
        self.pre_connect = config.get('pre_connect', False)
        self.use_default_coll = config.get('use_default_coll', True)

    def resolve(self, env):
        """
        Resolve the route for this request.

        :return: (route, coll, matcher, ts, response). When response
                 is not None the other four values are None and the
                 response should be sent to the client as is.
        """
        requested, ts = self.get_proxy_coll_ts(env)

        # invalid parsing
        if requested == '':
            return None, None, None, None, self.select_coll_response(env)

        default_coll = self.use_default_coll

        # a string 'use_default_coll' names the collection to fall back on
        if requested is None and isinstance(default_coll, str):
            requested = default_coll

        if requested:
            coll_path = '/' + requested + '/'

            for candidate in self.routes:
                matcher, matched_coll = candidate.is_handling(coll_path)
                if matcher:
                    return candidate, matched_coll, matcher, ts, None

            # if no match, return coll selection response
            return None, None, None, None, self.select_coll_response(env)

        # if 'use_default_coll' is on, or there is nothing to choose from
        if default_coll == True or len(self.routes) == 1:
            first_route = self.routes[0]
            return first_route, first_route.path, None, ts, None

        # otherwise, return the appropriate coll selection response
        return None, None, None, None, self.select_coll_response(env)
#=================================================================
class ProxyAuthResolver(BaseCollResolver):
    """
    Collection resolver based on HTTP proxy authentication.

    The user name sent in the Proxy-Authorization (Basic) header is
    used as the collection name; a 407 response prompts for it.
    Always resolves before the CONNECT handshake (pre_connect).
    """
    DEFAULT_MSG = 'Please enter name of a collection to use with proxy mode'

    def __init__(self, routes, config):
        config['pre_connect'] = True
        super(ProxyAuthResolver, self).__init__(routes, config)
        self.auth_msg = config.get('auth_msg', self.DEFAULT_MSG)

    def get_proxy_coll_ts(self, env):
        """
        :return: (collection, None) -- collection is None when no
                 Proxy-Authorization header was sent and '' when the
                 header could not be parsed
        """
        proxy_auth = env.get('HTTP_PROXY_AUTHORIZATION')

        if not proxy_auth:
            return None, None

        proxy_coll = self.read_basic_auth_coll(proxy_auth)
        return proxy_coll, None

    def select_coll_response(self, env):
        """Return a 407 response asking the client for proxy credentials."""
        proxy_msg = 'Basic realm="{0}"'.format(self.auth_msg)

        headers = [('Content-Type', 'text/plain'),
                   ('Proxy-Authenticate', proxy_msg)]

        status_headers = StatusAndHeaders('407 Proxy Authentication', headers)

        value = self.auth_msg

        return WbResponse(status_headers, value=[value])

    @staticmethod
    def read_basic_auth_coll(value):
        """
        Extract the user name from a 'Basic <base64>' header value.

        :return: the text before the first ':' of the decoded
                 credentials, or '' if the value is not a well-formed
                 Basic auth header
        """
        parts = value.split(' ')
        if parts[0].lower() != 'basic':
            return ''

        if len(parts) != 2:
            return ''

        # the header is client-supplied: undecodable base64 is an
        # invalid header, not a server error
        try:
            user_pass = base64.b64decode(parts[1])
        except (TypeError, ValueError):
            return ''

        # b64decode yields bytes where str is unicode
        if not isinstance(user_pass, str):
            user_pass = user_pass.decode('latin-1')

        return user_pass.split(':')[0]
#=================================================================
# Experimental CookieResolver
class CookieResolver(BaseCollResolver):  # pragma: no cover
    """
    Collection resolver based on a session cookie.

    A session id is stored in a cookie (cookie_name); the collection
    and optional timestamp chosen for that session are kept in a cache
    under '<sesh_id>:c' and '<sesh_id>:t'. Selection and cookie
    propagation are driven by redirects through special host names
    built from the proxy magic name.
    """
    def __init__(self, routes, config):
        config['pre_connect'] = False
        super(CookieResolver, self).__init__(routes, config)
        self.magic_name = config['magic_name']
        self.sethost_prefix = '-sethost.' + self.magic_name + '.'
        self.set_prefix = '-set.' + self.magic_name

        self.cookie_name = config.get('cookie_name', '__pywb_coll')
        self.proxy_select_view = config.get('proxy_select_view')

        self.extra_headers = config.get('extra_headers')

        if uwsgi_cache:
            self.cache = UwsgiCache()
        else:
            self.cache = {}

    def get_proxy_coll_ts(self, env):
        coll, ts, sesh_id = self.get_coll(env)
        return coll, ts

    def select_coll_response(self, env):
        return self.make_magic_response('auto',
                                        env['REL_REQUEST_URI'],
                                        env)

    def resolve(self, env):
        """
        Handle requests to magic hosts first, then defer to the
        standard resolution in BaseCollResolver.
        """
        server_name = env['pywb.proxy_host']

        if ('.' + self.magic_name) in server_name:
            response = self.handle_magic_page(env)
            if response:
                return None, None, None, None, response

        return super(CookieResolver, self).resolve(env)

    def handle_magic_page(self, env):
        """
        Dispatch on the kind of magic host in env['pywb.proxy_host'].

        :return: a response, or None if this request should continue
                 through normal resolution
        """
        request_url = env['REL_REQUEST_URI']
        parts = urlparse.urlsplit(request_url)
        server_name = env['pywb.proxy_host']

        path_url = parts.path[1:]
        if parts.query:
            path_url += '?' + parts.query

        # auto.<magic>: reuse the session's collection or ask for one
        if server_name.startswith('auto'):
            coll, ts, sesh_id = self.get_coll(env)

            if coll:
                return self.make_sethost_cookie_response(sesh_id,
                                                         path_url,
                                                         env)
            else:
                return self.make_magic_response('select', path_url, env)

        # query.<magic>: store the requested capture timestamp
        elif server_name.startswith('query.'):
            wb_url = WbUrl(path_url)

            # only dealing with specific timestamp setting
            if wb_url.is_query():
                return None

            coll, ts, sesh_id = self.get_coll(env)
            if not coll:
                return self.make_magic_response('select', path_url, env)

            self.set_ts(sesh_id, wb_url.timestamp)
            return self.make_redir_response(wb_url.url)

        # <coll>-set.<magic>: store the chosen collection
        elif server_name.endswith(self.set_prefix):
            old_sesh_id = self.extract_client_cookie(env, self.cookie_name)
            sesh_id = self.create_renew_sesh_id(old_sesh_id)

            # only send a cookie when a new session id was issued
            if sesh_id != old_sesh_id:
                headers = self.make_cookie_headers(sesh_id, self.magic_name)
            else:
                headers = None

            coll = server_name[:-len(self.set_prefix)]

            # set sesh value
            self.set_coll(sesh_id, coll)

            return self.make_sethost_cookie_response(sesh_id, path_url, env,
                                                     headers=headers)

        # <sesh_id>-sethost.<magic>.<domain>: set cookie for the domain
        elif self.sethost_prefix in server_name:
            inx = server_name.find(self.sethost_prefix)
            sesh_id = server_name[:inx]

            domain = server_name[inx + len(self.sethost_prefix):]

            headers = self.make_cookie_headers(sesh_id, domain)

            full_url = env['pywb.proxy_scheme'] + '://' + domain
            full_url += '/' + path_url
            return self.make_redir_response(full_url, headers=headers)

        # select.<magic>: show the collection selection page
        elif 'select.' in server_name:
            if not self.proxy_select_view:
                return WbResponse.text_response('select text for ' + path_url)

            coll, ts, sesh_id = self.get_coll(env)

            route_temp = '-set.' + self.magic_name + '/' + path_url

            return (self.proxy_select_view.
                    render_response(routes=self.routes,
                                    route_temp=route_temp,
                                    coll=coll,
                                    url=path_url))

        # any other magic host: not handled here
        return None

    def make_cookie_headers(self, sesh_id, domain):
        cookie_val = '{0}={1}; Path=/; Domain=.{2}; HttpOnly'
        cookie_val = cookie_val.format(self.cookie_name, sesh_id, domain)
        headers = [('Set-Cookie', cookie_val)]
        return headers

    def make_sethost_cookie_response(self, sesh_id, path_url,
                                     env, headers=None):
        """
        Redirect to '<sesh_id>-sethost.<magic>.<host of path_url>' so
        that the session cookie can be set for that host's domain.
        """
        if '://' not in path_url:
            path_url = 'http://' + path_url

        path_parts = urlparse.urlsplit(path_url)

        new_url = path_parts.path[1:]
        if path_parts.query:
            new_url += '?' + path_parts.query

        return self.make_magic_response(sesh_id + '-sethost', new_url, env,
                                        suffix=path_parts.netloc,
                                        headers=headers)

    def make_magic_response(self, prefix, url, env,
                            suffix=None, headers=None):
        """Redirect to '<scheme>://<prefix>.<magic>[.<suffix>]/<url>'."""
        full_url = env['pywb.proxy_scheme'] + '://' + prefix + '.'
        full_url += self.magic_name
        if suffix:
            full_url += '.' + suffix
        full_url += '/' + url
        return self.make_redir_response(full_url, headers=headers)

    def set_coll(self, sesh_id, coll):
        self.cache[sesh_id + ':c'] = coll

    def set_ts(self, sesh_id, ts):
        if ts:
            self.cache[sesh_id + ':t'] = ts
        # this ensures that omitting timestamp will reset to latest
        # capture by deleting the cache entry
        else:
            # with the dict cache there may be no entry to delete
            try:
                del self.cache[sesh_id + ':t']
            except KeyError:
                pass

    def _cache_lookup(self, key):
        """Return the cached value for key, or None if absent."""
        try:
            return self.cache[key]
        except KeyError:
            return None

    def get_coll(self, env):
        """
        :return: (coll, ts, sesh_id) for the session cookie in env;
                 coll and ts are None when nothing is stored
        """
        sesh_id = self.extract_client_cookie(env, self.cookie_name)

        coll = None
        ts = None
        if sesh_id:
            # a cookie may name a session this cache knows nothing
            # about, so a missing entry must not raise
            coll = self._cache_lookup(sesh_id + ':c')
            ts = self._cache_lookup(sesh_id + ':t')

        return coll, ts, sesh_id

    def create_renew_sesh_id(self, sesh_id, force=False):
        """
        Return sesh_id if it has a stored collection (and force is
        not set), otherwise a newly generated random session id.
        """
        if sesh_id and ((sesh_id + ':c') in self.cache) and not force:
            return sesh_id

        sesh_id = base64.b32encode(os.urandom(5)).lower()
        return sesh_id

    def make_redir_response(self, url, headers=None):
        """Redirect to url, adding any configured extra headers."""
        # copy so that the caller's list is never modified
        headers = list(headers) if headers else []

        if self.extra_headers:
            for name, value in self.extra_headers.items():
                headers.append((name, value))

        return WbResponse.redir_response(url, headers=headers)

    @staticmethod
    def extract_client_cookie(env, cookie_name):
        """
        Return the value of cookie_name from env['HTTP_COOKIE'],
        or None if the header or the cookie is absent.
        """
        cookie_header = env.get('HTTP_COOKIE')
        if not cookie_header:
            return None

        # compare whole cookie names, so that a cookie whose name or
        # value merely contains cookie_name is not picked up
        for pair in cookie_header.split(';'):
            name, sep, value = pair.partition('=')
            if sep and name.strip() == cookie_name:
                return value.strip()

        return None

View File

@ -50,6 +50,42 @@ class WSGIApp(object):
# Top-level wsgi application
def __call__(self, env, start_response):
    """
    WSGI entry point: CONNECT requests go to handle_connect(),
    every other request method goes to handle_methods().
    """
    is_connect = (env['REQUEST_METHOD'] == 'CONNECT')
    handler = self.handle_connect if is_connect else self.handle_methods
    return handler(env, start_response)
def handle_connect(self, env, start_response):
    """
    Serve a CONNECT request.

    The request is processed by handle_methods(). If that leaves an
    SSL socket in env['pywb.proxy_ssl_sock'], the status line, headers
    and body are written directly to that socket, the socket is
    closed, and an empty body is returned to the WSGI server.
    Otherwise the response is passed through unchanged.
    """
    def ssl_start_response(statusline, headers):
        ssl_sock = env.get('pywb.proxy_ssl_sock')
        if not ssl_sock:
            start_response(statusline, headers)
            return

        # remembered for the final start_response() call below
        env['pywb.proxy_statusline'] = statusline

        ssl_sock.write('HTTP/1.1 ' + statusline + '\r\n')

        for name, value in headers:
            ssl_sock.write(name + ': ' + value + '\r\n')

    resp_iter = self.handle_methods(env, ssl_start_response)

    ssl_sock = env.get('pywb.proxy_ssl_sock')
    if not ssl_sock:
        return resp_iter

    try:
        # blank line ends the header section
        ssl_sock.write('\r\n')

        for obj in resp_iter:
            if obj:
                ssl_sock.write(obj)
    finally:
        # the WSGI server never sees resp_iter, so it is closed here;
        # the socket is closed even if writing or iterating failed
        try:
            if hasattr(resp_iter, 'close'):
                resp_iter.close()
        finally:
            ssl_sock.close()

    start_response(env['pywb.proxy_statusline'], [])

    return []
def handle_methods(self, env, start_response):
if env.get('SCRIPT_NAME') or not env.get('REQUEST_URI'):
env['REL_REQUEST_URI'] = rel_request_uri(env)
else:
@ -89,22 +125,29 @@ class WSGIApp(object):
else:
err_url = None
try:
err_msg = exc.message.encode('utf-8')
except Exception:
err_msg = exc.message
err_url = ''
if print_trace:
import traceback
err_details = traceback.format_exc(exc)
print err_details
else:
logging.info(str(exc))
logging.info(err_msg)
err_details = None
if error_view:
return error_view.render_response(exc_type=type(exc).__name__,
err_msg=str(exc),
err_msg=err_msg,
err_details=err_details,
status=status,
env=env,
err_url=err_url)
else:
return WbResponse.text_response(status + ' Error: ' + str(exc),
return WbResponse.text_response(status + ' Error: ' + err_msg,
status=status)
#=================================================================
@ -145,6 +188,10 @@ def init_app(init_func, load_yaml=True, config_file=None, config={}):
def start_wsgi_server(the_app, name, default_port=None): # pragma: no cover
from wsgiref.simple_server import make_server
# disable is_hop_by_hop restrictions
import wsgiref.handlers
wsgiref.handlers.is_hop_by_hop = lambda x: False
port = the_app.port
if not port:

View File

@ -37,7 +37,8 @@ class HeaderRewriter:
ENCODING_HEADERS = ['content-encoding']
REMOVE_HEADERS = ['transfer-encoding']
REMOVE_HEADERS = ['transfer-encoding', 'content-security-policy',
'strict-transport-security']
PROXY_NO_REWRITE_HEADERS = ['content-length']
@ -90,7 +91,10 @@ class HeaderRewriter:
new_headers = []
removed_header_dict = {}
cookie_rewriter = urlrewriter.get_cookie_rewriter()
if urlrewriter:
cookie_rewriter = urlrewriter.get_cookie_rewriter()
else:
cookie_rewriter = None
for (name, value) in headers:
@ -99,7 +103,7 @@ class HeaderRewriter:
if lowername in self.PROXY_HEADERS:
new_headers.append((name, value))
elif lowername in self.URL_REWRITE_HEADERS:
elif urlrewriter and lowername in self.URL_REWRITE_HEADERS:
new_headers.append((name, urlrewriter.rewrite(value)))
elif lowername in self.ENCODING_HEADERS:
@ -109,7 +113,8 @@ class HeaderRewriter:
new_headers.append((name, value))
elif lowername in self.REMOVE_HEADERS:
removed_header_dict[lowername] = value
removed_header_dict[lowername] = value
new_headers.append((self.header_prefix + name, value))
elif (lowername in self.PROXY_NO_REWRITE_HEADERS and
not content_rewritten):
@ -120,7 +125,9 @@ class HeaderRewriter:
cookie_list = cookie_rewriter.rewrite(value)
new_headers.extend(cookie_list)
else:
elif urlrewriter:
new_headers.append((self.header_prefix + name, value))
else:
new_headers.append((name, value))
return (new_headers, removed_header_dict)

View File

@ -69,6 +69,10 @@ class RewriteContent:
status_headers, stream = self.sanitize_content(headers, stream)
return (status_headers, self.stream_to_gen(stream), False)
if wb_url.is_banner_only:
urlrewriter = None
(rewritten_headers, stream) = self.rewrite_headers(urlrewriter,
headers,
stream)

View File

@ -40,17 +40,19 @@ HTTP Headers Rewriting
'removed_header_dict': {'content-encoding': 'gzip',
'transfer-encoding': 'chunked'},
'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('X-Archive-Orig-Content-Length', '199999'),
('Content-Type', 'text/javascript')]),
('Content-Type', 'text/javascript'),
('X-Archive-Orig-Transfer-Encoding', 'chunked')]),
'text_type': 'js'}
# Binary -- transfer-encoding removed
# Binary -- transfer-encoding rewritten
>>> _test_headers([('Content-Length', '200000'), ('Content-Type', 'image/png'), ('Set-Cookie', 'foo=bar; Path=/;'), ('Content-Encoding', 'gzip'), ('Transfer-Encoding', 'chunked')])
{'charset': None,
'removed_header_dict': {'transfer-encoding': 'chunked'},
'status_headers': StatusAndHeaders(protocol = '', statusline = '200 OK', headers = [ ('Content-Length', '200000'),
('Content-Type', 'image/png'),
('Set-Cookie', 'foo=bar; Path=/web/20131010/http://example.com/'),
('Content-Encoding', 'gzip')]),
('Content-Encoding', 'gzip'),
('X-Archive-Orig-Transfer-Encoding', 'chunked')]),
'text_type': None}
"""

View File

@ -142,7 +142,7 @@ class HttpsUrlRewriter(UrlRewriter):
else:
return url
def get_timestamp_url(self, timestamp, url):
def get_timestamp_url(self, timestamp, url=''):
return url
def get_abs_url(self, url=''):

View File

@ -73,6 +73,14 @@ function init_banner() {
text += "<b id='_wb_capture_info'>" + capture_str + "</b>";
if (wbinfo.proxy_magic && wbinfo.url) {
var select_url = wbinfo.proxy_magic + "/" + wbinfo.url;
var query_url = wbinfo.proxy_magic + "/*/" + wbinfo.url;
text += '&nbsp;<a href="//query.' + query_url + '">All Capture Times</a>';
text += '<br/>'
text += 'From collection <b>"' + wbinfo.coll + '"</b>&nbsp;<a href="//select.' + select_url + '">All Collections</a>';
}
banner.innerHTML = text;
document.body.insertBefore(banner, document.body.firstChild);

View File

@ -9,3 +9,10 @@
</pre>
</p>
{% endif %}
{% if env.pywb_proxy_magic and err_url and status == '404 Not Found' %}
<p>
<a href="//select.{{ env.pywb_proxy_magic }}/{{ err_url }}">Try Different Collection</a>
</p>
{% endif %}

View File

@ -2,7 +2,7 @@
{% if rule.js_rewrite_location and include_wombat %}
<script src='{{ wbrequest.host_prefix }}/{{ static_path }}/wombat.js'> </script>
<script>
{% set urlsplit = cdx['original'] | urlsplit %}
{% set urlsplit = cdx.original | urlsplit %}
WB_wombat_init("{{ wbrequest.wb_prefix}}",
"{{ cdx['timestamp'] if include_ts else ''}}",
"{{ urlsplit.scheme }}",
@ -12,13 +12,15 @@
{% endif %}
<script>
wbinfo = {}
wbinfo.url = "{{ cdx.original }}";
wbinfo.timestamp = "{{ cdx.timestamp }}";
wbinfo.prefix = "{{ wbrequest.wb_prefix }}";
wbinfo.is_embed = {{"true" if wbrequest.wb_url.is_embed else "false"}};
wbinfo.is_frame_mp = {{"true" if wbrequest.wb_url.mod == 'mp_' else "false"}};
wbinfo.canon_url = "{{ canon_url }}";
wbinfo.is_live = {{ "true" if cdx.is_live else "false" }};
wbinfo.is_proxy_mode = {{ "true" if wbrequest.options.is_proxy else "false" }};
wbinfo.coll = "{{ wbrequest.coll }}";
wbinfo.proxy_magic = "{{ wbrequest.env.pywb_proxy_magic }}";
</script>
<script src='{{ wbrequest.host_prefix }}/{{ static_path }}/wb.js'> </script>
<link rel='stylesheet' href='{{ wbrequest.host_prefix }}/{{ static_path }}/wb.css'/>

View File

@ -0,0 +1,14 @@
<h2>HTTPS Certificate For PyWb Web Archive Replay</h2>
{% if not available %}
<p>Sorry, HTTPS support is not configured for this proxy. However, the proxy should work in HTTP mode.</p>
{% else %}
<p>Download for all platforms (except Windows):</p>
<p><b><a href="{{ pem_path }}">Download Certificate (All except Windows)</a></b></p>
<p>(If you see the <i>Already Installed</i> message, then no further action is necessary and you may start browsing!)</p>
{% endif %}
<p>Download for Windows platforms:</p>
<p><b><a href="{{ p12_path }}">Download Certificate (Windows Only)</a></b></p>

25
pywb/ui/proxy_select.html Normal file
View File

@ -0,0 +1,25 @@
<html>
<body>
<h2>Pywb Proxy Collection Selector</h2>
{% if coll %}
<p>
Current collection is: <b>{{ coll }}</b>
</p>
{% else %}
<p>You have attempted to load the url <b>{{ url }}</b>, but there are multiple collections available.</p>
{% endif %}
<p>Please select which collection you would like to use (You will be redirected back to <b>{{ url }}</b>):
</p>
<ul>
{% for route in routes %}
{% if route.path and route | is_wb_handler %}
<li><a href="//{{ route.path }}{{ route_temp }}">{{ route.path }}</a></li>
{% endif %}
{% endfor %}
</ul>
<p>(Once selected, you will not be prompted again, however you can return to this page to switch collections.)</p>
</body>
</html>

View File

@ -3,6 +3,7 @@ Representation and parsing of HTTP-style status + headers
"""
import pprint
from copy import copy
#=================================================================
@ -44,9 +45,26 @@ class StatusAndHeaders(object):
self.headers.append((name, value))
return None
def replace_headers(self, header_dict):
    """
    Replace the value of every existing header that is named in
    header_dict, then append any headers from header_dict that were
    not already present.

    Header names are matched case-insensitively, regardless of the
    casing used for the keys of header_dict. A replaced header keeps
    the name casing it already had in self.headers; an appended header
    uses the casing of its header_dict key.

    If a name occurs more than once in self.headers, only its last
    occurrence is replaced.

    header_dict itself is never modified.
    """
    # Index the requested headers by lower-cased name so that keys such
    # as 'Content-Length' still match an existing 'content-length'.
    pending = dict((name.lower(), (name, value))
                   for name, value in header_dict.items())

    # Walk backwards so that, for a repeated header, the last occurrence
    # is the one that receives the new value.
    for index in reversed(range(len(self.headers))):
        curr_name = self.headers[index][0]
        match = pending.pop(curr_name.lower(), None)
        if match is not None:
            self.headers[index] = (curr_name, match[1])

    # Whatever was not consumed above did not exist yet: add it.
    for name, value in pending.values():
        self.headers.append((name, value))
def remove_header(self, name):
"""
remove header (case-insensitive)
Remove header (case-insensitive)
return True if header removed, False otherwise
"""
name_lower = name.lower()

View File

@ -34,6 +34,9 @@ DEFAULTS = {
'home_html': 'ui/index.html',
'error_html': 'ui/error.html',
'proxy_select_html': 'ui/proxy_select.html',
'proxy_cert_download_html': 'ui/proxy_cert_download.html',
'template_globals': {'static_path': 'static/default'},
'static_routes': {'static/default': 'pywb/static/'},
@ -80,7 +83,7 @@ def create_live_handler(config):
#=================================================================
def init_route_config(value, config):
if isinstance(value, str):
if isinstance(value, str) or isinstance(value, list):
value = dict(index_paths=value)
route_config = DictChain(value, config)
@ -226,10 +229,27 @@ def create_wb_router(passed_config={}):
if hasattr(route.handler, 'resolve_refs'):
route.handler.resolve_refs(handler_dict)
# Check for new proxy mode!
if config.get('enable_http_proxy', False):
router = ProxyArchivalRouter
view = J2TemplateView.create_template(
config.get('proxy_select_html'),
'Proxy Coll Selector')
if not 'proxy_options' in passed_config:
passed_config['proxy_options'] = {}
if view:
passed_config['proxy_options']['proxy_select_view'] = view
view = J2TemplateView.create_template(
config.get('proxy_cert_download_html'),
'Proxy Cert Download')
if view:
passed_config['proxy_options']['proxy_cert_download_view'] = view
else:
router = ArchivalRouter
@ -250,6 +270,5 @@ def create_wb_router(passed_config={}):
error_view=J2TemplateView.create_template(config.get('error_html'),
'Error Page'),
config=config
)

View File

@ -34,7 +34,7 @@ class PyTest(TestCommand):
setup(
name='pywb',
version='0.5.3',
version='0.6.0',
url='https://github.com/ikreymer/pywb',
author='Ilya Kreymer',
author_email='ikreymer@gmail.com',
@ -70,6 +70,7 @@ setup(
'jinja2',
'surt',
'pyyaml',
'pyopenssl',
],
tests_require=[
'pytest',
@ -86,6 +87,7 @@ setup(
cdx-server = pywb.apps.cdx_server:main
cdx-indexer = pywb.warc.cdxindexer:main
live-rewrite-server = pywb.apps.live_rewrite_server:main
proxy-cert-auth = pywb.framework.certauth:main
""",
zip_safe=False,
classifiers=[

View File

@ -389,7 +389,7 @@ class TestWb:
assert resp.status_int == 407
def test_proxy_pac(self):
resp = self.testapp.get('/proxy.pac', extra_environ = dict(SERVER_NAME='pywb-proxy', SERVER_PORT='8080'))
resp = self.testapp.get('/proxy.pac', headers = [('Host', 'pywb-proxy:8080')])
assert resp.content_type == 'application/x-ns-proxy-autoconfig'
assert '"PROXY pywb-proxy:8080"' in resp.body
assert '"localhost"' in resp.body