1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

proxy: timestamp selection support!

certauth: wildcard support, use *.host wildcard for proxy certs whenever possible
ui: add coll info/switch and calendar links to banner
This commit is contained in:
Ilya Kreymer 2014-07-31 11:12:50 -07:00
parent eff5a74ec7
commit 522ea87637
9 changed files with 160 additions and 65 deletions

View File

@ -45,13 +45,15 @@ class CertificateAuthority(object):
if not os.path.exists(certs_dir): if not os.path.exists(certs_dir):
os.mkdir(certs_dir) os.mkdir(certs_dir)
def get_cert_for_host(self, host, overwrite=False): def get_cert_for_host(self, host, overwrite=False, wildcard=False):
host_filename = os.path.sep.join([self.certs_dir, '%s.pem' % host]) host_filename = os.path.join(self.certs_dir, host) + '.pem'
if not overwrite and os.path.exists(host_filename): if not overwrite and os.path.exists(host_filename):
return False, host_filename return False, host_filename
self.generate_host_cert(host, self.cert, self.key, host_filename) self.generate_host_cert(host, self.cert, self.key, host_filename,
wildcard)
return True, host_filename return True, host_filename
@staticmethod @staticmethod
@ -107,7 +109,8 @@ class CertificateAuthority(object):
return True, cert, key return True, cert, key
@staticmethod @staticmethod
def generate_host_cert(host, root_cert, root_key, host_filename): def generate_host_cert(host, root_cert, root_key, host_filename,
wildcard=False):
# Generate key # Generate key
key = crypto.PKey() key = crypto.PKey()
key.generate_key(crypto.TYPE_RSA, 2048) key.generate_key(crypto.TYPE_RSA, 2048)
@ -123,6 +126,19 @@ class CertificateAuthority(object):
cert.set_issuer(root_cert.get_subject()) cert.set_issuer(root_cert.get_subject())
cert.set_pubkey(req.get_pubkey()) cert.set_pubkey(req.get_pubkey())
if wildcard:
DNS = 'DNS:'
alt_hosts = [DNS + host,
DNS + '*.' + host]
alt_hosts = ', '.join(alt_hosts)
cert.add_extensions([
crypto.X509Extension('subjectAltName',
False,
alt_hosts)])
cert.sign(root_key, 'sha1') cert.sign(root_key, 'sha1')
# Write cert + key # Write cert + key
@ -163,6 +179,9 @@ def main():
parser.add_argument('-f', '--force', action='store_true') parser.add_argument('-f', '--force', action='store_true')
parser.add_argument('-w', '--wildcard_cert', action='store_true',
help='add wildcard SAN to host: *.<host>, <host>')
result = parser.parse_args() result = parser.parse_args()
overwrite = result.force overwrite = result.force
@ -170,12 +189,13 @@ def main():
# Create a new signed certificate using specified root # Create a new signed certificate using specified root
if result.use_root: if result.use_root:
certs_dir = result.certs_dir certs_dir = result.certs_dir
wildcard = result.wildcard
ca = CertificateAuthority(ca_file=result.use_root, ca = CertificateAuthority(ca_file=result.use_root,
certs_dir=result.certs_dir, certs_dir=result.certs_dir,
certname=result.name) certname=result.name)
created, host_filename = ca.get_cert_for_host(result.output_pem_file, created, host_filename = ca.get_cert_for_host(result.output_pem_file,
overwrite) overwrite, wildcard)
if created: if created:
print ('Created new cert "' + host_filename + print ('Created new cert "' + host_filename +

View File

@ -76,7 +76,6 @@ class ProxyRouter(object):
else: else:
self.resolver = ProxyAuthResolver(routes, proxy_options) self.resolver = ProxyAuthResolver(routes, proxy_options)
self.insert_banner = proxy_options.get('banner_only_replay', False)
self.unaltered = proxy_options.get('unaltered_replay', False) self.unaltered = proxy_options.get('unaltered_replay', False)
self.proxy_pac_path = proxy_options.get('pac_path', self.PAC_PATH) self.proxy_pac_path = proxy_options.get('pac_path', self.PAC_PATH)
@ -115,10 +114,11 @@ class ProxyRouter(object):
coll = None coll = None
matcher = None matcher = None
response = None response = None
ts = None
# check resolver, for pre connect resolve # check resolver, for pre connect resolve
if self.resolver.pre_connect: if self.resolver.pre_connect:
route, coll, matcher, response = self.resolver.resolve(env) route, coll, matcher, response, ts = self.resolver.resolve(env)
if response: if response:
return response return response
@ -138,26 +138,36 @@ class ProxyRouter(object):
if parts.query: if parts.query:
env['pywb.proxy_req_uri'] += '?' + parts.query env['pywb.proxy_req_uri'] += '?' + parts.query
# select prefix env['pywb_proxy_magic'] = self.magic_name
env['pywb_proxy_select'] = 'select.' + self.magic_name
# route (static) and other resources to archival replay
if env['pywb.proxy_host'] == self.magic_name: if env['pywb.proxy_host'] == self.magic_name:
env['REL_REQUEST_URI'] = env['pywb.proxy_req_uri'] env['REL_REQUEST_URI'] = env['pywb.proxy_req_uri']
return None return None
# check resolver, post connect # check resolver, post connect
if not self.resolver.pre_connect: if not self.resolver.pre_connect:
route, coll, matcher, response = self.resolver.resolve(env) route, coll, matcher, ts, response = self.resolver.resolve(env)
if response: if response:
return response return response
host_prefix = env['pywb.proxy_scheme'] + '://' + self.magic_name host_prefix = env['pywb.proxy_scheme'] + '://' + self.magic_name
rel_prefix = ''
# special case for proxy calendar
if (env['pywb.proxy_host'] == 'query.' + self.magic_name):
url = env['pywb.proxy_req_uri'][1:]
rel_prefix = '/'
if ts is not None:
url = ts + '/' + url
wbrequest = route.request_class(env, wbrequest = route.request_class(env,
request_uri=url, request_uri=url,
wb_url_str=url, wb_url_str=url,
coll=coll, coll=coll,
host_prefix=host_prefix, host_prefix=host_prefix,
rel_prefix=rel_prefix,
wburl_class=route.handler.get_wburl_type(), wburl_class=route.handler.get_wburl_type(),
urlrewriter_class=HttpsUrlRewriter, urlrewriter_class=HttpsUrlRewriter,
use_abs_prefix=False, use_abs_prefix=False,
@ -166,10 +176,10 @@ class ProxyRouter(object):
if matcher: if matcher:
route.apply_filters(wbrequest, matcher) route.apply_filters(wbrequest, matcher)
if self.insert_banner: if self.unaltered:
wbrequest.wb_url.mod = 'bn_'
elif self.unaltered:
wbrequest.wb_url.mod = 'id_' wbrequest.wb_url.mod = 'id_'
elif is_https:
wbrequest.wb_url.mod = 'bn_'
return route.handler(wbrequest) return route.handler(wbrequest)
@ -209,13 +219,23 @@ class ProxyRouter(object):
sock.send('\r\n') sock.send('\r\n')
hostname, port = env['REL_REQUEST_URI'].split(':') hostname, port = env['REL_REQUEST_URI'].split(':')
created, certfile = self.ca.get_cert_for_host(hostname) cert_host = hostname
ssl_sock = ssl.wrap_socket(sock, host_parts = hostname.split('.', 1)
server_side=True, if len(host_parts) == 2 and '.' in host_parts[1]:
certfile=certfile, cert_host = host_parts[1]
ciphers="ALL",
ssl_version=ssl.PROTOCOL_SSLv23) created, certfile = self.ca.get_cert_for_host(cert_host,
wildcard=True)
try:
ssl_sock = ssl.wrap_socket(sock,
server_side=True,
certfile=certfile,
ciphers="ALL",
ssl_version=ssl.PROTOCOL_SSLv23)
except Exception as se:
raise BadRequestException(se.message)
env['pywb.proxy_ssl_sock'] = ssl_sock env['pywb.proxy_ssl_sock'] = ssl_sock
@ -244,7 +264,6 @@ class ProxyRouter(object):
env['PATH_INFO'] = queryparts[0] env['PATH_INFO'] = queryparts[0]
env['QUERY_STRING'] = queryparts[1] if len(queryparts) > 1 else '' env['QUERY_STRING'] = queryparts[1] if len(queryparts) > 1 else ''
while True: while True:
line = buffreader.readline() line = buffreader.readline()
if line: if line:
@ -270,8 +289,7 @@ class ProxyRouter(object):
remain = buffreader.rem_length() remain = buffreader.rem_length()
if remain > 0: if remain > 0:
remainder = buffreader.read(self.BLOCK_SIZE) remainder = buffreader.read(self.BLOCK_SIZE)
input_ = socket._fileobject(ssl_sock, mode='r') env['wsgi.input'] = BufferedReader(ssl_sock,
env['wsgi.input'] = BufferedReader(input_,
block_size=self.BLOCK_SIZE, block_size=self.BLOCK_SIZE,
starting_data=remainder) starting_data=remainder)

View File

@ -1,5 +1,7 @@
from wbrequestresponse import WbResponse, WbRequest from wbrequestresponse import WbResponse, WbRequest
from pywb.utils.statusandheaders import StatusAndHeaders from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.rewrite.wburl import WbUrl
import urlparse import urlparse
import base64 import base64
import os import os
@ -22,6 +24,9 @@ class UwsgiCache(object):
def __contains__(self, item): def __contains__(self, item):
return uwsgi.cache_exists(item) return uwsgi.cache_exists(item)
def __delitem__(self, item):
    """Support ``del cache[item]`` by delegating to ``uwsgi.cache_del``."""
    uwsgi.cache_del(item)
#================================================================= #=================================================================
class BaseCollResolver(object): class BaseCollResolver(object):
@ -34,12 +39,13 @@ class BaseCollResolver(object):
route = None route = None
coll = None coll = None
matcher = None matcher = None
ts = None
proxy_coll = self.get_proxy_coll(env) proxy_coll, ts = self.get_proxy_coll_ts(env)
# invalid parsing # invalid parsing
if proxy_coll == '': if proxy_coll == '':
return None, None, None, self.select_coll_response(env) return None, None, None, None, self.select_coll_response(env)
if proxy_coll is None and isinstance(self.use_default_coll, str): if proxy_coll is None and isinstance(self.use_default_coll, str):
proxy_coll = self.use_default_coll proxy_coll = self.use_default_coll
@ -56,7 +62,7 @@ class BaseCollResolver(object):
# if no match, return coll selection response # if no match, return coll selection response
if not route: if not route:
return None, None, None, self.select_coll_response(env) return None, None, None, None, self.select_coll_response(env)
# if 'use_default_coll' # if 'use_default_coll'
elif self.use_default_coll == True or len(self.routes) == 1: elif self.use_default_coll == True or len(self.routes) == 1:
@ -65,9 +71,9 @@ class BaseCollResolver(object):
# otherwise, return the appropriate coll selection response # otherwise, return the appropriate coll selection response
else: else:
return None, None, None, self.select_coll_response(env) return None, None, None, None, self.select_coll_response(env)
return route, coll, matcher, None return route, coll, matcher, ts, None
#================================================================= #=================================================================
@ -79,14 +85,14 @@ class ProxyAuthResolver(BaseCollResolver):
super(ProxyAuthResolver, self).__init__(routes, config) super(ProxyAuthResolver, self).__init__(routes, config)
self.auth_msg = config.get('auth_msg', self.DEFAULT_MSG) self.auth_msg = config.get('auth_msg', self.DEFAULT_MSG)
def get_proxy_coll(self, env): def get_proxy_coll_ts(self, env):
proxy_auth = env.get('HTTP_PROXY_AUTHORIZATION') proxy_auth = env.get('HTTP_PROXY_AUTHORIZATION')
if not proxy_auth: if not proxy_auth:
return None return None, None
proxy_coll = self.read_basic_auth_coll(proxy_auth) proxy_coll = self.read_basic_auth_coll(proxy_auth)
return proxy_coll return proxy_coll, None
def select_coll_response(self, env): def select_coll_response(self, env):
proxy_msg = 'Basic realm="{0}"'.format(self.auth_msg) proxy_msg = 'Basic realm="{0}"'.format(self.auth_msg)
@ -120,6 +126,9 @@ class CookieResolver(BaseCollResolver): # pragma: no cover
config['pre_connect'] = False config['pre_connect'] = False
super(CookieResolver, self).__init__(routes, config) super(CookieResolver, self).__init__(routes, config)
self.magic_name = config['magic_name'] self.magic_name = config['magic_name']
self.sethost_prefix = '-sethost.' + self.magic_name + '.'
self.set_prefix = '-set.' + self.magic_name
self.cookie_name = config.get('cookie_name', '__pywb_coll') self.cookie_name = config.get('cookie_name', '__pywb_coll')
self.proxy_select_view = config.get('proxy_select_view') self.proxy_select_view = config.get('proxy_select_view')
@ -128,9 +137,9 @@ class CookieResolver(BaseCollResolver): # pragma: no cover
else: else:
self.cache = {} self.cache = {}
def get_proxy_coll(self, env): def get_proxy_coll_ts(self, env):
coll, sesh_id = self.get_coll(env) coll, ts, sesh_id = self.get_coll(env)
return coll return coll, ts
def select_coll_response(self, env): def select_coll_response(self, env):
return self.make_magic_response('auto', return self.make_magic_response('auto',
@ -141,27 +150,44 @@ class CookieResolver(BaseCollResolver): # pragma: no cover
server_name = env['pywb.proxy_host'] server_name = env['pywb.proxy_host']
if ('.' + self.magic_name) in server_name: if ('.' + self.magic_name) in server_name:
return None, None, None, self.handle_magic_page(env) response = self.handle_magic_page(env)
if response:
return None, None, None, None, response
return super(CookieResolver, self).resolve(env) return super(CookieResolver, self).resolve(env)
def handle_magic_page(self, env): def handle_magic_page(self, env):
url = env['REL_REQUEST_URI'] request_url = env['REL_REQUEST_URI']
parts = urlparse.urlsplit(url) parts = urlparse.urlsplit(request_url)
server_name = env['pywb.proxy_host']
path_url = parts.path[1:] path_url = parts.path[1:]
if parts.query: if parts.query:
path_url += '?' + parts.query path_url += '?' + parts.query
if parts.netloc.startswith('auto'): if server_name.startswith('auto'):
coll, sesh_id = self.get_coll(env) coll, ts, sesh_id = self.get_coll(env)
if coll: if coll:
return self.make_sethost_cookie_response(sesh_id, path_url, env) return self.make_sethost_cookie_response(sesh_id, path_url, env)
else: else:
return self.make_magic_response('select', path_url, env) return self.make_magic_response('select', path_url, env)
elif '.set.' in parts.netloc: elif server_name.startswith('query.'):
wb_url = WbUrl(path_url)
# only dealing with specific timestamp setting
if wb_url.is_query():
return None
coll, ts, sesh_id = self.get_coll(env)
if not coll:
return self.make_magic_response('select', path_url, env)
self.set_ts(sesh_id, wb_url.timestamp)
return self.make_redir_response(wb_url.url)
elif server_name.endswith(self.set_prefix):
old_sesh_id = self.extract_client_cookie(env, self.cookie_name) old_sesh_id = self.extract_client_cookie(env, self.cookie_name)
sesh_id = self.create_renew_sesh_id(old_sesh_id) sesh_id = self.create_renew_sesh_id(old_sesh_id)
@ -170,34 +196,33 @@ class CookieResolver(BaseCollResolver): # pragma: no cover
else: else:
headers = None headers = None
value, name, _ = parts.netloc.split('.', 2) coll = server_name[:-len(self.set_prefix)]
# set sesh value # set sesh value
self.cache[sesh_id] = value self.set_coll(sesh_id, coll)
return self.make_sethost_cookie_response(sesh_id, path_url, env, return self.make_sethost_cookie_response(sesh_id, path_url, env,
headers=headers) headers=headers)
elif '.sethost.' in parts.netloc: elif self.sethost_prefix in server_name:
host_parts = parts.netloc.split('.', 1) inx = server_name.find(self.sethost_prefix)
sesh_id = host_parts[0] sesh_id = server_name[:inx]
inx = parts.netloc.find('.' + self.magic_name + '.') domain = server_name[inx + len(self.sethost_prefix):]
domain = parts.netloc[inx + len(self.magic_name) + 2:]
headers = self.make_cookie_headers(sesh_id, domain) headers = self.make_cookie_headers(sesh_id, domain)
full_url = env['pywb.proxy_scheme'] + '://' + domain full_url = env['pywb.proxy_scheme'] + '://' + domain
full_url += '/' + path_url full_url += '/' + path_url
return WbResponse.redir_response(full_url, headers=headers) return self.make_redir_response(full_url, headers=headers)
elif 'select.' in parts.netloc: elif 'select.' in server_name:
if not self.proxy_select_view: if not self.proxy_select_view:
return WbResponse.text_response('select text for ' + path_url) return WbResponse.text_response('select text for ' + path_url)
coll, sesh_id = self.get_coll(env) coll, ts, sesh_id = self.get_coll(env)
route_temp = env['pywb.proxy_scheme'] + '://%s.coll.set.' route_temp = env['pywb.proxy_scheme'] + '://%s-set.'
route_temp += self.magic_name + '/' + path_url route_temp += self.magic_name + '/' + path_url
return (self.proxy_select_view. return (self.proxy_select_view.
@ -217,14 +242,18 @@ class CookieResolver(BaseCollResolver): # pragma: no cover
headers = [('Set-Cookie', cookie_val)] headers = [('Set-Cookie', cookie_val)]
return headers return headers
def make_sethost_cookie_response(self, sesh_id, path_url, env, headers=None): def make_sethost_cookie_response(self, sesh_id, path_url,
env, headers=None):
if '://' not in path_url:
path_url = 'http://' + path_url
path_parts = urlparse.urlsplit(path_url) path_parts = urlparse.urlsplit(path_url)
new_url = path_parts.path[1:] new_url = path_parts.path[1:]
if path_parts.query: if path_parts.query:
new_url += '?' + path_parts.query new_url += '?' + path_parts.query
return self.make_magic_response(sesh_id + '.sethost', new_url, env, return self.make_magic_response(sesh_id + '-sethost', new_url, env,
suffix=path_parts.netloc, suffix=path_parts.netloc,
headers=headers) headers=headers)
@ -236,25 +265,44 @@ class CookieResolver(BaseCollResolver): # pragma: no cover
if suffix: if suffix:
full_url += '.' + suffix full_url += '.' + suffix
full_url += '/' + url full_url += '/' + url
return WbResponse.redir_response(full_url, headers=headers) return self.make_redir_response(full_url, headers=headers)
def set_coll(self, sesh_id, coll):
    """Remember the collection selected for a proxy session.

    The value is stored in ``self.cache`` under the session id suffixed
    with ``':c'``.
    """
    coll_key = sesh_id + ':c'
    self.cache[coll_key] = coll
def set_ts(self, sesh_id, ts):
    """Store or clear the timestamp selected for a proxy session.

    The timestamp lives in ``self.cache`` under the session id suffixed
    with ``':t'``.

    :param sesh_id: session id string used as the cache key prefix
    :param ts: timestamp to remember; a false value (``None`` or ``''``)
        clears any stored timestamp instead
    """
    ts_key = sesh_id + ':t'

    if ts:
        self.cache[ts_key] = ts
        return

    # this ensures that omitting timestamp will reset to latest
    # capture by deleting the cache entry
    try:
        # plain item deletion is used (not pop()) because the cache may be
        # a wrapper object that only implements item access
        del self.cache[ts_key]
    except KeyError:
        # no timestamp was stored for this session: already reset
        pass
def get_coll(self, env): def get_coll(self, env):
sesh_id = self.extract_client_cookie(env, self.cookie_name) sesh_id = self.extract_client_cookie(env, self.cookie_name)
coll = None coll = None
ts = None
if sesh_id: if sesh_id:
coll = self.cache[sesh_id] coll = self.cache[sesh_id + ':c']
try:
ts = self.cache[sesh_id + ':t']
except KeyError:
pass
return coll, sesh_id return coll, ts, sesh_id
def create_renew_sesh_id(self, sesh_id, force=False): def create_renew_sesh_id(self, sesh_id, force=False):
#if sesh_id in self.cache and not force: #if sesh_id in self.cache and not force:
if sesh_id and (sesh_id in self.cache) and not force: if sesh_id and ((sesh_id + ':c') in self.cache) and not force:
return sesh_id return sesh_id
sesh_id = base64.b32encode(os.urandom(5)).lower() sesh_id = base64.b32encode(os.urandom(5)).lower()
return sesh_id return sesh_id
def make_redir_response(self, url, headers=None):
    """Build a redirect response to `url`.

    Thin wrapper around ``WbResponse.redir_response``; `headers` is passed
    through unchanged as the ``headers`` keyword argument.
    """
    redirect = WbResponse.redir_response(url, headers=headers)
    return redirect
@staticmethod @staticmethod
def extract_client_cookie(env, cookie_name): def extract_client_cookie(env, cookie_name):
cookie_header = env.get('HTTP_COOKIE') cookie_header = env.get('HTTP_COOKIE')

View File

@ -125,7 +125,11 @@ class WSGIApp(object):
else: else:
err_url = None err_url = None
err_msg = exc.message.encode('utf-8') try:
err_msg = exc.message.encode('utf-8')
except Exception:
err_msg = exc.message
err_url = ''
if print_trace: if print_trace:
import traceback import traceback

View File

@ -144,7 +144,7 @@ class HttpsUrlRewriter(object):
else: else:
return url return url
def get_timestamp_url(self, timestamp, url): def get_timestamp_url(self, timestamp, url=''):
return url return url
def get_abs_url(self, url=''): def get_abs_url(self, url=''):

View File

@ -70,9 +70,13 @@ function init_banner() {
text += "<b id='_wb_capture_info'>" + capture_str + "</b>"; text += "<b id='_wb_capture_info'>" + capture_str + "</b>";
if (wbinfo.proxy_select && wbinfo.url) { if (wbinfo.proxy_magic && wbinfo.url) {
full_url = wbinfo.proxy_select + "/" + wbinfo.url; var select_url = wbinfo.proxy_magic + "/" + wbinfo.url;
text += '<br/><a href="//' + full_url + '">Switch Collection</a>'; var query_url = wbinfo.proxy_magic + "/*/" + wbinfo.url;
text += '<br/>'
text += 'From <b>' + wbinfo.coll + '</b>&nbsp;<a href="//select.' + select_url + '">[Switch]</a>';
text += '&nbsp;&nbsp;';
text += '<a href="//query.' + query_url + '">View All Captures</a>';
} }
banner.innerHTML = text; banner.innerHTML = text;

View File

@ -10,9 +10,9 @@
</p> </p>
{% endif %} {% endif %}
{% if env.pywb_proxy_select and err_url and status == '404 Not Found' %} {% if env.pywb_proxy_magic and err_url and status == '404 Not Found' %}
<p> <p>
<a href="//{{ env.pywb_proxy_select }}/{{ err_url }}">Try Different Collections</a> <a href="//select.{{ env.pywb_proxy_magic }}/{{ err_url }}">Try Different Collection</a>
</p> </p>
{% endif %} {% endif %}

View File

@ -20,7 +20,8 @@
wbinfo.is_frame_mp = {{"true" if wbrequest.wb_url.mod == 'mp_' else "false"}}; wbinfo.is_frame_mp = {{"true" if wbrequest.wb_url.mod == 'mp_' else "false"}};
wbinfo.canon_url = "{{ canon_url }}"; wbinfo.canon_url = "{{ canon_url }}";
wbinfo.is_live = {{ "true" if cdx.is_live else "false" }}; wbinfo.is_live = {{ "true" if cdx.is_live else "false" }};
wbinfo.proxy_select = "{{ wbrequest.env.pywb_proxy_select }}"; wbinfo.coll = "{{ wbrequest.coll }}";
wbinfo.proxy_magic = "{{ wbrequest.env.pywb_proxy_magic }}";
</script> </script>
<script src='{{ wbrequest.host_prefix }}/{{ static_path }}/wb.js'> </script> <script src='{{ wbrequest.host_prefix }}/{{ static_path }}/wb.js'> </script>
<link rel='stylesheet' href='{{ wbrequest.host_prefix }}/{{ static_path }}/wb.css'/> <link rel='stylesheet' href='{{ wbrequest.host_prefix }}/{{ static_path }}/wb.css'/>

View File

@ -78,7 +78,7 @@ def create_live_handler(config):
#================================================================= #=================================================================
def init_route_config(value, config): def init_route_config(value, config):
if isinstance(value, str): if isinstance(value, str) or isinstance(value, list):
value = dict(index_paths=value) value = dict(index_paths=value)
route_config = DictChain(value, config) route_config = DictChain(value, config)