1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 08:04:49 +01:00

proxy: timestamp selection support!

certauth: wildcard support, use *.host wildcard for proxy certs whenever possible
ui: add coll info/switch and calendar links to banner
This commit is contained in:
Ilya Kreymer 2014-07-31 11:12:50 -07:00
parent eff5a74ec7
commit 522ea87637
9 changed files with 160 additions and 65 deletions

View File

@ -45,13 +45,15 @@ class CertificateAuthority(object):
if not os.path.exists(certs_dir):
os.mkdir(certs_dir)
def get_cert_for_host(self, host, overwrite=False, wildcard=False):
    """Return the PEM certificate file for ``host``, creating it if needed.

    The file is located at ``<certs_dir>/<host>.pem``. An existing file
    is reused unless ``overwrite`` is true; otherwise a new certificate
    is generated with this authority's root cert and key.

    :param host: host name, also used as the base name of the PEM file
    :param overwrite: regenerate the certificate even if the file exists
    :param wildcard: passed through to ``generate_host_cert``
    :return: ``(created, host_filename)``; ``created`` is False when an
             existing file was reused
    """
    host_filename = os.path.join(self.certs_dir, host) + '.pem'

    # reuse a previously generated cert unless explicitly overwriting
    if not overwrite and os.path.exists(host_filename):
        return False, host_filename

    self.generate_host_cert(host, self.cert, self.key, host_filename,
                            wildcard)

    return True, host_filename
@staticmethod
@ -107,7 +109,8 @@ class CertificateAuthority(object):
return True, cert, key
@staticmethod
def generate_host_cert(host, root_cert, root_key, host_filename):
def generate_host_cert(host, root_cert, root_key, host_filename,
wildcard=False):
# Generate key
key = crypto.PKey()
key.generate_key(crypto.TYPE_RSA, 2048)
@ -123,6 +126,19 @@ class CertificateAuthority(object):
cert.set_issuer(root_cert.get_subject())
cert.set_pubkey(req.get_pubkey())
if wildcard:
DNS = 'DNS:'
alt_hosts = [DNS + host,
DNS + '*.' + host]
alt_hosts = ', '.join(alt_hosts)
cert.add_extensions([
crypto.X509Extension('subjectAltName',
False,
alt_hosts)])
cert.sign(root_key, 'sha1')
# Write cert + key
@ -163,6 +179,9 @@ def main():
parser.add_argument('-f', '--force', action='store_true')
parser.add_argument('-w', '--wildcard_cert', action='store_true',
help='add wildcard SAN to host: *.<host>, <host>')
result = parser.parse_args()
overwrite = result.force
@ -170,12 +189,13 @@ def main():
# Create a new signed certificate using specified root
if result.use_root:
certs_dir = result.certs_dir
# argparse names the attribute after the long option '--wildcard_cert',
# so the parsed flag is `wildcard_cert`; `result.wildcard` does not exist
wildcard = result.wildcard_cert
ca = CertificateAuthority(ca_file=result.use_root,
certs_dir=result.certs_dir,
certname=result.name)
created, host_filename = ca.get_cert_for_host(result.output_pem_file,
overwrite)
overwrite, wildcard)
if created:
print ('Created new cert "' + host_filename +

View File

@ -76,7 +76,6 @@ class ProxyRouter(object):
else:
self.resolver = ProxyAuthResolver(routes, proxy_options)
self.insert_banner = proxy_options.get('banner_only_replay', False)
self.unaltered = proxy_options.get('unaltered_replay', False)
self.proxy_pac_path = proxy_options.get('pac_path', self.PAC_PATH)
@ -115,10 +114,11 @@ class ProxyRouter(object):
coll = None
matcher = None
response = None
ts = None
# check resolver, for pre connect resolve
if self.resolver.pre_connect:
route, coll, matcher, response = self.resolver.resolve(env)
# resolve() returns (route, coll, matcher, ts, response): response is last
route, coll, matcher, ts, response = self.resolver.resolve(env)
if response:
return response
@ -138,26 +138,36 @@ class ProxyRouter(object):
if parts.query:
env['pywb.proxy_req_uri'] += '?' + parts.query
# select prefix
env['pywb_proxy_select'] = 'select.' + self.magic_name
env['pywb_proxy_magic'] = self.magic_name
# route (static) and other resources to archival replay
if env['pywb.proxy_host'] == self.magic_name:
env['REL_REQUEST_URI'] = env['pywb.proxy_req_uri']
return None
# check resolver, post connect
if not self.resolver.pre_connect:
route, coll, matcher, response = self.resolver.resolve(env)
route, coll, matcher, ts, response = self.resolver.resolve(env)
if response:
return response
host_prefix = env['pywb.proxy_scheme'] + '://' + self.magic_name
rel_prefix = ''
# special case for proxy calendar
if (env['pywb.proxy_host'] == 'query.' + self.magic_name):
url = env['pywb.proxy_req_uri'][1:]
rel_prefix = '/'
if ts is not None:
url = ts + '/' + url
wbrequest = route.request_class(env,
request_uri=url,
wb_url_str=url,
coll=coll,
host_prefix=host_prefix,
rel_prefix=rel_prefix,
wburl_class=route.handler.get_wburl_type(),
urlrewriter_class=HttpsUrlRewriter,
use_abs_prefix=False,
@ -166,10 +176,10 @@ class ProxyRouter(object):
if matcher:
route.apply_filters(wbrequest, matcher)
if self.insert_banner:
wbrequest.wb_url.mod = 'bn_'
elif self.unaltered:
if self.unaltered:
wbrequest.wb_url.mod = 'id_'
elif is_https:
wbrequest.wb_url.mod = 'bn_'
return route.handler(wbrequest)
@ -209,13 +219,23 @@ class ProxyRouter(object):
sock.send('\r\n')
hostname, port = env['REL_REQUEST_URI'].split(':')
created, certfile = self.ca.get_cert_for_host(hostname)
cert_host = hostname
ssl_sock = ssl.wrap_socket(sock,
server_side=True,
certfile=certfile,
ciphers="ALL",
ssl_version=ssl.PROTOCOL_SSLv23)
host_parts = hostname.split('.', 1)
if len(host_parts) == 2 and '.' in host_parts[1]:
cert_host = host_parts[1]
created, certfile = self.ca.get_cert_for_host(cert_host,
wildcard=True)
try:
ssl_sock = ssl.wrap_socket(sock,
server_side=True,
certfile=certfile,
ciphers="ALL",
ssl_version=ssl.PROTOCOL_SSLv23)
except Exception as se:
raise BadRequestException(se.message)
env['pywb.proxy_ssl_sock'] = ssl_sock
@ -244,7 +264,6 @@ class ProxyRouter(object):
env['PATH_INFO'] = queryparts[0]
env['QUERY_STRING'] = queryparts[1] if len(queryparts) > 1 else ''
while True:
line = buffreader.readline()
if line:
@ -270,8 +289,7 @@ class ProxyRouter(object):
remain = buffreader.rem_length()
if remain > 0:
remainder = buffreader.read(self.BLOCK_SIZE)
input_ = socket._fileobject(ssl_sock, mode='r')
env['wsgi.input'] = BufferedReader(input_,
env['wsgi.input'] = BufferedReader(ssl_sock,
block_size=self.BLOCK_SIZE,
starting_data=remainder)

View File

@ -1,5 +1,7 @@
from wbrequestresponse import WbResponse, WbRequest
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.rewrite.wburl import WbUrl
import urlparse
import base64
import os
@ -22,6 +24,9 @@ class UwsgiCache(object):
def __contains__(self, item):
    """Report whether ``item`` is present, as told by ``uwsgi.cache_exists``."""
    return uwsgi.cache_exists(item)
def __delitem__(self, item):
    """Remove ``item`` from the cache via ``uwsgi.cache_del``."""
    uwsgi.cache_del(item)
#=================================================================
class BaseCollResolver(object):
@ -34,12 +39,13 @@ class BaseCollResolver(object):
route = None
coll = None
matcher = None
ts = None
proxy_coll = self.get_proxy_coll(env)
proxy_coll, ts = self.get_proxy_coll_ts(env)
# invalid parsing
if proxy_coll == '':
return None, None, None, self.select_coll_response(env)
return None, None, None, None, self.select_coll_response(env)
if proxy_coll is None and isinstance(self.use_default_coll, str):
proxy_coll = self.use_default_coll
@ -56,7 +62,7 @@ class BaseCollResolver(object):
# if no match, return coll selection response
if not route:
return None, None, None, self.select_coll_response(env)
return None, None, None, None, self.select_coll_response(env)
# if 'use_default_coll'
elif self.use_default_coll == True or len(self.routes) == 1:
@ -65,9 +71,9 @@ class BaseCollResolver(object):
# otherwise, return the appropriate coll selection response
else:
return None, None, None, self.select_coll_response(env)
return None, None, None, None, self.select_coll_response(env)
return route, coll, matcher, None
return route, coll, matcher, ts, None
#=================================================================
@ -79,14 +85,14 @@ class ProxyAuthResolver(BaseCollResolver):
super(ProxyAuthResolver, self).__init__(routes, config)
self.auth_msg = config.get('auth_msg', self.DEFAULT_MSG)
def get_proxy_coll_ts(self, env):
    """Return ``(coll, ts)`` for the request's Proxy-Authorization header.

    The collection is decoded from the header by
    ``read_basic_auth_coll``. This resolver never selects a timestamp,
    so ``ts`` is always None; ``(None, None)`` is returned when the
    header is missing or empty.

    :param env: WSGI environ dict
    :return: ``(proxy_coll, None)``
    """
    proxy_auth = env.get('HTTP_PROXY_AUTHORIZATION')

    if not proxy_auth:
        return None, None

    proxy_coll = self.read_basic_auth_coll(proxy_auth)
    return proxy_coll, None
def select_coll_response(self, env):
proxy_msg = 'Basic realm="{0}"'.format(self.auth_msg)
@ -120,6 +126,9 @@ class CookieResolver(BaseCollResolver): # pragma: no cover
config['pre_connect'] = False
super(CookieResolver, self).__init__(routes, config)
self.magic_name = config['magic_name']
self.sethost_prefix = '-sethost.' + self.magic_name + '.'
self.set_prefix = '-set.' + self.magic_name
self.cookie_name = config.get('cookie_name', '__pywb_coll')
self.proxy_select_view = config.get('proxy_select_view')
@ -128,9 +137,9 @@ class CookieResolver(BaseCollResolver): # pragma: no cover
else:
self.cache = {}
def get_proxy_coll_ts(self, env):
    """Return the ``(coll, ts)`` pair stored for the client's session.

    ``get_coll`` yields ``(coll, ts, sesh_id)``; the session id is not
    needed by the caller and is dropped here.

    :param env: WSGI environ dict
    """
    coll, ts, _sesh_id = self.get_coll(env)
    return coll, ts
def select_coll_response(self, env):
return self.make_magic_response('auto',
@ -141,27 +150,44 @@ class CookieResolver(BaseCollResolver): # pragma: no cover
server_name = env['pywb.proxy_host']
if ('.' + self.magic_name) in server_name:
return None, None, None, self.handle_magic_page(env)
response = self.handle_magic_page(env)
if response:
return None, None, None, None, response
return super(CookieResolver, self).resolve(env)
def handle_magic_page(self, env):
url = env['REL_REQUEST_URI']
parts = urlparse.urlsplit(url)
request_url = env['REL_REQUEST_URI']
parts = urlparse.urlsplit(request_url)
server_name = env['pywb.proxy_host']
path_url = parts.path[1:]
if parts.query:
path_url += '?' + parts.query
if parts.netloc.startswith('auto'):
coll, sesh_id = self.get_coll(env)
if server_name.startswith('auto'):
coll, ts, sesh_id = self.get_coll(env)
if coll:
return self.make_sethost_cookie_response(sesh_id, path_url, env)
else:
return self.make_magic_response('select', path_url, env)
elif '.set.' in parts.netloc:
elif server_name.startswith('query.'):
wb_url = WbUrl(path_url)
# only dealing with specific timestamp setting
if wb_url.is_query():
return None
coll, ts, sesh_id = self.get_coll(env)
if not coll:
return self.make_magic_response('select', path_url, env)
self.set_ts(sesh_id, wb_url.timestamp)
return self.make_redir_response(wb_url.url)
elif server_name.endswith(self.set_prefix):
old_sesh_id = self.extract_client_cookie(env, self.cookie_name)
sesh_id = self.create_renew_sesh_id(old_sesh_id)
@ -170,34 +196,33 @@ class CookieResolver(BaseCollResolver): # pragma: no cover
else:
headers = None
value, name, _ = parts.netloc.split('.', 2)
coll = server_name[:-len(self.set_prefix)]
# set sesh value
self.cache[sesh_id] = value
self.set_coll(sesh_id, coll)
return self.make_sethost_cookie_response(sesh_id, path_url, env,
headers=headers)
elif '.sethost.' in parts.netloc:
host_parts = parts.netloc.split('.', 1)
sesh_id = host_parts[0]
elif self.sethost_prefix in server_name:
inx = server_name.find(self.sethost_prefix)
sesh_id = server_name[:inx]
inx = parts.netloc.find('.' + self.magic_name + '.')
domain = parts.netloc[inx + len(self.magic_name) + 2:]
domain = server_name[inx + len(self.sethost_prefix):]
headers = self.make_cookie_headers(sesh_id, domain)
full_url = env['pywb.proxy_scheme'] + '://' + domain
full_url += '/' + path_url
return WbResponse.redir_response(full_url, headers=headers)
return self.make_redir_response(full_url, headers=headers)
elif 'select.' in parts.netloc:
elif 'select.' in server_name:
if not self.proxy_select_view:
return WbResponse.text_response('select text for ' + path_url)
coll, sesh_id = self.get_coll(env)
coll, ts, sesh_id = self.get_coll(env)
route_temp = env['pywb.proxy_scheme'] + '://%s.coll.set.'
route_temp = env['pywb.proxy_scheme'] + '://%s-set.'
route_temp += self.magic_name + '/' + path_url
return (self.proxy_select_view.
@ -217,14 +242,18 @@ class CookieResolver(BaseCollResolver): # pragma: no cover
headers = [('Set-Cookie', cookie_val)]
return headers
def make_sethost_cookie_response(self, sesh_id, path_url,
                                 env, headers=None):
    """Build the magic-host response for the '<sesh_id>-sethost' step.

    ``path_url`` is split into its host and its path (plus query). The
    host is passed to ``make_magic_response`` as ``suffix`` and the
    path/query as the url.

    :param sesh_id: session id, used as the prefix before '-sethost'
    :param path_url: target url; 'http://' is prepended when it has no
                     scheme so that it splits into host and path
    :param env: WSGI environ dict, passed through unchanged
    :param headers: optional extra response headers, passed through
    :return: whatever ``make_magic_response`` returns
    """
    if '://' not in path_url:
        path_url = 'http://' + path_url

    path_parts = urlparse.urlsplit(path_url)

    # drop the leading '/' of the path; keep the query string if present
    new_url = path_parts.path[1:]
    if path_parts.query:
        new_url += '?' + path_parts.query

    return self.make_magic_response(sesh_id + '-sethost', new_url, env,
                                    suffix=path_parts.netloc,
                                    headers=headers)
@ -236,25 +265,44 @@ class CookieResolver(BaseCollResolver): # pragma: no cover
if suffix:
full_url += '.' + suffix
full_url += '/' + url
return WbResponse.redir_response(full_url, headers=headers)
return self.make_redir_response(full_url, headers=headers)
def set_coll(self, sesh_id, coll):
    """Store ``coll`` in the cache under the key ``sesh_id + ':c'``."""
    coll_key = sesh_id + ':c'
    self.cache[coll_key] = coll
def set_ts(self, sesh_id, ts):
    """Store or clear the timestamp kept under ``sesh_id + ':t'``.

    :param sesh_id: session id whose timestamp entry is updated
    :param ts: timestamp to store; a falsy value removes the entry
    """
    ts_key = sesh_id + ':t'

    if ts:
        self.cache[ts_key] = ts
        return

    # omitting the timestamp resets to the latest capture by deleting
    # the cache entry; there may be no entry yet, which is fine
    try:
        del self.cache[ts_key]
    except KeyError:
        pass
def get_coll(self, env):
    """Look up the collection and timestamp stored for the client session.

    The session id is read from the client cookie named
    ``self.cookie_name``. The collection is cached under
    ``sesh_id + ':c'`` and the optional timestamp under
    ``sesh_id + ':t'``.

    :param env: WSGI environ dict
    :return: ``(coll, ts, sesh_id)``; ``coll`` and ``ts`` are None when
             nothing is cached for the session, and all three are None
             when the client sent no session cookie
    """
    sesh_id = self.extract_client_cookie(env, self.cookie_name)

    coll = None
    ts = None

    if sesh_id:
        # a stale or unknown session id (e.g. after the cache was reset)
        # is treated as "no collection selected" rather than an error
        try:
            coll = self.cache[sesh_id + ':c']
        except KeyError:
            pass

        try:
            ts = self.cache[sesh_id + ':t']
        except KeyError:
            pass

    return coll, ts, sesh_id
def create_renew_sesh_id(self, sesh_id, force=False):
    """Return ``sesh_id`` if it is still usable, else a fresh session id.

    An id is kept only when it is non-empty, has a collection entry
    (``sesh_id + ':c'``) in the cache, and ``force`` is false.

    :param sesh_id: existing session id, or None/empty if the client
                    has none
    :param force: always generate a new id
    :return: the existing id, or a new lowercased base32 id built from
             5 random bytes
    """
    if sesh_id and ((sesh_id + ':c') in self.cache) and not force:
        return sesh_id

    return base64.b32encode(os.urandom(5)).lower()
def make_redir_response(self, url, headers=None):
    """Return ``WbResponse.redir_response`` for ``url``, passing ``headers`` on."""
    redirect = WbResponse.redir_response(url, headers=headers)
    return redirect
@staticmethod
def extract_client_cookie(env, cookie_name):
cookie_header = env.get('HTTP_COOKIE')

View File

@ -125,7 +125,11 @@ class WSGIApp(object):
else:
err_url = None
err_msg = exc.message.encode('utf-8')
try:
err_msg = exc.message.encode('utf-8')
except Exception:
err_msg = exc.message
err_url = ''
if print_trace:
import traceback

View File

@ -144,7 +144,7 @@ class HttpsUrlRewriter(object):
else:
return url
def get_timestamp_url(self, timestamp, url=''):
    """Return ``url`` unchanged; ``timestamp`` is not used.

    :param timestamp: ignored
    :param url: url to return, defaults to the empty string
    """
    return url
def get_abs_url(self, url=''):

View File

@ -70,9 +70,13 @@ function init_banner() {
text += "<b id='_wb_capture_info'>" + capture_str + "</b>";
if (wbinfo.proxy_select && wbinfo.url) {
full_url = wbinfo.proxy_select + "/" + wbinfo.url;
text += '<br/><a href="//' + full_url + '">Switch Collection</a>';
if (wbinfo.proxy_magic && wbinfo.url) {
var select_url = wbinfo.proxy_magic + "/" + wbinfo.url;
var query_url = wbinfo.proxy_magic + "/*/" + wbinfo.url;
text += '<br/>'
text += 'From <b>' + wbinfo.coll + '</b>&nbsp;<a href="//select.' + select_url + '">[Switch]</a>';
text += '&nbsp;&nbsp;';
text += '<a href="//query.' + query_url + '">View All Captures</a>';
}
banner.innerHTML = text;

View File

@ -10,9 +10,9 @@
</p>
{% endif %}
{% if env.pywb_proxy_select and err_url and status == '404 Not Found' %}
{% if env.pywb_proxy_magic and err_url and status == '404 Not Found' %}
<p>
<a href="//{{ env.pywb_proxy_select }}/{{ err_url }}">Try Different Collections</a>
<a href="//select.{{ env.pywb_proxy_magic }}/{{ err_url }}">Try Different Collection</a>
</p>
{% endif %}

View File

@ -20,7 +20,8 @@
wbinfo.is_frame_mp = {{"true" if wbrequest.wb_url.mod == 'mp_' else "false"}};
wbinfo.canon_url = "{{ canon_url }}";
wbinfo.is_live = {{ "true" if cdx.is_live else "false" }};
wbinfo.proxy_select = "{{ wbrequest.env.pywb_proxy_select }}";
wbinfo.coll = "{{ wbrequest.coll }}";
wbinfo.proxy_magic = "{{ wbrequest.env.pywb_proxy_magic }}";
</script>
<script src='{{ wbrequest.host_prefix }}/{{ static_path }}/wb.js'> </script>
<link rel='stylesheet' href='{{ wbrequest.host_prefix }}/{{ static_path }}/wb.css'/>

View File

@ -78,7 +78,7 @@ def create_live_handler(config):
#=================================================================
def init_route_config(value, config):
if isinstance(value, str):
if isinstance(value, str) or isinstance(value, list):
value = dict(index_paths=value)
route_config = DictChain(value, config)