1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-20 02:39:13 +01:00
pywb/pywb/framework/proxy_resolvers.py
Ilya Kreymer 1aac5a9f15 cache: move cache wrappers to separate cache.py in framework from
proxy_resolvers
range cache: and buffering cache for serving range requests, intended
for videos but not only. full response cached in temp file and range
requests served from cache, still experimental
need to add deletion.
youtube_dl: wrap youtube-dl import due to youtube-dl HTMLParser regex
bug
tests: add test for vi_ handler
2014-11-01 15:41:01 -07:00

294 lines
9.6 KiB
Python

from wbrequestresponse import WbResponse
from pywb.utils.loaders import extract_client_cookie
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.rewrite.wburl import WbUrl
from cache import create_cache
import urlparse
import base64
import os
#=================================================================
class BaseCollResolver(object):
    """Base class for picking a route/collection for a proxy-mode request.

    Subclasses provide ``get_proxy_coll_ts()`` (which collection and
    timestamp the client selected) and ``select_coll_response()`` (what to
    send when no collection can be determined); ``resolve()`` combines
    them with the configured routes.
    """

    # Types accepted as a *named* default collection. On Python 2 this is
    # ``basestring`` so that unicode config values are honoured as well;
    # where ``basestring`` does not exist, plain ``str`` is used.
    try:
        _STRING_TYPES = (basestring,)
    except NameError:
        _STRING_TYPES = (str,)

    def __init__(self, routes, config):
        """Store the routes and read resolver options from ``config``.

        :param routes: sequence of route objects; ``resolve()`` uses their
            ``is_handling()`` method and ``path`` attribute.
        :param config: mapping of options. ``pre_connect`` defaults to
            False; ``use_default_coll`` defaults to True and may also be
            the name of the collection to fall back to.
        """
        self.routes = routes
        self.pre_connect = config.get('pre_connect', False)
        self.use_default_coll = config.get('use_default_coll', True)

    def get_proxy_coll_ts(self, env):
        """Return ``(coll, ts)`` selected by the client for this request.

        ``resolve()`` treats ``coll == ''`` as an invalid selection and
        ``coll is None`` as "nothing selected". Must be overridden.
        """
        raise NotImplementedError('get_proxy_coll_ts() must be overridden')

    def select_coll_response(self, env):
        """Return the response asking the client to select a collection.

        Must be overridden.
        """
        raise NotImplementedError('select_coll_response() must be overridden')

    def resolve(self, env):
        """Resolve the request to a route.

        :param env: request environ, passed through to the subclass hooks.
        :returns: a 5-tuple ``(route, coll, matcher, ts, response)``.
            When a route is found ``response`` is None; otherwise the
            first four items are None and ``response`` is the value of
            ``select_coll_response(env)``.
        """
        proxy_coll, ts = self.get_proxy_coll_ts(env)

        # invalid parsing
        if proxy_coll == '':
            return None, None, None, None, self.select_coll_response(env)

        # nothing selected, but a default collection is named in the config
        if (proxy_coll is None and
                isinstance(self.use_default_coll, self._STRING_TYPES)):
            proxy_coll = self.use_default_coll

        route = None
        coll = None
        matcher = None

        if proxy_coll:
            proxy_coll = '/' + proxy_coll + '/'

            # first route that reports a match wins
            for r in self.routes:
                matcher, c = r.is_handling(proxy_coll)
                if matcher:
                    route = r
                    coll = c
                    break

            # if no match, return coll selection response
            if not route:
                return None, None, None, None, self.select_coll_response(env)

        # no collection given: fall back to the first route when defaults
        # are enabled or there is only one route to choose from
        elif self.use_default_coll or len(self.routes) == 1:
            route = self.routes[0]
            coll = self.routes[0].path

        # otherwise, return the appropriate coll selection response
        else:
            return None, None, None, None, self.select_coll_response(env)

        return route, coll, matcher, ts, None
#=================================================================
class ProxyAuthResolver(BaseCollResolver):
    """Resolver that reads the collection name from proxy Basic auth.

    The user name in the ``Proxy-Authorization`` header is used as the
    collection; when it is missing or unusable a 407 response prompts the
    client for credentials.
    """

    DEFAULT_MSG = 'Please enter name of a collection to use with proxy mode'

    def __init__(self, routes, config):
        """Force ``pre_connect`` on and read the optional ``auth_msg``.

        Note that ``config`` is modified in place (``pre_connect`` is set
        to True) before being handed to the base class.
        """
        config['pre_connect'] = True
        super(ProxyAuthResolver, self).__init__(routes, config)
        self.auth_msg = config.get('auth_msg', self.DEFAULT_MSG)

    def get_proxy_coll_ts(self, env):
        """Return ``(coll, None)`` from the ``HTTP_PROXY_AUTHORIZATION`` key.

        ``coll`` is None when the header is absent and ``''`` when the
        header is present but cannot be parsed. No timestamp is carried
        by this resolver, so the second item is always None.
        """
        proxy_auth = env.get('HTTP_PROXY_AUTHORIZATION')

        if not proxy_auth:
            return None, None

        proxy_coll = self.read_basic_auth_coll(proxy_auth)
        return proxy_coll, None

    def select_coll_response(self, env):
        """Build the 407 response carrying a Basic ``Proxy-Authenticate``
        challenge, with ``auth_msg`` as both realm and body text.
        """
        proxy_msg = 'Basic realm="{0}"'.format(self.auth_msg)

        headers = [('Content-Type', 'text/plain'),
                   ('Proxy-Authenticate', proxy_msg)]

        status_headers = StatusAndHeaders('407 Proxy Authentication', headers)

        value = self.auth_msg

        return WbResponse(status_headers, value=[value])

    @staticmethod
    def read_basic_auth_coll(value):
        """Extract the user name from a Basic authorization header value.

        :param value: header value of the form ``Basic <base64 user:pass>``.
        :returns: the part of the decoded credentials before the first
            ``:``, or ``''`` when the scheme is not Basic, the value does
            not consist of exactly two space-separated parts, or the
            credentials are not decodable base64.
        """
        parts = value.split(' ')
        if parts[0].lower() != 'basic':
            return ''

        if len(parts) != 2:
            return ''

        # The header is client-supplied: treat undecodable credentials as
        # an invalid header instead of letting the decode error escape.
        # b64decode raises TypeError on Python 2 and binascii.Error
        # (a ValueError subclass) on Python 3.
        try:
            user_pass = base64.b64decode(parts[1])
        except (TypeError, ValueError):
            return ''

        return user_pass.split(':')[0]
#=================================================================
class CookieResolver(BaseCollResolver):
    """Resolver that tracks the selected collection via a session cookie.

    A session id is stored in a cookie; the collection and optional
    timestamp chosen for that session are kept in ``self.cache`` under
    the keys ``<sesh_id>:c`` and ``<sesh_id>:t``. Requests to hosts
    containing ``.<magic_name>`` are treated as control ("magic") pages
    used to select a collection, set a timestamp and propagate the
    cookie to other domains through redirects.
    """

    SESH_COOKIE_NAME = '__pywb_proxy_sesh'

    def __init__(self, routes, config):
        """Force ``pre_connect`` off and read the cookie resolver options.

        ``config`` is modified in place (``pre_connect`` is set to False).
        ``magic_name`` is required; ``cookie_name``, ``proxy_select_view``
        and ``extra_headers`` are optional.
        """
        config['pre_connect'] = False
        super(CookieResolver, self).__init__(routes, config)
        self.magic_name = config['magic_name']
        self.sethost_prefix = '-sethost.' + self.magic_name + '.'
        self.set_prefix = '-set.' + self.magic_name

        self.cookie_name = config.get('cookie_name', self.SESH_COOKIE_NAME)
        self.proxy_select_view = config.get('proxy_select_view')

        self.extra_headers = config.get('extra_headers')

        self.cache = create_cache()

    def get_proxy_coll_ts(self, env):
        """Return the ``(coll, ts)`` stored for this request's session."""
        coll, ts, sesh_id = self.get_coll(env)
        return coll, ts

    def select_coll_response(self, env):
        """Redirect to the ``auto.<magic_name>`` host, keeping the
        originally requested url (``REL_REQUEST_URI``) in the path.
        """
        return self.make_magic_response('auto',
                                        env['REL_REQUEST_URI'],
                                        env)

    def resolve(self, env):
        """Handle magic hosts first, then fall back to normal resolution.

        :returns: the same 5-tuple as ``BaseCollResolver.resolve()``.
        """
        server_name = env['pywb.proxy_host']

        if ('.' + self.magic_name) in server_name:
            response = self.handle_magic_page(env)
            if response:
                return None, None, None, None, response

        return super(CookieResolver, self).resolve(env)

    def handle_magic_page(self, env):
        """Dispatch a request for a magic host based on its host name.

        Recognised forms of ``env['pywb.proxy_host']``, checked in order:

        * starts with ``auto``: redirect to the sethost page if the
          session already has a collection, else to the select page
        * starts with ``query.``: store the timestamp from the url for
          the session and redirect to the target url
        * ends with ``-set.<magic_name>``: store the collection named by
          the host prefix, creating a session id if needed
        * contains ``-sethost.<magic_name>.``: set the session cookie on
          the target domain and redirect there
        * contains ``select.``: render the collection selection view

        :returns: a response, or None when the request should be resolved
            normally (query-type urls and unrecognised host names).
        """
        request_url = env['REL_REQUEST_URI']
        parts = urlparse.urlsplit(request_url)
        server_name = env['pywb.proxy_host']

        # path without the leading '/', plus the query string if any
        path_url = parts.path[1:]
        if parts.query:
            path_url += '?' + parts.query

        if server_name.startswith('auto'):
            coll, ts, sesh_id = self.get_coll(env)

            if coll:
                return self.make_sethost_cookie_response(sesh_id,
                                                         path_url,
                                                         env)

            return self.make_magic_response('select', path_url, env)

        if server_name.startswith('query.'):
            wb_url = WbUrl(path_url)

            # only dealing with specific timestamp setting
            if wb_url.is_query():
                return None

            coll, ts, sesh_id = self.get_coll(env)
            if not coll:
                return self.make_magic_response('select', path_url, env)

            self.set_ts(sesh_id, wb_url.timestamp)
            return self.make_redir_response(wb_url.url)

        if server_name.endswith(self.set_prefix):
            old_sesh_id = extract_client_cookie(env, self.cookie_name)
            sesh_id = self.create_renew_sesh_id(old_sesh_id)

            # only send a Set-Cookie header when a new id was issued
            if sesh_id != old_sesh_id:
                headers = self.make_cookie_headers(sesh_id, self.magic_name)
            else:
                headers = None

            # everything before '-set.<magic_name>' is the collection name
            coll = server_name[:-len(self.set_prefix)]

            # set sesh value
            self.set_coll(sesh_id, coll)

            return self.make_sethost_cookie_response(sesh_id, path_url, env,
                                                     headers=headers)

        if self.sethost_prefix in server_name:
            # host is '<sesh_id>-sethost.<magic_name>.<domain>'
            inx = server_name.find(self.sethost_prefix)
            sesh_id = server_name[:inx]

            domain = server_name[inx + len(self.sethost_prefix):]

            headers = self.make_cookie_headers(sesh_id, domain)

            full_url = env['pywb.proxy_scheme'] + '://' + domain
            full_url += '/' + path_url
            return self.make_redir_response(full_url, headers=headers)

        if 'select.' in server_name:
            coll, ts, sesh_id = self.get_coll(env)

            route_temp = '-set.' + self.magic_name + '/' + path_url

            return (self.proxy_select_view.
                    render_response(routes=self.routes,
                                    route_temp=route_temp,
                                    coll=coll,
                                    url=path_url))

        # unrecognised magic host: let the caller resolve it normally
        return None

    def make_cookie_headers(self, sesh_id, domain):
        """Return a one-item header list setting the session cookie for
        ``.<domain>``.
        """
        cookie_val = '{0}={1}; Path=/; Domain=.{2}; HttpOnly'
        cookie_val = cookie_val.format(self.cookie_name, sesh_id, domain)
        headers = [('Set-Cookie', cookie_val)]
        return headers

    def make_sethost_cookie_response(self, sesh_id, path_url,
                                     env, headers=None):
        """Redirect to ``<sesh_id>-sethost.<magic_name>.<target host>``.

        ``path_url`` is the target url; ``http://`` is assumed when it has
        no scheme. Its host becomes the redirect host suffix and its path
        and query are carried over as the redirect path.
        """
        if '://' not in path_url:
            path_url = 'http://' + path_url

        path_parts = urlparse.urlsplit(path_url)

        new_url = path_parts.path[1:]
        if path_parts.query:
            new_url += '?' + path_parts.query

        return self.make_magic_response(sesh_id + '-sethost', new_url, env,
                                        suffix=path_parts.netloc,
                                        headers=headers)

    def make_magic_response(self, prefix, url, env,
                            suffix=None, headers=None):
        """Redirect to ``<scheme>://<prefix>.<magic_name>[.<suffix>]/<url>``
        where the scheme is taken from ``env['pywb.proxy_scheme']``.
        """
        full_url = env['pywb.proxy_scheme'] + '://' + prefix + '.'
        full_url += self.magic_name
        if suffix:
            full_url += '.' + suffix
        full_url += '/' + url
        return self.make_redir_response(full_url, headers=headers)

    def set_coll(self, sesh_id, coll):
        """Store ``coll`` as the collection for session ``sesh_id``."""
        self.cache[sesh_id + ':c'] = coll

    def set_ts(self, sesh_id, ts):
        """Store ``ts`` for the session, or clear it when ``ts`` is empty."""
        key = sesh_id + ':t'

        if ts:
            self.cache[key] = ts
            return

        # this ensures that omitting timestamp will reset to latest
        # capture by deleting the cache entry; if no timestamp was ever
        # stored there is nothing to reset, so a missing key is fine
        try:
            del self.cache[key]
        except KeyError:
            pass

    def _cache_lookup(self, key):
        """Return the cached value for ``key``, or None if the cache
        raises KeyError for it.
        """
        try:
            return self.cache[key]
        except KeyError:
            return None

    def get_coll(self, env):
        """Return ``(coll, ts, sesh_id)`` for the request's session cookie.

        ``sesh_id`` is whatever ``extract_client_cookie`` returns for the
        configured cookie name; ``coll`` and ``ts`` are None when there is
        no session id or no value is cached for it.
        """
        sesh_id = extract_client_cookie(env, self.cookie_name)

        coll = None
        ts = None
        if sesh_id:
            # a stale or unknown session id is treated as "nothing
            # selected", the same way a missing timestamp is
            coll = self._cache_lookup(sesh_id + ':c')
            ts = self._cache_lookup(sesh_id + ':t')

        return coll, ts, sesh_id

    def create_renew_sesh_id(self, sesh_id, force=False):
        """Return ``sesh_id`` if it is known to the cache, else a new id.

        :param sesh_id: existing session id, may be empty/None.
        :param force: when true, always generate a new id.
        :returns: the reused id, or a new lower-cased base32 encoding of
            5 random bytes.
        """
        if sesh_id and ((sesh_id + ':c') in self.cache) and not force:
            return sesh_id

        sesh_id = base64.b32encode(os.urandom(5)).lower()
        return sesh_id

    def make_redir_response(self, url, headers=None):
        """Build a redirect to ``url`` with ``headers`` followed by any
        configured ``extra_headers``.

        A non-empty ``headers`` list passed by the caller is extended in
        place.
        """
        if not headers:
            headers = []

        if self.extra_headers:
            for name, value in self.extra_headers.iteritems():
                headers.append((name, value))

        return WbResponse.redir_response(url, headers=headers)