mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-20 02:39:13 +01:00
proxy_resolvers range cache: add a buffering cache for serving range requests, intended for videos but not limited to them. The full response is cached in a temp file and range requests are served from the cache; this is still experimental and deletion of cached files still needs to be added. youtube_dl: wrap the youtube-dl import due to a youtube-dl HTMLParser regex bug. tests: add a test for the vi_ handler.
294 lines
9.6 KiB
Python
294 lines
9.6 KiB
Python
from wbrequestresponse import WbResponse
|
|
from pywb.utils.loaders import extract_client_cookie
|
|
from pywb.utils.statusandheaders import StatusAndHeaders
|
|
from pywb.rewrite.wburl import WbUrl
|
|
|
|
from cache import create_cache
|
|
|
|
import urlparse
|
|
import base64
|
|
import os
|
|
|
|
|
|
#=================================================================
|
|
class BaseCollResolver(object):
    """Shared collection-resolution logic for proxy mode resolvers.

    ``resolve`` calls ``self.get_proxy_coll_ts(env)`` and
    ``self.select_coll_response(env)``; neither is defined on this class,
    so concrete resolvers must provide them.
    """

    def __init__(self, routes, config):
        """Keep the route list and read resolver options from ``config``.

        :param routes: sequence of route objects; ``resolve`` uses their
            ``is_handling(path)`` method and ``path`` attribute.
        :param config: mapping read with ``.get`` for ``pre_connect``
            (default ``False``) and ``use_default_coll`` (default ``True``).
        """
        self.routes = routes
        self.pre_connect = config.get('pre_connect', False)
        self.use_default_coll = config.get('use_default_coll', True)

    def _selection_result(self, env):
        """Build the 5-tuple that carries only a collection-selection response."""
        return None, None, None, None, self.select_coll_response(env)

    def resolve(self, env):
        """Pick the route and collection for a proxy request.

        :param env: request environ, passed through to the subclass hooks.
        :returns: ``(route, coll, matcher, ts, response)``.  On success
            ``response`` is ``None``; otherwise the first four items are
            ``None`` and ``response`` is ``select_coll_response(env)``.
        """
        requested, ts = self.get_proxy_coll_ts(env)

        # an empty string is the "invalid parsing" marker
        if requested == '':
            return self._selection_result(env)

        # a string-valued 'use_default_coll' names the fallback collection
        if requested is None and isinstance(self.use_default_coll, str):
            requested = self.use_default_coll

        if requested:
            wrapped = '/' + requested + '/'

            for candidate in self.routes:
                matcher, matched_coll = candidate.is_handling(wrapped)
                if matcher:
                    return candidate, matched_coll, matcher, ts, None

            # a collection was named but no route handles it
            return self._selection_result(env)

        # nothing requested: fall back to the first route when defaults are
        # enabled or there is only one route to choose from
        if self.use_default_coll or len(self.routes) == 1:
            first = self.routes[0]
            return first, first.path, None, ts, None

        # otherwise the client has to choose a collection
        return self._selection_result(env)
|
|
|
|
|
|
#=================================================================
|
|
class ProxyAuthResolver(BaseCollResolver):
    """Resolve the proxy collection from HTTP Basic proxy credentials.

    The user name sent in the ``Proxy-Authorization`` header is used as the
    collection name.  When a collection has to be chosen, a 407 response
    carrying a ``Proxy-Authenticate`` challenge is returned.
    """

    DEFAULT_MSG = 'Please enter name of a collection to use with proxy mode'

    def __init__(self, routes, config):
        """Force ``pre_connect`` on and read the optional ``auth_msg``.

        :param routes: route list, handed to ``BaseCollResolver``.
        :param config: mutable mapping; ``config['pre_connect']`` is set to
            ``True`` before the base initializer reads it.
        """
        config['pre_connect'] = True
        super(ProxyAuthResolver, self).__init__(routes, config)
        self.auth_msg = config.get('auth_msg', self.DEFAULT_MSG)

    def get_proxy_coll_ts(self, env):
        """Return ``(collection, None)`` from the proxy auth header.

        The collection is ``None`` when the header is absent or empty, and
        ``''`` when the header could not be parsed.  This resolver never
        supplies a timestamp.
        """
        proxy_auth = env.get('HTTP_PROXY_AUTHORIZATION')

        if not proxy_auth:
            return None, None

        proxy_coll = self.read_basic_auth_coll(proxy_auth)
        return proxy_coll, None

    def select_coll_response(self, env):
        """Build the 407 response asking the client for credentials."""
        proxy_msg = 'Basic realm="{0}"'.format(self.auth_msg)

        headers = [('Content-Type', 'text/plain'),
                   ('Proxy-Authenticate', proxy_msg)]

        status_headers = StatusAndHeaders('407 Proxy Authentication', headers)

        value = self.auth_msg

        return WbResponse(status_headers, value=[value])

    @staticmethod
    def read_basic_auth_coll(value):
        """Extract the user name from a ``Basic <base64>`` header value.

        :param value: raw ``Proxy-Authorization`` header value.
        :returns: the text before the first ``:`` of the decoded
            credentials, or ``''`` when the scheme is not Basic, the value
            does not consist of exactly two space-separated parts, or the
            credentials cannot be decoded.
        """
        parts = value.split(' ')
        if parts[0].lower() != 'basic':
            return ''

        if len(parts) != 2:
            return ''

        # The header comes from the client: a malformed token must yield the
        # "invalid" marker rather than an unhandled exception.  b64decode
        # raises TypeError on Python 2 and binascii.Error (a ValueError) on
        # Python 3; UnicodeDecodeError is a ValueError as well.
        try:
            user_pass = base64.b64decode(parts[1])
            if not isinstance(user_pass, str):
                # Python 3 returns bytes; convert so the split below works
                user_pass = user_pass.decode('utf-8')
        except (TypeError, ValueError):
            return ''

        return user_pass.split(':')[0]
|
|
|
|
|
|
#=================================================================
|
|
class CookieResolver(BaseCollResolver):
    """Resolve the proxy collection from a session cookie.

    The cookie holds a session id; the selected collection and optional
    timestamp are kept in ``self.cache`` under ``<sesh_id>:c`` and
    ``<sesh_id>:t``.  Hosts containing ``'.' + magic_name`` are "magic"
    hosts, used to select a collection and to set the cookie via redirects.
    """

    SESH_COOKIE_NAME = '__pywb_proxy_sesh'

    def __init__(self, routes, config):
        """Force ``pre_connect`` off and read cookie/magic-host settings.

        :param routes: route list, handed to ``BaseCollResolver``.
        :param config: mutable mapping; must contain ``magic_name``.
            Optional keys: ``cookie_name``, ``proxy_select_view``,
            ``extra_headers``.
        """
        config['pre_connect'] = False
        super(CookieResolver, self).__init__(routes, config)
        self.magic_name = config['magic_name']
        self.sethost_prefix = '-sethost.' + self.magic_name + '.'
        self.set_prefix = '-set.' + self.magic_name

        self.cookie_name = config.get('cookie_name', self.SESH_COOKIE_NAME)
        self.proxy_select_view = config.get('proxy_select_view')

        self.extra_headers = config.get('extra_headers')

        self.cache = create_cache()

    def get_proxy_coll_ts(self, env):
        """Return ``(coll, ts)`` for the session named by the cookie."""
        coll, ts, _ = self.get_coll(env)
        return coll, ts

    def select_coll_response(self, env):
        """Redirect to the ``auto`` magic host, keeping the request URI."""
        return self.make_magic_response('auto',
                                        env['REL_REQUEST_URI'],
                                        env)

    def resolve(self, env):
        """Handle magic hosts first, then fall back to the base resolution.

        :returns: the same 5-tuple as ``BaseCollResolver.resolve``.
        """
        server_name = env['pywb.proxy_host']

        if ('.' + self.magic_name) in server_name:
            response = self.handle_magic_page(env)
            if response:
                return None, None, None, None, response

        return super(CookieResolver, self).resolve(env)

    def handle_magic_page(self, env):
        """Dispatch a request made to one of the magic hosts.

        The host in ``env['pywb.proxy_host']`` selects the action:

        * starts with ``auto``: redirect to the sethost host when the
          session has a collection, else to the ``select`` host;
        * starts with ``query.``: store the timestamp of the requested
          url and redirect to that url;
        * ends with ``-set.<magic_name>``: store the collection taken from
          the host prefix, issuing a new session cookie if needed;
        * contains ``-sethost.<magic_name>.``: set the session cookie for
          the target domain and redirect there;
        * contains ``select.``: render the collection selection view.

        :returns: a response, or ``None`` when there is nothing to handle.
        """
        request_url = env['REL_REQUEST_URI']
        parts = urlparse.urlsplit(request_url)
        server_name = env['pywb.proxy_host']

        # path without the leading '/', plus the query string if present
        path_url = parts.path[1:]
        if parts.query:
            path_url += '?' + parts.query

        if server_name.startswith('auto'):
            coll, _, sesh_id = self.get_coll(env)

            if coll:
                return self.make_sethost_cookie_response(sesh_id,
                                                         path_url,
                                                         env)

            return self.make_magic_response('select', path_url, env)

        if server_name.startswith('query.'):
            wb_url = WbUrl(path_url)

            # only dealing with specific timestamp setting
            if wb_url.is_query():
                return None

            coll, _, sesh_id = self.get_coll(env)
            if not coll:
                return self.make_magic_response('select', path_url, env)

            self.set_ts(sesh_id, wb_url.timestamp)
            return self.make_redir_response(wb_url.url)

        if server_name.endswith(self.set_prefix):
            old_sesh_id = extract_client_cookie(env, self.cookie_name)
            sesh_id = self.create_renew_sesh_id(old_sesh_id)

            # only send a cookie when a new session id was generated
            if sesh_id != old_sesh_id:
                headers = self.make_cookie_headers(sesh_id, self.magic_name)
            else:
                headers = None

            # the collection name is whatever precedes '-set.<magic_name>'
            coll = server_name[:-len(self.set_prefix)]

            # set sesh value
            self.set_coll(sesh_id, coll)

            return self.make_sethost_cookie_response(sesh_id, path_url, env,
                                                     headers=headers)

        if self.sethost_prefix in server_name:
            # host has the form <sesh_id>-sethost.<magic_name>.<domain>
            inx = server_name.find(self.sethost_prefix)
            sesh_id = server_name[:inx]

            domain = server_name[inx + len(self.sethost_prefix):]

            headers = self.make_cookie_headers(sesh_id, domain)

            full_url = env['pywb.proxy_scheme'] + '://' + domain
            full_url += '/' + path_url
            return self.make_redir_response(full_url, headers=headers)

        if 'select.' in server_name:
            coll, _, _ = self.get_coll(env)

            route_temp = '-set.' + self.magic_name + '/' + path_url

            return self.proxy_select_view.render_response(
                routes=self.routes,
                route_temp=route_temp,
                coll=coll,
                url=path_url)

        # unrecognized magic host: nothing to handle here
        return None

    def make_cookie_headers(self, sesh_id, domain):
        """Return a one-item header list setting the session cookie."""
        cookie_val = '{0}={1}; Path=/; Domain=.{2}; HttpOnly'
        cookie_val = cookie_val.format(self.cookie_name, sesh_id, domain)
        headers = [('Set-Cookie', cookie_val)]
        return headers

    def make_sethost_cookie_response(self, sesh_id, path_url,
                                     env, headers=None):
        """Redirect to ``<sesh_id>-sethost.<magic_name>.<target host>``.

        ``path_url`` is treated as the target url; ``http://`` is prepended
        when it carries no scheme so it can be split into host and path.
        """
        if '://' not in path_url:
            path_url = 'http://' + path_url

        path_parts = urlparse.urlsplit(path_url)

        new_url = path_parts.path[1:]
        if path_parts.query:
            new_url += '?' + path_parts.query

        return self.make_magic_response(sesh_id + '-sethost', new_url, env,
                                        suffix=path_parts.netloc,
                                        headers=headers)

    def make_magic_response(self, prefix, url, env,
                            suffix=None, headers=None):
        """Redirect to ``<scheme>://<prefix>.<magic_name>[.<suffix>]/<url>``."""
        full_url = env['pywb.proxy_scheme'] + '://' + prefix + '.'
        full_url += self.magic_name
        if suffix:
            full_url += '.' + suffix
        full_url += '/' + url
        return self.make_redir_response(full_url, headers=headers)

    def set_coll(self, sesh_id, coll):
        """Store the collection chosen for ``sesh_id``."""
        self.cache[sesh_id + ':c'] = coll

    def set_ts(self, sesh_id, ts):
        """Store the timestamp for ``sesh_id``, or clear it when ``ts`` is empty."""
        key = sesh_id + ':t'

        if ts:
            self.cache[key] = ts
            return

        # this ensures that omitting timestamp will reset to latest
        # capture by deleting the cache entry
        try:
            del self.cache[key]
        except KeyError:
            # no timestamp was stored, so there is nothing to reset
            pass

    def _cache_get(self, key):
        """Return ``self.cache[key]``, or ``None`` if the key is missing."""
        try:
            return self.cache[key]
        except KeyError:
            return None

    def get_coll(self, env):
        """Look up the session named by the client cookie.

        :returns: ``(coll, ts, sesh_id)``; ``coll`` and ``ts`` are ``None``
            when there is no cookie or nothing is cached for the session.
        """
        sesh_id = extract_client_cookie(env, self.cookie_name)

        coll = None
        ts = None
        if sesh_id:
            # a cookie may name a session the cache no longer holds, so a
            # missing key is treated as "not set" for both entries
            coll = self._cache_get(sesh_id + ':c')
            ts = self._cache_get(sesh_id + ':t')

        return coll, ts, sesh_id

    def create_renew_sesh_id(self, sesh_id, force=False):
        """Return ``sesh_id`` if it is still cached, else a new random id.

        :param sesh_id: existing session id, may be empty or ``None``.
        :param force: when true, always generate a new id.
        """
        if sesh_id and ((sesh_id + ':c') in self.cache) and not force:
            return sesh_id

        sesh_id = base64.b32encode(os.urandom(5)).lower()
        return sesh_id

    def make_redir_response(self, url, headers=None):
        """Build a redirect to ``url`` with ``headers`` plus any extra headers.

        The ``headers`` argument is copied, so the caller's list is left
        untouched.
        """
        headers = list(headers) if headers else []

        if self.extra_headers:
            # items() is available on both Python 2 and Python 3 mappings
            for name, value in self.extra_headers.items():
                headers.append((name, value))

        return WbResponse.redir_response(url, headers=headers)
|