1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-25 23:47:47 +01:00
pywb/pywb/webapp/live_rewrite_handler.py
Ilya Kreymer f0359877f0 youtube-dl: remove from dependency, installation is optional. Return 404 if attempting live
proxy of videos and youtube-dl is not available (the only use case).
HTTPParser wrapping logic no longer needed in latest versions
Modify tests to only run if youtube-dl is installed in cases where it is not available #118
2015-06-27 16:11:59 -07:00

278 lines
8.7 KiB
Python

from pywb.framework.wbrequestresponse import WbResponse
from pywb.framework.cache import create_cache
from pywb.rewrite.rewrite_live import LiveRewriter
from pywb.rewrite.wburl import WbUrl
from pywb.rewrite.url_rewriter import HttpsUrlRewriter
from handlers import StaticHandler, SearchPageWbUrlHandler
from views import HeadInsertView
from pywb.utils.wbexception import WbException
import json
import requests
import hashlib
#=================================================================
class LiveResourceException(WbException):
    """Error raised when a url could not be loaded from the live web."""

    def status(self):
        """Return the status string reported for this error."""
        live_status = '400 Bad Live Resource'
        return live_status
#=================================================================
class RewriteHandler(SearchPageWbUrlHandler):
    """Handler that fetches a url from the live web and rewrites it.

    When a proxy is configured ('proxyhostport'), requests are fetched
    through it, with special handling for Range requests, and video info
    obtained via youtube-dl is sent to the proxy as well.
    """

    # Set-Cookie template, {0} is replaced with the cdx 'timestamp'
    LIVE_COOKIE = 'pywb.timestamp={0}; max-age=60'

    # content type of the json-serialized youtube-dl info
    YT_DL_TYPE = 'application/vnd.youtube-dl_formats+json'

    # YoutubeDLWrapper instance, created on first use in _get_video_info()
    youtubedl = None

    def __init__(self, config):
        """Init the handler from the ``config`` mapping.

        Keys read here: 'proxyhostport', 'live-cookie' (defaults to
        LIVE_COOKIE) and 'verify_ssl' (defaults to True).
        """
        super(RewriteHandler, self).__init__(config)

        proxyhostport = config.get('proxyhostport')
        self.rewriter = LiveRewriter(is_framed_replay=self.is_frame_mode,
                                     proxies=proxyhostport)

        self.proxies = self.rewriter.proxies

        self.head_insert_view = HeadInsertView.init_from_config(config)

        self.live_cookie = config.get('live-cookie', self.LIVE_COOKIE)

        self.verify = config.get('verify_ssl', True)

        self.ydl = None

        # created lazily by _get_cache_key()
        self._cache = None

    def handle_request(self, wbrequest):
        """Render the live content for ``wbrequest``.

        Any exception is printed with its traceback and replaced by a
        LiveResourceException carrying the requested url.
        """
        try:
            return self.render_content(wbrequest)

        except Exception:
            import traceback
            # format_exc() formats the exception currently being handled;
            # its only positional argument is a traceback depth limit,
            # so the exception object must not be passed to it
            print(traceback.format_exc())

            url = wbrequest.wb_url.url
            msg = 'Could not load the url from the live web: ' + url
            raise LiveResourceException(msg=msg, url=url)

    def _live_request_headers(self, wbrequest):
        """Return the extra headers for the live request (none here)."""
        return {}

    def _ignore_proxies(self, wbrequest):
        """Return True if the proxy should be skipped (never here)."""
        return False

    def render_content(self, wbrequest):
        """Fetch the requested url from the live web and build the response.

        A 'vi_' modifier is answered with the video info instead.
        Returns the WbResponse created by _make_response().
        """
        if wbrequest.wb_url.mod == 'vi_':
            return self._get_video_info(wbrequest)

        head_insert_func = self.head_insert_view.create_insert_func(wbrequest)
        req_headers = self._live_request_headers(wbrequest)

        ref_wburl_str = wbrequest.extract_referrer_wburl_str()
        if ref_wburl_str:
            wbrequest.env['REL_REFERER'] = WbUrl(ref_wburl_str).url

        ignore_proxies = self._ignore_proxies(wbrequest)

        use_206 = False
        url = None
        rangeres = None

        readd_range = False
        cache_key = None

        if self.proxies and not ignore_proxies:
            rangeres = wbrequest.extract_range()

            if rangeres:
                url, start, end, use_206 = rangeres

                # if bytes=0- Range request,
                # simply remove the range and still proxy
                if start == 0 and not end and use_206:
                    wbrequest.wb_url.url = url
                    del wbrequest.env['HTTP_RANGE']
                    readd_range = True
                else:
                    # disables proxy
                    ignore_proxies = True

                    # sets cache_key only if not already cached
                    cache_key = self._get_cache_key('r:', url)

        result = self.rewriter.fetch_request(wbrequest.wb_url.url,
                                             wbrequest.urlrewriter,
                                             head_insert_func=head_insert_func,
                                             req_headers=req_headers,
                                             env=wbrequest.env,
                                             ignore_proxies=ignore_proxies,
                                             verify=self.verify)

        wbresponse = self._make_response(wbrequest, *result)

        if readd_range:
            content_length = (wbresponse.status_headers.
                              get_header('Content-Length'))
            try:
                content_length = int(content_length)
                wbresponse.status_headers.add_range(0, content_length,
                                                    content_length)
            except (ValueError, TypeError):
                # missing or non-numeric Content-Length: no range is added
                pass

        if cache_key:
            self._add_proxy_ping(cache_key, url, wbrequest, wbresponse)

        if rangeres:
            referrer = wbrequest.env.get('REL_REFERER')

            # also ping video info
            if referrer:
                try:
                    self._get_video_info(wbrequest,
                                         info_url=referrer,
                                         video_url=url)
                except Exception:
                    # best effort: the response itself is still returned
                    print('Error getting video info')

        return wbresponse

    def _make_response(self, wbrequest, status_headers, gen, is_rewritten):
        """Wrap ``status_headers`` and ``gen`` in a WbResponse.

        ``is_rewritten`` is accepted but not used here.
        """
        # if cookie set, pass recorded timestamp info via cookie
        # so that client side may be able to access it
        # used by framed mode to update frame banner
        if self.live_cookie:
            cdx = wbrequest.env.get('pywb.cdx')
            if cdx:
                value = self.live_cookie.format(cdx['timestamp'])
                status_headers.headers.append(('Set-Cookie', value))

        return WbResponse(status_headers, gen)

    def _get_cache_key(self, prefix, url):
        """Return the cache key for ``url``, or None if already cached."""
        if not self._cache:
            self._cache = create_cache()

        key = self.create_cache_key(prefix, url)

        if key in self._cache:
            return None

        return key

    @staticmethod
    def create_cache_key(prefix, url):
        """Return ``prefix`` followed by the md5 hex digest of ``url``."""
        # md5 operates on bytes: encode text explicitly so that urls with
        # non-ascii characters do not fail
        if not isinstance(url, bytes):
            url = url.encode('utf-8')

        return prefix + hashlib.md5(url).hexdigest()

    def _add_proxy_ping(self, key, url, wbrequest, wbresponse):
        """Request ``url`` via the proxy once the response body is consumed.

        The body of ``wbresponse`` is wrapped in a generator which issues
        the request after the last chunk. Returns ``wbresponse``.
        """
        def do_ping():
            headers = self._live_request_headers(wbrequest)
            headers['Connection'] = 'close'

            # mark as pinged
            self._cache[key] = '1'

            try:
                resp = requests.get(url=url,
                                    headers=headers,
                                    proxies=self.proxies,
                                    verify=False,
                                    stream=True)

                # don't actually read whole response,
                # proxy response for writing it
                resp.close()
            except BaseException:
                # undo the mark on any failure, then propagate it
                del self._cache[key]
                raise

        def wrap_buff_gen(gen):
            for chunk in gen:
                yield chunk

            try:
                do_ping()
            except Exception:
                # best effort: a failed ping must not break the response
                pass

        wbresponse.body = wrap_buff_gen(wbresponse.body)
        return wbresponse

    def _get_video_info(self, wbrequest, info_url=None, video_url=None):
        """Return a response with the youtube-dl info for a video, as json.

        ``info_url`` and ``video_url`` both default to the url of
        ``wbrequest``. If a proxy is configured and the video is not yet
        cached, the info is also sent to ``info_url`` via the proxy.
        A '404 Not Found' text response is returned if no info is available
        because youtube-dl is not installed.
        """
        if not self.youtubedl:
            self.youtubedl = YoutubeDLWrapper()

        if not video_url:
            video_url = wbrequest.wb_url.url

        if not info_url:
            info_url = wbrequest.wb_url.url

        cache_key = None
        if self.proxies:
            cache_key = self._get_cache_key('v:', video_url)

        info = self.youtubedl.extract_info(video_url)
        if info is None:
            msg = ('youtube-dl is not installed, pip install youtube-dl to ' +
                   'enable improved video proxy')

            return WbResponse.text_response(msg=msg, status='404 Not Found')

        content_type = self.YT_DL_TYPE
        metadata = json.dumps(info)

        if self.proxies and cache_key:
            headers = self._live_request_headers(wbrequest)
            headers['Content-Type'] = content_type

            info_url = HttpsUrlRewriter.remove_https(info_url)

            requests.request(method='PUTMETA',
                             url=info_url,
                             data=metadata,
                             headers=headers,
                             proxies=self.proxies,
                             verify=False)

            # mark as sent only after the request went through
            self._cache[cache_key] = '1'

        return WbResponse.text_response(metadata, content_type=content_type)
#=================================================================
class YoutubeDLWrapper(object):
    """ YoutubeDL wrapper, inits youtube-dl if it is available
    """
    def __init__(self):
        """Create the YoutubeDL instance, or disable the wrapper.

        If youtube_dl can not be imported, ``self.ydl`` is set to None
        and extract_info() returns None for every url.
        """
        try:
            from youtube_dl import YoutubeDL
        except ImportError:
            # youtube-dl is optional: stop here, as YoutubeDL is not
            # defined and must not be used below
            self.ydl = None
            return

        self.ydl = YoutubeDL(dict(simulate=True,
                                  youtube_include_dash_manifest=False))
        self.ydl.add_default_info_extractors()

    def extract_info(self, url):
        """Return the youtube-dl info for ``url``.

        Returns None if youtube-dl is not available.
        """
        if not self.ydl:
            return None

        return self.ydl.extract_info(url)