mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-25 23:47:47 +01:00
proxy of videos and youtube-dl is not available (the only use case). HTTPParser wrapping logic is no longer needed in latest versions. Modify tests to only run if youtube-dl is installed, so they are skipped in cases where it is not available. #118
278 lines
8.7 KiB
Python
278 lines
8.7 KiB
Python
from pywb.framework.wbrequestresponse import WbResponse
|
|
from pywb.framework.cache import create_cache
|
|
|
|
from pywb.rewrite.rewrite_live import LiveRewriter
|
|
from pywb.rewrite.wburl import WbUrl
|
|
from pywb.rewrite.url_rewriter import HttpsUrlRewriter
|
|
|
|
from handlers import StaticHandler, SearchPageWbUrlHandler
|
|
from views import HeadInsertView
|
|
|
|
from pywb.utils.wbexception import WbException
|
|
|
|
import json
|
|
import requests
|
|
import hashlib
|
|
|
|
|
|
#=================================================================
|
|
class LiveResourceException(WbException):
    """Error raised when a url could not be loaded from the live web."""

    def status(self):
        """Return the HTTP status line reported for this error."""
        return '400 Bad Live Resource'
|
|
|
|
|
|
#=================================================================
|
|
class RewriteHandler(SearchPageWbUrlHandler):
    """Handler that fetches a url from the live web and rewrites it.

    Content is fetched through ``LiveRewriter``. When a ``proxyhostport``
    is configured, range requests and video info requests get additional
    handling so that the proxy also sees the full resource and the
    youtube-dl metadata (see ``render_content`` and ``_get_video_info``).
    """

    # default format of the cookie that carries the cdx timestamp
    LIVE_COOKIE = 'pywb.timestamp={0}; max-age=60'

    # content type used for the youtube-dl info json
    YT_DL_TYPE = 'application/vnd.youtube-dl_formats+json'

    # replaced by a YoutubeDLWrapper instance on the first video info request
    youtubedl = None

    def __init__(self, config):
        """Create the live rewriter and read handler settings.

        :param config: dict-like config; the keys read here are
            ``proxyhostport``, ``live-cookie`` and ``verify_ssl``.
        """
        super(RewriteHandler, self).__init__(config)

        proxyhostport = config.get('proxyhostport')
        self.rewriter = LiveRewriter(is_framed_replay=self.is_frame_mode,
                                     proxies=proxyhostport)

        self.proxies = self.rewriter.proxies

        self.head_insert_view = HeadInsertView.init_from_config(config)

        self.live_cookie = config.get('live-cookie', self.LIVE_COOKIE)

        self.verify = config.get('verify_ssl', True)

        # not read anywhere in this class (video info uses self.youtubedl);
        # kept so that external code reading the attribute keeps working
        self.ydl = None

        # created lazily by _get_cache_key()
        self._cache = None

    def handle_request(self, wbrequest):
        """Render the live content for ``wbrequest``.

        :raises LiveResourceException: if rendering fails for any reason;
            the traceback of the underlying error is printed first.
        """
        try:
            return self.render_content(wbrequest)

        except Exception:
            import traceback
            # format_exc() formats the exception currently being handled;
            # its only positional argument is a frame limit, not the error
            err_details = traceback.format_exc()
            print(err_details)

            url = wbrequest.wb_url.url
            msg = 'Could not load the url from the live web: ' + url
            raise LiveResourceException(msg=msg, url=url)

    def _live_request_headers(self, wbrequest):
        """Return extra headers for the live request (none by default)."""
        return {}

    def _ignore_proxies(self, wbrequest):
        """Return True to bypass the proxies for this request
        (never by default).
        """
        return False

    def render_content(self, wbrequest):
        """Fetch and rewrite the live url, returning a ``WbResponse``.

        Requests with the ``vi_`` modifier return video info instead.
        When proxies are in use and the request carries a range:

        * a ``bytes=0-`` style range is dropped, the full resource is
          fetched via the proxy and a range header is re-added to the
          response;
        * any other range is fetched directly (proxy disabled) and a
          separate full "ping" request is sent through the proxy after
          the response body has been consumed.

        In both cases video info for the referrer is also requested.
        """
        if wbrequest.wb_url.mod == 'vi_':
            return self._get_video_info(wbrequest)

        head_insert_func = self.head_insert_view.create_insert_func(wbrequest)
        req_headers = self._live_request_headers(wbrequest)

        ref_wburl_str = wbrequest.extract_referrer_wburl_str()
        if ref_wburl_str:
            wbrequest.env['REL_REFERER'] = WbUrl(ref_wburl_str).url

        ignore_proxies = self._ignore_proxies(wbrequest)

        use_206 = False
        url = None
        rangeres = None

        readd_range = False
        cache_key = None

        if self.proxies and not ignore_proxies:
            rangeres = wbrequest.extract_range()

            if rangeres:
                url, start, end, use_206 = rangeres

                # if bytes=0- Range request,
                # simply remove the range and still proxy
                if start == 0 and not end and use_206:
                    wbrequest.wb_url.url = url
                    del wbrequest.env['HTTP_RANGE']
                    readd_range = True
                else:
                    # disables proxy
                    ignore_proxies = True

                    # sets cache_key only if not already cached
                    cache_key = self._get_cache_key('r:', url)

        result = self.rewriter.fetch_request(wbrequest.wb_url.url,
                                             wbrequest.urlrewriter,
                                             head_insert_func=head_insert_func,
                                             req_headers=req_headers,
                                             env=wbrequest.env,
                                             ignore_proxies=ignore_proxies,
                                             verify=self.verify)

        wbresponse = self._make_response(wbrequest, *result)

        if readd_range:
            content_length = (wbresponse.status_headers.
                              get_header('Content-Length'))
            try:
                content_length = int(content_length)
                wbresponse.status_headers.add_range(0, content_length,
                                                    content_length)
            except (ValueError, TypeError):
                # missing or non-numeric Content-Length: leave the
                # response without a range header
                pass

        if cache_key:
            self._add_proxy_ping(cache_key, url, wbrequest, wbresponse)

        if rangeres:
            referrer = wbrequest.env.get('REL_REFERER')

            # also ping video info
            if referrer:
                try:
                    self._get_video_info(wbrequest,
                                         info_url=referrer,
                                         video_url=url)
                except Exception:
                    # best effort: the main response is still returned
                    print('Error getting video info')

        return wbresponse

    def _make_response(self, wbrequest, status_headers, gen, is_rewritten):
        """Build the ``WbResponse`` for a fetched result.

        ``is_rewritten`` is accepted to match the result tuple of
        ``fetch_request`` but is not used here.
        """
        # if cookie set, pass recorded timestamp info via cookie
        # so that client side may be able to access it
        # used by framed mode to update frame banner
        if self.live_cookie:
            cdx = wbrequest.env.get('pywb.cdx')
            if cdx:
                value = self.live_cookie.format(cdx['timestamp'])
                status_headers.headers.append(('Set-Cookie', value))

        return WbResponse(status_headers, gen)

    def _get_cache_key(self, prefix, url):
        """Return the cache key for ``url``, or None if already cached.

        The cache itself is created on first use.
        """
        # compare against None so that an existing but empty (falsy)
        # cache is not thrown away and recreated
        if self._cache is None:
            self._cache = create_cache()

        key = self.create_cache_key(prefix, url)

        if key in self._cache:
            return None

        return key

    @staticmethod
    def create_cache_key(prefix, url):
        """Return ``prefix`` followed by the md5 hex digest of ``url``."""
        # md5 only accepts bytes; encode text urls explicitly so that
        # unicode input is hashed consistently instead of failing
        if not isinstance(url, bytes):
            url = url.encode('utf-8')

        hash_ = hashlib.md5()
        hash_.update(url)
        return prefix + hash_.hexdigest()

    def _add_proxy_ping(self, key, url, wbrequest, wbresponse):
        """Wrap the response body so that, once it has been fully
        iterated, a request for ``url`` is sent through the proxies.

        ``key`` is set in the cache before the request and removed
        again if the request fails.
        """
        def do_ping():
            headers = self._live_request_headers(wbrequest)
            headers['Connection'] = 'close'

            try:
                # mark as pinged
                self._cache[key] = '1'

                # NOTE(review): certificate verification is always off
                # here, regardless of self.verify -- confirm intended
                resp = requests.get(url=url,
                                    headers=headers,
                                    proxies=self.proxies,
                                    verify=False,
                                    stream=True)

                # don't actually read whole response,
                # proxy response for writing it
                resp.close()
            except:
                # undo the mark on any failure, then propagate
                del self._cache[key]
                raise

        def wrap_buff_gen(gen):
            for x in gen:
                yield x

            # the ping is best effort and must not break a response
            # that has already been delivered
            try:
                do_ping()
            except Exception:
                pass

        wbresponse.body = wrap_buff_gen(wbresponse.body)
        return wbresponse

    def _get_video_info(self, wbrequest, info_url=None, video_url=None):
        """Return youtube-dl info for a video as a json ``WbResponse``.

        :param info_url: url the metadata is sent to via the proxies;
            defaults to the request url.
        :param video_url: url passed to youtube-dl; defaults to the
            request url.

        If youtube-dl is not installed, a 404 text response is returned.
        When proxies are configured and the video is not yet in the
        cache, the metadata is also sent to ``info_url`` with a
        ``PUTMETA`` request and the video is then marked as cached.
        """
        if not self.youtubedl:
            self.youtubedl = YoutubeDLWrapper()

        if not video_url:
            video_url = wbrequest.wb_url.url

        if not info_url:
            info_url = wbrequest.wb_url.url

        cache_key = None
        if self.proxies:
            cache_key = self._get_cache_key('v:', video_url)

        info = self.youtubedl.extract_info(video_url)
        if info is None:
            msg = ('youtube-dl is not installed, pip install youtube-dl to ' +
                   'enable improved video proxy')

            return WbResponse.text_response(msg=msg, status='404 Not Found')

        content_type = self.YT_DL_TYPE
        metadata = json.dumps(info)

        if self.proxies and cache_key:
            headers = self._live_request_headers(wbrequest)
            headers['Content-Type'] = content_type

            info_url = HttpsUrlRewriter.remove_https(info_url)

            requests.request(method='PUTMETA',
                             url=info_url,
                             data=metadata,
                             headers=headers,
                             proxies=self.proxies,
                             verify=False)

            self._cache[cache_key] = '1'

        return WbResponse.text_response(metadata, content_type=content_type)
|
|
|
|
|
|
#=================================================================
|
|
class YoutubeDLWrapper(object):
    """ YoutubeDL wrapper, inits youtube-dl if it is available
    """
    def __init__(self):
        """Create the underlying ``YoutubeDL`` object if the
        ``youtube_dl`` package can be imported; otherwise leave
        ``self.ydl`` as None.
        """
        self.ydl = None

        try:
            from youtube_dl import YoutubeDL
        except ImportError:
            # youtube-dl is optional: without it extract_info()
            # simply returns None
            return

        self.ydl = YoutubeDL(dict(simulate=True,
                                  youtube_include_dash_manifest=False))
        self.ydl.add_default_info_extractors()

    def extract_info(self, url):
        """Return the youtube-dl info for ``url``, or None when
        youtube-dl is not available.
        """
        if not self.ydl:
            return None

        return self.ydl.extract_info(url)
|