mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
live rewrite proxy: decouple having an http/https proxy from recording,
move youtubedl wrapper calls and metadata add calls to the live rewrite proxy class for easier extension. Closes #141; also improves #136.
This commit is contained in:
parent
c7224ecceb
commit
39e824cb3a
@ -38,6 +38,9 @@ class LiveRewriter(object):
|
|||||||
else:
|
else:
|
||||||
logging.debug('Live Rewrite Direct (no proxy)')
|
logging.debug('Live Rewrite Direct (no proxy)')
|
||||||
|
|
||||||
|
def is_recording(self):
|
||||||
|
return self.proxies is not None
|
||||||
|
|
||||||
def fetch_local_file(self, uri):
|
def fetch_local_file(self, uri):
|
||||||
#fh = open(uri)
|
#fh = open(uri)
|
||||||
fh = LocalFileLoader().load(uri)
|
fh = LocalFileLoader().load(uri)
|
||||||
@ -123,14 +126,14 @@ class LiveRewriter(object):
|
|||||||
env=None,
|
env=None,
|
||||||
req_headers=None,
|
req_headers=None,
|
||||||
follow_redirects=False,
|
follow_redirects=False,
|
||||||
ignore_proxies=False,
|
skip_recording=False,
|
||||||
verify=True):
|
verify=True):
|
||||||
|
|
||||||
method = 'GET'
|
method = 'GET'
|
||||||
data = None
|
data = None
|
||||||
|
|
||||||
proxies = None
|
proxies = None
|
||||||
if not ignore_proxies:
|
if not skip_recording:
|
||||||
proxies = self.proxies
|
proxies = self.proxies
|
||||||
|
|
||||||
if not req_headers:
|
if not req_headers:
|
||||||
@ -174,7 +177,7 @@ class LiveRewriter(object):
|
|||||||
req_headers={},
|
req_headers={},
|
||||||
timestamp=None,
|
timestamp=None,
|
||||||
follow_redirects=False,
|
follow_redirects=False,
|
||||||
ignore_proxies=False,
|
skip_recording=False,
|
||||||
verify=True,
|
verify=True,
|
||||||
remote_only=True):
|
remote_only=True):
|
||||||
|
|
||||||
@ -203,7 +206,7 @@ class LiveRewriter(object):
|
|||||||
(status_headers, stream) = self.fetch_http(url, urlkey, env,
|
(status_headers, stream) = self.fetch_http(url, urlkey, env,
|
||||||
req_headers,
|
req_headers,
|
||||||
follow_redirects,
|
follow_redirects,
|
||||||
ignore_proxies,
|
skip_recording,
|
||||||
verify)
|
verify)
|
||||||
else:
|
else:
|
||||||
(status_headers, stream) = self.fetch_local_file(url)
|
(status_headers, stream) = self.fetch_local_file(url)
|
||||||
@ -232,6 +235,26 @@ class LiveRewriter(object):
|
|||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
def fetch_async(self, url, headers):
|
||||||
|
resp = self.live_request(method='GET',
|
||||||
|
url=url,
|
||||||
|
headers=headers,
|
||||||
|
proxies=self.proxies,
|
||||||
|
verify=False,
|
||||||
|
stream=True)
|
||||||
|
|
||||||
|
# don't actually read whole response,
|
||||||
|
# proxy response for writing it
|
||||||
|
resp.close()
|
||||||
|
|
||||||
|
def add_metadata(self, url, headers, data):
|
||||||
|
return self.live_request(method='PUTMETA',
|
||||||
|
url=url,
|
||||||
|
data=data,
|
||||||
|
headers=headers,
|
||||||
|
proxies=self.proxies,
|
||||||
|
verify=False)
|
||||||
|
|
||||||
def get_rewritten(self, *args, **kwargs):
|
def get_rewritten(self, *args, **kwargs):
|
||||||
result = self.fetch_request(*args, **kwargs)
|
result = self.fetch_request(*args, **kwargs)
|
||||||
|
|
||||||
@ -240,3 +263,35 @@ class LiveRewriter(object):
|
|||||||
buff = ''.join(gen)
|
buff = ''.join(gen)
|
||||||
|
|
||||||
return (status_headers, buff)
|
return (status_headers, buff)
|
||||||
|
|
||||||
|
def get_video_info(self, url):
|
||||||
|
return youtubedl.extract_info(url)
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
class YoutubeDLWrapper(object): #pragma: no cover
|
||||||
|
""" YoutubeDL wrapper, inits youtubee-dl if it is available
|
||||||
|
"""
|
||||||
|
def __init__(self):
|
||||||
|
try:
|
||||||
|
from youtube_dl import YoutubeDL as YoutubeDL
|
||||||
|
except ImportError:
|
||||||
|
self.ydl = None
|
||||||
|
return
|
||||||
|
|
||||||
|
self.ydl = YoutubeDL(dict(simulate=True,
|
||||||
|
youtube_include_dash_manifest=False))
|
||||||
|
self.ydl.add_default_info_extractors()
|
||||||
|
|
||||||
|
def extract_info(self, url):
|
||||||
|
print('YDL', self.ydl)
|
||||||
|
if not self.ydl:
|
||||||
|
return None
|
||||||
|
|
||||||
|
info = self.ydl.extract_info(url)
|
||||||
|
return info
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
youtubedl = YoutubeDLWrapper()
|
||||||
|
|
||||||
|
@ -11,7 +11,6 @@ from views import HeadInsertView
|
|||||||
from pywb.utils.wbexception import WbException
|
from pywb.utils.wbexception import WbException
|
||||||
|
|
||||||
import json
|
import json
|
||||||
import requests
|
|
||||||
import hashlib
|
import hashlib
|
||||||
|
|
||||||
|
|
||||||
@ -28,8 +27,6 @@ class RewriteHandler(SearchPageWbUrlHandler):
|
|||||||
|
|
||||||
YT_DL_TYPE = 'application/vnd.youtube-dl_formats+json'
|
YT_DL_TYPE = 'application/vnd.youtube-dl_formats+json'
|
||||||
|
|
||||||
youtubedl = None
|
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(RewriteHandler, self).__init__(config)
|
super(RewriteHandler, self).__init__(config)
|
||||||
|
|
||||||
@ -37,10 +34,10 @@ class RewriteHandler(SearchPageWbUrlHandler):
|
|||||||
|
|
||||||
live_rewriter_cls = config.get('live_rewriter_cls', LiveRewriter)
|
live_rewriter_cls = config.get('live_rewriter_cls', LiveRewriter)
|
||||||
|
|
||||||
self.rewriter = live_rewriter_cls(is_framed_replay=self.is_frame_mode,
|
self.live_fetcher = live_rewriter_cls(is_framed_replay=self.is_frame_mode,
|
||||||
proxies=proxyhostport)
|
proxies=proxyhostport)
|
||||||
|
|
||||||
self.proxies = self.rewriter.proxies
|
self.recording = self.live_fetcher.is_recording()
|
||||||
|
|
||||||
self.head_insert_view = HeadInsertView.init_from_config(config)
|
self.head_insert_view = HeadInsertView.init_from_config(config)
|
||||||
|
|
||||||
@ -73,7 +70,7 @@ class RewriteHandler(SearchPageWbUrlHandler):
|
|||||||
def _live_request_headers(self, wbrequest):
|
def _live_request_headers(self, wbrequest):
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
def _ignore_proxies(self, wbrequest):
|
def _skip_recording(self, wbrequest):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def render_content(self, wbrequest):
|
def render_content(self, wbrequest):
|
||||||
@ -87,7 +84,7 @@ class RewriteHandler(SearchPageWbUrlHandler):
|
|||||||
if ref_wburl_str:
|
if ref_wburl_str:
|
||||||
wbrequest.env['REL_REFERER'] = WbUrl(ref_wburl_str).url
|
wbrequest.env['REL_REFERER'] = WbUrl(ref_wburl_str).url
|
||||||
|
|
||||||
ignore_proxies = self._ignore_proxies(wbrequest)
|
skip_recording = self._skip_recording(wbrequest)
|
||||||
|
|
||||||
use_206 = False
|
use_206 = False
|
||||||
url = None
|
url = None
|
||||||
@ -96,7 +93,7 @@ class RewriteHandler(SearchPageWbUrlHandler):
|
|||||||
readd_range = False
|
readd_range = False
|
||||||
cache_key = None
|
cache_key = None
|
||||||
|
|
||||||
if self.proxies and not ignore_proxies:
|
if self.recording and not skip_recording:
|
||||||
rangeres = wbrequest.extract_range()
|
rangeres = wbrequest.extract_range()
|
||||||
|
|
||||||
if rangeres:
|
if rangeres:
|
||||||
@ -110,17 +107,17 @@ class RewriteHandler(SearchPageWbUrlHandler):
|
|||||||
readd_range = True
|
readd_range = True
|
||||||
else:
|
else:
|
||||||
# disables proxy
|
# disables proxy
|
||||||
ignore_proxies = True
|
skip_recording = True
|
||||||
|
|
||||||
# sets cache_key only if not already cached
|
# sets cache_key only if not already cached
|
||||||
cache_key = self._get_cache_key('r:', url)
|
cache_key = self._get_cache_key('r:', url)
|
||||||
|
|
||||||
result = self.rewriter.fetch_request(wbrequest.wb_url.url,
|
result = self.live_fetcher.fetch_request(wbrequest.wb_url.url,
|
||||||
wbrequest.urlrewriter,
|
wbrequest.urlrewriter,
|
||||||
head_insert_func=head_insert_func,
|
head_insert_func=head_insert_func,
|
||||||
req_headers=req_headers,
|
req_headers=req_headers,
|
||||||
env=wbrequest.env,
|
env=wbrequest.env,
|
||||||
ignore_proxies=ignore_proxies,
|
skip_recording=skip_recording,
|
||||||
verify=self.verify)
|
verify=self.verify)
|
||||||
|
|
||||||
wbresponse = self._make_response(wbrequest, *result)
|
wbresponse = self._make_response(wbrequest, *result)
|
||||||
@ -135,8 +132,8 @@ class RewriteHandler(SearchPageWbUrlHandler):
|
|||||||
except (ValueError, TypeError):
|
except (ValueError, TypeError):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
if cache_key:
|
if self.recording and cache_key:
|
||||||
self._add_proxy_ping(cache_key, url, wbrequest, wbresponse)
|
self._add_rec_ping(cache_key, url, wbrequest, wbresponse)
|
||||||
|
|
||||||
if rangeres:
|
if rangeres:
|
||||||
referrer = wbrequest.env.get('REL_REFERER')
|
referrer = wbrequest.env.get('REL_REFERER')
|
||||||
@ -183,7 +180,7 @@ class RewriteHandler(SearchPageWbUrlHandler):
|
|||||||
key = prefix + key
|
key = prefix + key
|
||||||
return key
|
return key
|
||||||
|
|
||||||
def _add_proxy_ping(self, key, url, wbrequest, wbresponse):
|
def _add_rec_ping(self, key, url, wbrequest, wbresponse):
|
||||||
def do_ping():
|
def do_ping():
|
||||||
headers = self._live_request_headers(wbrequest)
|
headers = self._live_request_headers(wbrequest)
|
||||||
headers['Connection'] = 'close'
|
headers['Connection'] = 'close'
|
||||||
@ -192,15 +189,8 @@ class RewriteHandler(SearchPageWbUrlHandler):
|
|||||||
# mark as pinged
|
# mark as pinged
|
||||||
self._cache[key] = '1'
|
self._cache[key] = '1'
|
||||||
|
|
||||||
resp = requests.get(url=url,
|
self.live_fetcher.fetch_async(url, headers)
|
||||||
headers=headers,
|
|
||||||
proxies=self.proxies,
|
|
||||||
verify=False,
|
|
||||||
stream=True)
|
|
||||||
|
|
||||||
# don't actually read whole response,
|
|
||||||
# proxy response for writing it
|
|
||||||
resp.close()
|
|
||||||
except:
|
except:
|
||||||
del self._cache[key]
|
del self._cache[key]
|
||||||
raise
|
raise
|
||||||
@ -219,9 +209,6 @@ class RewriteHandler(SearchPageWbUrlHandler):
|
|||||||
return wbresponse
|
return wbresponse
|
||||||
|
|
||||||
def _get_video_info(self, wbrequest, info_url=None, video_url=None):
|
def _get_video_info(self, wbrequest, info_url=None, video_url=None):
|
||||||
if not self.youtubedl:
|
|
||||||
self.youtubedl = YoutubeDLWrapper()
|
|
||||||
|
|
||||||
if not video_url:
|
if not video_url:
|
||||||
video_url = wbrequest.wb_url.url
|
video_url = wbrequest.wb_url.url
|
||||||
|
|
||||||
@ -229,10 +216,10 @@ class RewriteHandler(SearchPageWbUrlHandler):
|
|||||||
info_url = wbrequest.wb_url.url
|
info_url = wbrequest.wb_url.url
|
||||||
|
|
||||||
cache_key = None
|
cache_key = None
|
||||||
if self.proxies:
|
if self.recording:
|
||||||
cache_key = self._get_cache_key('v:', video_url)
|
cache_key = self._get_cache_key('v:', video_url)
|
||||||
|
|
||||||
info = self.youtubedl.extract_info(video_url)
|
info = self.live_fetcher.get_video_info(video_url)
|
||||||
if info is None: #pragma: no cover
|
if info is None: #pragma: no cover
|
||||||
msg = ('youtube-dl is not installed, pip install youtube-dl to ' +
|
msg = ('youtube-dl is not installed, pip install youtube-dl to ' +
|
||||||
'enable improved video proxy')
|
'enable improved video proxy')
|
||||||
@ -244,42 +231,14 @@ class RewriteHandler(SearchPageWbUrlHandler):
|
|||||||
content_type = self.YT_DL_TYPE
|
content_type = self.YT_DL_TYPE
|
||||||
metadata = json.dumps(info)
|
metadata = json.dumps(info)
|
||||||
|
|
||||||
if (self.proxies and cache_key):
|
if (self.recording and cache_key):
|
||||||
headers = self._live_request_headers(wbrequest)
|
headers = self._live_request_headers(wbrequest)
|
||||||
headers['Content-Type'] = content_type
|
headers['Content-Type'] = content_type
|
||||||
|
|
||||||
info_url = HttpsUrlRewriter.remove_https(info_url)
|
info_url = HttpsUrlRewriter.remove_https(info_url)
|
||||||
|
|
||||||
response = requests.request(method='PUTMETA',
|
response = self.live_fetcher.add_metadata(info_url, headers, metadata)
|
||||||
url=info_url,
|
|
||||||
data=metadata,
|
|
||||||
headers=headers,
|
|
||||||
proxies=self.proxies,
|
|
||||||
verify=False)
|
|
||||||
|
|
||||||
self._cache[cache_key] = '1'
|
self._cache[cache_key] = '1'
|
||||||
|
|
||||||
return WbResponse.text_response(metadata, content_type=content_type)
|
return WbResponse.text_response(metadata, content_type=content_type)
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
class YoutubeDLWrapper(object): #pragma: no cover
|
|
||||||
""" YoutubeDL wrapper, inits youtubee-dl if it is available
|
|
||||||
"""
|
|
||||||
def __init__(self):
|
|
||||||
try:
|
|
||||||
from youtube_dl import YoutubeDL as YoutubeDL
|
|
||||||
except ImportError:
|
|
||||||
self.ydl = None
|
|
||||||
return
|
|
||||||
|
|
||||||
self.ydl = YoutubeDL(dict(simulate=True,
|
|
||||||
youtube_include_dash_manifest=False))
|
|
||||||
self.ydl.add_default_info_extractors()
|
|
||||||
|
|
||||||
def extract_info(self, url):
|
|
||||||
if not self.ydl:
|
|
||||||
return None
|
|
||||||
|
|
||||||
info = self.ydl.extract_info(url)
|
|
||||||
return info
|
|
||||||
|
@ -73,7 +73,7 @@ def setup_module():
|
|||||||
|
|
||||||
config = dict(collections=dict(rewrite='$liveweb'),
|
config = dict(collections=dict(rewrite='$liveweb'),
|
||||||
framed_replay=True,
|
framed_replay=True,
|
||||||
proxyhostport=server.proxy_dict)
|
proxyhostport=server.proxy_str)
|
||||||
|
|
||||||
global cache
|
global cache
|
||||||
cache = {}
|
cache = {}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user