1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

live rewrite proxy: decouple having http/https proxy from recording,

move youtubedl wrapper calls, metadata add calls to live rewrite proxy class for easier extension
closes #141 also improves #136
This commit is contained in:
Ilya Kreymer 2015-10-23 11:54:50 -07:00
parent c7224ecceb
commit 39e824cb3a
3 changed files with 77 additions and 63 deletions

View File

@ -38,6 +38,9 @@ class LiveRewriter(object):
else:
logging.debug('Live Rewrite Direct (no proxy)')
def is_recording(self):
    """Report whether this rewriter has a proxy configuration.

    Returns True when ``self.proxies`` is anything other than ``None``,
    False otherwise.
    """
    has_proxies = self.proxies is not None
    return has_proxies
def fetch_local_file(self, uri):
#fh = open(uri)
fh = LocalFileLoader().load(uri)
@ -123,14 +126,14 @@ class LiveRewriter(object):
env=None,
req_headers=None,
follow_redirects=False,
ignore_proxies=False,
skip_recording=False,
verify=True):
method = 'GET'
data = None
proxies = None
if not ignore_proxies:
if not skip_recording:
proxies = self.proxies
if not req_headers:
@ -174,7 +177,7 @@ class LiveRewriter(object):
req_headers={},
timestamp=None,
follow_redirects=False,
ignore_proxies=False,
skip_recording=False,
verify=True,
remote_only=True):
@ -203,7 +206,7 @@ class LiveRewriter(object):
(status_headers, stream) = self.fetch_http(url, urlkey, env,
req_headers,
follow_redirects,
ignore_proxies,
skip_recording,
verify)
else:
(status_headers, stream) = self.fetch_local_file(url)
@ -232,6 +235,26 @@ class LiveRewriter(object):
return result
def fetch_async(self, url, headers):
    """Issue a streaming GET for ``url`` through ``self.proxies`` and drop it.

    The request is sent via ``self.live_request`` with certificate
    verification disabled and streaming enabled. The response is closed
    immediately, so nothing is returned to the caller.
    """
    request_args = dict(method='GET',
                        url=url,
                        headers=headers,
                        proxies=self.proxies,
                        verify=False,
                        stream=True)

    response = self.live_request(**request_args)

    # The body is intentionally not consumed here: the request only needs
    # to pass through the proxy, which handles writing the response.
    response.close()
def add_metadata(self, url, headers, data):
    """Send ``data`` for ``url`` through ``self.proxies`` with a PUTMETA request.

    The request goes through ``self.live_request`` with certificate
    verification disabled. Whatever ``self.live_request`` returns is
    passed back to the caller unchanged.
    """
    request_args = dict(method='PUTMETA',
                        url=url,
                        data=data,
                        headers=headers,
                        proxies=self.proxies,
                        verify=False)

    response = self.live_request(**request_args)
    return response
def get_rewritten(self, *args, **kwargs):
result = self.fetch_request(*args, **kwargs)
@ -240,3 +263,35 @@ class LiveRewriter(object):
buff = ''.join(gen)
return (status_headers, buff)
def get_video_info(self, url):
    """Look up video info for ``url`` via the module-level ``youtubedl`` wrapper.

    Returns whatever ``youtubedl.extract_info(url)`` returns.
    """
    video_info = youtubedl.extract_info(url)
    return video_info
#=================================================================
class YoutubeDLWrapper(object):  #pragma: no cover
    """ YoutubeDL wrapper, inits youtube-dl if it is available

    ``self.ydl`` holds the configured ``YoutubeDL`` instance, or ``None``
    when the ``youtube_dl`` package cannot be imported.
    """
    def __init__(self):
        try:
            from youtube_dl import YoutubeDL as YoutubeDL
        except ImportError:
            # youtube-dl is optional; extract_info() will return None
            self.ydl = None
            return

        self.ydl = YoutubeDL(dict(simulate=True,
                                  youtube_include_dash_manifest=False))

        self.ydl.add_default_info_extractors()

    def extract_info(self, url):
        """Return the youtube-dl info for ``url``.

        Returns ``None`` when no ``YoutubeDL`` instance is available;
        otherwise returns the result of ``self.ydl.extract_info(url)``.
        """
        # NOTE: the leftover debug print of self.ydl was removed so that
        # each call no longer writes to stdout.
        if not self.ydl:
            return None

        info = self.ydl.extract_info(url)
        return info
#=================================================================
# Shared module-level wrapper instance; its extract_info() returns None
# when the youtube_dl package could not be imported.
youtubedl = YoutubeDLWrapper()

View File

@ -11,7 +11,6 @@ from views import HeadInsertView
from pywb.utils.wbexception import WbException
import json
import requests
import hashlib
@ -28,8 +27,6 @@ class RewriteHandler(SearchPageWbUrlHandler):
YT_DL_TYPE = 'application/vnd.youtube-dl_formats+json'
youtubedl = None
def __init__(self, config):
super(RewriteHandler, self).__init__(config)
@ -37,10 +34,10 @@ class RewriteHandler(SearchPageWbUrlHandler):
live_rewriter_cls = config.get('live_rewriter_cls', LiveRewriter)
self.rewriter = live_rewriter_cls(is_framed_replay=self.is_frame_mode,
proxies=proxyhostport)
self.live_fetcher = live_rewriter_cls(is_framed_replay=self.is_frame_mode,
proxies=proxyhostport)
self.proxies = self.rewriter.proxies
self.recording = self.live_fetcher.is_recording()
self.head_insert_view = HeadInsertView.init_from_config(config)
@ -73,7 +70,7 @@ class RewriteHandler(SearchPageWbUrlHandler):
def _live_request_headers(self, wbrequest):
return {}
def _ignore_proxies(self, wbrequest):
def _skip_recording(self, wbrequest):
return False
def render_content(self, wbrequest):
@ -87,7 +84,7 @@ class RewriteHandler(SearchPageWbUrlHandler):
if ref_wburl_str:
wbrequest.env['REL_REFERER'] = WbUrl(ref_wburl_str).url
ignore_proxies = self._ignore_proxies(wbrequest)
skip_recording = self._skip_recording(wbrequest)
use_206 = False
url = None
@ -96,7 +93,7 @@ class RewriteHandler(SearchPageWbUrlHandler):
readd_range = False
cache_key = None
if self.proxies and not ignore_proxies:
if self.recording and not skip_recording:
rangeres = wbrequest.extract_range()
if rangeres:
@ -110,17 +107,17 @@ class RewriteHandler(SearchPageWbUrlHandler):
readd_range = True
else:
# disables proxy
ignore_proxies = True
skip_recording = True
# sets cache_key only if not already cached
cache_key = self._get_cache_key('r:', url)
result = self.rewriter.fetch_request(wbrequest.wb_url.url,
result = self.live_fetcher.fetch_request(wbrequest.wb_url.url,
wbrequest.urlrewriter,
head_insert_func=head_insert_func,
req_headers=req_headers,
env=wbrequest.env,
ignore_proxies=ignore_proxies,
skip_recording=skip_recording,
verify=self.verify)
wbresponse = self._make_response(wbrequest, *result)
@ -135,8 +132,8 @@ class RewriteHandler(SearchPageWbUrlHandler):
except (ValueError, TypeError):
pass
if cache_key:
self._add_proxy_ping(cache_key, url, wbrequest, wbresponse)
if self.recording and cache_key:
self._add_rec_ping(cache_key, url, wbrequest, wbresponse)
if rangeres:
referrer = wbrequest.env.get('REL_REFERER')
@ -183,7 +180,7 @@ class RewriteHandler(SearchPageWbUrlHandler):
key = prefix + key
return key
def _add_proxy_ping(self, key, url, wbrequest, wbresponse):
def _add_rec_ping(self, key, url, wbrequest, wbresponse):
def do_ping():
headers = self._live_request_headers(wbrequest)
headers['Connection'] = 'close'
@ -192,15 +189,8 @@ class RewriteHandler(SearchPageWbUrlHandler):
# mark as pinged
self._cache[key] = '1'
resp = requests.get(url=url,
headers=headers,
proxies=self.proxies,
verify=False,
stream=True)
self.live_fetcher.fetch_async(url, headers)
# don't actually read whole response,
# proxy response for writing it
resp.close()
except:
del self._cache[key]
raise
@ -219,9 +209,6 @@ class RewriteHandler(SearchPageWbUrlHandler):
return wbresponse
def _get_video_info(self, wbrequest, info_url=None, video_url=None):
if not self.youtubedl:
self.youtubedl = YoutubeDLWrapper()
if not video_url:
video_url = wbrequest.wb_url.url
@ -229,10 +216,10 @@ class RewriteHandler(SearchPageWbUrlHandler):
info_url = wbrequest.wb_url.url
cache_key = None
if self.proxies:
if self.recording:
cache_key = self._get_cache_key('v:', video_url)
info = self.youtubedl.extract_info(video_url)
info = self.live_fetcher.get_video_info(video_url)
if info is None: #pragma: no cover
msg = ('youtube-dl is not installed, pip install youtube-dl to ' +
'enable improved video proxy')
@ -244,42 +231,14 @@ class RewriteHandler(SearchPageWbUrlHandler):
content_type = self.YT_DL_TYPE
metadata = json.dumps(info)
if (self.proxies and cache_key):
if (self.recording and cache_key):
headers = self._live_request_headers(wbrequest)
headers['Content-Type'] = content_type
info_url = HttpsUrlRewriter.remove_https(info_url)
response = requests.request(method='PUTMETA',
url=info_url,
data=metadata,
headers=headers,
proxies=self.proxies,
verify=False)
response = self.live_fetcher.add_metadata(info_url, headers, metadata)
self._cache[cache_key] = '1'
return WbResponse.text_response(metadata, content_type=content_type)
#=================================================================
class YoutubeDLWrapper(object):  #pragma: no cover
    """Thin wrapper around youtube-dl that tolerates the package being absent.

    ``self.ydl`` is a configured ``YoutubeDL`` object, or ``None`` when
    ``youtube_dl`` cannot be imported.
    """
    def __init__(self):
        try:
            from youtube_dl import YoutubeDL
        except ImportError:
            # Optional dependency is missing
            self.ydl = None
        else:
            options = dict(simulate=True,
                           youtube_include_dash_manifest=False)
            self.ydl = YoutubeDL(options)
            self.ydl.add_default_info_extractors()

    def extract_info(self, url):
        """Return ``self.ydl.extract_info(url)``, or ``None`` without youtube-dl."""
        return self.ydl.extract_info(url) if self.ydl else None

View File

@ -73,7 +73,7 @@ def setup_module():
config = dict(collections=dict(rewrite='$liveweb'),
framed_replay=True,
proxyhostport=server.proxy_dict)
proxyhostport=server.proxy_str)
global cache
cache = {}