diff --git a/pywb/rewrite/rewrite_live.py b/pywb/rewrite/rewrite_live.py index b9cb11d9..60ab848b 100644 --- a/pywb/rewrite/rewrite_live.py +++ b/pywb/rewrite/rewrite_live.py @@ -38,6 +38,9 @@ class LiveRewriter(object): else: logging.debug('Live Rewrite Direct (no proxy)') + def is_recording(self): + return self.proxies is not None + def fetch_local_file(self, uri): #fh = open(uri) fh = LocalFileLoader().load(uri) @@ -123,14 +126,14 @@ class LiveRewriter(object): env=None, req_headers=None, follow_redirects=False, - ignore_proxies=False, + skip_recording=False, verify=True): method = 'GET' data = None proxies = None - if not ignore_proxies: + if not skip_recording: proxies = self.proxies if not req_headers: @@ -174,7 +177,7 @@ class LiveRewriter(object): req_headers={}, timestamp=None, follow_redirects=False, - ignore_proxies=False, + skip_recording=False, verify=True, remote_only=True): @@ -203,7 +206,7 @@ class LiveRewriter(object): (status_headers, stream) = self.fetch_http(url, urlkey, env, req_headers, follow_redirects, - ignore_proxies, + skip_recording, verify) else: (status_headers, stream) = self.fetch_local_file(url) @@ -232,6 +235,26 @@ class LiveRewriter(object): return result + def fetch_async(self, url, headers): + resp = self.live_request(method='GET', + url=url, + headers=headers, + proxies=self.proxies, + verify=False, + stream=True) + + # don't actually read whole response, + # proxy response for writing it + resp.close() + + def add_metadata(self, url, headers, data): + return self.live_request(method='PUTMETA', + url=url, + data=data, + headers=headers, + proxies=self.proxies, + verify=False) + def get_rewritten(self, *args, **kwargs): result = self.fetch_request(*args, **kwargs) @@ -240,3 +263,35 @@ class LiveRewriter(object): buff = ''.join(gen) return (status_headers, buff) + + def get_video_info(self, url): + return youtubedl.extract_info(url) + + +#================================================================= +class YoutubeDLWrapper(object): #pragma: no cover + """ YoutubeDL wrapper, inits youtubee-dl if it is available + """ + def __init__(self): + try: + from youtube_dl import YoutubeDL as YoutubeDL + except ImportError: + self.ydl = None + return + + self.ydl = YoutubeDL(dict(simulate=True, + youtube_include_dash_manifest=False)) + self.ydl.add_default_info_extractors() + + def extract_info(self, url): + print('YDL', self.ydl) + if not self.ydl: + return None + + info = self.ydl.extract_info(url) + return info + + +#================================================================= +youtubedl = YoutubeDLWrapper() + diff --git a/pywb/webapp/live_rewrite_handler.py b/pywb/webapp/live_rewrite_handler.py index a8121ad0..eda51dd8 100644 --- a/pywb/webapp/live_rewrite_handler.py +++ b/pywb/webapp/live_rewrite_handler.py @@ -11,7 +11,6 @@ from views import HeadInsertView from pywb.utils.wbexception import WbException import json -import requests import hashlib @@ -28,8 +27,6 @@ class RewriteHandler(SearchPageWbUrlHandler): YT_DL_TYPE = 'application/vnd.youtube-dl_formats+json' - youtubedl = None - def __init__(self, config): super(RewriteHandler, self).__init__(config) @@ -37,10 +34,10 @@ class RewriteHandler(SearchPageWbUrlHandler): live_rewriter_cls = config.get('live_rewriter_cls', LiveRewriter) - self.rewriter = live_rewriter_cls(is_framed_replay=self.is_frame_mode, - proxies=proxyhostport) + self.live_fetcher = live_rewriter_cls(is_framed_replay=self.is_frame_mode, + proxies=proxyhostport) - self.proxies = self.rewriter.proxies + self.recording = self.live_fetcher.is_recording() self.head_insert_view = HeadInsertView.init_from_config(config) @@ -73,7 +70,7 @@ class RewriteHandler(SearchPageWbUrlHandler): def _live_request_headers(self, wbrequest): return {} - def _ignore_proxies(self, wbrequest): + def _skip_recording(self, wbrequest): return False def render_content(self, wbrequest): @@ -87,7 +84,7 @@ class RewriteHandler(SearchPageWbUrlHandler): if ref_wburl_str: wbrequest.env['REL_REFERER'] = WbUrl(ref_wburl_str).url - ignore_proxies = self._ignore_proxies(wbrequest) + skip_recording = self._skip_recording(wbrequest) use_206 = False url = None @@ -96,7 +93,7 @@ class RewriteHandler(SearchPageWbUrlHandler): readd_range = False cache_key = None - if self.proxies and not ignore_proxies: + if self.recording and not skip_recording: rangeres = wbrequest.extract_range() if rangeres: @@ -110,17 +107,17 @@ class RewriteHandler(SearchPageWbUrlHandler): readd_range = True else: # disables proxy - ignore_proxies = True + skip_recording = True # sets cache_key only if not already cached cache_key = self._get_cache_key('r:', url) - result = self.rewriter.fetch_request(wbrequest.wb_url.url, + result = self.live_fetcher.fetch_request(wbrequest.wb_url.url, wbrequest.urlrewriter, head_insert_func=head_insert_func, req_headers=req_headers, env=wbrequest.env, - ignore_proxies=ignore_proxies, + skip_recording=skip_recording, verify=self.verify) wbresponse = self._make_response(wbrequest, *result) @@ -135,8 +132,8 @@ class RewriteHandler(SearchPageWbUrlHandler): except (ValueError, TypeError): pass - if cache_key: - self._add_proxy_ping(cache_key, url, wbrequest, wbresponse) + if self.recording and cache_key: + self._add_rec_ping(cache_key, url, wbrequest, wbresponse) if rangeres: referrer = wbrequest.env.get('REL_REFERER') @@ -183,7 +180,7 @@ class RewriteHandler(SearchPageWbUrlHandler): key = prefix + key return key - def _add_proxy_ping(self, key, url, wbrequest, wbresponse): + def _add_rec_ping(self, key, url, wbrequest, wbresponse): def do_ping(): headers = self._live_request_headers(wbrequest) headers['Connection'] = 'close' @@ -192,15 +189,8 @@ class RewriteHandler(SearchPageWbUrlHandler): # mark as pinged self._cache[key] = '1' - resp = requests.get(url=url, - headers=headers, - proxies=self.proxies, - verify=False, - stream=True) + self.live_fetcher.fetch_async(url, headers) - # don't actually read whole response, - # proxy response for writing it - resp.close() except: del self._cache[key] raise @@ -219,9 +209,6 @@ class RewriteHandler(SearchPageWbUrlHandler): return wbresponse def _get_video_info(self, wbrequest, info_url=None, video_url=None): - if not self.youtubedl: - self.youtubedl = YoutubeDLWrapper() - if not video_url: video_url = wbrequest.wb_url.url @@ -229,10 +216,10 @@ class RewriteHandler(SearchPageWbUrlHandler): info_url = wbrequest.wb_url.url cache_key = None - if self.proxies: + if self.recording: cache_key = self._get_cache_key('v:', video_url) - info = self.youtubedl.extract_info(video_url) + info = self.live_fetcher.get_video_info(video_url) if info is None: #pragma: no cover msg = ('youtube-dl is not installed, pip install youtube-dl to ' + 'enable improved video proxy') @@ -244,42 +231,14 @@ class RewriteHandler(SearchPageWbUrlHandler): content_type = self.YT_DL_TYPE metadata = json.dumps(info) - if (self.proxies and cache_key): + if (self.recording and cache_key): headers = self._live_request_headers(wbrequest) headers['Content-Type'] = content_type info_url = HttpsUrlRewriter.remove_https(info_url) - response = requests.request(method='PUTMETA', - url=info_url, - data=metadata, - headers=headers, - proxies=self.proxies, - verify=False) + response = self.live_fetcher.add_metadata(info_url, headers, metadata) self._cache[cache_key] = '1' return WbResponse.text_response(metadata, content_type=content_type) - - -#================================================================= -class YoutubeDLWrapper(object): #pragma: no cover - """ YoutubeDL wrapper, inits youtubee-dl if it is available - """ - def __init__(self): - try: - from youtube_dl import YoutubeDL as YoutubeDL - except ImportError: - self.ydl = None - return - - self.ydl = YoutubeDL(dict(simulate=True, - youtube_include_dash_manifest=False)) - self.ydl.add_default_info_extractors() - - def extract_info(self, url): - if not self.ydl: - return None - - info = self.ydl.extract_info(url) - return info diff --git a/tests/test_live_proxy.py b/tests/test_live_proxy.py index b64f74a5..7896eac5 100644 --- a/tests/test_live_proxy.py +++ b/tests/test_live_proxy.py @@ -73,7 +73,7 @@ def setup_module(): config = dict(collections=dict(rewrite='$liveweb'), framed_replay=True, - proxyhostport=server.proxy_dict) + proxyhostport=server.proxy_str) global cache cache = {}