diff --git a/pywb/rules.yaml b/pywb/rules.yaml index eed510db..69e8f6b2 100644 --- a/pywb/rules.yaml +++ b/pywb/rules.yaml @@ -129,7 +129,7 @@ rules: - url_prefix: 'com,googlevideo,' - fuzzy_lookup: 'com,googlevideo.*/videoplayback?.*(id=[^&]+).*(range=[^&]+)' + fuzzy_lookup: 'com,googlevideo.*/videoplayback?.*(id=[^&]).*(mime=[^&]).*(signature=[^&])' # testing rules -- not for valid domain diff --git a/pywb/webapp/live_rewrite_handler.py b/pywb/webapp/live_rewrite_handler.py index 2f7cd62e..4578edf9 100644 --- a/pywb/webapp/live_rewrite_handler.py +++ b/pywb/webapp/live_rewrite_handler.py @@ -70,32 +70,37 @@ class RewriteHandler(SearchPageWbUrlHandler): if ref_wburl_str: wbrequest.env['REL_REFERER'] = WbUrl(ref_wburl_str).url - result = self.rewriter.fetch_request(wbrequest.wb_url.url, - wbrequest.urlrewriter, - head_insert_func=head_insert_func, - req_headers=req_headers, - env=wbrequest.env) + def do_req(): + result = self.rewriter.fetch_request(wbrequest.wb_url.url, + wbrequest.urlrewriter, + head_insert_func=head_insert_func, + req_headers=req_headers, + env=wbrequest.env) + + return self._make_response(wbrequest, *result) + + cdx = dict(url=wbrequest.wb_url.url) + + range_status, range_iter = range_cache(wbrequest, cdx, do_req) + + if not range_status or not range_iter: + return do_req() + else: + result = range_status, range_iter, False + return self._make_response(wbrequest, *result) - return self._make_response(wbrequest, *result) def _make_response(self, wbrequest, status_headers, gen, is_rewritten): # if cookie set, pass recorded timestamp info via cookie # so that client side may be able to access it # used by framed mode to update frame banner if self.live_cookie: - cdx = wbrequest.env['pywb.cdx'] - value = self.live_cookie.format(cdx['timestamp']) - status_headers.headers.append(('Set-Cookie', value)) - - def resp_func(): - return WbResponse(status_headers, gen) - - #range_status, range_iter = range_cache(wbrequest, cdx, resp_func) - #if range_status and range_iter: - # return WbResponse(range_status, range_iter) - #else: - return resp_func() + cdx = wbrequest.env.get('pywb.cdx') + if cdx: + value = self.live_cookie.format(cdx['timestamp']) + status_headers.headers.append(('Set-Cookie', value)) + return WbResponse(status_headers, gen) def get_video_info(self, wbrequest): if not self.youtubedl: diff --git a/pywb/webapp/rangecache.py b/pywb/webapp/rangecache.py index b6a0a48d..dddb06e5 100644 --- a/pywb/webapp/rangecache.py +++ b/pywb/webapp/rangecache.py @@ -7,28 +7,71 @@ from tempfile import NamedTemporaryFile import hashlib import yaml import os +import re #================================================================= class RangeCache(object): + YOUTUBE_RX = re.compile('.*.googlevideo.com/videoplayback') + YT_EXTRACT_RX = re.compile('&range=([^&]+)') + + @staticmethod + def match_yt(url): + if not RangeCache.YOUTUBE_RX.match(url): + return None + + range_h_res = [] + + def repl_range(matcher): + range_h_res.append(matcher.group(1)) + return '' + + new_url = RangeCache.YT_EXTRACT_RX.sub(repl_range, url) + if range_h_res: + print('MATCHED') + return range_h_res[0], new_url + else: + return None, url + def __init__(self): self.cache = create_cache() print(type(self.cache)) def __call__(self, wbrequest, cdx, wbresponse_func): - range_h = wbrequest.env.get('HTTP_RANGE') - if not range_h: - return None, None + url = wbrequest.wb_url.url + range_h = None + use_206 = False + result = self.match_yt(url) + if result: + range_h, url = result + wbrequest.wb_url.url = url + print(range_h) + + # check for standard range header + if not range_h: + range_h = wbrequest.env.get('HTTP_RANGE') + if not range_h: + return None, None + range_h = True + + return self.handle_range(wbrequest, cdx, url, + wbresponse_func, + range_h, use_206) + + def handle_range(self, wbrequest, cdx, url, wbresponse_func, + range_h, use_206): + + range_h = range_h.split('=')[-1] key = cdx.get('digest') if not key: hash_ = hashlib.md5() - hash_.update(cdx['urlkey']) - hash_.update(cdx['timestamp']) + hash_.update(url) + #hash_.update(cdx['timestamp']) key = hash_.hexdigest() print('KEY: ', key) - print('CACHE: ', str(self.cache)) + print('RANGE: ', range_h) if not key in self.cache: print('MISS') @@ -56,13 +99,13 @@ class RangeCache(object): range_h = range_h.rstrip() - if range_h == 'bytes=0-': + if range_h == '0-': print('FIX RANGE') - range_h = 'bytes=0-120000' + range_h = '0-120000' parts = range_h.rstrip().split('-') start = parts[0] - start = start.split('=')[1] + #start = start.split('=')[1] start = int(start) maxlen = filelen - start @@ -82,14 +125,22 @@ class RangeCache(object): yield buf + if use_206: + content_range = 'bytes {0}-{1}/{2}'.format(start, + start + maxlen - 1, + filelen) + print('CONTENT_RANGE: ', content_range) - content_range = 'bytes {0}-{1}/{2}'.format(start, - start + maxlen - 1, - filelen) + status_headers = StatusAndHeaders('206 Partial Content', spec['headers']) + status_headers.replace_header('Content-Range', content_range) + else: + status_headers = StatusAndHeaders('200 OK', spec['headers']) + + status_headers.headers.append(('Accept-Ranges', 'bytes')) + status_headers.headers.append(('Access-Control-Allow-Credentials', 'true')) + status_headers.headers.append(('Access-Control-Allow-Origin', 'http://localhost:8080')) + status_headers.headers.append(('Timing-Allow-Origin', 'http://localhost:8080')) - print('CONTENT_RANGE: ', content_range) - status_headers = StatusAndHeaders('206 Partial Content', spec['headers']) - status_headers.replace_header('Content-Range', content_range) status_headers.replace_header('Content-Length', str(maxlen)) return status_headers, read_range() diff --git a/pywb/webapp/replay_views.py b/pywb/webapp/replay_views.py index b8db1c4e..b5002b0a 100644 --- a/pywb/webapp/replay_views.py +++ b/pywb/webapp/replay_views.py @@ -201,6 +201,9 @@ class ReplayView(object): if wbrequest.options['is_proxy']: return None + if range_cache and range_cache.match_yt(wbrequest.wb_url.url): + return None + redir_needed = (wbrequest.options.get('is_timegate', False)) if not redir_needed and self.redir_to_exact: