1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

video: work on domain-specific range cache rewrites

This commit is contained in:
Ilya Kreymer 2014-11-04 08:44:45 -08:00
parent 703ec0eb5e
commit 72aa921ce5
4 changed files with 93 additions and 34 deletions

View File

@ -129,7 +129,7 @@ rules:
- url_prefix: 'com,googlevideo,' - url_prefix: 'com,googlevideo,'
fuzzy_lookup: 'com,googlevideo.*/videoplayback?.*(id=[^&]+).*(range=[^&]+)' fuzzy_lookup: 'com,googlevideo.*/videoplayback?.*(id=[^&]).*(mime=[^&]).*(signature=[^&])'
# testing rules -- not for valid domain # testing rules -- not for valid domain

View File

@ -70,32 +70,37 @@ class RewriteHandler(SearchPageWbUrlHandler):
if ref_wburl_str: if ref_wburl_str:
wbrequest.env['REL_REFERER'] = WbUrl(ref_wburl_str).url wbrequest.env['REL_REFERER'] = WbUrl(ref_wburl_str).url
result = self.rewriter.fetch_request(wbrequest.wb_url.url, def do_req():
wbrequest.urlrewriter, result = self.rewriter.fetch_request(wbrequest.wb_url.url,
head_insert_func=head_insert_func, wbrequest.urlrewriter,
req_headers=req_headers, head_insert_func=head_insert_func,
env=wbrequest.env) req_headers=req_headers,
env=wbrequest.env)
return self._make_response(wbrequest, *result)
cdx = dict(url=wbrequest.wb_url.url)
range_status, range_iter = range_cache(wbrequest, cdx, do_req)
if not range_status or not range_iter:
return do_req()
else:
result = range_status, range_iter, False
return self._make_response(wbrequest, *result)
return self._make_response(wbrequest, *result)
def _make_response(self, wbrequest, status_headers, gen, is_rewritten): def _make_response(self, wbrequest, status_headers, gen, is_rewritten):
# if cookie set, pass recorded timestamp info via cookie # if cookie set, pass recorded timestamp info via cookie
# so that client side may be able to access it # so that client side may be able to access it
# used by framed mode to update frame banner # used by framed mode to update frame banner
if self.live_cookie: if self.live_cookie:
cdx = wbrequest.env['pywb.cdx'] cdx = wbrequest.env.get('pywb.cdx')
value = self.live_cookie.format(cdx['timestamp']) if cdx:
status_headers.headers.append(('Set-Cookie', value)) value = self.live_cookie.format(cdx['timestamp'])
status_headers.headers.append(('Set-Cookie', value))
def resp_func():
return WbResponse(status_headers, gen)
#range_status, range_iter = range_cache(wbrequest, cdx, resp_func)
#if range_status and range_iter:
# return WbResponse(range_status, range_iter)
#else:
return resp_func()
return WbResponse(status_headers, gen)
def get_video_info(self, wbrequest): def get_video_info(self, wbrequest):
if not self.youtubedl: if not self.youtubedl:

View File

@ -7,28 +7,71 @@ from tempfile import NamedTemporaryFile
import hashlib import hashlib
import yaml import yaml
import os import os
import re
#================================================================= #=================================================================
class RangeCache(object): class RangeCache(object):
YOUTUBE_RX = re.compile('.*.googlevideo.com/videoplayback')
YT_EXTRACT_RX = re.compile('&range=([^&]+)')
@staticmethod
def match_yt(url):
    """Split the ``range`` query argument out of a video playback url.

    Returns ``None`` when ``url`` does not match ``YOUTUBE_RX``.

    Otherwise returns a ``(range_value, new_url)`` tuple: ``new_url`` is
    ``url`` with every ``&range=...`` argument removed and ``range_value``
    is the value of the first one. If the url carries no such argument the
    result is ``(None, url)``.

    The tuple is truthy even when ``range_value`` is ``None``, so the
    result can also be used as a plain "does this url match" check.
    """
    if not RangeCache.YOUTUBE_RX.match(url):
        return None

    found = RangeCache.YT_EXTRACT_RX.search(url)
    if not found:
        return None, url

    # All range arguments are stripped from the url, but only the value of
    # the first (leftmost) one is reported
    return found.group(1), RangeCache.YT_EXTRACT_RX.sub('', url)
def __init__(self): def __init__(self):
self.cache = create_cache() self.cache = create_cache()
print(type(self.cache)) print(type(self.cache))
def __call__(self, wbrequest, cdx, wbresponse_func):
    """Hand a range request over to ``handle_range``.

    The range is taken from the url when ``match_yt`` extracts one (the
    request's ``wb_url.url`` is then replaced by the url returned from
    ``match_yt``), otherwise from the ``HTTP_RANGE`` entry of the request
    environ.

    Returns ``(None, None)`` when neither source supplies a range, else
    whatever ``handle_range`` returns.
    """
    url = wbrequest.wb_url.url
    range_h = None
    use_206 = False

    result = self.match_yt(url)
    if result:
        range_h, url = result
        wbrequest.wb_url.url = url

    # check for standard range header
    if not range_h:
        range_h = wbrequest.env.get('HTTP_RANGE')
        if not range_h:
            return None, None

        # A real Range header is answered with a 206. This line used to
        # overwrite range_h with True, which left use_206 always False and
        # made handle_range fail on range_h.split('=')
        use_206 = True

    return self.handle_range(wbrequest, cdx, url,
                             wbresponse_func,
                             range_h, use_206)
def handle_range(self, wbrequest, cdx, url, wbresponse_func,
range_h, use_206):
range_h = range_h.split('=')[-1]
key = cdx.get('digest') key = cdx.get('digest')
if not key: if not key:
hash_ = hashlib.md5() hash_ = hashlib.md5()
hash_.update(cdx['urlkey']) hash_.update(url)
hash_.update(cdx['timestamp']) #hash_.update(cdx['timestamp'])
key = hash_.hexdigest() key = hash_.hexdigest()
print('KEY: ', key) print('KEY: ', key)
print('CACHE: ', str(self.cache)) print('RANGE: ', range_h)
if not key in self.cache: if not key in self.cache:
print('MISS') print('MISS')
@ -56,13 +99,13 @@ class RangeCache(object):
range_h = range_h.rstrip() range_h = range_h.rstrip()
if range_h == 'bytes=0-': if range_h == '0-':
print('FIX RANGE') print('FIX RANGE')
range_h = 'bytes=0-120000' range_h = '0-120000'
parts = range_h.rstrip().split('-') parts = range_h.rstrip().split('-')
start = parts[0] start = parts[0]
start = start.split('=')[1] #start = start.split('=')[1]
start = int(start) start = int(start)
maxlen = filelen - start maxlen = filelen - start
@ -82,14 +125,22 @@ class RangeCache(object):
yield buf yield buf
if use_206:
content_range = 'bytes {0}-{1}/{2}'.format(start,
start + maxlen - 1,
filelen)
print('CONTENT_RANGE: ', content_range)
content_range = 'bytes {0}-{1}/{2}'.format(start, status_headers = StatusAndHeaders('206 Partial Content', spec['headers'])
start + maxlen - 1, status_headers.replace_header('Content-Range', content_range)
filelen) else:
status_headers = StatusAndHeaders('200 OK', spec['headers'])
status_headers.headers.append(('Accept-Ranges', 'bytes'))
status_headers.headers.append(('Access-Control-Allow-Credentials', 'true'))
status_headers.headers.append(('Access-Control-Allow-Origin', 'http://localhost:8080'))
status_headers.headers.append(('Timing-Allow-Origin', 'http://localhost:8080'))
print('CONTENT_RANGE: ', content_range)
status_headers = StatusAndHeaders('206 Partial Content', spec['headers'])
status_headers.replace_header('Content-Range', content_range)
status_headers.replace_header('Content-Length', str(maxlen)) status_headers.replace_header('Content-Length', str(maxlen))
return status_headers, read_range() return status_headers, read_range()

View File

@ -201,6 +201,9 @@ class ReplayView(object):
if wbrequest.options['is_proxy']: if wbrequest.options['is_proxy']:
return None return None
if range_cache and range_cache.match_yt(wbrequest.wb_url.url):
return None
redir_needed = (wbrequest.options.get('is_timegate', False)) redir_needed = (wbrequest.options.get('is_timegate', False))
if not redir_needed and self.redir_to_exact: if not redir_needed and self.redir_to_exact: