1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-25 23:47:47 +01:00
pywb/pywb/webapp/rangecache.py
Ilya Kreymer dffa0bcbad video work: vidrw fixes to support object, fallback to flash
rangecache: create dir on first use only
2014-12-01 09:09:23 -08:00

154 lines
4.4 KiB
Python

from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.loaders import LimitReader
from pywb.framework.cache import create_cache
from tempfile import NamedTemporaryFile, mkdtemp
import yaml
import os
import re
import atexit
#=================================================================
class RangeCache(object):
YOUTUBE_RX = re.compile('.*.googlevideo.com/videoplayback')
YT_EXTRACT_RX = re.compile('&range=([^&]+)')
DEFAULT_BUFF = 16384*4
@staticmethod
def match_yt(url):
if not RangeCache.YOUTUBE_RX.match(url):
return None
range_h_res = []
def repl_range(matcher):
range_h_res.append(matcher.group(1))
return ''
new_url = RangeCache.YT_EXTRACT_RX.sub(repl_range, url)
if range_h_res:
return range_h_res[0], new_url
else:
return None, url
def __init__(self):
self.cache = create_cache()
self.temp_dir = None
atexit.register(self.cleanup)
def cleanup(self):
if self.temp_dir:
import shutil
print('Removing: ' + self.temp_dir)
shutil.rmtree(self.temp_dir, True)
def is_ranged(self, wbrequest):
url = wbrequest.wb_url.url
range_h = None
use_206 = False
result = self.match_yt(url)
if result:
range_h, url = result
# check for standard range header
if not range_h:
range_h = wbrequest.env.get('HTTP_RANGE')
if not range_h:
return None
use_206 = True
# force bounded range
range_h = range_h.split('=')[-1]
range_h = range_h.rstrip()
parts = range_h.split('-', 1)
start = int(parts[0])
if len(parts) == 2 and parts[1]:
end = int(parts[1])
else:
end = start + self.DEFAULT_BUFF - 1
return url, start, end, use_206
def __call__(self, wbrequest, digest, wbresponse_func):
result = self.is_ranged(wbrequest)
if not result:
return None, None
return self.handle_range(wbrequest, digest, wbresponse_func,
*result)
def handle_range(self, wbrequest, digest, wbresponse_func,
url, start, end, use_206):
key = digest
if not key in self.cache:
wbrequest.custom_params['noredir'] = True
response = wbresponse_func()
# only cache 200 responses
if not response.status_headers.get_statuscode().startswith('200'):
print('NON 200 RESP')
return response.status_headers, response.body
if not self.temp_dir:
self.temp_dir = mkdtemp(prefix='_pywbcache')
else:
pass
#self._check_dir_size(self.temp_dir)
with NamedTemporaryFile(delete=False, dir=self.temp_dir) as fh:
for obj in response.body:
fh.write(obj)
name = fh.name
spec = dict(name=fh.name,
headers=response.status_headers.headers)
self.cache[key] = yaml.dump(spec)
else:
spec = yaml.load(self.cache[key])
spec['headers'] = [tuple(x) for x in spec['headers']]
filelen = os.path.getsize(spec['name'])
maxlen = filelen - start
if end:
maxlen = min(maxlen, end - start + 1)
def read_range():
with open(spec['name']) as fh:
fh.seek(start)
fh = LimitReader.wrap_stream(fh, maxlen)
while True:
buf = fh.read()
if not buf:
break
yield buf
if use_206:
content_range = 'bytes {0}-{1}/{2}'.format(start,
start + maxlen - 1,
filelen)
status_headers = StatusAndHeaders('206 Partial Content', spec['headers'])
status_headers.replace_header('Content-Range', content_range)
else:
status_headers = StatusAndHeaders('200 OK', spec['headers'])
status_headers.replace_header('Content-Length', str(maxlen))
return status_headers, read_range()
#=================================================================
range_cache = RangeCache()