mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
cache: move cache wrappers to a separate cache.py in framework, from
proxy_resolvers. range cache: add buffering cache for serving range requests, intended for videos but not only. The full response is cached in a temp file and range requests are served from the cache; still experimental, need to add deletion. youtube_dl: wrap youtube-dl import due to youtube-dl HTMLParser regex bug. tests: add test for vi_ handler.
This commit is contained in:
parent
07bcf9fbfe
commit
1aac5a9f15
28
pywb/framework/cache.py
Normal file
28
pywb/framework/cache.py
Normal file
@ -0,0 +1,28 @@
|
||||
try: # pragma: no cover
|
||||
import uwsgi
|
||||
uwsgi_cache = True
|
||||
except ImportError:
|
||||
uwsgi_cache = False
|
||||
|
||||
|
||||
#=================================================================
|
||||
class UwsgiCache(object):  # pragma: no cover
    """Dict-style facade over the ``uwsgi`` module's cache functions.

    Supports item assignment, item lookup, ``in`` tests and ``del`` by
    forwarding each operation to the matching ``uwsgi.cache_*`` call.
    """

    def __contains__(self, item):
        """Return the result of ``uwsgi.cache_exists`` for ``item``."""
        return uwsgi.cache_exists(item)

    def __getitem__(self, item):
        """Return whatever ``uwsgi.cache_get`` yields for ``item``."""
        return uwsgi.cache_get(item)

    def __setitem__(self, item, value):
        """Store ``value`` under ``item`` via ``uwsgi.cache_update``."""
        uwsgi.cache_update(item, value)

    def __delitem__(self, item):
        """Remove ``item`` via ``uwsgi.cache_del``."""
        uwsgi.cache_del(item)
|
||||
|
||||
|
||||
#=================================================================
|
||||
def create_cache():
    """Build the cache object used by this process.

    Returns a ``UwsgiCache`` when the ``uwsgi`` module could be imported
    (``uwsgi_cache`` is true), and a plain in-memory ``dict`` otherwise.
    """
    if not uwsgi_cache:
        return {}

    return UwsgiCache()  # pragma: no cover
|
@ -3,31 +3,12 @@ from pywb.utils.loaders import extract_client_cookie
|
||||
from pywb.utils.statusandheaders import StatusAndHeaders
|
||||
from pywb.rewrite.wburl import WbUrl
|
||||
|
||||
from cache import create_cache
|
||||
|
||||
import urlparse
|
||||
import base64
|
||||
import os
|
||||
|
||||
try: # pragma: no cover
|
||||
import uwsgi
|
||||
uwsgi_cache = True
|
||||
except ImportError:
|
||||
uwsgi_cache = False
|
||||
|
||||
|
||||
#=================================================================
|
||||
class UwsgiCache(object):  # pragma: no cover
    """Dict-style facade over the ``uwsgi`` module's cache functions.

    Supports item assignment, item lookup, ``in`` tests and ``del`` by
    forwarding each operation to the matching ``uwsgi.cache_*`` call.
    """

    def __contains__(self, item):
        """Return the result of ``uwsgi.cache_exists`` for ``item``."""
        return uwsgi.cache_exists(item)

    def __getitem__(self, item):
        """Return whatever ``uwsgi.cache_get`` yields for ``item``."""
        return uwsgi.cache_get(item)

    def __setitem__(self, item, value):
        """Store ``value`` under ``item`` via ``uwsgi.cache_update``."""
        uwsgi.cache_update(item, value)

    def __delitem__(self, item):
        """Remove ``item`` via ``uwsgi.cache_del``."""
        uwsgi.cache_del(item)
|
||||
|
||||
|
||||
#=================================================================
|
||||
class BaseCollResolver(object):
|
||||
@ -136,10 +117,7 @@ class CookieResolver(BaseCollResolver):
|
||||
|
||||
self.extra_headers = config.get('extra_headers')
|
||||
|
||||
if uwsgi_cache: # pragma: no cover
|
||||
self.cache = UwsgiCache()
|
||||
else:
|
||||
self.cache = {}
|
||||
self.cache = create_cache()
|
||||
|
||||
def get_proxy_coll_ts(self, env):
|
||||
coll, ts, sesh_id = self.get_coll(env)
|
||||
|
@ -40,7 +40,7 @@ class SearchPageWbUrlHandler(WbUrlHandler):
|
||||
create_template(html, 'Frame Insert'))
|
||||
|
||||
self.banner_html = config.get('banner_html', 'banner.html')
|
||||
|
||||
|
||||
if config.get('enable_memento', False):
|
||||
self.response_class = MementoResponse
|
||||
|
||||
@ -195,7 +195,7 @@ class StaticHandler(BaseHandler):
|
||||
|
||||
content_type, _ = mimetypes.guess_type(full_path)
|
||||
|
||||
return WbResponse.text_stream(data,
|
||||
return WbResponse.text_stream(reader,
|
||||
content_type=content_type,
|
||||
headers=headers)
|
||||
|
||||
|
@ -13,7 +13,8 @@ from pywb.utils.wbexception import WbException
|
||||
|
||||
import json
|
||||
import requests
|
||||
from youtube_dl import YoutubeDL
|
||||
|
||||
from rangecache import range_cache
|
||||
|
||||
|
||||
#=================================================================
|
||||
@ -27,6 +28,8 @@ class RewriteHandler(SearchPageWbUrlHandler):
|
||||
|
||||
LIVE_COOKIE = 'pywb.timestamp={0}; max-age=60'
|
||||
|
||||
youtubedl = None
|
||||
|
||||
def __init__(self, config):
|
||||
super(RewriteHandler, self).__init__(config)
|
||||
|
||||
@ -84,17 +87,22 @@ class RewriteHandler(SearchPageWbUrlHandler):
|
||||
value = self.live_cookie.format(cdx['timestamp'])
|
||||
status_headers.headers.append(('Set-Cookie', value))
|
||||
|
||||
return WbResponse(status_headers, gen)
|
||||
def resp_func():
|
||||
return WbResponse(status_headers, gen)
|
||||
|
||||
#range_status, range_iter = range_cache(wbrequest, cdx, resp_func)
|
||||
#if range_status and range_iter:
|
||||
# return WbResponse(range_status, range_iter)
|
||||
#else:
|
||||
return resp_func()
|
||||
|
||||
|
||||
def get_video_info(self, wbrequest):
|
||||
if not self.ydl:
|
||||
self.ydl = YoutubeDL(dict(simulate=True,
|
||||
youtube_include_dash_manifest=False))
|
||||
if not self.youtubedl:
|
||||
self.youtubedl = YoutubeDLWrapper()
|
||||
|
||||
self.ydl.add_default_info_extractors()
|
||||
info = self.youtubedl.extract_info(wbrequest.wb_url.url)
|
||||
|
||||
info = self.ydl.extract_info(wbrequest.wb_url.url)
|
||||
content_type = 'application/vnd.youtube-dl_formats+json'
|
||||
metadata = json.dumps(info)
|
||||
|
||||
@ -119,6 +127,42 @@ class RewriteHandler(SearchPageWbUrlHandler):
|
||||
return 'Live Web Rewrite Handler'
|
||||
|
||||
|
||||
#=================================================================
|
||||
class YoutubeDLWrapper(object):
    """ Isolates the side effect of importing youtube_dl.

    Importing youtube_dl currently replaces the global
    HTMLParser.locatestarttagend regex with one that doesn't quite work.

    The wrapper remembers both regexes, puts the original back right after
    the import, and installs the youtube_dl one only for the duration of
    each extract_info() call.
    """
    def __init__(self):
        import HTMLParser

        self.htmlparser = HTMLParser

        # snapshot the stock regex before youtube_dl gets a chance to patch it
        self.orig_tagregex = HTMLParser.locatestarttagend

        from youtube_dl import YoutubeDL

        # snapshot what the import left behind, then undo it globally
        self.ydl_tagregex = HTMLParser.locatestarttagend
        HTMLParser.locatestarttagend = self.orig_tagregex

        options = {'simulate': True,
                   'youtube_include_dash_manifest': False}

        self.ydl = YoutubeDL(options)
        self.ydl.add_default_info_extractors()

    def extract_info(self, url):
        """ Return ydl.extract_info(url), run with the youtube_dl regex
        installed; the original regex is always restored afterwards, even
        when extraction raises.
        """
        parser_module = self.htmlparser
        try:
            parser_module.locatestarttagend = self.ydl_tagregex
            return self.ydl.extract_info(url)
        finally:
            parser_module.locatestarttagend = self.orig_tagregex
|
||||
|
||||
|
||||
#=================================================================
|
||||
def create_live_rewriter_app(config={}):
|
||||
routes = [Route('rewrite', RewriteHandler(config)),
|
||||
|
98
pywb/webapp/rangecache.py
Normal file
98
pywb/webapp/rangecache.py
Normal file
@ -0,0 +1,98 @@
|
||||
from pywb.utils.statusandheaders import StatusAndHeaders
|
||||
from pywb.utils.loaders import LimitReader
|
||||
from pywb.framework.cache import create_cache
|
||||
|
||||
from tempfile import NamedTemporaryFile
|
||||
|
||||
import hashlib
|
||||
import yaml
|
||||
import os
|
||||
|
||||
|
||||
#=================================================================
|
||||
class RangeCache(object):
    """Answer HTTP range requests from a fully buffered copy of a response.

    The first range request for a capture writes the complete response body
    to a temp file and records the file name plus the response headers in
    the cache returned by ``create_cache()``. That request, and later range
    requests for the same capture, are served by reading the requested
    bytes from the temp file.

    NOTE: still experimental -- temp files are created with
    ``delete=False`` and nothing in this class removes them once cached.
    """

    # 'bytes=0-' (open-ended, from the start) is answered with at most
    # bytes 0..OPEN_RANGE_END instead of the whole file.
    OPEN_RANGE_END = 120000

    # Chunk size used when streaming the range back from the temp file.
    BLOCK_SIZE = 16384

    def __init__(self):
        self.cache = create_cache()

    def __call__(self, wbrequest, cdx, wbresponse_func):
        """Build a 206 response for the Range header in ``wbrequest``.

        :param wbrequest: request object; only ``wbrequest.env`` is read.
        :param cdx: mapping with a ``digest`` key, or ``urlkey`` and
            ``timestamp`` keys, used to derive the cache key.
        :param wbresponse_func: zero-argument callable producing the full
            response (needs ``body`` and ``status_headers.headers``); only
            called when the capture is not cached yet.
        :returns: ``(status_headers, body_iterator)`` for a partial
            response, or ``(None, None)`` when there is no Range header or
            the range cannot be satisfied as a single byte range, so the
            caller can fall back to the regular response.
        """
        range_h = wbrequest.env.get('HTTP_RANGE')
        if not range_h:
            return None, None

        # Validate the header before doing any work on the response, so a
        # bad header never triggers buffering and never raises.
        parsed = self._parse_range(range_h)
        if parsed is None:
            return None, None

        start, end = parsed
        if start == 0 and end is None:
            end = self.OPEN_RANGE_END

        key = self._cache_key(cdx)

        spec = self._load_spec(key)
        if spec is None:
            spec = self._store_response(key, wbresponse_func())

        filelen = os.path.getsize(spec['name'])

        # Nothing to serve from this offset; let the caller respond normally
        # rather than emitting a negative Content-Length.
        if start >= filelen:
            return None, None

        maxlen = filelen - start
        if end is not None:
            maxlen = min(maxlen, end - start + 1)

        content_range = 'bytes {0}-{1}/{2}'.format(start,
                                                   start + maxlen - 1,
                                                   filelen)

        # Copy the header list: replace_header() must not alter the headers
        # of the response object that produced them.
        status_headers = StatusAndHeaders('206 Partial Content',
                                          list(spec['headers']))
        status_headers.replace_header('Content-Range', content_range)
        status_headers.replace_header('Content-Length', str(maxlen))

        body = self._read_range(spec['name'], start, maxlen, self.BLOCK_SIZE)
        return status_headers, body

    @staticmethod
    def _parse_range(range_h):
        """Parse a single ``bytes=START-[END]`` range.

        Returns ``(start, end)`` with ``end`` set to ``None`` for an
        open-ended range, or ``None`` for anything else (other units,
        suffix ranges, multiple ranges, non-numeric or inverted bounds).
        """
        unit, sep, spec = range_h.strip().partition('=')
        if not sep or unit.strip().lower() != 'bytes':
            return None

        first, sep, last = spec.partition('-')
        if not sep:
            return None

        try:
            start = int(first)
            end = int(last) if last.strip() else None
        except ValueError:
            return None

        if end is not None and end < start:
            return None

        return start, end

    @staticmethod
    def _cache_key(cdx):
        """Return the cdx digest, or an md5 of urlkey + timestamp."""
        key = cdx.get('digest')
        if key:
            return key

        hash_ = hashlib.md5()
        for field in ('urlkey', 'timestamp'):
            value = cdx[field]
            # hashlib needs bytes; encode text explicitly
            if not isinstance(value, bytes):
                value = value.encode('utf-8')
            hash_.update(value)

        return hash_.hexdigest()

    def _load_spec(self, key):
        """Return the cached ``{name, headers}`` spec, or None on a miss.

        An entry that cannot be decoded, or whose temp file no longer
        exists, is treated as a miss so the response gets buffered again.
        """
        if key not in self.cache:
            return None

        raw = self.cache[key]
        if not raw:
            return None

        try:
            # safe_load: cache contents are plain data, never build objects
            spec = yaml.safe_load(raw)
            name = spec['name']
            headers = [tuple(header) for header in spec['headers']]
        except (yaml.YAMLError, TypeError, KeyError):
            return None

        if not os.path.isfile(name):
            return None

        return dict(name=name, headers=headers)

    def _store_response(self, key, response):
        """Buffer ``response.body`` to a temp file and record it in the cache."""
        fh = NamedTemporaryFile(delete=False)
        try:
            with fh:
                for buf in response.body:
                    fh.write(buf)
        except Exception:
            # don't leave a partial file behind if buffering fails
            os.remove(fh.name)
            raise

        headers = [tuple(header)
                   for header in response.status_headers.headers]

        # headers are stored as lists so the entry is safe_load-able
        self.cache[key] = yaml.safe_dump(
            dict(name=fh.name, headers=[list(header) for header in headers]))

        return dict(name=fh.name, headers=headers)

    @staticmethod
    def _read_range(name, start, length, block_size=16384):
        """Yield up to ``length`` bytes of file ``name`` from offset ``start``.

        The file is opened in binary mode and read in ``block_size`` chunks
        so a large range is never held in memory at once.
        """
        with open(name, 'rb') as fh:
            fh.seek(start)
            remaining = length
            while remaining > 0:
                buf = fh.read(min(block_size, remaining))
                if not buf:
                    break

                remaining -= len(buf)
                yield buf
|
||||
|
||||
|
||||
#=================================================================
|
||||
# Shared module-level instance, created at import time; the handler modules
# pull it in with `from rangecache import range_cache`.
range_cache = RangeCache()
|
@ -15,6 +15,8 @@ from pywb.warc.recordloader import ArchiveLoadFailed
|
||||
from views import J2TemplateView, add_env_globals
|
||||
from views import J2HtmlCapturesView, HeadInsertView
|
||||
|
||||
from rangecache import range_cache
|
||||
|
||||
|
||||
#=================================================================
|
||||
class CaptureException(WbException):
|
||||
@ -77,7 +79,7 @@ class ReplayView(object):
|
||||
|
||||
first = False
|
||||
|
||||
response = self.replay_capture(wbrequest,
|
||||
response = self.cached_replay_capture(wbrequest,
|
||||
cdx,
|
||||
cdx_loader,
|
||||
failed_files)
|
||||
@ -99,6 +101,23 @@ class ReplayView(object):
|
||||
|
||||
raise last_e
|
||||
|
||||
|
||||
def cached_replay_capture(self, wbrequest, cdx, cdx_loader, failed_files):
    """Replay a capture, going through the range cache first.

    ``range_cache`` is handed a callable that performs the normal
    ``replay_capture``. If it returns both a status and a body iterator,
    they are wrapped in ``self.response_class``; otherwise the normal
    replay response is returned.
    """
    def load_capture():
        return self.replay_capture(wbrequest,
                                   cdx,
                                   cdx_loader,
                                   failed_files)

    status, body_iter = range_cache(wbrequest, cdx, load_capture)

    if not (status and body_iter):
        return load_capture()

    return self.response_class(status,
                               body_iter,
                               wbrequest=wbrequest,
                               cdx=cdx)
|
||||
|
||||
def replay_capture(self, wbrequest, cdx, cdx_loader, failed_files):
|
||||
(status_headers, stream) = (self.content_loader.
|
||||
resolve_headers_and_payload(cdx,
|
||||
|
@ -38,4 +38,9 @@ class TestLiveRewriter:
|
||||
resp = self.testapp.get('/rewrite/@#$@#$', status=400)
|
||||
assert resp.status_int == 400
|
||||
|
||||
def test_live_video_info(self):
    """The vi_ handler responds 200 with the youtube-dl formats JSON type."""
    expected_type = 'application/vnd.youtube-dl_formats+json'

    resp = self.testapp.get('/rewrite/vi_/https://www.youtube.com/watch?v=DjFZyFWSt1M')

    assert resp.status_int == 200
    assert resp.content_type == expected_type, resp.content_type
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user