1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

cache: move cache wrappers to separate cache.py in framework from

proxy_resolvers
range cache: add buffering cache for serving range requests, intended
for videos but not limited to them. The full response is cached in a temp
file and range requests are served from that cache. Still experimental;
deletion of the cached temp files still needs to be added.
youtube_dl: wrap youtube-dl import due to youtube-dl HTMLParser regex
bug
tests: add test for vi_ handler
This commit is contained in:
Ilya Kreymer 2014-11-01 13:22:54 -07:00
parent 07bcf9fbfe
commit 1aac5a9f15
7 changed files with 207 additions and 35 deletions

28
pywb/framework/cache.py Normal file
View File

@ -0,0 +1,28 @@
# Detect whether this process can import the uwsgi module; the resulting
# flag is what create_cache() consults when picking a cache backend.
try:  # pragma: no cover
    import uwsgi
except ImportError:
    uwsgi_cache = False
else:  # pragma: no cover
    uwsgi_cache = True
#=================================================================
class UwsgiCache(object):  # pragma: no cover
    """ Mapping-style adapter over the uwsgi cache functions.

    Each special method forwards directly to the matching ``uwsgi.cache_*``
    call, so the object can be used with ``in``, ``[]`` and ``del`` syntax.
    Return values and missing-key behavior are whatever the uwsgi call
    provides; nothing is translated here.
    """

    def __contains__(self, item):
        """ Forward ``item in cache`` to ``uwsgi.cache_exists``. """
        return uwsgi.cache_exists(item)

    def __getitem__(self, item):
        """ Forward ``cache[item]`` to ``uwsgi.cache_get``. """
        return uwsgi.cache_get(item)

    def __setitem__(self, item, value):
        """ Forward ``cache[item] = value`` to ``uwsgi.cache_update``. """
        uwsgi.cache_update(item, value)

    def __delitem__(self, item):
        """ Forward ``del cache[item]`` to ``uwsgi.cache_del``. """
        uwsgi.cache_del(item)
#=================================================================
def create_cache():
    """ Return the cache object appropriate for the current process.

    A plain dict is returned when the uwsgi module was not importable;
    otherwise a UwsgiCache wrapper is returned.
    """
    if not uwsgi_cache:
        return {}

    return UwsgiCache()  # pragma: no cover

View File

@ -3,31 +3,12 @@ from pywb.utils.loaders import extract_client_cookie
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.rewrite.wburl import WbUrl
from cache import create_cache
import urlparse
import base64
import os
try: # pragma: no cover
import uwsgi
uwsgi_cache = True
except ImportError:
uwsgi_cache = False
#=================================================================
class UwsgiCache(object): # pragma: no cover
def __setitem__(self, item, value):
uwsgi.cache_update(item, value)
def __getitem__(self, item):
return uwsgi.cache_get(item)
def __contains__(self, item):
return uwsgi.cache_exists(item)
def __delitem__(self, item):
uwsgi.cache_del(item)
#=================================================================
class BaseCollResolver(object):
@ -136,10 +117,7 @@ class CookieResolver(BaseCollResolver):
self.extra_headers = config.get('extra_headers')
if uwsgi_cache: # pragma: no cover
self.cache = UwsgiCache()
else:
self.cache = {}
self.cache = create_cache()
def get_proxy_coll_ts(self, env):
coll, ts, sesh_id = self.get_coll(env)

View File

@ -40,7 +40,7 @@ class SearchPageWbUrlHandler(WbUrlHandler):
create_template(html, 'Frame Insert'))
self.banner_html = config.get('banner_html', 'banner.html')
if config.get('enable_memento', False):
self.response_class = MementoResponse
@ -195,7 +195,7 @@ class StaticHandler(BaseHandler):
content_type, _ = mimetypes.guess_type(full_path)
return WbResponse.text_stream(data,
return WbResponse.text_stream(reader,
content_type=content_type,
headers=headers)

View File

@ -13,7 +13,8 @@ from pywb.utils.wbexception import WbException
import json
import requests
from youtube_dl import YoutubeDL
from rangecache import range_cache
#=================================================================
@ -27,6 +28,8 @@ class RewriteHandler(SearchPageWbUrlHandler):
LIVE_COOKIE = 'pywb.timestamp={0}; max-age=60'
youtubedl = None
def __init__(self, config):
super(RewriteHandler, self).__init__(config)
@ -84,17 +87,22 @@ class RewriteHandler(SearchPageWbUrlHandler):
value = self.live_cookie.format(cdx['timestamp'])
status_headers.headers.append(('Set-Cookie', value))
return WbResponse(status_headers, gen)
def resp_func():
return WbResponse(status_headers, gen)
#range_status, range_iter = range_cache(wbrequest, cdx, resp_func)
#if range_status and range_iter:
# return WbResponse(range_status, range_iter)
#else:
return resp_func()
def get_video_info(self, wbrequest):
if not self.ydl:
self.ydl = YoutubeDL(dict(simulate=True,
youtube_include_dash_manifest=False))
if not self.youtubedl:
self.youtubedl = YoutubeDLWrapper()
self.ydl.add_default_info_extractors()
info = self.youtubedl.extract_info(wbrequest.wb_url.url)
info = self.ydl.extract_info(wbrequest.wb_url.url)
content_type = 'application/vnd.youtube-dl_formats+json'
metadata = json.dumps(info)
@ -119,6 +127,42 @@ class RewriteHandler(SearchPageWbUrlHandler):
return 'Live Web Rewrite Handler'
#=================================================================
class YoutubeDLWrapper(object):
    """ Contain the side effect that importing youtube_dl has on HTMLParser.

    youtube_dl currently overrides the module-global
    HTMLParser.locatestarttagend regex with a different regex that doesn't
    quite work. This wrapper remembers both regexes and installs the
    youtube_dl one only for the duration of an extract_info() call,
    restoring the original one afterwards.
    """

    def __init__(self):
        import HTMLParser as parser_module

        self.htmlparser = parser_module

        # Remember the stock regex *before* youtube_dl is imported below.
        self.orig_tagregex = parser_module.locatestarttagend

        from youtube_dl import YoutubeDL

        # Whatever is installed now is the regex youtube_dl wants; keep a
        # reference to it and immediately put the stock regex back.
        self.ydl_tagregex = parser_module.locatestarttagend
        parser_module.locatestarttagend = self.orig_tagregex

        options = dict(simulate=True,
                       youtube_include_dash_manifest=False)

        self.ydl = YoutubeDL(options)
        self.ydl.add_default_info_extractors()

    def extract_info(self, url):
        """ Run YoutubeDL.extract_info(url) with youtube_dl's regex active.

        The original regex is restored even if extraction raises.
        """
        try:
            self.htmlparser.locatestarttagend = self.ydl_tagregex
            return self.ydl.extract_info(url)
        finally:
            self.htmlparser.locatestarttagend = self.orig_tagregex
#=================================================================
def create_live_rewriter_app(config={}):
routes = [Route('rewrite', RewriteHandler(config)),

98
pywb/webapp/rangecache.py Normal file
View File

@ -0,0 +1,98 @@
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.loaders import LimitReader
from pywb.framework.cache import create_cache
from tempfile import NamedTemporaryFile
import hashlib
import yaml
import os
#=================================================================
class RangeCache(object):
    """ Serve HTTP range requests from a temp-file copy of a full response.

    On the first range request for a capture, the full response body is
    written to a temporary file and the file name plus response headers
    are stored in the cache (serialized as YAML). That request and later
    range requests for the same capture are answered with
    '206 Partial Content' by reading the requested slice from the file.

    Still experimental: cached temp files are never deleted.
    """

    # End offset substituted for an open-ended 'bytes=0-' request, so the
    # first reply is a bounded chunk rather than the whole file.
    # NOTE(review): presumably tuned for video players -- confirm.
    DEFAULT_OPEN_END = 120000

    # Maximum number of bytes read from the temp file per yielded chunk.
    BLOCK_SIZE = 16384

    def __init__(self):
        self.cache = create_cache()

    def __call__(self, wbrequest, cdx, wbresponse_func):
        """ Try to answer the request as a byte-range request.

        :param wbrequest: request object; only ``wbrequest.env`` is read
        :param cdx: mapping describing the capture, used to build the key
        :param wbresponse_func: zero-argument callable producing the full
            response (with ``body`` and ``status_headers.headers``); it is
            only called when nothing usable is cached for this capture
        :return: ``(status_headers, body_iterator)`` for a 206 response, or
            ``(None, None)`` when the request has no Range header, or the
            header is not a single satisfiable ``bytes=START-[END]`` range.
            In that case the caller is expected to serve the full response.
        """
        range_h = wbrequest.env.get('HTTP_RANGE')
        if not range_h:
            return None, None

        # Parse before touching the cache so that an unsupported header
        # never forces the full response to be loaded and buffered.
        parsed = self._parse_range(range_h)
        if parsed is None:
            return None, None

        start, end = parsed

        if start == 0 and end is None:
            end = self.DEFAULT_OPEN_END

        key = self._cache_key(cdx)

        spec = self._load_spec(key)
        if spec is None:
            spec = self._store_response(key, wbresponse_func())

        filelen = os.path.getsize(spec['name'])

        maxlen = filelen - start
        if end is not None:
            maxlen = min(maxlen, end - start + 1)

        # Start offset at or past the end of the file: nothing to serve as
        # a partial response, so let the caller fall back.
        if maxlen <= 0:
            return None, None

        content_range = 'bytes {0}-{1}/{2}'.format(start,
                                                   start + maxlen - 1,
                                                   filelen)

        # Copy the header list so the replace_header() calls below do not
        # alter the header list of the response it was taken from.
        status_headers = StatusAndHeaders('206 Partial Content',
                                          list(spec['headers']))
        status_headers.replace_header('Content-Range', content_range)
        status_headers.replace_header('Content-Length', str(maxlen))

        return status_headers, self._read_range(spec['name'], start, maxlen)

    @staticmethod
    def _parse_range(range_h):
        """ Parse a single ``bytes=START-[END]`` range header value.

        Returns ``(start, end)`` with ``end`` set to None for an open-ended
        range, or None for anything else (other units, suffix ranges,
        multiple ranges, non-numeric or inverted offsets).
        """
        unit, sep, byte_spec = range_h.strip().partition('=')
        if not sep or unit.strip().lower() != 'bytes':
            return None

        first, dash, last = byte_spec.partition('-')
        first = first.strip()
        last = last.strip()

        if not dash or not first.isdigit():
            return None

        if last and not last.isdigit():
            return None

        start = int(first)
        end = int(last) if last else None

        if end is not None and end < start:
            return None

        return start, end

    @staticmethod
    def _cache_key(cdx):
        """ Return the cdx digest, or an md5 of urlkey + timestamp. """
        key = cdx.get('digest')
        if key:
            return key

        hash_ = hashlib.md5()
        hash_.update(cdx['urlkey'])
        hash_.update(cdx['timestamp'])
        return hash_.hexdigest()

    def _load_spec(self, key):
        """ Return the cached spec for key, or None if unusable.

        An entry whose temp file no longer exists is treated as a miss.
        """
        if key not in self.cache:
            return None

        raw = self.cache[key]
        if not raw:
            return None

        # NOTE(review): yaml.load() can construct arbitrary Python objects.
        # The data here is written by _store_response(), but consider
        # yaml.safe_dump()/yaml.safe_load() with list-valued headers if the
        # cache backend can be written to by anything else.
        spec = yaml.load(raw)

        if not os.path.isfile(spec['name']):
            return None

        spec['headers'] = [tuple(x) for x in spec['headers']]
        return spec

    def _store_response(self, key, response):
        """ Buffer the response body into a temp file and cache its spec. """
        name = None
        try:
            with NamedTemporaryFile(delete=False) as fh:
                name = fh.name
                for obj in response.body:
                    fh.write(obj)
        except Exception:
            # Don't leave a partially written file behind.
            if name and os.path.exists(name):
                os.remove(name)
            raise

        spec = dict(name=name,
                    headers=response.status_headers.headers)

        self.cache[key] = yaml.dump(spec)
        return spec

    def _read_range(self, name, start, length):
        """ Yield up to length bytes of file name, starting at start. """
        remaining = length

        # Binary mode: the cached payload is the raw response body.
        with open(name, 'rb') as fh:
            fh.seek(start)

            while remaining > 0:
                buf = fh.read(min(self.BLOCK_SIZE, remaining))
                if not buf:
                    break

                remaining -= len(buf)
                yield buf
#=================================================================
# Module-level RangeCache instance, created when this module is imported;
# this is the object other modules import as `range_cache`.
range_cache = RangeCache()

View File

@ -15,6 +15,8 @@ from pywb.warc.recordloader import ArchiveLoadFailed
from views import J2TemplateView, add_env_globals
from views import J2HtmlCapturesView, HeadInsertView
from rangecache import range_cache
#=================================================================
class CaptureException(WbException):
@ -77,7 +79,7 @@ class ReplayView(object):
first = False
response = self.replay_capture(wbrequest,
response = self.cached_replay_capture(wbrequest,
cdx,
cdx_loader,
failed_files)
@ -99,6 +101,23 @@ class ReplayView(object):
raise last_e
def cached_replay_capture(self, wbrequest, cdx, cdx_loader, failed_files):
    """ Replay a capture, letting the range cache answer it first.

    range_cache is given a callback that performs the regular
    replay_capture(). If it returns both a status and a body iterator,
    these are wrapped in self.response_class; otherwise the regular
    replay_capture() result is returned.
    """
    def load_capture():
        return self.replay_capture(wbrequest, cdx, cdx_loader, failed_files)

    status_headers, body_iter = range_cache(wbrequest, cdx, load_capture)

    if not (status_headers and body_iter):
        return load_capture()

    return self.response_class(status_headers,
                               body_iter,
                               wbrequest=wbrequest,
                               cdx=cdx)
def replay_capture(self, wbrequest, cdx, cdx_loader, failed_files):
(status_headers, stream) = (self.content_loader.
resolve_headers_and_payload(cdx,

View File

@ -38,4 +38,9 @@ class TestLiveRewriter:
resp = self.testapp.get('/rewrite/@#$@#$', status=400)
assert resp.status_int == 400
def test_live_video_info(self):
    """ The vi_ handler should return youtube-dl format info as JSON. """
    video_info_url = '/rewrite/vi_/https://www.youtube.com/watch?v=DjFZyFWSt1M'
    expected_type = 'application/vnd.youtube-dl_formats+json'

    resp = self.testapp.get(video_info_url)

    assert resp.status_int == 200
    assert resp.content_type == expected_type, resp.content_type