diff --git a/pywb/framework/cache.py b/pywb/framework/cache.py new file mode 100644 index 00000000..9028828b --- /dev/null +++ b/pywb/framework/cache.py @@ -0,0 +1,28 @@ +try: # pragma: no cover + import uwsgi + uwsgi_cache = True +except ImportError: + uwsgi_cache = False + + +#================================================================= +class UwsgiCache(object): # pragma: no cover + def __setitem__(self, item, value): + uwsgi.cache_update(item, value) + + def __getitem__(self, item): + return uwsgi.cache_get(item) + + def __contains__(self, item): + return uwsgi.cache_exists(item) + + def __delitem__(self, item): + uwsgi.cache_del(item) + + +#================================================================= +def create_cache(): + if uwsgi_cache: # pragma: no cover + return UwsgiCache() + else: + return {} diff --git a/pywb/framework/proxy_resolvers.py b/pywb/framework/proxy_resolvers.py index cbe636ca..f9d7cb88 100644 --- a/pywb/framework/proxy_resolvers.py +++ b/pywb/framework/proxy_resolvers.py @@ -3,31 +3,12 @@ from pywb.utils.loaders import extract_client_cookie from pywb.utils.statusandheaders import StatusAndHeaders from pywb.rewrite.wburl import WbUrl +from cache import create_cache + import urlparse import base64 import os -try: # pragma: no cover - import uwsgi - uwsgi_cache = True -except ImportError: - uwsgi_cache = False - - -#================================================================= -class UwsgiCache(object): # pragma: no cover - def __setitem__(self, item, value): - uwsgi.cache_update(item, value) - - def __getitem__(self, item): - return uwsgi.cache_get(item) - - def __contains__(self, item): - return uwsgi.cache_exists(item) - - def __delitem__(self, item): - uwsgi.cache_del(item) - #================================================================= class BaseCollResolver(object): @@ -136,10 +117,7 @@ class CookieResolver(BaseCollResolver): self.extra_headers = config.get('extra_headers') - if uwsgi_cache: # pragma: no cover - self.cache = UwsgiCache() - else: - self.cache = {} + self.cache = create_cache() def get_proxy_coll_ts(self, env): coll, ts, sesh_id = self.get_coll(env) diff --git a/pywb/webapp/handlers.py b/pywb/webapp/handlers.py index 2c7962cc..2e296e2d 100644 --- a/pywb/webapp/handlers.py +++ b/pywb/webapp/handlers.py @@ -40,7 +40,7 @@ class SearchPageWbUrlHandler(WbUrlHandler): create_template(html, 'Frame Insert')) self.banner_html = config.get('banner_html', 'banner.html') - + if config.get('enable_memento', False): self.response_class = MementoResponse @@ -195,7 +195,7 @@ class StaticHandler(BaseHandler): content_type, _ = mimetypes.guess_type(full_path) - return WbResponse.text_stream(data, + return WbResponse.text_stream(reader, content_type=content_type, headers=headers) diff --git a/pywb/webapp/live_rewrite_handler.py b/pywb/webapp/live_rewrite_handler.py index 078d544b..2f7cd62e 100644 --- a/pywb/webapp/live_rewrite_handler.py +++ b/pywb/webapp/live_rewrite_handler.py @@ -13,7 +13,8 @@ from pywb.utils.wbexception import WbException import json import requests -from youtube_dl import YoutubeDL + +from rangecache import range_cache #================================================================= @@ -27,6 +28,8 @@ class RewriteHandler(SearchPageWbUrlHandler): LIVE_COOKIE = 'pywb.timestamp={0}; max-age=60' + youtubedl = None + def __init__(self, config): super(RewriteHandler, self).__init__(config) @@ -84,17 +87,22 @@ class RewriteHandler(SearchPageWbUrlHandler): value = self.live_cookie.format(cdx['timestamp']) status_headers.headers.append(('Set-Cookie', value)) - return WbResponse(status_headers, gen) + def resp_func(): + return WbResponse(status_headers, gen) + + #range_status, range_iter = range_cache(wbrequest, cdx, resp_func) + #if range_status and range_iter: + # return WbResponse(range_status, range_iter) + #else: + return resp_func() def get_video_info(self, wbrequest): - if not self.ydl: - self.ydl = YoutubeDL(dict(simulate=True, - youtube_include_dash_manifest=False)) + if not self.youtubedl: + self.youtubedl = YoutubeDLWrapper() - self.ydl.add_default_info_extractors() + info = self.youtubedl.extract_info(wbrequest.wb_url.url) - info = self.ydl.extract_info(wbrequest.wb_url.url) content_type = 'application/vnd.youtube-dl_formats+json' metadata = json.dumps(info) @@ -119,6 +127,42 @@ class RewriteHandler(SearchPageWbUrlHandler): return 'Live Web Rewrite Handler' +#================================================================= +class YoutubeDLWrapper(object): + """ Used to wrap youtubedl import, since youtubedl currently overrides + global HTMLParser.locatestarttagend regex with a different regex + that doesn't quite work. + + This wrapper ensures that this regex is only set for YoutubeDL and unset + otherwise + """ + def __init__(self): + import HTMLParser as htmlparser + self.htmlparser = htmlparser + + self.orig_tagregex = htmlparser.locatestarttagend + + from youtube_dl import YoutubeDL as YoutubeDL + + self.ydl_tagregex = htmlparser.locatestarttagend + + htmlparser.locatestarttagend = self.orig_tagregex + + self.ydl = YoutubeDL(dict(simulate=True, + youtube_include_dash_manifest=False)) + self.ydl.add_default_info_extractors() + + def extract_info(self, url): + info = None + try: + self.htmlparser.locatestarttagend = self.ydl_tagregex + info = self.ydl.extract_info(url) + finally: + self.htmlparser.locatestarttagend = self.orig_tagregex + + return info + + #================================================================= def create_live_rewriter_app(config={}): routes = [Route('rewrite', RewriteHandler(config)), diff --git a/pywb/webapp/rangecache.py b/pywb/webapp/rangecache.py new file mode 100644 index 00000000..b6a0a48d --- /dev/null +++ b/pywb/webapp/rangecache.py @@ -0,0 +1,98 @@ +from pywb.utils.statusandheaders import StatusAndHeaders +from pywb.utils.loaders import LimitReader +from pywb.framework.cache import create_cache + +from tempfile import NamedTemporaryFile + +import hashlib +import yaml +import os + + +#================================================================= +class RangeCache(object): + def __init__(self): + self.cache = create_cache() + print(type(self.cache)) + + def __call__(self, wbrequest, cdx, wbresponse_func): + range_h = wbrequest.env.get('HTTP_RANGE') + if not range_h: + return None, None + + key = cdx.get('digest') + if not key: + hash_ = hashlib.md5() + hash_.update(cdx['urlkey']) + hash_.update(cdx['timestamp']) + key = hash_.hexdigest() + + print('KEY: ', key) + print('CACHE: ', str(self.cache)) + + if not key in self.cache: + print('MISS') + response = wbresponse_func() + + with NamedTemporaryFile(delete=False) as fh: + for obj in response.body: + fh.write(obj) + + name = fh.name + + spec = dict(name=fh.name, + headers=response.status_headers.headers) + + print('SET CACHE: ' + key) + self.cache[key] = yaml.dump(spec) + else: + print('HIT') + spec = yaml.load(self.cache[key]) + spec['headers'] = [tuple(x) for x in spec['headers']] + + print(spec['headers']) + print('TEMP FILE: ' + spec['name']) + filelen = os.path.getsize(spec['name']) + + range_h = range_h.rstrip() + + if range_h == 'bytes=0-': + print('FIX RANGE') + range_h = 'bytes=0-120000' + + parts = range_h.rstrip().split('-') + start = parts[0] + start = start.split('=')[1] + start = int(start) + + maxlen = filelen - start + + if len(parts) == 2 and parts[1]: + maxlen = min(maxlen, int(parts[1]) - start + 1) + + def read_range(): + with open(spec['name']) as fh: + fh.seek(start) + fh = LimitReader.wrap_stream(fh, maxlen) + while True: + buf = fh.read() + print('READ: ', len(buf)) + if not buf: + break + + yield buf + + + content_range = 'bytes {0}-{1}/{2}'.format(start, + start + maxlen - 1, + filelen) + + print('CONTENT_RANGE: ', content_range) + status_headers = StatusAndHeaders('206 Partial Content', spec['headers']) + status_headers.replace_header('Content-Range', content_range) + status_headers.replace_header('Content-Length', str(maxlen)) + return status_headers, read_range() + + +#================================================================= +range_cache = RangeCache() diff --git a/pywb/webapp/replay_views.py b/pywb/webapp/replay_views.py index 3e77d288..b8db1c4e 100644 --- a/pywb/webapp/replay_views.py +++ b/pywb/webapp/replay_views.py @@ -15,6 +15,8 @@ from pywb.warc.recordloader import ArchiveLoadFailed from views import J2TemplateView, add_env_globals from views import J2HtmlCapturesView, HeadInsertView +from rangecache import range_cache + #================================================================= class CaptureException(WbException): @@ -77,7 +79,7 @@ class ReplayView(object): first = False - response = self.replay_capture(wbrequest, + response = self.cached_replay_capture(wbrequest, cdx, cdx_loader, failed_files) @@ -99,6 +101,23 @@ class ReplayView(object): raise last_e + + def cached_replay_capture(self, wbrequest, cdx, cdx_loader, failed_files): + def get_capture(): + return self.replay_capture(wbrequest, cdx, cdx_loader, failed_files) + + range_status, range_iter = range_cache(wbrequest, + cdx, + get_capture) + if range_status and range_iter: + response = self.response_class(range_status, + range_iter, + wbrequest=wbrequest, + cdx=cdx) + return response + + return get_capture() + def replay_capture(self, wbrequest, cdx, cdx_loader, failed_files): (status_headers, stream) = (self.content_loader. resolve_headers_and_payload(cdx, diff --git a/tests/test_live_rewriter.py b/tests/test_live_rewriter.py index 77933463..104c55c7 100644 --- a/tests/test_live_rewriter.py +++ b/tests/test_live_rewriter.py @@ -38,4 +38,9 @@ class TestLiveRewriter: resp = self.testapp.get('/rewrite/@#$@#$', status=400) assert resp.status_int == 400 + def test_live_video_info(self): + resp = self.testapp.get('/rewrite/vi_/https://www.youtube.com/watch?v=DjFZyFWSt1M') + assert resp.status_int == 200 + assert resp.content_type == 'application/vnd.youtube-dl_formats+json', resp.content_type +