mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
replay: make range cache available by default in replay_views, since it is
initialized on first use. Remove the separate subclass. 'enable_ranges' can be set to false to disable the range cache altogether. Improve tests.
This commit is contained in:
parent
3819e935fb
commit
51919ed1e7
@ -53,6 +53,8 @@ True
|
|||||||
|
|
||||||
>>> extract_client_cookie(dict(HTTP_COOKIE='a=b; c=d'), 'x')
|
>>> extract_client_cookie(dict(HTTP_COOKIE='a=b; c=d'), 'x')
|
||||||
|
|
||||||
|
>>> extract_client_cookie(dict(HTTP_COOKIE='x'), 'x')
|
||||||
|
|
||||||
>>> extract_client_cookie({}, 'y')
|
>>> extract_client_cookie({}, 'y')
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
@ -1,34 +0,0 @@
|
|||||||
from rangecache import range_cache
|
|
||||||
from replay_views import ReplayView
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
class CachedReplayView(ReplayView):
|
|
||||||
"""
|
|
||||||
Extension for ReplayView supporting loading via the rangecache
|
|
||||||
"""
|
|
||||||
def replay_capture(self, wbrequest, cdx, cdx_loader, failed_files):
|
|
||||||
def get_capture():
|
|
||||||
return super(CachedReplayView, self).replay_capture(
|
|
||||||
wbrequest,
|
|
||||||
cdx,
|
|
||||||
cdx_loader,
|
|
||||||
failed_files)
|
|
||||||
|
|
||||||
range_status, range_iter = range_cache(wbrequest,
|
|
||||||
cdx.get('digest'),
|
|
||||||
get_capture)
|
|
||||||
if range_status and range_iter:
|
|
||||||
response = self.response_class(range_status,
|
|
||||||
range_iter,
|
|
||||||
wbrequest=wbrequest,
|
|
||||||
cdx=cdx)
|
|
||||||
return response
|
|
||||||
|
|
||||||
return get_capture()
|
|
||||||
|
|
||||||
def _redirect_if_needed(self, wbrequest, cdx):
|
|
||||||
if wbrequest.extract_range():
|
|
||||||
return None
|
|
||||||
|
|
||||||
return super(CachedReplayView, self)._redirect_if_needed(wbrequest, cdx)
|
|
@ -120,11 +120,7 @@ class WBHandler(SearchPageWbUrlHandler):
|
|||||||
resolving_loader = ResolvingLoader(paths=paths,
|
resolving_loader = ResolvingLoader(paths=paths,
|
||||||
record_loader=record_loader)
|
record_loader=record_loader)
|
||||||
|
|
||||||
enable_cache = config.get('enable_cache')
|
self.replay = ReplayView(resolving_loader, config)
|
||||||
if enable_cache:
|
|
||||||
self.replay = CachedReplayView(resolving_loader, config)
|
|
||||||
else:
|
|
||||||
self.replay = ReplayView(resolving_loader, config)
|
|
||||||
|
|
||||||
self.fallback_handler = None
|
self.fallback_handler = None
|
||||||
self.fallback_name = config.get('fallback')
|
self.fallback_name = config.get('fallback')
|
||||||
|
@ -24,21 +24,6 @@ class RangeCache(object):
|
|||||||
shutil.rmtree(self.temp_dir, True)
|
shutil.rmtree(self.temp_dir, True)
|
||||||
self.temp_dir = None
|
self.temp_dir = None
|
||||||
|
|
||||||
def __call__(self, wbrequest, digest, wbresponse_func):
|
|
||||||
result = wbrequest.extract_range()
|
|
||||||
if not result:
|
|
||||||
return None, None
|
|
||||||
|
|
||||||
# no longer needed -- handled at frontend rewrite
|
|
||||||
#if wbrequest.env.get('HTTP_X_IGNORE_RANGE_ARG'):
|
|
||||||
# wbrequest.wb_url.url = result[0]
|
|
||||||
# return None, None
|
|
||||||
|
|
||||||
return self.handle_range(wbrequest,
|
|
||||||
digest,
|
|
||||||
wbresponse_func,
|
|
||||||
*result)
|
|
||||||
|
|
||||||
def handle_range(self, wbrequest, digest, wbresponse_func,
|
def handle_range(self, wbrequest, digest, wbresponse_func,
|
||||||
url, start, end, use_206):
|
url, start, end, use_206):
|
||||||
|
|
||||||
|
@ -15,6 +15,8 @@ from pywb.warc.recordloader import ArchiveLoadFailed
|
|||||||
from views import J2TemplateView, add_env_globals
|
from views import J2TemplateView, add_env_globals
|
||||||
from views import J2HtmlCapturesView, HeadInsertView
|
from views import J2HtmlCapturesView, HeadInsertView
|
||||||
|
|
||||||
|
from rangecache import range_cache
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class CaptureException(WbException):
|
class CaptureException(WbException):
|
||||||
@ -49,6 +51,8 @@ class ReplayView(object):
|
|||||||
else:
|
else:
|
||||||
self.response_class = WbResponse
|
self.response_class = WbResponse
|
||||||
|
|
||||||
|
self.enable_range_cache = config.get('enable_ranges', True)
|
||||||
|
|
||||||
self._reporter = config.get('reporter')
|
self._reporter = config.get('reporter')
|
||||||
|
|
||||||
def render_content(self, wbrequest, cdx_lines, cdx_loader):
|
def render_content(self, wbrequest, cdx_lines, cdx_loader):
|
||||||
@ -77,10 +81,10 @@ class ReplayView(object):
|
|||||||
|
|
||||||
first = False
|
first = False
|
||||||
|
|
||||||
response = self.replay_capture(wbrequest,
|
response = self.cached_replay_capture(wbrequest,
|
||||||
cdx,
|
cdx,
|
||||||
cdx_loader,
|
cdx_loader,
|
||||||
failed_files)
|
failed_files)
|
||||||
|
|
||||||
except (CaptureException, ArchiveLoadFailed) as ce:
|
except (CaptureException, ArchiveLoadFailed) as ce:
|
||||||
import traceback
|
import traceback
|
||||||
@ -99,6 +103,33 @@ class ReplayView(object):
|
|||||||
|
|
||||||
raise last_e
|
raise last_e
|
||||||
|
|
||||||
|
def cached_replay_capture(self, wbrequest, cdx, cdx_loader, failed_files):
|
||||||
|
def get_capture():
|
||||||
|
return self.replay_capture(wbrequest,
|
||||||
|
cdx,
|
||||||
|
cdx_loader,
|
||||||
|
failed_files)
|
||||||
|
|
||||||
|
if not self.enable_range_cache:
|
||||||
|
return get_capture()
|
||||||
|
|
||||||
|
range_info = wbrequest.extract_range()
|
||||||
|
|
||||||
|
if not range_info:
|
||||||
|
return get_capture()
|
||||||
|
|
||||||
|
range_status, range_iter = (range_cache.
|
||||||
|
handle_range(wbrequest,
|
||||||
|
cdx.get('digest'),
|
||||||
|
get_capture,
|
||||||
|
*range_info))
|
||||||
|
|
||||||
|
response = self.response_class(range_status,
|
||||||
|
range_iter,
|
||||||
|
wbrequest=wbrequest,
|
||||||
|
cdx=cdx)
|
||||||
|
return response
|
||||||
|
|
||||||
def replay_capture(self, wbrequest, cdx, cdx_loader, failed_files):
|
def replay_capture(self, wbrequest, cdx, cdx_loader, failed_files):
|
||||||
(status_headers, stream) = (self.content_loader.
|
(status_headers, stream) = (self.content_loader.
|
||||||
resolve_headers_and_payload(cdx,
|
resolve_headers_and_payload(cdx,
|
||||||
|
@ -38,9 +38,9 @@ collections:
|
|||||||
index_paths: ./sample_archive/cdx/
|
index_paths: ./sample_archive/cdx/
|
||||||
fallback: live
|
fallback: live
|
||||||
|
|
||||||
pywb-rangecache:
|
pywb-norange:
|
||||||
index_paths: ./sample_archive/cdx/
|
index_paths: ./sample_archive/cdx/
|
||||||
enable_cache: true
|
enable_ranges: False
|
||||||
|
|
||||||
# indicate if cdx files are sorted by SURT keys -- eg: com,example)/
|
# indicate if cdx files are sorted by SURT keys -- eg: com,example)/
|
||||||
# SURT keys are recommended for future indices, but non-SURT cdxs
|
# SURT keys are recommended for future indices, but non-SURT cdxs
|
||||||
|
@ -101,15 +101,6 @@ class TestWb:
|
|||||||
assert 'new _WBWombat' in resp.body, resp.body
|
assert 'new _WBWombat' in resp.body, resp.body
|
||||||
assert '/pywb/20140127171238/http://www.iana.org/time-zones"' in resp.body
|
assert '/pywb/20140127171238/http://www.iana.org/time-zones"' in resp.body
|
||||||
|
|
||||||
def test_replay_content_with_rangecache(self):
|
|
||||||
resp = self.testapp.get('/pywb-rangecache/20140127171238/http://www.iana.org/')
|
|
||||||
self._assert_basic_html(resp)
|
|
||||||
|
|
||||||
assert '"20140127171238"' in resp.body
|
|
||||||
assert 'wb.js' in resp.body
|
|
||||||
assert 'new _WBWombat' in resp.body, resp.body
|
|
||||||
assert '/pywb-rangecache/20140127171238/http://www.iana.org/time-zones"' in resp.body
|
|
||||||
|
|
||||||
def test_replay_non_frame_content(self):
|
def test_replay_non_frame_content(self):
|
||||||
resp = self.testapp.get('/pywb-nonframe/20140127171238/http://www.iana.org/')
|
resp = self.testapp.get('/pywb-nonframe/20140127171238/http://www.iana.org/')
|
||||||
self._assert_basic_html(resp)
|
self._assert_basic_html(resp)
|
||||||
@ -142,6 +133,11 @@ class TestWb:
|
|||||||
assert 'wb.js' in resp.body
|
assert 'wb.js' in resp.body
|
||||||
assert '/pywb/20130729195151/http://www.iana.org/domains/example"' in resp.body
|
assert '/pywb/20130729195151/http://www.iana.org/domains/example"' in resp.body
|
||||||
|
|
||||||
|
def test_video_info_not_found(self):
|
||||||
|
# not actually archived, but ensure video info path is tested
|
||||||
|
resp = self.testapp.get('/pywb/vi_/https://www.youtube.com/watch?v=DjFZyFWSt1M', status=404)
|
||||||
|
assert resp.status_int == 404
|
||||||
|
|
||||||
def test_replay_cdx_mod(self):
|
def test_replay_cdx_mod(self):
|
||||||
resp = self.testapp.get('/pywb/20140127171239cdx_/http://www.iana.org/_css/2013.1/print.css')
|
resp = self.testapp.get('/pywb/20140127171239cdx_/http://www.iana.org/_css/2013.1/print.css')
|
||||||
self._assert_basic_text(resp)
|
self._assert_basic_text(resp)
|
||||||
@ -177,7 +173,7 @@ class TestWb:
|
|||||||
|
|
||||||
def test_replay_range_cache_content(self):
|
def test_replay_range_cache_content(self):
|
||||||
headers = [('Range', 'bytes=0-200')]
|
headers = [('Range', 'bytes=0-200')]
|
||||||
resp = self.testapp.get('/pywb-rangecache/20140127171251id_/http://example.com', headers=headers)
|
resp = self.testapp.get('/pywb/20140127171251id_/http://example.com', headers=headers)
|
||||||
|
|
||||||
assert resp.status_int == 206
|
assert resp.status_int == 206
|
||||||
assert resp.headers['Accept-Ranges'] == 'bytes'
|
assert resp.headers['Accept-Ranges'] == 'bytes'
|
||||||
@ -186,9 +182,22 @@ class TestWb:
|
|||||||
|
|
||||||
assert 'wb.js' not in resp.body
|
assert 'wb.js' not in resp.body
|
||||||
|
|
||||||
|
def test_replay_content_ignore_range(self):
|
||||||
|
headers = [('Range', 'bytes=0-200')]
|
||||||
|
resp = self.testapp.get('/pywb-norange/20140127171251id_/http://example.com', headers=headers)
|
||||||
|
|
||||||
|
# range request ignored
|
||||||
|
assert resp.status_int == 200
|
||||||
|
|
||||||
|
# full response
|
||||||
|
assert resp.content_length == 1270, resp.content_length
|
||||||
|
|
||||||
|
# identity, no header insertion
|
||||||
|
assert 'wb.js' not in resp.body
|
||||||
|
|
||||||
def test_replay_range_cache_content_bound_end(self):
|
def test_replay_range_cache_content_bound_end(self):
|
||||||
headers = [('Range', 'bytes=10-10000')]
|
headers = [('Range', 'bytes=10-10000')]
|
||||||
resp = self.testapp.get('/pywb-rangecache/20140127171251id_/http://example.com', headers=headers)
|
resp = self.testapp.get('/pywb/20140127171251id_/http://example.com', headers=headers)
|
||||||
|
|
||||||
assert resp.status_int == 206
|
assert resp.status_int == 206
|
||||||
assert resp.headers['Accept-Ranges'] == 'bytes'
|
assert resp.headers['Accept-Ranges'] == 'bytes'
|
||||||
@ -201,7 +210,7 @@ class TestWb:
|
|||||||
def test_replay_redir_no_cache(self):
|
def test_replay_redir_no_cache(self):
|
||||||
headers = [('Range', 'bytes=10-10000')]
|
headers = [('Range', 'bytes=10-10000')]
|
||||||
# Range ignored
|
# Range ignored
|
||||||
resp = self.testapp.get('/pywb-rangecache/20140126200927/http://www.iana.org/domains/root/db/', headers=headers)
|
resp = self.testapp.get('/pywb/20140126200927/http://www.iana.org/domains/root/db/', headers=headers)
|
||||||
assert resp.content_length == 0
|
assert resp.content_length == 0
|
||||||
assert resp.status_int == 302
|
assert resp.status_int == 302
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user