From 51919ed1e73c4665f8508353c9ddb5ba82bba034 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 23 Dec 2014 14:34:59 -0800 Subject: [PATCH] replay: make range cache available by default in replay_views since its inited on first use. remove separate subclass. 'enable_ranges' can be set to false to disable range cache altogether improve tests --- pywb/utils/test/test_loaders.py | 2 ++ pywb/webapp/cached_replay.py | 34 ---------------------------- pywb/webapp/handlers.py | 6 +---- pywb/webapp/rangecache.py | 15 ------------- pywb/webapp/replay_views.py | 39 +++++++++++++++++++++++++++++---- tests/test_config.yaml | 4 ++-- tests/test_integration.py | 33 ++++++++++++++++++---------- 7 files changed, 61 insertions(+), 72 deletions(-) delete mode 100644 pywb/webapp/cached_replay.py diff --git a/pywb/utils/test/test_loaders.py b/pywb/utils/test/test_loaders.py index 12ad7c44..1da5d71e 100644 --- a/pywb/utils/test/test_loaders.py +++ b/pywb/utils/test/test_loaders.py @@ -53,6 +53,8 @@ True >>> extract_client_cookie(dict(HTTP_COOKIE='a=b; c=d'), 'x') +>>> extract_client_cookie(dict(HTTP_COOKIE='x'), 'x') + >>> extract_client_cookie({}, 'y') """ diff --git a/pywb/webapp/cached_replay.py b/pywb/webapp/cached_replay.py deleted file mode 100644 index ea2b293d..00000000 --- a/pywb/webapp/cached_replay.py +++ /dev/null @@ -1,34 +0,0 @@ -from rangecache import range_cache -from replay_views import ReplayView - - -#================================================================= -class CachedReplayView(ReplayView): - """ - Extension for ReplayView supporting loading via the rangecache - """ - def replay_capture(self, wbrequest, cdx, cdx_loader, failed_files): - def get_capture(): - return super(CachedReplayView, self).replay_capture( - wbrequest, - cdx, - cdx_loader, - failed_files) - - range_status, range_iter = range_cache(wbrequest, - cdx.get('digest'), - get_capture) - if range_status and range_iter: - response = self.response_class(range_status, - range_iter, - wbrequest=wbrequest, - cdx=cdx) - return response - - return get_capture() - - def _redirect_if_needed(self, wbrequest, cdx): - if wbrequest.extract_range(): - return None - - return super(CachedReplayView, self)._redirect_if_needed(wbrequest, cdx) diff --git a/pywb/webapp/handlers.py b/pywb/webapp/handlers.py index fe885fa6..e6ce156a 100644 --- a/pywb/webapp/handlers.py +++ b/pywb/webapp/handlers.py @@ -120,11 +120,7 @@ class WBHandler(SearchPageWbUrlHandler): resolving_loader = ResolvingLoader(paths=paths, record_loader=record_loader) - enable_cache = config.get('enable_cache') - if enable_cache: - self.replay = CachedReplayView(resolving_loader, config) - else: - self.replay = ReplayView(resolving_loader, config) + self.replay = ReplayView(resolving_loader, config) self.fallback_handler = None self.fallback_name = config.get('fallback') diff --git a/pywb/webapp/rangecache.py b/pywb/webapp/rangecache.py index 09da656b..9a0516be 100644 --- a/pywb/webapp/rangecache.py +++ b/pywb/webapp/rangecache.py @@ -24,21 +24,6 @@ class RangeCache(object): shutil.rmtree(self.temp_dir, True) self.temp_dir = None - def __call__(self, wbrequest, digest, wbresponse_func): - result = wbrequest.extract_range() - if not result: - return None, None - - # no longer needed -- handled at frontend rewrite - #if wbrequest.env.get('HTTP_X_IGNORE_RANGE_ARG'): - # wbrequest.wb_url.url = result[0] - # return None, None - - return self.handle_range(wbrequest, - digest, - wbresponse_func, - *result) - def handle_range(self, wbrequest, digest, wbresponse_func, url, start, end, use_206): diff --git a/pywb/webapp/replay_views.py b/pywb/webapp/replay_views.py index b0b8bffc..33b9908b 100644 --- a/pywb/webapp/replay_views.py +++ b/pywb/webapp/replay_views.py @@ -15,6 +15,8 @@ from pywb.warc.recordloader import ArchiveLoadFailed from views import J2TemplateView, add_env_globals from views import J2HtmlCapturesView, HeadInsertView +from rangecache import range_cache + #================================================================= class CaptureException(WbException): @@ -49,6 +51,8 @@ class ReplayView(object): else: self.response_class = WbResponse + self.enable_range_cache = config.get('enable_ranges', True) + self._reporter = config.get('reporter') def render_content(self, wbrequest, cdx_lines, cdx_loader): @@ -77,10 +81,10 @@ class ReplayView(object): first = False - response = self.replay_capture(wbrequest, - cdx, - cdx_loader, - failed_files) + response = self.cached_replay_capture(wbrequest, + cdx, + cdx_loader, + failed_files) except (CaptureException, ArchiveLoadFailed) as ce: import traceback @@ -99,6 +103,33 @@ class ReplayView(object): raise last_e + def cached_replay_capture(self, wbrequest, cdx, cdx_loader, failed_files): + def get_capture(): + return self.replay_capture(wbrequest, + cdx, + cdx_loader, + failed_files) + + if not self.enable_range_cache: + return get_capture() + + range_info = wbrequest.extract_range() + + if not range_info: + return get_capture() + + range_status, range_iter = (range_cache. + handle_range(wbrequest, + cdx.get('digest'), + get_capture, + *range_info)) + + response = self.response_class(range_status, + range_iter, + wbrequest=wbrequest, + cdx=cdx) + return response + def replay_capture(self, wbrequest, cdx, cdx_loader, failed_files): (status_headers, stream) = (self.content_loader. resolve_headers_and_payload(cdx, diff --git a/tests/test_config.yaml b/tests/test_config.yaml index 79a62cfe..ad010789 100644 --- a/tests/test_config.yaml +++ b/tests/test_config.yaml @@ -38,9 +38,9 @@ collections: index_paths: ./sample_archive/cdx/ fallback: live - pywb-rangecache: + pywb-norange: index_paths: ./sample_archive/cdx/ - enable_cache: true + enable_ranges: False # indicate if cdx files are sorted by SURT keys -- eg: com,example)/ # SURT keys are recommended for future indices, but non-SURT cdxs diff --git a/tests/test_integration.py b/tests/test_integration.py index b49248b8..4dd7ba35 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -101,15 +101,6 @@ class TestWb: assert 'new _WBWombat' in resp.body, resp.body assert '/pywb/20140127171238/http://www.iana.org/time-zones"' in resp.body - def test_replay_content_with_rangecache(self): - resp = self.testapp.get('/pywb-rangecache/20140127171238/http://www.iana.org/') - self._assert_basic_html(resp) - - assert '"20140127171238"' in resp.body - assert 'wb.js' in resp.body - assert 'new _WBWombat' in resp.body, resp.body - assert '/pywb-rangecache/20140127171238/http://www.iana.org/time-zones"' in resp.body - def test_replay_non_frame_content(self): resp = self.testapp.get('/pywb-nonframe/20140127171238/http://www.iana.org/') self._assert_basic_html(resp) @@ -142,6 +133,11 @@ class TestWb: assert 'wb.js' in resp.body assert '/pywb/20130729195151/http://www.iana.org/domains/example"' in resp.body + def test_video_info_not_found(self): + # not actually archived, but ensure video info path is tested + resp = self.testapp.get('/pywb/vi_/https://www.youtube.com/watch?v=DjFZyFWSt1M', status=404) + assert resp.status_int == 404 + def test_replay_cdx_mod(self): resp = self.testapp.get('/pywb/20140127171239cdx_/http://www.iana.org/_css/2013.1/print.css') self._assert_basic_text(resp) @@ -177,7 +173,7 @@ class TestWb: def test_replay_range_cache_content(self): headers = [('Range', 'bytes=0-200')] - resp = self.testapp.get('/pywb-rangecache/20140127171251id_/http://example.com', headers=headers) + resp = self.testapp.get('/pywb/20140127171251id_/http://example.com', headers=headers) assert resp.status_int == 206 assert resp.headers['Accept-Ranges'] == 'bytes' @@ -186,9 +182,22 @@ class TestWb: assert 'wb.js' not in resp.body + def test_replay_content_ignore_range(self): + headers = [('Range', 'bytes=0-200')] + resp = self.testapp.get('/pywb-norange/20140127171251id_/http://example.com', headers=headers) + + # range request ignored + assert resp.status_int == 200 + + # full response + assert resp.content_length == 1270, resp.content_length + + # identity, no header insertion + assert 'wb.js' not in resp.body + def test_replay_range_cache_content_bound_end(self): headers = [('Range', 'bytes=10-10000')] - resp = self.testapp.get('/pywb-rangecache/20140127171251id_/http://example.com', headers=headers) + resp = self.testapp.get('/pywb/20140127171251id_/http://example.com', headers=headers) assert resp.status_int == 206 assert resp.headers['Accept-Ranges'] == 'bytes' @@ -201,7 +210,7 @@ class TestWb: def test_replay_redir_no_cache(self): headers = [('Range', 'bytes=10-10000')] # Range ignored - resp = self.testapp.get('/pywb-rangecache/20140126200927/http://www.iana.org/domains/root/db/', headers=headers) + resp = self.testapp.get('/pywb/20140126200927/http://www.iana.org/domains/root/db/', headers=headers) assert resp.content_length == 0 assert resp.status_int == 302