From 66f5ad62b33efff5598402bdc509ed2b7b9ba83f Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 18 Jul 2015 23:28:14 -0700 Subject: [PATCH] memento: when `redir_to_exact` is false, don't redirect latest replay/timegate to the current timestamp, but directly return the latest capture. When memento is enabled, the timegate now follows memento pattern 2.2 (http://tools.ietf.org/html/rfc7089#section-4.2.2) and returns Content-Location instead of Location. Update memento no-redirect tests to match new behavior. closes #122 --- pywb/framework/memento.py | 26 +++++++++++++++------ pywb/webapp/replay_views.py | 19 ++++++++-------- tests/test_config_memento.yaml | 2 +- tests/test_memento.py | 41 +++++++++++++++++++++++++--------- 4 files changed, 61 insertions(+), 27 deletions(-) diff --git a/pywb/framework/memento.py b/pywb/framework/memento.py index 3e923ed5..eff48152 100644 --- a/pywb/framework/memento.py +++ b/pywb/framework/memento.py @@ -42,6 +42,7 @@ class MementoRequest(MementoReqMixin, WbRequest): class MementoRespMixin(object): def _init_derived(self, params): wbrequest = params.get('wbrequest') + is_redirect = params.get('memento_is_redir', False) cdx = params.get('cdx') if not wbrequest or not wbrequest.wb_url: @@ -50,7 +51,7 @@ class MementoRespMixin(object): mod = wbrequest.options.get('replay_mod', '') #is_top_frame = wbrequest.wb_url.is_top_frame - is_top_frame = wbrequest.options.get('is_top_frame') + is_top_frame = wbrequest.options.get('is_top_frame', False) is_timegate = (wbrequest.options.get('is_timegate', False) and not is_top_frame) @@ -60,6 +61,7 @@ class MementoRespMixin(object): # Determine if memento: is_memento = False + is_original = False # if no cdx included, not a memento, unless top-frame special if not cdx: @@ -71,10 +73,13 @@ class MementoRespMixin(object): # otherwise, if in proxy mode, then always a memento elif wbrequest.options['is_proxy']: is_memento = True + is_original = True # otherwise only if timestamp replay (and not a timegate) - 
elif not is_timegate: - is_memento = (wbrequest.wb_url.type == wbrequest.wb_url.REPLAY) + #elif not is_timegate: + # is_memento = (wbrequest.wb_url.type == wbrequest.wb_url.REPLAY) + elif not is_redirect: + is_memento = (wbrequest.wb_url.is_replay()) link = [] req_url = wbrequest.wb_url.url @@ -101,11 +106,18 @@ class MementoRespMixin(object): timestamp=ts, url=url) - link.append(self.make_memento_link(canon_link, - 'memento', - http_date)) + # Must set content location + if is_memento and is_timegate: + self.status_headers.headers.append(('Content-Location', + canon_link)) - if is_memento and is_timegate: + # don't set memento link for very long urls... + if len(canon_link) < 512: + link.append(self.make_memento_link(canon_link, + 'memento', + http_date)) + + if is_original and is_timegate: link.append(self.make_link(req_url, 'original timegate')) else: link.append(self.make_link(req_url, 'original')) diff --git a/pywb/webapp/replay_views.py b/pywb/webapp/replay_views.py index a0ce23d0..832cf66c 100644 --- a/pywb/webapp/replay_views.py +++ b/pywb/webapp/replay_views.py @@ -233,6 +233,9 @@ class ReplayView(object): return chain(iter([content]), iterator) def _redirect_if_needed(self, wbrequest, cdx): + if not self.redir_to_exact: + return None + if wbrequest.options['is_proxy']: return None @@ -243,10 +246,7 @@ class ReplayView(object): if not is_timegate: is_timegate = wbrequest.wb_url.is_latest_replay() - redir_needed = is_timegate - - if not redir_needed and self.redir_to_exact: - redir_needed = (cdx['timestamp'] != wbrequest.wb_url.timestamp) + redir_needed = is_timegate or (cdx['timestamp'] != wbrequest.wb_url.timestamp) if not redir_needed: return None @@ -254,10 +254,10 @@ class ReplayView(object): if self.enable_range_cache and wbrequest.extract_range(): return None - if is_timegate and not self.redir_to_exact: - timestamp = timestamp_now() - else: - timestamp = cdx['timestamp'] + #if is_timegate: + # timestamp = timestamp_now() + #else: + timestamp = 
cdx['timestamp'] new_url = (wbrequest.urlrewriter. get_new_url(timestamp=timestamp, @@ -279,7 +279,8 @@ class ReplayView(object): return self.response_class(status_headers, wbrequest=wbrequest, - cdx=cdx) + cdx=cdx, + memento_is_redir=True) def _reject_self_redirect(self, wbrequest, cdx, status_headers): """ diff --git a/tests/test_config_memento.yaml b/tests/test_config_memento.yaml index 91d4c624..cd6c515b 100644 --- a/tests/test_config_memento.yaml +++ b/tests/test_config_memento.yaml @@ -6,7 +6,7 @@ collections: pywb: ./sample_archive/cdx/ - pywb-non-exact: + pywb-no-redir: index_paths: ./sample_archive/cdx/ redir_to_exact: false diff --git a/tests/test_memento.py b/tests/test_memento.py index 5197bda4..dd182275 100644 --- a/tests/test_memento.py +++ b/tests/test_memento.py @@ -43,30 +43,29 @@ class TestMemento(MementoMixin): assert '/pywb/20140127171239/http://www.iana.org/_css/2013.1/screen.css' in resp.headers['Location'] - # timegate with latest memento, but redirect to current timestamp url instead of - # memento timestamp - def test_timegate_latest_request_timestamp(self): + # timegate with latest memento, but no redirect + def test_timegate_memento_no_redir_latest(self): """ TimeGate with no Accept-Datetime header """ dt = 'Mon, 27 Jan 2014 17:12:39 GMT' - resp = self.testapp.get('/pywb-non-exact/http://www.iana.org/_css/2013.1/screen.css') + resp = self.testapp.get('/pywb-no-redir/http://www.iana.org/_css/2013.1/screen.css') - assert resp.status_int == 302 + assert resp.status_int == 200 assert resp.headers[VARY] == 'accept-datetime' links = self.get_links(resp) assert '; rel="original"' in links - assert self.make_timemap_link('http://www.iana.org/_css/2013.1/screen.css', coll='pywb-non-exact') in links - assert self.make_memento_link('http://www.iana.org/_css/2013.1/screen.css', '20140127171239', dt, coll='pywb-non-exact') in links + assert self.make_timemap_link('http://www.iana.org/_css/2013.1/screen.css', coll='pywb-no-redir') in links + assert 
self.make_memento_link('http://www.iana.org/_css/2013.1/screen.css', '20140127171239', dt, coll='pywb-no-redir') in links - assert MEMENTO_DATETIME not in resp.headers + assert MEMENTO_DATETIME in resp.headers - assert '/pywb-non-exact/' in resp.headers['Location'] + assert '/pywb-no-redir/' in resp.headers['Content-Location'] - wburl = resp.headers['Location'].split('/pywb-non-exact/')[-1] + wburl = resp.headers['Content-Location'].split('/pywb-no-redir/')[-1] ts = wburl.split('/')[0] assert len(ts) == 14 assert timestamp_now() >= ts @@ -115,6 +114,28 @@ class TestMemento(MementoMixin): assert '/pywb/20140126200804/http://www.iana.org/_css/2013.1/screen.css' in resp.headers['Location'] + def test_timegate_memento_no_redir_accept_datetime_inexact(self): + """ + TimeGate with Accept-Datetime header, not matching a memento exactly, no redirect + """ + dt = 'Sun, 26 Jan 2014 20:08:04 GMT' + request_dt = 'Sun, 26 Jan 2014 20:08:00 GMT' + headers = {ACCEPT_DATETIME: request_dt} + resp = self.testapp.get('/pywb-no-redir/http://www.iana.org/_css/2013.1/screen.css', headers=headers) + + assert resp.status_int == 200 + + assert resp.headers[VARY] == 'accept-datetime' + + links = self.get_links(resp) + assert '; rel="original"' in links + assert self.make_timemap_link('http://www.iana.org/_css/2013.1/screen.css', coll='pywb-no-redir') in links + assert self.make_memento_link('http://www.iana.org/_css/2013.1/screen.css', '20140126200804', dt, coll='pywb-no-redir') == links[0], links[0] + + assert MEMENTO_DATETIME in resp.headers + + assert '/pywb-no-redir/20140126200804/http://www.iana.org/_css/2013.1/screen.css' in resp.headers['Content-Location'] + def test_non_timegate_intermediate_redir(self): """ Not a timegate, but an 'intermediate resource', redirect to closest timestamp