From 55426e76196898d2b94659b5a03c9539e074fe94 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 29 Jan 2015 22:27:15 -0800 Subject: [PATCH] memento: fix headers to be more consistent for framed replay. When using frames, the outer frame 'mirrors' the mementos of the inner frame so that they are discoverable by client-side memento tools. Tracked via #70 --- pywb/framework/memento.py | 34 +++++++++++++------ pywb/rewrite/url_rewriter.py | 2 +- tests/test_memento.py | 65 +++++++++++++++++++++++++++--------- 3 files changed, 73 insertions(+), 28 deletions(-) diff --git a/pywb/framework/memento.py b/pywb/framework/memento.py index 1fd2069c..d1677be7 100644 --- a/pywb/framework/memento.py +++ b/pywb/framework/memento.py @@ -74,14 +74,23 @@ class MementoRespMixin(object): link = [] - if is_memento and cdx: - http_date = timestamp_to_http_date(cdx['timestamp']) - self.status_headers.headers.append(('Memento-Datetime', http_date)) + if is_memento: + if cdx: + http_date = timestamp_to_http_date(cdx['timestamp']) + # for top frame + elif wbrequest.wb_url.timestamp: + http_date = timestamp_to_http_date(wbrequest.wb_url.timestamp) + else: + http_date = None - elif is_memento and is_top_frame and wbrequest.wb_url.timestamp: - # top frame special case - canon_link = wbrequest.urlrewriter.get_new_url(mod='') - link.append(self.make_link(canon_link, 'memento')) + if http_date: + self.status_headers.headers.append(('Memento-Datetime', + http_date)) + + canon_link = wbrequest.urlrewriter.get_new_url(mod='') + link.append(self.make_memento_link(canon_link, + 'memento', + http_date)) req_url = wbrequest.wb_url.url @@ -105,6 +114,9 @@ class MementoRespMixin(object): def make_link(self, url, type): return '<{0}>; rel="{1}"'.format(url, type) + def make_memento_link(self, url, type_, dt): + return '<{0}>; rel="{1}"; datetime="{2}"'.format(url, type_, dt) + def make_timemap_link(self, wbrequest): format_ = '<{0}>; rel="timemap"; type="{1}"' @@ -121,7 +133,7 @@ class MementoResponse(MementoRespMixin, 
WbResponse): #================================================================= -def make_memento_link(cdx, prefix, datetime=None, rel='memento', end=',\n'): +def make_timemap_memento_link(cdx, prefix, datetime=None, rel='memento', end=',\n'): memento = '<{0}>; rel="{1}"; datetime="{2}"' + end string = WbUrl.to_wburl_str(url=cdx['original'], @@ -161,17 +173,17 @@ def make_timemap(wbrequest, cdx_lines): yield timegate.format(prefix + url) # first memento link - yield make_memento_link(first_cdx, prefix, + yield make_timemap_memento_link(first_cdx, prefix, datetime=from_date) prev_cdx = None for cdx in cdx_lines: if prev_cdx: - yield make_memento_link(prev_cdx, prefix) + yield make_timemap_memento_link(prev_cdx, prefix) prev_cdx = cdx # last memento link, if any if prev_cdx: - yield make_memento_link(prev_cdx, prefix, end='') + yield make_timemap_memento_link(prev_cdx, prefix, end='') diff --git a/pywb/rewrite/url_rewriter.py b/pywb/rewrite/url_rewriter.py index 201b6016..77f0acc4 100644 --- a/pywb/rewrite/url_rewriter.py +++ b/pywb/rewrite/url_rewriter.py @@ -151,7 +151,7 @@ class HttpsUrlRewriter(UrlRewriter): return self.remove_https(url) def get_new_url(self, **kwargs): - return kwargs.get('url') + return kwargs.get('url', self.wburl.url) def rebase_rewriter(self, new_url): return self diff --git a/tests/test_memento.py b/tests/test_memento.py index 32b73e40..6c79915c 100644 --- a/tests/test_memento.py +++ b/tests/test_memento.py @@ -1,4 +1,5 @@ import webtest +import re from pywb.webapp.pywb_init import create_wb_router from pywb.framework.wsgi_wrappers import init_app from pywb.cdx.cdxobject import CDXObject @@ -20,12 +21,16 @@ class TestWb: self.testapp = webtest.TestApp(self.app) def get_links(self, resp): - return map(lambda x: x.strip(), resp.headers[LINK].split(',')) + return map(lambda x: x.strip(), re.split(', (?![0-9])', resp.headers[LINK])) def make_timemap_link(self, url): format_ = '; rel="timemap"; type="{1}"' return format_.format(url, LINK_FORMAT) 
+ def make_memento_link(self, url, ts, dt): + format_ = '; rel="memento"; datetime="{2}"' + return format_.format(url, ts, dt) + # Below functionality is for archival (non-proxy) mode # It is designed to conform to Memento protocol Pattern 2.1 # http://www.mementoweb.org/guide/rfc/#Pattern2.1 @@ -93,15 +98,37 @@ class TestWb: assert '/pywb/20140127171239/' in resp.headers['Location'] - def test_top_frame_no_date(self): + def test_top_frame(self): """ - A top-frame request with no date, must treat as intermediate + A top-frame request with no date, not returning memento-datetime Include timemap, timegate, original headers """ - headers = {ACCEPT_DATETIME: 'Sun, 26 Jan 2014 20:08:04'} + resp = self.testapp.get('/pywb/tf_/http://www.iana.org/_css/2013.1/screen.css') - # not a timegate, ignore ACCEPT_DATETIME + assert resp.status_int == 200 + + # no vary header + assert VARY not in resp.headers + + # not memento-datetime + assert MEMENTO_DATETIME not in resp.headers + + links = self.get_links(resp) + assert '; rel="original"' in links + assert '; rel="timegate"' in links + assert self.make_timemap_link('http://www.iana.org/_css/2013.1/screen.css') in links + + def test_top_frame_no_date_accept_datetime(self): + """ + A top-frame request with no date, reflects back accept-datetime date + Include timemap, timegate, original headers, and memento-datetime + """ + + dt = 'Sun, 26 Jan 2014 20:08:04 GMT' + headers = {ACCEPT_DATETIME: dt} + + # not a timegate, but use ACCEPT_DATETIME to infer memento for top frame resp = self.testapp.get('/pywb/tf_/http://www.iana.org/_css/2013.1/screen.css', headers=headers) assert resp.status_int == 200 @@ -109,40 +136,42 @@ class TestWb: # no vary header assert VARY not in resp.headers - # no memento-datetime - assert MEMENTO_DATETIME not in resp.headers + # memento-datetime matches + assert resp.headers[MEMENTO_DATETIME] == dt links = self.get_links(resp) assert '; rel="original"' in links assert '; rel="timegate"' in links + assert 
self.make_memento_link('http://www.iana.org/_css/2013.1/screen.css', '20140126200804', dt) in links assert self.make_timemap_link('http://www.iana.org/_css/2013.1/screen.css') in links def test_top_frame_with_date(self): """ - A top-frame request with date, treat as intermediate - Include timemap, timegate, original headers and a link to the possible memento + A top-frame request with date, treat as memento + Include timemap, timegate, original headers, memento and memento-datetime """ - headers = {ACCEPT_DATETIME: 'Sun, 26 Jan 2014 20:08:04'} + dt = 'Sun, 26 Jan 2014 20:08:04 GMT' + headers = {ACCEPT_DATETIME: dt} - # not a timegate, ignore ACCEPT_DATETIME - resp = self.testapp.get('/pywb/20141012tf_/http://www.iana.org/_css/2013.1/screen.css', headers=headers) + # not a timegate, ignore ACCEPT_DATETIME, but use provided timestamp as memento-datetime + resp = self.testapp.get('/pywb/20141012000000tf_/http://www.iana.org/_css/2013.1/screen.css', headers=headers) assert resp.status_int == 200 # no vary header assert VARY not in resp.headers - # no memento-datetime - assert MEMENTO_DATETIME not in resp.headers + dt = 'Sun, 12 Oct 2014 00:00:00 GMT' + # memento-datetime matches + assert resp.headers[MEMENTO_DATETIME] == dt links = self.get_links(resp) assert '; rel="original"' in links assert '; rel="timegate"' in links + assert self.make_memento_link('http://www.iana.org/_css/2013.1/screen.css', '20141012000000', dt) in links, links assert self.make_timemap_link('http://www.iana.org/_css/2013.1/screen.css') in links - assert '; rel="memento"' in links - def test_memento_url(self): """ Memento response, 200 capture @@ -156,6 +185,7 @@ class TestWb: links = self.get_links(resp) assert '; rel="original"' in links assert '; rel="timegate"' in links + assert self.make_memento_link('http://www.iana.org/_css/2013.1/screen.css', '20140126200804', 'Sun, 26 Jan 2014 20:08:04 GMT') in links assert self.make_timemap_link('http://www.iana.org/_css/2013.1/screen.css') in links 
assert resp.headers[MEMENTO_DATETIME] == 'Sun, 26 Jan 2014 20:08:04 GMT' @@ -174,6 +204,7 @@ class TestWb: links = self.get_links(resp) assert '; rel="original"' in links assert '; rel="timegate"' in links + assert self.make_memento_link('http://www.iana.org/domains/example', '20140128051539', 'Tue, 28 Jan 2014 05:15:39 GMT') in links assert self.make_timemap_link('http://www.iana.org/domains/example') in links assert resp.headers[MEMENTO_DATETIME] == 'Tue, 28 Jan 2014 05:15:39 GMT' @@ -241,6 +272,7 @@ rel="memento"; datetime="Fri, 03 Jan 2014 03:03:41 GMT"' # for memento links = self.get_links(resp) assert '; rel="original timegate"' in links + assert '; rel="memento"; datetime="Mon, 27 Jan 2014 17:12:39 GMT"' in links #assert self.make_timemap_link('http://www.iana.org/_css/2013.1/screen.css') in links assert resp.headers[MEMENTO_DATETIME] == 'Mon, 27 Jan 2014 17:12:39 GMT' @@ -266,6 +298,7 @@ rel="memento"; datetime="Fri, 03 Jan 2014 03:03:41 GMT"' # for memento links = self.get_links(resp) assert '; rel="original timegate"' in links + assert '; rel="memento"; datetime="Sun, 26 Jan 2014 20:08:04 GMT"' in links #assert self.make_timemap_link('http://www.iana.org/_css/2013.1/screen.css') in links assert resp.headers[MEMENTO_DATETIME] == 'Sun, 26 Jan 2014 20:08:04 GMT'