From 2a605652c61afd31e773ef8ead776a97c6fe1275 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 24 Mar 2014 14:00:06 -0700 Subject: [PATCH] add memento timemap support (for archival mode only) add timemap Link headers to timegate and memento responses timemap accessible via /timemap/*/ path --- pywb/core/query_handler.py | 6 ++- pywb/core/views.py | 11 ++++- pywb/framework/memento.py | 89 ++++++++++++++++++++++++++++++++++---- pywb/rewrite/wburl.py | 36 +++++++++++---- pywb/ui/query.html | 2 +- tests/test_memento.py | 72 ++++++++++++++++++++++++++---- 6 files changed, 185 insertions(+), 31 deletions(-) diff --git a/pywb/core/query_handler.py b/pywb/core/query_handler.py index dbf404a0..f2685077 100644 --- a/pywb/core/query_handler.py +++ b/pywb/core/query_handler.py @@ -4,7 +4,7 @@ import urllib2 from pywb.perms.perms_filter import make_perms_cdx_filter from pywb.framework.wbrequestresponse import WbResponse from pywb.cdx.cdxserver import create_cdx_server - +from views import MementoTimemapView #================================================================= DEFAULT_RULES = 'pywb/rules.yaml' @@ -29,6 +29,8 @@ class QueryHandler(object): if html_query_view: self.views['html'] = html_query_view + self.views['timemap'] = MementoTimemapView() + @staticmethod def init_from_config(config, ds_rules_file=DEFAULT_RULES, @@ -46,7 +48,7 @@ class QueryHandler(object): # cdx server only supports text and cdxobject for now if wb_url.mod == 'cdx_': output = 'text' - elif wb_url.mod == 'timemap_': + elif wb_url.mod == 'timemap': output = 'timemap' elif wb_url.is_query(): output = 'html' diff --git a/pywb/core/views.py b/pywb/core/views.py index 244746f8..37c1c1ab 100644 --- a/pywb/core/views.py +++ b/pywb/core/views.py @@ -1,5 +1,6 @@ from pywb.utils.timeutils import timestamp_to_datetime from pywb.framework.wbrequestresponse import WbResponse +from pywb.framework.memento import make_timemap, LINK_FORMAT import urlparse import logging @@ -75,7 +76,7 @@ def load_query_template(file, desc=None): #================================================================= -# html captures 'calendar' view +# query views #================================================================= class J2HtmlCapturesView(J2TemplateView): def render_response(self, wbrequest, cdx_lines): @@ -83,3 +84,11 @@ class J2HtmlCapturesView(J2TemplateView): cdx_lines=list(cdx_lines), url=wbrequest.wb_url.url, prefix=wbrequest.wb_prefix) + + +#================================================================= +class MementoTimemapView(object): + def render_response(self, wbrequest, cdx_lines): + memento_lines = make_timemap(wbrequest, cdx_lines) + return WbResponse.text_stream(memento_lines, + content_type=LINK_FORMAT) diff --git a/pywb/framework/memento.py b/pywb/framework/memento.py index 8f380121..e6962980 100644 --- a/pywb/framework/memento.py +++ b/pywb/framework/memento.py @@ -3,6 +3,9 @@ from pywb.utils.timeutils import http_date_to_timestamp from pywb.utils.timeutils import timestamp_to_http_date from wbrequestresponse import WbRequest, WbResponse +from pywb.rewrite.wburl import WbUrl + +LINK_FORMAT = 'application/link-format' #================================================================= @@ -69,24 +72,92 @@ class MementoRespMixin(object): req_url = wbrequest.wb_url.url - if is_memento and is_timegate: - link = self.make_link(req_url, 'original timegate') - elif is_memento: - timegate = wbrequest.urlrewriter.get_timestamp_url('') + link = [] - link = [] - link.append(self.make_link(req_url, 'original')) - link.append(self.make_link(timegate, 'timegate')) - link = ', '.join(link) + if is_memento and is_timegate: + link.append(self.make_link(req_url, 'original timegate')) else: - link = self.make_link(req_url, 'original') + link.append(self.make_link(req_url, 'original')) + + # for now, include timemap only in non-proxy mode + if not wbrequest.is_proxy and (is_memento or is_timegate): + link.append(self.make_timemap_link(wbrequest)) + + if is_memento and not is_timegate: + timegate = wbrequest.urlrewriter.get_timestamp_url('') + link.append(self.make_link(timegate, 'timegate')) + + link = ', '.join(link) self.status_headers.headers.append(('Link', link)) def make_link(self, url, type): return '<{0}>; rel="{1}"'.format(url, type) + def make_timemap_link(self, wbrequest): + format_ = '<{0}>; rel="timemap"; type="{1}"' + + prefix = wbrequest.wb_prefix + + url = prefix + (wbrequest.wb_url. + to_str(mod='timemap', + timestamp='', + type=wbrequest.wb_url.QUERY)) + + return format_.format(url, LINK_FORMAT) + #================================================================= class MementoResponse(MementoRespMixin, WbResponse): pass + + +#================================================================= +def make_memento_link(cdx, prefix, datetime=None, rel='memento', end=',\n'): + memento = '<{0}>; rel="{1}"; datetime="{2}"' + end + + string = WbUrl.to_wburl_str(url=cdx['original'], + timestamp=cdx['timestamp'], + type=WbUrl.REPLAY) + + url = prefix + string + + if not datetime: + datetime = timestamp_to_http_date(cdx['timestamp']) + + return memento.format(url, rel, datetime) + + +#================================================================= +def make_timemap(wbrequest, cdx_lines): + prefix = wbrequest.wb_prefix + url = wbrequest.wb_url.url + + # get first memento as it'll be used for 'from' field + first_cdx = cdx_lines.next() + from_date = timestamp_to_http_date(first_cdx['timestamp']) + + # timemap link + timemap = ('<{0}>; rel="self"; ' + + 'type="application/link-format"; from="{1}",\n') + yield timemap.format(prefix + wbrequest.wb_url.to_str(), from_date) + + # timegate link + timegate = '<{0}>; rel="timegate",\n' + yield timegate.format(prefix + url) + + # first memento link + yield make_memento_link(first_cdx, prefix, + datetime=from_date) + + prev_cdx = None + + for cdx in cdx_lines: + if prev_cdx: + yield make_memento_link(prev_cdx, prefix) + + prev_cdx = cdx + + # last memento link + if prev_cdx: + yield make_memento_link(prev_cdx, prefix, end='') diff --git a/pywb/rewrite/wburl.py b/pywb/rewrite/wburl.py index 0cad8ed2..1a91393a 100644 --- a/pywb/rewrite/wburl.py +++ b/pywb/rewrite/wburl.py @@ -59,12 +59,20 @@ class BaseWbUrl(object): self.type = type def is_replay(self): - return (self.type == self.REPLAY or - self.type == self.LATEST_REPLAY) + return self.is_replay_type(self.type) def is_query(self): - return (self.type == self.QUERY or - self.type == self.URL_QUERY) + return self.is_query_type(self.type) + + @staticmethod + def is_replay_type(type_): + return (type_ == BaseWbUrl.REPLAY or + type_ == BaseWbUrl.LATEST_REPLAY) + + @staticmethod + def is_query_type(type_): + return (type_ == BaseWbUrl.QUERY or + type_ == BaseWbUrl.URL_QUERY) #================================================================= @@ -152,23 +160,33 @@ class WbUrl(BaseWbUrl): # Str Representation # ==================== def to_str(self, **overrides): - atype = overrides.get('type', self.type) + type_ = overrides.get('type', self.type) mod = overrides.get('mod', self.mod) timestamp = overrides.get('timestamp', self.timestamp) end_timestamp = overrides.get('end_timestamp', self.end_timestamp) url = overrides.get('url', self.url) - if atype == self.QUERY or atype == self.URL_QUERY: + return self.to_wburl_str(url=url, + type=type_, + mod=mod, + timestamp=timestamp, + end_timestamp=end_timestamp) + + @staticmethod + def to_wburl_str(url, type=BaseWbUrl.LATEST_REPLAY, + mod='', timestamp='', end_timestamp=''): + + if WbUrl.is_query_type(type): tsmod = '' if mod: tsmod += mod + "/" if timestamp: tsmod += timestamp - if end_timestamp: - tsmod += '-' + end_timestamp + if end_timestamp: + tsmod += '-' + end_timestamp tsmod += "*/" + url - if atype == self.URL_QUERY: + if type == BaseWbUrl.URL_QUERY: tsmod += "*" return tsmod else: diff --git a/pywb/ui/query.html b/pywb/ui/query.html index f4c69e26..c78e1b49 100644 --- a/pywb/ui/query.html +++ b/pywb/ui/query.html @@ -10,7 +10,7 @@ {% for cdx in cdx_lines %} - {{ cdx['timestamp'] | format_ts}} + {{ cdx['timestamp'] | format_ts}} {{ cdx['statuscode'] }} {{ cdx['original'] }} {{ cdx['filename'] }} diff --git a/tests/test_memento.py b/tests/test_memento.py index 838c6463..f24aac8c 100644 --- a/tests/test_memento.py +++ b/tests/test_memento.py @@ -7,6 +7,7 @@ MEMENTO_DATETIME = 'Memento-Datetime' ACCEPT_DATETIME = 'Accept-Datetime' LINK = 'Link' VARY = 'Vary' +LINK_FORMAT = 'application/link-format' class TestWb: TEST_CONFIG = 'tests/test_config_memento.yaml' @@ -18,6 +19,13 @@ class TestWb: self.testapp = webtest.TestApp(self.app) + def get_links(self, resp): + return map(lambda x: x.strip(), resp.headers[LINK].split(',')) + + def make_timemap_link(self, url): + format_ = '; rel="timemap"; type="{1}"' + return format_.format(url, LINK_FORMAT) + # Below functionality is for archival (non-proxy) mode # It is designed to conform to Memento protocol Pattern 2.1 # http://www.mementoweb.org/guide/rfc/#Pattern2.1 @@ -31,7 +39,11 @@ class TestWb: assert resp.status_int == 302 assert resp.headers[VARY] == 'accept-datetime' - assert resp.headers[LINK] == '; rel="original"' + + links = self.get_links(resp) + assert '; rel="original"' in links + assert self.make_timemap_link('http://www.iana.org/_css/2013.1/screen.css') in links + assert MEMENTO_DATETIME not in resp.headers assert '/pywb/20140127171239/http://www.iana.org/_css/2013.1/screen.css' in resp.headers['Location'] @@ -47,7 +59,12 @@ class TestWb: assert resp.status_int == 302 assert resp.headers[VARY] == 'accept-datetime' - assert resp.headers[LINK] == '; rel="original"' + + links = self.get_links(resp) + assert '; rel="original"' in links + assert self.make_timemap_link('http://www.iana.org/_css/2013.1/screen.css') in links + + assert MEMENTO_DATETIME not in resp.headers assert '/pywb/20140126200804/http://www.iana.org/_css/2013.1/screen.css' in resp.headers['Location'] @@ -65,7 +82,10 @@ class TestWb: # no vary header assert VARY not in resp.headers - assert resp.headers[LINK] == '; rel="original"' + + links = self.get_links(resp) + assert '; rel="original"' in links + assert MEMENTO_DATETIME not in resp.headers @@ -83,8 +103,10 @@ class TestWb: assert VARY not in resp.headers - assert resp.headers[LINK] == '; rel="original", \ -; rel="timegate"' + links = self.get_links(resp) + assert '; rel="original"' in links + assert '; rel="timegate"' in links + assert self.make_timemap_link('http://www.iana.org/_css/2013.1/screen.css') in links assert resp.headers[MEMENTO_DATETIME] == 'Sun, 26 Jan 2014 20:08:04 GMT' @@ -99,12 +121,38 @@ class TestWb: assert VARY not in resp.headers - assert resp.headers[LINK] == '; rel="original", \ -; rel="timegate"' + links = self.get_links(resp) + assert '; rel="original"' in links + assert '; rel="timegate"' in links + assert self.make_timemap_link('http://www.iana.org/domains/example') in links assert resp.headers[MEMENTO_DATETIME] == 'Tue, 28 Jan 2014 05:15:39 GMT' + def test_timemap(self): + """ + Test application/link-format timemap + """ + + resp = self.testapp.get('/pywb/timemap/*/http://example.com?example=1') + assert resp.status_int == 200 + assert resp.content_type == LINK_FORMAT + + lines = resp.body.split('\n') + + assert len(lines) == 4 + + assert lines[0] == '; \ +rel="self"; type="application/link-format"; from="Fri, 03 Jan 2014 03:03:21 GMT",' + + assert lines[1] == '; rel="timegate",' + + assert lines[2] == '; \ +rel="memento"; datetime="Fri, 03 Jan 2014 03:03:21 GMT",' + + assert lines[3] == '; \ +rel="memento"; datetime="Fri, 03 Jan 2014 03:03:41 GMT"' + # Below functions test pywb proxy mode behavior # They are designed to roughly conform to Memento protocol Pattern 1.3 # with the exception that the original resource is not available @@ -126,7 +174,10 @@ class TestWb: assert resp.headers[VARY] == 'accept-datetime' # for memento - assert resp.headers[LINK] == '; rel="original timegate"' + links = self.get_links(resp) + assert '; rel="original timegate"' in links + #assert self.make_timemap_link('http://www.iana.org/_css/2013.1/screen.css') in links + assert resp.headers[MEMENTO_DATETIME] == 'Mon, 27 Jan 2014 17:12:39 GMT' @@ -148,7 +199,10 @@ class TestWb: assert resp.headers[VARY] == 'accept-datetime' # for memento - assert resp.headers[LINK] == '; rel="original timegate"' + links = self.get_links(resp) + assert '; rel="original timegate"' in links + #assert self.make_timemap_link('http://www.iana.org/_css/2013.1/screen.css') in links + assert resp.headers[MEMENTO_DATETIME] == 'Sun, 26 Jan 2014 20:08:04 GMT'