1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

add memento timemap support (for archival mode only)

add timemap Link headers to timegate and memento responses
timemap accessible via /timemap/*/ path
This commit is contained in:
Ilya Kreymer 2014-03-24 14:00:06 -07:00
parent 9654c22bed
commit 2a605652c6
6 changed files with 185 additions and 31 deletions

View File

@ -4,7 +4,7 @@ import urllib2
from pywb.perms.perms_filter import make_perms_cdx_filter
from pywb.framework.wbrequestresponse import WbResponse
from pywb.cdx.cdxserver import create_cdx_server
from views import MementoTimemapView
#=================================================================
DEFAULT_RULES = 'pywb/rules.yaml'
@ -29,6 +29,8 @@ class QueryHandler(object):
if html_query_view:
self.views['html'] = html_query_view
self.views['timemap'] = MementoTimemapView()
@staticmethod
def init_from_config(config,
ds_rules_file=DEFAULT_RULES,
@ -46,7 +48,7 @@ class QueryHandler(object):
# cdx server only supports text and cdxobject for now
if wb_url.mod == 'cdx_':
output = 'text'
elif wb_url.mod == 'timemap_':
elif wb_url.mod == 'timemap':
output = 'timemap'
elif wb_url.is_query():
output = 'html'

View File

@ -1,5 +1,6 @@
from pywb.utils.timeutils import timestamp_to_datetime
from pywb.framework.wbrequestresponse import WbResponse
from pywb.framework.memento import make_timemap, LINK_FORMAT
import urlparse
import logging
@ -75,7 +76,7 @@ def load_query_template(file, desc=None):
#=================================================================
# html captures 'calendar' view
# query views
#=================================================================
class J2HtmlCapturesView(J2TemplateView):
def render_response(self, wbrequest, cdx_lines):
@ -83,3 +84,11 @@ class J2HtmlCapturesView(J2TemplateView):
cdx_lines=list(cdx_lines),
url=wbrequest.wb_url.url,
prefix=wbrequest.wb_prefix)
#=================================================================
class MementoTimemapView(object):
    """Query view that renders a list of captures as a Memento timemap."""

    def render_response(self, wbrequest, cdx_lines):
        """Build the timemap response for ``wbrequest``.

        The timemap entries produced by ``make_timemap`` are handed to
        ``WbResponse.text_stream`` with ``LINK_FORMAT`` as the content type.
        """
        timemap_entries = make_timemap(wbrequest, cdx_lines)
        response = WbResponse.text_stream(timemap_entries,
                                          content_type=LINK_FORMAT)
        return response

View File

@ -3,6 +3,9 @@ from pywb.utils.timeutils import http_date_to_timestamp
from pywb.utils.timeutils import timestamp_to_http_date
from wbrequestresponse import WbRequest, WbResponse
from pywb.rewrite.wburl import WbUrl
LINK_FORMAT = 'application/link-format'
#=================================================================
@ -69,24 +72,92 @@ class MementoRespMixin(object):
req_url = wbrequest.wb_url.url
if is_memento and is_timegate:
link = self.make_link(req_url, 'original timegate')
elif is_memento:
timegate = wbrequest.urlrewriter.get_timestamp_url('')
link = []
link = []
link.append(self.make_link(req_url, 'original'))
link.append(self.make_link(timegate, 'timegate'))
link = ', '.join(link)
if is_memento and is_timegate:
link.append(self.make_link(req_url, 'original timegate'))
else:
link = self.make_link(req_url, 'original')
link.append(self.make_link(req_url, 'original'))
# for now, include timemap only in non-proxy mode
if not wbrequest.is_proxy and (is_memento or is_timegate):
link.append(self.make_timemap_link(wbrequest))
if is_memento and not is_timegate:
timegate = wbrequest.urlrewriter.get_timestamp_url('')
link.append(self.make_link(timegate, 'timegate'))
link = ', '.join(link)
self.status_headers.headers.append(('Link', link))
def make_link(self, url, type):
    """Return one Link header entry: ``url`` in angle brackets plus its rel.

    ``type`` is used verbatim as the value of the ``rel`` attribute.
    """
    template = '<{0}>; rel="{1}"'
    return template.format(url, type)
def make_timemap_link(self, wbrequest):
    """Return the ``rel="timemap"`` Link header entry for ``wbrequest``.

    The target is the request prefix followed by the request's wb_url
    re-serialized as a query url with the ``timemap`` modifier and an
    empty timestamp. The entry's ``type`` attribute is ``LINK_FORMAT``.
    """
    link_template = '<{0}>; rel="timemap"; type="{1}"'

    base = wbrequest.wb_prefix
    timemap_path = wbrequest.wb_url.to_str(mod='timemap',
                                           timestamp='',
                                           type=wbrequest.wb_url.QUERY)
    timemap_url = base + timemap_path

    return link_template.format(timemap_url, LINK_FORMAT)
#=================================================================
class MementoResponse(MementoRespMixin, WbResponse):
    """WbResponse combined with MementoRespMixin; adds nothing of its own."""
#=================================================================
def make_memento_link(cdx, prefix, datetime=None, rel='memento', end=',\n'):
    """Format a single capture as one timemap entry.

    :param cdx: capture record; its ``'original'`` and ``'timestamp'``
        items are read.
    :param prefix: string prepended to the replay url of the capture.
    :param datetime: value for the ``datetime`` attribute; when falsy it is
        derived from ``cdx['timestamp']`` via ``timestamp_to_http_date``.
    :param rel: value for the ``rel`` attribute.
    :param end: text appended after the entry (separator by default).
    :return: the formatted entry string.
    """
    # NOTE: ``end`` becomes part of the format string itself, so any
    # braces in it would be interpreted by ``str.format``.
    entry_format = '<{0}>; rel="{1}"; datetime="{2}"' + end

    replay_path = WbUrl.to_wburl_str(url=cdx['original'],
                                     timestamp=cdx['timestamp'],
                                     type=WbUrl.REPLAY)
    memento_url = prefix + replay_path

    datetime = datetime or timestamp_to_http_date(cdx['timestamp'])

    return entry_format.format(memento_url, rel, datetime)
#=================================================================
def make_timemap(wbrequest, cdx_lines):
    """Generate the entries of an ``application/link-format`` timemap.

    Yields, in order: the ``rel="self"`` entry (carrying the ``from`` date
    of the first capture), the ``rel="timegate"`` entry, then one
    ``rel="memento"`` entry per capture. Every entry except the last one is
    terminated with ``',\\n'``; the last entry has no terminator, including
    when there is only a single capture.

    :param wbrequest: request object; ``wb_prefix`` and ``wb_url`` are read.
    :param cdx_lines: iterable (or iterator) of capture records, each
        providing at least a ``'timestamp'`` item.
    :return: generator of strings. Yields nothing when ``cdx_lines`` is
        empty.
    """
    # Accept any iterable, and use the next() builtin instead of the
    # Python-2-only ``.next()`` method.
    cdx_iter = iter(cdx_lines)

    # get first memento as it'll be used for 'from' field
    first_cdx = next(cdx_iter, None)
    if first_cdx is None:
        # No captures: end explicitly rather than letting StopIteration
        # escape from the generator body (an error under PEP 479).
        return

    prefix = wbrequest.wb_prefix
    url = wbrequest.wb_url.url
    from_date = timestamp_to_http_date(first_cdx['timestamp'])

    # timemap link
    timemap = ('<{0}>; rel="self"; ' +
               'type="application/link-format"; from="{1}",\n')
    yield timemap.format(prefix + wbrequest.wb_url.to_str(), from_date)

    # timegate link
    timegate = '<{0}>; rel="timegate",\n'
    yield timegate.format(prefix + url)

    # memento links: always hold one capture back so that whichever one
    # turns out to be last is emitted without the trailing separator.
    # The already-computed from_date is reused for the first capture only.
    prev_cdx, prev_date = first_cdx, from_date
    for cdx in cdx_iter:
        yield make_memento_link(prev_cdx, prefix, datetime=prev_date)
        prev_cdx, prev_date = cdx, None

    # last memento link
    yield make_memento_link(prev_cdx, prefix, datetime=prev_date, end='')

View File

@ -59,12 +59,20 @@ class BaseWbUrl(object):
self.type = type
def is_replay(self):
return (self.type == self.REPLAY or
self.type == self.LATEST_REPLAY)
return self.is_replay_type(self.type)
def is_query(self):
return (self.type == self.QUERY or
self.type == self.URL_QUERY)
return self.is_query_type(self.type)
@staticmethod
def is_replay_type(type_):
    """Return True when ``type_`` is REPLAY or LATEST_REPLAY."""
    if type_ == BaseWbUrl.REPLAY:
        return True
    return type_ == BaseWbUrl.LATEST_REPLAY
@staticmethod
def is_query_type(type_):
    """Return True when ``type_`` is QUERY or URL_QUERY."""
    if type_ == BaseWbUrl.QUERY:
        return True
    return type_ == BaseWbUrl.URL_QUERY
#=================================================================
@ -152,23 +160,33 @@ class WbUrl(BaseWbUrl):
# Str Representation
# ====================
def to_str(self, **overrides):
atype = overrides.get('type', self.type)
type_ = overrides.get('type', self.type)
mod = overrides.get('mod', self.mod)
timestamp = overrides.get('timestamp', self.timestamp)
end_timestamp = overrides.get('end_timestamp', self.end_timestamp)
url = overrides.get('url', self.url)
if atype == self.QUERY or atype == self.URL_QUERY:
return self.to_wburl_str(url=url,
type=type_,
mod=mod,
timestamp=timestamp,
end_timestamp=end_timestamp)
@staticmethod
def to_wburl_str(url, type=BaseWbUrl.LATEST_REPLAY,
mod='', timestamp='', end_timestamp=''):
if WbUrl.is_query_type(type):
tsmod = ''
if mod:
tsmod += mod + "/"
if timestamp:
tsmod += timestamp
if end_timestamp:
tsmod += '-' + end_timestamp
if end_timestamp:
tsmod += '-' + end_timestamp
tsmod += "*/" + url
if atype == self.URL_QUERY:
if type == BaseWbUrl.URL_QUERY:
tsmod += "*"
return tsmod
else:

View File

@ -10,7 +10,7 @@
</tr>
{% for cdx in cdx_lines %}
<tr style="{{ 'font-weight: bold' if cdx['mimetype'] != 'warc/revisit' else '' }}">
<td><a href="{{ prefix }}{{ cdx.timestamp }}/{{ url }}">{{ cdx['timestamp'] | format_ts}}</a></td>
<td><a href="{{ prefix }}{{ cdx.timestamp }}/{{ cdx.original }}">{{ cdx['timestamp'] | format_ts}}</a></td>
<td>{{ cdx['statuscode'] }}</td>
<td>{{ cdx['original'] }}</td>
<td>{{ cdx['filename'] }}</td>

View File

@ -7,6 +7,7 @@ MEMENTO_DATETIME = 'Memento-Datetime'
ACCEPT_DATETIME = 'Accept-Datetime'
LINK = 'Link'
VARY = 'Vary'
LINK_FORMAT = 'application/link-format'
class TestWb:
TEST_CONFIG = 'tests/test_config_memento.yaml'
@ -18,6 +19,13 @@ class TestWb:
self.testapp = webtest.TestApp(self.app)
def get_links(self, resp):
    """Return the entries of the response's Link header as a list.

    The header value is split on ``','`` and each piece is stripped of
    surrounding whitespace. A real list is returned (not a ``map``
    object) so callers can run several ``in`` checks against the same
    result on any Python version.
    """
    return [entry.strip() for entry in resp.headers[LINK].split(',')]
def make_timemap_link(self, url):
    """Return the timemap Link entry the tests expect for ``url``.

    The entry points at the ``/pywb/timemap/*/`` path on localhost:80 and
    carries ``LINK_FORMAT`` as its ``type`` attribute.
    """
    expected_template = '<http://localhost:80/pywb/timemap/*/{0}>; rel="timemap"; type="{1}"'
    return expected_template.format(url, LINK_FORMAT)
# Below functionality is for archival (non-proxy) mode
# It is designed to conform to Memento protocol Pattern 2.1
# http://www.mementoweb.org/guide/rfc/#Pattern2.1
@ -31,7 +39,11 @@ class TestWb:
assert resp.status_int == 302
assert resp.headers[VARY] == 'accept-datetime'
assert resp.headers[LINK] == '<http://www.iana.org/_css/2013.1/screen.css>; rel="original"'
links = self.get_links(resp)
assert '<http://www.iana.org/_css/2013.1/screen.css>; rel="original"' in links
assert self.make_timemap_link('http://www.iana.org/_css/2013.1/screen.css') in links
assert MEMENTO_DATETIME not in resp.headers
assert '/pywb/20140127171239/http://www.iana.org/_css/2013.1/screen.css' in resp.headers['Location']
@ -47,7 +59,12 @@ class TestWb:
assert resp.status_int == 302
assert resp.headers[VARY] == 'accept-datetime'
assert resp.headers[LINK] == '<http://www.iana.org/_css/2013.1/screen.css>; rel="original"'
links = self.get_links(resp)
assert '<http://www.iana.org/_css/2013.1/screen.css>; rel="original"' in links
assert self.make_timemap_link('http://www.iana.org/_css/2013.1/screen.css') in links
assert MEMENTO_DATETIME not in resp.headers
assert '/pywb/20140126200804/http://www.iana.org/_css/2013.1/screen.css' in resp.headers['Location']
@ -65,7 +82,10 @@ class TestWb:
# no vary header
assert VARY not in resp.headers
assert resp.headers[LINK] == '<http://www.iana.org/_css/2013.1/screen.css>; rel="original"'
links = self.get_links(resp)
assert '<http://www.iana.org/_css/2013.1/screen.css>; rel="original"' in links
assert MEMENTO_DATETIME not in resp.headers
@ -83,8 +103,10 @@ class TestWb:
assert VARY not in resp.headers
assert resp.headers[LINK] == '<http://www.iana.org/_css/2013.1/screen.css>; rel="original", \
<http://localhost:80/pywb/http://www.iana.org/_css/2013.1/screen.css>; rel="timegate"'
links = self.get_links(resp)
assert '<http://www.iana.org/_css/2013.1/screen.css>; rel="original"' in links
assert '<http://localhost:80/pywb/http://www.iana.org/_css/2013.1/screen.css>; rel="timegate"' in links
assert self.make_timemap_link('http://www.iana.org/_css/2013.1/screen.css') in links
assert resp.headers[MEMENTO_DATETIME] == 'Sun, 26 Jan 2014 20:08:04 GMT'
@ -99,12 +121,38 @@ class TestWb:
assert VARY not in resp.headers
assert resp.headers[LINK] == '<http://www.iana.org/domains/example>; rel="original", \
<http://localhost:80/pywb/http://www.iana.org/domains/example>; rel="timegate"'
links = self.get_links(resp)
assert '<http://www.iana.org/domains/example>; rel="original"' in links
assert '<http://localhost:80/pywb/http://www.iana.org/domains/example>; rel="timegate"' in links
assert self.make_timemap_link('http://www.iana.org/domains/example') in links
assert resp.headers[MEMENTO_DATETIME] == 'Tue, 28 Jan 2014 05:15:39 GMT'
def test_timemap(self):
    """
    Request a timemap and check the application/link-format body
    entry by entry.
    """
    # Expected entries are spelled with adjacent string literals so their
    # value does not depend on how continuation lines are indented.
    expected_self = (
        '<http://localhost:80/pywb/timemap/*/http://example.com?example=1>; '
        'rel="self"; type="application/link-format"; from="Fri, 03 Jan 2014 03:03:21 GMT",')
    expected_timegate = '<http://localhost:80/pywb/http://example.com?example=1>; rel="timegate",'
    expected_first = (
        '<http://localhost:80/pywb/20140103030321/http://example.com?example=1>; '
        'rel="memento"; datetime="Fri, 03 Jan 2014 03:03:21 GMT",')
    expected_last = (
        '<http://localhost:80/pywb/20140103030341/http://example.com?example=1>; '
        'rel="memento"; datetime="Fri, 03 Jan 2014 03:03:41 GMT"')

    resp = self.testapp.get('/pywb/timemap/*/http://example.com?example=1')
    assert resp.status_int == 200
    assert resp.content_type == LINK_FORMAT

    lines = resp.body.split('\n')
    assert len(lines) == 4

    assert lines[0] == expected_self
    assert lines[1] == expected_timegate
    assert lines[2] == expected_first
    assert lines[3] == expected_last
# Below functions test pywb proxy mode behavior
# They are designed to roughly conform to Memento protocol Pattern 1.3
# with the exception that the original resource is not available
@ -126,7 +174,10 @@ class TestWb:
assert resp.headers[VARY] == 'accept-datetime'
# for memento
assert resp.headers[LINK] == '<http://www.iana.org/_css/2013.1/screen.css>; rel="original timegate"'
links = self.get_links(resp)
assert '<http://www.iana.org/_css/2013.1/screen.css>; rel="original timegate"' in links
#assert self.make_timemap_link('http://www.iana.org/_css/2013.1/screen.css') in links
assert resp.headers[MEMENTO_DATETIME] == 'Mon, 27 Jan 2014 17:12:39 GMT'
@ -148,7 +199,10 @@ class TestWb:
assert resp.headers[VARY] == 'accept-datetime'
# for memento
assert resp.headers[LINK] == '<http://www.iana.org/_css/2013.1/screen.css>; rel="original timegate"'
links = self.get_links(resp)
assert '<http://www.iana.org/_css/2013.1/screen.css>; rel="original timegate"' in links
#assert self.make_timemap_link('http://www.iana.org/_css/2013.1/screen.css') in links
assert resp.headers[MEMENTO_DATETIME] == 'Sun, 26 Jan 2014 20:08:04 GMT'