mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
Add Memento timemap support (archival mode only).
Add timemap Link headers to timegate and memento responses; the timemap is accessible via the /timemap/*/ path.
This commit is contained in:
parent
9654c22bed
commit
2a605652c6
@ -4,7 +4,7 @@ import urllib2
|
||||
from pywb.perms.perms_filter import make_perms_cdx_filter
|
||||
from pywb.framework.wbrequestresponse import WbResponse
|
||||
from pywb.cdx.cdxserver import create_cdx_server
|
||||
|
||||
from views import MementoTimemapView
|
||||
|
||||
#=================================================================
|
||||
DEFAULT_RULES = 'pywb/rules.yaml'
|
||||
@ -29,6 +29,8 @@ class QueryHandler(object):
|
||||
if html_query_view:
|
||||
self.views['html'] = html_query_view
|
||||
|
||||
self.views['timemap'] = MementoTimemapView()
|
||||
|
||||
@staticmethod
|
||||
def init_from_config(config,
|
||||
ds_rules_file=DEFAULT_RULES,
|
||||
@ -46,7 +48,7 @@ class QueryHandler(object):
|
||||
# cdx server only supports text and cdxobject for now
|
||||
if wb_url.mod == 'cdx_':
|
||||
output = 'text'
|
||||
elif wb_url.mod == 'timemap_':
|
||||
elif wb_url.mod == 'timemap':
|
||||
output = 'timemap'
|
||||
elif wb_url.is_query():
|
||||
output = 'html'
|
||||
|
@ -1,5 +1,6 @@
|
||||
from pywb.utils.timeutils import timestamp_to_datetime
|
||||
from pywb.framework.wbrequestresponse import WbResponse
|
||||
from pywb.framework.memento import make_timemap, LINK_FORMAT
|
||||
|
||||
import urlparse
|
||||
import logging
|
||||
@ -75,7 +76,7 @@ def load_query_template(file, desc=None):
|
||||
|
||||
|
||||
#=================================================================
|
||||
# html captures 'calendar' view
|
||||
# query views
|
||||
#=================================================================
|
||||
class J2HtmlCapturesView(J2TemplateView):
|
||||
def render_response(self, wbrequest, cdx_lines):
|
||||
@ -83,3 +84,11 @@ class J2HtmlCapturesView(J2TemplateView):
|
||||
cdx_lines=list(cdx_lines),
|
||||
url=wbrequest.wb_url.url,
|
||||
prefix=wbrequest.wb_prefix)
|
||||
|
||||
|
||||
#=================================================================
|
||||
class MementoTimemapView(object):
    """Query view that renders a list of captures as a Memento timemap."""

    def render_response(self, wbrequest, cdx_lines):
        """Stream the timemap for ``cdx_lines`` as a text response.

        The lines are produced lazily by ``make_timemap`` and sent with
        the ``LINK_FORMAT`` content type.
        """
        memento_lines = make_timemap(wbrequest, cdx_lines)

        return WbResponse.text_stream(memento_lines,
                                      content_type=LINK_FORMAT)
|
||||
|
@ -3,6 +3,9 @@ from pywb.utils.timeutils import http_date_to_timestamp
|
||||
from pywb.utils.timeutils import timestamp_to_http_date
|
||||
|
||||
from wbrequestresponse import WbRequest, WbResponse
|
||||
from pywb.rewrite.wburl import WbUrl
|
||||
|
||||
LINK_FORMAT = 'application/link-format'
|
||||
|
||||
|
||||
#=================================================================
|
||||
@ -69,24 +72,92 @@ class MementoRespMixin(object):
|
||||
|
||||
req_url = wbrequest.wb_url.url
|
||||
|
||||
if is_memento and is_timegate:
|
||||
link = self.make_link(req_url, 'original timegate')
|
||||
elif is_memento:
|
||||
timegate = wbrequest.urlrewriter.get_timestamp_url('')
|
||||
link = []
|
||||
|
||||
link = []
|
||||
link.append(self.make_link(req_url, 'original'))
|
||||
link.append(self.make_link(timegate, 'timegate'))
|
||||
link = ', '.join(link)
|
||||
if is_memento and is_timegate:
|
||||
link.append(self.make_link(req_url, 'original timegate'))
|
||||
else:
|
||||
link = self.make_link(req_url, 'original')
|
||||
link.append(self.make_link(req_url, 'original'))
|
||||
|
||||
# for now, include timemap only in non-proxy mode
|
||||
if not wbrequest.is_proxy and (is_memento or is_timegate):
|
||||
link.append(self.make_timemap_link(wbrequest))
|
||||
|
||||
if is_memento and not is_timegate:
|
||||
timegate = wbrequest.urlrewriter.get_timestamp_url('')
|
||||
link.append(self.make_link(timegate, 'timegate'))
|
||||
|
||||
link = ', '.join(link)
|
||||
|
||||
self.status_headers.headers.append(('Link', link))
|
||||
|
||||
def make_link(self, url, type):
    """Return a single Link header entry: ``<url>; rel="type"``.

    ``type`` is the link relation (e.g. 'original', 'timegate'); the
    parameter name is kept for compatibility with existing callers.
    """
    return '<{0}>; rel="{1}"'.format(url, type)
|
||||
|
||||
def make_timemap_link(self, wbrequest):
    """Return the rel="timemap" Link header entry for the current request.

    The timemap url is the request prefix followed by the request's
    wb_url re-rendered as a query with the 'timemap' modifier and no
    timestamp.
    """
    format_ = '<{0}>; rel="timemap"; type="{1}"'

    prefix = wbrequest.wb_prefix

    url = prefix + (wbrequest.wb_url.
                    to_str(mod='timemap',
                           timestamp='',
                           type=wbrequest.wb_url.QUERY))

    return format_.format(url, LINK_FORMAT)
|
||||
|
||||
|
||||
#=================================================================
|
||||
class MementoResponse(MementoRespMixin, WbResponse):
    """WbResponse combined with the Memento header behavior of the mixin."""
    pass
|
||||
|
||||
|
||||
#=================================================================
|
||||
def make_memento_link(cdx, prefix, datetime=None, rel='memento', end=',\n'):
    """Format one capture as a link-format entry.

    :param cdx: capture record; its 'original' and 'timestamp' fields
        are read.
    :param prefix: string prepended to the replay url of the capture.
    :param datetime: pre-formatted date string for the entry; when not
        given it is derived from the capture timestamp.
    :param rel: link relation written into the entry.
    :param end: terminator appended to the entry (separator by default,
        pass '' for the final entry).
    :return: '<url>; rel="..."; datetime="..."' followed by ``end``.
    """
    memento = '<{0}>; rel="{1}"; datetime="{2}"' + end

    string = WbUrl.to_wburl_str(url=cdx['original'],
                                timestamp=cdx['timestamp'],
                                type=WbUrl.REPLAY)

    url = prefix + string

    if not datetime:
        datetime = timestamp_to_http_date(cdx['timestamp'])

    return memento.format(url, rel, datetime)
|
||||
|
||||
|
||||
#=================================================================
|
||||
def make_timemap(wbrequest, cdx_lines):
    """Generate the lines of a link-format timemap for a set of captures.

    Yields, in order: the timemap 'self' link (with a 'from' date taken
    from the first capture), the timegate link, and one memento link per
    capture. Every entry except the last is terminated with ',\\n'.

    :param wbrequest: request object; ``wb_prefix`` and ``wb_url`` are read.
    :param cdx_lines: iterable of capture records (any iterable is
        accepted, not only iterators).
    :return: generator of strings; yields nothing when there are no
        captures.
    """
    prefix = wbrequest.wb_prefix
    url = wbrequest.wb_url.url

    cdx_iter = iter(cdx_lines)

    # get first memento as it'll be used for 'from' field
    # next() with a default avoids leaking StopIteration out of this
    # generator (and the py2-only .next() method) on empty input
    first_cdx = next(cdx_iter, None)
    if first_cdx is None:
        return

    from_date = timestamp_to_http_date(first_cdx['timestamp'])

    # timemap link
    timemap = ('<{0}>; rel="self"; ' +
               'type="application/link-format"; from="{1}",\n')
    yield timemap.format(prefix + wbrequest.wb_url.to_str(), from_date)

    # timegate link
    timegate = '<{0}>; rel="timegate",\n'
    yield timegate.format(prefix + url)

    # memento links are emitted one step behind the iteration, so the
    # final capture is known and can be written without a trailing
    # separator -- including the case of a single capture
    prev_cdx = first_cdx
    prev_date = from_date  # reuse the already formatted first date

    for cdx in cdx_iter:
        yield make_memento_link(prev_cdx, prefix, datetime=prev_date)
        prev_cdx = cdx
        prev_date = None

    # last memento link
    yield make_memento_link(prev_cdx, prefix, datetime=prev_date, end='')
|
||||
|
@ -59,12 +59,20 @@ class BaseWbUrl(object):
|
||||
self.type = type
|
||||
|
||||
def is_replay(self):
    """Return True if this url's type is one of the replay types."""
    return self.is_replay_type(self.type)
|
||||
|
||||
def is_query(self):
    """Return True if this url's type is one of the query types."""
    return self.is_query_type(self.type)
|
||||
|
||||
@staticmethod
def is_replay_type(type_):
    """Return True if ``type_`` is REPLAY or LATEST_REPLAY."""
    return (type_ == BaseWbUrl.REPLAY or
            type_ == BaseWbUrl.LATEST_REPLAY)
|
||||
|
||||
@staticmethod
def is_query_type(type_):
    """Return True if ``type_`` is QUERY or URL_QUERY."""
    return (type_ == BaseWbUrl.QUERY or
            type_ == BaseWbUrl.URL_QUERY)
|
||||
|
||||
|
||||
#=================================================================
|
||||
@ -152,23 +160,33 @@ class WbUrl(BaseWbUrl):
|
||||
# Str Representation
|
||||
# ====================
|
||||
def to_str(self, **overrides):
|
||||
atype = overrides.get('type', self.type)
|
||||
type_ = overrides.get('type', self.type)
|
||||
mod = overrides.get('mod', self.mod)
|
||||
timestamp = overrides.get('timestamp', self.timestamp)
|
||||
end_timestamp = overrides.get('end_timestamp', self.end_timestamp)
|
||||
url = overrides.get('url', self.url)
|
||||
|
||||
if atype == self.QUERY or atype == self.URL_QUERY:
|
||||
return self.to_wburl_str(url=url,
|
||||
type=type_,
|
||||
mod=mod,
|
||||
timestamp=timestamp,
|
||||
end_timestamp=end_timestamp)
|
||||
|
||||
@staticmethod
|
||||
def to_wburl_str(url, type=BaseWbUrl.LATEST_REPLAY,
|
||||
mod='', timestamp='', end_timestamp=''):
|
||||
|
||||
if WbUrl.is_query_type(type):
|
||||
tsmod = ''
|
||||
if mod:
|
||||
tsmod += mod + "/"
|
||||
if timestamp:
|
||||
tsmod += timestamp
|
||||
if end_timestamp:
|
||||
tsmod += '-' + end_timestamp
|
||||
if end_timestamp:
|
||||
tsmod += '-' + end_timestamp
|
||||
|
||||
tsmod += "*/" + url
|
||||
if atype == self.URL_QUERY:
|
||||
if type == BaseWbUrl.URL_QUERY:
|
||||
tsmod += "*"
|
||||
return tsmod
|
||||
else:
|
||||
|
@ -10,7 +10,7 @@
|
||||
</tr>
|
||||
{% for cdx in cdx_lines %}
|
||||
<tr style="{{ 'font-weight: bold' if cdx['mimetype'] != 'warc/revisit' else '' }}">
|
||||
<td><a href="{{ prefix }}{{ cdx.timestamp }}/{{ url }}">{{ cdx['timestamp'] | format_ts}}</a></td>
|
||||
<td><a href="{{ prefix }}{{ cdx.timestamp }}/{{ cdx.original }}">{{ cdx['timestamp'] | format_ts}}</a></td>
|
||||
<td>{{ cdx['statuscode'] }}</td>
|
||||
<td>{{ cdx['original'] }}</td>
|
||||
<td>{{ cdx['filename'] }}</td>
|
||||
|
@ -7,6 +7,7 @@ MEMENTO_DATETIME = 'Memento-Datetime'
|
||||
ACCEPT_DATETIME = 'Accept-Datetime'
|
||||
LINK = 'Link'
|
||||
VARY = 'Vary'
|
||||
LINK_FORMAT = 'application/link-format'
|
||||
|
||||
class TestWb:
|
||||
TEST_CONFIG = 'tests/test_config_memento.yaml'
|
||||
@ -18,6 +19,13 @@ class TestWb:
|
||||
|
||||
self.testapp = webtest.TestApp(self.app)
|
||||
|
||||
def get_links(self, resp):
    """Split the response's Link header on ',' into stripped entries.

    A list is returned (rather than a lazy map object) so callers can
    run several membership checks against the same result.
    """
    return [link.strip() for link in resp.headers[LINK].split(',')]
|
||||
|
||||
def make_timemap_link(self, url):
    """Return the timemap Link entry expected for ``url`` in these tests."""
    format_ = '<http://localhost:80/pywb/timemap/*/{0}>; rel="timemap"; type="{1}"'
    return format_.format(url, LINK_FORMAT)
|
||||
|
||||
# Below functionality is for archival (non-proxy) mode
|
||||
# It is designed to conform to Memento protocol Pattern 2.1
|
||||
# http://www.mementoweb.org/guide/rfc/#Pattern2.1
|
||||
@ -31,7 +39,11 @@ class TestWb:
|
||||
assert resp.status_int == 302
|
||||
|
||||
assert resp.headers[VARY] == 'accept-datetime'
|
||||
assert resp.headers[LINK] == '<http://www.iana.org/_css/2013.1/screen.css>; rel="original"'
|
||||
|
||||
links = self.get_links(resp)
|
||||
assert '<http://www.iana.org/_css/2013.1/screen.css>; rel="original"' in links
|
||||
assert self.make_timemap_link('http://www.iana.org/_css/2013.1/screen.css') in links
|
||||
|
||||
assert MEMENTO_DATETIME not in resp.headers
|
||||
|
||||
assert '/pywb/20140127171239/http://www.iana.org/_css/2013.1/screen.css' in resp.headers['Location']
|
||||
@ -47,7 +59,12 @@ class TestWb:
|
||||
assert resp.status_int == 302
|
||||
|
||||
assert resp.headers[VARY] == 'accept-datetime'
|
||||
assert resp.headers[LINK] == '<http://www.iana.org/_css/2013.1/screen.css>; rel="original"'
|
||||
|
||||
links = self.get_links(resp)
|
||||
assert '<http://www.iana.org/_css/2013.1/screen.css>; rel="original"' in links
|
||||
assert self.make_timemap_link('http://www.iana.org/_css/2013.1/screen.css') in links
|
||||
|
||||
|
||||
assert MEMENTO_DATETIME not in resp.headers
|
||||
|
||||
assert '/pywb/20140126200804/http://www.iana.org/_css/2013.1/screen.css' in resp.headers['Location']
|
||||
@ -65,7 +82,10 @@ class TestWb:
|
||||
|
||||
# no vary header
|
||||
assert VARY not in resp.headers
|
||||
assert resp.headers[LINK] == '<http://www.iana.org/_css/2013.1/screen.css>; rel="original"'
|
||||
|
||||
links = self.get_links(resp)
|
||||
assert '<http://www.iana.org/_css/2013.1/screen.css>; rel="original"' in links
|
||||
|
||||
assert MEMENTO_DATETIME not in resp.headers
|
||||
|
||||
|
||||
@ -83,8 +103,10 @@ class TestWb:
|
||||
|
||||
assert VARY not in resp.headers
|
||||
|
||||
assert resp.headers[LINK] == '<http://www.iana.org/_css/2013.1/screen.css>; rel="original", \
|
||||
<http://localhost:80/pywb/http://www.iana.org/_css/2013.1/screen.css>; rel="timegate"'
|
||||
links = self.get_links(resp)
|
||||
assert '<http://www.iana.org/_css/2013.1/screen.css>; rel="original"' in links
|
||||
assert '<http://localhost:80/pywb/http://www.iana.org/_css/2013.1/screen.css>; rel="timegate"' in links
|
||||
assert self.make_timemap_link('http://www.iana.org/_css/2013.1/screen.css') in links
|
||||
|
||||
assert resp.headers[MEMENTO_DATETIME] == 'Sun, 26 Jan 2014 20:08:04 GMT'
|
||||
|
||||
@ -99,12 +121,38 @@ class TestWb:
|
||||
|
||||
assert VARY not in resp.headers
|
||||
|
||||
assert resp.headers[LINK] == '<http://www.iana.org/domains/example>; rel="original", \
|
||||
<http://localhost:80/pywb/http://www.iana.org/domains/example>; rel="timegate"'
|
||||
links = self.get_links(resp)
|
||||
assert '<http://www.iana.org/domains/example>; rel="original"' in links
|
||||
assert '<http://localhost:80/pywb/http://www.iana.org/domains/example>; rel="timegate"' in links
|
||||
assert self.make_timemap_link('http://www.iana.org/domains/example') in links
|
||||
|
||||
assert resp.headers[MEMENTO_DATETIME] == 'Tue, 28 Jan 2014 05:15:39 GMT'
|
||||
|
||||
|
||||
def test_timemap(self):
    """
    Test application/link-format timemap
    """

    resp = self.testapp.get('/pywb/timemap/*/http://example.com?example=1')
    assert resp.status_int == 200
    assert resp.content_type == LINK_FORMAT

    lines = resp.body.split('\n')

    # self link, timegate link and one entry per capture (two captures)
    assert len(lines) == 4

    assert lines[0] == ('<http://localhost:80/pywb/timemap/*/http://example.com?example=1>; '
                        'rel="self"; type="application/link-format"; from="Fri, 03 Jan 2014 03:03:21 GMT",')

    assert lines[1] == '<http://localhost:80/pywb/http://example.com?example=1>; rel="timegate",'

    assert lines[2] == ('<http://localhost:80/pywb/20140103030321/http://example.com?example=1>; '
                        'rel="memento"; datetime="Fri, 03 Jan 2014 03:03:21 GMT",')

    # the final entry carries no trailing separator
    assert lines[3] == ('<http://localhost:80/pywb/20140103030341/http://example.com?example=1>; '
                        'rel="memento"; datetime="Fri, 03 Jan 2014 03:03:41 GMT"')
|
||||
|
||||
# Below functions test pywb proxy mode behavior
|
||||
# They are designed to roughly conform to Memento protocol Pattern 1.3
|
||||
# with the exception that the original resource is not available
|
||||
@ -126,7 +174,10 @@ class TestWb:
|
||||
assert resp.headers[VARY] == 'accept-datetime'
|
||||
|
||||
# for memento
|
||||
assert resp.headers[LINK] == '<http://www.iana.org/_css/2013.1/screen.css>; rel="original timegate"'
|
||||
links = self.get_links(resp)
|
||||
assert '<http://www.iana.org/_css/2013.1/screen.css>; rel="original timegate"' in links
|
||||
#assert self.make_timemap_link('http://www.iana.org/_css/2013.1/screen.css') in links
|
||||
|
||||
assert resp.headers[MEMENTO_DATETIME] == 'Mon, 27 Jan 2014 17:12:39 GMT'
|
||||
|
||||
|
||||
@ -148,7 +199,10 @@ class TestWb:
|
||||
assert resp.headers[VARY] == 'accept-datetime'
|
||||
|
||||
# for memento
|
||||
assert resp.headers[LINK] == '<http://www.iana.org/_css/2013.1/screen.css>; rel="original timegate"'
|
||||
links = self.get_links(resp)
|
||||
assert '<http://www.iana.org/_css/2013.1/screen.css>; rel="original timegate"' in links
|
||||
#assert self.make_timemap_link('http://www.iana.org/_css/2013.1/screen.css') in links
|
||||
|
||||
assert resp.headers[MEMENTO_DATETIME] == 'Sun, 26 Jan 2014 20:08:04 GMT'
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user