1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

timemap format fix: fixes ukwa-pywb/pywb#37

- ensure timemap returns full url-m (the full memento URL) for each memento link
- warcserver supports a 'memento_format' param which, if present, specifies the
full format to use for memento links in the timemap
- memento tests: timemap tests include full url-m, test both framed and frameless timemap responses
This commit is contained in:
Ilya Kreymer 2019-02-15 16:25:40 -08:00 committed by John Berlin
parent 3868f5b915
commit 0a9ad5c8dc
No known key found for this signature in database
GPG Key ID: 6EF5E4B442011B02
5 changed files with 44 additions and 34 deletions

View File

@ -635,6 +635,8 @@ class RewriterApp(object):
params['output'] = kwargs.get('output', 'json') params['output'] = kwargs.get('output', 'json')
params['from'] = wb_url.timestamp params['from'] = wb_url.timestamp
params['to'] = wb_url.end_timestamp params['to'] = wb_url.end_timestamp
if 'memento_format' in kwargs:
params['memento_format'] = kwargs['memento_format']
upstream_url = self.get_upstream_url(wb_url, kwargs, params) upstream_url = self.get_upstream_url(wb_url, kwargs, params)
upstream_url = upstream_url.replace('/resource/postreq', '/index') upstream_url = upstream_url.replace('/resource/postreq', '/index')
@ -668,6 +670,7 @@ class RewriterApp(object):
def handle_timemap(self, wb_url, kwargs, full_prefix): def handle_timemap(self, wb_url, kwargs, full_prefix):
output = kwargs.get('output') output = kwargs.get('output')
kwargs['memento_format'] = full_prefix + '{timestamp}' + self.replay_mod + '/{url}'
res = self.do_query(wb_url, kwargs) res = self.do_query(wb_url, kwargs)
return self.make_timemap(wb_url, res, full_prefix, output) return self.make_timemap(wb_url, res, full_prefix, output)

View File

@ -1,7 +1,7 @@
import re import re
import six import six
from warcio.timeutils import timestamp_to_http_date from warcio.timeutils import timestamp_to_http_date, http_date_to_timestamp
from pywb.utils.wbexception import BadRequestException from pywb.utils.wbexception import BadRequestException
@ -65,7 +65,7 @@ class MementoUtils(object):
return results return results
@classmethod @classmethod
def make_timemap_memento_link(cls, cdx, datetime=None, rel='memento', end=',\n'): def make_timemap_memento_link(cls, cdx, datetime=None, rel='memento', end=',\n', memento_format=None):
url = cdx.get('url') url = cdx.get('url')
if not url: if not url:
url = 'file://{0}:{1}:{2}'.format(cdx.get('filename'), cdx.get('offset'), cdx.get('length')) url = 'file://{0}:{1}:{2}'.format(cdx.get('filename'), cdx.get('offset'), cdx.get('length'))
@ -73,21 +73,22 @@ class MementoUtils(object):
if not datetime: if not datetime:
datetime = timestamp_to_http_date(cdx['timestamp']) datetime = timestamp_to_http_date(cdx['timestamp'])
return cls.make_memento_link(url, rel, datetime, cdx.get('source-coll')) + end return cls.make_memento_link(url, rel, datetime, cdx.get('source-coll'), memento_format) + end
@classmethod @classmethod
def make_timemap(cls, cdx_iter): def make_timemap(cls, cdx_iter, params):
prev_cdx = None prev_cdx = None
memento_format = params.get('memento_format')
for cdx in cdx_iter: for cdx in cdx_iter:
if prev_cdx: if prev_cdx:
yield cls.make_timemap_memento_link(prev_cdx) yield cls.make_timemap_memento_link(prev_cdx, memento_format=memento_format)
prev_cdx = cdx prev_cdx = cdx
# last memento link, if any # last memento link, if any
if prev_cdx: if prev_cdx:
yield cls.make_timemap_memento_link(prev_cdx, end='\n') yield cls.make_timemap_memento_link(prev_cdx, end='\n', memento_format=memento_format)
@classmethod @classmethod
def wrap_timemap_header(cls, url, timegate_url, timemap_url, timemap): def wrap_timemap_header(cls, url, timegate_url, timemap_url, timemap):
@ -111,8 +112,14 @@ class MementoUtils(object):
return '<{0}>; rel="{1}"'.format(url, type) return '<{0}>; rel="{1}"'.format(url, type)
@classmethod @classmethod
def make_memento_link(cls, url, type, dt, coll=None): def make_memento_link(cls, url, type, dt, coll=None, memento_format=None):
res = '<{0}>; rel="{1}"; datetime="{2}"'.format(url, type, dt) if memento_format:
memento_format = memento_format.format(url=url,
timestamp=http_date_to_timestamp(dt))
else:
memento_format = url
res = '<{0}>; rel="{1}"; datetime="{2}"'.format(memento_format, type, dt)
if coll: if coll:
res += '; collection="{0}"'.format(coll) res += '; collection="{0}"'.format(coll)

View File

@ -16,21 +16,21 @@ logger = logging.getLogger('warcserver')
#============================================================================= #=============================================================================
def to_cdxj(cdx_iter, fields): def to_cdxj(cdx_iter, fields, params):
content_type = 'text/x-cdxj' content_type = 'text/x-cdxj'
return content_type, (cdx.to_cdxj(fields) for cdx in cdx_iter) return content_type, (cdx.to_cdxj(fields) for cdx in cdx_iter)
def to_json(cdx_iter, fields): def to_json(cdx_iter, fields, params):
content_type = 'text/x-ndjson' content_type = 'text/x-ndjson'
return content_type, (cdx.to_json(fields) for cdx in cdx_iter) return content_type, (cdx.to_json(fields) for cdx in cdx_iter)
def to_text(cdx_iter, fields): def to_text(cdx_iter, fields, params):
content_type = 'text/plain' content_type = 'text/plain'
return content_type, (cdx.to_text(fields) for cdx in cdx_iter) return content_type, (cdx.to_text(fields) for cdx in cdx_iter)
def to_link(cdx_iter, fields): def to_link(cdx_iter, fields, params):
content_type = 'application/link-format' content_type = 'application/link-format'
return content_type, MementoUtils.make_timemap(cdx_iter) return content_type, MementoUtils.make_timemap(cdx_iter, params)
#============================================================================= #=============================================================================
@ -93,7 +93,7 @@ class IndexHandler(object):
if not cdx_iter: if not cdx_iter:
return None, None, errs return None, None, errs
content_type, res = handler(cdx_iter, fields) content_type, res = handler(cdx_iter, fields, params)
out_headers = {'Content-Type': content_type} out_headers = {'Content-Type': content_type}
def check_str(lines): def check_str(lines):

View File

@ -7,6 +7,10 @@ VARY = 'Vary'
LINK_FORMAT = 'application/link-format' LINK_FORMAT = 'application/link-format'
class MementoMixin(object): class MementoMixin(object):
def _timemap_get(self, url, fmod=True, **kwargs):
app = self.testapp if fmod else self.testapp_non_frame
return app.get(url, extra_environ={'REQUEST_URI': url}, **kwargs)
def get_links(self, resp): def get_links(self, resp):
return list(map(lambda x: x.strip(), re.split(', (?![0-9])', resp.headers[LINK]))) return list(map(lambda x: x.strip(), re.split(', (?![0-9])', resp.headers[LINK])))

View File

@ -11,9 +11,6 @@ class TestMemento(MementoMixin, BaseConfigTest):
def setup_class(cls): def setup_class(cls):
super(TestMemento, cls).setup_class('config_test.yaml') super(TestMemento, cls).setup_class('config_test.yaml')
def _timemap_get(self, url, **kwargs):
return self.testapp.get(url, extra_environ={'REQUEST_URI': url}, **kwargs)
def _assert_memento(self, resp, url, ts, fmod, dt=''): def _assert_memento(self, resp, url, ts, fmod, dt=''):
dt = dt or timestamp_to_http_date(ts) dt = dt or timestamp_to_http_date(ts)
@ -119,12 +116,12 @@ class TestMemento(MementoMixin, BaseConfigTest):
self._assert_memento(resp, 'http://www.iana.org/domains/example', '20140128051539', fmod) self._assert_memento(resp, 'http://www.iana.org/domains/example', '20140128051539', fmod)
def test_timemap(self): def test_timemap(self, fmod):
""" """
Test application/link-format timemap Test application/link-format timemap
""" """
resp = self._timemap_get('/pywb/timemap/link/http://example.com?example=1') resp = self._timemap_get('/pywb/timemap/link/http://example.com?example=1', fmod)
assert resp.status_int == 200 assert resp.status_int == 200
assert resp.content_type == LINK_FORMAT assert resp.content_type == LINK_FORMAT
@ -134,17 +131,18 @@ class TestMemento(MementoMixin, BaseConfigTest):
<http://localhost:80/pywb/timemap/link/http://example.com?example=1>; rel="self"; type="application/link-format"; from="Fri, 03 Jan 2014 03:03:21 GMT", <http://localhost:80/pywb/timemap/link/http://example.com?example=1>; rel="self"; type="application/link-format"; from="Fri, 03 Jan 2014 03:03:21 GMT",
<http://localhost:80/pywb/http://example.com?example=1>; rel="timegate", <http://localhost:80/pywb/http://example.com?example=1>; rel="timegate",
<http://example.com?example=1>; rel="original", <http://example.com?example=1>; rel="original",
<http://example.com?example=1>; rel="memento"; datetime="Fri, 03 Jan 2014 03:03:21 GMT"; collection="pywb", <http://localhost:80/pywb/20140103030321{0}/http://example.com?example=1>; rel="memento"; datetime="Fri, 03 Jan 2014 03:03:21 GMT"; collection="pywb",
<http://example.com?example=1>; rel="memento"; datetime="Fri, 03 Jan 2014 03:03:41 GMT"; collection="pywb" <http://localhost:80/pywb/20140103030341{0}/http://example.com?example=1>; rel="memento"; datetime="Fri, 03 Jan 2014 03:03:41 GMT"; collection="pywb"
""" """.format(fmod)
assert exp == resp.text assert exp == resp.text
def test_timemap_cdxj(self): def test_timemap_cdxj(self, fmod):
""" """
Test test/x-cdxj timemap Test test/x-cdxj timemap
""" """
resp = self._timemap_get('/pywb/timemap/cdxj/http://example.com?example=1') resp = self._timemap_get('/pywb/timemap/cdxj/http://example.com?example=1', fmod)
assert resp.status_int == 200 assert resp.status_int == 200
assert resp.content_type == 'text/x-cdxj' assert resp.content_type == 'text/x-cdxj'
@ -156,12 +154,12 @@ com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "
""" """
assert exp == resp.text assert exp == resp.text
def test_timemap_2(self): def test_timemap_2(self, fmod):
""" """
Test application/link-format timemap total count Test application/link-format timemap total count
""" """
resp = self._timemap_get('/pywb/timemap/link/http://example.com') resp = self._timemap_get('/pywb/timemap/link/http://example.com', fmod)
assert resp.status_int == 200 assert resp.status_int == 200
assert resp.content_type == LINK_FORMAT assert resp.content_type == LINK_FORMAT
@ -192,9 +190,6 @@ class TestMementoRedirectClassic(MementoMixin, BaseConfigTest):
def setup_class(cls): def setup_class(cls):
super(TestMementoRedirectClassic, cls).setup_class('config_test_redirect_classic.yaml') super(TestMementoRedirectClassic, cls).setup_class('config_test_redirect_classic.yaml')
def _timemap_get(self, url, **kwargs):
return self.testapp.get(url, extra_environ={'REQUEST_URI': url}, **kwargs)
def test_memento_top_frame_timegate(self, fmod): def test_memento_top_frame_timegate(self, fmod):
resp = self.testapp.get('/pywb/http://www.iana.org/') resp = self.testapp.get('/pywb/http://www.iana.org/')
assert resp.status_code == 307 assert resp.status_code == 307
@ -252,12 +247,12 @@ class TestMementoRedirectClassic(MementoMixin, BaseConfigTest):
assert '"20140126200624"' in resp.text assert '"20140126200624"' in resp.text
assert '"http://www.iana.org/"' in resp.text, resp.text assert '"http://www.iana.org/"' in resp.text, resp.text
def test_timemap(self): def test_timemap(self, fmod):
""" """
Test application/link-format timemap Test application/link-format timemap
""" """
resp = self._timemap_get('/pywb/timemap/link/http://example.com?example=1') resp = self._timemap_get('/pywb/timemap/link/http://example.com?example=1', fmod)
assert resp.status_int == 200 assert resp.status_int == 200
assert resp.content_type == LINK_FORMAT assert resp.content_type == LINK_FORMAT
@ -267,9 +262,10 @@ class TestMementoRedirectClassic(MementoMixin, BaseConfigTest):
<http://localhost:80/pywb/timemap/link/http://example.com?example=1>; rel="self"; type="application/link-format"; from="Fri, 03 Jan 2014 03:03:21 GMT", <http://localhost:80/pywb/timemap/link/http://example.com?example=1>; rel="self"; type="application/link-format"; from="Fri, 03 Jan 2014 03:03:21 GMT",
<http://localhost:80/pywb/http://example.com?example=1>; rel="timegate", <http://localhost:80/pywb/http://example.com?example=1>; rel="timegate",
<http://example.com?example=1>; rel="original", <http://example.com?example=1>; rel="original",
<http://example.com?example=1>; rel="memento"; datetime="Fri, 03 Jan 2014 03:03:21 GMT"; collection="pywb", <http://localhost:80/pywb/20140103030321{0}/http://example.com?example=1>; rel="memento"; datetime="Fri, 03 Jan 2014 03:03:21 GMT"; collection="pywb",
<http://example.com?example=1>; rel="memento"; datetime="Fri, 03 Jan 2014 03:03:41 GMT"; collection="pywb" <http://localhost:80/pywb/20140103030341{0}/http://example.com?example=1>; rel="memento"; datetime="Fri, 03 Jan 2014 03:03:41 GMT"; collection="pywb"
""" """.format(fmod)
assert exp == resp.text assert exp == resp.text
def test_memento_not_time_gate(self, fmod): def test_memento_not_time_gate(self, fmod):