From 0a9ad5c8dc914a1964e15de1896c1835107eb72f Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 15 Feb 2019 16:25:40 -0800 Subject: [PATCH] timemap format fix: fixes ukwa-pywb/pywb#37 - ensure timemap returns full url-m warcserver supports 'memento_format' param which, if present, specifies full format to use for memento links in timemap - memento tests: timemap tests include full url-m, test both framed and frameless timemap responses --- pywb/apps/rewriterapp.py | 3 +++ pywb/utils/memento.py | 23 +++++++++++++++-------- pywb/warcserver/handlers.py | 12 ++++++------ tests/memento_fixture.py | 4 ++++ tests/test_memento.py | 36 ++++++++++++++++-------------------- 5 files changed, 44 insertions(+), 34 deletions(-) diff --git a/pywb/apps/rewriterapp.py b/pywb/apps/rewriterapp.py index aab05e65..4118999e 100644 --- a/pywb/apps/rewriterapp.py +++ b/pywb/apps/rewriterapp.py @@ -635,6 +635,8 @@ class RewriterApp(object): params['output'] = kwargs.get('output', 'json') params['from'] = wb_url.timestamp params['to'] = wb_url.end_timestamp + if 'memento_format' in kwargs: + params['memento_format'] = kwargs['memento_format'] upstream_url = self.get_upstream_url(wb_url, kwargs, params) upstream_url = upstream_url.replace('/resource/postreq', '/index') @@ -668,6 +670,7 @@ class RewriterApp(object): def handle_timemap(self, wb_url, kwargs, full_prefix): output = kwargs.get('output') + kwargs['memento_format'] = full_prefix + '{timestamp}' + self.replay_mod + '/{url}' res = self.do_query(wb_url, kwargs) return self.make_timemap(wb_url, res, full_prefix, output) diff --git a/pywb/utils/memento.py b/pywb/utils/memento.py index b5e9745e..f55c2dc6 100644 --- a/pywb/utils/memento.py +++ b/pywb/utils/memento.py @@ -1,7 +1,7 @@ import re import six -from warcio.timeutils import timestamp_to_http_date +from warcio.timeutils import timestamp_to_http_date, http_date_to_timestamp from pywb.utils.wbexception import BadRequestException @@ -65,7 +65,7 @@ class MementoUtils(object): return results @classmethod - def make_timemap_memento_link(cls, cdx, datetime=None, rel='memento', end=',\n'): + def make_timemap_memento_link(cls, cdx, datetime=None, rel='memento', end=',\n', memento_format=None): url = cdx.get('url') if not url: url = 'file://{0}:{1}:{2}'.format(cdx.get('filename'), cdx.get('offset'), cdx.get('length')) @@ -73,21 +73,22 @@ class MementoUtils(object): if not datetime: datetime = timestamp_to_http_date(cdx['timestamp']) - return cls.make_memento_link(url, rel, datetime, cdx.get('source-coll')) + end + return cls.make_memento_link(url, rel, datetime, cdx.get('source-coll'), memento_format) + end @classmethod - def make_timemap(cls, cdx_iter): + def make_timemap(cls, cdx_iter, params): prev_cdx = None + memento_format = params.get('memento_format') for cdx in cdx_iter: if prev_cdx: - yield cls.make_timemap_memento_link(prev_cdx) + yield cls.make_timemap_memento_link(prev_cdx, memento_format=memento_format) prev_cdx = cdx # last memento link, if any if prev_cdx: - yield cls.make_timemap_memento_link(prev_cdx, end='\n') + yield cls.make_timemap_memento_link(prev_cdx, end='\n', memento_format=memento_format) @classmethod def wrap_timemap_header(cls, url, timegate_url, timemap_url, timemap): @@ -111,8 +112,14 @@ class MementoUtils(object): return '<{0}>; rel="{1}"'.format(url, type) @classmethod - def make_memento_link(cls, url, type, dt, coll=None): - res = '<{0}>; rel="{1}"; datetime="{2}"'.format(url, type, dt) + def make_memento_link(cls, url, type, dt, coll=None, memento_format=None): + if memento_format: + memento_format = memento_format.format(url=url, + timestamp=http_date_to_timestamp(dt)) + else: + memento_format = url + + res = '<{0}>; rel="{1}"; datetime="{2}"'.format(memento_format, type, dt) if coll: res += '; collection="{0}"'.format(coll) diff --git a/pywb/warcserver/handlers.py b/pywb/warcserver/handlers.py index 6cd2f4b2..58d272d5 100644 --- a/pywb/warcserver/handlers.py +++ b/pywb/warcserver/handlers.py @@ -16,21 +16,21 @@ logger = logging.getLogger('warcserver') #============================================================================= -def to_cdxj(cdx_iter, fields): +def to_cdxj(cdx_iter, fields, params): content_type = 'text/x-cdxj' return content_type, (cdx.to_cdxj(fields) for cdx in cdx_iter) -def to_json(cdx_iter, fields): +def to_json(cdx_iter, fields, params): content_type = 'text/x-ndjson' return content_type, (cdx.to_json(fields) for cdx in cdx_iter) -def to_text(cdx_iter, fields): +def to_text(cdx_iter, fields, params): content_type = 'text/plain' return content_type, (cdx.to_text(fields) for cdx in cdx_iter) -def to_link(cdx_iter, fields): +def to_link(cdx_iter, fields, params): content_type = 'application/link-format' - return content_type, MementoUtils.make_timemap(cdx_iter) + return content_type, MementoUtils.make_timemap(cdx_iter, params) #============================================================================= @@ -93,7 +93,7 @@ class IndexHandler(object): if not cdx_iter: return None, None, errs - content_type, res = handler(cdx_iter, fields) + content_type, res = handler(cdx_iter, fields, params) out_headers = {'Content-Type': content_type} def check_str(lines): diff --git a/tests/memento_fixture.py b/tests/memento_fixture.py index 17d9fdcb..25d026a4 100644 --- a/tests/memento_fixture.py +++ b/tests/memento_fixture.py @@ -7,6 +7,10 @@ VARY = 'Vary' LINK_FORMAT = 'application/link-format' class MementoMixin(object): + def _timemap_get(self, url, fmod=True, **kwargs): + app = self.testapp if fmod else self.testapp_non_frame + return app.get(url, extra_environ={'REQUEST_URI': url}, **kwargs) + def get_links(self, resp): return list(map(lambda x: x.strip(), re.split(', (?![0-9])', resp.headers[LINK]))) diff --git a/tests/test_memento.py b/tests/test_memento.py index c2b481c0..f13a6a63 100644 --- a/tests/test_memento.py +++ b/tests/test_memento.py @@ -11,9 +11,6 @@ class TestMemento(MementoMixin, BaseConfigTest): def setup_class(cls): super(TestMemento, cls).setup_class('config_test.yaml') - def _timemap_get(self, url, **kwargs): - return self.testapp.get(url, extra_environ={'REQUEST_URI': url}, **kwargs) - def _assert_memento(self, resp, url, ts, fmod, dt=''): dt = dt or timestamp_to_http_date(ts) @@ -119,12 +116,12 @@ class TestMemento(MementoMixin, BaseConfigTest): self._assert_memento(resp, 'http://www.iana.org/domains/example', '20140128051539', fmod) - def test_timemap(self): + def test_timemap(self, fmod): """ Test application/link-format timemap """ - resp = self._timemap_get('/pywb/timemap/link/http://example.com?example=1') + resp = self._timemap_get('/pywb/timemap/link/http://example.com?example=1', fmod) assert resp.status_int == 200 assert resp.content_type == LINK_FORMAT @@ -134,17 +131,18 @@ class TestMemento(MementoMixin, BaseConfigTest): ; rel="self"; type="application/link-format"; from="Fri, 03 Jan 2014 03:03:21 GMT", ; rel="timegate", ; rel="original", -; rel="memento"; datetime="Fri, 03 Jan 2014 03:03:21 GMT"; collection="pywb", -; rel="memento"; datetime="Fri, 03 Jan 2014 03:03:41 GMT"; collection="pywb" -""" +; rel="memento"; datetime="Fri, 03 Jan 2014 03:03:21 GMT"; collection="pywb", +; rel="memento"; datetime="Fri, 03 Jan 2014 03:03:41 GMT"; collection="pywb" +""".format(fmod) + assert exp == resp.text - def test_timemap_cdxj(self): + def test_timemap_cdxj(self, fmod): """ Test test/x-cdxj timemap """ - resp = self._timemap_get('/pywb/timemap/cdxj/http://example.com?example=1') + resp = self._timemap_get('/pywb/timemap/cdxj/http://example.com?example=1', fmod) assert resp.status_int == 200 assert resp.content_type == 'text/x-cdxj' @@ -156,12 +154,12 @@ com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", " """ assert exp == resp.text - def test_timemap_2(self): + def test_timemap_2(self, fmod): """ Test application/link-format timemap total count """ - resp = self._timemap_get('/pywb/timemap/link/http://example.com') + resp = self._timemap_get('/pywb/timemap/link/http://example.com', fmod) assert resp.status_int == 200 assert resp.content_type == LINK_FORMAT @@ -192,9 +190,6 @@ class TestMementoRedirectClassic(MementoMixin, BaseConfigTest): def setup_class(cls): super(TestMementoRedirectClassic, cls).setup_class('config_test_redirect_classic.yaml') - def _timemap_get(self, url, **kwargs): - return self.testapp.get(url, extra_environ={'REQUEST_URI': url}, **kwargs) - def test_memento_top_frame_timegate(self, fmod): resp = self.testapp.get('/pywb/http://www.iana.org/') assert resp.status_code == 307 @@ -252,12 +247,12 @@ class TestMementoRedirectClassic(MementoMixin, BaseConfigTest): assert '"20140126200624"' in resp.text assert '"http://www.iana.org/"' in resp.text, resp.text - def test_timemap(self): + def test_timemap(self, fmod): """ Test application/link-format timemap """ - resp = self._timemap_get('/pywb/timemap/link/http://example.com?example=1') + resp = self._timemap_get('/pywb/timemap/link/http://example.com?example=1', fmod) assert resp.status_int == 200 assert resp.content_type == LINK_FORMAT @@ -267,9 +262,10 @@ class TestMementoRedirectClassic(MementoMixin, BaseConfigTest): ; rel="self"; type="application/link-format"; from="Fri, 03 Jan 2014 03:03:21 GMT", ; rel="timegate", ; rel="original", -; rel="memento"; datetime="Fri, 03 Jan 2014 03:03:21 GMT"; collection="pywb", -; rel="memento"; datetime="Fri, 03 Jan 2014 03:03:41 GMT"; collection="pywb" -""" +; rel="memento"; datetime="Fri, 03 Jan 2014 03:03:21 GMT"; collection="pywb", +; rel="memento"; datetime="Fri, 03 Jan 2014 03:03:41 GMT"; collection="pywb" +""".format(fmod) + assert exp == resp.text def test_memento_not_time_gate(self, fmod):