From 98830147b559d904028b49967e556ec81bca5f44 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Thu, 3 Mar 2016 11:04:28 -0800
Subject: [PATCH] add memento headers to all response loaders, use BaseLoader
 base class, update tests for memento headers

---
 rezag/responseloader.py | 51 +++++++++++++++++++++++++++++++++++++++------------
 rezag/utils.py          |  5 +++++
 test/test_handlers.py   | 48 ++++++++++++++++++++++++++++++++++++++++++++++---
 3 files changed, 89 insertions(+), 15 deletions(-)

diff --git a/rezag/responseloader.py b/rezag/responseloader.py
index c3c6dd5c..774d7c34 100644
--- a/rezag/responseloader.py
+++ b/rezag/responseloader.py
@@ -1,7 +1,10 @@
 from rezag.liverec import BaseRecorder
 from rezag.liverec import request as remote_request
 
+from rezag.utils import MementoUtils
+
 from pywb.utils.timeutils import timestamp_to_datetime, datetime_to_http_date
+from pywb.utils.timeutils import iso_date_to_datetime
 from pywb.utils.wbexception import LiveResourceException
 
 from pywb.warc.resolvingloader import ResolvingLoader
@@ -46,7 +49,36 @@ class StreamIter(object):
 
 
 #=============================================================================
-class WARCPathLoader(object):
+class BaseLoader(object):
+    """Base class for loaders that fill in the shared response headers.
+
+    Subclasses implement ``_load_resource``. ``__call__`` delegates to it
+    and, when something was loaded, adds the WARC-Coll, Link and
+    Memento-Datetime headers, reading the WARC-Target-URI and WARC-Date
+    headers that ``_load_resource`` is expected to have set.
+    """
+    def __call__(self, cdx, params):
+        res = self._load_resource(cdx, params)
+        if not res:
+            return res
+
+        response.headers['WARC-Coll'] = cdx.get('source', '')
+
+        response.headers['Link'] = MementoUtils.make_link(
+                                    response.headers['WARC-Target-URI'],
+                                    'original')
+
+        memento_dt = iso_date_to_datetime(response.headers['WARC-Date'])
+        response.headers['Memento-Datetime'] = datetime_to_http_date(memento_dt)
+        return res
+
+    def _load_resource(self, cdx, params):  #pragma: no cover
+        # NotImplemented is a comparison sentinel and is not callable
+        raise NotImplementedError()
+
+
+#=============================================================================
+class WARCPathLoader(BaseLoader):
     def __init__(self, paths, cdx_source):
         self.paths = paths
         if isinstance(paths, str):
@@ -77,8 +109,7 @@ class WARCPathLoader(object):
 
             yield check
 
-
-    def __call__(self, cdx, params):
+    def _load_resource(self, cdx, params):
         if not cdx.get('filename') or cdx.get('offset') is None:
             return None
 
@@ -94,8 +125,6 @@ class WARCPathLoader(object):
         for n, v in record.rec_headers.headers:
             response.headers[n] = v
 
-        response.headers['WARC-Coll'] = cdx.get('source')
-
         if headers != payload:
             response.headers['WARC-Target-URI'] = headers.rec_headers.get_header('WARC-Target-URI')
             response.headers['WARC-Date'] = headers.rec_headers.get_header('WARC-Date')
@@ -103,8 +132,7 @@ class WARCPathLoader(object):
             response.headers['WARC-Refers-To-Date'] = payload.rec_headers.get_header('WARC-Date')
             headers.stream.close()
 
-        res = StreamIter(record.stream)
-        return res
+        return StreamIter(record.stream)
 
     def __str__(self):
         return 'WARCPathLoader'
@@ -133,13 +161,13 @@ class HeaderRecorder(BaseRecorder):
 
 
 #=============================================================================
-class LiveWebLoader(object):
+class LiveWebLoader(BaseLoader):
     SKIP_HEADERS = (b'link',
                     b'memento-datetime',
                     b'content-location',
                     b'x-archive')
 
-    def __call__(self, cdx, params):
+    def _load_resource(self, cdx, params):
         load_url = cdx.get('load_url')
         if not load_url:
             return None
@@ -185,7 +213,6 @@ class LiveWebLoader(object):
         #response.headers['WARC-Record-ID'] = self._make_warc_id()
         response.headers['WARC-Target-URI'] = cdx['url']
         response.headers['WARC-Date'] = self._make_date(dt)
-        response.headers['WARC-Coll'] = cdx.get('source', '')
 
         # Try to set content-length, if it is available and valid
         try:
@@ -193,8 +220,8 @@ class LiveWebLoader(object):
             if content_len > 0:
                 content_len += len(resp_headers)
                 response.headers['Content-Length'] = content_len
-        except:
-            raise
+        except (KeyError, TypeError, ValueError):
+            pass
 
         return StreamIter(upstream_res.raw, header=resp_headers)
 
diff --git a/rezag/utils.py b/rezag/utils.py
index 94c67975..126c0f40 100644
--- a/rezag/utils.py
+++ b/rezag/utils.py
@@ -98,3 +98,8 @@ class MementoUtils(object):
         # last memento link, if any
         if prev_cdx:
             yield MementoUtils.make_timemap_memento_link(prev_cdx, end='\n')
+
+    @staticmethod
+    def make_link(url, type):
+        """Return a Link header value of the form ``<url>; rel="type"``."""
+        return '<{0}>; rel="{1}"'.format(url, type)
diff --git a/test/test_handlers.py b/test/test_handlers.py
index 3e911224..1c8ec45e 100644
--- a/test/test_handlers.py
+++ b/test/test_handlers.py
@@ -9,6 +9,7 @@ from rezag.aggindexsource import GeventTimeoutAggregator, SimpleAggregator
 from rezag.aggindexsource import DirectoryIndexSource
 
 from rezag.app import add_route, application
+from rezag.utils import MementoUtils
 
 import webtest
 import bottle
@@ -121,7 +122,10 @@ class TestResAgg(object):
 
         assert resp.headers['WARC-Coll'] == 'live'
         assert resp.headers['WARC-Target-URI'] == 'http://httpbin.org/get?foo=bar'
-        assert 'WARC-Date' in resp.headers
+        assert resp.headers['WARC-Date'] != ''
+
+        assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/get?foo=bar', 'original')
+        assert resp.headers['Memento-Datetime'] != ''
 
         assert b'HTTP/1.1 200 OK' in resp.body
         assert b'"foo": "bar"' in resp.body
@@ -134,7 +138,10 @@ class TestResAgg(object):
 
         assert resp.headers['WARC-Coll'] == 'live'
         assert resp.headers['WARC-Target-URI'] == 'http://httpbin.org/post'
-        assert 'WARC-Date' in resp.headers
+        assert resp.headers['WARC-Date'] != ''
+
+        assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/post', 'original')
+        assert resp.headers['Memento-Datetime'] != ''
 
         assert b'HTTP/1.1 200 OK' in resp.body
         assert b'"foo": "bar"' in resp.body
@@ -149,6 +156,9 @@ class TestResAgg(object):
         assert resp.headers['WARC-Date'] == '2014-10-06T18:43:57Z'
         assert b'HTTP/1.1 200 OK' in resp.body
 
+        assert resp.headers['Link'] == MementoUtils.make_link('http://www.vvork.com/', 'original')
+        assert resp.headers['Memento-Datetime'] == 'Mon, 06 Oct 2014 18:43:57 GMT'
+
         assert 'ResErrors' not in resp.headers
 
     def test_agg_select_mem_2(self):
@@ -159,6 +169,9 @@ class TestResAgg(object):
         assert resp.headers['WARC-Date'] == '2016-01-10T13:48:55Z'
         assert b'HTTP/1.1 200 OK' in resp.body
 
+        assert resp.headers['Link'] == MementoUtils.make_link('http://vvork.com/', 'original')
+        assert resp.headers['Memento-Datetime'] == 'Sun, 10 Jan 2016 13:48:55 GMT'
+
         assert 'ResErrors' not in resp.headers
 
     def test_agg_select_live(self):
@@ -168,6 +181,9 @@ class TestResAgg(object):
         assert resp.headers['WARC-Target-URI'] == 'http://vvork.com/'
         assert resp.headers['WARC-Date'] != ''
 
+        assert resp.headers['Link'] == MementoUtils.make_link('http://vvork.com/', 'original')
+        assert resp.headers['Memento-Datetime'] != ''
+
         assert 'ResErrors' not in resp.headers
 
     def test_agg_select_local(self):
@@ -177,6 +193,9 @@ class TestResAgg(object):
         assert resp.headers['WARC-Target-URI'] == 'http://www.iana.org/'
         assert resp.headers['WARC-Date'] == '2014-01-26T20:06:24Z'
 
+        assert resp.headers['Link'] == MementoUtils.make_link('http://www.iana.org/', 'original')
+        assert resp.headers['Memento-Datetime'] == 'Sun, 26 Jan 2014 20:06:24 GMT'
+
         assert json.loads(resp.headers['ResErrors']) == {"rhiz": "NotFoundException('http://webenact.rhizome.org/vvork/http://iana.org/',)"}
 
     def test_agg_select_local_postreq(self):
@@ -193,6 +212,9 @@ Host: iana.org
         assert resp.headers['WARC-Target-URI'] == 'http://www.iana.org/'
         assert resp.headers['WARC-Date'] == '2014-01-26T20:06:24Z'
 
+        assert resp.headers['Link'] == MementoUtils.make_link('http://www.iana.org/', 'original')
+        assert resp.headers['Memento-Datetime'] == 'Sun, 26 Jan 2014 20:06:24 GMT'
+
         assert json.loads(resp.headers['ResErrors']) == {"rhiz": "NotFoundException('http://webenact.rhizome.org/vvork/http://iana.org/',)"}
 
     def test_agg_live_postreq(self):
@@ -207,7 +229,10 @@ Host: httpbin.org
 
         assert resp.headers['WARC-Coll'] == 'live'
         assert resp.headers['WARC-Target-URI'] == 'http://httpbin.org/get?foo=bar'
-        assert 'WARC-Date' in resp.headers
+        assert resp.headers['WARC-Date'] != ''
+
+        assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/get?foo=bar', 'original')
+        assert resp.headers['Memento-Datetime'] != ''
 
         assert b'HTTP/1.1 200 OK' in resp.body
         assert b'"foo": "bar"' in resp.body
@@ -229,6 +254,11 @@ foo=bar&test=abc"""
 
         assert resp.headers['WARC-Coll'] == 'post'
         assert resp.headers['WARC-Target-URI'] == 'http://httpbin.org/post'
+        assert resp.headers['WARC-Date'] != ''
+
+        assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/post', 'original')
+        assert resp.headers['Memento-Datetime'] != ''
+
         assert b'HTTP/1.1 200 OK' in resp.body
         assert b'"foo": "bar"' in resp.body
         assert b'"test": "abc"' in resp.body
@@ -243,6 +273,8 @@ foo=bar&test=abc"""
 
         assert resp.headers['WARC-Coll'] == 'post'
         assert resp.headers['WARC-Target-URI'] == 'http://httpbin.org/post'
+        assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/post', 'original')
+
         assert b'HTTP/1.1 200 OK' in resp.body
         assert b'"foo": "bar"' in resp.body
         assert b'"test": "abc"' in resp.body
@@ -255,6 +287,8 @@ foo=bar&test=abc"""
 
         assert resp.headers['WARC-Coll'] == 'live'
         assert resp.headers['WARC-Target-URI'] == 'http://www.iana.org/'
+        assert resp.headers['Link'] == MementoUtils.make_link('http://www.iana.org/', 'original')
+
         assert b'HTTP/1.1 200 OK' in resp.body
 
         assert 'ResErrors' not in resp.headers
@@ -265,6 +299,10 @@ foo=bar&test=abc"""
         assert resp.headers['WARC-Coll'] == 'example'
         assert resp.headers['WARC-Date'] == '2016-02-25T04:23:29Z'
         assert resp.headers['WARC-Target-URI'] == 'http://example.com/'
+
+        assert resp.headers['Link'] == MementoUtils.make_link('http://example.com/', 'original')
+        assert resp.headers['Memento-Datetime'] == 'Thu, 25 Feb 2016 04:23:29 GMT'
+
         assert b'HTTP/1.1 200 OK' in resp.body
 
         assert 'ResErrors' not in resp.headers
@@ -285,6 +323,10 @@ foo=bar&test=abc"""
         assert resp.headers['WARC-Date'] == '2014-01-27T17:12:51Z'
         assert resp.headers['WARC-Refers-To-Target-URI'] == 'http://example.com'
         assert resp.headers['WARC-Refers-To-Date'] == '2014-01-27T17:12:00Z'
+
+        assert resp.headers['Link'] == MementoUtils.make_link('http://example.com', 'original')
+        assert resp.headers['Memento-Datetime'] == 'Mon, 27 Jan 2014 17:12:51 GMT'
+
         assert b'HTTP/1.1 200 OK' in resp.body
         assert b'' in resp.body
 