1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 08:04:49 +01:00

add memento headers to all response loaders, use BaseLoader base class, update tests

for memento headers
This commit is contained in:
Ilya Kreymer 2016-03-03 11:04:28 -08:00
parent 65e969a492
commit 98830147b5
3 changed files with 80 additions and 15 deletions

View File

@ -1,7 +1,10 @@
from rezag.liverec import BaseRecorder
from rezag.liverec import request as remote_request
from rezag.utils import MementoUtils
from pywb.utils.timeutils import timestamp_to_datetime, datetime_to_http_date
from pywb.utils.timeutils import iso_date_to_datetime
from pywb.utils.wbexception import LiveResourceException
from pywb.warc.resolvingloader import ResolvingLoader
@ -46,7 +49,28 @@ class StreamIter(object):
#=============================================================================
class WARCPathLoader(object):
class BaseLoader(object):
    """Common entry point for resource loaders.

    Subclasses implement ``_load_resource``; ``__call__`` wraps it and, when
    a resource was loaded, adds the collection and Memento headers to the
    outgoing ``response``.
    """

    def __call__(self, cdx, params):
        """Load the resource for ``cdx`` and decorate the response headers.

        :param cdx: index entry; its optional ``'source'`` key is copied
            into the ``WARC-Coll`` header (empty string when absent).
        :param params: request parameters, passed through unchanged to
            ``_load_resource``.
        :return: whatever ``_load_resource`` returned. A falsy result is
            returned as-is and no headers are touched.
        """
        res = self._load_resource(cdx, params)
        if not res:
            return res

        # NOTE(review): ``response`` is a module-level name not shown in this
        # view -- presumably the web framework's response object; confirm.
        response.headers['WARC-Coll'] = cdx.get('source', '')

        # _load_resource is expected to have set WARC-Target-URI and
        # WARC-Date already; both are read back here to build the
        # Memento headers.
        response.headers['Link'] = MementoUtils.make_link(
            response.headers['WARC-Target-URI'],
            'original')

        memento_dt = iso_date_to_datetime(response.headers['WARC-Date'])
        response.headers['Memento-Datetime'] = datetime_to_http_date(memento_dt)
        return res

    def _load_resource(self, cdx, params):  #pragma: no cover
        """Load and return the resource for ``cdx``; subclasses must override.

        :raises NotImplementedError: always, in this base class.
        """
        # NotImplemented is a comparison sentinel, not an exception; calling
        # it raised TypeError. NotImplementedError is the intended signal.
        raise NotImplementedError()
#=============================================================================
class WARCPathLoader(BaseLoader):
def __init__(self, paths, cdx_source):
self.paths = paths
if isinstance(paths, str):
@ -77,8 +101,7 @@ class WARCPathLoader(object):
yield check
def __call__(self, cdx, params):
def _load_resource(self, cdx, params):
if not cdx.get('filename') or cdx.get('offset') is None:
return None
@ -94,8 +117,6 @@ class WARCPathLoader(object):
for n, v in record.rec_headers.headers:
response.headers[n] = v
response.headers['WARC-Coll'] = cdx.get('source')
if headers != payload:
response.headers['WARC-Target-URI'] = headers.rec_headers.get_header('WARC-Target-URI')
response.headers['WARC-Date'] = headers.rec_headers.get_header('WARC-Date')
@ -103,8 +124,7 @@ class WARCPathLoader(object):
response.headers['WARC-Refers-To-Date'] = payload.rec_headers.get_header('WARC-Date')
headers.stream.close()
res = StreamIter(record.stream)
return res
return StreamIter(record.stream)
def __str__(self):
return 'WARCPathLoader'
@ -133,13 +153,13 @@ class HeaderRecorder(BaseRecorder):
#=============================================================================
class LiveWebLoader(object):
class LiveWebLoader(BaseLoader):
SKIP_HEADERS = (b'link',
b'memento-datetime',
b'content-location',
b'x-archive')
def __call__(self, cdx, params):
def _load_resource(self, cdx, params):
load_url = cdx.get('load_url')
if not load_url:
return None
@ -185,7 +205,6 @@ class LiveWebLoader(object):
#response.headers['WARC-Record-ID'] = self._make_warc_id()
response.headers['WARC-Target-URI'] = cdx['url']
response.headers['WARC-Date'] = self._make_date(dt)
response.headers['WARC-Coll'] = cdx.get('source', '')
# Try to set content-length, if it is available and valid
try:
@ -193,8 +212,8 @@ class LiveWebLoader(object):
if content_len > 0:
content_len += len(resp_headers)
response.headers['Content-Length'] = content_len
except:
raise
except (KeyError, TypeError):
pass
return StreamIter(upstream_res.raw, header=resp_headers)

View File

@ -98,3 +98,7 @@ class MementoUtils(object):
# last memento link, if any
if prev_cdx:
yield MementoUtils.make_timemap_memento_link(prev_cdx, end='\n')
@staticmethod
def make_link(url, type):
return '<{0}>; rel="{1}"'.format(url, type)

View File

@ -9,6 +9,7 @@ from rezag.aggindexsource import GeventTimeoutAggregator, SimpleAggregator
from rezag.aggindexsource import DirectoryIndexSource
from rezag.app import add_route, application
from rezag.utils import MementoUtils
import webtest
import bottle
@ -121,7 +122,10 @@ class TestResAgg(object):
assert resp.headers['WARC-Coll'] == 'live'
assert resp.headers['WARC-Target-URI'] == 'http://httpbin.org/get?foo=bar'
assert 'WARC-Date' in resp.headers
assert resp.headers['WARC-Date'] != ''
assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/get?foo=bar', 'original')
assert resp.headers['Memento-Datetime'] != ''
assert b'HTTP/1.1 200 OK' in resp.body
assert b'"foo": "bar"' in resp.body
@ -134,7 +138,10 @@ class TestResAgg(object):
assert resp.headers['WARC-Coll'] == 'live'
assert resp.headers['WARC-Target-URI'] == 'http://httpbin.org/post'
assert 'WARC-Date' in resp.headers
assert resp.headers['WARC-Date'] != ''
assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/post', 'original')
assert resp.headers['Memento-Datetime'] != ''
assert b'HTTP/1.1 200 OK' in resp.body
assert b'"foo": "bar"' in resp.body
@ -149,6 +156,9 @@ class TestResAgg(object):
assert resp.headers['WARC-Date'] == '2014-10-06T18:43:57Z'
assert b'HTTP/1.1 200 OK' in resp.body
assert resp.headers['Link'] == MementoUtils.make_link('http://www.vvork.com/', 'original')
assert resp.headers['Memento-Datetime'] == 'Mon, 06 Oct 2014 18:43:57 GMT'
assert 'ResErrors' not in resp.headers
def test_agg_select_mem_2(self):
@ -159,6 +169,9 @@ class TestResAgg(object):
assert resp.headers['WARC-Date'] == '2016-01-10T13:48:55Z'
assert b'HTTP/1.1 200 OK' in resp.body
assert resp.headers['Link'] == MementoUtils.make_link('http://vvork.com/', 'original')
assert resp.headers['Memento-Datetime'] == 'Sun, 10 Jan 2016 13:48:55 GMT'
assert 'ResErrors' not in resp.headers
def test_agg_select_live(self):
@ -168,6 +181,9 @@ class TestResAgg(object):
assert resp.headers['WARC-Target-URI'] == 'http://vvork.com/'
assert resp.headers['WARC-Date'] != ''
assert resp.headers['Link'] == MementoUtils.make_link('http://vvork.com/', 'original')
assert resp.headers['Memento-Datetime'] != ''
assert 'ResErrors' not in resp.headers
def test_agg_select_local(self):
@ -177,6 +193,9 @@ class TestResAgg(object):
assert resp.headers['WARC-Target-URI'] == 'http://www.iana.org/'
assert resp.headers['WARC-Date'] == '2014-01-26T20:06:24Z'
assert resp.headers['Link'] == MementoUtils.make_link('http://www.iana.org/', 'original')
assert resp.headers['Memento-Datetime'] == 'Sun, 26 Jan 2014 20:06:24 GMT'
assert json.loads(resp.headers['ResErrors']) == {"rhiz": "NotFoundException('http://webenact.rhizome.org/vvork/http://iana.org/',)"}
def test_agg_select_local_postreq(self):
@ -193,6 +212,9 @@ Host: iana.org
assert resp.headers['WARC-Target-URI'] == 'http://www.iana.org/'
assert resp.headers['WARC-Date'] == '2014-01-26T20:06:24Z'
assert resp.headers['Link'] == MementoUtils.make_link('http://www.iana.org/', 'original')
assert resp.headers['Memento-Datetime'] == 'Sun, 26 Jan 2014 20:06:24 GMT'
assert json.loads(resp.headers['ResErrors']) == {"rhiz": "NotFoundException('http://webenact.rhizome.org/vvork/http://iana.org/',)"}
def test_agg_live_postreq(self):
@ -207,7 +229,10 @@ Host: httpbin.org
assert resp.headers['WARC-Coll'] == 'live'
assert resp.headers['WARC-Target-URI'] == 'http://httpbin.org/get?foo=bar'
assert 'WARC-Date' in resp.headers
assert resp.headers['WARC-Date'] != ''
assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/get?foo=bar', 'original')
assert resp.headers['Memento-Datetime'] != ''
assert b'HTTP/1.1 200 OK' in resp.body
assert b'"foo": "bar"' in resp.body
@ -229,6 +254,11 @@ foo=bar&test=abc"""
assert resp.headers['WARC-Coll'] == 'post'
assert resp.headers['WARC-Target-URI'] == 'http://httpbin.org/post'
assert resp.headers['WARC-Date'] != ''
assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/post', 'original')
assert resp.headers['Memento-Datetime'] != ''
assert b'HTTP/1.1 200 OK' in resp.body
assert b'"foo": "bar"' in resp.body
assert b'"test": "abc"' in resp.body
@ -243,6 +273,8 @@ foo=bar&test=abc"""
assert resp.headers['WARC-Coll'] == 'post'
assert resp.headers['WARC-Target-URI'] == 'http://httpbin.org/post'
assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/post', 'original')
assert b'HTTP/1.1 200 OK' in resp.body
assert b'"foo": "bar"' in resp.body
assert b'"test": "abc"' in resp.body
@ -255,6 +287,8 @@ foo=bar&test=abc"""
assert resp.headers['WARC-Coll'] == 'live'
assert resp.headers['WARC-Target-URI'] == 'http://www.iana.org/'
assert resp.headers['Link'] == MementoUtils.make_link('http://www.iana.org/', 'original')
assert b'HTTP/1.1 200 OK' in resp.body
assert 'ResErrors' not in resp.headers
@ -265,6 +299,10 @@ foo=bar&test=abc"""
assert resp.headers['WARC-Coll'] == 'example'
assert resp.headers['WARC-Date'] == '2016-02-25T04:23:29Z'
assert resp.headers['WARC-Target-URI'] == 'http://example.com/'
assert resp.headers['Link'] == MementoUtils.make_link('http://example.com/', 'original')
assert resp.headers['Memento-Datetime'] == 'Thu, 25 Feb 2016 04:23:29 GMT'
assert b'HTTP/1.1 200 OK' in resp.body
assert 'ResErrors' not in resp.headers
@ -285,6 +323,10 @@ foo=bar&test=abc"""
assert resp.headers['WARC-Date'] == '2014-01-27T17:12:51Z'
assert resp.headers['WARC-Refers-To-Target-URI'] == 'http://example.com'
assert resp.headers['WARC-Refers-To-Date'] == '2014-01-27T17:12:00Z'
assert resp.headers['Link'] == MementoUtils.make_link('http://example.com', 'original')
assert resp.headers['Memento-Datetime'] == 'Mon, 27 Jan 2014 17:12:51 GMT'
assert b'HTTP/1.1 200 OK' in resp.body
assert b'<!doctype html>' in resp.body