mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 08:04:49 +01:00
add memento headers to all response loaders, use BaseLoader base class, update tests
for memento headers
This commit is contained in:
parent
65e969a492
commit
98830147b5
@ -1,7 +1,10 @@
|
||||
from rezag.liverec import BaseRecorder
|
||||
from rezag.liverec import request as remote_request
|
||||
|
||||
from rezag.utils import MementoUtils
|
||||
|
||||
from pywb.utils.timeutils import timestamp_to_datetime, datetime_to_http_date
|
||||
from pywb.utils.timeutils import iso_date_to_datetime
|
||||
from pywb.utils.wbexception import LiveResourceException
|
||||
from pywb.warc.resolvingloader import ResolvingLoader
|
||||
|
||||
@ -46,7 +49,28 @@ class StreamIter(object):
|
||||
|
||||
|
||||
#=============================================================================
|
||||
class WARCPathLoader(object):
|
||||
class BaseLoader(object):
    """Base class for resource loaders that adds shared response headers.

    Subclasses implement ``_load_resource``. Calling a loader instance runs
    that hook and, when it yields a truthy result, sets the ``WARC-Coll``,
    ``Link`` and ``Memento-Datetime`` headers on ``response``.
    """

    def __call__(self, cdx, params):
        """Load the resource for ``cdx`` and add the common headers.

        :param cdx: index entry; its optional ``'source'`` value is copied
            into the ``WARC-Coll`` header (empty string when absent).
        :param params: passed through unchanged to ``_load_resource``.
        :return: whatever ``_load_resource`` returned. A falsy result is
            returned as-is and no headers are touched.
        """
        res = self._load_resource(cdx, params)
        if not res:
            return res

        # NOTE(review): ``response`` is not defined in this block; presumably
        # a module-level response object shared with the subclasses -- confirm.
        response.headers['WARC-Coll'] = cdx.get('source', '')

        # Relies on ``WARC-Target-URI`` and ``WARC-Date`` already being
        # present in the response headers by the time the hook returns.
        response.headers['Link'] = MementoUtils.make_link(
            response.headers['WARC-Target-URI'],
            'original')

        memento_dt = iso_date_to_datetime(response.headers['WARC-Date'])
        response.headers['Memento-Datetime'] = datetime_to_http_date(memento_dt)
        return res

    def _load_resource(self, cdx, params):  #pragma: no cover
        """Hook that subclasses override to actually load the resource.

        :raises NotImplementedError: always, in this base implementation.
        """
        # ``NotImplemented`` is a non-callable constant meant for rich
        # comparisons; ``NotImplementedError`` is the exception for an
        # abstract method.
        raise NotImplementedError()
|
||||
|
||||
|
||||
#=============================================================================
|
||||
class WARCPathLoader(BaseLoader):
|
||||
def __init__(self, paths, cdx_source):
|
||||
self.paths = paths
|
||||
if isinstance(paths, str):
|
||||
@ -77,8 +101,7 @@ class WARCPathLoader(object):
|
||||
|
||||
yield check
|
||||
|
||||
|
||||
def __call__(self, cdx, params):
|
||||
def _load_resource(self, cdx, params):
|
||||
if not cdx.get('filename') or cdx.get('offset') is None:
|
||||
return None
|
||||
|
||||
@ -94,8 +117,6 @@ class WARCPathLoader(object):
|
||||
for n, v in record.rec_headers.headers:
|
||||
response.headers[n] = v
|
||||
|
||||
response.headers['WARC-Coll'] = cdx.get('source')
|
||||
|
||||
if headers != payload:
|
||||
response.headers['WARC-Target-URI'] = headers.rec_headers.get_header('WARC-Target-URI')
|
||||
response.headers['WARC-Date'] = headers.rec_headers.get_header('WARC-Date')
|
||||
@ -103,8 +124,7 @@ class WARCPathLoader(object):
|
||||
response.headers['WARC-Refers-To-Date'] = payload.rec_headers.get_header('WARC-Date')
|
||||
headers.stream.close()
|
||||
|
||||
res = StreamIter(record.stream)
|
||||
return res
|
||||
return StreamIter(record.stream)
|
||||
|
||||
def __str__(self):
|
||||
return 'WARCPathLoader'
|
||||
@ -133,13 +153,13 @@ class HeaderRecorder(BaseRecorder):
|
||||
|
||||
|
||||
#=============================================================================
|
||||
class LiveWebLoader(object):
|
||||
class LiveWebLoader(BaseLoader):
|
||||
SKIP_HEADERS = (b'link',
|
||||
b'memento-datetime',
|
||||
b'content-location',
|
||||
b'x-archive')
|
||||
|
||||
def __call__(self, cdx, params):
|
||||
def _load_resource(self, cdx, params):
|
||||
load_url = cdx.get('load_url')
|
||||
if not load_url:
|
||||
return None
|
||||
@ -185,7 +205,6 @@ class LiveWebLoader(object):
|
||||
#response.headers['WARC-Record-ID'] = self._make_warc_id()
|
||||
response.headers['WARC-Target-URI'] = cdx['url']
|
||||
response.headers['WARC-Date'] = self._make_date(dt)
|
||||
response.headers['WARC-Coll'] = cdx.get('source', '')
|
||||
|
||||
# Try to set content-length, if it is available and valid
|
||||
try:
|
||||
@ -193,8 +212,8 @@ class LiveWebLoader(object):
|
||||
if content_len > 0:
|
||||
content_len += len(resp_headers)
|
||||
response.headers['Content-Length'] = content_len
|
||||
except:
|
||||
raise
|
||||
except (KeyError, TypeError):
|
||||
pass
|
||||
|
||||
return StreamIter(upstream_res.raw, header=resp_headers)
|
||||
|
||||
|
@ -98,3 +98,7 @@ class MementoUtils(object):
|
||||
# last memento link, if any
|
||||
if prev_cdx:
|
||||
yield MementoUtils.make_timemap_memento_link(prev_cdx, end='\n')
|
||||
|
||||
@staticmethod
|
||||
def make_link(url, type):
|
||||
return '<{0}>; rel="{1}"'.format(url, type)
|
||||
|
@ -9,6 +9,7 @@ from rezag.aggindexsource import GeventTimeoutAggregator, SimpleAggregator
|
||||
from rezag.aggindexsource import DirectoryIndexSource
|
||||
|
||||
from rezag.app import add_route, application
|
||||
from rezag.utils import MementoUtils
|
||||
|
||||
import webtest
|
||||
import bottle
|
||||
@ -121,7 +122,10 @@ class TestResAgg(object):
|
||||
|
||||
assert resp.headers['WARC-Coll'] == 'live'
|
||||
assert resp.headers['WARC-Target-URI'] == 'http://httpbin.org/get?foo=bar'
|
||||
assert 'WARC-Date' in resp.headers
|
||||
assert resp.headers['WARC-Date'] != ''
|
||||
|
||||
assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/get?foo=bar', 'original')
|
||||
assert resp.headers['Memento-Datetime'] != ''
|
||||
|
||||
assert b'HTTP/1.1 200 OK' in resp.body
|
||||
assert b'"foo": "bar"' in resp.body
|
||||
@ -134,7 +138,10 @@ class TestResAgg(object):
|
||||
|
||||
assert resp.headers['WARC-Coll'] == 'live'
|
||||
assert resp.headers['WARC-Target-URI'] == 'http://httpbin.org/post'
|
||||
assert 'WARC-Date' in resp.headers
|
||||
assert resp.headers['WARC-Date'] != ''
|
||||
|
||||
assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/post', 'original')
|
||||
assert resp.headers['Memento-Datetime'] != ''
|
||||
|
||||
assert b'HTTP/1.1 200 OK' in resp.body
|
||||
assert b'"foo": "bar"' in resp.body
|
||||
@ -149,6 +156,9 @@ class TestResAgg(object):
|
||||
assert resp.headers['WARC-Date'] == '2014-10-06T18:43:57Z'
|
||||
assert b'HTTP/1.1 200 OK' in resp.body
|
||||
|
||||
assert resp.headers['Link'] == MementoUtils.make_link('http://www.vvork.com/', 'original')
|
||||
assert resp.headers['Memento-Datetime'] == 'Mon, 06 Oct 2014 18:43:57 GMT'
|
||||
|
||||
assert 'ResErrors' not in resp.headers
|
||||
|
||||
def test_agg_select_mem_2(self):
|
||||
@ -159,6 +169,9 @@ class TestResAgg(object):
|
||||
assert resp.headers['WARC-Date'] == '2016-01-10T13:48:55Z'
|
||||
assert b'HTTP/1.1 200 OK' in resp.body
|
||||
|
||||
assert resp.headers['Link'] == MementoUtils.make_link('http://vvork.com/', 'original')
|
||||
assert resp.headers['Memento-Datetime'] == 'Sun, 10 Jan 2016 13:48:55 GMT'
|
||||
|
||||
assert 'ResErrors' not in resp.headers
|
||||
|
||||
def test_agg_select_live(self):
|
||||
@ -168,6 +181,9 @@ class TestResAgg(object):
|
||||
assert resp.headers['WARC-Target-URI'] == 'http://vvork.com/'
|
||||
assert resp.headers['WARC-Date'] != ''
|
||||
|
||||
assert resp.headers['Link'] == MementoUtils.make_link('http://vvork.com/', 'original')
|
||||
assert resp.headers['Memento-Datetime'] != ''
|
||||
|
||||
assert 'ResErrors' not in resp.headers
|
||||
|
||||
def test_agg_select_local(self):
|
||||
@ -177,6 +193,9 @@ class TestResAgg(object):
|
||||
assert resp.headers['WARC-Target-URI'] == 'http://www.iana.org/'
|
||||
assert resp.headers['WARC-Date'] == '2014-01-26T20:06:24Z'
|
||||
|
||||
assert resp.headers['Link'] == MementoUtils.make_link('http://www.iana.org/', 'original')
|
||||
assert resp.headers['Memento-Datetime'] == 'Sun, 26 Jan 2014 20:06:24 GMT'
|
||||
|
||||
assert json.loads(resp.headers['ResErrors']) == {"rhiz": "NotFoundException('http://webenact.rhizome.org/vvork/http://iana.org/',)"}
|
||||
|
||||
def test_agg_select_local_postreq(self):
|
||||
@ -193,6 +212,9 @@ Host: iana.org
|
||||
assert resp.headers['WARC-Target-URI'] == 'http://www.iana.org/'
|
||||
assert resp.headers['WARC-Date'] == '2014-01-26T20:06:24Z'
|
||||
|
||||
assert resp.headers['Link'] == MementoUtils.make_link('http://www.iana.org/', 'original')
|
||||
assert resp.headers['Memento-Datetime'] == 'Sun, 26 Jan 2014 20:06:24 GMT'
|
||||
|
||||
assert json.loads(resp.headers['ResErrors']) == {"rhiz": "NotFoundException('http://webenact.rhizome.org/vvork/http://iana.org/',)"}
|
||||
|
||||
def test_agg_live_postreq(self):
|
||||
@ -207,7 +229,10 @@ Host: httpbin.org
|
||||
|
||||
assert resp.headers['WARC-Coll'] == 'live'
|
||||
assert resp.headers['WARC-Target-URI'] == 'http://httpbin.org/get?foo=bar'
|
||||
assert 'WARC-Date' in resp.headers
|
||||
assert resp.headers['WARC-Date'] != ''
|
||||
|
||||
assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/get?foo=bar', 'original')
|
||||
assert resp.headers['Memento-Datetime'] != ''
|
||||
|
||||
assert b'HTTP/1.1 200 OK' in resp.body
|
||||
assert b'"foo": "bar"' in resp.body
|
||||
@ -229,6 +254,11 @@ foo=bar&test=abc"""
|
||||
|
||||
assert resp.headers['WARC-Coll'] == 'post'
|
||||
assert resp.headers['WARC-Target-URI'] == 'http://httpbin.org/post'
|
||||
assert resp.headers['WARC-Date'] != ''
|
||||
|
||||
assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/post', 'original')
|
||||
assert resp.headers['Memento-Datetime'] != ''
|
||||
|
||||
assert b'HTTP/1.1 200 OK' in resp.body
|
||||
assert b'"foo": "bar"' in resp.body
|
||||
assert b'"test": "abc"' in resp.body
|
||||
@ -243,6 +273,8 @@ foo=bar&test=abc"""
|
||||
|
||||
assert resp.headers['WARC-Coll'] == 'post'
|
||||
assert resp.headers['WARC-Target-URI'] == 'http://httpbin.org/post'
|
||||
assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/post', 'original')
|
||||
|
||||
assert b'HTTP/1.1 200 OK' in resp.body
|
||||
assert b'"foo": "bar"' in resp.body
|
||||
assert b'"test": "abc"' in resp.body
|
||||
@ -255,6 +287,8 @@ foo=bar&test=abc"""
|
||||
|
||||
assert resp.headers['WARC-Coll'] == 'live'
|
||||
assert resp.headers['WARC-Target-URI'] == 'http://www.iana.org/'
|
||||
assert resp.headers['Link'] == MementoUtils.make_link('http://www.iana.org/', 'original')
|
||||
|
||||
assert b'HTTP/1.1 200 OK' in resp.body
|
||||
|
||||
assert 'ResErrors' not in resp.headers
|
||||
@ -265,6 +299,10 @@ foo=bar&test=abc"""
|
||||
assert resp.headers['WARC-Coll'] == 'example'
|
||||
assert resp.headers['WARC-Date'] == '2016-02-25T04:23:29Z'
|
||||
assert resp.headers['WARC-Target-URI'] == 'http://example.com/'
|
||||
|
||||
assert resp.headers['Link'] == MementoUtils.make_link('http://example.com/', 'original')
|
||||
assert resp.headers['Memento-Datetime'] == 'Thu, 25 Feb 2016 04:23:29 GMT'
|
||||
|
||||
assert b'HTTP/1.1 200 OK' in resp.body
|
||||
|
||||
assert 'ResErrors' not in resp.headers
|
||||
@ -285,6 +323,10 @@ foo=bar&test=abc"""
|
||||
assert resp.headers['WARC-Date'] == '2014-01-27T17:12:51Z'
|
||||
assert resp.headers['WARC-Refers-To-Target-URI'] == 'http://example.com'
|
||||
assert resp.headers['WARC-Refers-To-Date'] == '2014-01-27T17:12:00Z'
|
||||
|
||||
assert resp.headers['Link'] == MementoUtils.make_link('http://example.com', 'original')
|
||||
assert resp.headers['Memento-Datetime'] == 'Mon, 27 Jan 2014 17:12:51 GMT'
|
||||
|
||||
assert b'HTTP/1.1 200 OK' in resp.body
|
||||
assert b'<!doctype html>' in resp.body
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user