1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-20 10:49:11 +01:00
pywb/webagg/test/test_handlers.py
2016-06-07 12:54:28 -04:00

448 lines
18 KiB
Python

#from gevent import monkey; monkey.patch_all(thread=False)
from collections import OrderedDict
from webagg.handlers import DefaultResourceHandler, HandlerSeq
from webagg.indexsource import MementoIndexSource, FileIndexSource, LiveIndexSource
from webagg.aggregator import GeventTimeoutAggregator, SimpleAggregator
from webagg.aggregator import DirectoryIndexSource
from webagg.app import ResAggApp
from webagg.utils import MementoUtils
from pywb.utils.statusandheaders import StatusAndHeadersParser
from pywb.utils.bufferedreaders import ChunkedDataReader
from io import BytesIO
from six.moves.urllib.parse import urlencode
import webtest
from fakeredis import FakeStrictRedis
from .testutils import to_path, FakeRedisTests, BaseTestClass
import json
# Index sources aggregated behind the '/many' route, keyed by source name.
sources = {
    # directory-backed index over the bundled test data
    'local': DirectoryIndexSource(to_path('testdata/'), ''),

    # remote memento sources, each built from its timegate url
    'ia': MementoIndexSource.from_timegate_url('http://web.archive.org/web/'),
    'rhiz': MementoIndexSource.from_timegate_url('http://webenact.rhizome.org/vvork/', path='*'),

    # live web source
    'live': LiveIndexSource(),
}
class TestResAgg(FakeRedisTests, BaseTestClass):
    """Functional tests for the routes registered on a ``ResAggApp``.

    ``setup_class`` builds one app with several routes and wraps it in a
    ``webtest.TestApp``; every test issues requests against that app and
    checks the JSON payload, the response headers and/or the WARC record
    returned in the body.

    NOTE(review): many tests address external hosts (httpbin.org,
    web.archive.org, webenact.rhizome.org, youtube.com) through the live and
    memento sources, so they presumably need network access -- confirm.
    """

    @classmethod
    def setup_class(cls):
        """Create the shared app, register all test routes, set ``cls.testapp``."""
        super(TestResAgg, cls).setup_class()

        live_source = SimpleAggregator({'live': LiveIndexSource()})
        live_handler = DefaultResourceHandler(live_source)

        app = ResAggApp()
        app.add_route('/live', live_handler)

        # all module-level ``sources`` behind one timeout-based aggregator
        source1 = GeventTimeoutAggregator(sources)
        handler1 = DefaultResourceHandler(source1, to_path('testdata/'))
        app.add_route('/many', handler1)

        source2 = SimpleAggregator({'post': FileIndexSource(to_path('testdata/post-test.cdxj'))})
        handler2 = DefaultResourceHandler(source2, to_path('testdata/'))
        app.add_route('/posttest', handler2)

        source3 = SimpleAggregator({'example': FileIndexSource(to_path('testdata/example.cdxj'))})
        handler3 = DefaultResourceHandler(source3, to_path('testdata/'))

        # handler sequences: 'example' first, then 'post', then (fallback only) live
        app.add_route('/fallback', HandlerSeq([handler3,
                                               handler2,
                                               live_handler]))

        app.add_route('/seq', HandlerSeq([handler3,
                                          handler2]))

        # same index as handler3, but the second argument is a redis url;
        # the redis tests below populate the 'test:warc' hash it names
        app.add_route('/allredis', DefaultResourceHandler(source3, 'redis://localhost/2/test:warc'))

        app.add_route('/empty', HandlerSeq([]))

        # deliberately misconfigured (a list instead of an aggregator);
        # exercised by test_error_invalid
        app.add_route('/invalid', DefaultResourceHandler([SimpleAggregator({'invalid': 'should not be a callable'})]))

        cls.testapp = webtest.TestApp(app)

    def _check_uri_date(self, resp, uri, dt):
        """Parse the WARC headers at the start of ``resp.body`` and check them.

        :param resp: response whose body is read through ``ChunkedDataReader``
            and parsed as a ``WARC/1.0`` header block
        :param uri: expected ``WARC-Target-URI`` value
        :param dt: expected ``WARC-Date`` string, or ``True`` to only require
            that a non-empty date is present
        """
        buff = ChunkedDataReader(BytesIO(resp.body))
        status_headers = StatusAndHeadersParser(['WARC/1.0']).parse(buff)

        assert status_headers.get_header('WARC-Target-URI') == uri

        warc_date = status_headers.get_header('WARC-Date')
        if dt is True:
            # truthiness rather than ``!= ''`` so that a missing header
            # (if reported as None) fails instead of passing vacuously
            assert warc_date
        else:
            assert warc_date == dt

    def _check_rhiz_not_found(self, resp, url):
        """Check that ``ResErrors`` holds only a 'rhiz' ``NotFoundException`` for ``url``.

        The expected text has the shape of an exception ``repr()`` (presumably
        that is how it is produced -- confirm). ``repr()`` of a single-argument
        exception dropped its trailing comma in Python 3.7, so both spellings
        are accepted.
        """
        errors = json.loads(resp.headers['ResErrors'])

        assert list(errors.keys()) == ['rhiz']
        assert errors['rhiz'] in ("NotFoundException('" + url + "',)",
                                  "NotFoundException('" + url + "')")

    def test_list_routes(self):
        """``/`` lists every registered route plus its ``/postreq`` variant."""
        resp = self.testapp.get('/')
        res = resp.json

        routes = ['/empty', '/fallback', '/live', '/many',
                  '/posttest', '/seq', '/allredis', '/invalid']

        expected = set(routes) | set(route + '/postreq' for route in routes)
        assert set(res.keys()) == expected

        assert res['/fallback'] == {'modes': ['list_sources', 'index', 'resource']}

    def test_list_handlers(self):
        """A bare route, or an unknown mode under it, returns the list of modes."""
        for path in ('/many', '/many/other'):
            resp = self.testapp.get(path)
            assert resp.json == {'modes': ['list_sources', 'index', 'resource']}
            assert 'ResErrors' not in resp.headers

    def test_list_errors(self):
        """``index`` and ``resource`` both reject a request without ``url``."""
        # must specify url for index or resource
        for mode in ('index', 'resource'):
            resp = self.testapp.get('/many/' + mode, status=400)
            assert resp.json == {'message': 'The "url" param is required'}
            assert resp.text == resp.headers['ResErrors']

    def test_list_sources(self):
        """``list_sources`` maps each source name to its type string."""
        resp = self.testapp.get('/many/list_sources')

        assert resp.json == {'sources': {'local': 'file_dir', 'ia': 'memento', 'rhiz': 'memento', 'live': 'live'}}
        assert 'ResErrors' not in resp.headers

    def test_live_index(self):
        """The live index returns a single JSON cdx line for the url."""
        resp = self.testapp.get('/live/index?url=http://httpbin.org/get&output=json')
        resp.charset = 'utf-8'

        cdxlist = [json.loads(cdx) for cdx in resp.text.rstrip().split('\n')]

        # the timestamp is overwritten with a fixed value before comparing
        # (presumably because the live timestamp differs on every run)
        cdxlist[0]['timestamp'] = '2016'

        assert cdxlist == [{'url': 'http://httpbin.org/get', 'urlkey': 'org,httpbin)/get', 'is_live': 'true',
                            'mime': '', 'load_url': 'http://httpbin.org/get', 'source': 'live', 'timestamp': '2016'}]

    def test_live_resource(self):
        """GET of a live resource returns a WARC record wrapping the response."""
        headers = {'foo': 'bar'}
        resp = self.testapp.get('/live/resource?url=http://httpbin.org/get?foo=bar', headers=headers)

        assert resp.headers['WebAgg-Source-Coll'] == 'live'

        self._check_uri_date(resp, 'http://httpbin.org/get?foo=bar', True)

        assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/get?foo=bar', 'original')
        assert resp.headers['Memento-Datetime'] != ''

        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body

        assert 'ResErrors' not in resp.headers

    def test_live_post_resource(self):
        """POST form data to a live resource; the posted field shows up in the body."""
        resp = self.testapp.post('/live/resource?url=http://httpbin.org/post',
                                 OrderedDict([('foo', 'bar')]))

        assert resp.headers['WebAgg-Source-Coll'] == 'live'

        self._check_uri_date(resp, 'http://httpbin.org/post', True)

        assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/post', 'original')
        assert resp.headers['Memento-Datetime'] != ''

        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body

        assert 'ResErrors' not in resp.headers

    def test_agg_select_mem_1(self):
        """``closest=20141001`` for vvork.com is served from the 'rhiz' source."""
        resp = self.testapp.get('/many/resource?url=http://vvork.com/&closest=20141001')

        assert resp.headers['WebAgg-Source-Coll'] == 'rhiz'

        self._check_uri_date(resp, 'http://www.vvork.com/', '2014-10-06T18:43:57Z')

        assert b'HTTP/1.1 200 OK' in resp.body

        assert resp.headers['Link'] == MementoUtils.make_link('http://www.vvork.com/', 'original')
        assert resp.headers['Memento-Datetime'] == 'Mon, 06 Oct 2014 18:43:57 GMT'

        assert 'ResErrors' not in resp.headers

    def test_agg_select_mem_2(self):
        """``closest=20151231`` for vvork.com is served from the 'ia' source."""
        resp = self.testapp.get('/many/resource?url=http://vvork.com/&closest=20151231')

        assert resp.headers['WebAgg-Source-Coll'] == 'ia'

        self._check_uri_date(resp, 'http://vvork.com/', '2016-01-10T13:48:55Z')

        assert b'HTTP/1.1 200 OK' in resp.body

        assert resp.headers['Link'] == MementoUtils.make_link('http://vvork.com/', 'original')
        assert resp.headers['Memento-Datetime'] == 'Sun, 10 Jan 2016 13:48:55 GMT'

        assert 'ResErrors' not in resp.headers

    def test_agg_select_live(self):
        """``closest=2016`` for vvork.com is served from the 'live' source."""
        resp = self.testapp.get('/many/resource?url=http://vvork.com/&closest=2016')

        assert resp.headers['WebAgg-Source-Coll'] == 'live'

        self._check_uri_date(resp, 'http://vvork.com/', True)

        assert resp.headers['Link'] == MementoUtils.make_link('http://vvork.com/', 'original')
        assert resp.headers['Memento-Datetime'] != ''

        assert 'ResErrors' not in resp.headers

    def test_agg_select_local(self):
        """iana.org is served from the local index; 'rhiz' reports not-found."""
        resp = self.testapp.get('/many/resource?url=http://iana.org/&closest=20140126200624')

        assert resp.headers['WebAgg-Source-Coll'] == 'local:iana.cdxj'

        self._check_uri_date(resp, 'http://www.iana.org/', '2014-01-26T20:06:24Z')

        assert resp.headers['Link'] == MementoUtils.make_link('http://www.iana.org/', 'original')
        assert resp.headers['Memento-Datetime'] == 'Sun, 26 Jan 2014 20:06:24 GMT'

        self._check_rhiz_not_found(resp, 'http://webenact.rhizome.org/vvork/http://iana.org/')

    def test_agg_select_local_postreq(self):
        """Same as ``test_agg_select_local``, with the request posted raw to ``/postreq``."""
        req_data = """\
GET / HTTP/1.1
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8
User-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36
Host: iana.org
"""

        resp = self.testapp.post('/many/resource/postreq?url=http://iana.org/&closest=20140126200624', req_data)

        assert resp.headers['WebAgg-Source-Coll'] == 'local:iana.cdxj'

        self._check_uri_date(resp, 'http://www.iana.org/', '2014-01-26T20:06:24Z')

        assert resp.headers['Link'] == MementoUtils.make_link('http://www.iana.org/', 'original')
        assert resp.headers['Memento-Datetime'] == 'Sun, 26 Jan 2014 20:06:24 GMT'

        self._check_rhiz_not_found(resp, 'http://webenact.rhizome.org/vvork/http://iana.org/')

    def test_agg_live_postreq(self):
        """A raw GET posted to ``/postreq`` with ``closest=now`` is served live."""
        req_data = """\
GET /get?foo=bar HTTP/1.1
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8
User-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36
Host: httpbin.org
"""

        resp = self.testapp.post('/many/resource/postreq?url=http://httpbin.org/get?foo=bar&closest=now', req_data)

        assert resp.headers['WebAgg-Source-Coll'] == 'live'

        self._check_uri_date(resp, 'http://httpbin.org/get?foo=bar', True)

        assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/get?foo=bar', 'original')
        assert resp.headers['Memento-Datetime'] != ''

        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body

        self._check_rhiz_not_found(resp, 'http://webenact.rhizome.org/vvork/http://httpbin.org/get?foo=bar')

    def test_agg_post_resolve_postreq(self):
        """A raw POST request is resolved against the 'post' index."""
        # The empty line separates the header block from the 16-byte body
        # announced by content-length; without it the body line would be
        # read as another header.
        req_data = """\
POST /post HTTP/1.1
content-length: 16
accept-encoding: gzip, deflate
accept: */*
host: httpbin.org
content-type: application/x-www-form-urlencoded

foo=bar&test=abc"""

        resp = self.testapp.post('/posttest/resource/postreq?url=http://httpbin.org/post', req_data)

        assert resp.headers['WebAgg-Source-Coll'] == 'post'

        self._check_uri_date(resp, 'http://httpbin.org/post', True)

        assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/post', 'original')
        assert resp.headers['Memento-Datetime'] != ''

        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body
        assert b'"test": "abc"' in resp.body
        assert b'"url": "http://httpbin.org/post"' in resp.body

        assert 'ResErrors' not in resp.headers

    def test_agg_post_resolve_fallback(self):
        """Form data posted to ``/fallback`` is answered by the 'post' handler."""
        req_data = OrderedDict([('foo', 'bar'), ('test', 'abc')])

        resp = self.testapp.post('/fallback/resource?url=http://httpbin.org/post', req_data)

        assert resp.headers['WebAgg-Source-Coll'] == 'post'

        self._check_uri_date(resp, 'http://httpbin.org/post', True)

        assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/post', 'original')

        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body
        assert b'"test": "abc"' in resp.body
        assert b'"url": "http://httpbin.org/post"' in resp.body

        assert 'ResErrors' not in resp.headers

    def test_agg_seq_fallback_1(self):
        """A url in neither file index falls through to the live handler."""
        resp = self.testapp.get('/fallback/resource?url=http://www.iana.org/')

        assert resp.headers['WebAgg-Source-Coll'] == 'live'

        self._check_uri_date(resp, 'http://www.iana.org/', True)

        assert resp.headers['Link'] == MementoUtils.make_link('http://www.iana.org/', 'original')

        assert b'HTTP/1.1 200 OK' in resp.body

        assert 'ResErrors' not in resp.headers

    def test_agg_seq_fallback_2(self):
        """A url present in the 'example' index is answered by the first handler."""
        resp = self.testapp.get('/fallback/resource?url=http://www.example.com/')

        assert resp.headers['WebAgg-Source-Coll'] == 'example'

        self._check_uri_date(resp, 'http://example.com/', '2016-02-25T04:23:29Z')

        assert resp.headers['Link'] == MementoUtils.make_link('http://example.com/', 'original')
        assert resp.headers['Memento-Datetime'] == 'Thu, 25 Feb 2016 04:23:29 GMT'

        assert b'HTTP/1.1 200 OK' in resp.body

        assert 'ResErrors' not in resp.headers

    def test_redis_warc_1(self):
        """With the WARC path stored in the redis hash, ``/allredis`` resolves it."""
        f = FakeStrictRedis.from_url('redis://localhost/2')
        f.hset('test:warc', 'example.warc.gz', './testdata/example.warc.gz')

        resp = self.testapp.get('/allredis/resource?url=http://www.example.com/')

        assert resp.headers['WebAgg-Source-Coll'] == 'example'

    def test_live_video_loader(self):
        """The youtube-dl content type yields a WARC metadata record."""
        params = {'url': 'http://www.youtube.com/v/BfBgWtAIbRc',
                  'content_type': 'application/vnd.youtube-dl_formats+json'
                 }

        resp = self.testapp.get('/live/resource', params=params)

        assert resp.headers['WebAgg-Source-Coll'] == 'live'

        self._check_uri_date(resp, 'metadata://www.youtube.com/v/BfBgWtAIbRc', True)

        assert resp.headers['Link'] == MementoUtils.make_link('metadata://www.youtube.com/v/BfBgWtAIbRc', 'original')
        assert resp.headers['Memento-Datetime'] != ''

        assert b'WARC-Type: metadata' in resp.body
        assert b'Content-Type: application/vnd.youtube-dl_formats+json' in resp.body

    def test_live_video_loader_post(self):
        """Same as ``test_live_video_loader``, with the request posted raw to ``/postreq``."""
        req_data = """\
GET /v/BfBgWtAIbRc HTTP/1.1
accept-encoding: gzip, deflate
accept: */*
host: www.youtube.com\
"""

        params = {'url': 'http://www.youtube.com/v/BfBgWtAIbRc',
                  'content_type': 'application/vnd.youtube-dl_formats+json'
                 }

        resp = self.testapp.post('/live/resource/postreq?&' + urlencode(params), req_data)

        assert resp.headers['WebAgg-Source-Coll'] == 'live'

        self._check_uri_date(resp, 'metadata://www.youtube.com/v/BfBgWtAIbRc', True)

        assert resp.headers['Link'] == MementoUtils.make_link('metadata://www.youtube.com/v/BfBgWtAIbRc', 'original')
        assert resp.headers['Memento-Datetime'] != ''

        assert b'WARC-Type: metadata' in resp.body
        assert b'Content-Type: application/vnd.youtube-dl_formats+json' in resp.body

    def test_error_redis_file_not_found(self):
        """``/allredis`` answers 503 for a bad path, a missing field and a missing hash."""
        f = FakeStrictRedis.from_url('redis://localhost/2')

        # hash field points at a file name that differs from the one used
        # in test_redis_warc_1
        f.hset('test:warc', 'example.warc.gz', './testdata/example2.warc.gz')

        resp = self.testapp.get('/allredis/resource?url=http://www.example.com/', status=503)
        assert resp.json['message'] == "example.warc.gz: [Errno 2] No such file or directory: './testdata/example2.warc.gz'"

        # field removed from the hash
        f.hdel('test:warc', 'example.warc.gz')

        resp = self.testapp.get('/allredis/resource?url=http://www.example.com/', status=503)
        assert resp.json == {'message': 'example.warc.gz: Archive File Not Found',
                             'errors': {'WARCPathLoader': 'example.warc.gz: Archive File Not Found'}}

        # whole hash key removed
        f.delete('test:warc')

        resp = self.testapp.get('/allredis/resource?url=http://www.example.com/', status=503)
        assert resp.json == {'message': 'example.warc.gz: Archive File Not Found',
                             'errors': {'WARCPathLoader': 'example.warc.gz: Archive File Not Found'}}

    def test_error_fallback_live_not_found(self):
        """An unresolvable url on ``/fallback`` yields 400 with a ``LiveWebLoader`` error."""
        resp = self.testapp.get('/fallback/resource?url=http://invalid.url-not-found', status=400)

        assert resp.json == {'message': 'http://invalid.url-not-found/',
                             'errors': {'LiveWebLoader': 'http://invalid.url-not-found/'}}

        assert resp.text == resp.headers['ResErrors']

    def test_agg_local_revisit(self):
        """A local revisit record carries ``WARC-Refers-To-*`` headers and the payload."""
        resp = self.testapp.get('/many/resource?url=http://www.example.com/&closest=20140127171251&sources=local')

        assert resp.headers['WebAgg-Source-Coll'] == 'local:dupes.cdxj'

        # parsed directly from the body here (no ChunkedDataReader wrapper,
        # unlike _check_uri_date)
        buff = BytesIO(resp.body)
        status_headers = StatusAndHeadersParser(['WARC/1.0']).parse(buff)

        assert status_headers.get_header('WARC-Target-URI') == 'http://example.com'
        assert status_headers.get_header('WARC-Date') == '2014-01-27T17:12:51Z'
        assert status_headers.get_header('WARC-Refers-To-Target-URI') == 'http://example.com'
        assert status_headers.get_header('WARC-Refers-To-Date') == '2014-01-27T17:12:00Z'

        assert resp.headers['Link'] == MementoUtils.make_link('http://example.com', 'original')
        assert resp.headers['Memento-Datetime'] == 'Mon, 27 Jan 2014 17:12:51 GMT'

        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'<!doctype html>' in resp.body

        assert 'ResErrors' not in resp.headers

    def test_error_invalid_index_output(self):
        """An unsupported ``output`` value on the index yields 400."""
        resp = self.testapp.get('/live/index?url=http://httpbin.org/get&output=foobar', status=400)

        assert resp.json == {'message': 'output=foobar not supported'}
        assert resp.text == resp.headers['ResErrors']

    def test_error_local_not_found(self):
        """A url missing from the local source yields 404."""
        resp = self.testapp.get('/many/resource?url=http://not-found.error/&sources=local', status=404)

        assert resp.json == {'message': 'No Resource Found'}
        assert resp.text == resp.headers['ResErrors']

    def test_error_empty(self):
        """An empty handler sequence yields 404."""
        resp = self.testapp.get('/empty/resource?url=http://example.com/', status=404)

        assert resp.json == {'message': 'No Resource Found'}
        assert resp.text == resp.headers['ResErrors']

    def test_error_invalid(self):
        """The misconfigured ``/invalid`` route yields 500 with the internal error text."""
        resp = self.testapp.get('/invalid/resource?url=http://example.com/', status=500)

        assert resp.json == {'message': "Internal Error: 'list' object is not callable"}
        assert resp.text == resp.headers['ResErrors']