#from gevent import monkey; monkey.patch_all(thread=False)
"""Tests for the webagg ``ResAggApp`` routes, driven through ``webtest.TestApp``."""

from collections import OrderedDict

from webagg.handlers import DefaultResourceHandler, HandlerSeq
from webagg.indexsource import MementoIndexSource, FileIndexSource, LiveIndexSource
from webagg.aggregator import GeventTimeoutAggregator, SimpleAggregator
from webagg.aggregator import DirectoryIndexSource

from webagg.app import ResAggApp
from webagg.utils import MementoUtils

from pywb.utils.statusandheaders import StatusAndHeadersParser
from pywb.utils.bufferedreaders import ChunkedDataReader
from io import BytesIO
from six.moves.urllib.parse import urlencode

import webtest
from fakeredis import FakeStrictRedis

from .testutils import to_path, FakeRedisTests, BaseTestClass

import json


# Index sources aggregated behind the '/many' route (see setup_class).
sources = {
    'local': DirectoryIndexSource(to_path('testdata/'), ''),
    'ia': MementoIndexSource.from_timegate_url('http://web.archive.org/web/'),
    'rhiz': MementoIndexSource.from_timegate_url('http://webenact.rhizome.org/vvork/', path='*'),
    'live': LiveIndexSource(),
}


class TestResAgg(FakeRedisTests, BaseTestClass):
    """Exercise every route registered on a ``ResAggApp`` built in ``setup_class``."""

    @classmethod
    def setup_class(cls):
        """Build the app, register all test routes and store ``cls.testapp``."""
        super(TestResAgg, cls).setup_class()

        live_source = SimpleAggregator({'live': LiveIndexSource()})
        live_handler = DefaultResourceHandler(live_source)

        app = ResAggApp()
        app.add_route('/live', live_handler)

        source1 = GeventTimeoutAggregator(sources)
        handler1 = DefaultResourceHandler(source1, to_path('testdata/'))
        app.add_route('/many', handler1)

        source2 = SimpleAggregator(
            {'post': FileIndexSource(to_path('testdata/post-test.cdxj'))})
        handler2 = DefaultResourceHandler(source2, to_path('testdata/'))
        app.add_route('/posttest', handler2)

        source3 = SimpleAggregator(
            {'example': FileIndexSource(to_path('testdata/example.cdxj'))})
        handler3 = DefaultResourceHandler(source3, to_path('testdata/'))

        app.add_route('/fallback', HandlerSeq([handler3, handler2, live_handler]))
        app.add_route('/seq', HandlerSeq([handler3, handler2]))

        app.add_route('/allredis',
                      DefaultResourceHandler(source3, 'redis://localhost/2/test:warc'))

        app.add_route('/empty', HandlerSeq([]))

        # Deliberately malformed handler; test_error_invalid expects a 500 from it.
        app.add_route('/invalid',
                      DefaultResourceHandler(
                          [SimpleAggregator({'invalid': 'should not be a callable'})]))

        cls.testapp = webtest.TestApp(app)

    def _check_uri_date(self, resp, uri, dt):
        """Parse the WARC headers from ``resp.body`` and check URI and date.

        :param resp: response whose body is read through ``ChunkedDataReader``
        :param uri: expected ``WARC-Target-URI`` value
        :param dt: expected ``WARC-Date`` value, or ``True`` to only require
                   that the header is not the empty string
        """
        buff = ChunkedDataReader(BytesIO(resp.body))
        status_headers = StatusAndHeadersParser(['WARC/1.0']).parse(buff)

        assert status_headers.get_header('WARC-Target-URI') == uri

        # ``True`` is a sentinel, so compare by identity rather than equality.
        if dt is True:
            assert status_headers.get_header('WARC-Date') != ''
        else:
            assert status_headers.get_header('WARC-Date') == dt

    def test_list_routes(self):
        """The root lists every registered route plus its '/postreq' variant."""
        resp = self.testapp.get('/')
        res = resp.json

        assert set(res.keys()) == {
            '/empty', '/empty/postreq',
            '/fallback', '/fallback/postreq',
            '/live', '/live/postreq',
            '/many', '/many/postreq',
            '/posttest', '/posttest/postreq',
            '/seq', '/seq/postreq',
            '/allredis', '/allredis/postreq',
            '/invalid', '/invalid/postreq',
        }

        assert res['/fallback'] == {'modes': ['list_sources', 'index', 'resource']}

    def test_list_handlers(self):
        """A route, with or without an unknown sub-path, lists its modes."""
        for path in ('/many', '/many/other'):
            resp = self.testapp.get(path)
            assert resp.json == {'modes': ['list_sources', 'index', 'resource']}
            assert 'ResErrors' not in resp.headers

    def test_list_errors(self):
        """'index' and 'resource' both answer 400 when 'url' is missing."""
        # must specify url for index or resource
        for path in ('/many/index', '/many/resource'):
            resp = self.testapp.get(path, status=400)
            assert resp.json == {'message': 'The "url" param is required'}
            assert resp.text == resp.headers['ResErrors']

    def test_list_sources(self):
        """'list_sources' reports the type name of each source behind '/many'."""
        resp = self.testapp.get('/many/list_sources')

        assert resp.json == {'sources': {'local': 'file_dir',
                                         'ia': 'memento',
                                         'rhiz': 'memento',
                                         'live': 'live'}}
        assert 'ResErrors' not in resp.headers

    def test_live_index(self):
        """The live index returns one JSON cdx line for the requested url."""
        resp = self.testapp.get('/live/index?url=http://httpbin.org/get&output=json')
        resp.charset = 'utf-8'

        cdxlist = [json.loads(cdx) for cdx in resp.text.rstrip().split('\n')]

        # The timestamp varies per run, so pin it before comparing.
        cdxlist[0]['timestamp'] = '2016'

        assert cdxlist == [{'url': 'http://httpbin.org/get',
                            'urlkey': 'org,httpbin)/get',
                            'is_live': 'true',
                            'mime': '',
                            'load_url': 'http://httpbin.org/get',
                            'source': 'live',
                            'timestamp': '2016'}]

    def test_live_resource(self):
        """GET on the live route returns a record for the url with no errors."""
        headers = {'foo': 'bar'}
        resp = self.testapp.get('/live/resource?url=http://httpbin.org/get?foo=bar',
                                headers=headers)

        assert resp.headers['WebAgg-Source-Coll'] == 'live'

        self._check_uri_date(resp, 'http://httpbin.org/get?foo=bar', True)

        assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/get?foo=bar', 'original')
        assert resp.headers['Memento-Datetime'] != ''

        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body

        assert 'ResErrors' not in resp.headers

    def test_live_post_resource(self):
        """POST form data on the live route is reflected in the response body."""
        resp = self.testapp.post('/live/resource?url=http://httpbin.org/post',
                                 OrderedDict([('foo', 'bar')]))

        assert resp.headers['WebAgg-Source-Coll'] == 'live'

        self._check_uri_date(resp, 'http://httpbin.org/post', True)

        assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/post', 'original')
        assert resp.headers['Memento-Datetime'] != ''

        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body

        assert 'ResErrors' not in resp.headers

    def test_agg_select_mem_1(self):
        """closest=20141001 on '/many' is answered from the 'rhiz' source."""
        resp = self.testapp.get('/many/resource?url=http://vvork.com/&closest=20141001')

        assert resp.headers['WebAgg-Source-Coll'] == 'rhiz'

        self._check_uri_date(resp, 'http://www.vvork.com/', '2014-10-06T18:43:57Z')

        assert b'HTTP/1.1 200 OK' in resp.body

        assert resp.headers['Link'] == MementoUtils.make_link('http://www.vvork.com/', 'original')
        assert resp.headers['Memento-Datetime'] == 'Mon, 06 Oct 2014 18:43:57 GMT'

        assert 'ResErrors' not in resp.headers

    def test_agg_select_mem_2(self):
        """closest=20151231 on '/many' is answered from the 'ia' source."""
        resp = self.testapp.get('/many/resource?url=http://vvork.com/&closest=20151231')

        assert resp.headers['WebAgg-Source-Coll'] == 'ia'

        self._check_uri_date(resp, 'http://vvork.com/', '2016-01-10T13:48:55Z')

        assert b'HTTP/1.1 200 OK' in resp.body

        assert resp.headers['Link'] == MementoUtils.make_link('http://vvork.com/', 'original')
        assert resp.headers['Memento-Datetime'] == 'Sun, 10 Jan 2016 13:48:55 GMT'

        assert 'ResErrors' not in resp.headers

    def test_agg_select_live(self):
        """closest=2016 on '/many' is answered from the 'live' source."""
        resp = self.testapp.get('/many/resource?url=http://vvork.com/&closest=2016')

        assert resp.headers['WebAgg-Source-Coll'] == 'live'

        self._check_uri_date(resp, 'http://vvork.com/', True)

        assert resp.headers['Link'] == MementoUtils.make_link('http://vvork.com/', 'original')
        assert resp.headers['Memento-Datetime'] != ''

        assert 'ResErrors' not in resp.headers

    def test_agg_select_local(self):
        """An iana.org capture comes from 'local'; the 'rhiz' miss is reported."""
        resp = self.testapp.get('/many/resource?url=http://iana.org/&closest=20140126200624')

        assert resp.headers['WebAgg-Source-Coll'] == 'local:iana.cdxj'

        self._check_uri_date(resp, 'http://www.iana.org/', '2014-01-26T20:06:24Z')

        assert resp.headers['Link'] == MementoUtils.make_link('http://www.iana.org/', 'original')
        assert resp.headers['Memento-Datetime'] == 'Sun, 26 Jan 2014 20:06:24 GMT'

        # NOTE(review): the expected text looks like an exception repr with the
        # pre-Python-3.7 trailing comma -- confirm against supported versions.
        assert json.loads(resp.headers['ResErrors']) == {"rhiz": "NotFoundException('http://webenact.rhizome.org/vvork/http://iana.org/',)"}

    def test_agg_select_local_postreq(self):
        """Same as test_agg_select_local, posting a raw GET request to 'postreq'."""
        req_data = """\
GET / HTTP/1.1
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8
User-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36
Host: iana.org
"""

        resp = self.testapp.post('/many/resource/postreq?url=http://iana.org/&closest=20140126200624', req_data)

        assert resp.headers['WebAgg-Source-Coll'] == 'local:iana.cdxj'

        self._check_uri_date(resp, 'http://www.iana.org/', '2014-01-26T20:06:24Z')

        assert resp.headers['Link'] == MementoUtils.make_link('http://www.iana.org/', 'original')
        assert resp.headers['Memento-Datetime'] == 'Sun, 26 Jan 2014 20:06:24 GMT'

        # NOTE(review): see the trailing-comma remark on exception reprs; the
        # expected string may be Python-version dependent -- confirm.
        assert json.loads(resp.headers['ResErrors']) == {"rhiz": "NotFoundException('http://webenact.rhizome.org/vvork/http://iana.org/',)"}

    def test_agg_live_postreq(self):
        """A raw GET posted to '/many' with closest=now is answered by 'live'."""
        req_data = """\
GET /get?foo=bar HTTP/1.1
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8
User-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36
Host: httpbin.org
"""

        resp = self.testapp.post('/many/resource/postreq?url=http://httpbin.org/get?foo=bar&closest=now', req_data)

        assert resp.headers['WebAgg-Source-Coll'] == 'live'

        self._check_uri_date(resp, 'http://httpbin.org/get?foo=bar', True)

        assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/get?foo=bar', 'original')
        assert resp.headers['Memento-Datetime'] != ''

        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body

        # NOTE(review): expected repr text may be Python-version dependent -- confirm.
        assert json.loads(resp.headers['ResErrors']) == {"rhiz": "NotFoundException('http://webenact.rhizome.org/vvork/http://httpbin.org/get?foo=bar',)"}

    def test_agg_post_resolve_postreq(self):
        """A raw POST request sent to '/posttest' resolves to the 'post' source."""
        # The blank line separates the headers from the 16-byte body that
        # 'content-length: 16' declares.
        req_data = """\
POST /post HTTP/1.1
content-length: 16
accept-encoding: gzip, deflate
accept: */*
host: httpbin.org
content-type: application/x-www-form-urlencoded

foo=bar&test=abc"""

        resp = self.testapp.post('/posttest/resource/postreq?url=http://httpbin.org/post', req_data)

        assert resp.headers['WebAgg-Source-Coll'] == 'post'

        self._check_uri_date(resp, 'http://httpbin.org/post', True)

        assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/post', 'original')
        assert resp.headers['Memento-Datetime'] != ''

        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body
        assert b'"test": "abc"' in resp.body
        assert b'"url": "http://httpbin.org/post"' in resp.body

        assert 'ResErrors' not in resp.headers

    def test_agg_post_resolve_fallback(self):
        """Form data posted to '/fallback' resolves to the 'post' source."""
        req_data = OrderedDict([('foo', 'bar'), ('test', 'abc')])

        resp = self.testapp.post('/fallback/resource?url=http://httpbin.org/post', req_data)

        assert resp.headers['WebAgg-Source-Coll'] == 'post'

        self._check_uri_date(resp, 'http://httpbin.org/post', True)

        assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/post', 'original')

        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body
        assert b'"test": "abc"' in resp.body
        assert b'"url": "http://httpbin.org/post"' in resp.body

        assert 'ResErrors' not in resp.headers

    def test_agg_seq_fallback_1(self):
        """'/fallback' answers www.iana.org from 'live' without reporting errors."""
        resp = self.testapp.get('/fallback/resource?url=http://www.iana.org/')

        assert resp.headers['WebAgg-Source-Coll'] == 'live'

        self._check_uri_date(resp, 'http://www.iana.org/', True)

        assert resp.headers['Link'] == MementoUtils.make_link('http://www.iana.org/', 'original')

        assert b'HTTP/1.1 200 OK' in resp.body

        assert 'ResErrors' not in resp.headers

    def test_agg_seq_fallback_2(self):
        """'/fallback' answers www.example.com from the 'example' source."""
        resp = self.testapp.get('/fallback/resource?url=http://www.example.com/')

        assert resp.headers['WebAgg-Source-Coll'] == 'example'

        self._check_uri_date(resp, 'http://example.com/', '2016-02-25T04:23:29Z')

        assert resp.headers['Link'] == MementoUtils.make_link('http://example.com/', 'original')
        assert resp.headers['Memento-Datetime'] == 'Thu, 25 Feb 2016 04:23:29 GMT'

        assert b'HTTP/1.1 200 OK' in resp.body

        assert 'ResErrors' not in resp.headers

    def test_redis_warc_1(self):
        """'/allredis' serves example.com once the redis hash maps the WARC path."""
        f = FakeStrictRedis.from_url('redis://localhost/2')
        f.hset('test:warc', 'example.warc.gz', './testdata/example.warc.gz')

        resp = self.testapp.get('/allredis/resource?url=http://www.example.com/')

        assert resp.headers['WebAgg-Source-Coll'] == 'example'

    def test_live_video_loader(self):
        """Requesting the youtube-dl content type yields a metadata record."""
        params = {
            'url': 'http://www.youtube.com/v/BfBgWtAIbRc',
            'content_type': 'application/vnd.youtube-dl_formats+json',
        }

        resp = self.testapp.get('/live/resource', params=params)

        assert resp.headers['WebAgg-Source-Coll'] == 'live'

        self._check_uri_date(resp, 'metadata://www.youtube.com/v/BfBgWtAIbRc', True)

        assert resp.headers['Link'] == MementoUtils.make_link('metadata://www.youtube.com/v/BfBgWtAIbRc', 'original')
        assert resp.headers['Memento-Datetime'] != ''

        assert b'WARC-Type: metadata' in resp.body
        assert b'Content-Type: application/vnd.youtube-dl_formats+json' in resp.body

    def test_live_video_loader_post(self):
        """Same as test_live_video_loader, posting a raw request to 'postreq'."""
        # The trailing backslash keeps the request from ending in a newline.
        req_data = """\
GET /v/BfBgWtAIbRc HTTP/1.1
accept-encoding: gzip, deflate
accept: */*
host: www.youtube.com\
"""

        params = {
            'url': 'http://www.youtube.com/v/BfBgWtAIbRc',
            'content_type': 'application/vnd.youtube-dl_formats+json',
        }

        resp = self.testapp.post('/live/resource/postreq?&' + urlencode(params), req_data)

        assert resp.headers['WebAgg-Source-Coll'] == 'live'

        self._check_uri_date(resp, 'metadata://www.youtube.com/v/BfBgWtAIbRc', True)

        assert resp.headers['Link'] == MementoUtils.make_link('metadata://www.youtube.com/v/BfBgWtAIbRc', 'original')
        assert resp.headers['Memento-Datetime'] != ''

        assert b'WARC-Type: metadata' in resp.body
        assert b'Content-Type: application/vnd.youtube-dl_formats+json' in resp.body

    def test_error_redis_file_not_found(self):
        """'/allredis' answers 503 for a bad path, a missing field, a missing key."""
        f = FakeStrictRedis.from_url('redis://localhost/2')

        # 1) Hash entry exists but points at a file that is not there.
        f.hset('test:warc', 'example.warc.gz', './testdata/example2.warc.gz')

        resp = self.testapp.get('/allredis/resource?url=http://www.example.com/', status=503)
        assert resp.json['message'] == "example.warc.gz: [Errno 2] No such file or directory: './testdata/example2.warc.gz'"

        # 2) Field removed from the hash.
        f.hdel('test:warc', 'example.warc.gz')

        resp = self.testapp.get('/allredis/resource?url=http://www.example.com/', status=503)
        assert resp.json == {'message': 'example.warc.gz: Archive File Not Found',
                             'errors': {'WARCPathLoader': 'example.warc.gz: Archive File Not Found'}}

        # 3) Whole key removed.
        f.delete('test:warc')

        resp = self.testapp.get('/allredis/resource?url=http://www.example.com/', status=503)
        assert resp.json == {'message': 'example.warc.gz: Archive File Not Found',
                             'errors': {'WARCPathLoader': 'example.warc.gz: Archive File Not Found'}}

    def test_error_fallback_live_not_found(self):
        """'/fallback' answers 400 with a LiveWebLoader error for a bad host."""
        resp = self.testapp.get('/fallback/resource?url=http://invalid.url-not-found', status=400)

        assert resp.json == {'message': 'http://invalid.url-not-found/',
                             'errors': {'LiveWebLoader': 'http://invalid.url-not-found/'}}

        assert resp.text == resp.headers['ResErrors']

    def test_agg_local_revisit(self):
        """A revisit capture from 'local' carries the WARC-Refers-To-* headers."""
        resp = self.testapp.get('/many/resource?url=http://www.example.com/&closest=20140127171251&sources=local')

        assert resp.headers['WebAgg-Source-Coll'] == 'local:dupes.cdxj'

        buff = BytesIO(resp.body)
        status_headers = StatusAndHeadersParser(['WARC/1.0']).parse(buff)
        assert status_headers.get_header('WARC-Target-URI') == 'http://example.com'
        assert status_headers.get_header('WARC-Date') == '2014-01-27T17:12:51Z'
        assert status_headers.get_header('WARC-Refers-To-Target-URI') == 'http://example.com'
        assert status_headers.get_header('WARC-Refers-To-Date') == '2014-01-27T17:12:00Z'

        assert resp.headers['Link'] == MementoUtils.make_link('http://example.com', 'original')
        assert resp.headers['Memento-Datetime'] == 'Mon, 27 Jan 2014 17:12:51 GMT'

        assert b'HTTP/1.1 200 OK' in resp.body
        # NOTE(review): an empty bytes literal is contained in every body, so
        # this assertion cannot fail; the intended marker appears to have been
        # lost -- restore it from the original source.
        assert b'' in resp.body

        assert 'ResErrors' not in resp.headers

    def test_error_invalid_index_output(self):
        """An unsupported 'output' value answers 400."""
        resp = self.testapp.get('/live/index?url=http://httpbin.org/get&output=foobar', status=400)

        assert resp.json == {'message': 'output=foobar not supported'}
        assert resp.text == resp.headers['ResErrors']

    def test_error_local_not_found(self):
        """A url missing from the 'local' source answers 404."""
        resp = self.testapp.get('/many/resource?url=http://not-found.error/&sources=local', status=404)

        assert resp.json == {'message': 'No Resource Found'}
        assert resp.text == resp.headers['ResErrors']

    def test_error_empty(self):
        """The empty handler sequence answers 404."""
        resp = self.testapp.get('/empty/resource?url=http://example.com/', status=404)

        assert resp.json == {'message': 'No Resource Found'}
        assert resp.text == resp.headers['ResErrors']

    def test_error_invalid(self):
        """The malformed '/invalid' handler answers 500 with an internal error."""
        resp = self.testapp.get('/invalid/resource?url=http://example.com/', status=500)

        assert resp.json == {'message': "Internal Error: 'list' object is not callable"}
        assert resp.text == resp.headers['ResErrors']