1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00
pywb/tests/test_integration.py
Ilya Kreymer d98c1f6cf7 memento/api: add a new /collinfo.json end-point, enabled with 'enable_coll_info' config setting, which returns
the value of the collinfo.json template. Default template returns an entry for each handler route,
including the route path (id), title (name) and memento timegate and timemap paths, to be used with
an aggregator. Using a custom 'info_json' template can specify a different collinfo template, alternative to #69 (local aggregation)
Closes #146
2015-11-04 15:36:44 -08:00

493 lines
20 KiB
Python

from pytest import raises
from pywb.cdx.cdxobject import CDXObject
from pywb.utils.timeutils import timestamp_now
from server_mock import make_setup_module, BaseIntegration
# Module-level hook: pytest calls a callable named ``setup_module`` once
# before running the tests in this module. It is built here from the YAML
# test configuration.
# NOTE(review): presumably make_setup_module() returns such a callable and
# is what provides ``testapp`` to the test class -- confirm in server_mock.
setup_module = make_setup_module('tests/test_config.yaml')
class TestWbIntegration(BaseIntegration):
    """Integration tests that issue HTTP requests through ``self.testapp``.

    ``self.testapp`` is not created in this class.
    NOTE(review): presumably it is supplied by ``BaseIntegration`` together
    with the module-level ``setup_module`` hook -- confirm in ``server_mock``.
    """

    #def setup(self):
    #    self.app = app
    #    self.testapp = testapp

    def _assert_basic_html(self, resp):
        """Assert *resp* is a non-empty ``200`` response of type ``text/html``."""
        assert resp.status_int == 200
        assert resp.content_type == 'text/html'
        assert resp.content_length > 0

    def _assert_basic_text(self, resp):
        """Assert *resp* is a non-empty ``200`` response of type ``text/plain``."""
        assert resp.status_int == 200
        assert resp.content_type == 'text/plain'
        assert resp.content_length > 0

    def test_home(self):
        """GET ``/`` returns HTML that mentions ``/pywb``."""
        resp = self.testapp.get('/')
        self._assert_basic_html(resp)
        assert '/pywb' in resp.body

    def test_pywb_root(self):
        """GET ``/pywb/`` returns HTML containing ``Search``."""
        resp = self.testapp.get('/pywb/')
        self._assert_basic_html(resp)
        assert 'Search' in resp.body

    def test_pywb_root_head(self):
        """HEAD ``/pywb/`` returns a ``200`` ``text/html`` response."""
        resp = self.testapp.head('/pywb/')
        assert resp.content_type == 'text/html'
        assert resp.status_int == 200

    def test_pywb_invalid_path(self):
        """HEAD on an unknown path returns a ``404`` ``text/html`` response."""
        resp = self.testapp.head('/blah/', status=404)
        assert resp.content_type == 'text/html'
        assert resp.status_int == 404

    def test_calendar_query(self):
        """Calendar query for ``iana.org`` renders four table rows."""
        resp = self.testapp.get('/pywb/*/iana.org')
        self._assert_basic_html(resp)
        # 3 Captures + header
        assert len(resp.html.find_all('tr')) == 4

    def test_calendar_query_filtered(self):
        """Same calendar query yields 18 rows unfiltered and 2 rows filtered."""
        # unfiltered collection
        resp = self.testapp.get('/pywb/*/http://www.iana.org/_css/2013.1/screen.css')
        self._assert_basic_html(resp)
        # 17 Captures + header
        assert len(resp.html.find_all('tr')) == 18

        # filtered collection
        resp = self.testapp.get('/pywb-filt/*/http://www.iana.org/_css/2013.1/screen.css')
        self._assert_basic_html(resp)
        # 1 Capture (filtered) + header
        assert len(resp.html.find_all('tr')) == 2

    def test_calendar_query_fuzzy_match(self):
        """Calendar query with an extra ``_=`` parameter still yields 18 rows."""
        # fuzzy match removing _= according to standard rules.yaml
        resp = self.testapp.get('/pywb/*/http://www.iana.org/_css/2013.1/screen.css?_=3141592653')
        self._assert_basic_html(resp)
        # 17 Captures + header
        assert len(resp.html.find_all('tr')) == 18

    def test_calendar_not_found(self):
        """Calendar query with no results shows a message and no table rows."""
        # query with no results
        resp = self.testapp.get('/pywb/*/http://not-exist.example.com')
        self._assert_basic_html(resp)
        assert 'No captures found' in resp.body, resp.body
        assert len(resp.html.find_all('tr')) == 0

    def test_cdx_query(self):
        """``cdx_`` query returns plain text with the expected line, 3 lines total."""
        resp = self.testapp.get('/pywb/cdx_/*/http://www.iana.org/')
        self._assert_basic_text(resp)
        assert '20140127171238 http://www.iana.org/ warc/revisit - OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB' in resp
        # check for 3 cdx lines (strip final newline)
        actual_len = len(resp.body.rstrip().split('\n'))
        assert actual_len == 3, actual_len

    def test_replay_top_frame(self):
        """``tf_`` replay returns a page with an iframe pointing at the replay URL."""
        resp = self.testapp.get('/pywb/20140127171238tf_/http://www.iana.org/')
        assert '<iframe ' in resp.body
        assert '/pywb/20140127171238/http://www.iana.org/' in resp.body, resp.body

    def test_replay_content(self):
        """Replay HTML contains the timestamp, ``wb.js``, wombat init and a rewritten link."""
        resp = self.testapp.get('/pywb/20140127171238/http://www.iana.org/')
        self._assert_basic_html(resp)
        assert '"20140127171238"' in resp.body
        assert 'wb.js' in resp.body
        assert 'new _WBWombat' in resp.body, resp.body
        assert '/pywb/20140127171238/http://www.iana.org/time-zones"' in resp.body

    def test_replay_non_frame_content(self):
        """Replay via ``/pywb-nonframe`` contains the timestamp, ``wb.js`` and a rewritten link."""
        resp = self.testapp.get('/pywb-nonframe/20140127171238/http://www.iana.org/')
        self._assert_basic_html(resp)
        assert '"20140127171238"' in resp.body
        assert 'wb.js' in resp.body
        assert '/pywb-nonframe/20140127171238/http://www.iana.org/time-zones"' in resp.body

    def test_replay_non_surt(self):
        """Replay via ``/pywb-nosurt`` contains the timestamp, ``wb.js`` and a rewritten link."""
        resp = self.testapp.get('/pywb-nosurt/20140103030321/http://example.com?example=1')
        self._assert_basic_html(resp)
        assert '"20140103030321"' in resp.body
        assert 'wb.js' in resp.body
        assert '/pywb-nosurt/20140103030321/http://www.iana.org/domains/example' in resp.body

    def test_replay_cdxj(self):
        """Replay via ``/pywb-cdxj`` contains the timestamp, ``wb.js`` and a rewritten link."""
        resp = self.testapp.get('/pywb-cdxj/20140103030321/http://example.com?example=1')
        self._assert_basic_html(resp)
        assert '"20140103030321"' in resp.body
        assert 'wb.js' in resp.body
        assert '/pywb-cdxj/20140103030321/http://www.iana.org/domains/example' in resp.body

    def test_replay_cdxj_revisit(self):
        """Replay via ``/pywb-cdxj`` at 20140103030341 contains the expected markers."""
        resp = self.testapp.get('/pywb-cdxj/20140103030341/http://example.com?example=1')
        self._assert_basic_html(resp)
        assert '"20140103030341"' in resp.body
        assert 'wb.js' in resp.body
        assert '/pywb-cdxj/20140103030341/http://www.iana.org/domains/example' in resp.body

    def test_zero_len_revisit(self):
        """Replay of ``example=2`` at 20140603030341 contains the expected markers."""
        resp = self.testapp.get('/pywb/20140603030341/http://example.com?example=2')
        self._assert_basic_html(resp)
        assert '"20140603030341"' in resp.body
        assert 'wb.js' in resp.body
        assert '/pywb/20140603030341/http://www.iana.org/domains/example' in resp.body

    def test_replay_url_agnostic_revisit(self):
        """Replay of ``www.example.com`` at 20130729195151 contains the expected markers."""
        resp = self.testapp.get('/pywb/20130729195151/http://www.example.com/')
        self._assert_basic_html(resp)
        assert '"20130729195151"' in resp.body
        assert 'wb.js' in resp.body
        assert '/pywb/20130729195151/http://www.iana.org/domains/example"' in resp.body

    def test_video_info_not_found(self):
        """``vi_`` request for a non-archived video URL returns ``404``."""
        # not actually archived, but ensure video info path is tested
        resp = self.testapp.get('/pywb/vi_/https://www.youtube.com/watch?v=DjFZyFWSt1M', status=404)
        assert resp.status_int == 404

    def test_replay_cdx_mod(self):
        """``cdx_`` modifier on a replay URL returns 17 text lines for ``print.css``."""
        resp = self.testapp.get('/pywb/20140127171239cdx_/http://www.iana.org/_css/2013.1/print.css')
        self._assert_basic_text(resp)
        lines = resp.body.rstrip().split('\n')
        assert len(lines) == 17
        assert lines[0].startswith('org,iana)/_css/2013.1/print.css 20140127171239')

    def test_replay_banner_only(self):
        """``bn_`` replay includes ``wb.js`` but no wombat and leaves a URL unrewritten."""
        resp = self.testapp.get('/pywb/20140126201054bn_/http://www.iana.org/domains/reserved')
        # wb.js header insertion
        assert 'wb.js' in resp.body
        # no wombat present
        assert '_WBWombat' not in resp.body
        # url not rewritten
        #assert '"http://www.iana.org/domains/example"' in resp.body
        assert '"/_css/2013.1/screen.css"' in resp.body

    def test_replay_identity_1(self):
        """``id_`` replay has no ``wb.js``, length 1270 and the original URL."""
        resp = self.testapp.get('/pywb/20140127171251id_/http://example.com')
        # no wb header insertion
        assert 'wb.js' not in resp.body
        assert resp.content_length == 1270, resp.content_length
        # original unrewritten url present
        assert '"http://www.iana.org/domains/example"' in resp.body

    def test_replay_range_cache_content(self):
        """``Range: bytes=0-200`` on ``id_`` replay returns a 201-byte ``206``."""
        headers = [('Range', 'bytes=0-200')]
        resp = self.testapp.get('/pywb/20140127171250id_/http://example.com', headers=headers)
        assert resp.status_int == 206
        assert resp.headers['Accept-Ranges'] == 'bytes'
        assert resp.headers['Content-Range'] == 'bytes 0-200/1270', resp.headers['Content-Range']
        assert resp.content_length == 201, resp.content_length
        assert 'wb.js' not in resp.body

    def test_replay_content_ignore_range(self):
        """``/pywb-norange`` ignores the ``Range`` header and returns the full body."""
        headers = [('Range', 'bytes=0-200')]
        resp = self.testapp.get('/pywb-norange/20140127171251id_/http://example.com', headers=headers)
        # range request ignored
        assert resp.status_int == 200
        # full response
        assert resp.content_length == 1270, resp.content_length
        # identity, no header insertion
        assert 'wb.js' not in resp.body

    def test_replay_range_cache_content_bound_end(self):
        """A range end past the body is reported as ``10-1269/1270``."""
        headers = [('Range', 'bytes=10-10000')]
        resp = self.testapp.get('/pywb/20140127171251id_/http://example.com', headers=headers)
        assert resp.status_int == 206
        assert resp.headers['Accept-Ranges'] == 'bytes'
        assert resp.headers['Content-Range'] == 'bytes 10-1269/1270', resp.headers['Content-Range']
        assert resp.content_length == 1260, resp.content_length
        assert len(resp.body) == resp.content_length
        assert 'wb.js' not in resp.body

    def test_replay_redir_no_cache(self):
        """A ranged request for this capture yields an empty ``302``."""
        headers = [('Range', 'bytes=10-10000')]
        # Range ignored
        resp = self.testapp.get('/pywb/20140126200927/http://www.iana.org/domains/root/db/', headers=headers)
        assert resp.content_length == 0
        assert resp.status_int == 302

    def test_replay_identity_2_arcgz(self):
        """``id_`` replay of ``arc.gz.test.example.com`` is unrewritten."""
        resp = self.testapp.get('/pywb/20140216050221id_/http://arc.gz.test.example.com')
        # no wb header insertion
        assert 'wb.js' not in resp.body
        # original unrewritten url present
        assert '"http://www.iana.org/domains/example"' in resp.body

    def test_replay_identity_2_arc(self):
        """``id_`` replay of ``arc.test.example.com`` is unrewritten."""
        resp = self.testapp.get('/pywb/20140216050221id_/http://arc.test.example.com')
        # no wb header insertion
        assert 'wb.js' not in resp.body
        # original unrewritten url present
        assert '"http://www.iana.org/domains/example"' in resp.body

    def test_replay_content_length_1(self):
        """``Content-Length`` header equals the actual body length for the SVG."""
        # test larger file, rewritten file (svg!)
        resp = self.testapp.get('/pywb/20140126200654/http://www.iana.org/_img/2013.1/rir-map.svg')
        assert resp.headers['Content-Length'] == str(len(resp.body))

    def test_replay_css_mod(self):
        """``cs_`` replay of ``screen.css`` returns ``200`` ``text/css``."""
        resp = self.testapp.get('/pywb/20140127171239cs_/http://www.iana.org/_css/2013.1/screen.css')
        assert resp.status_int == 200
        assert resp.content_type == 'text/css'

    def test_replay_js_mod(self):
        """``js_`` replay of the empty ``iana.js`` returns an empty JS response."""
        # an empty js file
        resp = self.testapp.get('/pywb/20140126201054js_/http://www.iana.org/_js/2013.1/iana.js')
        assert resp.status_int == 200
        assert resp.content_length == 0
        assert resp.content_type == 'application/x-javascript'

    def test_redirect_exact(self):
        """A near-miss timestamp redirects (``302``) to the 20140127171238 capture."""
        resp = self.testapp.get('/pywb/20140127171237/http://www.iana.org/')
        assert resp.status_int == 302
        assert resp.headers['Location'].endswith('/pywb/20140127171238/http://iana.org')

    def test_no_redirect_non_exact(self):
        """``/pywb-non-exact`` serves a near-miss timestamp directly with ``200``."""
        # non-exact mode, don't redirect to exact capture
        resp = self.testapp.get('/pywb-non-exact/20140127171237/http://www.iana.org/')
        assert resp.status_int == 200
        self._assert_basic_html(resp)
        assert '"20140127171237"' in resp.body
        # actual timestamp set in JS
        assert 'timestamp = "20140127171238"' in resp.body
        assert '/pywb-non-exact/20140127171237/http://www.iana.org/about/' in resp.body

    def test_redirect_latest_replay(self):
        """A timestamp-less replay URL redirects to 20140127171251, which replays as HTML."""
        resp = self.testapp.get('/pywb/http://example.com/')
        assert resp.status_int == 302
        assert resp.headers['Location'].endswith('/20140127171251/http://example.com')
        resp = resp.follow()
        #check resp
        self._assert_basic_html(resp)
        assert '"20140127171251"' in resp.body
        assert '/pywb/20140127171251/http://www.iana.org/domains/example' in resp.body

    def test_redirect_non_exact_latest_replay_ts(self):
        """Timestamp-less ``/pywb-non-exact`` request returns ``200`` with a ``Content-Location``."""
        resp = self.testapp.get('/pywb-non-exact/http://example.com/')
        assert resp.status_int == 200
        assert resp.headers['Content-Location'].endswith('/http://example.com')
        # extract the path segment preceding '/http://' in Content-Location;
        # it is expected to be the fixed capture timestamp asserted below
        ts = resp.headers['Content-Location'].rsplit('/http://')[0].rsplit('/', 1)[-1]
        assert ts == '20140127171251'
        #resp = resp.follow()
        #self._assert_basic_html(resp)
        # ensure the extracted ts is present (quoted) in the body
        assert '"{0}"'.format(ts) in resp.body
        assert '/pywb-non-exact/http://www.iana.org/domains/example' in resp.body
        # disabled check that ts is not later than the current time
        #assert timestamp_now() >= ts, ts

    def test_redirect_relative_3(self):
        """Referer-relative requests redirect to the archived CSS, which then replays."""
        # webtest uses Host: localhost:80 by default
        # first two requests should result in same redirect
        target = 'http://localhost:80/pywb/2014/http://iana.org/_css/2013.1/screen.css'

        # without timestamp
        resp = self.testapp.get('/_css/2013.1/screen.css', headers=[('Referer', 'http://localhost:80/pywb/2014/http://iana.org/')])
        assert resp.status_int == 302
        assert resp.headers['Location'] == target, resp.headers['Location']

        # with timestamp
        resp = self.testapp.get('/2014/_css/2013.1/screen.css', headers=[('Referer', 'http://localhost:80/pywb/2014/http://iana.org/')])
        assert resp.status_int == 302
        assert resp.headers['Location'] == target, resp.headers['Location']

        resp = resp.follow()
        assert resp.status_int == 302
        assert resp.headers['Location'].endswith('/pywb/20140127171239/http://www.iana.org/_css/2013.1/screen.css')

        resp = resp.follow()
        assert resp.status_int == 200
        assert resp.content_type == 'text/css'

    def test_rel_self_redirect(self):
        """Replay of ``/domains/root/db`` at 20140126200927 redirects to 20140126200928."""
        uri = '/pywb/20140126200927/http://www.iana.org/domains/root/db'
        resp = self.testapp.get(uri, status=302)
        assert resp.status_int == 302
        assert resp.headers['Location'].endswith('/pywb/20140126200928/http://www.iana.org/domains/root/db')

    #def test_referrer_self_redirect(self):
    #    uri = '/pywb/20140127171239/http://www.iana.org/_css/2013.1/screen.css'
    #    host = 'somehost:8082'
    #    referrer = 'http://' + host + uri
    #    # capture is normally a 200
    #    resp = self.testapp.get(uri)
    #    assert resp.status_int == 200
    #    # redirect causes skip of this capture, redirect to next
    #    resp = self.testapp.get(uri, headers = [('Referer', referrer), ('Host', host)], status = 302)
    #    assert resp.status_int == 302

    def test_not_existant_warc_other_capture(self):
        """Request at 20140703030321 for ``example=2`` redirects to 20140603030341."""
        resp = self.testapp.get('/pywb/20140703030321/http://example.com?example=2')
        assert resp.status_int == 302
        assert resp.headers['Location'].endswith('/pywb/20140603030341/http://example.com?example=2')

    def test_missing_revisit_other_capture(self):
        """Request at 20140603030351 for ``example=2`` redirects to 20140603030341."""
        resp = self.testapp.get('/pywb/20140603030351/http://example.com?example=2')
        assert resp.status_int == 302
        assert resp.headers['Location'].endswith('/pywb/20140603030341/http://example.com?example=2')

    def test_not_existant_warc_no_other(self):
        """Request at 20140703030321 for ``example=3`` returns ``503``."""
        resp = self.testapp.get('/pywb/20140703030321/http://example.com?example=3', status=503)
        assert resp.status_int == 503

    def test_missing_revisit_no_other(self):
        """Request at 20140603030351 for ``example=3`` returns ``503``."""
        resp = self.testapp.get('/pywb/20140603030351/http://example.com?example=3', status=503)
        assert resp.status_int == 503

    def test_live_frame(self):
        """GET on a ``/live/`` URL returns ``200``."""
        resp = self.testapp.get('/live/http://example.com/?test=test')
        assert resp.status_int == 200

    def test_live_redir_1(self):
        """``/live/*/`` redirects to the plain ``/live/`` URL."""
        resp = self.testapp.get('/live/*/http://example.com/?test=test')
        assert resp.status_int == 302
        assert resp.headers['Location'].endswith('/live/http://example.com/?test=test')

    def test_live_redir_2(self):
        """``/live/2010-2011/`` redirects to the plain ``/live/`` URL."""
        resp = self.testapp.get('/live/2010-2011/http://example.com/?test=test')
        assert resp.status_int == 302
        assert resp.headers['Location'].endswith('/live/http://example.com/?test=test')

    def test_live_fallback(self):
        """GET on a ``/pywb-fallback//`` URL returns ``200``."""
        resp = self.testapp.get('/pywb-fallback//http://example.com/?test=test')
        assert resp.status_int == 200

    def test_post_1(self):
        """Form POST without a timestamp returns ``200`` echoing the posted fields."""
        resp = self.testapp.post('/pywb/httpbin.org/post', {'foo': 'bar', 'test': 'abc'})

        # no redirects for POST, as some browsers (FF) show modal confirmation dialog!
        #assert resp.status_int == 307
        #assert resp.headers['Location'].endswith('/pywb/20140610000859/http://httpbin.org/post')

        # XX webtest doesn't support 307 redirect of post
        #resp = resp.follow()
        #resp = self.testapp.post(resp.headers['Location'], {'foo': 'bar', 'test': 'abc'})

        assert resp.status_int == 200
        assert '"foo": "bar"' in resp.body
        assert '"test": "abc"' in resp.body

    def test_post_2(self):
        """Form POST with a timestamp returns ``200`` echoing the posted field."""
        resp = self.testapp.post('/pywb/20140610001255/http://httpbin.org/post?foo=bar', {'data': '^'})
        assert resp.status_int == 200
        assert '"data": "^"' in resp.body

    def test_post_invalid(self):
        """A JSON-encoded POST to the same URL returns ``404``."""
        # not json
        resp = self.testapp.post_json('/pywb/20140610001255/http://httpbin.org/post?foo=bar', {'data': '^'}, status=404)
        assert resp.status_int == 404

    def test_post_redirect(self):
        """Referer-relative POST is answered with ``200`` rather than a redirect."""
        # post handled without redirect (since 307 not allowed)
        resp = self.testapp.post('/post', {'foo': 'bar', 'test': 'abc'}, headers=[('Referer', 'http://localhost:80/pywb/2014/http://httpbin.org/post')])
        assert resp.status_int == 200
        assert '"foo": "bar"' in resp.body
        assert '"test": "abc"' in resp.body

    def test_excluded_content(self):
        """Request for the bookmark icon returns ``403`` with ``Excluded`` in the body."""
        resp = self.testapp.get('/pywb/http://www.iana.org/_img/bookmark_icon.ico', status=403)
        assert resp.status_int == 403
        assert 'Excluded' in resp.body

    def test_replay_not_found(self):
        """HEAD replay of a URL with no captures returns a ``404`` ``text/html`` response."""
        resp = self.testapp.head('/pywb/http://not-exist.example.com', status=404)
        assert resp.content_type == 'text/html'
        assert resp.status_int == 404

    def test_static_content(self):
        """Static ``wb.css`` is served as non-empty ``text/css``."""
        resp = self.testapp.get('/static/test/route/wb.css')
        assert resp.status_int == 200
        assert resp.content_type == 'text/css'
        assert resp.content_length > 0

    def test_static_content_filewrapper(self):
        """Static ``wb.css`` is served when ``wsgi.file_wrapper`` is set in the environ."""
        from wsgiref.util import FileWrapper
        resp = self.testapp.get('/static/test/route/wb.css', extra_environ={'wsgi.file_wrapper': FileWrapper})
        assert resp.status_int == 200
        assert resp.content_type == 'text/css'
        assert resp.content_length > 0

    def test_static_not_found(self):
        """Request for a missing static file returns ``404``."""
        resp = self.testapp.get('/static/test/route/notfound.css', status=404)
        assert resp.status_int == 404

    def test_cdx_server_filters(self):
        """CDX server query with two ``filter`` parameters returns exactly one line."""
        resp = self.testapp.get('/pywb-cdx?url=http://www.iana.org/_css/2013.1/screen.css&filter=mime:warc/revisit&filter=filename:dupes.warc.gz')
        self._assert_basic_text(resp)
        actual_len = len(resp.body.rstrip().split('\n'))
        assert actual_len == 1, actual_len

    def test_cdx_server_advanced(self):
        """CDX server query with collapse, reverse and revisit resolution returns 3 records."""
        # combine collapsing, reversing and revisit resolving
        resp = self.testapp.get('/pywb-cdx?url=http://www.iana.org/_css/2013.1/print.css&collapseTime=11&resolveRevisits=true&reverse=true')

        # convert back to CDXObject; real lists are built so that len() and
        # list equality do not depend on map() returning a list
        cdxs = [CDXObject(line) for line in resp.body.rstrip().split('\n')]
        assert len(cdxs) == 3, len(cdxs)

        # verify timestamps
        timestamps = [cdx['timestamp'] for cdx in cdxs]
        assert timestamps == ['20140127171239', '20140126201054', '20140126200625']

        # verify orig filenames (2 revisits, one non)
        origfilenames = [cdx['orig.filename'] for cdx in cdxs]
        assert origfilenames == ['iana.warc.gz', 'iana.warc.gz', '-']

    def test_error(self):
        """Request for ``/pywb/?abc`` returns ``400`` with an invalid-URL message."""
        resp = self.testapp.get('/pywb/?abc', status=400)
        assert resp.status_int == 400
        assert 'Invalid Url: http://?abc' in resp.body

    def test_coll_info_json(self):
        """``/collinfo.json`` returns JSON with 9 entries."""
        resp = self.testapp.get('/collinfo.json')
        assert resp.content_type == 'application/json'
        assert len(resp.json) == 9
#def test_invalid_config(self):
# with raises(IOError):
# init_app(create_wb_router,
# load_yaml=True,
# config_file='x-invalid-x')