mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
tests: ensure redirect check only added in framed mode, ensure added for banner only mode, but not for proxy mode
461 lines
20 KiB
Python
461 lines
20 KiB
Python
from .base_config_test import BaseConfigTest, fmod
|
|
|
|
from pywb.warcserver.index.cdxobject import CDXObject
|
|
|
|
|
|
# ============================================================================
|
|
class TestWbIntegration(BaseConfigTest):
|
|
@classmethod
|
|
def setup_class(cls):
|
|
super(TestWbIntegration, cls).setup_class('config_test.yaml')
|
|
|
|
def test_home(self):
|
|
resp = self.testapp.get('/')
|
|
self._assert_basic_html(resp)
|
|
assert '/pywb' in resp.text
|
|
|
|
def test_pywb_root(self):
|
|
resp = self.testapp.get('/pywb/')
|
|
self._assert_basic_html(resp)
|
|
assert 'Search' in resp.text
|
|
|
|
def test_pywb_root_head(self):
|
|
resp = self.testapp.head('/pywb/')
|
|
assert resp.content_type == 'text/html'
|
|
assert resp.status_int == 200
|
|
|
|
def test_pywb_invalid_path(self):
|
|
resp = self.testapp.head('/blah/', status=404)
|
|
assert resp.content_type == 'text/html'
|
|
assert resp.status_int == 404
|
|
|
|
def test_calendar_query(self):
|
|
resp = self.testapp.get('/pywb/*/iana.org')
|
|
self._assert_basic_html(resp)
|
|
# 3 Captures + header
|
|
#assert len(resp.html.find_all('tr')) == 4
|
|
|
|
def test_calendar_query_2(self):
|
|
# unfiltered collection
|
|
resp = self.testapp.get('/pywb/*/http://www.iana.org/_css/2013.1/screen.css')
|
|
self._assert_basic_html(resp)
|
|
# 17 Captures + header
|
|
#assert len(resp.html.find_all('tr')) == 18
|
|
|
|
# filtered collection
|
|
#resp = self.testapp.get('/pywb-filt/*/http://www.iana.org/_css/2013.1/screen.css')
|
|
#self._assert_basic_html(resp)
|
|
# 1 Capture (filtered) + header
|
|
#assert len(resp.html.find_all('tr')) == 2
|
|
|
|
def test_cdxj_query_fuzzy_match(self):
|
|
# fuzzy match removing _= according to standard rules.yaml
|
|
resp = self.testapp.get('/pywb/cdx?url=http://www.iana.org/_css/2013.1/screen.css%3F_=3141592653')
|
|
assert len(resp.text.rstrip().split('\n')) == 17
|
|
|
|
def test_cdxj_query_fuzzy_match_add_slash(self):
|
|
# fuzzy match removing _= according to standard rules.yaml
|
|
resp = self.testapp.get('/pywb/cdx?url=http://www.iana.org/_css/2013.1/screen.css/%3F_=3141592653')
|
|
# 17 Captures + header
|
|
assert len(resp.text.rstrip().split('\n')) == 17
|
|
|
|
def test_cdxj_not_found(self):
|
|
# query with no results
|
|
resp = self.testapp.get('/pywb/cdx?url=http://not-exist.example.com')
|
|
assert resp.text == ''
|
|
|
|
def test_cdxj_query(self):
|
|
resp = self.testapp.get('/pywb/cdx?url=http://www.iana.org/')
|
|
|
|
assert 'org,iana)/ 20140126200624 {"url": "http://www.iana.org/", "mime": "text/html", "status": "200", "digest": "OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB"' in resp.text
|
|
|
|
# check for 3 cdx lines (strip final newline)
|
|
assert len(resp.text.rstrip().split('\n')) == 3
|
|
|
|
def test_replay_top_frame(self):
|
|
resp = self.testapp.get('/pywb/20140127171238/http://www.iana.org/')
|
|
|
|
assert 'new ContentFrame' in resp.text
|
|
assert '"20140127171238"' in resp.text
|
|
assert 'http://www.iana.org/' in resp.text, resp.text
|
|
|
|
assert 'Content-Security-Policy' not in resp.headers
|
|
|
|
def test_replay_content(self, fmod):
|
|
resp = self.get('/pywb/20140127171238{0}/http://www.iana.org/', fmod)
|
|
self._assert_basic_html(resp)
|
|
|
|
assert '"20140127171238"' in resp.text, resp.text
|
|
assert 'wombat.js' in resp.text
|
|
assert 'new _WBWombat' in resp.text, resp.text
|
|
assert '/pywb/20140127171238{0}/http://www.iana.org/time-zones"'.format(fmod) in resp.text
|
|
|
|
if fmod == 'mp_':
|
|
assert 'window == window.top' in resp.text
|
|
assert 'wbinfo.is_framed = true' in resp.text
|
|
else:
|
|
assert 'window == window.top' not in resp.text
|
|
assert 'wbinfo.is_framed = false' in resp.text
|
|
|
|
csp = "default-src 'unsafe-eval' 'unsafe-inline' 'self' data: blob: mediastream: ws: wss: ; form-action 'self'"
|
|
assert resp.headers['Content-Security-Policy'] == csp
|
|
|
|
def test_replay_resource(self, fmod):
|
|
resp = self.get('/pywb/20171122230223{0}/http://httpbin.org/anything/resource.json', fmod)
|
|
assert resp.headers['Content-Type'] == 'application/json'
|
|
|
|
def test_replay_redirect(self, fmod):
|
|
resp = self.get('/pywb/2014{0}/http://www.iana.org/domains/example', fmod)
|
|
assert resp.headers['Location'].startswith('/pywb/2014{0}/'.format(fmod))
|
|
assert resp.status_code == 302
|
|
|
|
def test_replay_fuzzy_1(self, fmod):
|
|
resp = self.get('/pywb/20140127171238{0}/http://www.iana.org/?_=123', fmod)
|
|
assert resp.status_int == 200
|
|
assert resp.headers['Content-Location'].endswith('/pywb/20140126200624{0}/http://www.iana.org/'.format(fmod))
|
|
|
|
def test_replay_no_fuzzy_match(self, fmod):
|
|
resp = self.get('/pywb/20140127171238{0}/http://www.iana.org/?foo=bar', fmod, status=404)
|
|
assert resp.status_int == 404
|
|
|
|
def test_no_slash_redir_1(self, fmod):
|
|
resp = self.get('/pywb/20140103030321{0}/http://example.com', fmod)
|
|
assert resp.status_int == 307
|
|
assert resp.headers['Location'].endswith('/pywb/20140103030321{0}/http://example.com/'.format(fmod))
|
|
|
|
def test_no_slash_redir_2(self, fmod):
|
|
resp = self.get('/pywb/20140103030321{0}/http://example.com?example=1', fmod)
|
|
assert resp.status_int == 307
|
|
assert resp.headers['Location'].endswith('/pywb/20140103030321{0}/http://example.com/?example=1'.format(fmod))
|
|
|
|
def test_replay_cdxj(self, fmod):
|
|
resp = self.get('/pywb-cdxj/20140103030321{0}/http://example.com/?example=1', fmod)
|
|
self._assert_basic_html(resp)
|
|
|
|
assert '"20140103030321"' in resp.text
|
|
assert 'wombat.js' in resp.text
|
|
assert '/pywb-cdxj/20140103030321{0}/http://www.iana.org/domains/example'.format(fmod) in resp.text
|
|
|
|
def test_replay_cdxj_revisit(self, fmod):
|
|
resp = self.get('/pywb-cdxj/20140103030341{0}/http://example.com/?example=1', fmod)
|
|
self._assert_basic_html(resp)
|
|
|
|
assert '"20140103030341"' in resp.text
|
|
assert 'wombat.js' in resp.text
|
|
assert '/pywb-cdxj/20140103030341{0}/http://www.iana.org/domains/example'.format(fmod) in resp.text
|
|
|
|
def test_zero_len_revisit(self, fmod):
|
|
resp = self.get('/pywb/20140603030341{0}/http://example.com/?example=2', fmod)
|
|
self._assert_basic_html(resp)
|
|
|
|
assert '"20140603030341"' in resp.text
|
|
assert 'wombat.js' in resp.text
|
|
assert '/pywb/20140603030341{0}/http://www.iana.org/domains/example'.format(fmod) in resp.text
|
|
|
|
def test_replay_url_agnostic_revisit(self, fmod):
|
|
resp = self.get('/pywb/20130729195151{0}/http://www.example.com/', fmod)
|
|
self._assert_basic_html(resp)
|
|
|
|
assert '"20130729195151"' in resp.text
|
|
assert 'wombat.js' in resp.text
|
|
assert '/pywb/20130729195151{0}/http://www.iana.org/domains/example"'.format(fmod) in resp.text
|
|
|
|
def test_video_info_not_found(self):
|
|
# not actually archived, but ensure video info path is tested
|
|
resp = self.testapp.get('/pywb/vi_/https://www.youtube.com/watch?v=DjFZyFWSt1M', status=404)
|
|
assert resp.status_int == 404
|
|
|
|
def test_replay_banner_only(self):
|
|
resp = self.testapp.get('/pywb/20140126201054bn_/http://www.iana.org/domains/reserved')
|
|
|
|
# wombat.js header not inserted
|
|
assert 'wombat.js' not in resp.text
|
|
|
|
# no wombat present
|
|
assert '_WBWombat' not in resp.text
|
|
|
|
# top-frame redirect check
|
|
assert 'window == window.top' in resp.text
|
|
|
|
# url not rewritten
|
|
#assert '"http://www.iana.org/domains/example"' in resp.text
|
|
assert '"/_css/2013.1/screen.css"' in resp.text
|
|
|
|
def test_replay_identity_1(self):
|
|
resp = self.testapp.get('/pywb/20140127171251id_/http://example.com/')
|
|
|
|
# no wb header insertion
|
|
assert 'wombat.js' not in resp.text
|
|
|
|
assert resp.content_length == 1270, resp.content_length
|
|
|
|
# original unrewritten url present
|
|
assert '"http://www.iana.org/domains/example"' in resp.text
|
|
|
|
def test_replay_identity_2_arcgz(self):
|
|
resp = self.testapp.get('/pywb/20140216050221id_/http://arc.gz.test.example.com/')
|
|
|
|
# no wb header insertion
|
|
assert 'wombat.js' not in resp.text
|
|
|
|
# original unrewritten url present
|
|
assert '"http://www.iana.org/domains/example"' in resp.text
|
|
|
|
def test_replay_identity_2_arc(self):
|
|
resp = self.testapp.get('/pywb/20140216050221id_/http://arc.test.example.com/')
|
|
|
|
# no wb header insertion
|
|
assert 'wombat.js' not in resp.text
|
|
|
|
# original unrewritten url present
|
|
assert '"http://www.iana.org/domains/example"' in resp.text
|
|
|
|
def test_replay_content_length_1(self, fmod):
|
|
# test larger file, rewritten file (svg!)
|
|
resp = self.get('/pywb/20140126200654{0}/http://www.iana.org/_img/2013.1/rir-map.svg', fmod)
|
|
assert resp.headers['Content-Length'] == str(len(resp.text))
|
|
|
|
def test_replay_css_mod(self):
|
|
resp = self.testapp.get('/pywb/20140127171239cs_/http://www.iana.org/_css/2013.1/screen.css')
|
|
assert resp.status_int == 200
|
|
assert resp.content_type == 'text/css'
|
|
|
|
def test_replay_js_mod_no_obj_proxy(self):
|
|
# an empty js file, (ie11 UA no js obj proxy)
|
|
resp = self.testapp.get('/pywb/20140126201054js_/http://www.iana.org/_js/2013.1/iana.js',
|
|
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'})
|
|
|
|
assert resp.status_int == 200
|
|
assert resp.content_length == 0
|
|
assert resp.content_type == 'application/x-javascript'
|
|
|
|
def test_replay_js_obj_proxy(self, fmod):
|
|
# test js proxy obj with jquery -- no user agent
|
|
resp = self.get('/pywb/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod)
|
|
|
|
assert resp.status_int == 200
|
|
assert resp.content_length != 0
|
|
assert resp.content_type == 'application/x-javascript'
|
|
|
|
# test with Chrome user agent
|
|
resp = self.get('/pywb/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod,
|
|
headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'})
|
|
assert 'let window = _____WB$wombat$assign$function_____(' in resp.text
|
|
|
|
def test_replay_js_ie11_no_obj_proxy(self, fmod):
|
|
# IE11 user-agent, no proxy
|
|
resp = self.get('/pywb/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod,
|
|
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'})
|
|
|
|
assert 'let window = _____WB$wombat$assign$function_____(' not in resp.text
|
|
|
|
def test_replay_non_exact(self, fmod):
|
|
# non-exact mode, don't redirect to exact capture
|
|
resp = self.get('/pywb/20140127171237{0}/http://www.iana.org/', fmod)
|
|
assert resp.status_int == 200
|
|
|
|
self._assert_basic_html(resp)
|
|
assert '"20140127171237"' in resp.text
|
|
# actual timestamp set in JS
|
|
assert 'timestamp = "20140127171238"' in resp.text
|
|
assert '/pywb/20140127171237{0}/http://www.iana.org/about/'.format(fmod) in resp.text
|
|
|
|
def test_latest_replay(self, fmod):
|
|
fmod_slash = fmod + '/' if fmod else ''
|
|
resp = self.get('/pywb/{0}http://example.com/', fmod_slash)
|
|
self._assert_basic_html(resp)
|
|
|
|
assert resp.headers['Content-Location'].endswith('/20140127171251{0}/http://example.com'.format(fmod))
|
|
|
|
assert '"20140127171251"' in resp.text
|
|
assert '/pywb/{0}http://www.iana.org/domains/example'.format(fmod_slash) in resp.text, resp.text
|
|
|
|
def test_replay_non_latest_content_location_ts(self, fmod):
|
|
fmod_slash = fmod + '/' if fmod else ''
|
|
resp = self.get('/pywb/{0}http://example.com/', fmod_slash)
|
|
assert resp.status_int == 200
|
|
|
|
assert resp.headers['Content-Location'].endswith('/http://example.com')
|
|
|
|
# extract ts, which should be current time
|
|
ts = resp.headers['Content-Location'].rsplit('/http://')[0].rsplit('/', 1)[-1]
|
|
assert ts == '20140127171251{0}'.format(fmod)
|
|
|
|
if fmod:
|
|
ts = ts.replace(fmod, '')
|
|
|
|
# ensure the current ts is present in the links
|
|
assert '"{0}"'.format(ts) in resp.text
|
|
assert '/pywb/{0}http://www.iana.org/domains/example'.format(fmod_slash) in resp.text
|
|
|
|
# ensure ts is current ts
|
|
#assert timestamp_now() >= ts, ts
|
|
|
|
def test_refer_redirect(self, fmod):
|
|
# webtest uses Host: localhost:80 by default
|
|
target = 'http://localhost:80/pywb/2014{0}/http://iana.org/_css/2013.1/screen.css'.format(fmod)
|
|
|
|
resp = self.get('/_css/2013.1/screen.css', fmod, headers=[('Referer', 'http://localhost:80/pywb/2014{0}/http://iana.org/'.format(fmod))])
|
|
assert resp.status_int == 307
|
|
assert resp.headers['Location'] == target, resp.headers['Location']
|
|
|
|
resp = resp.follow()
|
|
assert resp.status_int == 200
|
|
assert resp.headers['Content-Location'].endswith('/pywb/20140127171239{0}/http://www.iana.org/_css/2013.1/screen.css'.format(fmod))
|
|
assert resp.content_type == 'text/css'
|
|
|
|
def test_non_exact_replay_skip_self_redir(self, fmod):
|
|
uri = '/pywb/20140126200927{0}/http://www.iana.org/domains/root/db'
|
|
resp = self.get(uri, fmod)
|
|
assert resp.status_int == 200
|
|
assert resp.headers['Content-Location'].endswith('/pywb/20140126200928{0}/http://www.iana.org/domains/root/db'.format(fmod))
|
|
|
|
def test_non_exact_replay_skip_self_redir_slash(self, fmod):
|
|
uri = '/pywb/20140126200927{0}/http://www.iana.org/domains/root/db/'
|
|
resp = self.get(uri, fmod)
|
|
assert resp.status_int == 200
|
|
assert resp.headers['Content-Location'].endswith('/pywb/20140126200928{0}/http://www.iana.org/domains/root/db'.format(fmod))
|
|
|
|
def test_not_existant_warc_other_capture(self, fmod):
|
|
resp = self.get('/pywb/20140703030321{0}/http://example.com/?example=2', fmod)
|
|
assert resp.status_int == 200
|
|
assert resp.headers['Content-Location'].endswith('/pywb/20140603030341{0}/http://example.com?example=2'.format(fmod))
|
|
|
|
def test_missing_revisit_other_capture(self, fmod):
|
|
resp = self.get('/pywb/20140603030351{0}/http://example.com/?example=2', fmod)
|
|
assert resp.status_int == 200
|
|
assert resp.headers['Content-Location'].endswith('/pywb/20140603030341{0}/http://example.com?example=2'.format(fmod))
|
|
|
|
def test_not_existant_warc_no_other(self, fmod):
|
|
resp = self.get('/pywb/20140703030321{0}/http://example.com/?example=3', fmod, status=503)
|
|
assert resp.status_int == 503
|
|
|
|
def test_missing_revisit_no_other(self, fmod):
|
|
resp = self.get('/pywb/20140603030351{0}/http://example.com/?example=3', fmod, status=503)
|
|
assert resp.status_int == 503
|
|
|
|
def test_live_frame(self):
|
|
resp = self.testapp.get('/live/http://example.com/?test=test')
|
|
assert resp.status_int == 200
|
|
|
|
def _test_live_redir_1(self):
|
|
resp = self.testapp.get('/live/*/http://example.com/?test=test')
|
|
assert resp.status_int == 302
|
|
assert resp.headers['Location'].endswith('/live/http://example.com/?test=test')
|
|
|
|
def _test_live_redir_2(self):
|
|
resp = self.testapp.get('/live/2010-2011/http://example.com/?test=test')
|
|
assert resp.status_int == 302
|
|
assert resp.headers['Location'].endswith('/live/http://example.com/?test=test')
|
|
|
|
def test_live_fallback(self, fmod):
|
|
fmod_slash = fmod + '/' if fmod else ''
|
|
resp = self.get('/pywb-fallback/{0}http://example.com/?test=test', fmod_slash)
|
|
assert resp.status_int == 200
|
|
|
|
def test_post_1(self, fmod):
|
|
fmod_slash = fmod + '/' if fmod else ''
|
|
resp = self.post('/pywb/{0}httpbin.org/post', fmod_slash, {'foo': 'bar', 'test': 'abc'})
|
|
|
|
assert resp.status_int == 200
|
|
assert '"foo": "bar"' in resp.text
|
|
assert '"test": "abc"' in resp.text
|
|
|
|
def test_post_2(self, fmod):
|
|
resp = self.post('/pywb/20140610001255{0}/http://httpbin.org/post?foo=bar', fmod, {'data': '^'})
|
|
assert resp.status_int == 200
|
|
assert '"data": "^"' in resp.text
|
|
|
|
def test_post_invalid(self, fmod):
|
|
# not json
|
|
resp = self.post_json('/pywb/20140610001255{0}/http://httpbin.org/post?foo=bar', fmod, {'data': '^'}, status=404)
|
|
assert resp.status_int == 404
|
|
|
|
def test_post_referer_redirect(self):
|
|
# allowing 307 redirects
|
|
resp = self.post('/post', fmod,
|
|
{'foo': 'bar', 'test': 'abc'},
|
|
headers=[('Referer', 'http://localhost:80/pywb/2014{0}/http://httpbin.org/foo'.format(fmod))])
|
|
|
|
assert resp.status_int == 307
|
|
assert resp.headers['Location'].endswith('/pywb/2014{0}/http://httpbin.org/post'.format(fmod))
|
|
|
|
def _test_excluded_content(self):
|
|
fmod_slash = fmod + '/' if fmod else ''
|
|
resp = self.get('/pywb/{0}http://www.iana.org/_img/bookmark_icon.ico', fmod_slash, status=403)
|
|
assert resp.status_int == 403
|
|
assert 'Excluded' in resp.text
|
|
|
|
def test_replay_not_found(self, fmod):
|
|
fmod_slash = fmod + '/' if fmod else ''
|
|
resp = self.head('/pywb/{0}http://not-exist.example.com/', fmod_slash, status=404)
|
|
assert resp.content_type == 'text/html'
|
|
assert resp.status_int == 404
|
|
|
|
def test_static_content(self):
|
|
resp = self.testapp.get('/static/default_banner.css')
|
|
assert resp.status_int == 200
|
|
assert resp.content_type == 'text/css'
|
|
assert resp.content_length > 0
|
|
|
|
def test_static_content_filewrapper(self):
|
|
from wsgiref.util import FileWrapper
|
|
resp = self.testapp.get('/static/default_banner.css', extra_environ = {'wsgi.file_wrapper': FileWrapper})
|
|
assert resp.status_int == 200
|
|
assert resp.content_type == 'text/css'
|
|
assert resp.content_length > 0
|
|
|
|
def test_static_nested_dir(self):
|
|
resp = self.testapp.get('/static/fonts/font-awesome/fontawesome-webfont.woff')
|
|
assert resp.status_int == 200
|
|
assert resp.content_length > 0
|
|
|
|
def test_static_not_found(self):
|
|
resp = self.testapp.get('/static/notfound.css', status = 404)
|
|
assert resp.status_int == 404
|
|
|
|
def test_cdx_server_filters(self):
|
|
resp = self.testapp.get('/pywb/cdx?url=http://www.iana.org/_css/2013.1/screen.css&filter=mime:warc/revisit&filter=filename:dupes.warc.gz')
|
|
assert resp.content_type == 'text/x-cdxj'
|
|
actual_len = len(resp.text.rstrip().split('\n'))
|
|
assert actual_len == 1, actual_len
|
|
|
|
def test_cdx_server_advanced(self):
|
|
# combine collapsing, reversing and revisit resolving
|
|
resp = self.testapp.get('/pywb/cdx?url=http://www.iana.org/_css/2013.1/print.css&collapseTime=11&resolveRevisits=true&reverse=true')
|
|
|
|
# convert back to CDXObject
|
|
cdxs = list(map(CDXObject, resp.body.rstrip().split(b'\n')))
|
|
assert len(cdxs) == 3, len(cdxs)
|
|
|
|
# verify timestamps
|
|
timestamps = list(map(lambda cdx: cdx['timestamp'], cdxs))
|
|
assert timestamps == ['20140127171239', '20140126201054', '20140126200625']
|
|
|
|
# verify orig filenames (2 revisits, one non)
|
|
origfilenames = list(map(lambda cdx: cdx['orig.filename'], cdxs))
|
|
assert origfilenames == ['iana.warc.gz', 'iana.warc.gz', '-']
|
|
|
|
|
|
# surt() no longer errors on this in 0.3b
|
|
#def test_error(self):
|
|
# resp = self.testapp.get('/pywb/?abc', status = 400)
|
|
# assert resp.status_int == 400
|
|
# assert 'Invalid Url: http://?abc' in resp.text
|
|
|
|
|
|
def test_coll_info_json(self):
|
|
resp = self.testapp.get('/collinfo.json')
|
|
assert resp.content_type == 'application/json'
|
|
value = resp.json
|
|
assert len(value['fixed']) == 4
|
|
assert len(value['dynamic']) == 0
|
|
|
|
#def test_invalid_config(self):
|
|
# with raises(IOError):
|
|
# init_app(create_wb_router,
|
|
# load_yaml=True,
|
|
# config_file='x-invalid-x')
|
|
|
|
|