from .base_config_test import BaseConfigTest, fmod
from pywb.warcserver.index.cdxobject import CDXObject


# ============================================================================
class TestWbIntegration(BaseConfigTest):
    """Integration tests run against the app configured by ``config_test.yaml``.

    Requests are issued through ``self.testapp`` or ``self.get``; both, along
    with ``_assert_basic_html`` / ``_assert_basic_text``, are defined outside
    this file (presumably on ``BaseConfigTest`` -- confirm there).

    Tests that accept ``fmod`` receive it from the ``fmod`` name imported at
    the top of the module and pass it to ``self.get`` together with a URL
    containing a ``{0}`` placeholder.  NOTE(review): ``fmod`` looks like a
    pytest fixture yielding a replay-modifier string -- verify in
    ``base_config_test``.

    Methods named ``_test_*`` do not match pytest's default ``test*``
    discovery prefix, so they are effectively disabled.
    """

    @classmethod
    def setup_class(cls):
        """Delegate class-level setup to the parent with ``config_test.yaml``."""
        super(TestWbIntegration, cls).setup_class('config_test.yaml')

    def test_home(self):
        """GET ``/`` returns basic HTML whose text mentions ``/pywb``."""
        resp = self.testapp.get('/')
        self._assert_basic_html(resp)
        assert '/pywb' in resp.text

    def test_pywb_root(self):
        """GET ``/pywb/`` returns basic HTML containing ``Search``."""
        resp = self.testapp.get('/pywb/')
        self._assert_basic_html(resp)
        assert 'Search' in resp.text

    def test_pywb_root_head(self):
        """HEAD ``/pywb/`` answers 200 with a ``text/html`` content type."""
        resp = self.testapp.head('/pywb/')
        assert resp.content_type == 'text/html'
        assert resp.status_int == 200

    def test_pywb_invalid_path(self):
        """HEAD on an unknown path answers 404 with ``text/html``."""
        resp = self.testapp.head('/blah/', status=404)
        assert resp.content_type == 'text/html'
        assert resp.status_int == 404

    def test_calendar_query(self):
        """Calendar query for ``iana.org`` renders four table rows."""
        resp = self.testapp.get('/pywb/*/iana.org')
        self._assert_basic_html(resp)
        # 3 Captures + header
        assert len(resp.html.find_all('tr')) == 4

    def test_calendar_query_2(self):
        """Calendar query for ``screen.css`` renders eighteen table rows."""
        # unfiltered collection
        resp = self.testapp.get('/pywb/*/http://www.iana.org/_css/2013.1/screen.css')
        self._assert_basic_html(resp)
        # 17 Captures + header
        assert len(resp.html.find_all('tr')) == 18

        # filtered collection
        #resp = self.testapp.get('/pywb-filt/*/http://www.iana.org/_css/2013.1/screen.css')
        #self._assert_basic_html(resp)
        # 1 Capture (filtered) + header
        #assert len(resp.html.find_all('tr')) == 2

    def test_calendar_query_fuzzy_match(self):
        """Calendar query with an extra ``_=`` param still lists eighteen rows."""
        # fuzzy match removing _= according to standard rules.yaml
        resp = self.testapp.get('/pywb/*/http://www.iana.org/_css/2013.1/screen.css?_=3141592653')
        self._assert_basic_html(resp)
        # 17 Captures + header
        assert len(resp.html.find_all('tr')) == 18

    def test_calendar_query_fuzzy_match_add_slash(self):
        """Same as the fuzzy-match query, with a ``/`` before the query string."""
        # fuzzy match removing _= according to standard rules.yaml
        resp = self.testapp.get('/pywb/*/http://www.iana.org/_css/2013.1/screen.css/?_=3141592653')
        self._assert_basic_html(resp)
        # 17 Captures + header
        assert len(resp.html.find_all('tr')) == 18

    def test_calendar_not_found(self):
        """Calendar query for an unknown host shows the no-captures message."""
        # query with no results
        resp = self.testapp.get('/pywb/*/http://not-exist.example.com')
        self._assert_basic_html(resp)
        assert 'No captures found' in resp.text, resp.text
        assert len(resp.html.find_all('tr')) == 0

    def _test_cdx_query(self):
        """Disabled: ``cdx_`` query should return three lines of plain text."""
        resp = self.testapp.get('/pywb/cdx_/*/http://www.iana.org/')
        self._assert_basic_text(resp)

        assert '20140127171238 http://www.iana.org/ warc/revisit - OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB' in resp
        # check for 3 cdx lines (strip final newline)
        actual_len = len(str(resp.text).rstrip().split('\n'))
        assert actual_len == 3, actual_len

    def test_replay_top_frame(self):
        """Replay URL without a modifier serves the frame page and no CSP header."""
        resp = self.testapp.get('/pywb/20140127171238/http://www.iana.org/')

        assert 'new ContentFrame' in resp.text
        assert '"20140127171238"' in resp.text
        assert 'http://www.iana.org/' in resp.text, resp.text

        assert 'Content-Security-Policy' not in resp.headers

    def test_replay_content(self, fmod):
        """Replayed page carries wombat setup, rewritten links and the CSP header."""
        resp = self.get('/pywb/20140127171238{0}/http://www.iana.org/', fmod)
        self._assert_basic_html(resp)

        assert '"20140127171238"' in resp.text, resp.text
        assert 'wombat.js' in resp.text
        assert 'new _WBWombat' in resp.text, resp.text
        assert '/pywb/20140127171238{0}/http://www.iana.org/time-zones"'.format(fmod) in resp.text
        # a non-empty fmod is expected to produce ``is_framed = true``
        assert ('wbinfo.is_framed = ' + ('true' if fmod else 'false')) in resp.text

        csp = "default-src 'unsafe-eval' 'unsafe-inline' 'self' data: blob: mediastream: ws: wss: ; form-action 'self'"
        assert resp.headers['Content-Security-Policy'] == csp

    def test_replay_fuzzy_1(self, fmod):
        """``?_=123`` replays with 200 and a Content-Location without the param."""
        resp = self.get('/pywb/20140127171238{0}/http://www.iana.org/?_=123', fmod)
        assert resp.status_int == 200
        assert resp.headers['Content-Location'].endswith('/pywb/20140126200624{0}/http://www.iana.org/'.format(fmod))

    def test_replay_no_fuzzy_match(self, fmod):
        """``?foo=bar`` is not matched and the replay answers 404."""
        resp = self.get('/pywb/20140127171238{0}/http://www.iana.org/?foo=bar', fmod, status=404)
        assert resp.status_int == 404

    def test_no_slash_redir_1(self, fmod):
        """Host-only URL without a trailing slash gets a 307 to the slashed URL."""
        resp = self.get('/pywb/20140103030321{0}/http://example.com', fmod)
        assert resp.status_int == 307
        assert resp.headers['Location'].endswith('/pywb/20140103030321{0}/http://example.com/'.format(fmod))

    def test_no_slash_redir_2(self, fmod):
        """As above, with a query string that is kept after the added slash."""
        resp = self.get('/pywb/20140103030321{0}/http://example.com?example=1', fmod)
        assert resp.status_int == 307
        assert resp.headers['Location'].endswith('/pywb/20140103030321{0}/http://example.com/?example=1'.format(fmod))

    def test_replay_cdxj(self, fmod):
        """Replay from the ``pywb-cdxj`` collection yields rewritten HTML."""
        resp = self.get('/pywb-cdxj/20140103030321{0}/http://example.com/?example=1', fmod)
        self._assert_basic_html(resp)

        assert '"20140103030321"' in resp.text
        assert 'wombat.js' in resp.text
        assert '/pywb-cdxj/20140103030321{0}/http://www.iana.org/domains/example'.format(fmod) in resp.text

    def test_replay_cdxj_revisit(self, fmod):
        """Replay of the ``20140103030341`` capture from ``pywb-cdxj``."""
        resp = self.get('/pywb-cdxj/20140103030341{0}/http://example.com/?example=1', fmod)
        self._assert_basic_html(resp)

        assert '"20140103030341"' in resp.text
        assert 'wombat.js' in resp.text
        assert '/pywb-cdxj/20140103030341{0}/http://www.iana.org/domains/example'.format(fmod) in resp.text

    def test_zero_len_revisit(self, fmod):
        """Replay of the ``20140603030341`` capture of ``?example=2``."""
        resp = self.get('/pywb/20140603030341{0}/http://example.com/?example=2', fmod)
        self._assert_basic_html(resp)

        assert '"20140603030341"' in resp.text
        assert 'wombat.js' in resp.text
        assert '/pywb/20140603030341{0}/http://www.iana.org/domains/example'.format(fmod) in resp.text

    def test_replay_url_agnostic_revisit(self, fmod):
        """Replay of ``www.example.com`` at ``20130729195151`` yields rewritten HTML."""
        resp = self.get('/pywb/20130729195151{0}/http://www.example.com/', fmod)
        self._assert_basic_html(resp)

        assert '"20130729195151"' in resp.text
        assert 'wombat.js' in resp.text
        assert '/pywb/20130729195151{0}/http://www.iana.org/domains/example"'.format(fmod) in resp.text

    def test_video_info_not_found(self):
        """``vi_`` request for a URL that is not archived answers 404."""
        # not actually archived, but ensure video info path is tested
        resp = self.testapp.get('/pywb/vi_/https://www.youtube.com/watch?v=DjFZyFWSt1M', status=404)
        assert resp.status_int == 404

    def _test_replay_cdx_mod(self):
        """Disabled: ``cdx_`` modifier on a replay URL should list 17 text lines."""
        resp = self.testapp.get('/pywb/20140127171239cdx_/http://www.iana.org/_css/2013.1/print.css')
        self._assert_basic_text(resp)

        lines = resp.text.rstrip().split('\n')
        assert len(lines) == 17
        assert lines[0].startswith('org,iana)/_css/2013.1/print.css 20140127171239')

    def test_replay_banner_only(self):
        """``bn_`` replay references wombat.js but leaves page URLs unrewritten."""
        resp = self.testapp.get('/pywb/20140126201054bn_/http://www.iana.org/domains/reserved')

        # wombat.js header insertion
        assert 'wombat.js' in resp.text

        # no wombat present
        assert '_WBWombat' not in resp.text

        # url not rewritten
        #assert '"http://www.iana.org/domains/example"' in resp.text
        assert '"/_css/2013.1/screen.css"' in resp.text

    def test_replay_identity_1(self):
        """``id_`` replay returns the 1270-byte original without wombat.js."""
        resp = self.testapp.get('/pywb/20140127171251id_/http://example.com/')

        # no wb header insertion
        assert 'wombat.js' not in resp.text

        assert resp.content_length == 1270, resp.content_length

        # original unrewritten url present
        assert '"http://www.iana.org/domains/example"' in resp.text

    def _test_replay_range_cache_content(self):
        """Disabled: ``Range: bytes=0-200`` on ``id_`` replay should give a 206."""
        headers = [('Range', 'bytes=0-200')]
        resp = self.testapp.get('/pywb/20140127171250id_/http://example.com', headers=headers)

        assert resp.status_int == 206
        assert resp.headers['Accept-Ranges'] == 'bytes'
        assert resp.headers['Content-Range'] == 'bytes 0-200/1270', resp.headers['Content-Range']
        assert resp.content_length == 201, resp.content_length

        assert 'wombat.js' not in resp.text

    def _test_replay_content_ignore_range(self):
        """Disabled: ``pywb-norange`` should answer a Range request with a full 200."""
        headers = [('Range', 'bytes=0-200')]
        resp = self.testapp.get('/pywb-norange/20140127171251id_/http://example.com', headers=headers)

        # range request ignored
        assert resp.status_int == 200

        # full response
        assert resp.content_length == 1270, resp.content_length

        # identity, no header insertion
        assert 'wombat.js' not in resp.text

    def _test_replay_range_cache_content_bound_end(self):
        """Disabled: a Range end past the body should be clamped to byte 1269."""
        headers = [('Range', 'bytes=10-10000')]
        resp = self.testapp.get('/pywb/20140127171251id_/http://example.com', headers=headers)

        assert resp.status_int == 206
        assert resp.headers['Accept-Ranges'] == 'bytes'
        assert resp.headers['Content-Range'] == 'bytes 10-1269/1270', resp.headers['Content-Range']
        assert resp.content_length == 1260, resp.content_length
        assert len(resp.text) == resp.content_length

        assert 'wombat.js' not in resp.text

    def _test_replay_redir_no_cache(self):
        """Disabled: Range header on a redirecting capture should still yield 302."""
        headers = [('Range', 'bytes=10-10000')]
        # Range ignored
        resp = self.testapp.get('/pywb/20140126200927/http://www.iana.org/domains/root/db/', headers=headers)
        assert resp.status_int == 302
        assert resp.content_length == 0

    def test_replay_identity_2_arcgz(self):
        """``id_`` replay of ``arc.gz.test.example.com`` is returned unrewritten."""
        resp = self.testapp.get('/pywb/20140216050221id_/http://arc.gz.test.example.com/')

        # no wb header insertion
        assert 'wombat.js' not in resp.text

        # original unrewritten url present
        assert '"http://www.iana.org/domains/example"' in resp.text

    def test_replay_identity_2_arc(self):
        """``id_`` replay of ``arc.test.example.com`` is returned unrewritten."""
        resp = self.testapp.get('/pywb/20140216050221id_/http://arc.test.example.com/')

        # no wb header insertion
        assert 'wombat.js' not in resp.text

        # original unrewritten url present
        assert '"http://www.iana.org/domains/example"' in resp.text

    def test_replay_content_length_1(self, fmod):
        """Content-Length header equals the length of the replayed SVG text."""
        # test larger file, rewritten file (svg!)
        resp = self.get('/pywb/20140126200654{0}/http://www.iana.org/_img/2013.1/rir-map.svg', fmod)
        assert resp.headers['Content-Length'] == str(len(resp.text))

    def test_replay_css_mod(self):
        """``cs_`` replay of ``screen.css`` answers 200 as ``text/css``."""
        resp = self.testapp.get('/pywb/20140127171239cs_/http://www.iana.org/_css/2013.1/screen.css')
        assert resp.status_int == 200
        assert resp.content_type == 'text/css'

    def test_replay_js_mod_no_obj_proxy(self):
        """``js_`` replay of ``iana.js`` with an IE11 User-Agent has an empty body."""
        # an empty js file, (ie11 UA no js obj proxy)
        resp = self.testapp.get('/pywb/20140126201054js_/http://www.iana.org/_js/2013.1/iana.js',
                                headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'})
        assert resp.status_int == 200
        assert resp.content_length == 0
        assert resp.content_type == 'application/x-javascript'

    def test_replay_js_obj_proxy(self, fmod):
        """jquery.js replay includes the wombat ``let window = ...`` assignment for Chrome."""
        # test js proxy obj with jquery -- no user agent
        resp = self.get('/pywb/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod)
        assert resp.status_int == 200
        assert resp.content_length != 0
        assert resp.content_type == 'application/x-javascript'

        # test with Chrome user agent
        resp = self.get('/pywb/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod,
                        headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'})
        assert 'let window = _____WB$wombat$assign$function_____(' in resp.text

    def test_replay_js_ie11_no_obj_proxy(self, fmod):
        """jquery.js replay omits the ``let window = ...`` assignment for IE11."""
        # IE11 user-agent, no proxy
        resp = self.get('/pywb/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod,
                        headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'})
        assert 'let window = _____WB$wombat$assign$function_____(' not in resp.text

    def test_replay_non_exact(self, fmod):
        # non-exact mode, don't redirect to exact capture
        resp = self.get('/pywb/20140127171237{0}/http://www.iana.org/', fmod)
        assert resp.status_int == 200

        self._assert_basic_html(resp)
        assert '"20140127171237"' in resp.text
        # actual timestamp
set in JS assert 'timestamp = "20140127171238"' in resp.text assert '/pywb/20140127171237{0}/http://www.iana.org/about/'.format(fmod) in resp.text def test_replay_remote_ait(self, fmod): resp = self.get('/ait:1068/2011{0}/http://www.iana.org/domains/example/', fmod) self._assert_basic_html(resp) assert '"20120119230023"' in resp.text, resp.text assert '