2014-01-31 19:41:44 -08:00
|
|
|
import webtest
|
2014-02-24 03:00:01 -08:00
|
|
|
from pywb.bootstrap.pywb_init import pywb_config
|
|
|
|
from pywb.bootstrap.wbapp import create_wb_app
|
2014-02-17 02:34:39 -08:00
|
|
|
from pywb.cdx.cdxobject import CDXObject
|
2014-03-01 18:40:16 -08:00
|
|
|
|
|
|
|
from fixture import TestExclusionPerms
|
2014-01-31 19:41:44 -08:00
|
|
|
|
|
|
|
class TestWb:
    """Integration tests that drive the pywb WSGI app through webtest.

    ``setup`` builds a router from ``TEST_CONFIG``, wraps it in a WSGI
    application and exposes a ``webtest.TestApp`` client as
    ``self.testapp``.  Every test method issues requests through that
    client and asserts on the status, headers and body of the response.
    """

    TEST_CONFIG = 'test_config.yaml'

    def setup(self):
        """Build the router, the WSGI app and the webtest client."""
        # The router is kept on self because it is useful for debugging.
        self.router = pywb_config(self.TEST_CONFIG)
        self.app = create_wb_app(self.router)
        self.testapp = webtest.TestApp(self.app)

    # -- shared helpers ----------------------------------------------------

    def _assert_basic_html(self, resp):
        """Assert that *resp* is a non-empty 200 ``text/html`` response."""
        assert resp.status_int == 200
        assert resp.content_type == 'text/html'
        assert resp.content_length > 0

    def _assert_basic_text(self, resp):
        """Assert that *resp* is a non-empty 200 ``text/plain`` response."""
        assert resp.status_int == 200
        assert resp.content_type == 'text/plain'
        assert resp.content_length > 0

    def _body_lines(self, resp):
        """Split ``resp.body`` on newlines after stripping its trailing
        whitespace, so a final newline does not yield an empty last entry.
        """
        return resp.body.rstrip().split('\n')

    # -- home / search pages -----------------------------------------------

    def test_home(self):
        """The root page is HTML and mentions the ``/pywb`` collection."""
        resp = self.testapp.get('/')
        self._assert_basic_html(resp)
        assert '/pywb' in resp.body

    def test_pywb_root(self):
        """The collection root is HTML and contains 'Search'."""
        resp = self.testapp.get('/pywb/')
        self._assert_basic_html(resp)
        assert 'Search' in resp.body

    # -- calendar queries --------------------------------------------------

    def test_calendar_query(self):
        """A ``*`` query for iana.org lists the expected table rows."""
        resp = self.testapp.get('/pywb/*/iana.org')
        self._assert_basic_html(resp)
        # 3 Captures + header
        assert len(resp.html.find_all('tr')) == 4

    def test_calendar_query_filtered(self):
        """The same query returns fewer rows from the filtered collection."""
        # unfiltered collection
        resp = self.testapp.get('/pywb/*/http://www.iana.org/_css/2013.1/screen.css')
        self._assert_basic_html(resp)
        # 17 Captures + header
        assert len(resp.html.find_all('tr')) == 18

        # filtered collection
        resp = self.testapp.get('/pywb-filt/*/http://www.iana.org/_css/2013.1/screen.css')
        self._assert_basic_html(resp)
        # 1 Capture (filtered) + header
        assert len(resp.html.find_all('tr')) == 2

    def test_calendar_query_fuzzy_match(self):
        """A ``?_=...`` suffix still yields the full set of rows."""
        # fuzzy match removing _= according to standard rules.yaml
        resp = self.testapp.get('/pywb/*/http://www.iana.org/_css/2013.1/screen.css?_=3141592653')
        self._assert_basic_html(resp)
        # 17 Captures + header
        assert len(resp.html.find_all('tr')) == 18

    def test_cdx_query(self):
        """A ``cdx_`` query returns plain text with three cdx lines."""
        resp = self.testapp.get('/pywb/cdx_/*/http://www.iana.org/')
        self._assert_basic_text(resp)

        assert '20140127171238 http://www.iana.org/ warc/revisit - OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB' in resp

        # check for 3 cdx lines (strip final newline)
        actual_len = len(self._body_lines(resp))
        assert actual_len == 3, actual_len

    # -- replay ------------------------------------------------------------

    def test_replay_1(self):
        """Replay of a capture contains the banner date, wb.js and a
        rewritten link.
        """
        resp = self.testapp.get('/pywb/20140127171238/http://www.iana.org/')
        self._assert_basic_html(resp)

        assert 'Mon, Jan 27 2014 17:12:38' in resp.body
        assert 'wb.js' in resp.body
        assert '/pywb/20140127171238/http://www.iana.org/time-zones"' in resp.body

    def test_replay_identity_1(self):
        """An ``id_`` replay request returns the content unmodified."""
        resp = self.testapp.get('/pywb/20140127171251id_/http://example.com')

        # no wb header insertion
        assert 'wb.js' not in resp.body

        # original unrewritten url present
        assert '"http://www.iana.org/domains/example"' in resp.body

    def test_replay_content_length_1(self):
        """``Content-Length`` matches the actual body size."""
        # test larger file, rewritten file (svg!)
        resp = self.testapp.get('/pywb/20140126200654/http://www.iana.org/_img/2013.1/rir-map.svg')
        assert resp.headers['Content-Length'] == str(len(resp.body))

    # -- redirects ---------------------------------------------------------

    def test_redirect_1(self):
        """A request for a nearby timestamp answers with a 302 to the
        expected capture url.
        """
        resp = self.testapp.get('/pywb/20140127171237/http://www.iana.org/')
        assert resp.status_int == 302

        assert resp.headers['Location'].endswith('/pywb/20140127171238/http://iana.org')

    def test_redirect_replay_2(self):
        """A request without a timestamp redirects, and following the
        redirect yields the replayed page.
        """
        resp = self.testapp.get('/pywb/http://example.com/')
        assert resp.status_int == 302

        assert resp.headers['Location'].endswith('/20140127171251/http://example.com')
        resp = resp.follow()

        # check resp
        self._assert_basic_html(resp)
        assert 'Mon, Jan 27 2014 17:12:51' in resp.body
        assert '/pywb/20140127171251/http://www.iana.org/domains/example' in resp.body

    def test_redirect_relative_3(self):
        """Relative requests carrying a replay ``Referer`` are redirected
        into the collection, then on to the capture itself.
        """
        # first two requests should result in same redirect
        target = 'http://localhost:8080/pywb/2014/http://iana.org/_css/2013.1/screen.css'
        referer_headers = [('Referer', 'http://localhost:8080/pywb/2014/http://iana.org/')]

        # without timestamp
        resp = self.testapp.get('/_css/2013.1/screen.css',
                                headers=referer_headers)
        assert resp.status_int == 302
        assert resp.headers['Location'] == target, resp.headers['Location']

        # with timestamp
        resp = self.testapp.get('/2014/_css/2013.1/screen.css',
                                headers=referer_headers)
        assert resp.status_int == 302
        assert resp.headers['Location'] == target, resp.headers['Location']

        resp = resp.follow()
        assert resp.status_int == 302
        assert resp.headers['Location'].endswith('/pywb/20140127171239/http://www.iana.org/_css/2013.1/screen.css')

        resp = resp.follow()
        assert resp.status_int == 200
        assert resp.content_type == 'text/css'

    def test_referrer_self_redirect(self):
        """A capture requested with itself as the ``Referer`` answers 302
        instead of 200.
        """
        uri = '/pywb/20140127171239/http://www.iana.org/_css/2013.1/screen.css'
        host = 'somehost:8082'
        referrer = 'http://' + host + uri

        # capture is normally a 200
        resp = self.testapp.get(uri)
        assert resp.status_int == 200

        # redirect causes skip of this capture, redirect to next
        resp = self.testapp.get(uri,
                                headers=[('Referer', referrer), ('Host', host)],
                                status=302)
        assert resp.status_int == 302

    # -- exclusions / static content ---------------------------------------

    def test_excluded_content(self):
        """An excluded url answers 403 with 'Excluded' in the body."""
        resp = self.testapp.get('/pywb/http://www.iana.org/_img/bookmark_icon.ico',
                                status=403)
        assert resp.status_int == 403
        assert 'Excluded' in resp.body

    def test_static_content(self):
        """The static route serves a non-empty css file."""
        resp = self.testapp.get('/static/test/route/wb.css')
        assert resp.status_int == 200
        assert resp.content_type == 'text/css'
        assert resp.content_length > 0

    # -- proxy mode --------------------------------------------------------

    # 'Simulating' proxy by setting REQUEST_URI explicitly to http:// url
    # and no SCRIPT_NAME.
    # would be nice to be able to test proxy more
    def test_proxy_replay(self):
        """A request whose ``REQUEST_URI`` is an absolute url is replayed."""
        resp = self.testapp.get(
            '/x-ignore-this-x',
            extra_environ=dict(REQUEST_URI='http://www.iana.org/domains/idn-tables',
                               SCRIPT_NAME=''))
        self._assert_basic_html(resp)

        assert 'Sun, Jan 26 2014 20:11:27' in resp.body
        assert 'wb.js' in resp.body

    def test_proxy_pac(self):
        """``/proxy.pac`` names the server host and port it was asked on."""
        resp = self.testapp.get(
            '/proxy.pac',
            extra_environ=dict(SERVER_NAME='pywb-proxy', SERVER_PORT='8080'))
        assert resp.content_type == 'application/x-ns-proxy-autoconfig'
        assert '"PROXY pywb-proxy:8080"' in resp.body
        assert '"localhost"' in resp.body

    # -- cdx server --------------------------------------------------------

    def test_cdx_server_filters(self):
        """Two ``filter`` params together narrow the result to one line."""
        resp = self.testapp.get('/pywb-cdx?url=http://www.iana.org/_css/2013.1/screen.css&filter=mimetype:warc/revisit&filter=filename:dupes.warc.gz')
        self._assert_basic_text(resp)
        actual_len = len(self._body_lines(resp))
        assert actual_len == 1, actual_len

    def test_cdx_server_advanced(self):
        """Collapsing, reversing and revisit resolving work together."""
        # combine collapsing, reversing and revisit resolving
        resp = self.testapp.get('/pywb-cdx?url=http://www.iana.org/_css/2013.1/print.css&collapseTime=11&resolveRevisits=true&reverse=true')

        # convert back to CDXObject
        # (a real list, so len() and == below do not rely on map()
        # returning a list)
        cdxs = [CDXObject(line) for line in self._body_lines(resp)]
        assert len(cdxs) == 3, len(cdxs)

        # verify timestamps
        timestamps = [cdx['timestamp'] for cdx in cdxs]
        assert timestamps == ['20140127171239', '20140126201054', '20140126200625']

        # verify orig filenames (2 revisits, one non)
        origfilenames = [cdx['orig.filename'] for cdx in cdxs]
        assert origfilenames == ['iana.warc.gz', 'iana.warc.gz', '-']

    # -- errors ------------------------------------------------------------

    def test_error(self):
        """An invalid url answers 400 with an explanatory message."""
        resp = self.testapp.get('/pywb/?abc', status=400)
        assert resp.status_int == 400
        assert 'Invalid Url: http://?abc' in resp.body
|
2014-02-11 14:10:40 -08:00
|
|
|
|