diff --git a/warcprox/tests/test_warcproxy.py b/warcprox/tests/test_warcproxy.py index 3259a9d..647ccce 100755 --- a/warcprox/tests/test_warcproxy.py +++ b/warcprox/tests/test_warcproxy.py @@ -53,7 +53,7 @@ class WarcproxTest(unittest.TestCase): @property def _cert(self): if self.__cert is None: - f = tempfile.NamedTemporaryFile(prefix='warcprox-test', suffix='-https.pem', delete=False) + f = tempfile.NamedTemporaryFile(prefix='warcprox-test-https-', suffix='.pem', delete=False) try: key = OpenSSL.crypto.PKey() key.generate_key(OpenSSL.crypto.TYPE_RSA, 2048) @@ -73,7 +73,7 @@ class WarcproxTest(unittest.TestCase): f.write(OpenSSL.crypto.dump_privatekey(OpenSSL.SSL.FILETYPE_PEM, key)) f.write(OpenSSL.crypto.dump_certificate(OpenSSL.SSL.FILETYPE_PEM, cert)) - logging.info('generated self-signed certificate {}'.format(f.name)) + self.logger.info('generated self-signed certificate {}'.format(f.name)) self.__cert = f.name finally: f.close() @@ -82,7 +82,7 @@ class WarcproxTest(unittest.TestCase): def _start_http_servers(self): - self.http_daemon = BaseHTTPServer.HTTPServer(('localhost', 0), + self.http_daemon = BaseHTTPServer.HTTPServer(('localhost', 0), RequestHandlerClass=TestHttpRequestHandler) self.logger.info('starting http://{}:{}'.format(self.http_daemon.server_address[0], self.http_daemon.server_address[1])) self.http_daemon_thread = threading.Thread(name='HttpdThread', @@ -90,7 +90,7 @@ class WarcproxTest(unittest.TestCase): self.http_daemon_thread.start() # http://www.piware.de/2011/01/creating-an-https-server-in-python/ - self.https_daemon = BaseHTTPServer.HTTPServer(('localhost', 0), + self.https_daemon = BaseHTTPServer.HTTPServer(('localhost', 0), RequestHandlerClass=TestHttpRequestHandler) # self.https_daemon.socket = ssl.wrap_socket(httpd.socket, certfile='path/to/localhost.pem', server_side=True) self.https_daemon.socket = ssl.wrap_socket(self.https_daemon.socket, certfile=self._cert, server_side=True) @@ -101,7 +101,7 @@ class WarcproxTest(unittest.TestCase): def _start_warcprox(self): - f = tempfile.NamedTemporaryFile(prefix='warcprox-test-', suffix='-ca.pem', delete=True) + f = tempfile.NamedTemporaryFile(prefix='warcprox-test-ca-', suffix='.pem', delete=True) f.close() # delete it, or CertificateAuthority will try to read it self._ca_file = f.name self._ca_dir = tempfile.mkdtemp(prefix='warcprox-test-', suffix='-ca') @@ -109,25 +109,25 @@ class WarcproxTest(unittest.TestCase): recorded_url_q = Queue.Queue() - proxy = warcprox.WarcProxy(server_address=('localhost', 0), ca=ca, + proxy = warcprox.WarcProxy(server_address=('localhost', 0), ca=ca, recorded_url_q=recorded_url_q) - self._warcs_dir = tempfile.mkdtemp(prefix='warcprox-test-', suffix='-warcs') + self._warcs_dir = tempfile.mkdtemp(prefix='warcprox-test-warcs-') - f = tempfile.NamedTemporaryFile(prefix='warcprox-test-', suffix='-playback-index.db', delete=False) + f = tempfile.NamedTemporaryFile(prefix='warcprox-test-playback-index-', suffix='.db', delete=False) f.close() self._playback_index_db_file = f.name playback_index_db = warcprox.PlaybackIndexDb(self._playback_index_db_file) - playback_proxy = warcprox.PlaybackProxy(server_address=('localhost', 0), ca=ca, + playback_proxy = warcprox.PlaybackProxy(server_address=('localhost', 0), ca=ca, playback_index_db=playback_index_db, warcs_dir=self._warcs_dir) - f = tempfile.NamedTemporaryFile(prefix='warcprox-test-', suffix='-dedup.db', delete=False) + f = tempfile.NamedTemporaryFile(prefix='warcprox-test-dedup-', suffix='.db', delete=False) f.close() self._dedup_db_file = f.name dedup_db = warcprox.DedupDb(self._dedup_db_file) warc_writer = warcprox.WarcWriterThread(recorded_url_q=recorded_url_q, - directory=self._warcs_dir, port=proxy.server_port, + directory=self._warcs_dir, port=proxy.server_port, dedup_db=dedup_db, playback_index_db=playback_index_db) self.warcprox = warcprox.WarcproxController(proxy, warc_writer, playback_proxy) @@ -138,8 +138,8 @@ class WarcproxTest(unittest.TestCase): def setUp(self): - logging.basicConfig(stream=sys.stdout, level=logging.INFO, - format='%(asctime)s %(process)d %(threadName)s %(levelname)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s') + logging.basicConfig(stream=sys.stdout, level=logging.DEBUG, + format='%(asctime)s %(levelname)s %(process)d %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s') self._start_http_servers() self._start_warcprox() @@ -172,10 +172,10 @@ class WarcproxTest(unittest.TestCase): for f in (self.__cert, self._ca_file, self._ca_dir, self._warcs_dir, self._playback_index_db_file, self._dedup_db_file): if os.path.isdir(f): - logging.info('deleting directory {}'.format(f)) + self.logger.info('deleting directory {}'.format(f)) shutil.rmtree(f) else: - logging.info('deleting file {}'.format(f)) + self.logger.info('deleting file {}'.format(f)) os.unlink(f) @@ -203,6 +203,18 @@ class WarcproxTest(unittest.TestCase): self.assertEqual(response.content, 'I am the warcprox test payload! dddddddddd!\n') + def poll_playback_until(self, url, status, timeout_sec): + start = time.time() + # check playback (warc writing is asynchronous, give it up to 10 sec) + while time.time() - start < timeout_sec: + response = requests.get(url, proxies=self.playback_proxies, verify=False) + if response.status_code == status: + break + time.sleep(0.5) + + return response + + def _test_archive_and_playback_http_url(self): url = 'http://localhost:{}/a/b'.format(self.http_daemon.server_port) @@ -217,13 +229,7 @@ class WarcproxTest(unittest.TestCase): self.assertEqual(response.headers['warcprox-test-header'], 'a!') self.assertEqual(response.content, 'I am the warcprox test payload! bbbbbbbbbb!\n') - # check playback (warc writing is asynchronous, give it up to 10 sec) - for i in xrange(0,20): - response = requests.get(url, proxies=self.playback_proxies) - if response.status_code != 404: - break - time.sleep(0.5) - + response = self.poll_playback_until(url, status=200, timeout_sec=10) self.assertEqual(response.status_code, 200) self.assertEqual(response.headers['warcprox-test-header'], 'a!') self.assertEqual(response.content, 'I am the warcprox test payload! bbbbbbbbbb!\n') @@ -237,30 +243,153 @@ class WarcproxTest(unittest.TestCase): self.assertEqual(response.status_code, 404) self.assertEqual(response.content, '404 Not in Archive\n') - # archive + # fetch & archive response response = requests.get(url, proxies=self.archiving_proxies, verify=False) self.assertEqual(response.status_code, 200) self.assertEqual(response.headers['warcprox-test-header'], 'c!') self.assertEqual(response.content, 'I am the warcprox test payload! dddddddddd!\n') - # check playback (warc writing is asynchronous, give it up to 10 sec) - for i in xrange(0,20): - response = requests.get(url, proxies=self.playback_proxies, verify=False) - if response.status_code != 404: - break - time.sleep(0.5) - + # test playback + response = self.poll_playback_until(url, status=200, timeout_sec=10) self.assertEqual(response.status_code, 200) self.assertEqual(response.headers['warcprox-test-header'], 'c!') self.assertEqual(response.content, 'I am the warcprox test payload! dddddddddd!\n') + # test dedup of same http url with same payload + def _test_dedup_http(self): + url = 'http://localhost:{}/e/f'.format(self.http_daemon.server_port) + + # ensure playback fails before archiving + response = requests.get(url, proxies=self.playback_proxies, verify=False) + self.assertEqual(response.status_code, 404) + self.assertEqual(response.content, '404 Not in Archive\n') + + # check not in dedup db + dedup_lookup = self.warcprox.warc_writer.dedup_db.lookup('sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc') + self.assertIsNone(dedup_lookup) + + # archive + response = requests.get(url, proxies=self.archiving_proxies, verify=False) + self.assertEqual(response.status_code, 200) + self.assertEqual(response.headers['warcprox-test-header'], 'e!') + self.assertEqual(response.content, 'I am the warcprox test payload! ffffffffff!\n') + + # test playback + response = self.poll_playback_until(url, status=200, timeout_sec=10) + self.assertEqual(response.status_code, 200) + self.assertEqual(response.headers['warcprox-test-header'], 'e!') + self.assertEqual(response.content, 'I am the warcprox test payload! ffffffffff!\n') + + # check in dedup db + # {u'i': u'', u'u': u'https://localhost:62841/c/d', u'd': u'2013-11-22T00:14:37Z'} + dedup_lookup = self.warcprox.warc_writer.dedup_db.lookup('sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc') + self.assertEquals(dedup_lookup['u'], url) + self.assertRegexpMatches(dedup_lookup['i'], r'^$') + self.assertRegexpMatches(dedup_lookup['d'], r'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$') + record_id = dedup_lookup['i'] + dedup_date = dedup_lookup['d'] + + # need revisit to have a later timestamp than original, else playing + # back the latest record might not hit the revisit + time.sleep(1.5) + + # fetch & archive revisit + response = requests.get(url, proxies=self.archiving_proxies, verify=False) + self.assertEqual(response.status_code, 200) + self.assertEqual(response.headers['warcprox-test-header'], 'e!') + self.assertEqual(response.content, 'I am the warcprox test payload! ffffffffff!\n') + + # XXX need to give warc writer thread a chance, and we don't have any change to poll for :-\ + time.sleep(2.0) + + # check in dedup db (no change from prev) + dedup_lookup = self.warcprox.warc_writer.dedup_db.lookup('sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc') + self.assertEquals(dedup_lookup['u'], url) + self.assertEquals(dedup_lookup['i'], record_id) + self.assertEquals(dedup_lookup['d'], dedup_date) + + # test playback + self.logger.debug('testing playback of revisit of {}'.format(url)) + response = self.poll_playback_until(url, status=200, timeout_sec=10) + self.assertEqual(response.status_code, 200) + self.assertEqual(response.headers['warcprox-test-header'], 'e!') + self.assertEqual(response.content, 'I am the warcprox test payload! ffffffffff!\n') + # XXX how to check dedup was used? + + + # test dedup of same https url with same payload + def _test_dedup_https(self): + url = 'https://localhost:{}/g/h'.format(self.https_daemon.server_port) + + # ensure playback fails before archiving + response = requests.get(url, proxies=self.playback_proxies, verify=False) + self.assertEqual(response.status_code, 404) + self.assertEqual(response.content, '404 Not in Archive\n') + + # check not in dedup db + dedup_lookup = self.warcprox.warc_writer.dedup_db.lookup('sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89') + self.assertIsNone(dedup_lookup) + + # archive + response = requests.get(url, proxies=self.archiving_proxies, verify=False) + self.assertEqual(response.status_code, 200) + self.assertEqual(response.headers['warcprox-test-header'], 'g!') + self.assertEqual(response.content, 'I am the warcprox test payload! hhhhhhhhhh!\n') + + # test playback + response = self.poll_playback_until(url, status=200, timeout_sec=10) + self.assertEqual(response.status_code, 200) + self.assertEqual(response.headers['warcprox-test-header'], 'g!') + self.assertEqual(response.content, 'I am the warcprox test payload! hhhhhhhhhh!\n') + + # check in dedup db + # {u'i': u'', u'u': u'https://localhost:62841/c/d', u'd': u'2013-11-22T00:14:37Z'} + dedup_lookup = self.warcprox.warc_writer.dedup_db.lookup('sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89') + self.assertEquals(dedup_lookup['u'], url) + self.assertRegexpMatches(dedup_lookup['i'], r'^$') + self.assertRegexpMatches(dedup_lookup['d'], r'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$') + record_id = dedup_lookup['i'] + dedup_date = dedup_lookup['d'] + + # need revisit to have a later timestamp than original, else playing + # back the latest record might not hit the revisit + time.sleep(1.5) + + # fetch & archive revisit + response = requests.get(url, proxies=self.archiving_proxies, verify=False) + self.assertEqual(response.status_code, 200) + self.assertEqual(response.headers['warcprox-test-header'], 'g!') + self.assertEqual(response.content, 'I am the warcprox test payload! hhhhhhhhhh!\n') + + # XXX need to give warc writer thread a chance, and we don't have any change to poll for :-\ + time.sleep(2.0) + + # check in dedup db (no change from prev) + dedup_lookup = self.warcprox.warc_writer.dedup_db.lookup('sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89') + self.assertEquals(dedup_lookup['u'], url) + self.assertEquals(dedup_lookup['i'], record_id) + self.assertEquals(dedup_lookup['d'], dedup_date) + + # test playback + self.logger.debug('testing playback of revisit of {}'.format(url)) + response = self.poll_playback_until(url, status=200, timeout_sec=10) + self.assertEqual(response.status_code, 200) + self.assertEqual(response.headers['warcprox-test-header'], 'g!') + self.assertEqual(response.content, 'I am the warcprox test payload! hhhhhhhhhh!\n') + # XXX how to check dedup was used? + + # run everything from here, otherwise it wants to setUp() and tearDown # around each test def runTest(self): self._test_httpds_no_proxy() self._test_archive_and_playback_http_url() self._test_archive_and_playback_https_url() + self._test_dedup_http() + self._test_dedup_https() + # self._test_dedup_mixed_http() + # self._test_dedup_mixed_https() if __name__ == '__main__':