mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
test dedup of same url
This commit is contained in:
parent
bdd218d338
commit
6fbae16a31
@ -53,7 +53,7 @@ class WarcproxTest(unittest.TestCase):
|
||||
@property
|
||||
def _cert(self):
|
||||
if self.__cert is None:
|
||||
f = tempfile.NamedTemporaryFile(prefix='warcprox-test', suffix='-https.pem', delete=False)
|
||||
f = tempfile.NamedTemporaryFile(prefix='warcprox-test-https-', suffix='.pem', delete=False)
|
||||
try:
|
||||
key = OpenSSL.crypto.PKey()
|
||||
key.generate_key(OpenSSL.crypto.TYPE_RSA, 2048)
|
||||
@ -73,7 +73,7 @@ class WarcproxTest(unittest.TestCase):
|
||||
f.write(OpenSSL.crypto.dump_privatekey(OpenSSL.SSL.FILETYPE_PEM, key))
|
||||
f.write(OpenSSL.crypto.dump_certificate(OpenSSL.SSL.FILETYPE_PEM, cert))
|
||||
|
||||
logging.info('generated self-signed certificate {}'.format(f.name))
|
||||
self.logger.info('generated self-signed certificate {}'.format(f.name))
|
||||
self.__cert = f.name
|
||||
finally:
|
||||
f.close()
|
||||
@ -82,7 +82,7 @@ class WarcproxTest(unittest.TestCase):
|
||||
|
||||
|
||||
def _start_http_servers(self):
|
||||
self.http_daemon = BaseHTTPServer.HTTPServer(('localhost', 0),
|
||||
self.http_daemon = BaseHTTPServer.HTTPServer(('localhost', 0),
|
||||
RequestHandlerClass=TestHttpRequestHandler)
|
||||
self.logger.info('starting http://{}:{}'.format(self.http_daemon.server_address[0], self.http_daemon.server_address[1]))
|
||||
self.http_daemon_thread = threading.Thread(name='HttpdThread',
|
||||
@ -90,7 +90,7 @@ class WarcproxTest(unittest.TestCase):
|
||||
self.http_daemon_thread.start()
|
||||
|
||||
# http://www.piware.de/2011/01/creating-an-https-server-in-python/
|
||||
self.https_daemon = BaseHTTPServer.HTTPServer(('localhost', 0),
|
||||
self.https_daemon = BaseHTTPServer.HTTPServer(('localhost', 0),
|
||||
RequestHandlerClass=TestHttpRequestHandler)
|
||||
# self.https_daemon.socket = ssl.wrap_socket(httpd.socket, certfile='path/to/localhost.pem', server_side=True)
|
||||
self.https_daemon.socket = ssl.wrap_socket(self.https_daemon.socket, certfile=self._cert, server_side=True)
|
||||
@ -101,7 +101,7 @@ class WarcproxTest(unittest.TestCase):
|
||||
|
||||
|
||||
def _start_warcprox(self):
|
||||
f = tempfile.NamedTemporaryFile(prefix='warcprox-test-', suffix='-ca.pem', delete=True)
|
||||
f = tempfile.NamedTemporaryFile(prefix='warcprox-test-ca-', suffix='.pem', delete=True)
|
||||
f.close() # delete it, or CertificateAuthority will try to read it
|
||||
self._ca_file = f.name
|
||||
self._ca_dir = tempfile.mkdtemp(prefix='warcprox-test-', suffix='-ca')
|
||||
@ -109,25 +109,25 @@ class WarcproxTest(unittest.TestCase):
|
||||
|
||||
recorded_url_q = Queue.Queue()
|
||||
|
||||
proxy = warcprox.WarcProxy(server_address=('localhost', 0), ca=ca,
|
||||
proxy = warcprox.WarcProxy(server_address=('localhost', 0), ca=ca,
|
||||
recorded_url_q=recorded_url_q)
|
||||
|
||||
self._warcs_dir = tempfile.mkdtemp(prefix='warcprox-test-', suffix='-warcs')
|
||||
self._warcs_dir = tempfile.mkdtemp(prefix='warcprox-test-warcs-')
|
||||
|
||||
f = tempfile.NamedTemporaryFile(prefix='warcprox-test-', suffix='-playback-index.db', delete=False)
|
||||
f = tempfile.NamedTemporaryFile(prefix='warcprox-test-playback-index-', suffix='.db', delete=False)
|
||||
f.close()
|
||||
self._playback_index_db_file = f.name
|
||||
playback_index_db = warcprox.PlaybackIndexDb(self._playback_index_db_file)
|
||||
playback_proxy = warcprox.PlaybackProxy(server_address=('localhost', 0), ca=ca,
|
||||
playback_proxy = warcprox.PlaybackProxy(server_address=('localhost', 0), ca=ca,
|
||||
playback_index_db=playback_index_db, warcs_dir=self._warcs_dir)
|
||||
|
||||
f = tempfile.NamedTemporaryFile(prefix='warcprox-test-', suffix='-dedup.db', delete=False)
|
||||
f = tempfile.NamedTemporaryFile(prefix='warcprox-test-dedup-', suffix='.db', delete=False)
|
||||
f.close()
|
||||
self._dedup_db_file = f.name
|
||||
dedup_db = warcprox.DedupDb(self._dedup_db_file)
|
||||
|
||||
warc_writer = warcprox.WarcWriterThread(recorded_url_q=recorded_url_q,
|
||||
directory=self._warcs_dir, port=proxy.server_port,
|
||||
directory=self._warcs_dir, port=proxy.server_port,
|
||||
dedup_db=dedup_db, playback_index_db=playback_index_db)
|
||||
|
||||
self.warcprox = warcprox.WarcproxController(proxy, warc_writer, playback_proxy)
|
||||
@ -138,8 +138,8 @@ class WarcproxTest(unittest.TestCase):
|
||||
|
||||
|
||||
def setUp(self):
|
||||
logging.basicConfig(stream=sys.stdout, level=logging.INFO,
|
||||
format='%(asctime)s %(process)d %(threadName)s %(levelname)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')
|
||||
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG,
|
||||
format='%(asctime)s %(levelname)s %(process)d %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')
|
||||
|
||||
self._start_http_servers()
|
||||
self._start_warcprox()
|
||||
@ -172,10 +172,10 @@ class WarcproxTest(unittest.TestCase):
|
||||
|
||||
for f in (self.__cert, self._ca_file, self._ca_dir, self._warcs_dir, self._playback_index_db_file, self._dedup_db_file):
|
||||
if os.path.isdir(f):
|
||||
logging.info('deleting directory {}'.format(f))
|
||||
self.logger.info('deleting directory {}'.format(f))
|
||||
shutil.rmtree(f)
|
||||
else:
|
||||
logging.info('deleting file {}'.format(f))
|
||||
self.logger.info('deleting file {}'.format(f))
|
||||
os.unlink(f)
|
||||
|
||||
|
||||
@ -203,6 +203,18 @@ class WarcproxTest(unittest.TestCase):
|
||||
self.assertEqual(response.content, 'I am the warcprox test payload! dddddddddd!\n')
|
||||
|
||||
|
||||
def poll_playback_until(self, url, status, timeout_sec):
    """
    Fetch `url` through the playback proxy until it answers with `status`.

    Warc writing is asynchronous, so a url that was just archived may not
    be playable right away. The request is retried every half second until
    a response with the wanted status code arrives or `timeout_sec` seconds
    have elapsed.

    At least one request is always made, even if `timeout_sec` is zero or
    negative, so there is always a response to return.

    Returns the last response received. The poll may have timed out, so
    callers should still assert on the status code of the result.
    """
    deadline = time.time() + timeout_sec
    while True:
        response = requests.get(url, proxies=self.playback_proxies, verify=False)
        # Check the deadline before sleeping so we never sleep just to give up.
        if response.status_code == status or time.time() >= deadline:
            return response
        time.sleep(0.5)
|
||||
|
||||
|
||||
def _test_archive_and_playback_http_url(self):
|
||||
url = 'http://localhost:{}/a/b'.format(self.http_daemon.server_port)
|
||||
|
||||
@ -217,13 +229,7 @@ class WarcproxTest(unittest.TestCase):
|
||||
self.assertEqual(response.headers['warcprox-test-header'], 'a!')
|
||||
self.assertEqual(response.content, 'I am the warcprox test payload! bbbbbbbbbb!\n')
|
||||
|
||||
# check playback (warc writing is asynchronous, give it up to 10 sec)
|
||||
for i in xrange(0,20):
|
||||
response = requests.get(url, proxies=self.playback_proxies)
|
||||
if response.status_code != 404:
|
||||
break
|
||||
time.sleep(0.5)
|
||||
|
||||
response = self.poll_playback_until(url, status=200, timeout_sec=10)
|
||||
self.assertEqual(response.status_code, 200)
|
||||
self.assertEqual(response.headers['warcprox-test-header'], 'a!')
|
||||
self.assertEqual(response.content, 'I am the warcprox test payload! bbbbbbbbbb!\n')
|
||||
@ -237,30 +243,153 @@ class WarcproxTest(unittest.TestCase):
|
||||
self.assertEqual(response.status_code, 404)
|
||||
self.assertEqual(response.content, '404 Not in Archive\n')
|
||||
|
||||
# archive
|
||||
# fetch & archive response
|
||||
response = requests.get(url, proxies=self.archiving_proxies, verify=False)
|
||||
self.assertEqual(response.status_code, 200)
|
||||
self.assertEqual(response.headers['warcprox-test-header'], 'c!')
|
||||
self.assertEqual(response.content, 'I am the warcprox test payload! dddddddddd!\n')
|
||||
|
||||
# check playback (warc writing is asynchronous, give it up to 10 sec)
|
||||
for i in xrange(0,20):
|
||||
response = requests.get(url, proxies=self.playback_proxies, verify=False)
|
||||
if response.status_code != 404:
|
||||
break
|
||||
time.sleep(0.5)
|
||||
|
||||
# test playback
|
||||
response = self.poll_playback_until(url, status=200, timeout_sec=10)
|
||||
self.assertEqual(response.status_code, 200)
|
||||
self.assertEqual(response.headers['warcprox-test-header'], 'c!')
|
||||
self.assertEqual(response.content, 'I am the warcprox test payload! dddddddddd!\n')
|
||||
|
||||
|
||||
# test dedup of same http url with same payload
|
||||
def _test_dedup_http(self):
    url = 'http://localhost:{}/e/f'.format(self.http_daemon.server_port)
    self._check_dedup(url, header_value='e!',
            payload='I am the warcprox test payload! ffffffffff!\n',
            payload_digest='sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc')


# test dedup of same https url with same payload
def _test_dedup_https(self):
    url = 'https://localhost:{}/g/h'.format(self.https_daemon.server_port)
    self._check_dedup(url, header_value='g!',
            payload='I am the warcprox test payload! hhhhhhhhhh!\n',
            payload_digest='sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89')


# shared body of the dedup tests: archive the same url twice and check the
# dedup db and playback after each fetch
def _check_dedup(self, url, header_value, payload, payload_digest):
    """
    Archive `url` twice and verify dedup db contents and playback.

    url            -- url served by one of the test http(s) daemons
    header_value   -- expected value of the 'warcprox-test-header' response
                      header for this url
    payload        -- expected response body for this url
    payload_digest -- key ('sha1:<hex>') under which the payload is looked
                      up in the dedup db
    """
    dedup_db = self.warcprox.warc_writer.dedup_db

    def check_response(response):
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.headers['warcprox-test-header'], header_value)
        self.assertEqual(response.content, payload)

    # ensure playback fails before archiving
    response = requests.get(url, proxies=self.playback_proxies, verify=False)
    self.assertEqual(response.status_code, 404)
    self.assertEqual(response.content, '404 Not in Archive\n')

    # check not in dedup db
    self.assertIsNone(dedup_db.lookup(payload_digest))

    # archive
    check_response(requests.get(url, proxies=self.archiving_proxies, verify=False))

    # test playback
    check_response(self.poll_playback_until(url, status=200, timeout_sec=10))

    # check in dedup db; an entry looks like e.g.
    # {u'i': u'<urn:uuid:e691dc0f-4bb9-4ad8-9afb-2af836aa05e4>', u'u': u'https://localhost:62841/c/d', u'd': u'2013-11-22T00:14:37Z'}
    dedup_lookup = dedup_db.lookup(payload_digest)
    self.assertEqual(dedup_lookup['u'], url)
    self.assertRegexpMatches(dedup_lookup['i'], r'^<urn:uuid:[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}>$')
    self.assertRegexpMatches(dedup_lookup['d'], r'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$')
    record_id = dedup_lookup['i']
    dedup_date = dedup_lookup['d']

    # need revisit to have a later timestamp than original, else playing
    # back the latest record might not hit the revisit
    time.sleep(1.5)

    # fetch & archive revisit
    check_response(requests.get(url, proxies=self.archiving_proxies, verify=False))

    # XXX need to give warc writer thread a chance, and we don't have any change to poll for :-\
    time.sleep(2.0)

    # check in dedup db (no change from prev)
    dedup_lookup = dedup_db.lookup(payload_digest)
    self.assertEqual(dedup_lookup['u'], url)
    self.assertEqual(dedup_lookup['i'], record_id)
    self.assertEqual(dedup_lookup['d'], dedup_date)

    # test playback
    self.logger.debug('testing playback of revisit of {}'.format(url))
    check_response(self.poll_playback_until(url, status=200, timeout_sec=10))
    # XXX how to check dedup was used?
|
||||
|
||||
|
||||
# run everything from here, otherwise it wants to setUp() and tearDown
|
||||
# around each test
|
||||
def runTest(self):
    """
    Run every sub-test, in order, from this single test method, so that
    setUp() and tearDown() are not invoked around each one.
    """
    stages = (
        self._test_httpds_no_proxy,
        self._test_archive_and_playback_http_url,
        self._test_archive_and_playback_https_url,
        self._test_dedup_http,
        self._test_dedup_https,
        # not enabled yet:
        # self._test_dedup_mixed_http,
        # self._test_dedup_mixed_https,
    )
    for stage in stages:
        stage()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
Loading…
x
Reference in New Issue
Block a user