From 8a768dcd44bd0bb6fa51d1a57804b352ae3d2cfc Mon Sep 17 00:00:00 2001
From: Noah Levitt <nlevitt@archive.org>
Date: Mon, 7 Aug 2017 14:06:53 -0700
Subject: [PATCH] fix crawl log test to avoid any dedup collisions

---
 tests/test_warcprox.py | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py
index 63c96d2..19636d6 100755
--- a/tests/test_warcprox.py
+++ b/tests/test_warcprox.py
@@ -1453,11 +1453,11 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
     except:
         pass
 
-    url = 'http://localhost:%s/b/d' % http_daemon.server_port
+    url = 'http://localhost:%s/b/aa' % http_daemon.server_port
     response = requests.get(url, proxies=archiving_proxies)
     assert response.status_code == 200
 
-    url = 'http://localhost:%s/b/e' % http_daemon.server_port
+    url = 'http://localhost:%s/b/bb' % http_daemon.server_port
     headers = {
         "Warcprox-Meta": json.dumps({"warc-prefix":"test_crawl_log_1"}),
         "Referer": "http://example.com/referer",
@@ -1474,47 +1474,47 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
 
     crawl_log = open(os.path.join(
         warcprox_.options.crawl_log_dir, 'crawl.log'), 'rb').read()
-    crawl_log_1 = open(os.path.join(
-        warcprox_.options.crawl_log_dir, 'test_crawl_log_1.log'), 'rb').read()
     # tests will fail in year 3000 :)
     assert re.match(b'\A2[^\n]+\n\Z', crawl_log)
     assert crawl_log[24:31] == b'   200 '
-    assert crawl_log[31:42] == b'        44 '
+    assert crawl_log[31:42] == b'        54 '
     fields = crawl_log.split()
     assert len(fields) == 13
-    assert fields[3].endswith(b'/b/d')
+    assert fields[3].endswith(b'/b/aa')
     assert fields[4] == b'-'
     assert fields[5] == b'-'
     assert fields[6] == b'text/plain'
     assert fields[7] == b'-'
     assert re.match(br'^\d{17}[+]\d{3}', fields[8])
-    assert fields[9] == b'sha1:NKW7OKGZHXIMRKILQGOB2EB22U2MXJLM'
+    assert fields[9] == b'sha1:NHKRURXEJICOQEINUDERRF6OZ2LZ7JYP'
     assert fields[10] == b'-'
     assert fields[11] == b'-'
     extra_info = json.loads(fields[12].decode('utf-8'))
     assert extra_info.keys() == {'contentSize','warcFilename','warcFileOffset'}
-    assert extra_info['contentSize'] == 135
+    assert extra_info['contentSize'] == 145
 
+    crawl_log_1 = open(os.path.join(
+        warcprox_.options.crawl_log_dir, 'test_crawl_log_1.log'), 'rb').read()
     assert re.match(b'\A2[^\n]+\n\Z', crawl_log_1)
     assert crawl_log_1[24:31] == b'   200 '
-    assert crawl_log_1[31:42] == b'        44 '
+    assert crawl_log_1[31:42] == b'        54 '
     fields = crawl_log_1.split()
     assert len(fields) == 13
-    assert fields[3].endswith(b'/b/e')
+    assert fields[3].endswith(b'/b/bb')
     assert fields[4] == b'-'
     assert fields[5] == b'http://example.com/referer'
     assert fields[6] == b'text/plain'
     assert fields[7] == b'-'
     assert re.match(br'^\d{17}[+]\d{3}', fields[8])
-    assert fields[9] == b'sha1:DJURQDWPRKWTNMHDA6YS2KN2RLTWQ4JJ'
+    assert fields[9] == b'sha1:TKXGVS3ZPR24VDVV3XWZXYQSPTDBWP53'
     assert fields[10] == b'-'
     assert fields[11] == b'-'
     extra_info = json.loads(fields[12].decode('utf-8'))
     assert extra_info.keys() == {'contentSize','warcFilename','warcFileOffset'}
-    assert extra_info['contentSize'] == 135
+    assert extra_info['contentSize'] == 145
 
     # should be deduplicated
-    url = 'http://localhost:%s/b/d' % http_daemon.server_port
+    url = 'http://localhost:%s/b/aa' % http_daemon.server_port
     headers = {"Warcprox-Meta": json.dumps({
         "warc-prefix": "test_crawl_log_2",
         "metadata": {"seed": "http://example.com/seed"}})}
@@ -1533,21 +1533,21 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
 
     assert re.match(b'\A2[^\n]+\n\Z', crawl_log_2)
     assert crawl_log_2[24:31] == b'   200 '
-    assert crawl_log_2[31:42] == b'        44 '
+    assert crawl_log_2[31:42] == b'        54 '
     fields = crawl_log_2.split()
     assert len(fields) == 13
-    assert fields[3].endswith(b'/b/d')
+    assert fields[3].endswith(b'/b/aa')
     assert fields[4] == b'-'
     assert fields[5] == b'-'
     assert fields[6] == b'text/plain'
     assert fields[7] == b'-'
     assert re.match(br'^\d{17}[+]\d{3}', fields[8])
-    assert fields[9] == b'sha1:NKW7OKGZHXIMRKILQGOB2EB22U2MXJLM'
+    assert fields[9] == b'sha1:NHKRURXEJICOQEINUDERRF6OZ2LZ7JYP'
     assert fields[10] == b'http://example.com/seed'
     assert fields[11] == b'duplicate:digest'
     extra_info = json.loads(fields[12].decode('utf-8'))
     assert extra_info.keys() == {'contentSize','warcFilename','warcFileOffset'}
-    assert extra_info['contentSize'] == 135
+    assert extra_info['contentSize'] == 145
 
 if __name__ == '__main__':
     pytest.main()
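For reference, the assertions in this patch treat each crawl log entry as one line of 13 whitespace-separated fields, the last of which is a JSON blob with contentSize, warcFilename, and warcFileOffset. Below is a minimal sketch of unpacking such a line; the field names are assumptions inferred from the assertions and from Heritrix's crawl.log conventions, which warcprox's crawl log mimics, not names warcprox itself defines.

import json

def parse_crawl_log_line(line):
    # Split one crawl log entry into the 13 fields the test asserts on.
    fields = line.split()
    assert len(fields) == 13, 'expected 13 whitespace-separated fields'
    return {
        'timestamp': fields[0],        # log entry time; starts with the year
        'status': fields[1],           # e.g. b'200'
        'size': fields[2],             # payload size column, e.g. b'54'
        'url': fields[3],              # e.g. ends with b'/b/aa'
        'hop_path': fields[4],         # b'-' in these tests
        'via': fields[5],              # the Referer, when one was sent
        'mimetype': fields[6],         # e.g. b'text/plain'
        'thread': fields[7],           # b'-' in these tests
        'fetch_timestamp': fields[8],  # 17 digits, then '+' and 3 more
        'digest': fields[9],           # e.g. b'sha1:NHKRURXEJICOQEINUDERRF6OZ2LZ7JYP'
        'source': fields[10],          # seed from warcprox-meta metadata
        'annotations': fields[11],     # b'duplicate:digest' on the dedup hit
        'extra_info': json.loads(fields[12].decode('utf-8')),
    }

Note that the expected contentSize grows by the same 10 bytes as the size column (135 to 145 alongside 44 to 54), consistent with it counting the payload plus response headers of constant length.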
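The expected digest values swapped in this patch ("sha1:" followed by 32 base32 characters) have the shape of Heritrix-style payload digests: the SHA-1 of the response payload, base32-encoded. Here is a minimal sketch of recomputing a value of that shape; the payload bytes below are a placeholder, not the test http daemon's actual responses for /b/aa or /b/bb, so it will not reproduce the digests above.

import base64
import hashlib

def payload_digest_str(payload):
    # "sha1:" plus the base32-encoded SHA-1 of the payload bytes --
    # the shape of the expected fields[9] values in the test.
    return 'sha1:' + base64.b32encode(
            hashlib.sha1(payload).digest()).decode('ascii')

# Placeholder payload: the real bytes come from the test http daemon,
# so this prints a digest that will not match the patch's values.
print(payload_digest_str(b'placeholder payload\n'))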