mirror of https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
fix crawl log test to avoid any dedup collisions
This commit is contained in:
parent edcc2cc296
commit 8a768dcd44
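
A minimal sketch of the collision being avoided, assuming (as in warcprox's test suite) that the test HTTP server builds each response payload from the request path, so distinct paths yield distinct payloads. warcprox deduplicates on the base32-encoded sha1 of the payload, so if some other test had already archived a payload with the same digest, this test's fetch would be recorded as duplicate:digest instead of a fresh capture and the assertions below would fail. Renaming the paths to /b/aa and /b/bb gives them payloads no other test uses.

import base64
import hashlib

def payload_digest(payload: bytes) -> str:
    # warcprox records payload digests in the form 'sha1:<base32>'
    return 'sha1:' + base64.b32encode(hashlib.sha1(payload).digest()).decode('ascii')

# identical payloads collide in the dedup index regardless of URL;
# unique paths guarantee unique payloads and therefore unique digests
assert payload_digest(b'same payload') == payload_digest(b'same payload')
assert payload_digest(b'/b/aa payload') != payload_digest(b'/b/bb payload')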
@@ -1453,11 +1453,11 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
     except:
         pass
 
-    url = 'http://localhost:%s/b/d' % http_daemon.server_port
+    url = 'http://localhost:%s/b/aa' % http_daemon.server_port
     response = requests.get(url, proxies=archiving_proxies)
     assert response.status_code == 200
 
-    url = 'http://localhost:%s/b/e' % http_daemon.server_port
+    url = 'http://localhost:%s/b/bb' % http_daemon.server_port
     headers = {
         "Warcprox-Meta": json.dumps({"warc-prefix":"test_crawl_log_1"}),
         "Referer": "http://example.com/referer",
@@ -1474,47 +1474,47 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
 
     crawl_log = open(os.path.join(
         warcprox_.options.crawl_log_dir, 'crawl.log'), 'rb').read()
     crawl_log_1 = open(os.path.join(
         warcprox_.options.crawl_log_dir, 'test_crawl_log_1.log'), 'rb').read()
     # tests will fail in year 3000 :)
     assert re.match(b'\A2[^\n]+\n\Z', crawl_log)
     assert crawl_log[24:31] == b'   200 '
-    assert crawl_log[31:42] == b'        44 '
+    assert crawl_log[31:42] == b'        54 '
     fields = crawl_log.split()
     assert len(fields) == 13
-    assert fields[3].endswith(b'/b/d')
+    assert fields[3].endswith(b'/b/aa')
     assert fields[4] == b'-'
     assert fields[5] == b'-'
     assert fields[6] == b'text/plain'
     assert fields[7] == b'-'
     assert re.match(br'^\d{17}[+]\d{3}', fields[8])
-    assert fields[9] == b'sha1:NKW7OKGZHXIMRKILQGOB2EB22U2MXJLM'
+    assert fields[9] == b'sha1:NHKRURXEJICOQEINUDERRF6OZ2LZ7JYP'
     assert fields[10] == b'-'
     assert fields[11] == b'-'
     extra_info = json.loads(fields[12].decode('utf-8'))
     assert extra_info.keys() == {'contentSize','warcFilename','warcFileOffset'}
-    assert extra_info['contentSize'] == 135
+    assert extra_info['contentSize'] == 145
 
     crawl_log_1 = open(os.path.join(
         warcprox_.options.crawl_log_dir, 'test_crawl_log_1.log'), 'rb').read()
     assert re.match(b'\A2[^\n]+\n\Z', crawl_log_1)
     assert crawl_log_1[24:31] == b'   200 '
-    assert crawl_log_1[31:42] == b'        44 '
+    assert crawl_log_1[31:42] == b'        54 '
     fields = crawl_log_1.split()
     assert len(fields) == 13
-    assert fields[3].endswith(b'/b/e')
+    assert fields[3].endswith(b'/b/bb')
     assert fields[4] == b'-'
     assert fields[5] == b'http://example.com/referer'
     assert fields[6] == b'text/plain'
     assert fields[7] == b'-'
     assert re.match(br'^\d{17}[+]\d{3}', fields[8])
-    assert fields[9] == b'sha1:DJURQDWPRKWTNMHDA6YS2KN2RLTWQ4JJ'
+    assert fields[9] == b'sha1:TKXGVS3ZPR24VDVV3XWZXYQSPTDBWP53'
     assert fields[10] == b'-'
     assert fields[11] == b'-'
     extra_info = json.loads(fields[12].decode('utf-8'))
     assert extra_info.keys() == {'contentSize','warcFilename','warcFileOffset'}
-    assert extra_info['contentSize'] == 135
+    assert extra_info['contentSize'] == 145
 
     # should be deduplicated
-    url = 'http://localhost:%s/b/d' % http_daemon.server_port
+    url = 'http://localhost:%s/b/aa' % http_daemon.server_port
     headers = {"Warcprox-Meta": json.dumps({
         "warc-prefix": "test_crawl_log_2",
         "metadata": {"seed": "http://example.com/seed"}})}
@@ -1533,21 +1533,21 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
 
     assert re.match(b'\A2[^\n]+\n\Z', crawl_log_2)
     assert crawl_log_2[24:31] == b'   200 '
-    assert crawl_log_2[31:42] == b'        44 '
+    assert crawl_log_2[31:42] == b'        54 '
    fields = crawl_log_2.split()
     assert len(fields) == 13
-    assert fields[3].endswith(b'/b/d')
+    assert fields[3].endswith(b'/b/aa')
     assert fields[4] == b'-'
     assert fields[5] == b'-'
     assert fields[6] == b'text/plain'
     assert fields[7] == b'-'
     assert re.match(br'^\d{17}[+]\d{3}', fields[8])
-    assert fields[9] == b'sha1:NKW7OKGZHXIMRKILQGOB2EB22U2MXJLM'
+    assert fields[9] == b'sha1:NHKRURXEJICOQEINUDERRF6OZ2LZ7JYP'
     assert fields[10] == b'http://example.com/seed'
     assert fields[11] == b'duplicate:digest'
     extra_info = json.loads(fields[12].decode('utf-8'))
     assert extra_info.keys() == {'contentSize','warcFilename','warcFileOffset'}
-    assert extra_info['contentSize'] == 135
+    assert extra_info['contentSize'] == 145
 
 if __name__ == '__main__':
     pytest.main()
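
For context, a rough sketch of the crawl log line these assertions pick apart. The layout follows the Heritrix crawl.log convention: timestamp, status, size, URL, hop path, referer, mimetype, worker thread, fetch timestamp plus duration, payload digest, source seed, annotations, and a trailing JSON blob. The sample line below is fabricated for illustration; only its shape matches what warcprox writes.

import json
import re

# fabricated sample entry; the fixed-width timestamp/status/size columns
# explain the byte-slice assertions, and split() yields the 13 fields
sample = (
    b'2017-09-01T00:00:00.000Z   200         54 '
    b'http://localhost:8000/b/aa - - text/plain #001 '
    b'20170901000000000+123 sha1:NHKRURXEJICOQEINUDERRF6OZ2LZ7JYP - - '
    b'{"contentSize":145,"warcFilename":"test.warc.gz","warcFileOffset":1234}\n')

assert sample[24:31] == b'   200 '        # fixed-width HTTP status column
assert sample[31:42] == b'        54 '    # fixed-width payload size column
fields = sample.split()
assert len(fields) == 13
assert fields[3].endswith(b'/b/aa')                 # fetched URL
assert re.match(br'^\d{17}[+]\d{3}', fields[8])     # fetch timestamp + ms duration
extra_info = json.loads(fields[12].decode('utf-8'))
assert extra_info['contentSize'] == 145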