Mirror of https://github.com/internetarchive/warcprox.git, synced 2025-01-18 13:22:09 +01:00
fix crawl log test

commit edcc2cc296 (parent ecb07fc9cd)

As the diff below shows, the test had been issuing the second,
to-be-deduplicated request up front while its first wait loop polled for
test_crawl_log_2.log instead of test_crawl_log_1.log. This change moves the
second fetch, its wait loop, and the read of test_crawl_log_2.log to after
the assertions on the first crawl log, and updates the expected annotation
field for the deduplicated record from b'-' to b'duplicate:digest'.
@@ -1465,18 +1465,10 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
     response = requests.get(url, proxies=archiving_proxies, headers=headers)
     assert response.status_code == 200
 
-    # should be deduplicated
-    url = 'http://localhost:%s/b/d' % http_daemon.server_port
-    headers = {"Warcprox-Meta": json.dumps({
-        "warc-prefix": "test_crawl_log_2",
-        "metadata": {"seed": "http://example.com/seed"}})}
-    response = requests.get(url, proxies=archiving_proxies, headers=headers)
-    assert response.status_code == 200
-
     start = time.time()
     while time.time() - start < 10:
         if os.path.exists(os.path.join(
-                warcprox_.options.crawl_log_dir, 'test_crawl_log_2.log')):
+                warcprox_.options.crawl_log_dir, 'test_crawl_log_1.log')):
             break
         time.sleep(0.5)
 
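The fixed wait loop polls for test_crawl_log_1.log, the file this first
request actually produces; previously it waited on test_crawl_log_2.log,
whose request now happens later in the test. The same polling pattern,
factored into a reusable helper as a minimal sketch (wait_for_file is a
hypothetical name, not part of warcprox's test suite):

import os
import time

def wait_for_file(path, timeout=10, interval=0.5):
    # Poll until `path` exists or `timeout` seconds elapse.
    # Returns True if the file appeared, False on timeout.
    start = time.time()
    while time.time() - start < timeout:
        if os.path.exists(path):
            return True
        time.sleep(interval)
    return False

With such a helper, each wait in the test would reduce to one call, e.g.
wait_for_file(os.path.join(warcprox_.options.crawl_log_dir, 'test_crawl_log_1.log')).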
@@ -1484,9 +1476,6 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
         warcprox_.options.crawl_log_dir, 'crawl.log'), 'rb').read()
     crawl_log_1 = open(os.path.join(
         warcprox_.options.crawl_log_dir, 'test_crawl_log_1.log'), 'rb').read()
-    crawl_log_2 = open(os.path.join(
-        warcprox_.options.crawl_log_dir, 'test_crawl_log_2.log'), 'rb').read()
-
     # tests will fail in year 3000 :)
     assert re.match(b'\A2[^\n]+\n\Z', crawl_log)
     assert crawl_log[24:31] == b' 200 '
@@ -1524,6 +1513,24 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
     assert extra_info.keys() == {'contentSize','warcFilename','warcFileOffset'}
     assert extra_info['contentSize'] == 135
 
+    # should be deduplicated
+    url = 'http://localhost:%s/b/d' % http_daemon.server_port
+    headers = {"Warcprox-Meta": json.dumps({
+        "warc-prefix": "test_crawl_log_2",
+        "metadata": {"seed": "http://example.com/seed"}})}
+    response = requests.get(url, proxies=archiving_proxies, headers=headers)
+    assert response.status_code == 200
+
+    start = time.time()
+    while time.time() - start < 10:
+        if os.path.exists(os.path.join(
+                warcprox_.options.crawl_log_dir, 'test_crawl_log_2.log')):
+            break
+        time.sleep(0.5)
+
+    crawl_log_2 = open(os.path.join(
+        warcprox_.options.crawl_log_dir, 'test_crawl_log_2.log'), 'rb').read()
+
     assert re.match(b'\A2[^\n]+\n\Z', crawl_log_2)
     assert crawl_log_2[24:31] == b' 200 '
     assert crawl_log_2[31:42] == b' 44 '
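The re-added block fetches the same /b/d payload a second time under a
fresh warc-prefix, so warcprox should recognize the body by its payload
digest and log the capture as a duplicate. An illustrative toy sketch of
digest-based deduplication, assuming the base32-encoded sha1 digest format
that appears in the assertions below (this is not warcprox's actual dedup
implementation):

import base64
import hashlib

seen = {}  # payload digest -> url of first capture

def dedup_annotation(payload, url):
    # warcprox-style digest string: sha1, base32-encoded (32 chars for 160 bits)
    digest = 'sha1:' + base64.b32encode(hashlib.sha1(payload).digest()).decode('ascii')
    if digest in seen:
        return 'duplicate:digest'  # payload already stored; record a revisit
    seen[digest] = url             # novel payload; remember it
    return '-'

assert dedup_annotation(b'example payload', 'http://localhost/b/d') == '-'
assert dedup_annotation(b'example payload', 'http://localhost/b/d') == 'duplicate:digest'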
@@ -1537,7 +1544,7 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
     assert re.match(br'^\d{17}[+]\d{3}', fields[8])
     assert fields[9] == b'sha1:NKW7OKGZHXIMRKILQGOB2EB22U2MXJLM'
     assert fields[10] == b'http://example.com/seed'
-    assert fields[11] == b'-'
+    assert fields[11] == b'duplicate:digest'
     extra_info = json.loads(fields[12].decode('utf-8'))
     assert extra_info.keys() == {'contentSize','warcFilename','warcFileOffset'}
     assert extra_info['contentSize'] == 135
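Taken together, the assertions imply that each crawl log entry is one line
of whitespace-separated fields with a compact JSON blob last. A hedged
parsing sketch covering only the indices this test exercises (the field
names below are inferred from the assertions; parse_crawl_log_fields is a
hypothetical helper, not part of warcprox):

import json

def parse_crawl_log_fields(line):
    # line: one crawl log entry as bytes, as read from the .log file
    fields = line.split()
    return {
        'fetch_timestamp': fields[8],   # 17 digits, '+', duration in milliseconds
        'digest': fields[9],            # e.g. b'sha1:NKW7OKGZHXIMRKILQGOB2EB22U2MXJLM'
        'seed': fields[10],             # from the Warcprox-Meta metadata
        'annotations': fields[11],      # b'-' when novel, b'duplicate:digest' when deduplicated
        'extra_info': json.loads(fields[12].decode('utf-8')),
    }

fields[12] survives split() only because the JSON is evidently written
without embedded whitespace; the test's own json.loads(fields[12].decode('utf-8'))
relies on the same property.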