Rewrite test_dedup_min_size() to account for the fact that we always save a
record to the big captures table.

This is done partly by adding a new check that --dedup-min-*-size is respected
even if there is already an entry in the dedup db for the sha1.
This commit is contained in:
Noah Levitt 2018-05-16 10:52:04 -07:00
parent e23af32e94
commit 5f0c46d579

View File

@ -1940,7 +1940,7 @@ def test_trough_segment_promotion(warcprox_):
time.sleep(3) time.sleep(3)
assert promoted == [] assert promoted == []
def test_dedup_min_size(http_daemon, warcprox_, archiving_proxies, playback_proxies): def test_dedup_min_text_size(http_daemon, warcprox_, archiving_proxies):
"""We use options --dedup-min-text-size=3 --dedup-min-binary-size=5 and we """We use options --dedup-min-text-size=3 --dedup-min-binary-size=5 and we
try to download content smaller than these limits to make sure that it is try to download content smaller than these limits to make sure that it is
not deduplicated. We create the digest_str with the following code: not deduplicated. We create the digest_str with the following code:
@ -1950,36 +1950,155 @@ def test_dedup_min_size(http_daemon, warcprox_, archiving_proxies, playback_prox
warcprox.digest_str(payload_digest) warcprox.digest_str(payload_digest)
``` ```
""" """
urls_before = warcprox_.proxy.running_stats.urls
# start a fresh warc
warcprox_.warc_writer_processor.writer_pool.close_writers()
# fetch small text
url = 'http://localhost:%s/text-2bytes' % http_daemon.server_port url = 'http://localhost:%s/text-2bytes' % http_daemon.server_port
response = requests.get( response = requests.get(
url, proxies=archiving_proxies, verify=False, timeout=10) url, proxies=archiving_proxies, verify=False, timeout=10)
assert len(response.content) == 2 assert len(response.content) == 2
# wait for postfetch chain
wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1)
# check no dedup was saved (except RethinkCapturesDedup which always saves)
dedup_lookup = warcprox_.dedup_db.lookup( dedup_lookup = warcprox_.dedup_db.lookup(
b'sha1:e0c9035898dd52fc65c41454cec9c4d2611bfb37') b'sha1:e0c9035898dd52fc65c41454cec9c4d2611bfb37')
assert dedup_lookup is None if not isinstance(warcprox_.dedup_db, warcprox.bigtable.RethinkCapturesDedup):
time.sleep(3) assert dedup_lookup is None
# fetch again saving dedup info so that we can test dedup info ignored
orig_should_dedup = warcprox_.dedup_db.should_dedup
warcprox_.dedup_db.should_dedup = lambda *args, **kwargs: True
try:
response = requests.get(
url, proxies=archiving_proxies, verify=False, timeout=10)
assert len(response.content) == 2
wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 2)
# check dedup was saved
dedup_lookup = warcprox_.dedup_db.lookup(
b'sha1:e0c9035898dd52fc65c41454cec9c4d2611bfb37')
assert dedup_lookup
finally:
warcprox_.dedup_db.should_dedup = orig_should_dedup
else:
assert dedup_lookup
# fetch again and check that it was not deduped
urls_before = warcprox_.proxy.running_stats.urls
response = requests.get( response = requests.get(
url, proxies=archiving_proxies, verify=False, timeout=10) url, proxies=archiving_proxies, verify=False, timeout=10)
dedup_lookup = warcprox_.dedup_db.lookup( assert len(response.content) == 2
b'sha1:e0c9035898dd52fc65c41454cec9c4d2611bfb37') wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1)
# This would return dedup data if payload_size > dedup-min-text-size
assert dedup_lookup is None
# check that response records were written
warc = warcprox_.warc_writer_processor.writer_pool.default_warc_writer._available_warcs.queue[0].path
with open(warc, 'rb') as f:
rec_iter = iter(warcio.archiveiterator.ArchiveIterator(f))
record = next(rec_iter)
assert record.rec_type == 'warcinfo'
record = next(rec_iter)
assert record.rec_type == 'response'
assert record.rec_headers.get_header('warc-target-uri') == url
record = next(rec_iter)
assert record.rec_type == 'request'
assert record.rec_headers.get_header('warc-target-uri') == url
record = next(rec_iter)
assert record.rec_type == 'response'
assert record.rec_headers.get_header('warc-target-uri') == url
record = next(rec_iter)
assert record.rec_type == 'request'
assert record.rec_headers.get_header('warc-target-uri') == url
if not isinstance(warcprox_.dedup_db, warcprox.bigtable.RethinkCapturesDedup):
record = next(rec_iter)
assert record.rec_type == 'response'
assert record.rec_headers.get_header('warc-target-uri') == url
record = next(rec_iter)
assert record.rec_type == 'request'
assert record.rec_headers.get_header('warc-target-uri') == url
with pytest.raises(StopIteration):
next(rec_iter)
def test_dedup_min_binary_size(http_daemon, warcprox_, archiving_proxies):
"""We use options --dedup-min-text-size=3 --dedup-min-binary-size=5 and we
try to download content smaller than these limits to make sure that it is
not deduplicated. We create the digest_str with the following code:
```
payload_digest = hashlib.new('sha1')
payload_digest.update(b'aa')
warcprox.digest_str(payload_digest)
```
"""
urls_before = warcprox_.proxy.running_stats.urls
# start a fresh warc
warcprox_.warc_writer_processor.writer_pool.close_writers()
# fetch small binary
url = 'http://localhost:%s/binary-4bytes' % http_daemon.server_port url = 'http://localhost:%s/binary-4bytes' % http_daemon.server_port
response = requests.get( response = requests.get(
url, proxies=archiving_proxies, verify=False, timeout=10) url, proxies=archiving_proxies, verify=False, timeout=10)
assert len(response.content) == 4 assert len(response.content) == 4
# wait for postfetch chain
wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1)
# check no dedup was saved (except RethinkCapturesDedup which always saves)
dedup_lookup = warcprox_.dedup_db.lookup( dedup_lookup = warcprox_.dedup_db.lookup(
b'sha1:70c881d4a26984ddce795f6f71817c9cf4480e79') b'sha1:70c881d4a26984ddce795f6f71817c9cf4480e79')
assert dedup_lookup is None if not isinstance(warcprox_.dedup_db, warcprox.bigtable.RethinkCapturesDedup):
time.sleep(3) assert dedup_lookup is None
# fetch again saving dedup info so that we can test dedup info ignored
orig_should_dedup = warcprox_.dedup_db.should_dedup
warcprox_.dedup_db.should_dedup = lambda *args, **kwargs: True
try:
response = requests.get(
url, proxies=archiving_proxies, verify=False, timeout=10)
assert len(response.content) == 4
wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 2)
# check dedup was saved
dedup_lookup = warcprox_.dedup_db.lookup(
b'sha1:70c881d4a26984ddce795f6f71817c9cf4480e79')
assert dedup_lookup
finally:
warcprox_.dedup_db.should_dedup = orig_should_dedup
else:
assert dedup_lookup
# fetch again and check that it was not deduped
urls_before = warcprox_.proxy.running_stats.urls
response = requests.get( response = requests.get(
url, proxies=archiving_proxies, verify=False, timeout=10) url, proxies=archiving_proxies, verify=False, timeout=10)
dedup_lookup = warcprox_.dedup_db.lookup( assert len(response.content) == 4
b'sha1:70c881d4a26984ddce795f6f71817c9cf4480e79') wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1)
# This would return dedup data if payload_size > dedup-min-binary-size
assert dedup_lookup is None
# check that response records were written
warc = warcprox_.warc_writer_processor.writer_pool.default_warc_writer._available_warcs.queue[0].path
with open(warc, 'rb') as f:
rec_iter = iter(warcio.archiveiterator.ArchiveIterator(f))
record = next(rec_iter)
assert record.rec_type == 'warcinfo'
record = next(rec_iter)
assert record.rec_type == 'response'
assert record.rec_headers.get_header('warc-target-uri') == url
record = next(rec_iter)
assert record.rec_type == 'request'
assert record.rec_headers.get_header('warc-target-uri') == url
record = next(rec_iter)
assert record.rec_type == 'response'
assert record.rec_headers.get_header('warc-target-uri') == url
record = next(rec_iter)
assert record.rec_type == 'request'
assert record.rec_headers.get_header('warc-target-uri') == url
if not isinstance(warcprox_.dedup_db, warcprox.bigtable.RethinkCapturesDedup):
record = next(rec_iter)
assert record.rec_type == 'response'
assert record.rec_headers.get_header('warc-target-uri') == url
record = next(rec_iter)
assert record.rec_type == 'request'
assert record.rec_headers.get_header('warc-target-uri') == url
with pytest.raises(StopIteration):
next(rec_iter)
if __name__ == '__main__': if __name__ == '__main__':
pytest.main() pytest.main()