rewrite test_dedup_min_size() to account for the fact that we always
save a record to the big captures table, partly by adding a new check
that --dedup-min-*-size is respected even if there is an entry in the
dedup db for the sha1
Noah Levitt 2018-05-16 10:52:04 -07:00
parent e23af32e94
commit 5f0c46d579
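The rule being tested: payloads smaller than --dedup-min-text-size (for text) or --dedup-min-binary-size (for binary) must not be deduplicated, even when the dedup db already holds an entry for their sha1; the RethinkDB "big captures" table nevertheless records every capture, which is what the rewritten tests now account for. A minimal sketch of the size rule under the tests' settings of 3 and 5 bytes; the function name and parameters below are hypothetical, not warcprox's actual should_dedup() signature (the tests monkey-patch that method with a bare lambda):

```
# Illustrative sketch only -- not warcprox's implementation or signature.
# min_text_size / min_binary_size stand in for --dedup-min-text-size /
# --dedup-min-binary-size (the tests run with 3 and 5 respectively).
def size_allows_dedup(payload_size, is_text, min_text_size=3, min_binary_size=5):
    threshold = min_text_size if is_text else min_binary_size
    # the tests' comments phrase the rule as "payload_size > dedup-min-*-size"
    return payload_size > threshold

assert not size_allows_dedup(2, is_text=True)    # 2-byte text body: not deduplicated
assert not size_allows_dedup(4, is_text=False)   # 4-byte binary body: not deduplicated
```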


@@ -1940,7 +1940,7 @@ def test_trough_segment_promotion(warcprox_):
    time.sleep(3)
    assert promoted == []
def test_dedup_min_size(http_daemon, warcprox_, archiving_proxies, playback_proxies):
def test_dedup_min_text_size(http_daemon, warcprox_, archiving_proxies):
"""We use options --dedup-min-text-size=3 --dedup-min-binary-size=5 and we
try to download content smaller than these limits to make sure that it is
not deduplicated. We create the digest_str with the following code:
@ -1950,36 +1950,155 @@ def test_dedup_min_size(http_daemon, warcprox_, archiving_proxies, playback_prox
warcprox.digest_str(payload_digest)
```
"""
    urls_before = warcprox_.proxy.running_stats.urls
    # start a fresh warc
    warcprox_.warc_writer_processor.writer_pool.close_writers()
    # fetch small text
    url = 'http://localhost:%s/text-2bytes' % http_daemon.server_port
    response = requests.get(
            url, proxies=archiving_proxies, verify=False, timeout=10)
    assert len(response.content) == 2
    # wait for postfetch chain
    wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1)
    # check no dedup was saved (except RethinkCapturesDedup which always saves)
    dedup_lookup = warcprox_.dedup_db.lookup(
            b'sha1:e0c9035898dd52fc65c41454cec9c4d2611bfb37')
    assert dedup_lookup is None
    time.sleep(3)
    if not isinstance(warcprox_.dedup_db, warcprox.bigtable.RethinkCapturesDedup):
        assert dedup_lookup is None
        # fetch again saving dedup info so that we can test dedup info ignored
        orig_should_dedup = warcprox_.dedup_db.should_dedup
        warcprox_.dedup_db.should_dedup = lambda *args, **kwargs: True
        try:
            response = requests.get(
                    url, proxies=archiving_proxies, verify=False, timeout=10)
            assert len(response.content) == 2
            wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 2)
            # check dedup was saved
            dedup_lookup = warcprox_.dedup_db.lookup(
                    b'sha1:e0c9035898dd52fc65c41454cec9c4d2611bfb37')
            assert dedup_lookup
        finally:
            warcprox_.dedup_db.should_dedup = orig_should_dedup
    else:
        assert dedup_lookup
    # fetch again and check that it was not deduped
    urls_before = warcprox_.proxy.running_stats.urls
    response = requests.get(
            url, proxies=archiving_proxies, verify=False, timeout=10)
    dedup_lookup = warcprox_.dedup_db.lookup(
            b'sha1:e0c9035898dd52fc65c41454cec9c4d2611bfb37')
    # This would return dedup data if payload_size > dedup-min-text-size
    assert dedup_lookup is None
    assert len(response.content) == 2
    wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1)
    # check that response records were written
    warc = warcprox_.warc_writer_processor.writer_pool.default_warc_writer._available_warcs.queue[0].path
    with open(warc, 'rb') as f:
        rec_iter = iter(warcio.archiveiterator.ArchiveIterator(f))
        record = next(rec_iter)
        assert record.rec_type == 'warcinfo'
        record = next(rec_iter)
        assert record.rec_type == 'response'
        assert record.rec_headers.get_header('warc-target-uri') == url
        record = next(rec_iter)
        assert record.rec_type == 'request'
        assert record.rec_headers.get_header('warc-target-uri') == url
        record = next(rec_iter)
        assert record.rec_type == 'response'
        assert record.rec_headers.get_header('warc-target-uri') == url
        record = next(rec_iter)
        assert record.rec_type == 'request'
        assert record.rec_headers.get_header('warc-target-uri') == url
        if not isinstance(warcprox_.dedup_db, warcprox.bigtable.RethinkCapturesDedup):
            record = next(rec_iter)
            assert record.rec_type == 'response'
            assert record.rec_headers.get_header('warc-target-uri') == url
            record = next(rec_iter)
            assert record.rec_type == 'request'
            assert record.rec_headers.get_header('warc-target-uri') == url
        with pytest.raises(StopIteration):
            next(rec_iter)

def test_dedup_min_binary_size(http_daemon, warcprox_, archiving_proxies):
"""We use options --dedup-min-text-size=3 --dedup-min-binary-size=5 and we
try to download content smaller than these limits to make sure that it is
not deduplicated. We create the digest_str with the following code:
```
payload_digest = hashlib.new('sha1')
payload_digest.update(b'aa')
warcprox.digest_str(payload_digest)
```
"""
urls_before = warcprox_.proxy.running_stats.urls
# start a fresh warc
warcprox_.warc_writer_processor.writer_pool.close_writers()
# fetch small binary
url = 'http://localhost:%s/binary-4bytes' % http_daemon.server_port
response = requests.get(
url, proxies=archiving_proxies, verify=False, timeout=10)
assert len(response.content) == 4
# wait for postfetch chain
wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1)
# check no dedup was saved (except RethinkCapturesDedup which always saves)
dedup_lookup = warcprox_.dedup_db.lookup(
b'sha1:70c881d4a26984ddce795f6f71817c9cf4480e79')
assert dedup_lookup is None
time.sleep(3)
if not isinstance(warcprox_.dedup_db, warcprox.bigtable.RethinkCapturesDedup):
assert dedup_lookup is None
# fetch again saving dedup info so that we can test dedup info ignored
orig_should_dedup = warcprox_.dedup_db.should_dedup
warcprox_.dedup_db.should_dedup = lambda *args, **kwargs: True
try:
response = requests.get(
url, proxies=archiving_proxies, verify=False, timeout=10)
assert len(response.content) == 4
wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 2)
# check dedup was saved
dedup_lookup = warcprox_.dedup_db.lookup(
b'sha1:70c881d4a26984ddce795f6f71817c9cf4480e79')
assert dedup_lookup
finally:
warcprox_.dedup_db.should_dedup = orig_should_dedup
else:
assert dedup_lookup
# fetch again and check that it was not deduped
urls_before = warcprox_.proxy.running_stats.urls
response = requests.get(
url, proxies=archiving_proxies, verify=False, timeout=10)
dedup_lookup = warcprox_.dedup_db.lookup(
b'sha1:70c881d4a26984ddce795f6f71817c9cf4480e79')
# This would return dedup data if payload_size > dedup-min-binary-size
assert dedup_lookup is None
assert len(response.content) == 4
wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1)
# check that response records were written
warc = warcprox_.warc_writer_processor.writer_pool.default_warc_writer._available_warcs.queue[0].path
with open(warc, 'rb') as f:
rec_iter = iter(warcio.archiveiterator.ArchiveIterator(f))
record = next(rec_iter)
assert record.rec_type == 'warcinfo'
record = next(rec_iter)
assert record.rec_type == 'response'
assert record.rec_headers.get_header('warc-target-uri') == url
record = next(rec_iter)
assert record.rec_type == 'request'
assert record.rec_headers.get_header('warc-target-uri') == url
record = next(rec_iter)
assert record.rec_type == 'response'
assert record.rec_headers.get_header('warc-target-uri') == url
record = next(rec_iter)
assert record.rec_type == 'request'
assert record.rec_headers.get_header('warc-target-uri') == url
if not isinstance(warcprox_.dedup_db, warcprox.bigtable.RethinkCapturesDedup):
record = next(rec_iter)
assert record.rec_type == 'response'
assert record.rec_headers.get_header('warc-target-uri') == url
record = next(rec_iter)
assert record.rec_type == 'request'
assert record.rec_headers.get_header('warc-target-uri') == url
with pytest.raises(StopIteration):
next(rec_iter)
if __name__ == '__main__':
    pytest.main()
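
For reference, the digests the lookups assert can be reproduced with hashlib alone: per the docstrings the small text payload is b'aa', and the sketch below assumes warcprox.digest_str() amounts to prefixing the algorithm name to the hex digest (an assumption for illustration, not warcprox's exact code):

```
import hashlib

# Reproduce the digest that test_dedup_min_text_size looks up.
payload_digest = hashlib.new('sha1')
payload_digest.update(b'aa')
digest_str = ('sha1:' + payload_digest.hexdigest()).encode('ascii')
assert digest_str == b'sha1:e0c9035898dd52fc65c41454cec9c4d2611bfb37'
```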