From d3314d7904783100f3a4c83fca7a9c7c00aebd90 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 18 Jul 2018 19:26:16 -0500 Subject: [PATCH] hopefully fix a trough dedup concurrency bug --- warcprox/dedup.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/warcprox/dedup.py b/warcprox/dedup.py index c0bc817..11fff7a 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -434,8 +434,9 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor): fs = {} with futures.ThreadPoolExecutor(max_workers=len(buckets)) as pool: # send off the trough requests in parallel + key_indexes = {} for bucket in buckets: - key_index = self._build_key_index(buckets[bucket]) + key_indexes[bucket] = self._build_key_index(buckets[bucket]) future = pool.submit( self.trough_dedup_db.batch_lookup, key_index.keys(), bucket) @@ -446,6 +447,7 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor): for future in futures.as_completed(fs, timeout=20): bucket = fs[future] try: + key_index = key_indexes[bucket] for entry in future.result(): for recorded_url in key_index[entry['digest_key']]: recorded_url.dedup_info = entry