From e6a1a7dd7e5633249abd4ad865dad48b9aef8241 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Mon, 6 Dec 2021 17:29:02 -0800 Subject: [PATCH 1/2] increase trough dedup batch window --- warcprox/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/warcprox/__init__.py b/warcprox/__init__.py index 9cd09a8..9fe3a74 100644 --- a/warcprox/__init__.py +++ b/warcprox/__init__.py @@ -1,7 +1,7 @@ """ warcprox/__init__.py - warcprox package main file, contains some utility code -Copyright (C) 2013-2019 Internet Archive +Copyright (C) 2013-2021 Internet Archive This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License @@ -175,8 +175,8 @@ class BaseStandardPostfetchProcessor(BasePostfetchProcessor): class BaseBatchPostfetchProcessor(BasePostfetchProcessor): MAX_BATCH_SIZE = 500 - MAX_BATCH_SEC = 10 - MIN_BATCH_SEC = 2.0 + MAX_BATCH_SEC = 30 + MIN_BATCH_SEC = 10 def _get_process_put(self): batch = [] From b67f1ad0f302489afc5bf4511fbc99179dfc736c Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Mon, 6 Dec 2021 17:29:27 -0800 Subject: [PATCH 2/2] add logging --- warcprox/dedup.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/warcprox/dedup.py b/warcprox/dedup.py index 98cfea1..e8e95c7 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -384,6 +384,9 @@ class BatchTroughStorer(warcprox.BaseBatchPostfetchProcessor): self.trough_dedup_db.batch_save, buckets[bucket], bucket) fs[future] = bucket + logging.debug( + 'storing dedup info for %s urls ' + 'in bucket %s', len(buckets[bucket]), bucket) # wait for results try: @@ -434,6 +437,8 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor): warcprox.digest_str( recorded_url.payload_digest, self.options.base32) if recorded_url.payload_digest else 'n/a') + self.logger.debug( + 'hash_plus_urls: {}'.format(hash_plus_urls)) self.logger.debug( 'len(batch)=%s len(discards)=%s buckets=%s', len(batch), len(discards),