From 6a641074784a2ce144691023bb9de7f5eda6bc21 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 17 Jan 2018 12:27:19 -0800 Subject: [PATCH] don't keep next processor waiting in batch postfetch processor, accumulate urls for the next batch for at most 0.5 sec, if the outq is empty (i.e. the next processor is waiting idly) --- warcprox/__init__.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/warcprox/__init__.py b/warcprox/__init__.py index 60ca2ef..bc1365c 100644 --- a/warcprox/__init__.py +++ b/warcprox/__init__.py @@ -168,13 +168,24 @@ class BaseStandardPostfetchProcessor(BasePostfetchProcessor): class BaseBatchPostfetchProcessor(BasePostfetchProcessor): MAX_BATCH_SIZE = 500 MAX_BATCH_SEC = 10 + MIN_BATCH_SEC = 0.5 def _get_process_put(self): batch = [] start = time.time() - while (len(batch) < self.MAX_BATCH_SIZE - and time.time() - start < self.MAX_BATCH_SEC): + while True: + if len(batch) >= self.MAX_BATCH_SIZE: + break # full batch + + elapsed = time.time() - start + if elapsed >= self.MAX_BATCH_SEC: + break # been batching for a while + + if (elapsed >= self.MIN_BATCH_SEC and self.outq + and len(self.outq.queue) == 0): + break # next processor is waiting on us + try: batch.append(self.inq.get(block=True, timeout=0.5)) except queue.Empty: