add some debug logging in BatchTroughLoader

This commit is contained in:
Noah Levitt 2018-05-18 17:29:38 -07:00
parent b762d6468b
commit 997d4341fe
2 changed files with 21 additions and 2 deletions

View File

@ -40,7 +40,7 @@ except:
setuptools.setup( setuptools.setup(
name='warcprox', name='warcprox',
version='2.4b2.dev173', version='2.4b2.dev174',
description='WARC writing MITM HTTP/S proxy', description='WARC writing MITM HTTP/S proxy',
url='https://github.com/internetarchive/warcprox', url='https://github.com/internetarchive/warcprox',
author='Noah Levitt', author='Noah Levitt',

View File

@ -373,6 +373,8 @@ class BatchTroughStorer(warcprox.BaseBatchPostfetchProcessor):
'timed out saving dedup info to trough', exc_info=True) 'timed out saving dedup info to trough', exc_info=True)
class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor): class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor):
logger = logging.getLogger("warcprox.dedup.BatchTroughLoader")
def __init__(self, trough_dedup_db, options=warcprox.Options()): def __init__(self, trough_dedup_db, options=warcprox.Options()):
warcprox.BaseBatchPostfetchProcessor.__init__(self, options) warcprox.BaseBatchPostfetchProcessor.__init__(self, options)
self.trough_dedup_db = trough_dedup_db self.trough_dedup_db = trough_dedup_db
@ -386,6 +388,7 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor):
be looked up. be looked up.
''' '''
buckets = collections.defaultdict(list) buckets = collections.defaultdict(list)
discards = []
for recorded_url in batch: for recorded_url in batch:
if (recorded_url.response_recorder if (recorded_url.response_recorder
and recorded_url.payload_digest and recorded_url.payload_digest
@ -396,6 +399,13 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor):
else: else:
bucket = '__unspecified__' bucket = '__unspecified__'
buckets[bucket].append(recorded_url) buckets[bucket].append(recorded_url)
else:
discards.append(
warcprox.digest_str(
recorded_url.payload_digest, self.options.base32)
if recorded_url.payload_digest else 'n/a')
self.logger.debug(
'filtered out digests (not loading dedup): %r', discards)
return buckets return buckets
def _build_key_index(self, batch): def _build_key_index(self, batch):
@ -443,10 +453,19 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor):
'problem looking up dedup info for %s urls ' 'problem looking up dedup info for %s urls '
'in bucket %s', len(buckets[bucket]), bucket, 'in bucket %s', len(buckets[bucket]), bucket,
exc_info=True) exc_info=True)
if self.logger.isEnabledFor(logging.DEBUG):
dups = sorted([e['digest_key'] for e in future.result()])
novel = sorted([
k for k in key_index.keys() if k not in dups])
self.logger.debug(
'bucket %s: dups=%r novel=%r',
bucket, dups, novel)
except futures.TimeoutError as e: except futures.TimeoutError as e:
# the remaining threads actually keep running in this case, # the remaining threads actually keep running in this case,
# there's no way to stop them, but that should be harmless # there's no way to stop them, but that should be harmless
logging.warn( self.logger.warn(
'timed out loading dedup info from trough', exc_info=True) 'timed out loading dedup info from trough', exc_info=True)
class TroughDedupDb(DedupDb, DedupableMixin): class TroughDedupDb(DedupDb, DedupableMixin):