some logging improvements

This commit is contained in:
Noah Levitt 2018-07-18 19:25:43 -05:00
parent f4cf782922
commit b7e12a3ec2
4 changed files with 20 additions and 16 deletions

View File

@@ -148,6 +148,8 @@ class BasePostfetchProcessor(threading.Thread):
 raise Exception('not implemented')
 def _run(self):
+threading.current_thread().name = '%s(tid=%s)' % (
+threading.current_thread().name, gettid())
 self.logger.info('%s starting up', self)
 self._startup()
 while not self.stop.is_set():

View File

@@ -405,7 +405,9 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor):
 recorded_url.payload_digest, self.options.base32)
 if recorded_url.payload_digest else 'n/a')
 self.logger.debug(
-'filtered out digests (not loading dedup): %r', discards)
+'len(batch)=%s len(discards)=%s buckets=%s',
+len(batch), len(discards),
+{bucket: len(buckets[bucket]) for bucket in buckets})
 return buckets
 def _build_key_index(self, batch):
@@ -459,8 +461,8 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor):
 novel = sorted([
 k for k in key_index.keys() if k not in dups])
 self.logger.debug(
-'bucket %s: dups=%r novel=%r',
-bucket, dups, novel)
+'bucket %s: dups(%s)=%r novel(%s)=%r',
+bucket, len(dups), dups, len(novel), novel)
 except futures.TimeoutError as e:
 # the remaining threads actually keep running in this case,

View File

@@ -204,13 +204,14 @@ class WarcWriter:
 record.offset = offset
 record.length = warc.f.tell() - offset
 record.warc_filename = warc.finalname
-self.logger.debug(
+self.logger.trace(
 'wrote warc record: warc_type=%s content_length=%s '
-'url=%s warc=%s offset=%d',
-record.get_header(warctools.WarcRecord.TYPE),
+'digest=%s offset=%d warc=%s url=%s',
+record.type,
 record.get_header(warctools.WarcRecord.CONTENT_LENGTH),
-record.get_header(warctools.WarcRecord.URL),
-warc.path, record.offset)
+record.get_header(b'WARC-Payload-Digest'),
+record.offset, warc.path,
+record.get_header(warctools.WarcRecord.URL))
 return records

View File

@@ -117,19 +117,18 @@ class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor):
 and self._filter_accepts(recorded_url))
 def _log(self, recorded_url, records):
-try:
-payload_digest = records[0].get_header('WARC-Payload-Digest').decode("utf-8")
-except:
-payload_digest = "-"
 # 2015-07-17T22:32:23.672Z 1 58 dns:www.dhss.delaware.gov P http://www.dhss.delaware.gov/dhss/ text/dns #045 20150717223214881+316 sha1:63UTPB7GTWIHAGIK3WWL76E57BBTJGAK http://www.dhss.delaware.gov/dhss/ - {"warcFileOffset":2964,"warcFilename":"ARCHIVEIT-1303-WEEKLY-JOB165158-20150717223222113-00000.warc.gz"}
-type_ = records[0].type.decode("utf-8") if records else '-'
+try:
+payload_digest = records[0].get_header(b'WARC-Payload-Digest').decode('utf-8')
+except:
+payload_digest = '-'
+type_ = records[0].type.decode('utf-8') if records else '-'
 filename = records[0].warc_filename if records else '-'
 offset = records[0].offset if records else '-'
 self.logger.info(
-"%s %s %s %s %s size=%s %s %s %s offset=%s",
+'%s %s %s %s %s size=%s %s %s %s offset=%s',
 recorded_url.client_ip, recorded_url.status,
-recorded_url.method, recorded_url.url.decode("utf-8"),
+recorded_url.method, recorded_url.url.decode('utf-8'),
 recorded_url.mimetype, recorded_url.size, payload_digest,
 type_, filename, offset)