mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
fix logging
This commit is contained in:
parent
419e5bc536
commit
2765942421
@@ -118,11 +118,7 @@ class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor):
|
||||
):
|
||||
crawl_id = recorded_url.warcprox_meta["metadata"]["ait-job-id"]
|
||||
if recorded_url.payload_digest in self.revisits[crawl_id]:
|
||||
self.logger.info(
|
||||
"Found duplicate revisit, skipping: %s, hash: %s",
|
||||
recorded_url.url,
|
||||
recorded_url.payload_digest,
|
||||
)
|
||||
self._log(recorded_url, None, annotation="_skip_revisit")
|
||||
return True
|
||||
else:
|
||||
self.revisits[crawl_id].add(recorded_url.payload_digest)
|
||||
@@ -147,7 +143,7 @@ class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor):
|
||||
return False
|
||||
return False
|
||||
|
||||
def _log(self, recorded_url, records):
|
||||
def _log(self, recorded_url, records, annotation=""):
|
||||
# 2015-07-17T22:32:23.672Z 1 58 dns:www.dhss.delaware.gov P http://www.dhss.delaware.gov/dhss/ text/dns #045 20150717223214881+316 sha1:63UTPB7GTWIHAGIK3WWL76E57BBTJGAK http://www.dhss.delaware.gov/dhss/ - {"warcFileOffset":2964,"warcFilename":"ARCHIVEIT-1303-WEEKLY-JOB165158-20150717223222113-00000.warc.gz"}
|
||||
try:
|
||||
payload_digest = records[0].get_header(b'WARC-Payload-Digest').decode('utf-8')
|
||||
@@ -157,11 +153,11 @@ class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor):
|
||||
filename = records[0].warc_filename if records else '-'
|
||||
offset = records[0].offset if records else '-'
|
||||
self.logger.info(
|
||||
'%s %s %s %s %s size=%s %s %s %s offset=%s',
|
||||
'%s %s %s %s %s size=%s %s %s %s offset=%s %s',
|
||||
recorded_url.client_ip, recorded_url.status,
|
||||
recorded_url.method, recorded_url.url.decode('utf-8'),
|
||||
recorded_url.mimetype, recorded_url.size, payload_digest,
|
||||
type_, filename, offset)
|
||||
type_, filename, offset, annotation)
|
||||
|
||||
def _shutdown(self):
|
||||
self.writer_pool.close_writers()
|
||||
|
Loading…
x
Reference in New Issue
Block a user