Merge pull request #185 from galgeek/qa

qa
This commit is contained in:
Barbara Miller 2023-06-06 13:30:47 -07:00 committed by GitHub
commit 1863551cba
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -117,15 +117,11 @@ class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor):
and "ait-job-id" in recorded_url.warcprox_meta["metadata"] and "ait-job-id" in recorded_url.warcprox_meta["metadata"]
): ):
crawl_id = recorded_url.warcprox_meta["metadata"]["ait-job-id"] crawl_id = recorded_url.warcprox_meta["metadata"]["ait-job-id"]
if recorded_url.payload_digest in revisits[crawl_id]: if recorded_url.payload_digest in self.revisits[crawl_id]:
self.logger.info( self._log(recorded_url, None, annotation="_skip_revisit")
"Found duplicate revisit, skipping: %s, hash: %s",
recorded_url.url,
recorded_url.payload_digest,
)
return True return True
else: else:
revisits[crawl_id].add(recorded_url.payload_digest) self.revisits[crawl_id].add(recorded_url.payload_digest)
return False return False
@@ -147,7 +143,7 @@ class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor):
return False return False
return False return False
def _log(self, recorded_url, records): def _log(self, recorded_url, records, annotation=""):
# 2015-07-17T22:32:23.672Z 1 58 dns:www.dhss.delaware.gov P http://www.dhss.delaware.gov/dhss/ text/dns #045 20150717223214881+316 sha1:63UTPB7GTWIHAGIK3WWL76E57BBTJGAK http://www.dhss.delaware.gov/dhss/ - {"warcFileOffset":2964,"warcFilename":"ARCHIVEIT-1303-WEEKLY-JOB165158-20150717223222113-00000.warc.gz"} # 2015-07-17T22:32:23.672Z 1 58 dns:www.dhss.delaware.gov P http://www.dhss.delaware.gov/dhss/ text/dns #045 20150717223214881+316 sha1:63UTPB7GTWIHAGIK3WWL76E57BBTJGAK http://www.dhss.delaware.gov/dhss/ - {"warcFileOffset":2964,"warcFilename":"ARCHIVEIT-1303-WEEKLY-JOB165158-20150717223222113-00000.warc.gz"}
try: try:
payload_digest = records[0].get_header(b'WARC-Payload-Digest').decode('utf-8') payload_digest = records[0].get_header(b'WARC-Payload-Digest').decode('utf-8')
@@ -157,11 +153,11 @@ class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor):
filename = records[0].warc_filename if records else '-' filename = records[0].warc_filename if records else '-'
offset = records[0].offset if records else '-' offset = records[0].offset if records else '-'
self.logger.info( self.logger.info(
'%s %s %s %s %s size=%s %s %s %s offset=%s', '%s %s %s %s %s size=%s %s %s %s offset=%s %s',
recorded_url.client_ip, recorded_url.status, recorded_url.client_ip, recorded_url.status,
recorded_url.method, recorded_url.url.decode('utf-8'), recorded_url.method, recorded_url.url.decode('utf-8'),
recorded_url.mimetype, recorded_url.size, payload_digest, recorded_url.mimetype, recorded_url.size, payload_digest,
type_, filename, offset) type_, filename, offset, annotation)
def _shutdown(self): def _shutdown(self):
self.writer_pool.close_writers() self.writer_pool.close_writers()