From 419e5bc536c58334e3554ca235c626c2828de687 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Mon, 5 Jun 2023 17:56:26 -0700 Subject: [PATCH 1/3] fix typos --- warcprox/writerthread.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/warcprox/writerthread.py b/warcprox/writerthread.py index e0628c8..e0942f0 100644 --- a/warcprox/writerthread.py +++ b/warcprox/writerthread.py @@ -117,7 +117,7 @@ class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor): and "ait-job-id" in recorded_url.warcprox_meta["metadata"] ): crawl_id = recorded_url.warcprox_meta["metadata"]["ait-job-id"] - if recorded_url.payload_digest in revisits[crawl_id]: + if recorded_url.payload_digest in self.revisits[crawl_id]: self.logger.info( "Found duplicate revisit, skipping: %s, hash: %s", recorded_url.url, @@ -125,7 +125,7 @@ class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor): ) return True else: - revisits[crawl_id].add(recorded_url.payload_digest) + self.revisits[crawl_id].add(recorded_url.payload_digest) return False From 2765942421ae4dd62830a85bde84cba25476095d Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 6 Jun 2023 12:27:13 -0700 Subject: [PATCH 2/3] fix logging --- warcprox/writerthread.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/warcprox/writerthread.py b/warcprox/writerthread.py index e0942f0..7da2053 100644 --- a/warcprox/writerthread.py +++ b/warcprox/writerthread.py @@ -118,11 +118,7 @@ class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor): ): crawl_id = recorded_url.warcprox_meta["metadata"]["ait-job-id"] if recorded_url.payload_digest in self.revisits[crawl_id]: - self.logger.info( - "Found duplicate revisit, skipping: %s, hash: %s", - recorded_url.url, - recorded_url.payload_digest, - ) + self._log(recorded_url, None, annotation="_skip_revisit") return True else: self.revisits[crawl_id].add(recorded_url.payload_digest) @@ -147,7 +143,7 @@ class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor): return False return False - def _log(self, recorded_url, records): + def _log(self, recorded_url, records, annotation=""): # 2015-07-17T22:32:23.672Z 1 58 dns:www.dhss.delaware.gov P http://www.dhss.delaware.gov/dhss/ text/dns #045 20150717223214881+316 sha1:63UTPB7GTWIHAGIK3WWL76E57BBTJGAK http://www.dhss.delaware.gov/dhss/ - {"warcFileOffset":2964,"warcFilename":"ARCHIVEIT-1303-WEEKLY-JOB165158-20150717223222113-00000.warc.gz"} try: payload_digest = records[0].get_header(b'WARC-Payload-Digest').decode('utf-8') @@ -157,11 +153,11 @@ class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor): filename = records[0].warc_filename if records else '-' offset = records[0].offset if records else '-' self.logger.info( - '%s %s %s %s %s size=%s %s %s %s offset=%s', + '%s %s %s %s %s size=%s %s %s %s offset=%s %s', recorded_url.client_ip, recorded_url.status, recorded_url.method, recorded_url.url.decode('utf-8'), recorded_url.mimetype, recorded_url.size, payload_digest, - type_, filename, offset) + type_, filename, offset, annotation) def _shutdown(self): self.writer_pool.close_writers() From 4f0644727da0591d47523ba8bde5c933aa3c77a2 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 8 Jun 2023 17:08:30 -0700 Subject: [PATCH 3/3] get bytes from payload_digest obj --- warcprox/writerthread.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/warcprox/writerthread.py b/warcprox/writerthread.py index 7da2053..001bb36 100644 --- a/warcprox/writerthread.py +++ b/warcprox/writerthread.py @@ -2,7 +2,7 @@ warcprox/writerthread.py - warc writer thread, reads from the recorded url queue, writes warc records, runs final tasks after warc records are written -Copyright (C) 2013-2019 Internet Archive +Copyright (C) 2013-2023 Internet Archive This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License @@ -117,11 +117,13 @@ class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor): and "ait-job-id" in recorded_url.warcprox_meta["metadata"] ): crawl_id = recorded_url.warcprox_meta["metadata"]["ait-job-id"] - if recorded_url.payload_digest in self.revisits[crawl_id]: + hash = warcprox.digest_str( recorded_url.payload_digest, self.options.base32) + if hash in self.revisits[crawl_id]: self._log(recorded_url, None, annotation="_skip_revisit") return True else: - self.revisits[crawl_id].add(recorded_url.payload_digest) + self._log(recorded_url, None, annotation="_keep_revisit") + self.revisits[crawl_id].add(hash) return False