From fee200c72c4ae706636b9af855d27c38908ab6d5 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 4 Sep 2015 01:30:16 +0000 Subject: [PATCH] get rid of silly _decode because we know which fields are bytes and which str --- warcprox/writerthread.py | 25 ++++--------------------- 1 file changed, 4 insertions(+), 21 deletions(-) diff --git a/warcprox/writerthread.py b/warcprox/writerthread.py index 182835f..d656951 100644 --- a/warcprox/writerthread.py +++ b/warcprox/writerthread.py @@ -58,34 +58,17 @@ class WarcWriterThread(threading.Thread): # closest thing we have to heritrix crawl log at the moment def _log(self, recorded_url, records): - def _decode(x): - if isinstance(x, bytes): - return x.decode("utf-8") - else: - return x - try: payload_digest = records[0].get_header(warctools.WarcRecord.PAYLOAD_DIGEST).decode("utf-8") except: payload_digest = "-" - mimetype = _decode(recorded_url.content_type) - if mimetype: - n = mimetype.find(";") - if n >= 0: - mimetype = mimetype[:n] # 2015-07-17T22:32:23.672Z 1 58 dns:www.dhss.delaware.gov P http://www.dhss.delaware.gov/dhss/ text/dns #045 20150717223214881+316 sha1:63UTPB7GTWIHAGIK3WWL76E57BBTJGAK http://www.dhss.delaware.gov/dhss/ - {"warcFileOffset":2964,"warcFilename":"ARCHIVEIT-1303-WEEKLY-JOB165158-20150717223222113-00000.warc.gz"} self.logger.info("{} {} {} {} {} size={} {} {} {} offset={}".format( - _decode(recorded_url.client_ip), - _decode(recorded_url.status), - _decode(recorded_url.method), - _decode(recorded_url.url), - mimetype, - recorded_url.size, - _decode(payload_digest), - _decode(records[0].get_header(warctools.WarcRecord.TYPE)), - _decode(records[0].warc_filename), - records[0].offset)) + recorded_url.client_ip, recorded_url.status, recorded_url.method, + recorded_url.url.decode("utf-8"), recorded_url.mimetype, + recorded_url.size, payload_digest, records[0].type.decode("utf-8"), + records[0].warc_filename, records[0].offset)) def _final_tasks(self, recorded_url, records): if self.listeners: