From dfc081fff82c53039a9332141d8a99cab753df07 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 2 May 2019 14:25:29 -0700 Subject: [PATCH] do not write incorrect warc-payload-digest to... ... request records see https://github.com/webrecorder/warcio/issues/74#issuecomment-487816378 --- setup.py | 2 +- warcprox/warc.py | 51 +++++++++++++++++++++++++++++------------------- 2 files changed, 32 insertions(+), 21 deletions(-) diff --git a/setup.py b/setup.py index 75177d8..9ab99c9 100755 --- a/setup.py +++ b/setup.py @@ -42,7 +42,7 @@ except: setuptools.setup( name='warcprox', - version='2.4.8', + version='2.4.9', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/warcprox/warc.py b/warcprox/warc.py index 94fe137..1eceee2 100644 --- a/warcprox/warc.py +++ b/warcprox/warc.py @@ -125,48 +125,59 @@ class WarcRecordBuilder: headers.append((warctools.WarcRecord.CONCURRENT_TO, concurrent_to)) if content_type is not None: headers.append((warctools.WarcRecord.CONTENT_TYPE, content_type)) - if payload_digest is not None: - headers.append((warctools.WarcRecord.PAYLOAD_DIGEST, payload_digest)) # truncated value may be 'length' or 'time' if truncated is not None: headers.append((b'WARC-Truncated', truncated)) + if content_length is not None: + headers.append(( + warctools.WarcRecord.CONTENT_LENGTH, + str(content_length).encode('latin1'))) if recorder is not None: - if content_length is not None: - headers.append(( - warctools.WarcRecord.CONTENT_LENGTH, - str(content_length).encode('latin1'))) - else: + if payload_digest is not None: + headers.append( + (warctools.WarcRecord.PAYLOAD_DIGEST, payload_digest)) + if content_length is None: headers.append(( warctools.WarcRecord.CONTENT_LENGTH, str(len(recorder)).encode('latin1'))) headers.append((warctools.WarcRecord.BLOCK_DIGEST, warcprox.digest_str(recorder.block_digest, self.base32))) recorder.tempfile.seek(0) - record = warctools.WarcRecord(headers=headers, content_file=recorder.tempfile) + record = warctools.WarcRecord( + headers=headers, content_file=recorder.tempfile) else: - if content_length is not None: - headers.append(( - warctools.WarcRecord.CONTENT_LENGTH, - str(content_length).encode('latin1'))) - else: + if content_length is None: headers.append(( warctools.WarcRecord.CONTENT_LENGTH, str(len(data)).encode('latin1'))) - # no http headers so block digest == payload digest - if not payload_digest: - payload_digest = warcprox.digest_str( + + block_digest = None + if not hasattr(data, 'read'): + block_digest = warcprox.digest_str( hashlib.new(self.digest_algorithm, data), self.base32) - headers.append(( - warctools.WarcRecord.PAYLOAD_DIGEST, payload_digest)) - headers.append((warctools.WarcRecord.BLOCK_DIGEST, payload_digest)) + + if not content_type.lower().startswith(b'application/http'): + # no http headers, so block digest == payload digest + if payload_digest and not block_digest: + block_digest = payload_digest + elif block_digest and not payload_digest: + payload_digest = block_digest + + if block_digest: + headers.append( + (warctools.WarcRecord.BLOCK_DIGEST, block_digest)) + if payload_digest: + headers.append( + (warctools.WarcRecord.PAYLOAD_DIGEST, payload_digest)) + if hasattr(data, 'read'): record = warctools.WarcRecord( headers=headers, content_file=data) else: content_tuple = content_type, data record = warctools.WarcRecord( - headers=headers, content=content_tuple) + headers=headers, content=(content_type, data)) return record