From 3c215b42b56f45537973e91cc4a66050364a31b8 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 10 Nov 2017 14:34:06 -0800 Subject: [PATCH 1/4] missed a spot handling case of no warc records written --- setup.py | 2 +- warcprox/stats.py | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/setup.py b/setup.py index 0b5c891..56c39f5 100755 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ except: setuptools.setup( name='warcprox', - version='2.2.1b2.dev112', + version='2.2.1b2.dev113', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/warcprox/stats.py b/warcprox/stats.py index 99e6804..254f764 100644 --- a/warcprox/stats.py +++ b/warcprox/stats.py @@ -305,7 +305,6 @@ class RethinkStatsDb(StatsDb): def tally(self, recorded_url, records): buckets = self.buckets(recorded_url) - is_revisit = records[0].type == b'revisit' with self._batch_lock: for bucket in buckets: bucket_stats = self._batch.setdefault( @@ -314,12 +313,13 @@ class RethinkStatsDb(StatsDb): bucket_stats["total"]["urls"] += 1 bucket_stats["total"]["wire_bytes"] += recorded_url.size - if is_revisit: - bucket_stats["revisit"]["urls"] += 1 - bucket_stats["revisit"]["wire_bytes"] += recorded_url.size - else: - bucket_stats["new"]["urls"] += 1 - bucket_stats["new"]["wire_bytes"] += recorded_url.size + if records: + if records[0].type == b'revisit': + bucket_stats["revisit"]["urls"] += 1 + bucket_stats["revisit"]["wire_bytes"] += recorded_url.size + else: + bucket_stats["new"]["urls"] += 1 + bucket_stats["new"]["wire_bytes"] += recorded_url.size def _add_to_batch(self, add_me): with self._batch_lock: From 30b6b0b337782c1f5ce918355f9009770bbae6a2 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 10 Nov 2017 17:02:33 -0800 Subject: [PATCH 2/4] new failing test for correct calculation of payload digest which should match rfc2616 entity body, which is transfer decoded but not content-decoded --- tests/test_warcprox.py | 149 ++++++++++++++++++++++++++++++++++++++++- warcprox/__init__.py | 2 +- warcprox/mitmproxy.py | 3 +- 3 files changed, 151 insertions(+), 3 deletions(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 0a357b2..4d1caab 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -46,6 +46,10 @@ from collections import Counter import socket import datetime import warcio.archiveiterator +import io +import gzip +import mock +import email.message try: import http.server as http_server @@ -84,7 +88,7 @@ def _send(self, data): # http_client.HTTPConnection.send = _send logging.basicConfig( - stream=sys.stdout, level=logging.DEBUG, # level=warcprox.TRACE, + stream=sys.stdout, level=warcprox.TRACE, format='%(asctime)s %(process)d %(levelname)s %(threadName)s ' '%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s') logging.getLogger("requests.packages.urllib3").setLevel(logging.WARN) @@ -134,6 +138,24 @@ def dump_state(signum=None, frame=None): signal.signal(signal.SIGQUIT, dump_state) +def chunkify(buf, chunk_size=13): + i = 0 + result = b'' + while i < len(buf): + chunk_len = min(len(buf) - i, chunk_size) + result += ('%x\r\n' % chunk_len).encode('ascii') + result += buf[i:i+chunk_len] + result += b'\r\n' + i += chunk_size + result += b'0\r\n\r\n' + return result + +# def gzipify(buf): +# with io.BytesIO() as outbuf: +# with gzip.GzipFile(fileobj=outbuf, mode='wb') as gz: +# gz.write(buf) +# return outbuf.getvalue() + class _TestHttpRequestHandler(http_server.BaseHTTPRequestHandler): 
def build_response(self): m = re.match(r'^/([^/]+)/([^/]+)$', self.path) @@ -150,6 +172,71 @@ class _TestHttpRequestHandler(http_server.BaseHTTPRequestHandler): + b'Content-Type: text/plain\r\n' + b'\r\n') payload = b'This response is missing a Content-Length http header.' + elif self.path.startswith('/test_payload_digest-'): + content_body = ( + b'Hello. How are you. I am the test_payload_digest ' + b'content body. The entity body is a possibly content-' + b'encoded version of me. The message body is a possibly ' + b'transfer-encoded version of the entity body.\n') + gzipped = ( + b"\x1f\x8b\x08\x00jA\x06Z\x02\xffm\x8d1\x0e\xc20\x10\x04{^" + b"\xb1\x1f\xc0\xef\x08=}t\x897\xc1\x92\xed\x8b|\x07\xc8" + b"\xbf'\n\xa2@J9\xab\x19\xed\xc0\x9c5`\xd07\xa4\x11]\x9f" + b"\x017H\x81?\x08\xa7\xf9\xb8I\xcf*q\x8ci\xdd\x11\xb3VguL" + b"\x1a{\xc0}\xb7vJ\xde\x8f\x01\xc9 \xd8\xd4,M\xb9\xff\xdc" + b"+\xeb\xac\x91\x11/6KZ\xa1\x0b\n\xbfq\xa1\x99\xac<\xab" + b"\xbdI\xb5\x85\xed,\xf7\xff\xdfp\xf9\x00\xfc\t\x02\xb0" + b"\xc8\x00\x00\x00") + double_gzipped = ( + b"\x1f\x8b\x08\x00jA\x06Z\x02\xff\x01\x89\x00v\xff\x1f\x8b" + b"\x08\x00jA\x06Z\x02\xffm\x8d1\x0e\xc20\x10\x04{^\xb1\x1f" + b"\xc0\xef\x08=}t\x897\xc1\x92\xed\x8b|\x07\xc8\xbf'\n\xa2" + b"@J9\xab\x19\xed\xc0\x9c5`\xd07\xa4\x11]\x9f\x017H\x81?" + b"\x08\xa7\xf9\xb8I\xcf*q\x8ci\xdd\x11\xb3VguL\x1a{\xc0}" + b"\xb7vJ\xde\x8f\x01\xc9 \xd8\xd4,M\xb9\xff\xdc+\xeb\xac" + b"\x91\x11/6KZ\xa1\x0b\n\xbfq\xa1\x99\xac<\xab\xbdI\xb5" + b"\x85\xed,\xf7\xff\xdfp\xf9\x00\xfc\t\x02\xb0\xc8\x00\x00" + b"\x00\xf9\xdd\x8f\xed\x89\x00\x00\x00") + if self.path == '/test_payload_digest-plain': + payload = content_body + actual_headers = (b'Content-Type: text/plain\r\n' + + b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n') + elif self.path == '/test_payload_digest-gzip': + payload = gzipped + actual_headers = (b'Content-Type: application/gzip\r\n' + + b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n') + elif self.path == '/test_payload_digest-ce-gzip': + payload = gzipped + actual_headers = (b'Content-Type: text/plain\r\n' + + b'Content-Encoding: gzip\r\n' + + b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n') + elif self.path == '/test_payload_digest-gzip-ce-gzip': + payload = double_gzipped + actual_headers = (b'Content-Type: application/gzip\r\n' + + b'Content-Encoding: gzip\r\n' + + b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n') + elif self.path == '/test_payload_digest-te-chunked': + payload = chunkify(content_body) + actual_headers = (b'Content-Type: text/plain\r\n' + + b'Transfer-Encoding: chunked\r\n') + elif self.path == '/test_payload_digest-gzip-te-chunked': + payload = chunkify(gzipped) + actual_headers = (b'Content-Type: application/gzip\r\n' + + b'Transfer-Encoding: chunked\r\n') + elif self.path == '/test_payload_digest-ce-gzip-te-chunked': + payload = chunkify(gzipped) + actual_headers = (b'Content-Type: text/plain\r\n' + + b'Content-Encoding: gzip\r\n' + + b'Transfer-Encoding: chunked\r\n') + elif self.path == '/test_payload_digest-gzip-ce-gzip-te-chunked': + payload = chunkify(double_gzipped) + actual_headers = (b'Content-Type: application/gzip\r\n' + + b'Content-Encoding: gzip\r\n' + + b'Transfer-Encoding: chunked\r\n') + else: + raise Exception('bad path') + headers = b'HTTP/1.1 200 OK\r\n' + actual_headers + b'\r\n' + logging.info('headers=%r payload=%r', headers, payload) else: payload = b'404 Not Found\n' headers = (b'HTTP/1.1 404 Not Found\r\n' @@ -1554,6 +1641,66 @@ def 
test_long_warcprox_meta( with pytest.raises(StopIteration): next(rec_iter) +def test_payload_digest(warcprox_, http_daemon): + ''' + Tests that digest is of RFC2616 "entity body" + (transfer-decoded but not content-decoded) + ''' + class HalfMockedMitm(warcprox.mitmproxy.MitmProxyHandler): + def __init__(self, url): + self.path = url + self.request_version = 'HTTP/1.1' + self.client_address = mock.MagicMock() + self.headers = email.message.Message() + self.headers.add_header('Host', 'localhost:%s' % http_daemon.server_port) + self.server = warcprox_.proxy + self.command = 'GET' + self.connection = mock.Mock() + + PLAIN_SHA1 = b'sha1:881289333370aa4e3214505f1173423cc5a896b7' + GZIP_SHA1 = b'sha1:634e25de71ae01edb5c5d9e2e99c4836bbe94129' + GZIP_GZIP_SHA1 = b'sha1:cecbf3a5c4975072f5e4c5e0489f808ef246c2b4' + + # plain + mitm = HalfMockedMitm('http://localhost:%s/test_payload_digest-plain' % http_daemon.server_port) + req, prox_rec_res = mitm.do_GET() + assert warcprox.digest_str(prox_rec_res.payload_digest) == PLAIN_SHA1 + + # content-type: application/gzip + mitm = HalfMockedMitm('http://localhost:%s/test_payload_digest-gzip' % http_daemon.server_port) + req, prox_rec_res = mitm.do_GET() + assert warcprox.digest_str(prox_rec_res.payload_digest) == GZIP_SHA1 + + # content-encoding: gzip + mitm = HalfMockedMitm('http://localhost:%s/test_payload_digest-ce-gzip' % http_daemon.server_port) + req, prox_rec_res = mitm.do_GET() + assert warcprox.digest_str(prox_rec_res.payload_digest) == GZIP_SHA1 + + # content-type: application/gzip && content-encoding: gzip + mitm = HalfMockedMitm('http://localhost:%s/test_payload_digest-gzip-ce-gzip' % http_daemon.server_port) + req, prox_rec_res = mitm.do_GET() + assert warcprox.digest_str(prox_rec_res.payload_digest) == GZIP_GZIP_SHA1 + + # chunked plain + mitm = HalfMockedMitm('http://localhost:%s/test_payload_digest-te-chunked' % http_daemon.server_port) + req, prox_rec_res = mitm.do_GET() + assert warcprox.digest_str(prox_rec_res.payload_digest) == PLAIN_SHA1 + + # chunked content-type: application/gzip + mitm = HalfMockedMitm('http://localhost:%s/test_payload_digest-gzip-te-chunked' % http_daemon.server_port) + req, prox_rec_res = mitm.do_GET() + assert warcprox.digest_str(prox_rec_res.payload_digest) == GZIP_SHA1 + + # chunked content-encoding: gzip + mitm = HalfMockedMitm('http://localhost:%s/test_payload_digest-ce-gzip-te-chunked' % http_daemon.server_port) + req, prox_rec_res = mitm.do_GET() + assert warcprox.digest_str(prox_rec_res.payload_digest) == GZIP_SHA1 + + # chunked content-type: application/gzip && content-encoding: gzip + mitm = HalfMockedMitm('http://localhost:%s/test_payload_digest-gzip-ce-gzip-te-chunked' % http_daemon.server_port) + req, prox_rec_res = mitm.do_GET() + assert warcprox.digest_str(prox_rec_res.payload_digest) == GZIP_GZIP_SHA1 + if __name__ == '__main__': pytest.main() diff --git a/warcprox/__init__.py b/warcprox/__init__.py index ecd6f53..e50a415 100644 --- a/warcprox/__init__.py +++ b/warcprox/__init__.py @@ -28,7 +28,7 @@ except ImportError: import Queue as queue import datetime -def digest_str(hash_obj, base32): +def digest_str(hash_obj, base32=False): import base64 return hash_obj.name.encode('utf-8') + b':' + ( base64.b32encode(hash_obj.digest()) if base32 diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index b14cddf..722311b 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -361,7 +361,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): return try: - self._proxy_request() + 
return self._proxy_request() except: self.logger.error("exception proxying request", exc_info=True) raise @@ -406,6 +406,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): if 'Content-Length' in self.headers: req += self.rfile.read(int(self.headers['Content-Length'])) + prox_rec_res = None try: self.logger.debug('sending to remote server req=%r', req) From 3a0f6e0947c859ebd45032ea7421d3d3449cbfd9 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 10 Nov 2017 17:18:22 -0800 Subject: [PATCH 3/4] fix payload digest by pulling calculation up one level where content has already been transfer-decoded --- setup.py | 2 +- tests/single-threaded-proxy.py | 4 ++-- warcprox/bigtable.py | 8 ++++---- warcprox/crawl_log.py | 2 +- warcprox/dedup.py | 11 +++++------ warcprox/mitmproxy.py | 25 +++++++++++++------------ warcprox/warc.py | 13 +++++-------- warcprox/warcproxy.py | 7 +++++-- 8 files changed, 36 insertions(+), 36 deletions(-) diff --git a/setup.py b/setup.py index 56c39f5..625abbb 100755 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ except: setuptools.setup( name='warcprox', - version='2.2.1b2.dev113', + version='2.2.1b2.dev114', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/tests/single-threaded-proxy.py b/tests/single-threaded-proxy.py index dd5e709..1c176a2 100755 --- a/tests/single-threaded-proxy.py +++ b/tests/single-threaded-proxy.py @@ -3,7 +3,7 @@ tests/single-threaded-proxy.py - single-threaded MITM proxy, useful for debugging, does not write warcs -Copyright (C) 2015-2016 Internet Archive +Copyright (C) 2015-2017 Internet Archive This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License @@ -46,7 +46,7 @@ class FakeQueue(object): logging.info("{} {} {} {} {} size={} {}".format( recorded_url.client_ip, recorded_url.status, recorded_url.method, recorded_url.url.decode("utf-8"), recorded_url.mimetype, - recorded_url.size, warcprox.digest_str(recorded_url.response_recorder.payload_digest, False).decode('utf-8'))) + recorded_url.size, warcprox.digest_str(recorded_url.payload_digest, False).decode('utf-8'))) def parse_args(): prog = os.path.basename(sys.argv[0]) diff --git a/warcprox/bigtable.py b/warcprox/bigtable.py index c9547b2..115aed9 100644 --- a/warcprox/bigtable.py +++ b/warcprox/bigtable.py @@ -140,16 +140,16 @@ class RethinkCaptures: return result def _assemble_entry(self, recorded_url, records): - if recorded_url.response_recorder: - if recorded_url.response_recorder.payload_digest.name == "sha1": + if recorded_url.payload_digest: + if recorded_url.payload_digest.name == "sha1": sha1base32 = base64.b32encode( - recorded_url.response_recorder.payload_digest.digest() + recorded_url.payload_digest.digest() ).decode("utf-8") else: self.logger.warn( "digest type is %r but big captures table is indexed " "by sha1", - recorded_url.response_recorder.payload_digest.name) + recorded_url.payload_digest.name) else: digest = hashlib.new("sha1", records[0].content[1]) sha1base32 = base64.b32encode(digest.digest()).decode("utf-8") diff --git a/warcprox/crawl_log.py b/warcprox/crawl_log.py index 5b4a4fc..f28683a 100644 --- a/warcprox/crawl_log.py +++ b/warcprox/crawl_log.py @@ -43,7 +43,7 @@ class CrawlLogger(object): if recorded_url.response_recorder: content_length = recorded_url.response_recorder.len - recorded_url.response_recorder.payload_offset payload_digest = warcprox.digest_str( - 
recorded_url.response_recorder.payload_digest, + recorded_url.payload_digest, self.options.base32) else: # WARCPROX_WRITE_RECORD request diff --git a/warcprox/dedup.py b/warcprox/dedup.py index b9cd223..be49104 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -96,8 +96,7 @@ class DedupDb(object): if (records and records[0].type == b'response' and recorded_url.response_recorder.payload_size() > 0): digest_key = warcprox.digest_str( - recorded_url.response_recorder.payload_digest, - self.options.base32) + recorded_url.payload_digest, self.options.base32) if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta: self.save( digest_key, records[0], @@ -108,9 +107,9 @@ class DedupDb(object): def decorate_with_dedup_info(dedup_db, recorded_url, base32=False): if (recorded_url.response_recorder - and recorded_url.response_recorder.payload_digest + and recorded_url.payload_digest and recorded_url.response_recorder.payload_size() > 0): - digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest, base32) + digest_key = warcprox.digest_str(recorded_url.payload_digest, base32) if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta: recorded_url.dedup_info = dedup_db.lookup(digest_key, recorded_url.warcprox_meta["captures-bucket"], recorded_url.url) @@ -174,8 +173,8 @@ class RethinkDedupDb: def notify(self, recorded_url, records): if (records and records[0].type == b'response' and recorded_url.response_recorder.payload_size() > 0): - digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest, - self.options.base32) + digest_key = warcprox.digest_str( + recorded_url.payload_digest, self.options.base32) if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta: self.save(digest_key, records[0], bucket=recorded_url.warcprox_meta["captures-bucket"]) else: diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index 722311b..2c34bcd 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -66,7 +66,7 @@ import time class ProxyingRecorder(object): """ Wraps a socket._fileobject, recording the bytes as they are read, - calculating digests, and sending them on to the proxy client. + calculating the block digest, and sending them on to the proxy client. 
""" logger = logging.getLogger("warcprox.mitmproxy.ProxyingRecorder") @@ -78,27 +78,19 @@ class ProxyingRecorder(object): self.digest_algorithm = digest_algorithm self.block_digest = hashlib.new(digest_algorithm) self.payload_offset = None - self.payload_digest = None self.proxy_client = proxy_client self._proxy_client_conn_open = True self.len = 0 self.url = url def payload_starts_now(self): - self.payload_digest = hashlib.new(self.digest_algorithm) self.payload_offset = self.len - def _update_payload_digest(self, hunk): - if self.payload_digest: - self.payload_digest.update(hunk) - def _update(self, hunk): - self._update_payload_digest(hunk) self.block_digest.update(hunk) - self.tempfile.write(hunk) - if self.payload_digest and self._proxy_client_conn_open: + if self.payload_offset is not None and self._proxy_client_conn_open: try: self.proxy_client.sendall(hunk) except BaseException as e: @@ -157,6 +149,7 @@ class ProxyingRecordingHTTPResponse(http_client.HTTPResponse): self, sock, debuglevel=debuglevel, method=method) self.proxy_client = proxy_client self.url = url + self.digest_algorithm = digest_algorithm # Keep around extra reference to self.fp because HTTPResponse sets # self.fp=None after it finishes reading, but we still need it @@ -164,6 +157,8 @@ class ProxyingRecordingHTTPResponse(http_client.HTTPResponse): self.fp, proxy_client, digest_algorithm, url=url) self.fp = self.recorder + self.payload_digest = None + def begin(self, extra_response_headers={}): http_client.HTTPResponse.begin(self) # reads status line, headers @@ -185,6 +180,12 @@ class ProxyingRecordingHTTPResponse(http_client.HTTPResponse): self.proxy_client.sendall(status_and_headers.encode('latin1')) self.recorder.payload_starts_now() + self.payload_digest = hashlib.new(self.digest_algorithm) + + def read(self, amt=None): + buf = http_client.HTTPResponse.read(self, amt) + self.payload_digest.update(buf) + return buf def via_header_value(orig, request_version): via = orig @@ -419,9 +420,9 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): url=self.url, method=self.command) prox_rec_res.begin(extra_response_headers=extra_response_headers) - buf = prox_rec_res.read(8192) + buf = prox_rec_res.read(65536) while buf != b'': - buf = prox_rec_res.read(8192) + buf = prox_rec_res.read(65536) self.log_request(prox_rec_res.status, prox_rec_res.recorder.len) except Exception as e: diff --git a/warcprox/warc.py b/warcprox/warc.py index de0ec06..6b9cbcf 100644 --- a/warcprox/warc.py +++ b/warcprox/warc.py @@ -53,7 +53,8 @@ class WarcRecordBuilder: refers_to=recorded_url.dedup_info.get('id'), refers_to_target_uri=recorded_url.dedup_info['url'], refers_to_date=recorded_url.dedup_info['date'], - payload_digest=warcprox.digest_str(recorded_url.response_recorder.payload_digest, self.base32), + payload_digest=warcprox.digest_str( + recorded_url.payload_digest, self.base32), profile=warctools.WarcRecord.PROFILE_IDENTICAL_PAYLOAD_DIGEST, content_type=hanzo.httptools.ResponseMessage.CONTENT_TYPE, remote_ip=recorded_url.remote_ip) @@ -64,7 +65,9 @@ class WarcRecordBuilder: recorder=recorded_url.response_recorder, warc_type=warctools.WarcRecord.RESPONSE, content_type=hanzo.httptools.ResponseMessage.CONTENT_TYPE, - remote_ip=recorded_url.remote_ip) + remote_ip=recorded_url.remote_ip, + payload_digest=warcprox.digest_str( + recorded_url.payload_digest, self.base32)) def build_warc_records(self, recorded_url): """Returns a tuple of hanzo.warctools.warc.WarcRecord (principal_record, ...)""" @@ -122,13 +125,8 @@ class 
WarcRecordBuilder: headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(recorder)).encode('latin1'))) headers.append((warctools.WarcRecord.BLOCK_DIGEST, warcprox.digest_str(recorder.block_digest, self.base32))) - if recorder.payload_digest is not None: - headers.append((warctools.WarcRecord.PAYLOAD_DIGEST, - warcprox.digest_str(recorder.payload_digest, self.base32))) - recorder.tempfile.seek(0) record = warctools.WarcRecord(headers=headers, content_file=recorder.tempfile) - else: headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(data)).encode('latin1'))) digest = hashlib.new(self.digest_algorithm, data) @@ -137,7 +135,6 @@ class WarcRecordBuilder: if not payload_digest: headers.append((warctools.WarcRecord.PAYLOAD_DIGEST, warcprox.digest_str(digest, self.base32))) - content_tuple = content_type, data record = warctools.WarcRecord(headers=headers, content=content_tuple) diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index afe1835..f1de01e 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -218,7 +218,8 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): content_type=content_type, method=self.command, timestamp=timestamp, host=self.hostname, duration=datetime.datetime.utcnow()-timestamp, - referer=self.headers.get('referer')) + referer=self.headers.get('referer'), + payload_digest=prox_rec_res.payload_digest) self.server.recorded_url_q.put(recorded_url) return recorded_url @@ -328,7 +329,8 @@ class RecordedUrl: def __init__(self, url, request_data, response_recorder, remote_ip, warcprox_meta=None, content_type=None, custom_type=None, status=None, size=None, client_ip=None, method=None, - timestamp=None, host=None, duration=None, referer=None): + timestamp=None, host=None, duration=None, referer=None, + payload_digest=None): # XXX should test what happens with non-ascii url (when does # url-encoding happen?) 
if type(url) is not bytes: @@ -366,6 +368,7 @@ class RecordedUrl: self.host = host self.duration = duration self.referer = referer + self.payload_digest = payload_digest # inherit from object so that multiple inheritance from this class works # properly in python 2 From ffc8a268ab517e76ccb4e543305195a9ca552932 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 13 Nov 2017 11:45:06 -0800 Subject: [PATCH 4/4] hopefully fix test failing occasionally apparently due to race condition by checking that the file we're waiting for has some content --- setup.py | 2 +- tests/test_warcprox.py | 32 +++++++++++++++++--------------- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/setup.py b/setup.py index 625abbb..ebeb213 100755 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ except: setuptools.setup( name='warcprox', - version='2.2.1b2.dev114', + version='2.2.1b2.dev115', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 4d1caab..97e4351 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1441,11 +1441,14 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies): assert response.status_code == 200 start = time.time() + file = os.path.join(warcprox_.options.crawl_log_dir, 'test_crawl_log_1.log') while time.time() - start < 10: - if os.path.exists(os.path.join( - warcprox_.options.crawl_log_dir, 'test_crawl_log_1.log')): + if os.path.exists(file) and os.stat(file).st_size > 0: break time.sleep(0.5) + assert os.path.exists(file) + assert os.path.exists(os.path.join( + warcprox_.options.crawl_log_dir, 'crawl.log')) crawl_log = open(os.path.join( warcprox_.options.crawl_log_dir, 'crawl.log'), 'rb').read() @@ -1499,14 +1502,14 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies): assert response.status_code == 200 start = time.time() + file = os.path.join(warcprox_.options.crawl_log_dir, 'test_crawl_log_2.log') while time.time() - start < 10: - if os.path.exists(os.path.join( - warcprox_.options.crawl_log_dir, 'test_crawl_log_2.log')): + if os.path.exists(file) and os.stat(file).st_size > 0: break time.sleep(0.5) + assert os.path.exists(file) - crawl_log_2 = open(os.path.join( - warcprox_.options.crawl_log_dir, 'test_crawl_log_2.log'), 'rb').read() + crawl_log_2 = open(file, 'rb').read() assert re.match(b'\A2[^\n]+\n\Z', crawl_log_2) assert crawl_log_2[24:31] == b' 200 ' @@ -1533,16 +1536,15 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies): headers = {'Warcprox-Meta': json.dumps({'warc-prefix': 'test_crawl_log_3'})} response = requests.head(url, proxies=archiving_proxies, headers=headers) + file = os.path.join(warcprox_.options.crawl_log_dir, 'test_crawl_log_3.log') start = time.time() while time.time() - start < 10: - if os.path.exists(os.path.join( - warcprox_.options.crawl_log_dir, 'test_crawl_log_3.log')): + if os.path.exists(file) and os.stat(file).st_size > 0: break time.sleep(0.5) - crawl_log_3 = open(os.path.join( - warcprox_.options.crawl_log_dir, 'test_crawl_log_3.log'), 'rb').read() - + assert os.path.exists(file) + crawl_log_3 = open(file, 'rb').read() assert re.match(b'\A2[^\n]+\n\Z', crawl_log_3) assert crawl_log_3[24:31] == b' 200 ' assert crawl_log_3[31:42] == b' 0 ' @@ -1575,14 +1577,14 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies): assert response.status_code == 204 start = time.time() + file = os.path.join(warcprox_.options.crawl_log_dir, 'test_crawl_log_4.log') while 
time.time() - start < 10: - if os.path.exists(os.path.join( - warcprox_.options.crawl_log_dir, 'test_crawl_log_4.log')): + if os.path.exists(file) and os.stat(file).st_size > 0: break time.sleep(0.5) - crawl_log_4 = open(os.path.join( - warcprox_.options.crawl_log_dir, 'test_crawl_log_4.log'), 'rb').read() + assert os.path.exists(file) + crawl_log_4 = open(file, 'rb').read() assert re.match(b'\A2[^\n]+\n\Z', crawl_log_4) assert crawl_log_4[24:31] == b' 204 '
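
To make the intent of patches 2 and 3 concrete, here is a minimal standalone sketch (not warcprox code; dechunk() is a hypothetical helper, and chunkify() mirrors the test helper of the same name above). The payload digest must cover the RFC 2616 "entity body": the message body after transfer decoding (de-chunking) but before content decoding (gunzipping). Digesting the raw wire bytes, as the old ProxyingRecorder did, yields a different digest for a chunked response than for the byte-identical unchunked one; digesting after transfer decoding, as the new ProxyingRecordingHTTPResponse.read() override in patch 3 does, yields the same digest either way.

    import hashlib

    def chunkify(buf, chunk_size=13):
        # wrap buf in Transfer-Encoding: chunked framing, like the
        # chunkify() test helper in tests/test_warcprox.py above
        result = b''
        for i in range(0, len(buf), chunk_size):
            chunk = buf[i:i+chunk_size]
            result += ('%x\r\n' % len(chunk)).encode('ascii') + chunk + b'\r\n'
        return result + b'0\r\n\r\n'

    def dechunk(buf):
        # hypothetical minimal chunked-transfer decoder (assumes
        # well-formed input, no trailers); stands in for the transfer
        # decoding http.client performs before the read() override
        # updates payload_digest
        result = b''
        while True:
            size_line, _, buf = buf.partition(b'\r\n')
            size = int(size_line, 16)
            if size == 0:
                return result
            result += buf[:size]
            buf = buf[size+2:]  # skip chunk data plus trailing \r\n

    entity_body = b'I am a possibly content-encoded entity body.\n'
    message_body = chunkify(entity_body)

    # digesting raw wire bytes gives a different answer for the chunked
    # transfer of the very same entity body...
    assert hashlib.sha1(message_body).digest() != hashlib.sha1(entity_body).digest()
    # ...while digesting after transfer decoding matches the unchunked case
    assert hashlib.sha1(dechunk(message_body)).digest() == hashlib.sha1(entity_body).digest()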
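
Patch 4 distills to a reusable polling pattern, sketched here with a hypothetical helper name: os.path.exists() alone races against the writer, which may have created the crawl log file before flushing the log line, so the updated tests also require a nonzero size before reading.

    import os
    import time

    def wait_for_nonempty(path, timeout=10, poll=0.5):
        # hypothetical helper; same logic the updated test_crawl_log()
        # inlines: wait until the file exists AND has some content
        start = time.time()
        while time.time() - start < timeout:
            if os.path.exists(path) and os.stat(path).st_size > 0:
                return True
            time.sleep(poll)
        return False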