From 30b6b0b337782c1f5ce918355f9009770bbae6a2 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 10 Nov 2017 17:02:33 -0800 Subject: [PATCH] new failing test for correct calculation of payload digest which should match rfc2616 entity body, which is transfer decoded but not content-decoded --- tests/test_warcprox.py | 149 ++++++++++++++++++++++++++++++++++++++++- warcprox/__init__.py | 2 +- warcprox/mitmproxy.py | 3 +- 3 files changed, 151 insertions(+), 3 deletions(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 0a357b2..4d1caab 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -46,6 +46,10 @@ from collections import Counter import socket import datetime import warcio.archiveiterator +import io +import gzip +import mock +import email.message try: import http.server as http_server @@ -84,7 +88,7 @@ def _send(self, data): # http_client.HTTPConnection.send = _send logging.basicConfig( - stream=sys.stdout, level=logging.DEBUG, # level=warcprox.TRACE, + stream=sys.stdout, level=warcprox.TRACE, format='%(asctime)s %(process)d %(levelname)s %(threadName)s ' '%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s') logging.getLogger("requests.packages.urllib3").setLevel(logging.WARN) @@ -134,6 +138,24 @@ def dump_state(signum=None, frame=None): signal.signal(signal.SIGQUIT, dump_state) +def chunkify(buf, chunk_size=13): + i = 0 + result = b'' + while i < len(buf): + chunk_len = min(len(buf) - i, chunk_size) + result += ('%x\r\n' % chunk_len).encode('ascii') + result += buf[i:i+chunk_len] + result += b'\r\n' + i += chunk_size + result += b'0\r\n\r\n' + return result + +# def gzipify(buf): +# with io.BytesIO() as outbuf: +# with gzip.GzipFile(fileobj=outbuf, mode='wb') as gz: +# gz.write(buf) +# return outbuf.getvalue() + class _TestHttpRequestHandler(http_server.BaseHTTPRequestHandler): def build_response(self): m = re.match(r'^/([^/]+)/([^/]+)$', self.path) @@ -150,6 +172,71 @@ class _TestHttpRequestHandler(http_server.BaseHTTPRequestHandler): + b'Content-Type: text/plain\r\n' + b'\r\n') payload = b'This response is missing a Content-Length http header.' + elif self.path.startswith('/test_payload_digest-'): + content_body = ( + b'Hello. How are you. I am the test_payload_digest ' + b'content body. The entity body is a possibly content-' + b'encoded version of me. The message body is a possibly ' + b'transfer-encoded version of the entity body.\n') + gzipped = ( + b"\x1f\x8b\x08\x00jA\x06Z\x02\xffm\x8d1\x0e\xc20\x10\x04{^" + b"\xb1\x1f\xc0\xef\x08=}t\x897\xc1\x92\xed\x8b|\x07\xc8" + b"\xbf'\n\xa2@J9\xab\x19\xed\xc0\x9c5`\xd07\xa4\x11]\x9f" + b"\x017H\x81?\x08\xa7\xf9\xb8I\xcf*q\x8ci\xdd\x11\xb3VguL" + b"\x1a{\xc0}\xb7vJ\xde\x8f\x01\xc9 \xd8\xd4,M\xb9\xff\xdc" + b"+\xeb\xac\x91\x11/6KZ\xa1\x0b\n\xbfq\xa1\x99\xac<\xab" + b"\xbdI\xb5\x85\xed,\xf7\xff\xdfp\xf9\x00\xfc\t\x02\xb0" + b"\xc8\x00\x00\x00") + double_gzipped = ( + b"\x1f\x8b\x08\x00jA\x06Z\x02\xff\x01\x89\x00v\xff\x1f\x8b" + b"\x08\x00jA\x06Z\x02\xffm\x8d1\x0e\xc20\x10\x04{^\xb1\x1f" + b"\xc0\xef\x08=}t\x897\xc1\x92\xed\x8b|\x07\xc8\xbf'\n\xa2" + b"@J9\xab\x19\xed\xc0\x9c5`\xd07\xa4\x11]\x9f\x017H\x81?" + b"\x08\xa7\xf9\xb8I\xcf*q\x8ci\xdd\x11\xb3VguL\x1a{\xc0}" + b"\xb7vJ\xde\x8f\x01\xc9 \xd8\xd4,M\xb9\xff\xdc+\xeb\xac" + b"\x91\x11/6KZ\xa1\x0b\n\xbfq\xa1\x99\xac<\xab\xbdI\xb5" + b"\x85\xed,\xf7\xff\xdfp\xf9\x00\xfc\t\x02\xb0\xc8\x00\x00" + b"\x00\xf9\xdd\x8f\xed\x89\x00\x00\x00") + if self.path == '/test_payload_digest-plain': + payload = content_body + actual_headers = (b'Content-Type: text/plain\r\n' + + b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n') + elif self.path == '/test_payload_digest-gzip': + payload = gzipped + actual_headers = (b'Content-Type: application/gzip\r\n' + + b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n') + elif self.path == '/test_payload_digest-ce-gzip': + payload = gzipped + actual_headers = (b'Content-Type: text/plain\r\n' + + b'Content-Encoding: gzip\r\n' + + b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n') + elif self.path == '/test_payload_digest-gzip-ce-gzip': + payload = double_gzipped + actual_headers = (b'Content-Type: application/gzip\r\n' + + b'Content-Encoding: gzip\r\n' + + b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n') + elif self.path == '/test_payload_digest-te-chunked': + payload = chunkify(content_body) + actual_headers = (b'Content-Type: text/plain\r\n' + + b'Transfer-Encoding: chunked\r\n') + elif self.path == '/test_payload_digest-gzip-te-chunked': + payload = chunkify(gzipped) + actual_headers = (b'Content-Type: application/gzip\r\n' + + b'Transfer-Encoding: chunked\r\n') + elif self.path == '/test_payload_digest-ce-gzip-te-chunked': + payload = chunkify(gzipped) + actual_headers = (b'Content-Type: text/plain\r\n' + + b'Content-Encoding: gzip\r\n' + + b'Transfer-Encoding: chunked\r\n') + elif self.path == '/test_payload_digest-gzip-ce-gzip-te-chunked': + payload = chunkify(double_gzipped) + actual_headers = (b'Content-Type: application/gzip\r\n' + + b'Content-Encoding: gzip\r\n' + + b'Transfer-Encoding: chunked\r\n') + else: + raise Exception('bad path') + headers = b'HTTP/1.1 200 OK\r\n' + actual_headers + b'\r\n' + logging.info('headers=%r payload=%r', headers, payload) else: payload = b'404 Not Found\n' headers = (b'HTTP/1.1 404 Not Found\r\n' @@ -1554,6 +1641,66 @@ def test_long_warcprox_meta( with pytest.raises(StopIteration): next(rec_iter) +def test_payload_digest(warcprox_, http_daemon): + ''' + Tests that digest is of RFC2616 "entity body" + (transfer-decoded but not content-decoded) + ''' + class HalfMockedMitm(warcprox.mitmproxy.MitmProxyHandler): + def __init__(self, url): + self.path = url + self.request_version = 'HTTP/1.1' + self.client_address = mock.MagicMock() + self.headers = email.message.Message() + self.headers.add_header('Host', 'localhost:%s' % http_daemon.server_port) + self.server = warcprox_.proxy + self.command = 'GET' + self.connection = mock.Mock() + + PLAIN_SHA1 = b'sha1:881289333370aa4e3214505f1173423cc5a896b7' + GZIP_SHA1 = b'sha1:634e25de71ae01edb5c5d9e2e99c4836bbe94129' + GZIP_GZIP_SHA1 = b'sha1:cecbf3a5c4975072f5e4c5e0489f808ef246c2b4' + + # plain + mitm = HalfMockedMitm('http://localhost:%s/test_payload_digest-plain' % http_daemon.server_port) + req, prox_rec_res = mitm.do_GET() + assert warcprox.digest_str(prox_rec_res.payload_digest) == PLAIN_SHA1 + + # content-type: application/gzip + mitm = HalfMockedMitm('http://localhost:%s/test_payload_digest-gzip' % http_daemon.server_port) + req, prox_rec_res = mitm.do_GET() + assert warcprox.digest_str(prox_rec_res.payload_digest) == GZIP_SHA1 + + # content-encoding: gzip + mitm = HalfMockedMitm('http://localhost:%s/test_payload_digest-ce-gzip' % http_daemon.server_port) + req, prox_rec_res = mitm.do_GET() + assert warcprox.digest_str(prox_rec_res.payload_digest) == GZIP_SHA1 + + # content-type: application/gzip && content-encoding: gzip + mitm = HalfMockedMitm('http://localhost:%s/test_payload_digest-gzip-ce-gzip' % http_daemon.server_port) + req, prox_rec_res = mitm.do_GET() + assert warcprox.digest_str(prox_rec_res.payload_digest) == GZIP_GZIP_SHA1 + + # chunked plain + mitm = HalfMockedMitm('http://localhost:%s/test_payload_digest-te-chunked' % http_daemon.server_port) + req, prox_rec_res = mitm.do_GET() + assert warcprox.digest_str(prox_rec_res.payload_digest) == PLAIN_SHA1 + + # chunked content-type: application/gzip + mitm = HalfMockedMitm('http://localhost:%s/test_payload_digest-gzip-te-chunked' % http_daemon.server_port) + req, prox_rec_res = mitm.do_GET() + assert warcprox.digest_str(prox_rec_res.payload_digest) == GZIP_SHA1 + + # chunked content-encoding: gzip + mitm = HalfMockedMitm('http://localhost:%s/test_payload_digest-ce-gzip-te-chunked' % http_daemon.server_port) + req, prox_rec_res = mitm.do_GET() + assert warcprox.digest_str(prox_rec_res.payload_digest) == GZIP_SHA1 + + # chunked content-type: application/gzip && content-encoding: gzip + mitm = HalfMockedMitm('http://localhost:%s/test_payload_digest-gzip-ce-gzip-te-chunked' % http_daemon.server_port) + req, prox_rec_res = mitm.do_GET() + assert warcprox.digest_str(prox_rec_res.payload_digest) == GZIP_GZIP_SHA1 + if __name__ == '__main__': pytest.main() diff --git a/warcprox/__init__.py b/warcprox/__init__.py index ecd6f53..e50a415 100644 --- a/warcprox/__init__.py +++ b/warcprox/__init__.py @@ -28,7 +28,7 @@ except ImportError: import Queue as queue import datetime -def digest_str(hash_obj, base32): +def digest_str(hash_obj, base32=False): import base64 return hash_obj.name.encode('utf-8') + b':' + ( base64.b32encode(hash_obj.digest()) if base32 diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index b14cddf..722311b 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -361,7 +361,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): return try: - self._proxy_request() + return self._proxy_request() except: self.logger.error("exception proxying request", exc_info=True) raise @@ -406,6 +406,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): if 'Content-Length' in self.headers: req += self.rfile.read(int(self.headers['Content-Length'])) + prox_rec_res = None try: self.logger.debug('sending to remote server req=%r', req)