new failing test for correct calculation of payload digest

The payload digest should match the RFC 2616 entity body, which is
transfer-decoded but not content-decoded.
This commit is contained in:
Noah Levitt 2017-11-10 17:02:33 -08:00
parent 3c215b42b5
commit 30b6b0b337
3 changed files with 151 additions and 3 deletions

View File

@ -46,6 +46,10 @@ from collections import Counter
import socket
import datetime
import warcio.archiveiterator
import io
import gzip
import mock
import email.message
try:
import http.server as http_server
@ -84,7 +88,7 @@ def _send(self, data):
# http_client.HTTPConnection.send = _send
logging.basicConfig(
stream=sys.stdout, level=logging.DEBUG, # level=warcprox.TRACE,
stream=sys.stdout, level=warcprox.TRACE,
format='%(asctime)s %(process)d %(levelname)s %(threadName)s '
'%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')
logging.getLogger("requests.packages.urllib3").setLevel(logging.WARN)
@ -134,6 +138,24 @@ def dump_state(signum=None, frame=None):
signal.signal(signal.SIGQUIT, dump_state)
def chunkify(buf, chunk_size=13):
    '''
    Encodes `buf` with the HTTP/1.1 "chunked" transfer coding.

    Every chunk is written as its length in lowercase hex, CRLF, the chunk
    data, CRLF. The result always ends with the zero-length last chunk
    followed by an empty trailer, so empty input yields only that
    terminator.

    Args:
        buf: bytes to encode
        chunk_size: maximum number of data bytes per chunk, must be >= 1

    Returns:
        bytes: the chunked-encoded body

    Raises:
        ValueError: if `chunk_size` is less than 1, since such a size can
            never make progress through `buf`
    '''
    if chunk_size < 1:
        raise ValueError(
                'chunk_size must be at least 1, got %r' % (chunk_size,))

    # collect the pieces and join once, rather than repeatedly copying an
    # ever-growing bytes object
    pieces = []
    for offset in range(0, len(buf), chunk_size):
        # slicing clamps at the end of buf, so the final chunk may be short
        chunk = buf[offset:offset + chunk_size]
        pieces.append(('%x\r\n' % len(chunk)).encode('ascii'))
        pieces.append(chunk)
        pieces.append(b'\r\n')
    # last-chunk marker plus empty trailer section
    pieces.append(b'0\r\n\r\n')
    return b''.join(pieces)
# def gzipify(buf):
# with io.BytesIO() as outbuf:
# with gzip.GzipFile(fileobj=outbuf, mode='wb') as gz:
# gz.write(buf)
# return outbuf.getvalue()
class _TestHttpRequestHandler(http_server.BaseHTTPRequestHandler):
def build_response(self):
m = re.match(r'^/([^/]+)/([^/]+)$', self.path)
@ -150,6 +172,71 @@ class _TestHttpRequestHandler(http_server.BaseHTTPRequestHandler):
+ b'Content-Type: text/plain\r\n'
+ b'\r\n')
payload = b'This response is missing a Content-Length http header.'
elif self.path.startswith('/test_payload_digest-'):
content_body = (
b'Hello. How are you. I am the test_payload_digest '
b'content body. The entity body is a possibly content-'
b'encoded version of me. The message body is a possibly '
b'transfer-encoded version of the entity body.\n')
gzipped = (
b"\x1f\x8b\x08\x00jA\x06Z\x02\xffm\x8d1\x0e\xc20\x10\x04{^"
b"\xb1\x1f\xc0\xef\x08=}t\x897\xc1\x92\xed\x8b|\x07\xc8"
b"\xbf'\n\xa2@J9\xab\x19\xed\xc0\x9c5`\xd07\xa4\x11]\x9f"
b"\x017H\x81?\x08\xa7\xf9\xb8I\xcf*q\x8ci\xdd\x11\xb3VguL"
b"\x1a{\xc0}\xb7vJ\xde\x8f\x01\xc9 \xd8\xd4,M\xb9\xff\xdc"
b"+\xeb\xac\x91\x11/6KZ\xa1\x0b\n\xbfq\xa1\x99\xac<\xab"
b"\xbdI\xb5\x85\xed,\xf7\xff\xdfp\xf9\x00\xfc\t\x02\xb0"
b"\xc8\x00\x00\x00")
double_gzipped = (
b"\x1f\x8b\x08\x00jA\x06Z\x02\xff\x01\x89\x00v\xff\x1f\x8b"
b"\x08\x00jA\x06Z\x02\xffm\x8d1\x0e\xc20\x10\x04{^\xb1\x1f"
b"\xc0\xef\x08=}t\x897\xc1\x92\xed\x8b|\x07\xc8\xbf'\n\xa2"
b"@J9\xab\x19\xed\xc0\x9c5`\xd07\xa4\x11]\x9f\x017H\x81?"
b"\x08\xa7\xf9\xb8I\xcf*q\x8ci\xdd\x11\xb3VguL\x1a{\xc0}"
b"\xb7vJ\xde\x8f\x01\xc9 \xd8\xd4,M\xb9\xff\xdc+\xeb\xac"
b"\x91\x11/6KZ\xa1\x0b\n\xbfq\xa1\x99\xac<\xab\xbdI\xb5"
b"\x85\xed,\xf7\xff\xdfp\xf9\x00\xfc\t\x02\xb0\xc8\x00\x00"
b"\x00\xf9\xdd\x8f\xed\x89\x00\x00\x00")
if self.path == '/test_payload_digest-plain':
payload = content_body
actual_headers = (b'Content-Type: text/plain\r\n'
+ b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n')
elif self.path == '/test_payload_digest-gzip':
payload = gzipped
actual_headers = (b'Content-Type: application/gzip\r\n'
+ b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n')
elif self.path == '/test_payload_digest-ce-gzip':
payload = gzipped
actual_headers = (b'Content-Type: text/plain\r\n'
+ b'Content-Encoding: gzip\r\n'
+ b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n')
elif self.path == '/test_payload_digest-gzip-ce-gzip':
payload = double_gzipped
actual_headers = (b'Content-Type: application/gzip\r\n'
+ b'Content-Encoding: gzip\r\n'
+ b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n')
elif self.path == '/test_payload_digest-te-chunked':
payload = chunkify(content_body)
actual_headers = (b'Content-Type: text/plain\r\n'
+ b'Transfer-Encoding: chunked\r\n')
elif self.path == '/test_payload_digest-gzip-te-chunked':
payload = chunkify(gzipped)
actual_headers = (b'Content-Type: application/gzip\r\n'
+ b'Transfer-Encoding: chunked\r\n')
elif self.path == '/test_payload_digest-ce-gzip-te-chunked':
payload = chunkify(gzipped)
actual_headers = (b'Content-Type: text/plain\r\n'
+ b'Content-Encoding: gzip\r\n'
+ b'Transfer-Encoding: chunked\r\n')
elif self.path == '/test_payload_digest-gzip-ce-gzip-te-chunked':
payload = chunkify(double_gzipped)
actual_headers = (b'Content-Type: application/gzip\r\n'
+ b'Content-Encoding: gzip\r\n'
+ b'Transfer-Encoding: chunked\r\n')
else:
raise Exception('bad path')
headers = b'HTTP/1.1 200 OK\r\n' + actual_headers + b'\r\n'
logging.info('headers=%r payload=%r', headers, payload)
else:
payload = b'404 Not Found\n'
headers = (b'HTTP/1.1 404 Not Found\r\n'
@ -1554,6 +1641,66 @@ def test_long_warcprox_meta(
with pytest.raises(StopIteration):
next(rec_iter)
def test_payload_digest(warcprox_, http_daemon):
    '''
    Checks that the payload digest covers the RFC2616 "entity body", i.e.
    the message body after transfer-decoding but before content-decoding.
    '''
    port = http_daemon.server_port

    class HalfMockedMitm(warcprox.mitmproxy.MitmProxyHandler):
        # Sets only the attributes listed here, without calling the base
        # class constructor; the client side is replaced by mocks so that
        # do_GET() can be invoked directly from the test.
        def __init__(self, url):
            self.path = url
            self.request_version = 'HTTP/1.1'
            self.client_address = mock.MagicMock()
            self.headers = email.message.Message()
            self.headers.add_header('Host', 'localhost:%s' % port)
            self.server = warcprox_.proxy
            self.command = 'GET'
            self.connection = mock.Mock()

    plain_sha1 = b'sha1:881289333370aa4e3214505f1173423cc5a896b7'
    gzip_sha1 = b'sha1:634e25de71ae01edb5c5d9e2e99c4836bbe94129'
    gzip_gzip_sha1 = b'sha1:cecbf3a5c4975072f5e4c5e0489f808ef246c2b4'

    # (path served by the test http daemon, expected payload digest)
    cases = (
        # plain
        ('/test_payload_digest-plain', plain_sha1),
        # content-type: application/gzip
        ('/test_payload_digest-gzip', gzip_sha1),
        # content-encoding: gzip
        ('/test_payload_digest-ce-gzip', gzip_sha1),
        # content-type: application/gzip && content-encoding: gzip
        ('/test_payload_digest-gzip-ce-gzip', gzip_gzip_sha1),
        # chunked plain
        ('/test_payload_digest-te-chunked', plain_sha1),
        # chunked content-type: application/gzip
        ('/test_payload_digest-gzip-te-chunked', gzip_sha1),
        # chunked content-encoding: gzip
        ('/test_payload_digest-ce-gzip-te-chunked', gzip_sha1),
        # chunked content-type: application/gzip && content-encoding: gzip
        ('/test_payload_digest-gzip-ce-gzip-te-chunked', gzip_gzip_sha1),
    )

    base_url = 'http://localhost:%s' % port
    for path, expected_digest in cases:
        handler = HalfMockedMitm(base_url + path)
        _, prox_rec_res = handler.do_GET()
        actual_digest = warcprox.digest_str(prox_rec_res.payload_digest)
        assert actual_digest == expected_digest
if __name__ == '__main__':
pytest.main()

View File

@ -28,7 +28,7 @@ except ImportError:
import Queue as queue
import datetime
def digest_str(hash_obj, base32):
def digest_str(hash_obj, base32=False):
import base64
return hash_obj.name.encode('utf-8') + b':' + (
base64.b32encode(hash_obj.digest()) if base32

View File

@ -361,7 +361,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
return
try:
self._proxy_request()
return self._proxy_request()
except:
self.logger.error("exception proxying request", exc_info=True)
raise
@ -406,6 +406,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
if 'Content-Length' in self.headers:
req += self.rfile.read(int(self.headers['Content-Length']))
prox_rec_res = None
try:
self.logger.debug('sending to remote server req=%r', req)