mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
New failing test for correct calculation of the payload digest,
which should match the RFC 2616 entity body (transfer-decoded but not content-decoded).
This commit is contained in:
parent
3c215b42b5
commit
30b6b0b337
@ -46,6 +46,10 @@ from collections import Counter
|
||||
import socket
|
||||
import datetime
|
||||
import warcio.archiveiterator
|
||||
import io
|
||||
import gzip
|
||||
import mock
|
||||
import email.message
|
||||
|
||||
try:
|
||||
import http.server as http_server
|
||||
@ -84,7 +88,7 @@ def _send(self, data):
|
||||
# http_client.HTTPConnection.send = _send
|
||||
|
||||
logging.basicConfig(
|
||||
stream=sys.stdout, level=logging.DEBUG, # level=warcprox.TRACE,
|
||||
stream=sys.stdout, level=warcprox.TRACE,
|
||||
format='%(asctime)s %(process)d %(levelname)s %(threadName)s '
|
||||
'%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')
|
||||
logging.getLogger("requests.packages.urllib3").setLevel(logging.WARN)
|
||||
@ -134,6 +138,24 @@ def dump_state(signum=None, frame=None):
|
||||
|
||||
signal.signal(signal.SIGQUIT, dump_state)
|
||||
|
||||
def chunkify(buf, chunk_size=13):
    '''
    Encodes `buf` with HTTP/1.1 "chunked" transfer coding.

    Each chunk is written as its length in lowercase hex, CRLF, the chunk
    bytes, CRLF. The output always ends with the zero-length last chunk
    `0\\r\\n\\r\\n`, so an empty `buf` yields just that terminator.

    Args:
        buf: bytes-like payload to encode
        chunk_size: maximum number of payload bytes per chunk; must be a
            positive integer

    Returns:
        bytes: the chunked encoding of `buf`

    Raises:
        ValueError: if `chunk_size` is less than 1
    '''
    # A non-positive chunk_size would never advance through buf, so the
    # encoding loop would never finish; fail fast instead.
    if chunk_size < 1:
        raise ValueError(
                'chunk_size must be a positive integer, got %r' % (chunk_size,))

    # Collect the pieces and join once rather than growing a bytes object
    # with += on every chunk.
    pieces = []
    for offset in range(0, len(buf), chunk_size):
        chunk = buf[offset:offset + chunk_size]
        pieces.append(('%x\r\n' % len(chunk)).encode('ascii'))
        pieces.append(chunk)
        pieces.append(b'\r\n')
    pieces.append(b'0\r\n\r\n')
    return b''.join(pieces)
|
||||
|
||||
# def gzipify(buf):
|
||||
# with io.BytesIO() as outbuf:
|
||||
# with gzip.GzipFile(fileobj=outbuf, mode='wb') as gz:
|
||||
# gz.write(buf)
|
||||
# return outbuf.getvalue()
|
||||
|
||||
class _TestHttpRequestHandler(http_server.BaseHTTPRequestHandler):
|
||||
def build_response(self):
|
||||
m = re.match(r'^/([^/]+)/([^/]+)$', self.path)
|
||||
@ -150,6 +172,71 @@ class _TestHttpRequestHandler(http_server.BaseHTTPRequestHandler):
|
||||
+ b'Content-Type: text/plain\r\n'
|
||||
+ b'\r\n')
|
||||
payload = b'This response is missing a Content-Length http header.'
|
||||
elif self.path.startswith('/test_payload_digest-'):
|
||||
content_body = (
|
||||
b'Hello. How are you. I am the test_payload_digest '
|
||||
b'content body. The entity body is a possibly content-'
|
||||
b'encoded version of me. The message body is a possibly '
|
||||
b'transfer-encoded version of the entity body.\n')
|
||||
gzipped = (
|
||||
b"\x1f\x8b\x08\x00jA\x06Z\x02\xffm\x8d1\x0e\xc20\x10\x04{^"
|
||||
b"\xb1\x1f\xc0\xef\x08=}t\x897\xc1\x92\xed\x8b|\x07\xc8"
|
||||
b"\xbf'\n\xa2@J9\xab\x19\xed\xc0\x9c5`\xd07\xa4\x11]\x9f"
|
||||
b"\x017H\x81?\x08\xa7\xf9\xb8I\xcf*q\x8ci\xdd\x11\xb3VguL"
|
||||
b"\x1a{\xc0}\xb7vJ\xde\x8f\x01\xc9 \xd8\xd4,M\xb9\xff\xdc"
|
||||
b"+\xeb\xac\x91\x11/6KZ\xa1\x0b\n\xbfq\xa1\x99\xac<\xab"
|
||||
b"\xbdI\xb5\x85\xed,\xf7\xff\xdfp\xf9\x00\xfc\t\x02\xb0"
|
||||
b"\xc8\x00\x00\x00")
|
||||
double_gzipped = (
|
||||
b"\x1f\x8b\x08\x00jA\x06Z\x02\xff\x01\x89\x00v\xff\x1f\x8b"
|
||||
b"\x08\x00jA\x06Z\x02\xffm\x8d1\x0e\xc20\x10\x04{^\xb1\x1f"
|
||||
b"\xc0\xef\x08=}t\x897\xc1\x92\xed\x8b|\x07\xc8\xbf'\n\xa2"
|
||||
b"@J9\xab\x19\xed\xc0\x9c5`\xd07\xa4\x11]\x9f\x017H\x81?"
|
||||
b"\x08\xa7\xf9\xb8I\xcf*q\x8ci\xdd\x11\xb3VguL\x1a{\xc0}"
|
||||
b"\xb7vJ\xde\x8f\x01\xc9 \xd8\xd4,M\xb9\xff\xdc+\xeb\xac"
|
||||
b"\x91\x11/6KZ\xa1\x0b\n\xbfq\xa1\x99\xac<\xab\xbdI\xb5"
|
||||
b"\x85\xed,\xf7\xff\xdfp\xf9\x00\xfc\t\x02\xb0\xc8\x00\x00"
|
||||
b"\x00\xf9\xdd\x8f\xed\x89\x00\x00\x00")
|
||||
if self.path == '/test_payload_digest-plain':
|
||||
payload = content_body
|
||||
actual_headers = (b'Content-Type: text/plain\r\n'
|
||||
+ b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n')
|
||||
elif self.path == '/test_payload_digest-gzip':
|
||||
payload = gzipped
|
||||
actual_headers = (b'Content-Type: application/gzip\r\n'
|
||||
+ b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n')
|
||||
elif self.path == '/test_payload_digest-ce-gzip':
|
||||
payload = gzipped
|
||||
actual_headers = (b'Content-Type: text/plain\r\n'
|
||||
+ b'Content-Encoding: gzip\r\n'
|
||||
+ b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n')
|
||||
elif self.path == '/test_payload_digest-gzip-ce-gzip':
|
||||
payload = double_gzipped
|
||||
actual_headers = (b'Content-Type: application/gzip\r\n'
|
||||
+ b'Content-Encoding: gzip\r\n'
|
||||
+ b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n')
|
||||
elif self.path == '/test_payload_digest-te-chunked':
|
||||
payload = chunkify(content_body)
|
||||
actual_headers = (b'Content-Type: text/plain\r\n'
|
||||
+ b'Transfer-Encoding: chunked\r\n')
|
||||
elif self.path == '/test_payload_digest-gzip-te-chunked':
|
||||
payload = chunkify(gzipped)
|
||||
actual_headers = (b'Content-Type: application/gzip\r\n'
|
||||
+ b'Transfer-Encoding: chunked\r\n')
|
||||
elif self.path == '/test_payload_digest-ce-gzip-te-chunked':
|
||||
payload = chunkify(gzipped)
|
||||
actual_headers = (b'Content-Type: text/plain\r\n'
|
||||
+ b'Content-Encoding: gzip\r\n'
|
||||
+ b'Transfer-Encoding: chunked\r\n')
|
||||
elif self.path == '/test_payload_digest-gzip-ce-gzip-te-chunked':
|
||||
payload = chunkify(double_gzipped)
|
||||
actual_headers = (b'Content-Type: application/gzip\r\n'
|
||||
+ b'Content-Encoding: gzip\r\n'
|
||||
+ b'Transfer-Encoding: chunked\r\n')
|
||||
else:
|
||||
raise Exception('bad path')
|
||||
headers = b'HTTP/1.1 200 OK\r\n' + actual_headers + b'\r\n'
|
||||
logging.info('headers=%r payload=%r', headers, payload)
|
||||
else:
|
||||
payload = b'404 Not Found\n'
|
||||
headers = (b'HTTP/1.1 404 Not Found\r\n'
|
||||
@ -1554,6 +1641,66 @@ def test_long_warcprox_meta(
|
||||
with pytest.raises(StopIteration):
|
||||
next(rec_iter)
|
||||
|
||||
def test_payload_digest(warcprox_, http_daemon):
    '''
    Tests that digest is of RFC2616 "entity body"
    (transfer-decoded but not content-decoded)

    Requests each `/test_payload_digest-*` path from the test http server
    through a partially mocked proxy handler and compares the recorded
    payload digest with the expected sha1 for that case.
    '''
    class HalfMockedMitm(warcprox.mitmproxy.MitmProxyHandler):
        # Sets the handler attributes directly instead of calling the base
        # class constructor; the client side (`client_address`,
        # `connection`) is replaced with mocks.
        # NOTE(review): presumably this lets do_GET() run without a real
        # client socket -- confirm against MitmProxyHandler.
        def __init__(self, url):
            self.path = url
            self.request_version = 'HTTP/1.1'
            self.client_address = mock.MagicMock()
            self.headers = email.message.Message()
            self.headers.add_header('Host', 'localhost:%s' % http_daemon.server_port)
            self.server = warcprox_.proxy
            self.command = 'GET'
            self.connection = mock.Mock()

    PLAIN_SHA1 = b'sha1:881289333370aa4e3214505f1173423cc5a896b7'
    GZIP_SHA1 = b'sha1:634e25de71ae01edb5c5d9e2e99c4836bbe94129'
    GZIP_GZIP_SHA1 = b'sha1:cecbf3a5c4975072f5e4c5e0489f808ef246c2b4'

    # (request path, expected payload digest), in the order the requests
    # are made. Each chunked case expects the same digest as its
    # non-chunked counterpart, and the content-encoding: gzip cases expect
    # the same digest as the plain application/gzip case.
    cases = [
        # plain
        ('/test_payload_digest-plain', PLAIN_SHA1),
        # content-type: application/gzip
        ('/test_payload_digest-gzip', GZIP_SHA1),
        # content-encoding: gzip
        ('/test_payload_digest-ce-gzip', GZIP_SHA1),
        # content-type: application/gzip && content-encoding: gzip
        ('/test_payload_digest-gzip-ce-gzip', GZIP_GZIP_SHA1),
        # chunked plain
        ('/test_payload_digest-te-chunked', PLAIN_SHA1),
        # chunked content-type: application/gzip
        ('/test_payload_digest-gzip-te-chunked', GZIP_SHA1),
        # chunked content-encoding: gzip
        ('/test_payload_digest-ce-gzip-te-chunked', GZIP_SHA1),
        # chunked content-type: application/gzip && content-encoding: gzip
        ('/test_payload_digest-gzip-ce-gzip-te-chunked', GZIP_GZIP_SHA1),
    ]

    for path, expected_digest in cases:
        url = 'http://localhost:%s%s' % (http_daemon.server_port, path)
        mitm = HalfMockedMitm(url)
        # do_GET() returns a (request, recorded response) pair; only the
        # recorded response is needed here.
        _, prox_rec_res = mitm.do_GET()
        # The path is the assertion message so a failure names the case.
        assert warcprox.digest_str(
                prox_rec_res.payload_digest) == expected_digest, path
|
||||
|
||||
if __name__ == '__main__':
|
||||
pytest.main()
|
||||
|
||||
|
@ -28,7 +28,7 @@ except ImportError:
|
||||
import Queue as queue
|
||||
import datetime
|
||||
|
||||
def digest_str(hash_obj, base32):
|
||||
def digest_str(hash_obj, base32=False):
|
||||
import base64
|
||||
return hash_obj.name.encode('utf-8') + b':' + (
|
||||
base64.b32encode(hash_obj.digest()) if base32
|
||||
|
@ -361,7 +361,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
||||
return
|
||||
|
||||
try:
|
||||
self._proxy_request()
|
||||
return self._proxy_request()
|
||||
except:
|
||||
self.logger.error("exception proxying request", exc_info=True)
|
||||
raise
|
||||
@ -406,6 +406,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
||||
if 'Content-Length' in self.headers:
|
||||
req += self.rfile.read(int(self.headers['Content-Length']))
|
||||
|
||||
prox_rec_res = None
|
||||
try:
|
||||
self.logger.debug('sending to remote server req=%r', req)
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user