mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
new failing test for correct calculation of the payload digest,
which should match the RFC 2616 entity body, i.e. the body after transfer-decoding but before content-decoding
This commit is contained in:
parent
3c215b42b5
commit
30b6b0b337
@ -46,6 +46,10 @@ from collections import Counter
|
|||||||
import socket
|
import socket
|
||||||
import datetime
|
import datetime
|
||||||
import warcio.archiveiterator
|
import warcio.archiveiterator
|
||||||
|
import io
|
||||||
|
import gzip
|
||||||
|
import mock
|
||||||
|
import email.message
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import http.server as http_server
|
import http.server as http_server
|
||||||
@ -84,7 +88,7 @@ def _send(self, data):
|
|||||||
# http_client.HTTPConnection.send = _send
|
# http_client.HTTPConnection.send = _send
|
||||||
|
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
stream=sys.stdout, level=logging.DEBUG, # level=warcprox.TRACE,
|
stream=sys.stdout, level=warcprox.TRACE,
|
||||||
format='%(asctime)s %(process)d %(levelname)s %(threadName)s '
|
format='%(asctime)s %(process)d %(levelname)s %(threadName)s '
|
||||||
'%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')
|
'%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')
|
||||||
logging.getLogger("requests.packages.urllib3").setLevel(logging.WARN)
|
logging.getLogger("requests.packages.urllib3").setLevel(logging.WARN)
|
||||||
@ -134,6 +138,24 @@ def dump_state(signum=None, frame=None):
|
|||||||
|
|
||||||
signal.signal(signal.SIGQUIT, dump_state)
|
signal.signal(signal.SIGQUIT, dump_state)
|
||||||
|
|
||||||
|
def chunkify(buf, chunk_size=13):
    '''
    Encodes `buf` with the HTTP/1.1 "chunked" transfer coding.

    Each chunk is emitted as `<hex length>\\r\\n<data>\\r\\n`, and the
    encoding is terminated by the zero-length last chunk `0\\r\\n\\r\\n`
    (no chunk extensions, no trailers).

    Args:
        buf: bytes to encode; may be empty, in which case only the
            terminating chunk is returned
        chunk_size: maximum number of data bytes per chunk; must be a
            positive integer

    Returns:
        bytes containing the chunked encoding of `buf`

    Raises:
        ValueError: if `chunk_size` is not positive (a non-positive size
            would never advance through `buf`)
    '''
    if chunk_size <= 0:
        raise ValueError(
                'chunk_size must be a positive integer, got %r' % (chunk_size,))

    # collect the pieces and join once, rather than repeatedly
    # concatenating to an ever-growing bytes object
    pieces = []
    for start in range(0, len(buf), chunk_size):
        chunk = buf[start:start + chunk_size]
        pieces.append(('%x\r\n' % len(chunk)).encode('ascii'))
        pieces.append(chunk)
        pieces.append(b'\r\n')
    pieces.append(b'0\r\n\r\n')
    return b''.join(pieces)
|
||||||
|
|
||||||
|
# def gzipify(buf):
|
||||||
|
# with io.BytesIO() as outbuf:
|
||||||
|
# with gzip.GzipFile(fileobj=outbuf, mode='wb') as gz:
|
||||||
|
# gz.write(buf)
|
||||||
|
# return outbuf.getvalue()
|
||||||
|
|
||||||
class _TestHttpRequestHandler(http_server.BaseHTTPRequestHandler):
|
class _TestHttpRequestHandler(http_server.BaseHTTPRequestHandler):
|
||||||
def build_response(self):
|
def build_response(self):
|
||||||
m = re.match(r'^/([^/]+)/([^/]+)$', self.path)
|
m = re.match(r'^/([^/]+)/([^/]+)$', self.path)
|
||||||
@ -150,6 +172,71 @@ class _TestHttpRequestHandler(http_server.BaseHTTPRequestHandler):
|
|||||||
+ b'Content-Type: text/plain\r\n'
|
+ b'Content-Type: text/plain\r\n'
|
||||||
+ b'\r\n')
|
+ b'\r\n')
|
||||||
payload = b'This response is missing a Content-Length http header.'
|
payload = b'This response is missing a Content-Length http header.'
|
||||||
|
elif self.path.startswith('/test_payload_digest-'):
|
||||||
|
content_body = (
|
||||||
|
b'Hello. How are you. I am the test_payload_digest '
|
||||||
|
b'content body. The entity body is a possibly content-'
|
||||||
|
b'encoded version of me. The message body is a possibly '
|
||||||
|
b'transfer-encoded version of the entity body.\n')
|
||||||
|
gzipped = (
|
||||||
|
b"\x1f\x8b\x08\x00jA\x06Z\x02\xffm\x8d1\x0e\xc20\x10\x04{^"
|
||||||
|
b"\xb1\x1f\xc0\xef\x08=}t\x897\xc1\x92\xed\x8b|\x07\xc8"
|
||||||
|
b"\xbf'\n\xa2@J9\xab\x19\xed\xc0\x9c5`\xd07\xa4\x11]\x9f"
|
||||||
|
b"\x017H\x81?\x08\xa7\xf9\xb8I\xcf*q\x8ci\xdd\x11\xb3VguL"
|
||||||
|
b"\x1a{\xc0}\xb7vJ\xde\x8f\x01\xc9 \xd8\xd4,M\xb9\xff\xdc"
|
||||||
|
b"+\xeb\xac\x91\x11/6KZ\xa1\x0b\n\xbfq\xa1\x99\xac<\xab"
|
||||||
|
b"\xbdI\xb5\x85\xed,\xf7\xff\xdfp\xf9\x00\xfc\t\x02\xb0"
|
||||||
|
b"\xc8\x00\x00\x00")
|
||||||
|
double_gzipped = (
|
||||||
|
b"\x1f\x8b\x08\x00jA\x06Z\x02\xff\x01\x89\x00v\xff\x1f\x8b"
|
||||||
|
b"\x08\x00jA\x06Z\x02\xffm\x8d1\x0e\xc20\x10\x04{^\xb1\x1f"
|
||||||
|
b"\xc0\xef\x08=}t\x897\xc1\x92\xed\x8b|\x07\xc8\xbf'\n\xa2"
|
||||||
|
b"@J9\xab\x19\xed\xc0\x9c5`\xd07\xa4\x11]\x9f\x017H\x81?"
|
||||||
|
b"\x08\xa7\xf9\xb8I\xcf*q\x8ci\xdd\x11\xb3VguL\x1a{\xc0}"
|
||||||
|
b"\xb7vJ\xde\x8f\x01\xc9 \xd8\xd4,M\xb9\xff\xdc+\xeb\xac"
|
||||||
|
b"\x91\x11/6KZ\xa1\x0b\n\xbfq\xa1\x99\xac<\xab\xbdI\xb5"
|
||||||
|
b"\x85\xed,\xf7\xff\xdfp\xf9\x00\xfc\t\x02\xb0\xc8\x00\x00"
|
||||||
|
b"\x00\xf9\xdd\x8f\xed\x89\x00\x00\x00")
|
||||||
|
if self.path == '/test_payload_digest-plain':
|
||||||
|
payload = content_body
|
||||||
|
actual_headers = (b'Content-Type: text/plain\r\n'
|
||||||
|
+ b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n')
|
||||||
|
elif self.path == '/test_payload_digest-gzip':
|
||||||
|
payload = gzipped
|
||||||
|
actual_headers = (b'Content-Type: application/gzip\r\n'
|
||||||
|
+ b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n')
|
||||||
|
elif self.path == '/test_payload_digest-ce-gzip':
|
||||||
|
payload = gzipped
|
||||||
|
actual_headers = (b'Content-Type: text/plain\r\n'
|
||||||
|
+ b'Content-Encoding: gzip\r\n'
|
||||||
|
+ b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n')
|
||||||
|
elif self.path == '/test_payload_digest-gzip-ce-gzip':
|
||||||
|
payload = double_gzipped
|
||||||
|
actual_headers = (b'Content-Type: application/gzip\r\n'
|
||||||
|
+ b'Content-Encoding: gzip\r\n'
|
||||||
|
+ b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n')
|
||||||
|
elif self.path == '/test_payload_digest-te-chunked':
|
||||||
|
payload = chunkify(content_body)
|
||||||
|
actual_headers = (b'Content-Type: text/plain\r\n'
|
||||||
|
+ b'Transfer-Encoding: chunked\r\n')
|
||||||
|
elif self.path == '/test_payload_digest-gzip-te-chunked':
|
||||||
|
payload = chunkify(gzipped)
|
||||||
|
actual_headers = (b'Content-Type: application/gzip\r\n'
|
||||||
|
+ b'Transfer-Encoding: chunked\r\n')
|
||||||
|
elif self.path == '/test_payload_digest-ce-gzip-te-chunked':
|
||||||
|
payload = chunkify(gzipped)
|
||||||
|
actual_headers = (b'Content-Type: text/plain\r\n'
|
||||||
|
+ b'Content-Encoding: gzip\r\n'
|
||||||
|
+ b'Transfer-Encoding: chunked\r\n')
|
||||||
|
elif self.path == '/test_payload_digest-gzip-ce-gzip-te-chunked':
|
||||||
|
payload = chunkify(double_gzipped)
|
||||||
|
actual_headers = (b'Content-Type: application/gzip\r\n'
|
||||||
|
+ b'Content-Encoding: gzip\r\n'
|
||||||
|
+ b'Transfer-Encoding: chunked\r\n')
|
||||||
|
else:
|
||||||
|
raise Exception('bad path')
|
||||||
|
headers = b'HTTP/1.1 200 OK\r\n' + actual_headers + b'\r\n'
|
||||||
|
logging.info('headers=%r payload=%r', headers, payload)
|
||||||
else:
|
else:
|
||||||
payload = b'404 Not Found\n'
|
payload = b'404 Not Found\n'
|
||||||
headers = (b'HTTP/1.1 404 Not Found\r\n'
|
headers = (b'HTTP/1.1 404 Not Found\r\n'
|
||||||
@ -1554,6 +1641,66 @@ def test_long_warcprox_meta(
|
|||||||
with pytest.raises(StopIteration):
|
with pytest.raises(StopIteration):
|
||||||
next(rec_iter)
|
next(rec_iter)
|
||||||
|
|
||||||
|
def test_payload_digest(warcprox_, http_daemon):
    '''
    Checks that the recorded payload digest covers the RFC 2616
    "entity body": the response body after transfer-decoding (chunked
    removed) but before content-decoding (gzip left in place).
    '''
    class HalfMockedMitm(warcprox.mitmproxy.MitmProxyHandler):
        # Sets only the attributes assigned below and does not call the
        # base class __init__, so no real client connection is needed.
        def __init__(self, url):
            self.command = 'GET'
            self.path = url
            self.request_version = 'HTTP/1.1'
            self.server = warcprox_.proxy
            self.client_address = mock.MagicMock()
            self.connection = mock.Mock()
            self.headers = email.message.Message()
            self.headers.add_header('Host', 'localhost:%s' % http_daemon.server_port)

    PLAIN_SHA1 = b'sha1:881289333370aa4e3214505f1173423cc5a896b7'
    GZIP_SHA1 = b'sha1:634e25de71ae01edb5c5d9e2e99c4836bbe94129'
    GZIP_GZIP_SHA1 = b'sha1:cecbf3a5c4975072f5e4c5e0489f808ef246c2b4'

    # (url template, expected digest), requested in this order
    cases = [
        # plain
        ('http://localhost:%s/test_payload_digest-plain', PLAIN_SHA1),
        # content-type: application/gzip
        ('http://localhost:%s/test_payload_digest-gzip', GZIP_SHA1),
        # content-encoding: gzip
        ('http://localhost:%s/test_payload_digest-ce-gzip', GZIP_SHA1),
        # content-type: application/gzip && content-encoding: gzip
        ('http://localhost:%s/test_payload_digest-gzip-ce-gzip', GZIP_GZIP_SHA1),
        # chunked plain
        ('http://localhost:%s/test_payload_digest-te-chunked', PLAIN_SHA1),
        # chunked content-type: application/gzip
        ('http://localhost:%s/test_payload_digest-gzip-te-chunked', GZIP_SHA1),
        # chunked content-encoding: gzip
        ('http://localhost:%s/test_payload_digest-ce-gzip-te-chunked', GZIP_SHA1),
        # chunked content-type: application/gzip && content-encoding: gzip
        ('http://localhost:%s/test_payload_digest-gzip-ce-gzip-te-chunked', GZIP_GZIP_SHA1),
    ]

    for url_template, expected_digest in cases:
        mitm = HalfMockedMitm(url_template % http_daemon.server_port)
        _, prox_rec_res = mitm.do_GET()
        assert warcprox.digest_str(prox_rec_res.payload_digest) == expected_digest
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
pytest.main()
|
pytest.main()
|
||||||
|
|
||||||
|
@ -28,7 +28,7 @@ except ImportError:
|
|||||||
import Queue as queue
|
import Queue as queue
|
||||||
import datetime
|
import datetime
|
||||||
|
|
||||||
def digest_str(hash_obj, base32):
|
def digest_str(hash_obj, base32=False):
|
||||||
import base64
|
import base64
|
||||||
return hash_obj.name.encode('utf-8') + b':' + (
|
return hash_obj.name.encode('utf-8') + b':' + (
|
||||||
base64.b32encode(hash_obj.digest()) if base32
|
base64.b32encode(hash_obj.digest()) if base32
|
||||||
|
@ -361,7 +361,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
|||||||
return
|
return
|
||||||
|
|
||||||
try:
|
try:
|
||||||
self._proxy_request()
|
return self._proxy_request()
|
||||||
except:
|
except:
|
||||||
self.logger.error("exception proxying request", exc_info=True)
|
self.logger.error("exception proxying request", exc_info=True)
|
||||||
raise
|
raise
|
||||||
@ -406,6 +406,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
|||||||
if 'Content-Length' in self.headers:
|
if 'Content-Length' in self.headers:
|
||||||
req += self.rfile.read(int(self.headers['Content-Length']))
|
req += self.rfile.read(int(self.headers['Content-Length']))
|
||||||
|
|
||||||
|
prox_rec_res = None
|
||||||
try:
|
try:
|
||||||
self.logger.debug('sending to remote server req=%r', req)
|
self.logger.debug('sending to remote server req=%r', req)
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user