From 5ced2588d4cad5dc14bf47ebde124a3eaa3823b8 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Sat, 13 Apr 2019 17:33:38 -0700 Subject: [PATCH 1/2] failing test test_incomplete_read --- tests/test_warcprox.py | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 18bcf37..7e6b19f 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -68,7 +68,6 @@ import certauth.certauth import warcprox import warcprox.main - try: import http.client as http_client except ImportError: @@ -282,6 +281,15 @@ class _TestHttpRequestHandler(http_server.BaseHTTPRequestHandler): payload = b'Test.' actual_headers = (b'Content-Type: text/plain\r\n' + b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n') + elif self.path == '/incomplete-read': + headers = (b'HTTP/1.1 200 OK\r\n' + + b'Content-Type: text/plain\r\n' + + b'Transfer-Encoding: chunked\r\n' + + b'\r\n') + # payload = b'''1\r\na''' + payload = chunkify( + b'Server closes connection when client expects next chunk') + payload = payload[:-7] else: payload = b'404 Not Found\n' headers = (b'HTTP/1.1 404 Not Found\r\n' @@ -295,7 +303,9 @@ class _TestHttpRequestHandler(http_server.BaseHTTPRequestHandler): headers, payload = self.build_response() self.connection.sendall(headers) self.connection.sendall(payload) - if self.path in ('/missing-content-length', '/empty-response'): + if self.path in ( + '/missing-content-length', '/empty-response', + '/incomplete-read'): # server must close the connection, else client has no idea if # there is more data coming self.connection.shutdown(socket.SHUT_RDWR) @@ -1614,13 +1624,11 @@ def test_controller_with_defaults(): assert not wwp.writer_pool.default_warc_writer.record_builder.base32 assert wwp.writer_pool.default_warc_writer.record_builder.digest_algorithm == 'sha1' - class EarlyPlugin(warcprox.BaseStandardPostfetchProcessor): CHAIN_POSITION = 'early' def _process_url(self): pass - def test_load_plugin(): options = warcprox.Options(port=0, plugins=[ 'warcprox.stats.RunningStats', @@ -2226,6 +2234,18 @@ def test_dedup_min_binary_size(http_daemon, warcprox_, archiving_proxies): with pytest.raises(StopIteration): next(rec_iter) +def test_incomplete_read(http_daemon, warcprox_, archiving_proxies): + urls_before = warcprox_.proxy.running_stats.urls + + # see https://github.com/internetarchive/warcprox/pull/123 + url = 'http://localhost:%s/incomplete-read' % http_daemon.server_port + with pytest.raises(requests.exceptions.ChunkedEncodingError): + response = requests.get( + url, proxies=archiving_proxies, verify=False, timeout=10) + + # wait for postfetch chain + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1) + if __name__ == '__main__': pytest.main() From 0d268659abf54c9771cee9dbbe0f755d7f769080 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Sat, 13 Apr 2019 17:46:52 -0700 Subject: [PATCH 2/2] handle incomplete read see Vangelis's writeup at https://github.com/internetarchive/warcprox/pull/123 --- warcprox/mitmproxy.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index a9e7e38..ae6a9f0 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -487,9 +487,14 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): tmp_file_max_memory_size=self._tmp_file_max_memory_size) prox_rec_res.begin(extra_response_headers=extra_response_headers) - buf = prox_rec_res.read(65536) + buf = None while buf != b'': - buf = prox_rec_res.read(65536) + try: + buf = prox_rec_res.read(65536) + except http_client.IncompleteRead as e: + self.logger.warn('%s from %s', e, self.url) + buf = b'' + if (self._max_resource_size and prox_rec_res.recorder.len > self._max_resource_size): prox_rec_res.truncated = b'length'