From 76abe4b7538ba1797350dd7343cb18c4fcc93367 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Mon, 10 Jun 2019 06:26:26 +0000 Subject: [PATCH 1/8] Catch BadStatusLine exception When trying to begin downloading from a remote host, we may get a `RemoteDisconnected` exception if it returns no data. We already handle that. We may also get `BadStatusLine` in case the response HTTP status is not fine. https://github.com/python/cpython/blob/3.7/Lib/http/client.py#L288 We should also add these cases in bad hosts cache. --- warcprox/mitmproxy.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index d6a0593..deb3a52 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -572,9 +572,15 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): # A common error is to connect to the remote server successfully # but raise a `RemoteDisconnected` exception when trying to begin # downloading. Its caused by prox_rec_res.begin(...) which calls - # http_client._read_status(). In that case, the host is also bad - # and we must add it to `bad_hostnames_ports` cache. - if isinstance(e, http_client.RemoteDisconnected): + # http_client._read_status(). The connection fails there. + # https://github.com/python/cpython/blob/3.7/Lib/http/client.py#L275 + # Another case is when the connection is fine but the response + # status is problematic, raising `BadStatusLine`. + # https://github.com/python/cpython/blob/3.7/Lib/http/client.py#L296 + # In both cases, the host is bad and we must add it to + # `bad_hostnames_ports` cache. + if isinstance(e, (http_client.RemoteDisconnected, + http_client.BadStatusLine)): host_port = self._hostname_port_cache_key() with self.server.bad_hostnames_ports_lock: self.server.bad_hostnames_ports[host_port] = 502 From 2d6eefd8c608730bc6443fe992ca9ce9f86bc2c4 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Mon, 10 Jun 2019 06:53:12 +0000 Subject: [PATCH 2/8] Check if connection is still open when trying to close When an exception is raised during network communication with the remote close, we handle it and we close the socket. Some times, the socket is already closed due to the exception and we get an extra `OSError [Errno 107] Transport endpoint is not connected` when trying to shutdown the socket. We add a check to avoid that. --- warcprox/mitmproxy.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index d6a0593..5ef53c1 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -581,8 +581,12 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): self.logger.info('bad_hostnames_ports cache size: %d', len(self.server.bad_hostnames_ports)) - self._remote_server_conn.sock.shutdown(socket.SHUT_RDWR) - self._remote_server_conn.sock.close() + # Close the connection only if its still open. If its already + # closed, an `OSError` "([Errno 107] Transport endpoint is not + # connected)" would be raised. + if not is_connection_dropped(self._remote_server_conn): + self._remote_server_conn.sock.shutdown(socket.SHUT_RDWR) + self._remote_server_conn.sock.close() raise finally: if prox_rec_res: From 4ca10a22d8ffebca9e7fe126c902616486d5e55b Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Tue, 11 Jun 2019 09:31:54 +0000 Subject: [PATCH 3/8] Optimise WarcWriter.maybe_size_rollover() Every time we write WARC records to file, we call `maybe_size_rollover()` to check if the current WARC filesize is over the rollover threshold. We use `os.path.getsize` which does a disk `stat` to do that. We already know the current WARC file size from the WARC record offset (`self.f.tell()`). There is no need to call `os.path.getsize`, we just reuse the offset info. This way, we do one less disk `stat` every time we write to WARC which is a nice improvement. --- warcprox/writer.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/warcprox/writer.py b/warcprox/writer.py index fecb533..730d606 100644 --- a/warcprox/writer.py +++ b/warcprox/writer.py @@ -115,10 +115,8 @@ class WarcWriter: ''' Ensures `self.f` is ready to write the next warc record. - Closes current warc if size limit has been reached. Then, if warc is - not open, opens one, and writes the warcinfo record. + If warc is not open, opens one, and writes the warcinfo record. ''' - self.maybe_size_rollover() if not self.f: serial = self.serial self.serial += 1 @@ -136,11 +134,14 @@ class WarcWriter: records = self.record_builder.build_warc_records(recorded_url) self.ensure_open() + total_warc_file_size = None for record in records: offset = self.f.tell() record.write_to(self.f, gzip=self.gzip) record.offset = offset - record.length = self.f.tell() - offset + offset2 = self.f.tell() + record.length = offset2 - offset + total_warc_file_size = offset2 record.warc_filename = self.finalname self.logger.trace( 'wrote warc record: warc_type=%s content_length=%s ' @@ -150,7 +151,8 @@ class WarcWriter: self.path, record.get_header(warctools.WarcRecord.URL)) self.f.flush() self.last_activity = time.time() - + # Closes current warc if size limit has been reached. + self.maybe_size_rollover(total_warc_file_size) return records def close(self): @@ -185,11 +187,11 @@ class WarcWriter: self.finalname, time.time() - self.last_activity) self.close() - def maybe_size_rollover(self): - if self.path and os.path.getsize(self.path) > self.rollover_size: + def maybe_size_rollover(self, total_warc_file_size): + if total_warc_file_size and total_warc_file_size > self.rollover_size: self.logger.info( 'rolling over %s because it has reached %s bytes in size', - self.finalname, os.path.getsize(self.path)) + self.finalname, total_warc_file_size) self.close() class WarcWriterPool: From 81a945e84055b8840a78d289db965690455cce0a Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 11 Jun 2019 10:58:52 -0700 Subject: [PATCH 4/8] bump version after a few small PRs --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 0b6d756..f1d2347 100755 --- a/setup.py +++ b/setup.py @@ -43,7 +43,7 @@ except: setuptools.setup( name='warcprox', - version='2.4.14', + version='2.4.15', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', From 932001c92115c4881a32d8b8f4d05c00a6efdaa7 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 20 Jun 2019 14:57:36 -0700 Subject: [PATCH 5/8] bump version after merge --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index f1d2347..73462d8 100755 --- a/setup.py +++ b/setup.py @@ -43,7 +43,7 @@ except: setuptools.setup( name='warcprox', - version='2.4.15', + version='2.4.16', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', From fce1c3d72227fab0e411d3f5935387be970060a3 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 26 Jul 2019 14:03:36 -0700 Subject: [PATCH 6/8] requests/urllib3 version conflict from april must be obsolete by now... --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 73462d8..edf955c 100755 --- a/setup.py +++ b/setup.py @@ -28,7 +28,7 @@ deps = [ 'warctools>=4.10.0', 'urlcanon>=0.3.0', 'doublethink>=0.2.0.dev87', - 'urllib3>=1.14,<1.25', + 'urllib3>=1.14', 'requests>=2.0.1', 'PySocks>=1.6.8', 'cryptography>=2.3', @@ -43,7 +43,7 @@ except: setuptools.setup( name='warcprox', - version='2.4.16', + version='2.4.17', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', From 1aa6b0c5d6d53c680a615baadca619a25bc9f064 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 16 Aug 2019 18:31:35 +0000 Subject: [PATCH 7/8] log remote host/ip/port on SSLError --- setup.py | 2 +- warcprox/mitmproxy.py | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index edf955c..8c0415e 100755 --- a/setup.py +++ b/setup.py @@ -43,7 +43,7 @@ except: setuptools.setup( name='warcprox', - version='2.4.17', + version='2.4.18', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index 69d2c63..6a7ce4c 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -276,6 +276,8 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): host=self.hostname, port=int(self.port), scheme='http', pool_kwargs={'maxsize': 12, 'timeout': self._socket_timeout}) + remote_ip = None + self._remote_server_conn = self._conn_pool._get_conn() if is_connection_dropped(self._remote_server_conn): if self.onion_tor_socks_proxy_host and self.hostname.endswith('.onion'): @@ -291,6 +293,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): self._remote_server_conn.sock.connect((self.hostname, int(self.port))) else: self._remote_server_conn.connect() + remote_ip = self._remote_server_conn.sock.getpeername()[0] # Wrap socket if SSL is required if self.is_connect: @@ -312,6 +315,11 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): "consider upgrading to python 2.7.9+ or 3.4+", self.hostname) raise + except ssl.SSLError as e: + self.logger.error( + 'error connecting to %s (%s) port %s: %s', + self.hostname, remote_ip, self.port, e) + raise return self._remote_server_conn.sock def _transition_to_ssl(self): From 32200db7ab546d610a00b28fe804334a49e6387f Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Fri, 13 Sep 2019 11:43:39 -0700 Subject: [PATCH 8/8] log long-running fetches --- warcprox/mitmproxy.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index 6a7ce4c..9bac478 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -561,15 +561,18 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): 'bytes exceeded for URL %s', self._max_resource_size, self.url) break - elif (not 'content-length' in self.headers - and time.time() - start > 3 * 60 * 60): - prox_rec_res.truncated = b'time' - self._remote_server_conn.sock.shutdown(socket.SHUT_RDWR) - self._remote_server_conn.sock.close() - self.logger.info( - 'reached hard timeout of 3 hours fetching url ' - 'without content-length: %s', self.url) - break + elif time.time() - start > 3 * 60 * 60: + if not 'content-length' in self.headers: + prox_rec_res.truncated = b'time' + self._remote_server_conn.sock.shutdown(socket.SHUT_RDWR) + self._remote_server_conn.sock.close() + self.logger.info( + 'reached hard timeout of 3 hours fetching url ' + 'without content-length: %s', self.url) + break + else: + self.logger.info( + 'long-running fetch for URL %s', self.url) self.log_request(prox_rec_res.status, prox_rec_res.recorder.len) # Let's close off the remote end. If remote connection is fine,