From bb50a6c7ff75d7b6350b79c47fb0037b6ca96891 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 12 Nov 2018 15:11:23 -0800 Subject: [PATCH 1/2] use predictable id in service registry so that when warcprox restarts it replaces the obsolete entry --- setup.py | 2 +- tests/test_warcprox.py | 3 +++ warcprox/controller.py | 5 ++++- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index a948dec..7b068f6 100755 --- a/setup.py +++ b/setup.py @@ -40,7 +40,7 @@ except: setuptools.setup( name='warcprox', - version='2.4b3.dev190', + version='2.4b3.dev191', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 031db85..a69553d 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1576,8 +1576,11 @@ def test_svcreg_status(warcprox_): 'rates_15min', 'active_requests','start_time','urls_processed', 'warc_bytes_written', 'postfetch_chain', 'earliest_still_active_fetch_start',} + assert status['id'] == 'warcprox:%s:%s' % ( + socket.gethostname(), warcprox_.proxy.server_port) assert status['role'] == 'warcprox' assert status['version'] == warcprox.__version__ + assert status['host'] == socket.gethostname() assert status['port'] == warcprox_.proxy.server_port assert status['pid'] == os.getpid() assert status['threads'] == warcprox_.proxy.pool._max_workers diff --git a/warcprox/controller.py b/warcprox/controller.py index 1eed21b..80eca1c 100644 --- a/warcprox/controller.py +++ b/warcprox/controller.py @@ -36,6 +36,7 @@ import functools import doublethink import importlib import queue +import socket class Factory: @staticmethod @@ -319,13 +320,15 @@ class WarcproxController(object): status_info = self.status_info else: status_info = { + 'id': 'warcprox:%s:%s' % ( + socket.gethostname(), self.proxy.server_port), 'role': 'warcprox', 'version': warcprox.__version__, 'ttl': self.HEARTBEAT_INTERVAL * 3, + 'host': socket.gethostname(), 'port': self.proxy.server_port, } status_info.update(self.proxy.status()) - self.status_info = self.service_registry.heartbeat(status_info) self.logger.trace('status in service registry: %s', self.status_info) From 1ea8a06a69b5a786ef7a0cf71a311c102ef800b4 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 12 Nov 2018 15:57:37 -0800 Subject: [PATCH 2/2] 3 hour hard timeout on urls without content-length so that indefinite streams like icecast radio stations don't hang forever --- setup.py | 2 +- warcprox/mitmproxy.py | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 7b068f6..27dde45 100755 --- a/setup.py +++ b/setup.py @@ -40,7 +40,7 @@ except: setuptools.setup( name='warcprox', - version='2.4b3.dev191', + version='2.4b3.dev192', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index 7a7751e..1fc0c72 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -464,6 +464,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): req += self.rfile.read(int(self.headers['Content-Length'])) prox_rec_res = None + start = time.time() try: self.logger.debug('sending to remote server req=%r', req) @@ -490,6 +491,15 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): 'bytes exceeded for URL %s', self._max_resource_size, self.url) break + elif (not 'content-length' in self.headers + and time.time() - start > 3 * 60 * 60): + prox_rec_res.truncated = b'time' + self._remote_server_conn.sock.shutdown(socket.SHUT_RDWR) + self._remote_server_conn.sock.close() + self.logger.info( + 'reached hard timeout of 3 hours fetching url ' + 'without content-length: %s', self.url) + break self.log_request(prox_rec_res.status, prox_rec_res.recorder.len) # Let's close off the remote end. If remote connection is fine,