From eda0656737e2e3564165d3e615dc2f49a408a085 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Wed, 7 Mar 2018 08:00:18 +0000 Subject: [PATCH] Configurable tmp file max memory size We use `tempfile.SpooledTemporaryFile(max_size=512*1024)` to hold recorded data before writing it to WARC. Data is kept in memory while it is smaller than `max_size` (in bytes); once it grows larger, it is written to disk. We add the option `--tmp-file-max-memory-size` to make this threshold configurable. A higher value means less /tmp disk I/O and higher overall performance, but also increased memory usage. --- warcprox/main.py | 4 ++++ warcprox/mitmproxy.py | 14 +++++++++----- warcprox/warcproxy.py | 2 ++ 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/warcprox/main.py b/warcprox/main.py index 64d01c7..8ff466b 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -166,6 +166,10 @@ def _build_arg_parser(prog='warcprox'): arg_parser.add_argument( '--socket-timeout', dest='socket_timeout', type=float, default=None, help=argparse.SUPPRESS) + # Increasing this value increases memory usage but reduces /tmp disk I/O. + arg_parser.add_argument( + '--tmp-file-max-memory-size', dest='tmp_file_max_memory_size', + type=int, default=512*1024, help=argparse.SUPPRESS) arg_parser.add_argument( '--max-resource-size', dest='max_resource_size', type=int, default=None, help='maximum resource size limit in bytes') diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index 1bbd930..14f26f9 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -73,10 +73,11 @@ class ProxyingRecorder(object): logger = logging.getLogger("warcprox.mitmproxy.ProxyingRecorder") - def __init__(self, fp, proxy_client, digest_algorithm='sha1', url=None): + def __init__(self, fp, proxy_client, digest_algorithm='sha1', url=None, + tmp_file_max_memory_size=524288): self.fp = fp # "The file has no name, and will cease to exist when it is closed."
- self.tempfile = tempfile.SpooledTemporaryFile(max_size=512*1024) + self.tempfile = tempfile.SpooledTemporaryFile(max_size=tmp_file_max_memory_size) self.digest_algorithm = digest_algorithm self.block_digest = hashlib.new(digest_algorithm) self.payload_offset = None @@ -146,7 +147,7 @@ class ProxyingRecordingHTTPResponse(http_client.HTTPResponse): ''' def __init__( self, sock, debuglevel=0, method=None, proxy_client=None, - digest_algorithm='sha1', url=None): + digest_algorithm='sha1', url=None, tmp_file_max_memory_size=None): http_client.HTTPResponse.__init__( self, sock, debuglevel=debuglevel, method=method) self.proxy_client = proxy_client @@ -156,7 +157,8 @@ class ProxyingRecordingHTTPResponse(http_client.HTTPResponse): # Keep around extra reference to self.fp because HTTPResponse sets # self.fp=None after it finishes reading, but we still need it self.recorder = ProxyingRecorder( - self.fp, proxy_client, digest_algorithm, url=url) + self.fp, proxy_client, digest_algorithm, url=url, + tmp_file_max_memory_size=tmp_file_max_memory_size) self.fp = self.recorder self.payload_digest = None @@ -208,6 +210,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): logger = logging.getLogger("warcprox.mitmproxy.MitmProxyHandler") _socket_timeout = 60 _max_resource_size = None + _tmp_file_max_memory_size = 512 * 1024 def __init__(self, request, client_address, server): threading.current_thread().name = 'MitmProxyHandler(tid={},started={},client={}:{})'.format(warcprox.gettid(), datetime.datetime.utcnow().isoformat(), client_address[0], client_address[1]) @@ -425,7 +428,8 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): prox_rec_res = ProxyingRecordingHTTPResponse( self._remote_server_sock, proxy_client=self.connection, digest_algorithm=self.server.digest_algorithm, - url=self.url, method=self.command) + url=self.url, method=self.command, + tmp_file_max_memory_size=self._tmp_file_max_memory_size) 
prox_rec_res.begin(extra_response_headers=extra_response_headers) buf = prox_rec_res.read(65536) diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 5b42655..2aa171c 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -405,6 +405,8 @@ class SingleThreadedWarcProxy(http_server.HTTPServer, object): WarcProxyHandler._socket_timeout = options.socket_timeout if options.max_resource_size: WarcProxyHandler._max_resource_size = options.max_resource_size + if options.tmp_file_max_memory_size: + WarcProxyHandler._tmp_file_max_memory_size = options.tmp_file_max_memory_size http_server.HTTPServer.__init__( self, server_address, WarcProxyHandler, bind_and_activate=True)