Configurable tmp file max memory size

We use `tempfile.SpooledTemporaryFile(max_size=512*1024)` to hold
recorded data before it is written to WARC.
The data is kept in memory until its size exceeds `max_size`; after that
it is rolled over to a temporary file on disk.

We add the option `--tmp-file-max-memory-size` (in bytes, default
`512*1024`) to make this threshold configurable.
A higher value means less /tmp disk I/O and higher overall performance, but
also increased memory usage.
This commit is contained in:
Vangelis Banos 2018-03-07 08:00:18 +00:00
parent 530aaba461
commit eda0656737
3 changed files with 15 additions and 5 deletions

View File

@ -166,6 +166,10 @@ def _build_arg_parser(prog='warcprox'):
arg_parser.add_argument(
'--socket-timeout', dest='socket_timeout', type=float,
default=None, help=argparse.SUPPRESS)
# Increasing this value increases memory usage but reduces /tmp disk I/O.
arg_parser.add_argument(
'--tmp-file-max-memory-size', dest='tmp_file_max_memory_size',
type=int, default=512*1024, help=argparse.SUPPRESS)
arg_parser.add_argument(
'--max-resource-size', dest='max_resource_size', type=int,
default=None, help='maximum resource size limit in bytes')

View File

@ -73,10 +73,11 @@ class ProxyingRecorder(object):
logger = logging.getLogger("warcprox.mitmproxy.ProxyingRecorder")
def __init__(self, fp, proxy_client, digest_algorithm='sha1', url=None):
def __init__(self, fp, proxy_client, digest_algorithm='sha1', url=None,
tmp_file_max_memory_size=524288):
self.fp = fp
# "The file has no name, and will cease to exist when it is closed."
self.tempfile = tempfile.SpooledTemporaryFile(max_size=512*1024)
self.tempfile = tempfile.SpooledTemporaryFile(max_size=tmp_file_max_memory_size)
self.digest_algorithm = digest_algorithm
self.block_digest = hashlib.new(digest_algorithm)
self.payload_offset = None
@ -146,7 +147,7 @@ class ProxyingRecordingHTTPResponse(http_client.HTTPResponse):
'''
def __init__(
self, sock, debuglevel=0, method=None, proxy_client=None,
digest_algorithm='sha1', url=None):
digest_algorithm='sha1', url=None, tmp_file_max_memory_size=None):
http_client.HTTPResponse.__init__(
self, sock, debuglevel=debuglevel, method=method)
self.proxy_client = proxy_client
@ -156,7 +157,8 @@ class ProxyingRecordingHTTPResponse(http_client.HTTPResponse):
# Keep around extra reference to self.fp because HTTPResponse sets
# self.fp=None after it finishes reading, but we still need it
self.recorder = ProxyingRecorder(
self.fp, proxy_client, digest_algorithm, url=url)
self.fp, proxy_client, digest_algorithm, url=url,
tmp_file_max_memory_size=tmp_file_max_memory_size)
self.fp = self.recorder
self.payload_digest = None
@ -208,6 +210,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
logger = logging.getLogger("warcprox.mitmproxy.MitmProxyHandler")
_socket_timeout = 60
_max_resource_size = None
_tmp_file_max_memory_size = 512 * 1024
def __init__(self, request, client_address, server):
threading.current_thread().name = 'MitmProxyHandler(tid={},started={},client={}:{})'.format(warcprox.gettid(), datetime.datetime.utcnow().isoformat(), client_address[0], client_address[1])
@ -425,7 +428,8 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
prox_rec_res = ProxyingRecordingHTTPResponse(
self._remote_server_sock, proxy_client=self.connection,
digest_algorithm=self.server.digest_algorithm,
url=self.url, method=self.command)
url=self.url, method=self.command,
tmp_file_max_memory_size=self._tmp_file_max_memory_size)
prox_rec_res.begin(extra_response_headers=extra_response_headers)
buf = prox_rec_res.read(65536)

View File

@ -405,6 +405,8 @@ class SingleThreadedWarcProxy(http_server.HTTPServer, object):
WarcProxyHandler._socket_timeout = options.socket_timeout
if options.max_resource_size:
WarcProxyHandler._max_resource_size = options.max_resource_size
if options.tmp_file_max_memory_size:
WarcProxyHandler._tmp_file_max_memory_size = options.tmp_file_max_memory_size
http_server.HTTPServer.__init__(
self, server_address, WarcProxyHandler, bind_and_activate=True)