mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
Configurable tmp file max memory size
We use `tempfile.SpooledTemporaryFile(max_size=512*1024)` to hold recorded data before writing it to the WARC. The data is kept in memory while it is smaller than `max_size`; otherwise it is written to disk. This commit adds the option `--tmp-file-max-memory-size` to make that threshold configurable. A higher value means less /tmp disk I/O and higher overall performance, but also increased memory usage.
This commit is contained in:
parent
530aaba461
commit
eda0656737
@ -166,6 +166,10 @@ def _build_arg_parser(prog='warcprox'):
|
||||
arg_parser.add_argument(
|
||||
'--socket-timeout', dest='socket_timeout', type=float,
|
||||
default=None, help=argparse.SUPPRESS)
|
||||
# Increasing this value increases memory usage but reduces /tmp disk I/O.
|
||||
arg_parser.add_argument(
|
||||
'--tmp-file-max-memory-size', dest='tmp_file_max_memory_size',
|
||||
type=int, default=512*1024, help=argparse.SUPPRESS)
|
||||
arg_parser.add_argument(
|
||||
'--max-resource-size', dest='max_resource_size', type=int,
|
||||
default=None, help='maximum resource size limit in bytes')
|
||||
|
@ -73,10 +73,11 @@ class ProxyingRecorder(object):
|
||||
|
||||
logger = logging.getLogger("warcprox.mitmproxy.ProxyingRecorder")
|
||||
|
||||
def __init__(self, fp, proxy_client, digest_algorithm='sha1', url=None):
|
||||
def __init__(self, fp, proxy_client, digest_algorithm='sha1', url=None,
|
||||
tmp_file_max_memory_size=524288):
|
||||
self.fp = fp
|
||||
# "The file has no name, and will cease to exist when it is closed."
|
||||
self.tempfile = tempfile.SpooledTemporaryFile(max_size=512*1024)
|
||||
self.tempfile = tempfile.SpooledTemporaryFile(max_size=tmp_file_max_memory_size)
|
||||
self.digest_algorithm = digest_algorithm
|
||||
self.block_digest = hashlib.new(digest_algorithm)
|
||||
self.payload_offset = None
|
||||
@ -146,7 +147,7 @@ class ProxyingRecordingHTTPResponse(http_client.HTTPResponse):
|
||||
'''
|
||||
def __init__(
|
||||
self, sock, debuglevel=0, method=None, proxy_client=None,
|
||||
digest_algorithm='sha1', url=None):
|
||||
digest_algorithm='sha1', url=None, tmp_file_max_memory_size=None):
|
||||
http_client.HTTPResponse.__init__(
|
||||
self, sock, debuglevel=debuglevel, method=method)
|
||||
self.proxy_client = proxy_client
|
||||
@ -156,7 +157,8 @@ class ProxyingRecordingHTTPResponse(http_client.HTTPResponse):
|
||||
# Keep around extra reference to self.fp because HTTPResponse sets
|
||||
# self.fp=None after it finishes reading, but we still need it
|
||||
self.recorder = ProxyingRecorder(
|
||||
self.fp, proxy_client, digest_algorithm, url=url)
|
||||
self.fp, proxy_client, digest_algorithm, url=url,
|
||||
tmp_file_max_memory_size=tmp_file_max_memory_size)
|
||||
self.fp = self.recorder
|
||||
|
||||
self.payload_digest = None
|
||||
@ -208,6 +210,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
||||
logger = logging.getLogger("warcprox.mitmproxy.MitmProxyHandler")
|
||||
_socket_timeout = 60
|
||||
_max_resource_size = None
|
||||
_tmp_file_max_memory_size = 512 * 1024
|
||||
|
||||
def __init__(self, request, client_address, server):
|
||||
threading.current_thread().name = 'MitmProxyHandler(tid={},started={},client={}:{})'.format(warcprox.gettid(), datetime.datetime.utcnow().isoformat(), client_address[0], client_address[1])
|
||||
@ -425,7 +428,8 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
||||
prox_rec_res = ProxyingRecordingHTTPResponse(
|
||||
self._remote_server_sock, proxy_client=self.connection,
|
||||
digest_algorithm=self.server.digest_algorithm,
|
||||
url=self.url, method=self.command)
|
||||
url=self.url, method=self.command,
|
||||
tmp_file_max_memory_size=self._tmp_file_max_memory_size)
|
||||
prox_rec_res.begin(extra_response_headers=extra_response_headers)
|
||||
|
||||
buf = prox_rec_res.read(65536)
|
||||
|
@ -405,6 +405,8 @@ class SingleThreadedWarcProxy(http_server.HTTPServer, object):
|
||||
WarcProxyHandler._socket_timeout = options.socket_timeout
|
||||
if options.max_resource_size:
|
||||
WarcProxyHandler._max_resource_size = options.max_resource_size
|
||||
if options.tmp_file_max_memory_size:
|
||||
WarcProxyHandler._tmp_file_max_memory_size = options.tmp_file_max_memory_size
|
||||
|
||||
http_server.HTTPServer.__init__(
|
||||
self, server_address, WarcProxyHandler, bind_and_activate=True)
|
||||
|
Loading…
x
Reference in New Issue
Block a user