mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
default to 100 proxy threads, 1 warc writer thread
see https://github.com/internetarchive/warcprox/wiki/benchmarking-number-of-threads
This commit is contained in:
parent
ea4fc0f10a
commit
a1930495af
2
setup.py
2
setup.py
@ -40,7 +40,7 @@ except:
|
|||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='warcprox',
|
name='warcprox',
|
||||||
version='2.4b2.dev167',
|
version='2.4b2.dev168',
|
||||||
description='WARC writing MITM HTTP/S proxy',
|
description='WARC writing MITM HTTP/S proxy',
|
||||||
url='https://github.com/internetarchive/warcprox',
|
url='https://github.com/internetarchive/warcprox',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
@ -141,11 +141,9 @@ class WarcproxController(object):
|
|||||||
self.playback_proxy = Factory.playback_proxy(
|
self.playback_proxy = Factory.playback_proxy(
|
||||||
self.proxy.ca, self.options)
|
self.proxy.ca, self.options)
|
||||||
|
|
||||||
# default number of warc writer threads = sqrt(proxy.max_threads)
|
# https://github.com/internetarchive/warcprox/wiki/benchmarking-number-of-threads
|
||||||
# pulled out of thin air because it strikes me as reasonable
|
|
||||||
# 1=>1 2=>1 5=>2 10=>3 50=>7 100=>10 200=>14 500=>22 1000=>32 2000=>45
|
|
||||||
if not self.options.writer_threads:
|
if not self.options.writer_threads:
|
||||||
self.options.writer_threads = int(self.proxy.max_threads ** 0.5)
|
self.options.writer_threads = 1
|
||||||
|
|
||||||
self.build_postfetch_chain(self.proxy.recorded_url_q)
|
self.build_postfetch_chain(self.proxy.recorded_url_q)
|
||||||
|
|
||||||
|
@ -501,35 +501,14 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
|||||||
class PooledMixIn(socketserver.ThreadingMixIn):
|
class PooledMixIn(socketserver.ThreadingMixIn):
|
||||||
logger = logging.getLogger("warcprox.mitmproxy.PooledMixIn")
|
logger = logging.getLogger("warcprox.mitmproxy.PooledMixIn")
|
||||||
def __init__(self, max_threads=None):
|
def __init__(self, max_threads=None):
|
||||||
'''
|
|
||||||
If max_threads is not supplied, calculates a reasonable value based
|
|
||||||
on system resource limits.
|
|
||||||
'''
|
|
||||||
self.active_requests = set()
|
self.active_requests = set()
|
||||||
self.unaccepted_requests = 0
|
self.unaccepted_requests = 0
|
||||||
if not max_threads:
|
if max_threads:
|
||||||
# man getrlimit: "RLIMIT_NPROC The maximum number of processes (or,
|
self.max_threads = max_threads
|
||||||
# more precisely on Linux, threads) that can be created for the
|
else:
|
||||||
# real user ID of the calling process."
|
self.max_threads = 100
|
||||||
try:
|
self.pool = concurrent.futures.ThreadPoolExecutor(self.max_threads)
|
||||||
import resource
|
self.logger.info("%s proxy threads", self.max_threads)
|
||||||
rlimit_nproc = resource.getrlimit(resource.RLIMIT_NPROC)[0]
|
|
||||||
rlimit_nofile = resource.getrlimit(resource.RLIMIT_NOFILE)[0]
|
|
||||||
max_threads = min(rlimit_nofile // 10, rlimit_nproc // 2)
|
|
||||||
# resource.RLIM_INFINITY == -1 which can result in max_threads == 0
|
|
||||||
if max_threads <= 0 or max_threads > 5000:
|
|
||||||
max_threads = 5000
|
|
||||||
self.logger.info(
|
|
||||||
"max_threads=%s (rlimit_nproc=%s, rlimit_nofile=%s)",
|
|
||||||
max_threads, rlimit_nproc, rlimit_nofile)
|
|
||||||
except Exception as e:
|
|
||||||
self.logger.warn(
|
|
||||||
"unable to calculate optimal number of threads based "
|
|
||||||
"on resource limits due to %s", e)
|
|
||||||
max_threads = 100
|
|
||||||
self.logger.info("max_threads=%s", max_threads)
|
|
||||||
self.max_threads = max_threads
|
|
||||||
self.pool = concurrent.futures.ThreadPoolExecutor(max_threads)
|
|
||||||
|
|
||||||
def status(self):
|
def status(self):
|
||||||
if hasattr(super(), 'status'):
|
if hasattr(super(), 'status'):
|
||||||
|
@ -54,7 +54,7 @@ class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor):
|
|||||||
self.batch = set()
|
self.batch = set()
|
||||||
|
|
||||||
def _startup(self):
|
def _startup(self):
|
||||||
self.logger.info('%s threads', self.pool._max_workers)
|
self.logger.info('%s warc writer threads', self.pool._max_workers)
|
||||||
warcprox.BaseStandardPostfetchProcessor._startup(self)
|
warcprox.BaseStandardPostfetchProcessor._startup(self)
|
||||||
|
|
||||||
def _get_process_put(self):
|
def _get_process_put(self):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user