diff --git a/setup.py b/setup.py index 5522b2a..437b46e 100755 --- a/setup.py +++ b/setup.py @@ -40,7 +40,7 @@ except: setuptools.setup( name='warcprox', - version='2.4b2.dev167', + version='2.4b2.dev168', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/warcprox/controller.py b/warcprox/controller.py index cfffd06..e89ecbb 100644 --- a/warcprox/controller.py +++ b/warcprox/controller.py @@ -141,11 +141,9 @@ class WarcproxController(object): self.playback_proxy = Factory.playback_proxy( self.proxy.ca, self.options) - # default number of warc writer threads = sqrt(proxy.max_threads) - # pulled out of thin air because it strikes me as reasonable - # 1=>1 2=>1 5=>2 10=>3 50=>7 100=>10 200=>14 500=>22 1000=>32 2000=>45 + # https://github.com/internetarchive/warcprox/wiki/benchmarking-number-of-threads if not self.options.writer_threads: - self.options.writer_threads = int(self.proxy.max_threads ** 0.5) + self.options.writer_threads = 1 self.build_postfetch_chain(self.proxy.recorded_url_q) diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index 8bd1861..e01f15e 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -501,35 +501,14 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): class PooledMixIn(socketserver.ThreadingMixIn): logger = logging.getLogger("warcprox.mitmproxy.PooledMixIn") def __init__(self, max_threads=None): - ''' - If max_threads is not supplied, calculates a reasonable value based - on system resource limits. - ''' self.active_requests = set() self.unaccepted_requests = 0 - if not max_threads: - # man getrlimit: "RLIMIT_NPROC The maximum number of processes (or, - # more precisely on Linux, threads) that can be created for the - # real user ID of the calling process." - try: - import resource - rlimit_nproc = resource.getrlimit(resource.RLIMIT_NPROC)[0] - rlimit_nofile = resource.getrlimit(resource.RLIMIT_NOFILE)[0] - max_threads = min(rlimit_nofile // 10, rlimit_nproc // 2) - # resource.RLIM_INFINITY == -1 which can result in max_threads == 0 - if max_threads <= 0 or max_threads > 5000: - max_threads = 5000 - self.logger.info( - "max_threads=%s (rlimit_nproc=%s, rlimit_nofile=%s)", - max_threads, rlimit_nproc, rlimit_nofile) - except Exception as e: - self.logger.warn( - "unable to calculate optimal number of threads based " - "on resource limits due to %s", e) - max_threads = 100 - self.logger.info("max_threads=%s", max_threads) - self.max_threads = max_threads - self.pool = concurrent.futures.ThreadPoolExecutor(max_threads) + if max_threads: + self.max_threads = max_threads + else: + self.max_threads = 100 + self.pool = concurrent.futures.ThreadPoolExecutor(self.max_threads) + self.logger.info("%s proxy threads", self.max_threads) def status(self): if hasattr(super(), 'status'): diff --git a/warcprox/writerthread.py b/warcprox/writerthread.py index f4de35d..ef0bd2d 100644 --- a/warcprox/writerthread.py +++ b/warcprox/writerthread.py @@ -54,7 +54,7 @@ class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor): self.batch = set() def _startup(self): - self.logger.info('%s threads', self.pool._max_workers) + self.logger.info('%s warc writer threads', self.pool._max_workers) warcprox.BaseStandardPostfetchProcessor._startup(self) def _get_process_put(self):