mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
cap the number of urls queued for warc writing
This commit is contained in:
parent
cb0dea3739
commit
cc8fb4c608
2
setup.py
2
setup.py
@ -40,7 +40,7 @@ except:
|
|||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='warcprox',
|
name='warcprox',
|
||||||
version='2.4b2.dev165',
|
version='2.4b2.dev166',
|
||||||
description='WARC writing MITM HTTP/S proxy',
|
description='WARC writing MITM HTTP/S proxy',
|
||||||
url='https://github.com/internetarchive/warcprox',
|
url='https://github.com/internetarchive/warcprox',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
@ -26,12 +26,14 @@ import time
|
|||||||
import logging
|
import logging
|
||||||
from argparse import Namespace as _Namespace
|
from argparse import Namespace as _Namespace
|
||||||
from pkg_resources import get_distribution as _get_distribution
|
from pkg_resources import get_distribution as _get_distribution
|
||||||
__version__ = _get_distribution('warcprox').version
|
import concurrent.futures
|
||||||
try:
|
try:
|
||||||
import queue
|
import queue
|
||||||
except ImportError:
|
except ImportError:
|
||||||
import Queue as queue
|
import Queue as queue
|
||||||
|
|
||||||
|
__version__ = _get_distribution('warcprox').version
|
||||||
|
|
||||||
def digest_str(hash_obj, base32=False):
|
def digest_str(hash_obj, base32=False):
|
||||||
import base64
|
import base64
|
||||||
return hash_obj.name.encode('utf-8') + b':' + (
|
return hash_obj.name.encode('utf-8') + b':' + (
|
||||||
@ -45,6 +47,17 @@ class Options(_Namespace):
|
|||||||
except AttributeError:
|
except AttributeError:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
class ThreadPoolExecutor(concurrent.futures.ThreadPoolExecutor):
|
||||||
|
'''
|
||||||
|
`concurrent.futures.ThreadPoolExecutor` supporting a queue of limited size.
|
||||||
|
|
||||||
|
If `max_queued` is set, calls to `submit()` will block if necessary until a
|
||||||
|
free slot is available.
|
||||||
|
'''
|
||||||
|
def __init__(self, max_queued=None, *args, **kwargs):
|
||||||
|
super().__init__(*args, **kwargs)
|
||||||
|
self._work_queue = queue.Queue(maxsize=max_queued or 0)
|
||||||
|
|
||||||
class TimestampedQueue(queue.Queue):
|
class TimestampedQueue(queue.Queue):
|
||||||
"""
|
"""
|
||||||
A queue.Queue that exposes the time enqueued of the oldest item in the
|
A queue.Queue that exposes the time enqueued of the oldest item in the
|
||||||
|
@ -41,7 +41,13 @@ class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor):
|
|||||||
warcprox.BaseStandardPostfetchProcessor.__init__(self, options=options)
|
warcprox.BaseStandardPostfetchProcessor.__init__(self, options=options)
|
||||||
self.writer_pool = warcprox.writer.WarcWriterPool(options)
|
self.writer_pool = warcprox.writer.WarcWriterPool(options)
|
||||||
self.method_filter = set(method.upper() for method in self.options.method_filter or [])
|
self.method_filter = set(method.upper() for method in self.options.method_filter or [])
|
||||||
self.pool = futures.ThreadPoolExecutor(max_workers=options.writer_threads or 1)
|
|
||||||
|
# set max_queued small, because self.inq is already handling queueing
|
||||||
|
# for us; but give it a little breathing room to make sure it can keep
|
||||||
|
# worker threads busy
|
||||||
|
self.pool = warcprox.ThreadPoolExecutor(
|
||||||
|
max_workers=options.writer_threads or 1,
|
||||||
|
max_queued=10 * (options.writer_threads or 1))
|
||||||
self.batch = set()
|
self.batch = set()
|
||||||
|
|
||||||
def _startup(self):
|
def _startup(self):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user