
refactor the multithreaded warc writing

main functional change is that only as many warc files are created as
are needed to keep up with the throughput
commit fd81190517
parent d2bdc9e213
Author: Noah Levitt
Date: 2018-02-05 17:22:09 -08:00

6 changed files with 218 additions and 306 deletions
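The heart of the change is a checkout/checkin pool of open WARC files: a writer thread grabs an idle file if one is available, opens a new one only while the pool is below the cap, and otherwise blocks until a file is checked back in. A minimal standalone sketch of that pattern (`BoundedFilePool` and its names are illustrative, not warcprox API):

```python
import queue
import threading

class BoundedFilePool:
    '''Hand out at most `limit` open files; callers block once the cap is hit.'''
    def __init__(self, limit):
        self.limit = limit
        self._available = queue.Queue()
        self._count = 0
        self._count_lock = threading.Lock()

    def check_out(self):
        try:
            # reuse an idle file if one is free
            return self._available.get(block=False)
        except queue.Empty:
            with self._count_lock:
                if self._count < self.limit:
                    # below the cap: lazily open another file
                    self._count += 1
                    return open('demo-%05d.warc' % self._count, 'wb')
            # maxed out: wait for another caller to check one back in
            return self._available.get(block=True)

    def check_in(self, f):
        self._available.put(f)
```

This is why the file count tracks throughput: an idle pool converges to a single file, and additional files appear only while all current ones are busy. `WarcWriter._bespeak_warc()` in warcprox/writer.py below follows this pattern.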

tests/test_warcprox.py

@@ -767,10 +767,12 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies,
     wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 4)

     # close the warc
-    assert warcprox_.warc_writer_thread.writer_pool.warc_writers["test_dedup_buckets"]
-    writer = warcprox_.warc_writer_thread.writer_pool.warc_writers["test_dedup_buckets"]
-    warc_path = os.path.join(writer.directory, writer._f_finalname)
-    warcprox_.warc_writer_thread.writer_pool.warc_writers["test_dedup_buckets"].close_writer()
+    assert warcprox_.warc_writer_processor.writer_pool.warc_writers["test_dedup_buckets"]
+    writer = warcprox_.warc_writer_processor.writer_pool.warc_writers["test_dedup_buckets"]
+    warc = writer._available_warcs.queue[0]
+    warc_path = os.path.join(warc.directory, warc.finalname)
+    assert not os.path.exists(warc_path)
+    warcprox_.warc_writer_processor.writer_pool.warc_writers["test_dedup_buckets"].close_writer()
     assert os.path.exists(warc_path)

     # read the warc

@@ -1389,20 +1391,16 @@ def test_controller_with_defaults():
     assert controller.proxy.server_port == 8000
     assert controller.proxy.running_stats
     assert not controller.proxy.stats_db
-    wwt = controller.warc_writer_thread
-    assert wwt
-    assert wwt.inq
-    assert wwt.outq
-    assert wwt.writer_pool
-    assert wwt.writer_pool.default_warc_writer
-    assert wwt.writer_pool.default_warc_writer.directory == './warcs'
-    assert wwt.writer_pool.default_warc_writer.rollover_idle_time is None
-    assert wwt.writer_pool.default_warc_writer.rollover_size == 1000000000
-    assert wwt.writer_pool.default_warc_writer.prefix == 'warcprox'
-    assert wwt.writer_pool.default_warc_writer.gzip is False
-    assert wwt.writer_pool.default_warc_writer.record_builder
-    assert not wwt.writer_pool.default_warc_writer.record_builder.base32
-    assert wwt.writer_pool.default_warc_writer.record_builder.digest_algorithm == 'sha1'
+    wwp = controller.warc_writer_processor
+    assert wwp
+    assert wwp.inq
+    assert wwp.outq
+    assert wwp.writer_pool
+    assert wwp.writer_pool.default_warc_writer
+    assert wwp.writer_pool.default_warc_writer.gzip is False
+    assert wwp.writer_pool.default_warc_writer.record_builder
+    assert not wwp.writer_pool.default_warc_writer.record_builder.base32
+    assert wwp.writer_pool.default_warc_writer.record_builder.digest_algorithm == 'sha1'

 def test_load_plugin():
     options = warcprox.Options(port=0, plugins=[

@@ -1482,7 +1480,7 @@ def test_via_response_header(warcprox_, http_daemon, archiving_proxies, playback
     assert response.status_code == 200
     assert not 'via' in playback_response

-    warc = warcprox_.warc_writer_thread.writer_pool.default_warc_writer._fpath
+    warc = warcprox_.warc_writer_processor.writer_pool.default_warc_writer._available_warcs.queue[0].path
     with open(warc, 'rb') as f:
         for record in warcio.archiveiterator.ArchiveIterator(f):
             if record.rec_headers.get_header('warc-target-uri') == url:

@@ -1700,10 +1698,11 @@ def test_long_warcprox_meta(
     wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1)

     # check that warcprox-meta was parsed and honored ("warc-prefix" param)
-    assert warcprox_.warc_writer_thread.writer_pool.warc_writers["test_long_warcprox_meta"]
-    writer = warcprox_.warc_writer_thread.writer_pool.warc_writers["test_long_warcprox_meta"]
-    warc_path = os.path.join(writer.directory, writer._f_finalname)
-    warcprox_.warc_writer_thread.writer_pool.warc_writers["test_long_warcprox_meta"].close_writer()
+    assert warcprox_.warc_writer_processor.writer_pool.warc_writers["test_long_warcprox_meta"]
+    writer = warcprox_.warc_writer_processor.writer_pool.warc_writers["test_long_warcprox_meta"]
+    warc = writer._available_warcs.queue[0]
+    warc_path = os.path.join(warc.directory, warc.finalname)
+    warcprox_.warc_writer_processor.writer_pool.warc_writers["test_long_warcprox_meta"].close_writer()
     assert os.path.exists(warc_path)

     # read the warc

tests/test_writer.py

@@ -61,7 +61,8 @@ def test_warc_writer_locking(tmpdir):
             timestamp=datetime.utcnow())

     dirname = os.path.dirname(str(tmpdir.mkdir('test-warc-writer')))
-    wwriter = WarcWriter(Options(directory=dirname, no_warc_open_suffix=True))
+    wwriter = WarcWriter(Options(
+        directory=dirname, no_warc_open_suffix=True, writer_threads=1))
     wwriter.write_records(recorded_url)
     warcs = [fn for fn in os.listdir(dirname) if fn.endswith('.warc')]
     assert warcs

@@ -93,7 +94,8 @@ def test_special_dont_write_prefix():
     logging.debug('cd %s', tmpdir)
     os.chdir(tmpdir)

-    wwt = warcprox.writerthread.WarcWriterThread(Options(prefix='-'))
+    wwt = warcprox.writerthread.WarcWriterProcessor(
+            Options(prefix='-', writer_threads=1))
     wwt.inq = warcprox.TimestampedQueue(maxsize=1)
     wwt.outq = warcprox.TimestampedQueue(maxsize=1)
     try:

@@ -126,7 +128,8 @@ def test_special_dont_write_prefix():
     wwt.stop.set()
     wwt.join()

-    wwt = warcprox.writerthread.WarcWriterThread()
+    wwt = warcprox.writerthread.WarcWriterProcessor(
+            Options(writer_threads=1))
     wwt.inq = warcprox.TimestampedQueue(maxsize=1)
     wwt.outq = warcprox.TimestampedQueue(maxsize=1)
     try:

@@ -172,8 +175,11 @@ def test_warc_writer_filename(tmpdir):
     dirname = os.path.dirname(str(tmpdir.mkdir('test-warc-writer')))
     wwriter = WarcWriter(Options(directory=dirname, prefix='foo',
-        warc_filename='{timestamp17}_{prefix}_{timestamp14}_{serialno}'))
+        warc_filename='{timestamp17}_{prefix}_{timestamp14}_{serialno}',
+        writer_threads=1))
     wwriter.write_records(recorded_url)
     warcs = [fn for fn in os.listdir(dirname)]
     assert warcs
-    assert re.search('\d{17}_foo_\d{14}_00000.warc.open', wwriter._fpath)
+    assert re.search(
+            r'\d{17}_foo_\d{14}_00000.warc.open',
+            wwriter._available_warcs.queue[0].path)

warcprox/__init__.py

@@ -237,6 +237,14 @@ class ListenerPostfetchProcessor(BaseStandardPostfetchProcessor):
             self.logger.error(
                     '%s raised exception', listener.stop, exc_info=True)

+def timestamp17():
+    now = datetime.datetime.utcnow()
+    return '{:%Y%m%d%H%M%S}{:03d}'.format(now, now.microsecond//1000)
+
+def timestamp14():
+    now = datetime.datetime.utcnow()
+    return '{:%Y%m%d%H%M%S}'.format(now)
+
 # monkey-patch log levels TRACE and NOTICE
 TRACE = 5
 def _logger_trace(self, msg, *args, **kwargs):
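The two helpers differ only in millisecond precision. A quick sanity check using the same format strings, applied to a fixed instant (values are illustrative):

```python
import datetime

# same formatting as timestamp14()/timestamp17() above
now = datetime.datetime(2018, 2, 5, 17, 22, 9, 123000)
print('{:%Y%m%d%H%M%S}'.format(now))                # 20180205172209
print('{:%Y%m%d%H%M%S}{:03d}'.format(
        now, now.microsecond//1000))                # 20180205172209123
```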

warcprox/controller.py

@@ -57,7 +57,6 @@ class Factory:
     @staticmethod
     def stats_processor(options):
-        # return warcprox.stats.StatsProcessor(options)
         if options.rethinkdb_stats_url:
             stats_processor = warcprox.stats.RethinkStatsProcessor(options)
         elif options.stats_db_file in (None, '', '/dev/null'):

@@ -68,11 +67,8 @@ class Factory:
         return stats_processor

     @staticmethod
-    def warc_writer(options):
-        if options.writer_threads:
-            return warcprox.writerthread.WarcWriterMultiThread(options)
-        else:
-            return warcprox.writerthread.WarcWriterThread(options)
+    def warc_writer_processor(options):
+        return warcprox.writerthread.WarcWriterProcessor(options)

     @staticmethod
     def playback_proxy(ca, options):

@@ -145,6 +141,12 @@ class WarcproxController(object):
             self.playback_proxy = Factory.playback_proxy(
                     self.proxy.ca, self.options)

+        # default number of warc writer threads = sqrt(proxy.max_threads)
+        # pulled out of thin air because it strikes me as reasonable
+        # 1=>1 2=>1 5=>2 10=>3 50=>7 100=>10 200=>14 500=>22 1000=>32 2000=>45
+        if not self.options.writer_threads:
+            self.options.writer_threads = int(self.proxy.max_threads ** 0.5)
+
         self.build_postfetch_chain(self.proxy.recorded_url_q)

         self.service_registry = Factory.service_registry(options)

@@ -184,8 +186,8 @@ class WarcproxController(object):
         if self.dedup_db:
             self._postfetch_chain.append(self.dedup_db.loader())

-        self.warc_writer_thread = Factory.warc_writer(self.options)
-        self._postfetch_chain.append(self.warc_writer_thread)
+        self.warc_writer_processor = Factory.warc_writer_processor(self.options)
+        self._postfetch_chain.append(self.warc_writer_processor)

         if self.dedup_db:
             self._postfetch_chain.append(self.dedup_db.storer())
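As a quick illustration of the `writer_threads` default added above (plain Python, not warcprox code), the square root keeps the writer pool small relative to the proxy's thread count:

```python
# writer_threads defaulting, per the diff above: int(max_threads ** 0.5)
for max_threads in (10, 100, 500):
    print(max_threads, '=>', int(max_threads ** 0.5))
# 10 => 3, 100 => 10, 500 => 22
```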

warcprox/writer.py

@@ -22,296 +22,220 @@ USA.
 from __future__ import absolute_import

 import logging
-from datetime import datetime
 from hanzo import warctools
 import fcntl
 import time
 import warcprox
 import os
 import socket
-import string
 import random
 import threading
 try:
     import queue
 except ImportError:
     import Queue as queue
+import contextlib

-class WarcWriter:
-    logger = logging.getLogger('warcprox.writer.WarcWriter')
-
-    def __init__(self, options=warcprox.Options()):
+class _OneWritableWarc:
+    logger = logging.getLogger('warcprox.writer._OneWritableWarc')
+    '''
+    Utility class used by WarcWriter
+    '''
+    def __init__(self, options=warcprox.Options(), randomtoken='0'):
+        self.f = None
+        self.path = None
+        self.finalname = None
+        self.gzip = options.gzip or False
+        self.prefix = options.prefix or 'warcprox'
+        self.open_suffix = '' if options.no_warc_open_suffix else '.open'
+        self.randomtoken = randomtoken
         self.rollover_size = options.rollover_size or 1000000000
         self.rollover_idle_time = options.rollover_idle_time or None
-        self._last_activity = time.time()
-
-        self.gzip = options.gzip or False
-        self.warc_filename = options.warc_filename or \
-                '{prefix}-{timestamp17}-{randomtoken}-{serialno}'
-        digest_algorithm = options.digest_algorithm or 'sha1'
-        base32 = options.base32
-        self.record_builder = warcprox.warc.WarcRecordBuilder(
-                digest_algorithm=digest_algorithm, base32=base32)
-
-        # warc path and filename stuff
         self.directory = options.directory or './warcs'
-        self.prefix = options.prefix or 'warcprox'
-
-        self._f = None
-        self._fpath = None
-        self._f_finalname = None
-        self._f_open_suffix = '' if options.no_warc_open_suffix else '.open'
-        self._serial = 0
-        self._lock = threading.RLock()
-
-        self._randomtoken = "".join(random.Random().sample(string.digits + string.ascii_lowercase, 8))
-
-        if not os.path.exists(self.directory):
-            self.logger.info("warc destination directory {} doesn't exist, creating it".format(self.directory))
-            os.mkdir(self.directory)
-
-    def timestamp17(self):
-        now = datetime.utcnow()
-        return '{:%Y%m%d%H%M%S}{:03d}'.format(now, now.microsecond//1000)
-
-    def timestamp14(self):
-        now = datetime.utcnow()
-        return '{:%Y%m%d%H%M%S}'.format(now)
-
-    def close_writer(self):
-        with self._lock:
-            if self._fpath:
-                self.logger.info('closing %s', self._f_finalname)
-                if self._f_open_suffix == '':
-                    try:
-                        fcntl.lockf(self._f, fcntl.LOCK_UN)
-                    except IOError as exc:
-                        self.logger.error('could not unlock file %s (%s)',
-                                self._fpath, exc)
-                self._f.close()
-                finalpath = os.path.sep.join(
-                        [self.directory, self._f_finalname])
-                os.rename(self._fpath, finalpath)
-
-                self._fpath = None
-                self._f = None
-
-    def serial(self):
-        return '{:05d}'.format(self._serial)
+        self.filename_template = options.warc_filename or \
+                '{prefix}-{timestamp17}-{randomtoken}-{serialno}'
+        self.last_activity = time.time()

     # h3 default <!-- <property name="template" value="${prefix}-${timestamp17}-${serialno}-${heritrix.pid}~${heritrix.hostname}~${heritrix.port}" /> -->
-    def _warc_filename(self):
+    def next_filename(self, serial):
         """WARC filename is configurable with CLI parameter --warc-filename.
-        Default: '{prefix}-{timestamp17}-{serialno}-{randomtoken}'
+        Default: '{prefix}-{timestamp17}-{randomtoken}-{serialno}'
         Available variables are: prefix, timestamp14, timestamp17, serialno,
         randomtoken, hostname, shorthostname.
         Extension ``.warc`` or ``.warc.gz`` is appended automatically.
         """
         hostname = socket.getfqdn()
         shorthostname = hostname.split('.')[0]
-        fname = self.warc_filename.format(prefix=self.prefix,
-                timestamp14=self.timestamp14(),
-                timestamp17=self.timestamp17(),
-                serialno=self.serial(),
-                randomtoken=self._randomtoken,
-                hostname=hostname,
-                shorthostname=shorthostname)
+        fname = self.filename_template.format(
+                prefix=self.prefix, timestamp14=warcprox.timestamp14(),
+                timestamp17=warcprox.timestamp17(),
+                serialno='{:05d}'.format(serial),
+                randomtoken=self.randomtoken, hostname=hostname,
+                shorthostname=shorthostname)
         if self.gzip:
             fname = fname + '.warc.gz'
         else:
             fname = fname + '.warc'
         return fname

-    def _writer(self):
-        with self._lock:
-            if self._fpath and os.path.getsize(
-                    self._fpath) > self.rollover_size:
-                self.close_writer()
-
-            if self._f == None:
-                self._f_finalname = self._warc_filename()
-                self._fpath = os.path.sep.join([
-                    self.directory, self._f_finalname + self._f_open_suffix])
-
-                self._f = open(self._fpath, 'wb')
-                # if no '.open' suffix is used for WARC, acquire an exclusive
-                # file lock.
-                if self._f_open_suffix == '':
-                    try:
-                        fcntl.lockf(self._f, fcntl.LOCK_EX | fcntl.LOCK_NB)
-                    except IOError as exc:
-                        self.logger.error('could not lock file %s (%s)',
-                                self._fpath, exc)
-
-                warcinfo_record = self.record_builder.build_warcinfo_record(
-                        self._f_finalname)
-                self.logger.debug(
-                        'warcinfo_record.headers=%s', warcinfo_record.headers)
-                warcinfo_record.write_to(self._f, gzip=self.gzip)
-
-                self._serial += 1
-
-        return self._f
-
-    def write_records(self, recorded_url):
-        """Returns tuple of records written, which are instances of
-        hanzo.warctools.warc.WarcRecord, decorated with "warc_filename" and
-        "offset" attributes."""
-        records = self.record_builder.build_warc_records(recorded_url)
-        with self._lock:
-            writer = self._writer()
-
-            for record in records:
-                offset = writer.tell()
-                record.write_to(writer, gzip=self.gzip)
-                record.offset = offset
-                record.length = writer.tell() - offset
-                record.warc_filename = self._f_finalname
-                self.logger.debug(
-                        'wrote warc record: warc_type=%s content_length=%s '
-                        'url=%s warc=%s offset=%d',
-                        record.get_header(warctools.WarcRecord.TYPE),
-                        record.get_header(warctools.WarcRecord.CONTENT_LENGTH),
-                        record.get_header(warctools.WarcRecord.URL),
-                        self._fpath, record.offset)
-            self._f.flush()
-            self._last_activity = time.time()
-
-        return records
+    def open(self, serial):
+        if not os.path.exists(self.directory):
+            self.logger.info(
+                    "warc destination directory %s doesn't exist, creating it",
+                    self.directory)
+            os.mkdir(self.directory)
+
+        self.finalname = self.next_filename(serial)
+        self.path = os.path.sep.join(
+                [self.directory, self.finalname + self.open_suffix])
+
+        self.f = open(self.path, 'wb')
+        # if no '.open' suffix is used for WARC, acquire an exclusive
+        # file lock.
+        if self.open_suffix == '':
+            try:
+                fcntl.lockf(self.f, fcntl.LOCK_EX | fcntl.LOCK_NB)
+            except IOError as exc:
+                self.logger.error(
+                        'could not lock file %s (%s)', self.path, exc)
+        return self.f
+
+    def close(self):
+        if self.path:
+            self.logger.trace('closing %s', self.finalname)
+            if self.open_suffix == '':
+                try:
+                    fcntl.lockf(self.f, fcntl.LOCK_UN)
+                except IOError as exc:
+                    self.logger.error(
+                            'could not unlock file %s (%s)', self.path, exc)
+            self.f.close()
+            finalpath = os.path.sep.join(
+                    [self.directory, self.finalname])
+            os.rename(self.path, finalpath)
+
+            self.path = None
+            self.f = None

     def maybe_idle_rollover(self):
-        with self._lock:
-            if (self._fpath is not None
-                    and self.rollover_idle_time is not None
-                    and self.rollover_idle_time > 0
-                    and time.time() - self._last_activity > self.rollover_idle_time):
-                self.logger.info(
-                        'rolling over %s after %s seconds idle',
-                        self._f_finalname, time.time() - self._last_activity)
-                self.close_writer()
+        if (self.path and self.rollover_idle_time
+                and self.rollover_idle_time > 0
+                and time.time() - self.last_activity > self.rollover_idle_time):
+            self.logger.info(
+                    'rolling over %s after %0.1f seconds idle',
+                    self.finalname, time.time() - self.last_activity)
+            self.close()

-class MultiWarcWriter(WarcWriter):
-    logger = logging.getLogger("warcprox.writer.MultiWarcWriter")
+    def maybe_size_rollover(self):
+        if self.path and os.path.getsize(self.path) > self.rollover_size:
+            self.logger.info(
+                    'rolling over %s because it has reached %s bytes in size',
+                    self.finalname, os.path.getsize(self.path))
+            self.close()
+
+class WarcWriter:
+    logger = logging.getLogger('warcprox.writer.WarcWriter')

     def __init__(self, options=warcprox.Options()):
-        super().__init__(options)
-        self._thread_num = options.writer_threads
-        self._f = [None] * self._thread_num
-        self._fpath = [None] * self._thread_num
-        self._f_finalname = [None] * self._thread_num
-        self._lock = [threading.RLock()] * self._thread_num
-        self._available_threads = queue.Queue()
-        for i in range(self._thread_num):
-            self._available_threads.put(i)
-
-    def _writer(self, curr):
-        with self._lock[curr]:
-            if self._fpath[curr] and os.path.getsize(
-                    self._fpath[curr]) > self.rollover_size:
-                self.close_writer(curr)
-
-            if self._f[curr] == None:
-                self._f_finalname[curr] = self._warc_filename()
-                self._fpath[curr] = os.path.sep.join([
-                    self.directory, self._f_finalname[curr] + self._f_open_suffix])
-
-                self._f[curr] = open(self._fpath[curr], 'wb')
-                # if no '.open' suffix is used for WARC, acquire an exclusive
-                # file lock.
-                if self._f_open_suffix == '':
-                    try:
-                        fcntl.lockf(self._f[curr], fcntl.LOCK_EX | fcntl.LOCK_NB)
-                    except IOError as exc:
-                        self.logger.error('could not lock file %s (%s)',
-                                self._fpath, exc)
-
-                warcinfo_record = self.record_builder.build_warcinfo_record(
-                        self._f_finalname[curr])
-                self.logger.debug(
-                        'warcinfo_record.headers=%s', warcinfo_record.headers)
-                warcinfo_record.write_to(self._f[curr], gzip=self.gzip)
-
-                self._serial += 1
-
-        return self._f[curr]
+        self.options = options
+
+        self.gzip = options.gzip or False
+        self.record_builder = warcprox.warc.WarcRecordBuilder(
+                digest_algorithm=options.digest_algorithm or 'sha1',
+                base32=options.base32)
+
+        self._available_warcs = queue.Queue()
+        self._warc_count = 0
+        self._warc_count_lock = threading.Lock()
+
+        self._serial = 0
+        self._serial_lock = threading.Lock()
+
+        self._randomtoken = ''.join(
+                random.sample('abcdefghijklmnopqrstuvwxyz0123456789', 8))
+
+    def _bespeak_warc(self):
+        try:
+            return self._available_warcs.get(block=False)
+        except queue.Empty:
+            with self._warc_count_lock:
+                if self._warc_count < self.options.writer_threads:
+                    self._warc_count += 1
+                    return _OneWritableWarc(self.options, self._randomtoken)
+            # else we're maxed out, wait for one to free up
+            return self._available_warcs.get(block=True)
+
+    @contextlib.contextmanager
+    def _warc(self):
+        warc = self._bespeak_warc()
+
+        warc.maybe_size_rollover()
+
+        # lazy file open
+        if warc.f == None:
+            with self._serial_lock:
+                serial = self._serial
+                self._serial += 1
+            warc.open(serial)
+            warcinfo = self.record_builder.build_warcinfo_record(warc.finalname)
+            self.logger.debug('warcinfo.headers=%s', warcinfo.headers)
+            warcinfo.write_to(warc.f, gzip=self.gzip)
+
+        yield warc
+
+        # __exit__()
+        warc.f.flush()
+        warc.last_activity = time.time()
+        self._available_warcs.put(warc)

     def write_records(self, recorded_url):
         """Returns tuple of records written, which are instances of
         hanzo.warctools.warc.WarcRecord, decorated with "warc_filename" and
         "offset" attributes."""
         records = self.record_builder.build_warc_records(recorded_url)
-        curr = self._available_threads.get()
-        # we could also remove that lock?? The queue guaranties that no two
-        # threads have the same curr open.
-        with self._lock[curr]:
-            writer = self._writer(curr)
+
+        with self._warc() as warc:
             for record in records:
-                offset = writer.tell()
-                record.write_to(writer, gzip=self.gzip)
+                offset = warc.f.tell()
+                record.write_to(warc.f, gzip=self.gzip)
                 record.offset = offset
-                record.length = writer.tell() - offset
-                record.warc_filename = self._f_finalname[curr]
+                record.length = warc.f.tell() - offset
+                record.warc_filename = warc.finalname
                 self.logger.debug(
                         'wrote warc record: warc_type=%s content_length=%s '
                         'url=%s warc=%s offset=%d',
                         record.get_header(warctools.WarcRecord.TYPE),
                         record.get_header(warctools.WarcRecord.CONTENT_LENGTH),
                         record.get_header(warctools.WarcRecord.URL),
-                        self._fpath[curr], record.offset)
-            self._f[curr].flush()
-            self._last_activity = time.time()
-        self._available_threads.put(curr)
+                        warc.path, record.offset)

         return records

     def maybe_idle_rollover(self):
-        for curr in range(0, self._thread_num):
-            with self._lock[curr]:
-                if (self._fpath[curr] is not None
-                        and self.rollover_idle_time is not None
-                        and self.rollover_idle_time > 0
-                        and time.time() - self._last_activity > self.rollover_idle_time):
-                    self.logger.info(
-                            'rolling over %s after %s seconds idle',
-                            self._f_finalname[curr], time.time() - self._last_activity)
-                    self.close_writer(curr)
+        warcs = []
+        while True:
+            try:
+                warc = self._available_warcs.get(block=False)
+                warcs.append(warc)
+            except queue.Empty:
+                break
+        for warc in warcs:
+            warc.maybe_idle_rollover()
+            self._available_warcs.put(warc)

-    def close_writer(self, curr=None):
-        """When this method is invoked without any argument (program termination)
-        close all writer.
-        """
-        if not curr:
-            for curr in range(0, self._thread_num):
-                self.close_writer(curr)
-            return
-        with self._lock[curr]:
-            if self._fpath[curr]:
-                self.logger.info('closing %s', self._f_finalname[curr])
-                if self._f_open_suffix == '':
-                    try:
-                        fcntl.lockf(self._f[curr], fcntl.LOCK_UN)
-                    except IOError as exc:
-                        self.logger.error('could not unlock file %s (%s)',
-                                self._fpath[curr], exc)
-                self._f[curr].close()
-                finalpath = os.path.sep.join(
-                        [self.directory, self._f_finalname[curr]])
-                os.rename(self._fpath[curr], finalpath)
+    def close_writer(self):
+        while self._warc_count > 0:
+            with self._warc_count_lock:
+                warc = self._available_warcs.get()
+                warc.close()
+                self._warc_count -= 1

 class WarcWriterPool:
     logger = logging.getLogger("warcprox.writer.WarcWriterPool")

     def __init__(self, options=warcprox.Options()):
-        if options.writer_threads:
-            self.default_warc_writer = MultiWarcWriter(options=options)
-        else:
-            self.default_warc_writer = WarcWriter(options=options)
+        self.default_warc_writer = WarcWriter(options)
         self.warc_writers = {}  # {prefix:WarcWriter}
         self.options = options
         self._lock = threading.RLock()

@@ -326,12 +250,7 @@ class WarcWriterPool:
             options.prefix = recorded_url.warcprox_meta["warc-prefix"]
             with self._lock:
                 if not options.prefix in self.warc_writers:
-                    if self.options.writer_threads:
-                        self.warc_writers[options.prefix] = MultiWarcWriter(
-                                options=options)
-                    else:
-                        self.warc_writers[options.prefix] = WarcWriter(
-                                options=options)
+                    self.warc_writers[options.prefix] = WarcWriter(options)
                 w = self.warc_writers[options.prefix]
         return w

warcprox/writerthread.py

@@ -32,32 +32,42 @@ import time
 import warcprox
 from concurrent import futures

-class WarcWriterThread(warcprox.BaseStandardPostfetchProcessor):
-    logger = logging.getLogger("warcprox.writerthread.WarcWriterThread")
+class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor):
+    logger = logging.getLogger("warcprox.writerthread.WarcWriterProcessor")

     _ALWAYS_ACCEPT = {'WARCPROX_WRITE_RECORD'}

     def __init__(self, options=warcprox.Options()):
         warcprox.BaseStandardPostfetchProcessor.__init__(self, options=options)
-        self.options = options
         self.writer_pool = warcprox.writer.WarcWriterPool(options)
         self.method_filter = set(method.upper() for method in self.options.method_filter or [])
+        self.pool = futures.ThreadPoolExecutor(max_workers=options.writer_threads or 1)
+        self.batch = set()

     def _get_process_put(self):
-        try:
-            warcprox.BaseStandardPostfetchProcessor._get_process_put(self)
-        finally:
-            self.writer_pool.maybe_idle_rollover()
+        recorded_url = self.inq.get(block=True, timeout=0.5)
+        self.batch.add(recorded_url)
+        self.pool.submit(self._process_url, recorded_url)

     def _process_url(self, recorded_url):
-        records = []
-        if self._should_archive(recorded_url):
-            records = self.writer_pool.write_records(recorded_url)
-        recorded_url.warc_records = records
-        self._log(recorded_url, records)
-        # try to release resources in a timely fashion
-        if recorded_url.response_recorder and recorded_url.response_recorder.tempfile:
-            recorded_url.response_recorder.tempfile.close()
+        try:
+            records = []
+            if self._should_archive(recorded_url):
+                records = self.writer_pool.write_records(recorded_url)
+            recorded_url.warc_records = records
+            self._log(recorded_url, records)
+            # try to release resources in a timely fashion
+            if recorded_url.response_recorder and recorded_url.response_recorder.tempfile:
+                recorded_url.response_recorder.tempfile.close()
+        except:
+            logging.error(
+                    'caught exception processing %s', recorded_url.url,
+                    exc_info=True)
+        finally:
+            self.batch.remove(recorded_url)
+            if self.outq:
+                self.outq.put(recorded_url)
+            self.writer_pool.maybe_idle_rollover()

     def _filter_accepts(self, recorded_url):
         if not self.method_filter:

@@ -94,35 +104,3 @@ class WarcWriterThread(warcprox.BaseStandardPostfetchProcessor):

     def _shutdown(self):
         self.writer_pool.close_writers()
-
-class WarcWriterMultiThread(WarcWriterThread):
-    logger = logging.getLogger("warcprox.writerthread.WarcWriterMultiThread")
-
-    def __init__(self, options=warcprox.Options()):
-        warcprox.BaseStandardPostfetchProcessor.__init__(self, options=options)
-        self.pool = futures.ThreadPoolExecutor(max_workers=options.writer_threads)
-        self.batch = set()
-        self.options = options
-        self.writer_pool = warcprox.writer.WarcWriterPool(options)
-        self.method_filter = set(method.upper() for method in self.options.method_filter or [])
-
-    def _get_process_put(self):
-        recorded_url = self.inq.get(block=True, timeout=0.5)
-        self.batch.add(recorded_url)
-        self.pool.submit(self._process_url, recorded_url)
-
-    def _process_url(self, recorded_url):
-        try:
-            records = []
-            if self._should_archive(recorded_url):
-                records = self.writer_pool.write_records(recorded_url)
-            recorded_url.warc_records = records
-            self._log(recorded_url, records)
-            # try to release resources in a timely fashion
-            if recorded_url.response_recorder and recorded_url.response_recorder.tempfile:
-                recorded_url.response_recorder.tempfile.close()
-        finally:
-            self.batch.remove(recorded_url)
-            if self.outq:
-                self.outq.put(recorded_url)
-            self.writer_pool.maybe_idle_rollover()