Mirror of https://github.com/internetarchive/warcprox.git, synced 2025-01-18 13:22:09 +01:00
refactor the multithreaded warc writing

main functional change is that only as many warc files are created as are needed to keep up with the throughput
This commit is contained in: parent d2bdc9e213, commit fd81190517
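The heart of the change is in the writer code below: instead of pre-allocating one WARC per writer thread, the new WarcWriter keeps a queue of open WARCs and only constructs another when a record arrives and none is free, up to the writer_threads cap. A minimal sketch of that checkout pattern, with a hypothetical factory callable standing in for _OneWritableWarc construction:

    import queue
    import threading

    class OnDemandPool:
        '''Sketch of the bounded, grow-on-demand pool shape used by WarcWriter below.'''
        def __init__(self, max_size, factory):
            self.max_size = max_size         # cap, e.g. options.writer_threads
            self.factory = factory           # hypothetical stand-in for _OneWritableWarc
            self._available = queue.Queue()  # members not currently checked out
            self._count = 0
            self._count_lock = threading.Lock()

        def checkout(self):
            try:
                return self._available.get(block=False)  # reuse an idle member
            except queue.Empty:
                with self._count_lock:
                    if self._count < self.max_size:
                        self._count += 1
                        return self.factory()  # grow only under load
                # maxed out: block until another thread checks one back in
                return self._available.get(block=True)

        def checkin(self, member):
            self._available.put(member)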
@@ -767,10 +767,12 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies,
     wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 4)
 
     # close the warc
-    assert warcprox_.warc_writer_thread.writer_pool.warc_writers["test_dedup_buckets"]
-    writer = warcprox_.warc_writer_thread.writer_pool.warc_writers["test_dedup_buckets"]
-    warc_path = os.path.join(writer.directory, writer._f_finalname)
-    warcprox_.warc_writer_thread.writer_pool.warc_writers["test_dedup_buckets"].close_writer()
+    assert warcprox_.warc_writer_processor.writer_pool.warc_writers["test_dedup_buckets"]
+    writer = warcprox_.warc_writer_processor.writer_pool.warc_writers["test_dedup_buckets"]
+    warc = writer._available_warcs.queue[0]
+    warc_path = os.path.join(warc.directory, warc.finalname)
+    assert not os.path.exists(warc_path)
+    warcprox_.warc_writer_processor.writer_pool.warc_writers["test_dedup_buckets"].close_writer()
     assert os.path.exists(warc_path)
 
     # read the warc
@@ -1389,20 +1391,16 @@ def test_controller_with_defaults():
     assert controller.proxy.server_port == 8000
     assert controller.proxy.running_stats
     assert not controller.proxy.stats_db
-    wwt = controller.warc_writer_thread
-    assert wwt
-    assert wwt.inq
-    assert wwt.outq
-    assert wwt.writer_pool
-    assert wwt.writer_pool.default_warc_writer
-    assert wwt.writer_pool.default_warc_writer.directory == './warcs'
-    assert wwt.writer_pool.default_warc_writer.rollover_idle_time is None
-    assert wwt.writer_pool.default_warc_writer.rollover_size == 1000000000
-    assert wwt.writer_pool.default_warc_writer.prefix == 'warcprox'
-    assert wwt.writer_pool.default_warc_writer.gzip is False
-    assert wwt.writer_pool.default_warc_writer.record_builder
-    assert not wwt.writer_pool.default_warc_writer.record_builder.base32
-    assert wwt.writer_pool.default_warc_writer.record_builder.digest_algorithm == 'sha1'
+    wwp = controller.warc_writer_processor
+    assert wwp
+    assert wwp.inq
+    assert wwp.outq
+    assert wwp.writer_pool
+    assert wwp.writer_pool.default_warc_writer
+    assert wwp.writer_pool.default_warc_writer.gzip is False
+    assert wwp.writer_pool.default_warc_writer.record_builder
+    assert not wwp.writer_pool.default_warc_writer.record_builder.base32
+    assert wwp.writer_pool.default_warc_writer.record_builder.digest_algorithm == 'sha1'
 
 def test_load_plugin():
     options = warcprox.Options(port=0, plugins=[
@@ -1482,7 +1480,7 @@ def test_via_response_header(warcprox_, http_daemon, archiving_proxies, playback
     assert response.status_code == 200
     assert not 'via' in playback_response
 
-    warc = warcprox_.warc_writer_thread.writer_pool.default_warc_writer._fpath
+    warc = warcprox_.warc_writer_processor.writer_pool.default_warc_writer._available_warcs.queue[0].path
     with open(warc, 'rb') as f:
         for record in warcio.archiveiterator.ArchiveIterator(f):
             if record.rec_headers.get_header('warc-target-uri') == url:
@@ -1700,10 +1698,11 @@ def test_long_warcprox_meta(
     wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1)
 
     # check that warcprox-meta was parsed and honored ("warc-prefix" param)
-    assert warcprox_.warc_writer_thread.writer_pool.warc_writers["test_long_warcprox_meta"]
-    writer = warcprox_.warc_writer_thread.writer_pool.warc_writers["test_long_warcprox_meta"]
-    warc_path = os.path.join(writer.directory, writer._f_finalname)
-    warcprox_.warc_writer_thread.writer_pool.warc_writers["test_long_warcprox_meta"].close_writer()
+    assert warcprox_.warc_writer_processor.writer_pool.warc_writers["test_long_warcprox_meta"]
+    writer = warcprox_.warc_writer_processor.writer_pool.warc_writers["test_long_warcprox_meta"]
+    warc = writer._available_warcs.queue[0]
+    warc_path = os.path.join(warc.directory, warc.finalname)
+    warcprox_.warc_writer_processor.writer_pool.warc_writers["test_long_warcprox_meta"].close_writer()
     assert os.path.exists(warc_path)
 
     # read the warc
@@ -61,7 +61,8 @@ def test_warc_writer_locking(tmpdir):
             timestamp=datetime.utcnow())
 
     dirname = os.path.dirname(str(tmpdir.mkdir('test-warc-writer')))
-    wwriter = WarcWriter(Options(directory=dirname, no_warc_open_suffix=True))
+    wwriter = WarcWriter(Options(
+        directory=dirname, no_warc_open_suffix=True, writer_threads=1))
     wwriter.write_records(recorded_url)
     warcs = [fn for fn in os.listdir(dirname) if fn.endswith('.warc')]
     assert warcs
@@ -93,7 +94,8 @@ def test_special_dont_write_prefix():
     logging.debug('cd %s', tmpdir)
     os.chdir(tmpdir)
 
-    wwt = warcprox.writerthread.WarcWriterThread(Options(prefix='-'))
+    wwt = warcprox.writerthread.WarcWriterProcessor(
+            Options(prefix='-', writer_threads=1))
     wwt.inq = warcprox.TimestampedQueue(maxsize=1)
     wwt.outq = warcprox.TimestampedQueue(maxsize=1)
     try:
@@ -126,7 +128,8 @@ def test_special_dont_write_prefix():
     wwt.stop.set()
     wwt.join()
 
-    wwt = warcprox.writerthread.WarcWriterThread()
+    wwt = warcprox.writerthread.WarcWriterProcessor(
+            Options(writer_threads=1))
     wwt.inq = warcprox.TimestampedQueue(maxsize=1)
     wwt.outq = warcprox.TimestampedQueue(maxsize=1)
     try:
@@ -172,8 +175,11 @@ def test_warc_writer_filename(tmpdir):
 
     dirname = os.path.dirname(str(tmpdir.mkdir('test-warc-writer')))
     wwriter = WarcWriter(Options(directory=dirname, prefix='foo',
-        warc_filename='{timestamp17}_{prefix}_{timestamp14}_{serialno}'))
+        warc_filename='{timestamp17}_{prefix}_{timestamp14}_{serialno}',
+        writer_threads=1))
     wwriter.write_records(recorded_url)
     warcs = [fn for fn in os.listdir(dirname)]
     assert warcs
-    assert re.search('\d{17}_foo_\d{14}_00000.warc.open', wwriter._fpath)
+    assert re.search(
+            r'\d{17}_foo_\d{14}_00000.warc.open',
+            wwriter._available_warcs.queue[0].path)
@@ -237,6 +237,14 @@ class ListenerPostfetchProcessor(BaseStandardPostfetchProcessor):
                 self.logger.error(
                         '%s raised exception', listener.stop, exc_info=True)
 
+def timestamp17():
+    now = datetime.datetime.utcnow()
+    return '{:%Y%m%d%H%M%S}{:03d}'.format(now, now.microsecond//1000)
+
+def timestamp14():
+    now = datetime.datetime.utcnow()
+    return '{:%Y%m%d%H%M%S}'.format(now)
+
 # monkey-patch log levels TRACE and NOTICE
 TRACE = 5
 def _logger_trace(self, msg, *args, **kwargs):
@@ -57,7 +57,6 @@ class Factory:
 
     @staticmethod
     def stats_processor(options):
-        # return warcprox.stats.StatsProcessor(options)
        if options.rethinkdb_stats_url:
            stats_processor = warcprox.stats.RethinkStatsProcessor(options)
        elif options.stats_db_file in (None, '', '/dev/null'):
@@ -68,11 +67,8 @@ class Factory:
         return stats_processor
 
     @staticmethod
-    def warc_writer(options):
-        if options.writer_threads:
-            return warcprox.writerthread.WarcWriterMultiThread(options)
-        else:
-            return warcprox.writerthread.WarcWriterThread(options)
+    def warc_writer_processor(options):
+        return warcprox.writerthread.WarcWriterProcessor(options)
 
     @staticmethod
     def playback_proxy(ca, options):
@@ -145,6 +141,12 @@ class WarcproxController(object):
         self.playback_proxy = Factory.playback_proxy(
                 self.proxy.ca, self.options)
 
+        # default number of warc writer threads = sqrt(proxy.max_threads)
+        # pulled out of thin air because it strikes me as reasonable
+        # 1=>1 2=>1 5=>2 10=>3 50=>7 100=>10 200=>14 500=>22 1000=>32 2000=>45
+        if not self.options.writer_threads:
+            self.options.writer_threads = int(self.proxy.max_threads ** 0.5)
+
         self.build_postfetch_chain(self.proxy.recorded_url_q)
 
         self.service_registry = Factory.service_registry(options)
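A note on the table in the comment above: since int() truncates rather than rounds, the last two entries actually come out one lower than listed. The mapping is easy to check:

    >>> [(n, int(n ** 0.5)) for n in (1, 2, 5, 10, 50, 100, 200, 500, 1000, 2000)]
    [(1, 1), (2, 1), (5, 2), (10, 3), (50, 7), (100, 10), (200, 14), (500, 22), (1000, 31), (2000, 44)]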
@@ -184,8 +186,8 @@ class WarcproxController(object):
         if self.dedup_db:
             self._postfetch_chain.append(self.dedup_db.loader())
 
-        self.warc_writer_thread = Factory.warc_writer(self.options)
-        self._postfetch_chain.append(self.warc_writer_thread)
+        self.warc_writer_processor = Factory.warc_writer_processor(self.options)
+        self._postfetch_chain.append(self.warc_writer_processor)
 
         if self.dedup_db:
             self._postfetch_chain.append(self.dedup_db.storer())
@@ -22,296 +22,220 @@ USA.
 from __future__ import absolute_import
 
 import logging
-from datetime import datetime
 from hanzo import warctools
 import fcntl
 import time
 import warcprox
 import os
 import socket
-import string
 import random
 import threading
 try:
     import queue
 except ImportError:
     import Queue as queue
+import contextlib
 
+class _OneWritableWarc:
+    logger = logging.getLogger('warcprox.writer._OneWritableWarc')
+
-class WarcWriter:
-    logger = logging.getLogger('warcprox.writer.WarcWriter')
-
-    def __init__(self, options=warcprox.Options()):
-
+    '''
+    Utility class used by WarcWriter
+    '''
+    def __init__(self, options=warcprox.Options(), randomtoken='0'):
+        self.f = None
+        self.path = None
+        self.finalname = None
+        self.gzip = options.gzip or False
+        self.prefix = options.prefix or 'warcprox'
+        self.open_suffix = '' if options.no_warc_open_suffix else '.open'
+        self.randomtoken = randomtoken
         self.rollover_size = options.rollover_size or 1000000000
         self.rollover_idle_time = options.rollover_idle_time or None
-        self._last_activity = time.time()
-
-        self.gzip = options.gzip or False
-        self.warc_filename = options.warc_filename or \
-                '{prefix}-{timestamp17}-{randomtoken}-{serialno}'
-        digest_algorithm = options.digest_algorithm or 'sha1'
-        base32 = options.base32
-        self.record_builder = warcprox.warc.WarcRecordBuilder(
-                digest_algorithm=digest_algorithm, base32=base32)
-
-        # warc path and filename stuff
         self.directory = options.directory or './warcs'
-        self.prefix = options.prefix or 'warcprox'
-
-        self._f = None
-        self._fpath = None
-        self._f_finalname = None
-        self._f_open_suffix = '' if options.no_warc_open_suffix else '.open'
-        self._serial = 0
-        self._lock = threading.RLock()
-
-        self._randomtoken = "".join(random.Random().sample(string.digits + string.ascii_lowercase, 8))
-
-        if not os.path.exists(self.directory):
-            self.logger.info("warc destination directory {} doesn't exist, creating it".format(self.directory))
-            os.mkdir(self.directory)
-
-    def timestamp17(self):
-        now = datetime.utcnow()
-        return '{:%Y%m%d%H%M%S}{:03d}'.format(now, now.microsecond//1000)
-
-    def timestamp14(self):
-        now = datetime.utcnow()
-        return '{:%Y%m%d%H%M%S}'.format(now)
-
-    def close_writer(self):
-        with self._lock:
-            if self._fpath:
-                self.logger.info('closing %s', self._f_finalname)
-                if self._f_open_suffix == '':
-                    try:
-                        fcntl.lockf(self._f, fcntl.LOCK_UN)
-                    except IOError as exc:
-                        self.logger.error('could not unlock file %s (%s)',
-                                self._fpath, exc)
-                self._f.close()
-                finalpath = os.path.sep.join(
-                        [self.directory, self._f_finalname])
-                os.rename(self._fpath, finalpath)
-
-                self._fpath = None
-                self._f = None
-
-    def serial(self):
-        return '{:05d}'.format(self._serial)
+        self.filename_template = options.warc_filename or \
+                '{prefix}-{timestamp17}-{randomtoken}-{serialno}'
+        self.last_activity = time.time()
 
     # h3 default <!-- <property name="template" value="${prefix}-${timestamp17}-${serialno}-${heritrix.pid}~${heritrix.hostname}~${heritrix.port}" /> -->
-    def _warc_filename(self):
+    def next_filename(self, serial):
         """WARC filename is configurable with CLI parameter --warc-filename.
-        Default: '{prefix}-{timestamp17}-{serialno}-{randomtoken}'
+        Default: '{prefix}-{timestamp17}-{randomtoken}-{serialno}'
         Available variables are: prefix, timestamp14, timestamp17, serialno,
         randomtoken, hostname, shorthostname.
        Extension ``.warc`` or ``.warc.gz`` is appended automatically.
        """
        hostname = socket.getfqdn()
        shorthostname = hostname.split('.')[0]
-        fname = self.warc_filename.format(prefix=self.prefix,
-                                          timestamp14=self.timestamp14(),
-                                          timestamp17=self.timestamp17(),
-                                          serialno=self.serial(),
-                                          randomtoken=self._randomtoken,
-                                          hostname=hostname,
-                                          shorthostname=shorthostname)
+        fname = self.filename_template.format(
+                prefix=self.prefix, timestamp14=warcprox.timestamp14(),
+                timestamp17=warcprox.timestamp17(),
+                serialno='{:05d}'.format(serial),
+                randomtoken=self.randomtoken, hostname=hostname,
+                shorthostname=shorthostname)
        if self.gzip:
            fname = fname + '.warc.gz'
        else:
            fname = fname + '.warc'
        return fname
 
-    def _writer(self):
-        with self._lock:
-            if self._fpath and os.path.getsize(
-                    self._fpath) > self.rollover_size:
-                self.close_writer()
+    def open(self, serial):
+        if not os.path.exists(self.directory):
+            self.logger.info(
+                    "warc destination directory %s doesn't exist, creating it",
+                    self.directory)
+            os.mkdir(self.directory)
 
-            if self._f == None:
-                self._f_finalname = self._warc_filename()
-                self._fpath = os.path.sep.join([
-                    self.directory, self._f_finalname + self._f_open_suffix])
+        self.finalname = self.next_filename(serial)
+        self.path = os.path.sep.join(
+                [self.directory, self.finalname + self.open_suffix])
 
-                self._f = open(self._fpath, 'wb')
-                # if no '.open' suffix is used for WARC, acquire an exclusive
-                # file lock.
-                if self._f_open_suffix == '':
-                    try:
-                        fcntl.lockf(self._f, fcntl.LOCK_EX | fcntl.LOCK_NB)
-                    except IOError as exc:
-                        self.logger.error('could not lock file %s (%s)',
-                                self._fpath, exc)
+        self.f = open(self.path, 'wb')
+        # if no '.open' suffix is used for WARC, acquire an exclusive
+        # file lock.
+        if self.open_suffix == '':
+            try:
+                fcntl.lockf(self.f, fcntl.LOCK_EX | fcntl.LOCK_NB)
+            except IOError as exc:
+                self.logger.error(
+                        'could not lock file %s (%s)', self.path, exc)
+        return self.f
 
-                warcinfo_record = self.record_builder.build_warcinfo_record(
-                        self._f_finalname)
-                self.logger.debug(
-                        'warcinfo_record.headers=%s', warcinfo_record.headers)
-                warcinfo_record.write_to(self._f, gzip=self.gzip)
+    def close(self):
+        if self.path:
+            self.logger.trace('closing %s', self.finalname)
+            if self.open_suffix == '':
+                try:
+                    fcntl.lockf(self.f, fcntl.LOCK_UN)
+                except IOError as exc:
+                    self.logger.error(
+                            'could not unlock file %s (%s)', self.path, exc)
+            self.f.close()
+            finalpath = os.path.sep.join(
+                    [self.directory, self.finalname])
+            os.rename(self.path, finalpath)
 
-                self._serial += 1
-
-        return self._f
-
-    def write_records(self, recorded_url):
-        """Returns tuple of records written, which are instances of
-        hanzo.warctools.warc.WarcRecord, decorated with "warc_filename" and
-        "offset" attributes."""
-        records = self.record_builder.build_warc_records(recorded_url)
-
-        with self._lock:
-            writer = self._writer()
-
-            for record in records:
-                offset = writer.tell()
-                record.write_to(writer, gzip=self.gzip)
-                record.offset = offset
-                record.length = writer.tell() - offset
-                record.warc_filename = self._f_finalname
-                self.logger.debug(
-                        'wrote warc record: warc_type=%s content_length=%s '
-                        'url=%s warc=%s offset=%d',
-                        record.get_header(warctools.WarcRecord.TYPE),
-                        record.get_header(warctools.WarcRecord.CONTENT_LENGTH),
-                        record.get_header(warctools.WarcRecord.URL),
-                        self._fpath, record.offset)
-
-            self._f.flush()
-            self._last_activity = time.time()
-
-            return records
+            self.path = None
+            self.f = None
 
     def maybe_idle_rollover(self):
-        with self._lock:
-            if (self._fpath is not None
-                    and self.rollover_idle_time is not None
-                    and self.rollover_idle_time > 0
-                    and time.time() - self._last_activity > self.rollover_idle_time):
-                self.logger.info(
-                        'rolling over %s after %s seconds idle',
-                        self._f_finalname, time.time() - self._last_activity)
-                self.close_writer()
+        if (self.path and self.rollover_idle_time
+                and self.rollover_idle_time > 0
+                and time.time() - self.last_activity > self.rollover_idle_time):
+            self.logger.info(
+                    'rolling over %s after %0.1f seconds idle',
+                    self.finalname, time.time() - self.last_activity)
+            self.close()
 
-class MultiWarcWriter(WarcWriter):
-    logger = logging.getLogger("warcprox.writer.MultiWarcWriter")
+    def maybe_size_rollover(self):
+        if self.path and os.path.getsize(self.path) > self.rollover_size:
+            self.logger.info(
+                    'rolling over %s because it has reached %s bytes in size',
+                    self.finalname, os.path.getsize(self.path))
+            self.close()
 
+class WarcWriter:
+    logger = logging.getLogger('warcprox.writer.WarcWriter')
+
     def __init__(self, options=warcprox.Options()):
-        super().__init__(options)
-        self._thread_num = options.writer_threads
-        self._f = [None] * self._thread_num
-        self._fpath = [None] * self._thread_num
-        self._f_finalname = [None] * self._thread_num
-        self._lock = [threading.RLock()] * self._thread_num
-        self._available_threads = queue.Queue()
-        for i in range(self._thread_num):
-            self._available_threads.put(i)
         self.options = options
 
-    def _writer(self, curr):
-        with self._lock[curr]:
-            if self._fpath[curr] and os.path.getsize(
-                    self._fpath[curr]) > self.rollover_size:
-                self.close_writer(curr)
+        self.gzip = options.gzip or False
+        self.record_builder = warcprox.warc.WarcRecordBuilder(
+                digest_algorithm=options.digest_algorithm or 'sha1',
+                base32=options.base32)
 
-            if self._f[curr] == None:
-                self._f_finalname[curr] = self._warc_filename()
-                self._fpath[curr] = os.path.sep.join([
-                    self.directory, self._f_finalname[curr] + self._f_open_suffix])
+        self._available_warcs = queue.Queue()
+        self._warc_count = 0
+        self._warc_count_lock = threading.Lock()
 
-                self._f[curr] = open(self._fpath[curr], 'wb')
-                # if no '.open' suffix is used for WARC, acquire an exclusive
-                # file lock.
-                if self._f_open_suffix == '':
-                    try:
-                        fcntl.lockf(self._f[curr], fcntl.LOCK_EX | fcntl.LOCK_NB)
-                    except IOError as exc:
-                        self.logger.error('could not lock file %s (%s)',
-                                self._fpath, exc)
+        self._serial = 0
+        self._serial_lock = threading.Lock()
 
-                warcinfo_record = self.record_builder.build_warcinfo_record(
-                        self._f_finalname[curr])
-                self.logger.debug(
-                        'warcinfo_record.headers=%s', warcinfo_record.headers)
-                warcinfo_record.write_to(self._f[curr], gzip=self.gzip)
+        self._randomtoken = ''.join(
+                random.sample('abcdefghijklmnopqrstuvwxyz0123456789', 8))
 
+    def _bespeak_warc(self):
+        try:
+            return self._available_warcs.get(block=False)
+        except queue.Empty:
+            with self._warc_count_lock:
+                if self._warc_count < self.options.writer_threads:
+                    self._warc_count += 1
+                    return _OneWritableWarc(self.options, self._randomtoken)
+            # else we're maxed out, wait for one to free up
+            return self._available_warcs.get(block=True)
 
+    @contextlib.contextmanager
+    def _warc(self):
+        warc = self._bespeak_warc()
 
+        warc.maybe_size_rollover()
 
+        # lazy file open
+        if warc.f == None:
+            with self._serial_lock:
+                serial = self._serial
+                self._serial += 1
+            warc.open(serial)
+            warcinfo = self.record_builder.build_warcinfo_record(warc.finalname)
+            self.logger.debug('warcinfo.headers=%s', warcinfo.headers)
+            warcinfo.write_to(warc.f, gzip=self.gzip)
 
-        return self._f[curr]
+        yield warc
 
+        # __exit__()
+        warc.f.flush()
+        warc.last_activity = time.time()
+        self._available_warcs.put(warc)
 
     def write_records(self, recorded_url):
         """Returns tuple of records written, which are instances of
         hanzo.warctools.warc.WarcRecord, decorated with "warc_filename" and
         "offset" attributes."""
         records = self.record_builder.build_warc_records(recorded_url)
-        curr = self._available_threads.get()
-        # we could also remove that lock?? The queue guaranties that no two
-        # threads have the same curr open.
-        with self._lock[curr]:
-            writer = self._writer(curr)
 
+        with self._warc() as warc:
             for record in records:
-                offset = writer.tell()
-                record.write_to(writer, gzip=self.gzip)
+                offset = warc.f.tell()
+                record.write_to(warc.f, gzip=self.gzip)
                 record.offset = offset
-                record.length = writer.tell() - offset
-                record.warc_filename = self._f_finalname[curr]
+                record.length = warc.f.tell() - offset
+                record.warc_filename = warc.finalname
                 self.logger.debug(
                         'wrote warc record: warc_type=%s content_length=%s '
                         'url=%s warc=%s offset=%d',
                         record.get_header(warctools.WarcRecord.TYPE),
                         record.get_header(warctools.WarcRecord.CONTENT_LENGTH),
                         record.get_header(warctools.WarcRecord.URL),
-                        self._fpath[curr], record.offset)
+                        warc.path, record.offset)
 
-            self._f[curr].flush()
-            self._last_activity = time.time()
-        self._available_threads.put(curr)
         return records
 
     def maybe_idle_rollover(self):
-        for curr in range(0, self._thread_num):
-            with self._lock[curr]:
-                if (self._fpath[curr] is not None
-                        and self.rollover_idle_time is not None
-                        and self.rollover_idle_time > 0
-                        and time.time() - self._last_activity > self.rollover_idle_time):
-                    self.logger.info(
-                            'rolling over %s after %s seconds idle',
-                            self._f_finalname[curr], time.time() - self._last_activity)
-                    self.close_writer(curr)
+        warcs = []
+        while True:
+            try:
+                warc = self._available_warcs.get(block=False)
+                warcs.append(warc)
+            except queue.Empty:
+                break
+        for warc in warcs:
+            warc.maybe_idle_rollover()
+            self._available_warcs.put(warc)
 
-    def close_writer(self, curr=None):
-        """When this method is invoked without any argument (program termination)
-        close all writer.
-        """
-        if not curr:
-            for curr in range(0, self._thread_num):
-                self.close_writer(curr)
-            return
-
-        with self._lock[curr]:
-            if self._fpath[curr]:
-                self.logger.info('closing %s', self._f_finalname[curr])
-                if self._f_open_suffix == '':
-                    try:
-                        fcntl.lockf(self._f[curr], fcntl.LOCK_UN)
-                    except IOError as exc:
-                        self.logger.error('could not unlock file %s (%s)',
-                                self._fpath[curr], exc)
-                self._f[curr].close()
-                finalpath = os.path.sep.join(
-                        [self.directory, self._f_finalname[curr]])
-                os.rename(self._fpath[curr], finalpath)
+    def close_writer(self):
+        while self._warc_count > 0:
+            with self._warc_count_lock:
+                warc = self._available_warcs.get()
+                warc.close()
+                self._warc_count -= 1
 
 class WarcWriterPool:
     logger = logging.getLogger("warcprox.writer.WarcWriterPool")
 
     def __init__(self, options=warcprox.Options()):
-        if options.writer_threads:
-            self.default_warc_writer = MultiWarcWriter(options=options)
-        else:
-            self.default_warc_writer = WarcWriter(options=options)
+        self.default_warc_writer = WarcWriter(options)
         self.warc_writers = {}  # {prefix:WarcWriter}
         self.options = options
         self._lock = threading.RLock()
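The _warc() context manager above is what keeps write_records() simple: checkout from _available_warcs, size rollover, lazy open, write, then flush and check-in all happen in one place. Note that the check-in after the yield is not wrapped in try/finally, so an exception raised while writing would skip returning the warc to the queue; a more defensive sketch of the same shape, reusing the hypothetical pool names from the earlier example:

    import contextlib

    @contextlib.contextmanager
    def checked_out(pool):
        member = pool.checkout()
        try:
            yield member  # caller writes records here
        finally:
            # return the member to the pool even if the body raised
            pool.checkin(member)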
@@ -326,12 +250,7 @@ class WarcWriterPool:
         options.prefix = recorded_url.warcprox_meta["warc-prefix"]
         with self._lock:
             if not options.prefix in self.warc_writers:
-                if self.options.writer_threads:
-                    self.warc_writers[options.prefix] = MultiWarcWriter(
-                            options=options)
-                else:
-                    self.warc_writers[options.prefix] = WarcWriter(
-                            options=options)
+                self.warc_writers[options.prefix] = WarcWriter(options)
             w = self.warc_writers[options.prefix]
         return w
 
@@ -32,32 +32,42 @@ import time
 import warcprox
 from concurrent import futures
 
-class WarcWriterThread(warcprox.BaseStandardPostfetchProcessor):
-    logger = logging.getLogger("warcprox.writerthread.WarcWriterThread")
+class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor):
+    logger = logging.getLogger("warcprox.writerthread.WarcWriterProcessor")
 
     _ALWAYS_ACCEPT = {'WARCPROX_WRITE_RECORD'}
 
     def __init__(self, options=warcprox.Options()):
         warcprox.BaseStandardPostfetchProcessor.__init__(self, options=options)
         self.options = options
         self.writer_pool = warcprox.writer.WarcWriterPool(options)
         self.method_filter = set(method.upper() for method in self.options.method_filter or [])
+        self.pool = futures.ThreadPoolExecutor(max_workers=options.writer_threads or 1)
+        self.batch = set()
 
     def _get_process_put(self):
-        try:
-            warcprox.BaseStandardPostfetchProcessor._get_process_put(self)
-        finally:
-            self.writer_pool.maybe_idle_rollover()
+        recorded_url = self.inq.get(block=True, timeout=0.5)
+        self.batch.add(recorded_url)
+        self.pool.submit(self._process_url, recorded_url)
 
     def _process_url(self, recorded_url):
-        records = []
-        if self._should_archive(recorded_url):
-            records = self.writer_pool.write_records(recorded_url)
-        recorded_url.warc_records = records
-        self._log(recorded_url, records)
-        # try to release resources in a timely fashion
-        if recorded_url.response_recorder and recorded_url.response_recorder.tempfile:
-            recorded_url.response_recorder.tempfile.close()
+        try:
+            records = []
+            if self._should_archive(recorded_url):
+                records = self.writer_pool.write_records(recorded_url)
+            recorded_url.warc_records = records
+            self._log(recorded_url, records)
+            # try to release resources in a timely fashion
+            if recorded_url.response_recorder and recorded_url.response_recorder.tempfile:
+                recorded_url.response_recorder.tempfile.close()
+        except:
+            logging.error(
+                    'caught exception processing %s', recorded_url.url,
+                    exc_info=True)
+        finally:
+            self.batch.remove(recorded_url)
+            if self.outq:
+                self.outq.put(recorded_url)
+            self.writer_pool.maybe_idle_rollover()
 
     def _filter_accepts(self, recorded_url):
         if not self.method_filter:
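One design note on the new _get_process_put() above: ThreadPoolExecutor.submit() stores any exception on the Future it returns, and the code discards that Future, so an exception escaping _process_url would vanish silently. That is presumably why the body catches everything itself and logs with exc_info=True. A minimal demonstration of the hazard:

    from concurrent import futures

    executor = futures.ThreadPoolExecutor(max_workers=1)

    def task():
        raise RuntimeError('lost unless the Future is examined')

    future = executor.submit(task)  # nothing is printed anywhere
    # the exception only surfaces if someone calls future.result()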
@@ -94,35 +104,3 @@ class WarcWriterThread(warcprox.BaseStandardPostfetchProcessor):
     def _shutdown(self):
         self.writer_pool.close_writers()
 
-
-class WarcWriterMultiThread(WarcWriterThread):
-    logger = logging.getLogger("warcprox.writerthread.WarcWriterMultiThread")
-
-    def __init__(self, options=warcprox.Options()):
-        warcprox.BaseStandardPostfetchProcessor.__init__(self, options=options)
-        self.pool = futures.ThreadPoolExecutor(max_workers=options.writer_threads)
-        self.batch = set()
-        self.options = options
-        self.writer_pool = warcprox.writer.WarcWriterPool(options)
-        self.method_filter = set(method.upper() for method in self.options.method_filter or [])
-
-    def _get_process_put(self):
-        recorded_url = self.inq.get(block=True, timeout=0.5)
-        self.batch.add(recorded_url)
-        self.pool.submit(self._process_url, recorded_url)
-
-    def _process_url(self, recorded_url):
-        try:
-            records = []
-            if self._should_archive(recorded_url):
-                records = self.writer_pool.write_records(recorded_url)
-            recorded_url.warc_records = records
-            self._log(recorded_url, records)
-            # try to release resources in a timely fashion
-            if recorded_url.response_recorder and recorded_url.response_recorder.tempfile:
-                recorded_url.response_recorder.tempfile.close()
-        finally:
-            self.batch.remove(recorded_url)
-            if self.outq:
-                self.outq.put(recorded_url)
-            self.writer_pool.maybe_idle_rollover()