Optimise WarcWriter.maybe_size_rollover()

Every time we write WARC records to a file, we call
`maybe_size_rollover()` to check whether the current WARC file size is over
the rollover threshold.
To do that, we use `os.path.getsize`, which performs a disk `stat`.

We already know the current WARC file size from the WARC record offset
(`self.f.tell()`). There is no need to call `os.path.getsize`; we can just
reuse the offset info.

This way, we do one fewer disk `stat` every time we write to a WARC file,
which is a nice improvement.
This commit is contained in:
Vangelis Banos 2019-06-11 09:31:54 +00:00
parent 740a80bfdb
commit 4ca10a22d8

View File

@ -115,10 +115,8 @@ class WarcWriter:
'''
Ensures `self.f` is ready to write the next warc record.
Closes current warc if size limit has been reached. Then, if warc is
not open, opens one, and writes the warcinfo record.
If warc is not open, opens one, and writes the warcinfo record.
'''
self.maybe_size_rollover()
if not self.f:
serial = self.serial
self.serial += 1
@ -136,11 +134,14 @@ class WarcWriter:
records = self.record_builder.build_warc_records(recorded_url)
self.ensure_open()
total_warc_file_size = None
for record in records:
offset = self.f.tell()
record.write_to(self.f, gzip=self.gzip)
record.offset = offset
record.length = self.f.tell() - offset
offset2 = self.f.tell()
record.length = offset2 - offset
total_warc_file_size = offset2
record.warc_filename = self.finalname
self.logger.trace(
'wrote warc record: warc_type=%s content_length=%s '
@ -150,7 +151,8 @@ class WarcWriter:
self.path, record.get_header(warctools.WarcRecord.URL))
self.f.flush()
self.last_activity = time.time()
# Closes current warc if size limit has been reached.
self.maybe_size_rollover(total_warc_file_size)
return records
def close(self):
@ -185,11 +187,11 @@ class WarcWriter:
self.finalname, time.time() - self.last_activity)
self.close()
def maybe_size_rollover(self):
if self.path and os.path.getsize(self.path) > self.rollover_size:
def maybe_size_rollover(self, total_warc_file_size):
if total_warc_file_size and total_warc_file_size > self.rollover_size:
self.logger.info(
'rolling over %s because it has reached %s bytes in size',
self.finalname, os.path.getsize(self.path))
self.finalname, total_warc_file_size)
self.close()
class WarcWriterPool: