Merge pull request #136 from vbanos/save-stat

Optimise WarcWriter.maybe_size_rollover()
This commit is contained in:
Noah Levitt 2019-06-11 10:25:15 -07:00 committed by GitHub
commit 0abb1808b2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -115,10 +115,8 @@ class WarcWriter:
         '''
         Ensures `self.f` is ready to write the next warc record.
 
-        Closes current warc if size limit has been reached. Then, if warc is
-        not open, opens one, and writes the warcinfo record.
+        If warc is not open, opens one, and writes the warcinfo record.
         '''
-        self.maybe_size_rollover()
         if not self.f:
             serial = self.serial
             self.serial += 1
@@ -136,11 +134,14 @@ class WarcWriter:
         records = self.record_builder.build_warc_records(recorded_url)
 
         self.ensure_open()
+        total_warc_file_size = None
         for record in records:
             offset = self.f.tell()
             record.write_to(self.f, gzip=self.gzip)
             record.offset = offset
-            record.length = self.f.tell() - offset
+            offset2 = self.f.tell()
+            record.length = offset2 - offset
+            total_warc_file_size = offset2
             record.warc_filename = self.finalname
             self.logger.trace(
                     'wrote warc record: warc_type=%s content_length=%s '
@@ -150,7 +151,8 @@ class WarcWriter:
                     self.path, record.get_header(warctools.WarcRecord.URL))
         self.f.flush()
         self.last_activity = time.time()
-
+        # Closes current warc if size limit has been reached.
+        self.maybe_size_rollover(total_warc_file_size)
         return records
 
     def close(self):
@@ -185,11 +187,11 @@ class WarcWriter:
                     self.finalname, time.time() - self.last_activity)
             self.close()
 
-    def maybe_size_rollover(self):
-        if self.path and os.path.getsize(self.path) > self.rollover_size:
+    def maybe_size_rollover(self, total_warc_file_size):
+        if total_warc_file_size and total_warc_file_size > self.rollover_size:
             self.logger.info(
                     'rolling over %s because it has reached %s bytes in size',
-                    self.finalname, os.path.getsize(self.path))
+                    self.finalname, total_warc_file_size)
             self.close()
 
 class WarcWriterPool: