mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
Merge pull request #136 from vbanos/save-stat
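Optimise WarcWriter.maybe_size_rollover(): reuse the file offset already obtained from self.f.tell() after writing records instead of calling os.path.getsize(), and run the size check after each write rather than in ensure_open()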
This commit is contained in:
commit
0abb1808b2
@ -115,10 +115,8 @@ class WarcWriter:
|
|||||||
'''
|
'''
|
||||||
Ensures `self.f` is ready to write the next warc record.
|
Ensures `self.f` is ready to write the next warc record.
|
||||||
|
|
||||||
Closes current warc if size limit has been reached. Then, if warc is
|
If warc is not open, opens one, and writes the warcinfo record.
|
||||||
not open, opens one, and writes the warcinfo record.
|
|
||||||
'''
|
'''
|
||||||
self.maybe_size_rollover()
|
|
||||||
if not self.f:
|
if not self.f:
|
||||||
serial = self.serial
|
serial = self.serial
|
||||||
self.serial += 1
|
self.serial += 1
|
||||||
@ -136,11 +134,14 @@ class WarcWriter:
|
|||||||
records = self.record_builder.build_warc_records(recorded_url)
|
records = self.record_builder.build_warc_records(recorded_url)
|
||||||
|
|
||||||
self.ensure_open()
|
self.ensure_open()
|
||||||
|
total_warc_file_size = None
|
||||||
for record in records:
|
for record in records:
|
||||||
offset = self.f.tell()
|
offset = self.f.tell()
|
||||||
record.write_to(self.f, gzip=self.gzip)
|
record.write_to(self.f, gzip=self.gzip)
|
||||||
record.offset = offset
|
record.offset = offset
|
||||||
record.length = self.f.tell() - offset
|
offset2 = self.f.tell()
|
||||||
|
record.length = offset2 - offset
|
||||||
|
total_warc_file_size = offset2
|
||||||
record.warc_filename = self.finalname
|
record.warc_filename = self.finalname
|
||||||
self.logger.trace(
|
self.logger.trace(
|
||||||
'wrote warc record: warc_type=%s content_length=%s '
|
'wrote warc record: warc_type=%s content_length=%s '
|
||||||
@ -150,7 +151,8 @@ class WarcWriter:
|
|||||||
self.path, record.get_header(warctools.WarcRecord.URL))
|
self.path, record.get_header(warctools.WarcRecord.URL))
|
||||||
self.f.flush()
|
self.f.flush()
|
||||||
self.last_activity = time.time()
|
self.last_activity = time.time()
|
||||||
|
# Closes current warc if size limit has been reached.
|
||||||
|
self.maybe_size_rollover(total_warc_file_size)
|
||||||
return records
|
return records
|
||||||
|
|
||||||
def close(self):
|
def close(self):
|
||||||
@ -185,11 +187,11 @@ class WarcWriter:
|
|||||||
self.finalname, time.time() - self.last_activity)
|
self.finalname, time.time() - self.last_activity)
|
||||||
self.close()
|
self.close()
|
||||||
|
|
||||||
def maybe_size_rollover(self):
|
def maybe_size_rollover(self, total_warc_file_size):
|
||||||
if self.path and os.path.getsize(self.path) > self.rollover_size:
|
if total_warc_file_size and total_warc_file_size > self.rollover_size:
|
||||||
self.logger.info(
|
self.logger.info(
|
||||||
'rolling over %s because it has reached %s bytes in size',
|
'rolling over %s because it has reached %s bytes in size',
|
||||||
self.finalname, os.path.getsize(self.path))
|
self.finalname, total_warc_file_size)
|
||||||
self.close()
|
self.close()
|
||||||
|
|
||||||
class WarcWriterPool:
|
class WarcWriterPool:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user