mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
make sure we always format WARC-Date properly
We started getting some WARC-Dates like this:

> WARC-Date: 2018-11-04T06:34:35+00:00Z

but only rarely. The warctools library function we were using to format the timestamps looks like this:

    def warc_datetime_str(d):
        s = d.isoformat()
        if '.' in s:
            s = s[:s.find('.')]
        return (s + 'Z').encode('utf-8')

isoformat() adds a UTC offset like "+00:00" if the datetime has a timezone. And it turns out that `isoformat()` leaves off the fractional part if it's zero. In that case we don't get inside the if statement there and don't chop off the offset, so the 'Z' is appended after it.

Theoretically this case should only happen once in every million records, but in practice we are seeing it more often than that (maybe in the ballpark of 1/1000). It could be that there's a codepath that produces a timestamp with no microsecond part but I'm not seeing that in the warcprox code.

In any case, this is the fix.
This commit is contained in:
parent
1460040789
commit
9837d3e3a6
@ -34,6 +34,9 @@ class WarcRecordBuilder:
|
||||
self.digest_algorithm = digest_algorithm
|
||||
self.base32 = base32
|
||||
|
||||
def format_warc_date(self, dt):
    """Format `dt` as a WARC-Date header value.

    The result always has the shape b'YYYY-MM-DDTHH:MM:SSZ': second
    precision, no fractional part, no numeric UTC offset, and a literal
    trailing 'Z'.

    Args:
        dt: a datetime.datetime. A naive value is formatted as-is
            (callers pass utcnow()-style values, so it is treated as
            already being UTC). A timezone-aware value is converted to
            UTC first.

    Returns:
        The formatted date as ASCII-encoded bytes.
    """
    offset = dt.utcoffset()
    if offset is not None:
        # strftime() formats the wall-clock fields of `dt` verbatim, so an
        # aware datetime in a non-UTC zone would otherwise be written out
        # as local time mislabelled with 'Z'. Shift it to UTC first.
        dt = (dt - offset).replace(tzinfo=None)
    return dt.strftime('%Y-%m-%dT%H:%M:%SZ').encode('ascii')
|
||||
|
||||
def _build_response_principal_record(self, recorded_url, warc_date):
|
||||
"""Builds response or revisit record, whichever is appropriate."""
|
||||
if hasattr(recorded_url, "dedup_info") and recorded_url.dedup_info:
|
||||
@ -70,7 +73,7 @@ class WarcRecordBuilder:
|
||||
|
||||
def build_warc_records(self, recorded_url):
|
||||
"""Returns a tuple of hanzo.warctools.warc.WarcRecord (principal_record, ...)"""
|
||||
warc_date = warctools.warc.warc_datetime_str(recorded_url.timestamp)
|
||||
warc_date = self.format_warc_date(recorded_url.timestamp)
|
||||
|
||||
if recorded_url.response_recorder:
|
||||
principal_record = self._build_response_principal_record(recorded_url, warc_date)
|
||||
@ -98,7 +101,7 @@ class WarcRecordBuilder:
|
||||
content_length=None):
|
||||
|
||||
if warc_date is None:
|
||||
warc_date = warctools.warc.warc_datetime_str(datetime.datetime.utcnow())
|
||||
warc_date = self.format_warc_date(datetime.datetime.utcnow())
|
||||
|
||||
record_id = warctools.WarcRecord.random_warc_uuid()
|
||||
|
||||
@ -175,7 +178,7 @@ class WarcRecordBuilder:
|
||||
return output
|
||||
|
||||
def build_warcinfo_record(self, filename):
|
||||
warc_record_date = warctools.warc.warc_datetime_str(datetime.datetime.utcnow())
|
||||
warc_record_date = self.format_warc_date(datetime.datetime.utcnow())
|
||||
record_id = warctools.WarcRecord.random_warc_uuid()
|
||||
|
||||
headers = []
|
||||
|
Loading…
x
Reference in New Issue
Block a user