From 14600407899afe09bb3b8ad3666c2d837fb7b4f9 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 31 Oct 2018 16:23:00 -0700 Subject: [PATCH 1/3] bump version after merge --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 9e3dfee..cf17125 100755 --- a/setup.py +++ b/setup.py @@ -40,7 +40,7 @@ except: setuptools.setup( name='warcprox', - version='2.4b3.dev188', + version='2.4b3.dev189', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', From 9837d3e3a6aba3ea3b344f3ca48d1632f8cd1c9c Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 6 Nov 2018 11:21:12 -0800 Subject: [PATCH 2/3] make sure we always format WARC-Date properly We started getting some WARC-Dates like this: > WARC-Date: 2018-11-04T06:34:35+00:00Z but only rarely. The warctools library function we were using to format the timestamps looks like this: def warc_datetime_str(d): s = d.isoformat() if '.' in s: s = s[:s.find('.')] return (s + 'Z').encode('utf-8') isoformat() adds a UTC offset like "+00:00" if the datetime has a timezone. And it turns out that `isoformat()` leaves off the fractional part if it's zero. In that case we don't get inside the if statement there and don't chop off the UTC offset. Theoretically this case should only happen once in every million records, but in practice we are seeing it more often than that (maybe in the ballpark of 1/1000). It could be that there's a codepath that produces a timestamp with no microsecond part but I'm not seeing that in the warcprox code. In any case, this is the fix. 
--- warcprox/warc.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/warcprox/warc.py b/warcprox/warc.py index 21d0f5d..94fe137 100644 --- a/warcprox/warc.py +++ b/warcprox/warc.py @@ -34,6 +34,9 @@ class WarcRecordBuilder: self.digest_algorithm = digest_algorithm self.base32 = base32 + def format_warc_date(self, dt): + return dt.strftime('%Y-%m-%dT%H:%M:%SZ').encode('ascii') + def _build_response_principal_record(self, recorded_url, warc_date): """Builds response or revisit record, whichever is appropriate.""" if hasattr(recorded_url, "dedup_info") and recorded_url.dedup_info: @@ -70,7 +73,7 @@ class WarcRecordBuilder: def build_warc_records(self, recorded_url): """Returns a tuple of hanzo.warctools.warc.WarcRecord (principal_record, ...)""" - warc_date = warctools.warc.warc_datetime_str(recorded_url.timestamp) + warc_date = self.format_warc_date(recorded_url.timestamp) if recorded_url.response_recorder: principal_record = self._build_response_principal_record(recorded_url, warc_date) @@ -98,7 +101,7 @@ class WarcRecordBuilder: content_length=None): if warc_date is None: - warc_date = warctools.warc.warc_datetime_str(datetime.datetime.utcnow()) + warc_date = self.format_warc_date(datetime.datetime.utcnow()) record_id = warctools.WarcRecord.random_warc_uuid() @@ -175,7 +178,7 @@ class WarcRecordBuilder: return output def build_warcinfo_record(self, filename): - warc_record_date = warctools.warc.warc_datetime_str(datetime.datetime.utcnow()) + warc_record_date = self.format_warc_date(datetime.datetime.utcnow()) record_id = warctools.WarcRecord.random_warc_uuid() headers = [] From 4f836e91793b3dffb3914b1f4b345ca758ae649d Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 6 Nov 2018 11:29:33 -0800 Subject: [PATCH 3/3] bump version number --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index cf17125..a948dec 100755 --- a/setup.py +++ b/setup.py @@ -40,7 +40,7 @@ except: setuptools.setup( 
name='warcprox', - version='2.4b3.dev189', + version='2.4b3.dev190', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt',