mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
We started getting some WARC-Dates like this:

    WARC-Date: 2018-11-04T06:34:35+00:00Z

but only rarely. The warctools library function we were using to format the timestamps looks like this:

    def warc_datetime_str(d):
        s = d.isoformat()
        if '.' in s:
            s = s[:s.find('.')]
        return (s + 'Z').encode('utf-8')

`isoformat()` appends a UTC offset like "+00:00" if the datetime has a timezone. And it turns out that `isoformat()` leaves off the fractional part if the microsecond value is zero. In that case we don't get inside the if statement there, so the UTC offset is not chopped off and ends up in front of the 'Z'. Theoretically this case should only happen once in every million records, but in practice we are seeing it more often than that (maybe in the ballpark of 1/1000). It could be that there's a codepath that produces a timestamp with no microsecond part, but I'm not seeing that in the warcprox code. In any case, this is the fix.
205 lines
9.4 KiB
Python
205 lines
9.4 KiB
Python
'''
|
|
warcprox/warc.py - assembles warc records
|
|
|
|
Copyright (C) 2013-2018 Internet Archive
|
|
|
|
This program is free software; you can redistribute it and/or
|
|
modify it under the terms of the GNU General Public License
|
|
as published by the Free Software Foundation; either version 2
|
|
of the License, or (at your option) any later version.
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with this program; if not, write to the Free Software
|
|
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
|
|
USA.
|
|
'''
|
|
|
|
import logging
|
|
import warcprox
|
|
import hashlib
|
|
import socket
|
|
import hanzo.httptools
|
|
from hanzo import warctools
|
|
import datetime
|
|
|
|
class WarcRecordBuilder:
    """Builds ``hanzo.warctools`` WARC records from captured URL data.

    ``digest_algorithm`` is the ``hashlib`` algorithm name used when a
    digest has to be computed here; ``base32`` is passed through to
    ``warcprox.digest_str()`` for every digest header that is written.
    """

    logger = logging.getLogger("warcprox.warc.WarcRecordBuilder")

    def __init__(self, digest_algorithm="sha1", base32=False):
        self.digest_algorithm = digest_algorithm
        self.base32 = base32

    def format_warc_date(self, dt):
        """Return ``dt`` as WARC-Date bytes of the form ``YYYY-MM-DDThh:mm:ssZ``.

        The fractional second is dropped. A timezone-aware ``dt`` with a
        non-zero UTC offset is shifted to UTC first, so that the trailing
        ``Z`` is truthful. A naive ``dt`` is formatted as-is, i.e. it is
        assumed to already be in UTC.
        """
        offset = dt.utcoffset()
        if offset:
            # Arithmetic on the naive value avoids datetime.timezone, which
            # keeps this usable wherever the rest of the module is.
            dt = dt.replace(tzinfo=None) - offset
        return dt.strftime('%Y-%m-%dT%H:%M:%SZ').encode('ascii')

    def _build_response_principal_record(self, recorded_url, warc_date):
        """Builds response or revisit record, whichever is appropriate."""
        dedup_info = getattr(recorded_url, "dedup_info", None)
        if not dedup_info:
            # response record: the whole recorded response is the block
            return self.build_warc_record(
                    url=recorded_url.url, warc_date=warc_date,
                    recorder=recorded_url.response_recorder,
                    warc_type=warctools.WarcRecord.RESPONSE,
                    content_type=hanzo.httptools.ResponseMessage.CONTENT_TYPE,
                    remote_ip=recorded_url.remote_ip,
                    payload_digest=warcprox.digest_str(
                        recorded_url.payload_digest, self.base32),
                    truncated=recorded_url.truncated)

        # revisit record: only the bytes before the payload are kept
        recorder = recorded_url.response_recorder
        recorder.tempfile.seek(0)
        if recorder.payload_offset is not None:
            response_header_block = recorder.tempfile.read(
                    recorder.payload_offset)
        else:
            response_header_block = recorder.tempfile.read()

        return self.build_warc_record(
                url=recorded_url.url, warc_date=warc_date,
                data=response_header_block,
                warc_type=warctools.WarcRecord.REVISIT,
                refers_to=dedup_info.get('id'),
                refers_to_target_uri=dedup_info['url'],
                refers_to_date=dedup_info['date'],
                payload_digest=warcprox.digest_str(
                    recorded_url.payload_digest, self.base32),
                profile=warctools.WarcRecord.PROFILE_IDENTICAL_PAYLOAD_DIGEST,
                content_type=hanzo.httptools.ResponseMessage.CONTENT_TYPE,
                remote_ip=recorded_url.remote_ip)

    def build_warc_records(self, recorded_url):
        """Returns a tuple of hanzo.warctools.warc.WarcRecord (principal_record, ...)

        With a response recorder the tuple is (response-or-revisit record,
        request record); otherwise it is a single record of type
        ``recorded_url.custom_type`` built from ``recorded_url.request_data``.
        """
        warc_date = self.format_warc_date(recorded_url.timestamp)

        if recorded_url.response_recorder:
            principal_record = self._build_response_principal_record(
                    recorded_url, warc_date)
            request_record = self.build_warc_record(
                    url=recorded_url.url, warc_date=warc_date,
                    data=recorded_url.request_data,
                    warc_type=warctools.WarcRecord.REQUEST,
                    content_type=hanzo.httptools.RequestMessage.CONTENT_TYPE,
                    concurrent_to=principal_record.id)
            return principal_record, request_record

        principal_record = self.build_warc_record(
                url=recorded_url.url, warc_date=warc_date,
                data=recorded_url.request_data,
                warc_type=recorded_url.custom_type,
                content_type=recorded_url.content_type.encode("latin1"),
                payload_digest=warcprox.digest_str(
                    recorded_url.payload_digest, self.base32),
                content_length=recorded_url.size)
        return (principal_record,)

    def build_warc_record(self, url, warc_date=None, recorder=None, data=None,
            concurrent_to=None, warc_type=None, content_type=None, remote_ip=None,
            profile=None, refers_to=None, refers_to_target_uri=None,
            refers_to_date=None, payload_digest=None, truncated=None,
            content_length=None):
        """Assemble a single ``warctools.WarcRecord``.

        The block comes from ``recorder`` (its ``tempfile``) when one is
        given, otherwise from ``data``, which may be a bytes-like value or an
        object with a ``read`` attribute. Optional arguments left as ``None``
        produce no header. ``warc_date`` defaults to the current UTC time and
        ``content_length`` to ``len(recorder)`` or ``len(data)``.
        """
        if warc_date is None:
            warc_date = self.format_warc_date(datetime.datetime.utcnow())

        record_id = warctools.WarcRecord.random_warc_uuid()

        headers = []
        if warc_type is not None:
            headers.append((warctools.WarcRecord.TYPE, warc_type))
        headers.append((warctools.WarcRecord.ID, record_id))
        headers.append((warctools.WarcRecord.DATE, warc_date))
        headers.append((warctools.WarcRecord.URL, url))

        # Optional headers, written in this fixed order when a value is given.
        optional_headers = (
            (warctools.WarcRecord.IP_ADDRESS, remote_ip),
            (warctools.WarcRecord.PROFILE, profile),
            (warctools.WarcRecord.REFERS_TO, refers_to),
            (warctools.WarcRecord.REFERS_TO_TARGET_URI, refers_to_target_uri),
            (warctools.WarcRecord.REFERS_TO_DATE, refers_to_date),
            (warctools.WarcRecord.CONCURRENT_TO, concurrent_to),
            (warctools.WarcRecord.CONTENT_TYPE, content_type),
            (warctools.WarcRecord.PAYLOAD_DIGEST, payload_digest),
            # truncated value may be 'length' or 'time'
            (b'WARC-Truncated', truncated),
        )
        headers.extend(
                (name, value) for name, value in optional_headers
                if value is not None)

        # An explicit content_length always wins over the measured length.
        if content_length is None:
            content_length = len(recorder) if recorder is not None else len(data)
        headers.append((
            warctools.WarcRecord.CONTENT_LENGTH,
            str(content_length).encode('latin1')))

        if recorder is not None:
            headers.append((
                warctools.WarcRecord.BLOCK_DIGEST,
                warcprox.digest_str(recorder.block_digest, self.base32)))
            recorder.tempfile.seek(0)
            return warctools.WarcRecord(
                    headers=headers, content_file=recorder.tempfile)

        # no http headers so block digest == payload digest
        # NOTE(review): revisit records arrive here with data set to the HTTP
        # header block and an explicit payload_digest, so the Block-Digest
        # written below is the payload digest rather than a digest of the
        # block -- confirm whether that is intended.
        if not payload_digest:
            payload_digest = warcprox.digest_str(
                    hashlib.new(self.digest_algorithm, data), self.base32)
            headers.append((
                warctools.WarcRecord.PAYLOAD_DIGEST, payload_digest))
        headers.append((warctools.WarcRecord.BLOCK_DIGEST, payload_digest))

        if hasattr(data, 'read'):
            return warctools.WarcRecord(headers=headers, content_file=data)
        return warctools.WarcRecord(
                headers=headers, content=(content_type, data))

    def _local_address(self):
        """Return the local IP address of the socket after a UDP connect.

        Raises whatever ``socket`` raises if the connect or the address
        lookup fails; the socket is closed in every case.
        """
        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        try:
            s.connect(('10.255.255.255', 1)) # ip doesn't need to be reachable
            return s.getsockname()[0]
        finally:
            s.close()

    def build_warcinfo_record(self, filename):
        """Build the warcinfo record for the WARC file named ``filename``.

        The block is an ``application/warc-fields`` body listing the warcprox
        version, this host's name and local IP address, and the WARC format.
        """
        warc_record_date = self.format_warc_date(datetime.datetime.utcnow())
        record_id = warctools.WarcRecord.random_warc_uuid()

        headers = [
            (warctools.WarcRecord.ID, record_id),
            (warctools.WarcRecord.TYPE, warctools.WarcRecord.WARCINFO),
            (warctools.WarcRecord.FILENAME, filename.encode('latin1')),
            (warctools.WarcRecord.DATE, warc_record_date),
        ]

        warcinfo_fields = [
            b'software: warcprox ' + warcprox.__version__.encode('latin1'),
            'hostname: {}'.format(socket.gethostname()).encode('latin1'),
            ('ip: %s' % self._local_address()).encode('latin1'),
            b'format: WARC File Format 1.0',
            # warcinfo_fields.append('robots: ignore')
            # warcinfo_fields.append('description: {0}'.format(self.description))
            # warcinfo_fields.append('isPartOf: {0}'.format(self.is_part_of))
        ]
        data = b'\r\n'.join(warcinfo_fields) + b'\r\n'

        return warctools.WarcRecord(
                headers=headers, content=(b'application/warc-fields', data))
|
|
|