fix bugs, improve logging of each warc record

This commit is contained in:
Noah Levitt 2013-10-17 18:35:11 -07:00
parent 568df5360d
commit e01691c1f2

View File

@@ -225,8 +225,8 @@ class WarcProxyHandler(BaseHTTPServer.BaseHTTPRequestHandler):
def _transition_to_ssl(self):
-self.connection = ssl.wrap_socket(self.connection, server_side=True,
-certfile=self.server.ca[self.hostname])
+self.request = self.connection = ssl.wrap_socket(self.connection,
+server_side=True, certfile=self.server.ca[self.hostname])
def do_CONNECT(self):
@@ -279,7 +279,7 @@ class WarcProxyHandler(BaseHTTPServer.BaseHTTPRequestHandler):
self.send_error(500, str(e))
return
else:
-self.url = _construct_tunneled_url()
+self.url = self._construct_tunneled_url()
# Build request
req = '%s %s %s\r\n' % (self.command, self.path, self.request_version)
@@ -506,8 +506,13 @@ class WarcWriterThread(threading.Thread):
writer = self._writer()
for record in recordset:
+offset = writer.tell()
record.write_to(writer, gzip=self.gzip)
-logging.info('wrote warc record {}'.format(record))
+logging.info('wrote warc record: warc_type={} content_length={} url={} warc={} offset={}'.format(
+record.get_header(warctools.WarcRecord.TYPE),
+record.get_header(warctools.WarcRecord.CONTENT_LENGTH),
+record.get_header(warctools.WarcRecord.URL),
+self._fpath, offset))
if record.content_file:
# XXX now we know we're done with this... messy to