From e01691c1f2537082e933db68518edec41382ef57 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 17 Oct 2013 18:35:11 -0700 Subject: [PATCH] fix bugs, improve logging of each warc record --- warcprox.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/warcprox.py b/warcprox.py index 1d8623b..1d4e280 100755 --- a/warcprox.py +++ b/warcprox.py @@ -225,8 +225,8 @@ class WarcProxyHandler(BaseHTTPServer.BaseHTTPRequestHandler): def _transition_to_ssl(self): - self.connection = ssl.wrap_socket(self.connection, server_side=True, - certfile=self.server.ca[self.hostname]) + self.request = self.connection = ssl.wrap_socket(self.connection, + server_side=True, certfile=self.server.ca[self.hostname]) def do_CONNECT(self): @@ -279,7 +279,7 @@ class WarcProxyHandler(BaseHTTPServer.BaseHTTPRequestHandler): self.send_error(500, str(e)) return else: - self.url = _construct_tunneled_url() + self.url = self._construct_tunneled_url() # Build request req = '%s %s %s\r\n' % (self.command, self.path, self.request_version) @@ -506,8 +506,13 @@ class WarcWriterThread(threading.Thread): writer = self._writer() for record in recordset: + offset = writer.tell() record.write_to(writer, gzip=self.gzip) - logging.info('wrote warc record {}'.format(record)) + logging.info('wrote warc record: warc_type={} content_length={} url={} warc={} offset={}'.format( + record.get_header(warctools.WarcRecord.TYPE), + record.get_header(warctools.WarcRecord.CONTENT_LENGTH), + record.get_header(warctools.WarcRecord.URL), + self._fpath, offset)) if record.content_file: # XXX now we know we're done with this... messy to