diff --git a/warcprox/crawl_log.py b/warcprox/crawl_log.py
index 2f7ea5e..0ba075b 100644
--- a/warcprox/crawl_log.py
+++ b/warcprox/crawl_log.py
@@ -40,7 +40,7 @@ class CrawlLogger(object):
     def notify(self, recorded_url, records):
         # 2017-08-03T21:45:24.496Z 200 2189 https://autismcouncil.wisconsin.gov/robots.txt P https://autismcouncil.wisconsin.gov/ text/plain #001 20170803214523617+365 sha1:PBS2CEF7B4OSEXZZF3QE2XN2VHYCPNPX https://autismcouncil.wisconsin.gov/ duplicate:digest {"warcFileOffset":942,"contentSize":2495,"warcFilename":"ARCHIVEIT-2159-TEST-JOB319150-20170803214522386-00000.warc.gz"}
         now = datetime.datetime.utcnow()
-        extra_info = {'contentSize': recorded_url.size,}
+        extra_info = {'contentSize': recorded_url.size,} if recorded_url.size > 0 else {}
         if records:
             extra_info['warcFilename'] = records[0].warc_filename
             extra_info['warcFileOffset'] = records[0].offset
@@ -51,10 +51,13 @@ class CrawlLogger(object):
             payload_digest = warcprox.digest_str(
                     recorded_url.payload_digest,
                     self.options.base32)
-        else:
+        elif records:
             # WARCPROX_WRITE_RECORD request
             content_length = int(records[0].get_header(b'Content-Length'))
             payload_digest = records[0].get_header(b'WARC-Payload-Digest')
+        else:
+            content_length = 0
+            payload_digest = '-'
         fields = [
             '{:%Y-%m-%dT%H:%M:%S}.{:03d}Z'.format(now, now.microsecond//1000),
             '% 5s' % recorded_url.status,
@@ -67,7 +70,7 @@ class CrawlLogger(object):
             '{:%Y%m%d%H%M%S}{:03d}+{:03d}'.format(
                 recorded_url.timestamp,
                 recorded_url.timestamp.microsecond//1000,
-                recorded_url.duration.microseconds//1000),
+                recorded_url.duration.microseconds//1000) if (recorded_url.timestamp is not None and recorded_url.duration is not None) else '-',
             payload_digest,
             recorded_url.warcprox_meta.get('metadata', {}).get('seed', '-'),
             'duplicate:digest' if records and records[0].type == b'revisit' else '-',
diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py
index 6b32a40..f4279de 100644
--- a/warcprox/mitmproxy.py
+++ b/warcprox/mitmproxy.py
@@ -359,7 +359,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
                 self.logger.error(
                         "problem handling %r: %r", self.requestline, e)
                 if type(e) is socket.timeout:
-                    self.send_error(504, str(e))
+                    self.send_error(-2, str(e))
                 else:
                     self.send_error(500, str(e))
             except Exception as f:
@@ -425,7 +425,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
             response_code = 500
             cache = False
             if isinstance(e, (socket.timeout, TimeoutError,)):
-                response_code = 504
+                response_code = -2
                 cache = True
             elif isinstance(e, HTTPError):
                 response_code = 502
@@ -459,6 +459,11 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
             return
 
     def send_error(self, code, message=None, explain=None):
+        # -2 is an internal pseudo-status that callers pass on a socket
+        # timeout; it is not a valid HTTP status, so the client is
+        # answered with 504 (Gateway Timeout) instead.
+        return_code = 504 if code == -2 else code
+
         # BaseHTTPRequestHandler.send_response_only() in http/server.py
         # does this:
         #     if not hasattr(self, '_headers_buffer'):
@@ -470,7 +475,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
             self._headers_buffer = []
         try:
             return http_server.BaseHTTPRequestHandler.send_error(
-                    self, code, message, explain)
+                    self, return_code, message, explain)
         except Exception as e:
             level = logging.ERROR
             if isinstance(e, OSError) and e.errno == 9:
diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py
index 9d23244..0c780c3 100644
--- a/warcprox/warcproxy.py
+++ b/warcprox/warcproxy.py
@@ -343,13 +343,92 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
         except:
             self.logger.error("uncaught exception in do_WARCPROX_WRITE_RECORD", exc_info=True)
             raise
 
+    def send_error(self, code, message=None, explain=None):
+        """
+        Sends the error response to the client, then queues a
+        `RecordedUrl` describing the failed request on
+        `self.server.recorded_url_q`.
+
+        `code` is recorded as passed in, so the timeout pseudo-status -2
+        is kept in the recorded url even though the parent handler
+        answers the client with 504.
+        """
+        super().send_error(code, message, explain)
+
+        if not getattr(self, 'command', None) or not hasattr(self, 'headers'):
+            # the error was raised before the request line and headers
+            # were parsed, so there is no request to rebuild and record
+            return
+
+        # Build request
+        req_str = '{} {} {}\r\n'.format(
+                self.command, self.path, self.request_version)
+
+        # Swallow headers that don't make sense to forward on, i.e. most
+        # hop-by-hop headers. http://tools.ietf.org/html/rfc2616#section-13.5.
+        # self.headers is an email.message.Message, which is case-insensitive
+        # and doesn't throw KeyError in __delitem__
+        for key in (
+                'Connection', 'Proxy-Connection', 'Keep-Alive',
+                'Proxy-Authenticate', 'Proxy-Authorization', 'Upgrade'):
+            del self.headers[key]
+
+        self.headers['Via'] = via_header_value(
+                self.headers.get('Via'),
+                self.request_version.replace('HTTP/', ''))
+
+        # Add headers to the request
+        # XXX in at least python3.3 str(self.headers) uses \n not \r\n :(
+        req_str += '\r\n'.join(
+                '{}: {}'.format(k, v) for (k, v) in self.headers.items())
+
+        warcprox_meta = None
+        raw_warcprox_meta = self.headers.get('Warcprox-Meta')
+        if raw_warcprox_meta:
+            try:
+                warcprox_meta = json.loads(raw_warcprox_meta)
+            except ValueError:
+                # a malformed Warcprox-Meta header must not make the
+                # error path itself raise; record the url without it
+                self.logger.warning(
+                        'ignoring unparseable Warcprox-Meta header %r',
+                        raw_warcprox_meta)
+
+        req = req_str.encode('latin1') + b'\r\n\r\n'
+        recorded_url = RecordedUrl(
+                url=self.url,
+                remote_ip=b'',
+                warcprox_meta=warcprox_meta,
+                status=code,
+                client_ip=self.client_address[0],
+                method=self.command,
+                content_type="unknown",
+                response_recorder=None,
+                request_data=req,
+                duration=None, size=0,
+                timestamp=None, host=self.hostname,
+                do_not_archive=True,
+                referer=self.headers.get('referer'))
+
+        self.server.recorded_url_q.put(recorded_url)
+
     def log_message(self, fmt, *args):
         # logging better handled elsewhere?
         pass
 
 RE_MIMETYPE = re.compile(r'[;\s]')
 
+def via_header_value(orig, request_version):
+    """
+    Returns the value for the `Via` request header with this proxy added.
+
+    `orig` is the existing `Via` value, if any; `request_version` is the
+    protocol version with the "HTTP/" prefix already stripped.
+    """
+    hop = '%s %s' % (request_version, 'warcprox')
+    return '%s, %s' % (orig, hop) if orig else hop
+
 class RecordedUrl:
     logger = logging.getLogger("warcprox.warcproxy.RecordedUrl")
 