From edeae3b21abc24ad269d7b2686eb3720a8de32c6 Mon Sep 17 00:00:00 2001
From: Adam Miller
Date: Wed, 22 Jul 2020 21:36:39 +0000
Subject: [PATCH] Expand logging to handle DNS failures, print error messages
 in the crawl log extra info, and report cached connection errors

---
 warcprox/crawl_log.py | 30 +++++++++++++++++++++++++++---
 warcprox/mitmproxy.py |  4 ++--
 warcprox/warcproxy.py |  4 +++-
 3 files changed, 32 insertions(+), 6 deletions(-)

diff --git a/warcprox/crawl_log.py b/warcprox/crawl_log.py
index eea17d5..6c847bb 100644
--- a/warcprox/crawl_log.py
+++ b/warcprox/crawl_log.py
@@ -25,7 +25,7 @@ import json
 import os
 import warcprox
 import socket
-from urllib3.exceptions import TimeoutError, HTTPError
+from urllib3.exceptions import TimeoutError, HTTPError, NewConnectionError, MaxRetryError
 
 class CrawlLogger(object):
     def __init__(self, dir_, options=warcprox.Options()):
@@ -43,6 +43,10 @@ class CrawlLogger(object):
         now = datetime.datetime.utcnow()
         status = self.get_artificial_status(recorded_url)
         extra_info = {'contentSize': recorded_url.size,} if recorded_url.size is not None and recorded_url.size > 0 else {}
+        if hasattr(recorded_url, 'exception') and recorded_url.exception is not None:
+            extra_info['exception'] = str(recorded_url.exception)
+        if hasattr(recorded_url, 'message') and recorded_url.message is not None:
+            extra_info['exceptionMessage'] = str(recorded_url.message)
         if records:
             extra_info['warcFilename'] = records[0].warc_filename
             extra_info['warcFileOffset'] = records[0].offset
@@ -95,8 +99,28 @@ class CrawlLogger(object):
             f.write(line)
 
     def get_artificial_status(self, recorded_url):
-        if hasattr(recorded_url, 'exception') and isinstance(recorded_url.exception, (socket.timeout, TimeoutError, )):
-            return '-2'
+        # urllib3 does not specify DNS errors. We must parse them from the exception string.
+        # Unfortunately, the errors are reported differently on different systems.
+        # https://stackoverflow.com/questions/40145631
+
+        if hasattr(recorded_url, 'exception') and isinstance(recorded_url.exception, (MaxRetryError, )):
+            return '-8'
+        elif hasattr(recorded_url, 'exception') and isinstance(recorded_url.exception, (NewConnectionError, )):
+            exception_string = str(recorded_url.exception)
+            if ("[Errno 11001] getaddrinfo failed" in exception_string or                    # Windows
+                    "[Errno -2] Name or service not known" in exception_string or            # Linux
+                    "[Errno -3] Temporary failure in name resolution" in exception_string or # Linux
+                    "[Errno 8] nodename nor servname " in exception_string):                 # OS X
+                return '-6'  # DNS Failure
+            else:
+                return '-2'  # Other Connection Failure
+        elif hasattr(recorded_url, 'exception') and isinstance(recorded_url.exception, (socket.timeout, TimeoutError, )):
+            return '-2'  # Connection Timeout
+        elif isinstance(recorded_url, warcprox.warcproxy.FailedUrl):
+            # synthetic status, used when some other status (such as connection-lost)
+            # is considered by policy the same as a document-not-found;
+            # cached failures result in a FailedUrl with no exception
+            return '-404'
         else:
             return recorded_url.status
 
diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py
index 36d20db..cb2693a 100644
--- a/warcprox/mitmproxy.py
+++ b/warcprox/mitmproxy.py
@@ -78,7 +78,7 @@ import collections
 import cProfile
 from urllib3 import PoolManager
 from urllib3.util import is_connection_dropped
-from urllib3.exceptions import TimeoutError, HTTPError
+from urllib3.exceptions import TimeoutError, HTTPError, NewConnectionError
 import doublethink
 from cachetools import TTLCache
 from threading import RLock
@@ -407,7 +407,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
             cached = self.server.bad_hostnames_ports.get(hostname_port)
             if cached:
                 self.logger.info('Cannot connect to %s (cache)', hostname_port)
-                self.send_error(cached)
+                self.send_error(cached, exception=Exception('Cached Failed Connection'))
                 return
             # Connect to destination
             self._connect_to_remote_server()
diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py
index a586cee..6dd58b6 100644
--- a/warcprox/warcproxy.py
+++ b/warcprox/warcproxy.py
@@ -372,6 +372,7 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
                 duration=None,
                 referer=self.headers.get('referer'),
                 do_not_archive=True,
+                message=message,
                 exception=exception)
         self.server.recorded_url_q.put(failed_url)
 
@@ -441,13 +442,14 @@ class FailedUrl(RequestedUrl):
 
     def __init__(self, url, request_data, warcprox_meta=None, status=None,
             client_ip=None, method=None, timestamp=None, host=None, duration=None,
-            referer=None, do_not_archive=True, exception=None):
+            referer=None, do_not_archive=True, message=None, exception=None):
         super().__init__(url, request_data, warcprox_meta=warcprox_meta,
                 status=status, client_ip=client_ip, method=method,
                 timestamp=timestamp, host=host, duration=duration,
                 referer=referer, do_not_archive=do_not_archive)
+        self.message = message
         self.exception = exception
 
 class RecordedUrl(RequestedUrl):
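Note: the short sketch below illustrates the failure classification that the patched CrawlLogger.get_artificial_status() is intended to perform. It is a standalone approximation, not code from warcprox: FakeFailedUrl and classify() are hypothetical stand-ins for warcprox's recorded-URL objects and the patched method, and the sample error text is one of the Linux strings matched above.

import socket
from urllib3.exceptions import TimeoutError, NewConnectionError, MaxRetryError

class FakeFailedUrl:
    # hypothetical stand-in for warcprox's FailedUrl/RecordedUrl objects
    def __init__(self, exception=None, status=None):
        self.exception = exception
        self.status = status

def classify(recorded_url):
    # mirrors the decision logic added to get_artificial_status(), minus the
    # warcprox.warcproxy.FailedUrl check, so it can run outside warcprox
    exc = getattr(recorded_url, 'exception', None)
    if isinstance(exc, MaxRetryError):
        return '-8'                      # urllib3 gave up after retries
    elif isinstance(exc, NewConnectionError):
        exception_string = str(exc)
        if ("[Errno 11001] getaddrinfo failed" in exception_string or
                "[Errno -2] Name or service not known" in exception_string or
                "[Errno -3] Temporary failure in name resolution" in exception_string or
                "[Errno 8] nodename nor servname " in exception_string):
            return '-6'                  # DNS failure
        return '-2'                      # other connection failure
    elif isinstance(exc, (socket.timeout, TimeoutError)):
        return '-2'                      # connection timeout
    return recorded_url.status           # no exception: pass the HTTP status through

# str(NewConnectionError(conn, message)) includes the message text, so the
# Linux getaddrinfo error string maps to the DNS-failure status '-6'.
dns_error = NewConnectionError(
    None, 'Failed to establish a new connection: [Errno -2] Name or service not known')
print(classify(FakeFailedUrl(exception=dns_error)))  # '-6'
print(classify(FakeFailedUrl(status=200)))           # 200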
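A second sketch shows how the new 'exception' and 'exceptionMessage' entries would land in the crawl log's extra-info JSON. build_extra_info() is a hypothetical helper that mirrors the guards added in crawl_log.py, and the cached-connection example reuses the exception text introduced in mitmproxy.py; the values are illustrative, not captured warcprox output.

import json

def build_extra_info(recorded_url):
    # mirrors the new guards in crawl_log.py: only add keys that are present
    extra_info = {}
    if hasattr(recorded_url, 'exception') and recorded_url.exception is not None:
        extra_info['exception'] = str(recorded_url.exception)
    if hasattr(recorded_url, 'message') and recorded_url.message is not None:
        extra_info['exceptionMessage'] = str(recorded_url.message)
    return extra_info

class FakeFailedUrl:
    # hypothetical stand-in, as in the previous sketch, plus a message field
    def __init__(self, exception=None, message=None):
        self.exception = exception
        self.message = message

# cached-failure case from mitmproxy.py: an exception is attached, and whatever
# message send_error() received is carried through FailedUrl
failed = FakeFailedUrl(
    exception=Exception('Cached Failed Connection'),
    message='Bad Gateway')  # assumed message value, for illustration only
print(json.dumps(build_extra_info(failed)))
# {"exception": "Cached Failed Connection", "exceptionMessage": "Bad Gateway"}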