Expand crawl logging: handle DNS failures, include the error message in the crawl log's extra info, and report cached connection errors.

This commit is contained in:
Adam Miller 2020-07-22 21:36:39 +00:00
parent b34419543f
commit edeae3b21a
3 changed files with 32 additions and 6 deletions

View File

@ -25,7 +25,7 @@ import json
import os
import warcprox
import socket
from urllib3.exceptions import TimeoutError, HTTPError
from urllib3.exceptions import TimeoutError, HTTPError, NewConnectionError, MaxRetryError
class CrawlLogger(object):
def __init__(self, dir_, options=warcprox.Options()):
@ -43,6 +43,10 @@ class CrawlLogger(object):
now = datetime.datetime.utcnow()
status = self.get_artificial_status(recorded_url)
extra_info = {'contentSize': recorded_url.size,} if recorded_url.size is not None and recorded_url.size > 0 else {}
if hasattr(recorded_url, 'exception') and recorded_url.exception is not None:
extra_info['exception'] = str(recorded_url.exception)
if(hasattr(recorded_url, 'message') and recorded_url.message is not None):
extra_info['exceptionMessage'] = str(recorded_url.message)
if records:
extra_info['warcFilename'] = records[0].warc_filename
extra_info['warcFileOffset'] = records[0].offset
@ -95,8 +99,28 @@ class CrawlLogger(object):
f.write(line)
def get_artificial_status(self, recorded_url):
    """Return the status to write to the crawl log for ``recorded_url``.

    When the url carries an ``exception`` attribute, the exception type is
    mapped to a synthetic negative status string; otherwise the url's own
    ``status`` is returned unchanged.

    Returns:
        '-8'   if the exception is a ``MaxRetryError``
               (presumably "retries exhausted" -- TODO confirm against the
               crawl log status conventions),
        '-6'   if it is a ``NewConnectionError`` whose text looks like a
               DNS resolution failure,
        '-2'   for any other ``NewConnectionError`` or for a timeout,
        '-404' for a ``FailedUrl`` that matched none of the above,
        ``recorded_url.status`` in every other case.
    """
    exception = getattr(recorded_url, 'exception', None)

    if isinstance(exception, MaxRetryError):
        return '-8'

    # This check must come before the timeout check below: in urllib3 1.x
    # NewConnectionError derives from ConnectTimeoutError, so testing for
    # TimeoutError first would make the DNS branch unreachable.
    if isinstance(exception, NewConnectionError):
        # urllib3 does not expose DNS errors as a distinct type, so they are
        # recognised from the exception string. Unfortunately the wording
        # differs between platforms.
        # https://stackoverflow.com/questions/40145631
        exception_string = str(exception)
        dns_failure_markers = (
            "[Errno 11001] getaddrinfo failed",                 # Windows
            "[Errno -2] Name or service not known",             # Linux
            "[Errno -3] Temporary failure in name resolution",  # Linux
            "[Errno 8] nodename nor servname ",                 # OS X
        )
        if any(marker in exception_string for marker in dns_failure_markers):
            return '-6'  # DNS Failure
        return '-2'  # Other Connection Failure

    if isinstance(exception, (socket.timeout, TimeoutError)):
        return '-2'  # Connection Timeout

    if isinstance(recorded_url, warcprox.warcproxy.FailedUrl):
        # synthetic status, used when some other status (such as
        # connection-lost) is considered by policy the same as a
        # document-not-found.
        # Cached failures result in FailedUrl with no Exception
        return '-404'

    return recorded_url.status

View File

@ -78,7 +78,7 @@ import collections
import cProfile
from urllib3 import PoolManager
from urllib3.util import is_connection_dropped
from urllib3.exceptions import TimeoutError, HTTPError
from urllib3.exceptions import TimeoutError, HTTPError, NewConnectionError
import doublethink
from cachetools import TTLCache
from threading import RLock
@ -407,7 +407,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
cached = self.server.bad_hostnames_ports.get(hostname_port)
if cached:
self.logger.info('Cannot connect to %s (cache)', hostname_port)
self.send_error(cached)
self.send_error(cached, exception=Exception('Cached Failed Connection'))
return
# Connect to destination
self._connect_to_remote_server()

View File

@ -372,6 +372,7 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
duration=None,
referer=self.headers.get('referer'),
do_not_archive=True,
message=message,
exception=exception)
self.server.recorded_url_q.put(failed_url)
@ -441,13 +442,14 @@ class FailedUrl(RequestedUrl):
def __init__(self, url, request_data, warcprox_meta=None, status=None,
        client_ip=None, method=None, timestamp=None, host=None, duration=None,
        referer=None, do_not_archive=True, message=None, exception=None):
    """Create a record of a url that could not be fetched.

    All arguments except ``message`` and ``exception`` are forwarded
    unchanged to the parent class constructor.

    Args:
        message: stored as ``self.message``; defaults to None.
        exception: stored as ``self.exception``; defaults to None.
    """
    super().__init__(url, request_data, warcprox_meta=warcprox_meta,
            status=status, client_ip=client_ip, method=method,
            timestamp=timestamp, host=host, duration=duration,
            referer=referer, do_not_archive=do_not_archive)
    # Kept on the instance so consumers of the failed url can report both.
    self.message = message
    self.exception = exception
class RecordedUrl(RequestedUrl):