1
0
mirror of https://github.com/internetarchive/warcprox.git synced 2025-01-18 13:22:09 +01:00

Expanding logging to handle DNS failures, write error messages to the crawl log, and report cached connection errors.

This commit is contained in:
Adam Miller 2020-07-22 21:36:39 +00:00
parent b34419543f
commit edeae3b21a
3 changed files with 32 additions and 6 deletions

@ -25,7 +25,7 @@ import json
import os import os
import warcprox import warcprox
import socket import socket
from urllib3.exceptions import TimeoutError, HTTPError from urllib3.exceptions import TimeoutError, HTTPError, NewConnectionError, MaxRetryError
class CrawlLogger(object): class CrawlLogger(object):
def __init__(self, dir_, options=warcprox.Options()): def __init__(self, dir_, options=warcprox.Options()):
@ -43,6 +43,10 @@ class CrawlLogger(object):
now = datetime.datetime.utcnow() now = datetime.datetime.utcnow()
status = self.get_artificial_status(recorded_url) status = self.get_artificial_status(recorded_url)
extra_info = {'contentSize': recorded_url.size,} if recorded_url.size is not None and recorded_url.size > 0 else {} extra_info = {'contentSize': recorded_url.size,} if recorded_url.size is not None and recorded_url.size > 0 else {}
if hasattr(recorded_url, 'exception') and recorded_url.exception is not None:
extra_info['exception'] = str(recorded_url.exception)
if(hasattr(recorded_url, 'message') and recorded_url.message is not None):
extra_info['exceptionMessage'] = str(recorded_url.message)
if records: if records:
extra_info['warcFilename'] = records[0].warc_filename extra_info['warcFilename'] = records[0].warc_filename
extra_info['warcFileOffset'] = records[0].offset extra_info['warcFileOffset'] = records[0].offset
@ -95,8 +99,28 @@ class CrawlLogger(object):
f.write(line) f.write(line)
def get_artificial_status(self, recorded_url): def get_artificial_status(self, recorded_url):
if hasattr(recorded_url, 'exception') and isinstance(recorded_url.exception, (socket.timeout, TimeoutError, )): # urllib3 Does not specify DNS errors. We must parse them from the exception string.
return '-2' # Unfortunately, the errors are reported differently on different systems.
# https://stackoverflow.com/questions/40145631
if hasattr(recorded_url, 'exception') and isinstance(recorded_url.exception, (MaxRetryError, )):
return '-8'
elif hasattr(recorded_url, 'exception') and isinstance(recorded_url.exception, (NewConnectionError, )):
exception_string=str(recorded_url.exception)
if ("[Errno 11001] getaddrinfo failed" in exception_string or # Windows
"[Errno -2] Name or service not known" in exception_string or # Linux
"[Errno -3] Temporary failure in name resolution" in exception_string or # Linux
"[Errno 8] nodename nor servname " in exception_string): # OS X
return '-6' # DNS Failure
else:
return '-2' # Other Connection Failure
elif hasattr(recorded_url, 'exception') and isinstance(recorded_url.exception, (socket.timeout, TimeoutError, )):
return '-2' # Connection Timeout
elif isinstance(recorded_url, warcprox.warcproxy.FailedUrl):
# synthetic status, used when some other status (such as connection-lost)
# is considered by policy the same as a document-not-found
# Cached failures result in FailedUrl with no Exception
return '-404'
else: else:
return recorded_url.status return recorded_url.status

@ -78,7 +78,7 @@ import collections
import cProfile import cProfile
from urllib3 import PoolManager from urllib3 import PoolManager
from urllib3.util import is_connection_dropped from urllib3.util import is_connection_dropped
from urllib3.exceptions import TimeoutError, HTTPError from urllib3.exceptions import TimeoutError, HTTPError, NewConnectionError
import doublethink import doublethink
from cachetools import TTLCache from cachetools import TTLCache
from threading import RLock from threading import RLock
@ -407,7 +407,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
cached = self.server.bad_hostnames_ports.get(hostname_port) cached = self.server.bad_hostnames_ports.get(hostname_port)
if cached: if cached:
self.logger.info('Cannot connect to %s (cache)', hostname_port) self.logger.info('Cannot connect to %s (cache)', hostname_port)
self.send_error(cached) self.send_error(cached, exception=Exception('Cached Failed Connection'))
return return
# Connect to destination # Connect to destination
self._connect_to_remote_server() self._connect_to_remote_server()

@ -372,6 +372,7 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
duration=None, duration=None,
referer=self.headers.get('referer'), referer=self.headers.get('referer'),
do_not_archive=True, do_not_archive=True,
message=message,
exception=exception) exception=exception)
self.server.recorded_url_q.put(failed_url) self.server.recorded_url_q.put(failed_url)
@ -441,13 +442,14 @@ class FailedUrl(RequestedUrl):
def __init__(self, url, request_data, warcprox_meta=None, status=None, def __init__(self, url, request_data, warcprox_meta=None, status=None,
client_ip=None, method=None, timestamp=None, host=None, duration=None, client_ip=None, method=None, timestamp=None, host=None, duration=None,
referer=None, do_not_archive=True, exception=None): referer=None, do_not_archive=True, message=None, exception=None):
super().__init__(url, request_data, warcprox_meta=warcprox_meta, super().__init__(url, request_data, warcprox_meta=warcprox_meta,
status=status, client_ip=client_ip, method=method, status=status, client_ip=client_ip, method=method,
timestamp=timestamp, host=host, duration=duration, timestamp=timestamp, host=host, duration=duration,
referer=referer, do_not_archive=do_not_archive) referer=referer, do_not_archive=do_not_archive)
self.message = message
self.exception = exception self.exception = exception
class RecordedUrl(RequestedUrl): class RecordedUrl(RequestedUrl):