mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
Expanding logging to handle DNS failures, print error message to crawl log info, and report cached connection errors.
This commit is contained in:
parent
b34419543f
commit
edeae3b21a
@ -25,7 +25,7 @@ import json
|
||||
import os
|
||||
import warcprox
|
||||
import socket
|
||||
from urllib3.exceptions import TimeoutError, HTTPError
|
||||
from urllib3.exceptions import TimeoutError, HTTPError, NewConnectionError, MaxRetryError
|
||||
|
||||
class CrawlLogger(object):
|
||||
def __init__(self, dir_, options=warcprox.Options()):
|
||||
@ -43,6 +43,10 @@ class CrawlLogger(object):
|
||||
now = datetime.datetime.utcnow()
|
||||
status = self.get_artificial_status(recorded_url)
|
||||
extra_info = {'contentSize': recorded_url.size,} if recorded_url.size is not None and recorded_url.size > 0 else {}
|
||||
if hasattr(recorded_url, 'exception') and recorded_url.exception is not None:
|
||||
extra_info['exception'] = str(recorded_url.exception)
|
||||
if(hasattr(recorded_url, 'message') and recorded_url.message is not None):
|
||||
extra_info['exceptionMessage'] = str(recorded_url.message)
|
||||
if records:
|
||||
extra_info['warcFilename'] = records[0].warc_filename
|
||||
extra_info['warcFileOffset'] = records[0].offset
|
||||
@ -95,8 +99,28 @@ class CrawlLogger(object):
|
||||
f.write(line)
|
||||
|
||||
def get_artificial_status(self, recorded_url):
|
||||
if hasattr(recorded_url, 'exception') and isinstance(recorded_url.exception, (socket.timeout, TimeoutError, )):
|
||||
return '-2'
|
||||
# urllib3 Does not specify DNS errors. We must parse them from the exception string.
|
||||
# Unfortunately, the errors are reported differently on different systems.
|
||||
# https://stackoverflow.com/questions/40145631
|
||||
|
||||
if hasattr(recorded_url, 'exception') and isinstance(recorded_url.exception, (MaxRetryError, )):
|
||||
return '-8'
|
||||
elif hasattr(recorded_url, 'exception') and isinstance(recorded_url.exception, (NewConnectionError, )):
|
||||
exception_string=str(recorded_url.exception)
|
||||
if ("[Errno 11001] getaddrinfo failed" in exception_string or # Windows
|
||||
"[Errno -2] Name or service not known" in exception_string or # Linux
|
||||
"[Errno -3] Temporary failure in name resolution" in exception_string or # Linux
|
||||
"[Errno 8] nodename nor servname " in exception_string): # OS X
|
||||
return '-6' # DNS Failure
|
||||
else:
|
||||
return '-2' # Other Connection Failure
|
||||
elif hasattr(recorded_url, 'exception') and isinstance(recorded_url.exception, (socket.timeout, TimeoutError, )):
|
||||
return '-2' # Connection Timeout
|
||||
elif isinstance(recorded_url, warcprox.warcproxy.FailedUrl):
|
||||
# synthetic status, used when some other status (such as connection-lost)
|
||||
# is considered by policy the same as a document-not-found
|
||||
# Cached failures result in FailedUrl with no Exception
|
||||
return '-404'
|
||||
else:
|
||||
return recorded_url.status
|
||||
|
||||
|
@ -78,7 +78,7 @@ import collections
|
||||
import cProfile
|
||||
from urllib3 import PoolManager
|
||||
from urllib3.util import is_connection_dropped
|
||||
from urllib3.exceptions import TimeoutError, HTTPError
|
||||
from urllib3.exceptions import TimeoutError, HTTPError, NewConnectionError
|
||||
import doublethink
|
||||
from cachetools import TTLCache
|
||||
from threading import RLock
|
||||
@ -407,7 +407,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
||||
cached = self.server.bad_hostnames_ports.get(hostname_port)
|
||||
if cached:
|
||||
self.logger.info('Cannot connect to %s (cache)', hostname_port)
|
||||
self.send_error(cached)
|
||||
self.send_error(cached, exception=Exception('Cached Failed Connection'))
|
||||
return
|
||||
# Connect to destination
|
||||
self._connect_to_remote_server()
|
||||
|
@ -372,6 +372,7 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
|
||||
duration=None,
|
||||
referer=self.headers.get('referer'),
|
||||
do_not_archive=True,
|
||||
message=message,
|
||||
exception=exception)
|
||||
|
||||
self.server.recorded_url_q.put(failed_url)
|
||||
@ -441,13 +442,14 @@ class FailedUrl(RequestedUrl):
|
||||
|
||||
def __init__(self, url, request_data, warcprox_meta=None, status=None,
|
||||
client_ip=None, method=None, timestamp=None, host=None, duration=None,
|
||||
referer=None, do_not_archive=True, exception=None):
|
||||
referer=None, do_not_archive=True, message=None, exception=None):
|
||||
|
||||
super().__init__(url, request_data, warcprox_meta=warcprox_meta,
|
||||
status=status, client_ip=client_ip, method=method,
|
||||
timestamp=timestamp, host=host, duration=duration,
|
||||
referer=referer, do_not_archive=do_not_archive)
|
||||
|
||||
self.message = message
|
||||
self.exception = exception
|
||||
|
||||
class RecordedUrl(RequestedUrl):
|
||||
|
Loading…
x
Reference in New Issue
Block a user