Beginning modifications to pass along a dummy RecordedUrl on connection timeout for logging

This commit is contained in:
Adam Miller 2019-12-11 01:54:11 +00:00
parent f77c152037
commit f9c9443d2f
3 changed files with 71 additions and 7 deletions

View File

@ -40,7 +40,7 @@ class CrawlLogger(object):
def notify(self, recorded_url, records):
# 2017-08-03T21:45:24.496Z 200 2189 https://autismcouncil.wisconsin.gov/robots.txt P https://autismcouncil.wisconsin.gov/ text/plain #001 20170803214523617+365 sha1:PBS2CEF7B4OSEXZZF3QE2XN2VHYCPNPX https://autismcouncil.wisconsin.gov/ duplicate:digest {"warcFileOffset":942,"contentSize":2495,"warcFilename":"ARCHIVEIT-2159-TEST-JOB319150-20170803214522386-00000.warc.gz"}
now = datetime.datetime.utcnow()
extra_info = {'contentSize': recorded_url.size,}
extra_info = {'contentSize': recorded_url.size,} if recorded_url.size > 0 else {}
if records:
extra_info['warcFilename'] = records[0].warc_filename
extra_info['warcFileOffset'] = records[0].offset
@ -51,10 +51,13 @@ class CrawlLogger(object):
payload_digest = warcprox.digest_str(
recorded_url.payload_digest,
self.options.base32)
else:
elif records is not None and len(records) > 0:
# WARCPROX_WRITE_RECORD request
content_length = int(records[0].get_header(b'Content-Length'))
payload_digest = records[0].get_header(b'WARC-Payload-Digest')
else:
content_length = 0
payload_digest = '-'
fields = [
'{:%Y-%m-%dT%H:%M:%S}.{:03d}Z'.format(now, now.microsecond//1000),
'% 5s' % recorded_url.status,
@ -67,7 +70,7 @@ class CrawlLogger(object):
'{:%Y%m%d%H%M%S}{:03d}+{:03d}'.format(
recorded_url.timestamp,
recorded_url.timestamp.microsecond//1000,
recorded_url.duration.microseconds//1000),
recorded_url.duration.microseconds//1000) if (recorded_url.timestamp is not None and recorded_url.duration is not None) else '-',
payload_digest,
recorded_url.warcprox_meta.get('metadata', {}).get('seed', '-'),
'duplicate:digest' if records and records[0].type == b'revisit' else '-',

View File

@ -359,7 +359,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
self.logger.error(
"problem handling %r: %r", self.requestline, e)
if type(e) is socket.timeout:
self.send_error(504, str(e))
self.send_error(-2, str(e))
else:
self.send_error(500, str(e))
except Exception as f:
@ -425,7 +425,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
response_code = 500
cache = False
if isinstance(e, (socket.timeout, TimeoutError,)):
response_code = 504
response_code = -2
cache = True
elif isinstance(e, HTTPError):
response_code = 502
@ -459,6 +459,12 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
return
def send_error(self, code, message=None, explain=None):
if code == -2:
return_code = 504
else:
return_code = code
# BaseHTTPRequestHandler.send_response_only() in http/server.py
# does this:
# if not hasattr(self, '_headers_buffer'):
@ -470,7 +476,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
self._headers_buffer = []
try:
return http_server.BaseHTTPRequestHandler.send_error(
self, code, message, explain)
self, return_code, message, explain)
except Exception as e:
level = logging.ERROR
if isinstance(e, OSError) and e.errno == 9:

View File

@ -343,13 +343,68 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
except:
self.logger.error("uncaught exception in do_WARCPROX_WRITE_RECORD", exc_info=True)
raise
def send_error(self, code, message=None, explain=None):
    """
    Send an error response to the client, then queue a placeholder
    RecordedUrl describing the failed request.

    The placeholder carries no response recorder, has size 0 and is marked
    do_not_archive=True; it is put on self.server.recorded_url_q so that
    downstream consumers of that queue see the failed fetch.

    Args:
        code: status to report; passed unchanged to the parent send_error()
            and stored as the RecordedUrl status
        message: optional short message, forwarded to the parent
        explain: optional longer explanation, forwarded to the parent
    """
    super().send_error(code, message, explain)

    # http.server calls send_error() for malformed request lines and
    # over-long URIs *before* the request headers have been parsed, and the
    # url is only known once the request target has been worked out. In
    # those cases there is nothing meaningful to record, and touching the
    # missing attributes would raise from inside the error path.
    headers = getattr(self, 'headers', None)
    url = getattr(self, 'url', None)
    if headers is None or url is None:
        return

    # Build request
    req_str = '{} {} {}\r\n'.format(
            self.command, self.path, self.request_version)

    # Swallow headers that don't make sense to forward on, i.e. most
    # hop-by-hop headers. http://tools.ietf.org/html/rfc2616#section-13.5.
    # self.headers is an email.message.Message, which is case-insensitive
    # and doesn't throw KeyError in __delitem__
    for key in (
            'Connection', 'Proxy-Connection', 'Keep-Alive',
            'Proxy-Authenticate', 'Proxy-Authorization', 'Upgrade'):
        del headers[key]

    headers['Via'] = via_header_value(
            headers.get('Via'),
            self.request_version.replace('HTTP/', ''))

    # Add headers to the request
    # XXX in at least python3.3 str(self.headers) uses \n not \r\n :(
    req_str += '\r\n'.join(
            '{}: {}'.format(k, v) for (k, v) in headers.items())

    warcprox_meta = None
    raw_warcprox_meta = headers.get('Warcprox-Meta')
    if raw_warcprox_meta:
        try:
            warcprox_meta = json.loads(raw_warcprox_meta)
        except ValueError:
            # A bad client-supplied header must not turn error reporting
            # into a second failure; record the url without metadata.
            self.logger.warning(
                    "ignoring malformed Warcprox-Meta header %r",
                    raw_warcprox_meta)

    req = req_str.encode('latin1') + b'\r\n\r\n'

    recorded_url = RecordedUrl(
            url=url,
            remote_ip=b'',
            warcprox_meta=warcprox_meta,
            status=code,
            client_ip=self.client_address[0],
            method=self.command,
            content_type="unknown",
            response_recorder=None,
            request_data=req,
            duration=None, size=0,
            timestamp=None, host=getattr(self, 'hostname', None),
            do_not_archive=True,
            referer=headers.get('referer'))

    self.server.recorded_url_q.put(recorded_url)
def log_message(self, fmt, *args):
    """
    Deliberately do nothing with the handler's log messages.

    The arguments are accepted and ignored (logging better handled
    elsewhere?).
    """
    return None
# Matches a single ';' or whitespace character.
# NOTE(review): presumably used to cut a Content-Type value down to its bare
# mimetype (dropping parameters such as "; charset=...") -- the use site is
# not visible here, confirm before relying on this.
RE_MIMETYPE = re.compile(r'[;\s]')
def via_header_value(orig, request_version):
    """
    Return the Via header value with this proxy's hop appended.

    Args:
        orig: existing Via header value, or None/empty if there is none
        request_version: protocol version of the request, e.g. '1.1'

    Returns:
        '<request_version> warcprox', preceded by '<orig>, ' when orig is
        non-empty
    """
    this_hop = '%s %s' % (request_version, 'warcprox')
    if not orig:
        return this_hop
    return orig + ', ' + this_hop
class RecordedUrl:
logger = logging.getLogger("warcprox.warcproxy.RecordedUrl")