mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
Beginning modifications to pass along a dummy RecordedUrl on connection timeout for logging
This commit is contained in:
parent
f77c152037
commit
f9c9443d2f
@ -40,7 +40,7 @@ class CrawlLogger(object):
|
||||
def notify(self, recorded_url, records):
|
||||
# 2017-08-03T21:45:24.496Z 200 2189 https://autismcouncil.wisconsin.gov/robots.txt P https://autismcouncil.wisconsin.gov/ text/plain #001 20170803214523617+365 sha1:PBS2CEF7B4OSEXZZF3QE2XN2VHYCPNPX https://autismcouncil.wisconsin.gov/ duplicate:digest {"warcFileOffset":942,"contentSize":2495,"warcFilename":"ARCHIVEIT-2159-TEST-JOB319150-20170803214522386-00000.warc.gz"}
|
||||
now = datetime.datetime.utcnow()
|
||||
extra_info = {'contentSize': recorded_url.size,}
|
||||
extra_info = {'contentSize': recorded_url.size,} if recorded_url.size > 0 else {}
|
||||
if records:
|
||||
extra_info['warcFilename'] = records[0].warc_filename
|
||||
extra_info['warcFileOffset'] = records[0].offset
|
||||
@ -51,10 +51,13 @@ class CrawlLogger(object):
|
||||
payload_digest = warcprox.digest_str(
|
||||
recorded_url.payload_digest,
|
||||
self.options.base32)
|
||||
else:
|
||||
elif records is not None and len(records) > 0:
|
||||
# WARCPROX_WRITE_RECORD request
|
||||
content_length = int(records[0].get_header(b'Content-Length'))
|
||||
payload_digest = records[0].get_header(b'WARC-Payload-Digest')
|
||||
else:
|
||||
content_length = 0
|
||||
payload_digest = '-'
|
||||
fields = [
|
||||
'{:%Y-%m-%dT%H:%M:%S}.{:03d}Z'.format(now, now.microsecond//1000),
|
||||
'% 5s' % recorded_url.status,
|
||||
@ -67,7 +70,7 @@ class CrawlLogger(object):
|
||||
'{:%Y%m%d%H%M%S}{:03d}+{:03d}'.format(
|
||||
recorded_url.timestamp,
|
||||
recorded_url.timestamp.microsecond//1000,
|
||||
recorded_url.duration.microseconds//1000),
|
||||
recorded_url.duration.microseconds//1000) if (recorded_url.timestamp is not None and recorded_url.duration is not None) else '-',
|
||||
payload_digest,
|
||||
recorded_url.warcprox_meta.get('metadata', {}).get('seed', '-'),
|
||||
'duplicate:digest' if records and records[0].type == b'revisit' else '-',
|
||||
|
@ -359,7 +359,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
||||
self.logger.error(
|
||||
"problem handling %r: %r", self.requestline, e)
|
||||
if type(e) is socket.timeout:
|
||||
self.send_error(504, str(e))
|
||||
self.send_error(-2, str(e))
|
||||
else:
|
||||
self.send_error(500, str(e))
|
||||
except Exception as f:
|
||||
@ -425,7 +425,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
||||
response_code = 500
|
||||
cache = False
|
||||
if isinstance(e, (socket.timeout, TimeoutError,)):
|
||||
response_code = 504
|
||||
response_code = -2
|
||||
cache = True
|
||||
elif isinstance(e, HTTPError):
|
||||
response_code = 502
|
||||
@ -459,6 +459,12 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
||||
return
|
||||
|
||||
def send_error(self, code, message=None, explain=None):
|
||||
|
||||
if code == -2:
|
||||
return_code = 504
|
||||
else:
|
||||
return_code = code
|
||||
|
||||
# BaseHTTPRequestHandler.send_response_only() in http/server.py
|
||||
# does this:
|
||||
# if not hasattr(self, '_headers_buffer'):
|
||||
@ -470,7 +476,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
||||
self._headers_buffer = []
|
||||
try:
|
||||
return http_server.BaseHTTPRequestHandler.send_error(
|
||||
self, code, message, explain)
|
||||
self, return_code, message, explain)
|
||||
except Exception as e:
|
||||
level = logging.ERROR
|
||||
if isinstance(e, OSError) and e.errno == 9:
|
||||
|
@ -343,13 +343,68 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
|
||||
except:
|
||||
self.logger.error("uncaught exception in do_WARCPROX_WRITE_RECORD", exc_info=True)
|
||||
raise
|
||||
def send_error(self, code, message=None, explain=None):
    """Send an error response, then queue a placeholder RecordedUrl for it.

    The error response itself is produced by the parent class. Afterwards
    a RecordedUrl marked ``do_not_archive`` and carrying no response
    recorder is put on ``self.server.recorded_url_q`` so that the failed
    request is still visible to whatever consumes that queue.

    Args:
        code: status code handed to the parent ``send_error``; it is also
            stored unchanged as the ``status`` of the queued RecordedUrl.
        message: optional short message, passed through to the parent.
        explain: optional longer explanation, passed through to the parent.
    """
    super().send_error(code, message, explain)

    # send_error() can be reached before the request headers have been
    # parsed (e.g. on a malformed or oversized request line). In that
    # case there is no request to describe, so nothing is queued.
    headers = getattr(self, 'headers', None)
    if headers is None:
        return

    # Build request
    request_line = '{} {} {}\r\n'.format(
        self.command, self.path, self.request_version)

    # Swallow headers that don't make sense to forward on, i.e. most
    # hop-by-hop headers. http://tools.ietf.org/html/rfc2616#section-13.5.
    # The filtering is done on a derived list instead of deleting from
    # self.headers, so the handler's own view of the request is untouched.
    hop_by_hop = frozenset((
        'connection', 'proxy-connection', 'keep-alive',
        'proxy-authenticate', 'proxy-authorization', 'upgrade'))
    header_lines = [
        '{}: {}'.format(k, v) for (k, v) in headers.items()
        if k.lower() not in hop_by_hop and k.lower() != 'via']

    # Assigning headers['Via'] on an email.message.Message appends a
    # second Via line rather than replacing the first, so any existing
    # Via lines are dropped above and re-emitted here as one combined
    # value.
    prior_via = ', '.join(headers.get_all('Via', []))
    header_lines.append('Via: {}'.format(via_header_value(
        prior_via, self.request_version.replace('HTTP/', ''))))

    # Add headers to the request
    # XXX in at least python3.3 str(self.headers) uses \n not \r\n :(
    req = (request_line + '\r\n'.join(header_lines)).encode('latin1') \
        + b'\r\n\r\n'

    warcprox_meta = None
    raw_warcprox_meta = headers.get('Warcprox-Meta')
    if raw_warcprox_meta:
        try:
            warcprox_meta = json.loads(raw_warcprox_meta)
        except ValueError:
            # The error response has already been sent; a bad
            # client-supplied header must not raise out of the error
            # handler.
            self.logger.warning(
                "ignoring unparseable Warcprox-Meta header %r",
                raw_warcprox_meta)

    recorded_url = RecordedUrl(
        # url/hostname are read defensively: they are not assigned in
        # this method and may be unset if the failure happened early.
        url=getattr(self, 'url', None),
        remote_ip=b'',
        warcprox_meta=warcprox_meta,
        status=code,
        client_ip=self.client_address[0],
        method=self.command,
        content_type="unknown",
        response_recorder=None,
        request_data=req,
        duration=None,
        size=0,
        timestamp=None,
        host=getattr(self, 'hostname', None),
        do_not_archive=True,
        referer=headers.get('referer'))

    self.server.recorded_url_q.put(recorded_url)
|
||||
|
||||
|
||||
def log_message(self, fmt, *args):
    """Accept a log message and discard it.

    Args:
        fmt: format string of the message (ignored).
        *args: values for the format string (ignored).
    """
    # logging better handled elsewhere?
    return None
|
||||
|
||||
# Matches a single semicolon or whitespace character.
# NOTE(review): presumably used to cut the parameters off a Content-Type
# value, leaving the bare mimetype -- confirm at the call site.
RE_MIMETYPE = re.compile(r'[;\s]')
|
||||
|
||||
def via_header_value(orig, request_version, received_by='warcprox'):
    """Return the ``Via`` header value after adding this proxy's entry.

    Args:
        orig: the ``Via`` value already present on the request, or
            ``None``/``''`` when there is none.
        request_version: protocol version of the received request,
            without the ``HTTP/`` prefix (e.g. ``'1.1'``).
        received_by: name this proxy reports for itself in its entry;
            defaults to ``'warcprox'``.

    Returns:
        ``'<request_version> <received_by>'`` when ``orig`` is empty,
        otherwise ``orig`` followed by ``', '`` and that entry.
    """
    this_hop = '%s %s' % (request_version, received_by)
    if orig:
        return orig + ', ' + this_hop
    return this_hop
|
||||
class RecordedUrl:
|
||||
logger = logging.getLogger("warcprox.warcproxy.RecordedUrl")
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user