Merge branch 'adds-hop-path-logging' into qa

This commit is contained in:
Adam Miller 2022-03-24 21:41:11 +00:00
commit 28bec1bb14

View File

@ -65,22 +65,37 @@ class CrawlLogger(object):
content_length = 0
payload_digest = '-'
logging.info('warcprox_meta %s' , recorded_url.warcprox_meta)
hop_path = recorded_url.warcprox_meta.get('metadata', {}).get('hop_path', '-')
if hop_path is None:
hop_path = recorded_url.warcprox_meta.get('metadata', {}).get('hop_path')
brozzled_url = recorded_url.warcprox_meta.get('metadata', {}).get('brozzled_url')
hop_via_url = recorded_url.warcprox_meta.get('metadata', {}).get('hop_via_url')
if hop_path is None and brozzled_url is None and hop_via_url is None:
#No hop info headers provided
hop_path = "-"
hop_path_referer = recorded_url.warcprox_meta.get('metadata', {}).get('hop_path_referer', "-")
if hop_path_referer != recorded_url.url.decode('ascii'):
if hop_path == "-":
hop_path = "B"
else:
hop_path = "".join([hop_path,"B"])
via_url = recorded_url.referer or '-'
else:
if hop_path is None:
hop_path = "-"
if hop_via_url is None:
hop_via_url = "-"
#Prefer referer header. Otherwise use provided via_url
via_url = recorded_url.referer or hop_via_url if hop_path != "-" else "-"
if brozzled_url != recorded_url.url.decode('ascii') and "brozzled_url" in recorded_url.warcprox_meta.get('metadata', {}).keys():
#Requested page is not the Brozzled url, thus we are an embed or redirect.
via_url = brozzled_url
if hop_path == "-":
hop_path = "B"
else:
hop_path = "".join([hop_path,"B"])
fields = [
'{:%Y-%m-%dT%H:%M:%S}.{:03d}Z'.format(now, now.microsecond//1000),
'% 5s' % status,
'% 10s' % content_length,
recorded_url.url,
hop_path,
recorded_url.referer or hop_path_referer if hop_path != "-" else "-",
via_url,
recorded_url.mimetype if recorded_url.mimetype is not None and recorded_url.mimetype.strip() else '-',
'-',
'{:%Y%m%d%H%M%S}{:03d}+{:03d}'.format(