diff --git a/warcprox/crawl_log.py b/warcprox/crawl_log.py index c859159..e30b371 100644 --- a/warcprox/crawl_log.py +++ b/warcprox/crawl_log.py @@ -65,22 +65,37 @@ class CrawlLogger(object): content_length = 0 payload_digest = '-' logging.info('warcprox_meta %s' , recorded_url.warcprox_meta) - hop_path = recorded_url.warcprox_meta.get('metadata', {}).get('hop_path', '-') - if hop_path is None: + + hop_path = recorded_url.warcprox_meta.get('metadata', {}).get('hop_path') + brozzled_url = recorded_url.warcprox_meta.get('metadata', {}).get('brozzled_url') + hop_via_url = recorded_url.warcprox_meta.get('metadata', {}).get('hop_via_url') + + if hop_path is None and brozzled_url is None and hop_via_url is None: + #No hop info headers provided hop_path = "-" - hop_path_referer = recorded_url.warcprox_meta.get('metadata', {}).get('hop_path_referer', "-") - if hop_path_referer != recorded_url.url.decode('ascii'): - if hop_path == "-": - hop_path = "B" - else: - hop_path = "".join([hop_path,"B"]) + via_url = recorded_url.referer or '-' + else: + if hop_path is None: + hop_path = "-" + if hop_via_url is None: + hop_via_url = "-" + #Prefer referer header. Otherwise use provided via_url + via_url = recorded_url.referer or hop_via_url if hop_path != "-" else "-" + if brozzled_url != recorded_url.url.decode('ascii') and "brozzled_url" in recorded_url.warcprox_meta.get('metadata', {}).keys(): + #Requested page is not the Brozzled url, thus we are an embed or redirect. + via_url = brozzled_url + if hop_path == "-": + hop_path = "B" + else: + hop_path = "".join([hop_path,"B"]) + fields = [ '{:%Y-%m-%dT%H:%M:%S}.{:03d}Z'.format(now, now.microsecond//1000), '% 5s' % status, '% 10s' % content_length, recorded_url.url, hop_path, - recorded_url.referer or hop_path_referer if hop_path != "-" else "-", + via_url, recorded_url.mimetype if recorded_url.mimetype is not None and recorded_url.mimetype.strip() else '-', '-', '{:%Y%m%d%H%M%S}{:03d}+{:03d}'.format(