From 3a234d0ceccc759b3926b871c15e5ba7dd3f3d47 Mon Sep 17 00:00:00 2001 From: Adam Miller Date: Thu, 3 Mar 2022 00:18:16 +0000 Subject: [PATCH] Refactor hop_path metadata --- warcprox/crawl_log.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/warcprox/crawl_log.py b/warcprox/crawl_log.py index 73d6457..5bc26bd 100644 --- a/warcprox/crawl_log.py +++ b/warcprox/crawl_log.py @@ -64,18 +64,23 @@ class CrawlLogger(object): else: content_length = 0 payload_digest = '-' - logging.info('metadata %s', recorded_url.warcprox_meta) - hop_path = recorded_url.warcprox_meta["hop_path"] if "hop_path" in recorded_url.warcprox_meta and recorded_url.warcprox_meta["hop_path"] and len(recorded_url.warcprox_meta["hop_path"].strip()) > 0 else '-' - hop_path_parent = recorded_url.warcprox_meta["hop_path_parent"] if "hop_path_parent" in recorded_url.warcprox_meta else None - if hop_path_parent and hop_path_parent != recorded_url.url: - hop_path = str(hop_path if hop_path and hop_path != "-" else "") + "B" + logging.info('warcprox_meta %s' , recorded_url.warcprox_meta) + hop_path = recorded_url.warcprox_meta.get('metadata', {}).get('hop_path', '-') + if hop_path is None: + hop_path = "-" + hop_path_referer = recorded_url.warcprox_meta.get('metadata', {}).get('hop_path_referer', recorded_url.referer) + if hop_path_referer != recorded_url.url.decode('ascii'): + if hop_path == "-": + hop_path = "B" + else: + hop_path = "".join([hop_path,"B"]) fields = [ '{:%Y-%m-%dT%H:%M:%S}.{:03d}Z'.format(now, now.microsecond//1000), '% 5s' % status, '% 10s' % content_length, recorded_url.url, hop_path, - recorded_url.referer or '-', + recorded_url.referer or hop_path_referer or '-', recorded_url.mimetype if recorded_url.mimetype is not None and recorded_url.mimetype.strip() else '-', '-', '{:%Y%m%d%H%M%S}{:03d}+{:03d}'.format( @@ -94,7 +99,6 @@ class CrawlLogger(object): except: pass line = b' '.join(fields) + b'\n' - prefix = recorded_url.warcprox_meta.get('warc-prefix', 'crawl') filename = '%s-%s-%s.log' % ( prefix, self.hostname, self.options.server_port)