From 5ae1291e3740e3d849103a74352153464812486e Mon Sep 17 00:00:00 2001
From: Adam Miller <adam@archive.org>
Date: Thu, 24 Mar 2022 21:40:55 +0000
Subject: [PATCH] Refactor of hop path referer logic

---
 warcprox/crawl_log.py | 33 ++++++++++++++++++++++++---------
 1 file changed, 24 insertions(+), 9 deletions(-)

diff --git a/warcprox/crawl_log.py b/warcprox/crawl_log.py
index c859159..e30b371 100644
--- a/warcprox/crawl_log.py
+++ b/warcprox/crawl_log.py
@@ -65,22 +65,37 @@ class CrawlLogger(object):
             content_length = 0
             payload_digest = '-'
         logging.info('warcprox_meta %s' , recorded_url.warcprox_meta)
-        hop_path = recorded_url.warcprox_meta.get('metadata', {}).get('hop_path', '-')
-        if hop_path is None:
+
+        hop_path = recorded_url.warcprox_meta.get('metadata', {}).get('hop_path')
+        brozzled_url = recorded_url.warcprox_meta.get('metadata', {}).get('brozzled_url')
+        hop_via_url = recorded_url.warcprox_meta.get('metadata', {}).get('hop_via_url')
+
+        if hop_path is None and brozzled_url is None and hop_via_url is None:
+            #No hop info headers provided
             hop_path = "-"
-        hop_path_referer = recorded_url.warcprox_meta.get('metadata', {}).get('hop_path_referer', "-")
-        if hop_path_referer != recorded_url.url.decode('ascii'):
-            if hop_path == "-":
-                hop_path = "B"
-            else:
-                hop_path = "".join([hop_path,"B"])
+            via_url = recorded_url.referer or '-'
+        else:
+            if hop_path is None:
+                hop_path = "-"
+            if hop_via_url is None:
+                hop_via_url = "-"
+            #Prefer referer header. Otherwise use provided via_url
+            via_url = recorded_url.referer or hop_via_url if hop_path != "-" else "-"
+            if brozzled_url != recorded_url.url.decode('ascii') and "brozzled_url" in recorded_url.warcprox_meta.get('metadata', {}).keys():
+                #Requested page is not the Brozzled url, thus we are an embed or redirect.
+                via_url = brozzled_url
+                if hop_path == "-":
+                    hop_path = "B"
+                else:
+                    hop_path = "".join([hop_path,"B"])
+
         fields = [
             '{:%Y-%m-%dT%H:%M:%S}.{:03d}Z'.format(now, now.microsecond//1000),
             '% 5s' % status,
             '% 10s' % content_length,
             recorded_url.url,
             hop_path,
-            recorded_url.referer or hop_path_referer if hop_path != "-" else "-",
+            via_url,
             recorded_url.mimetype if recorded_url.mimetype is not None and recorded_url.mimetype.strip() else '-',
             '-',
             '{:%Y%m%d%H%M%S}{:03d}+{:03d}'.format(