From 269e9604c1608610e63f5e56f868c58b490f3f9b Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Wed, 19 Sep 2018 12:10:29 -0700
Subject: [PATCH] include warcprox host and port in filenames when using
 --crawl-log-dir, to avoid collisions (outside of warcprox itself, in most
 cases) with crawl logs written by other warcprox instances

---
 tests/test_warcprox.py | 33 ++++++++++++++++++++++-----------
 warcprox/crawl_log.py  | 13 ++++++++-----
 warcprox/warcproxy.py  |  7 +++++--
 3 files changed, 35 insertions(+), 18 deletions(-)

diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py
index c41f457..91cf7c0 100755
--- a/tests/test_warcprox.py
+++ b/tests/test_warcprox.py
@@ -1716,8 +1716,14 @@ def test_slash_in_warc_prefix(warcprox_, http_daemon, archiving_proxies):
 def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
     urls_before = warcprox_.proxy.running_stats.urls
 
+    hostname = socket.gethostname().split('.', 1)[0]
+    port = warcprox_.proxy.server_port
+    default_crawl_log_path = os.path.join(
+            warcprox_.options.crawl_log_dir,
+            'crawl-%s-%s.log' % (hostname, port))
+
     try:
-        os.unlink(os.path.join(warcprox_.options.crawl_log_dir, 'crawl.log'))
+        os.unlink(default_crawl_log_path)
     except:
         pass
 
@@ -1738,14 +1744,14 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
     # wait for postfetch chain
     wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 2)
 
-    file = os.path.join(warcprox_.options.crawl_log_dir, 'test_crawl_log_1.log')
+    file = os.path.join(
+            warcprox_.options.crawl_log_dir,
+            'test_crawl_log_1-%s-%s.log' % (hostname, port))
     assert os.path.exists(file)
     assert os.stat(file).st_size > 0
-    assert os.path.exists(os.path.join(
-        warcprox_.options.crawl_log_dir, 'crawl.log'))
+    assert os.path.exists(default_crawl_log_path)
 
-    crawl_log = open(os.path.join(
-        warcprox_.options.crawl_log_dir, 'crawl.log'), 'rb').read()
+    crawl_log = open(default_crawl_log_path, 'rb').read()
     # tests will fail in year 3000 :)
     assert re.match(b'\A2[^\n]+\n\Z', crawl_log)
     assert crawl_log[24:31] == b'   200 '
@@ -1766,8 +1772,7 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
             'contentSize', 'warcFilename', 'warcFileOffset'}
     assert extra_info['contentSize'] == 145
 
-    crawl_log_1 = open(os.path.join(
-        warcprox_.options.crawl_log_dir, 'test_crawl_log_1.log'), 'rb').read()
+    crawl_log_1 = open(file, 'rb').read()
     assert re.match(b'\A2[^\n]+\n\Z', crawl_log_1)
     assert crawl_log_1[24:31] == b'   200 '
     assert crawl_log_1[31:42] == b'        54 '
@@ -1798,7 +1803,9 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
     # wait for postfetch chain
     wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 3)
 
-    file = os.path.join(warcprox_.options.crawl_log_dir, 'test_crawl_log_2.log')
+    file = os.path.join(
+            warcprox_.options.crawl_log_dir,
+            'test_crawl_log_2-%s-%s.log' % (hostname, port))
 
     assert os.path.exists(file)
     assert os.stat(file).st_size > 0
@@ -1831,7 +1838,9 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
     # wait for postfetch chain
     wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 4)
 
-    file = os.path.join(warcprox_.options.crawl_log_dir, 'test_crawl_log_3.log')
+    file = os.path.join(
+            warcprox_.options.crawl_log_dir,
+            'test_crawl_log_3-%s-%s.log' % (hostname, port))
     assert os.path.exists(file)
 
     crawl_log_3 = open(file, 'rb').read()
@@ -1869,7 +1878,9 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
     # wait for postfetch chain
     wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 5)
 
-    file = os.path.join(warcprox_.options.crawl_log_dir, 'test_crawl_log_4.log')
+    file = os.path.join(
+            warcprox_.options.crawl_log_dir,
+            'test_crawl_log_4-%s-%s.log' % (hostname, port))
     assert os.path.exists(file)
 
     crawl_log_4 = open(file, 'rb').read()
diff --git a/warcprox/crawl_log.py b/warcprox/crawl_log.py
index 19dde96..2f7ea5e 100644
--- a/warcprox/crawl_log.py
+++ b/warcprox/crawl_log.py
@@ -24,11 +24,15 @@ import datetime
 import json
 import os
 import warcprox
+import socket
 
 class CrawlLogger(object):
     def __init__(self, dir_, options=warcprox.Options()):
         self.dir = dir_
         self.options = options
+        self.hostname = socket.gethostname().split('.', 1)[0]
+
+    def start(self):
         if not os.path.exists(self.dir):
             logging.info('creating directory %r', self.dir)
             os.mkdir(self.dir)
@@ -77,12 +81,11 @@ class CrawlLogger(object):
             pass
 
         line = b' '.join(fields) + b'\n'
-        if 'warc-prefix' in recorded_url.warcprox_meta:
-            filename = '%s.log' % recorded_url.warcprox_meta['warc-prefix']
-        else:
-            filename = 'crawl.log'
-
+        prefix = recorded_url.warcprox_meta.get('warc-prefix', 'crawl')
+        filename = '%s-%s-%s.log' % (
+                prefix, self.hostname, self.options.server_port)
         crawl_log_path = os.path.join(self.dir, filename)
+
         with open(crawl_log_path, 'ab') as f:
             f.write(line)
 
diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py
index f50691a..cfe2314 100644
--- a/warcprox/warcproxy.py
+++ b/warcprox/warcproxy.py
@@ -507,12 +507,15 @@ class WarcProxy(SingleThreadedWarcProxy, warcprox.mitmproxy.PooledMitmProxy):
 
     def server_activate(self):
         http_server.HTTPServer.server_activate(self)
-        self.logger.info(
+        self.logger.notice(
                 'listening on %s:%s', self.server_address[0],
                 self.server_address[1])
+        # take note of actual port in case running with --port=0 so that other
+        # parts of warcprox have easy access to it
+        self.options.server_port = self.server_address[1]
 
     def server_close(self):
-        self.logger.info('shutting down')
+        self.logger.notice('shutting down')
         http_server.HTTPServer.server_close(self)
         self.remote_connection_pool.clear()
 
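Note: for anyone consuming these logs outside of warcprox, here is a minimal
standalone sketch of the filename scheme this patch introduces. It is an
illustration only: crawl_log_path() and its default prefix/port arguments are
hypothetical stand-ins; in warcprox itself the prefix comes from
warcprox_meta['warc-prefix'] (falling back to 'crawl') and the port is the
actual listening port that server_activate() records in options.server_port,
even when warcprox is started with --port=0.

    import os
    import socket

    def crawl_log_path(dir_, prefix='crawl', port=8000):
        # short hostname, as in CrawlLogger: everything before the first dot
        hostname = socket.gethostname().split('.', 1)[0]
        # e.g. 'crawl-examplehost-8000.log': one log file per warcprox
        # instance, so two instances on the same host listening on different
        # ports no longer clobber each other's crawl logs
        return os.path.join(dir_, '%s-%s-%s.log' % (prefix, hostname, port))

This guarantees distinct filenames only among instances on one host, which is
presumably why the commit message hedges with "in most cases": two machines
that share a short hostname and port could still write colliding log names
into a shared --crawl-log-dir.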