From 269e9604c1608610e63f5e56f868c58b490f3f9b Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Wed, 19 Sep 2018 12:10:29 -0700
Subject: [PATCH] include warcprox host and port in filenames when using
 --crawl-log-dir, to avoid collisions (outside of warcprox itself, in most
 cases) with crawl logs written by other warcprox instances

---
 tests/test_warcprox.py | 33 ++++++++++++++++++++++-----------
 warcprox/crawl_log.py  | 13 ++++++++-----
 warcprox/warcproxy.py  |  7 +++++--
 3 files changed, 35 insertions(+), 18 deletions(-)

diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py
index c41f457..91cf7c0 100755
--- a/tests/test_warcprox.py
+++ b/tests/test_warcprox.py
@@ -1716,8 +1716,14 @@ def test_slash_in_warc_prefix(warcprox_, http_daemon, archiving_proxies):
 def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
     urls_before = warcprox_.proxy.running_stats.urls
 
+    hostname = socket.gethostname().split('.', 1)[0]
+    port = warcprox_.proxy.server_port
+    default_crawl_log_path = os.path.join(
+            warcprox_.options.crawl_log_dir,
+            'crawl-%s-%s.log' % (hostname, port))
+
     try:
-        os.unlink(os.path.join(warcprox_.options.crawl_log_dir, 'crawl.log'))
+        os.unlink(default_crawl_log_path)
     except:
         pass
 
@@ -1738,14 +1744,14 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
     # wait for postfetch chain
     wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 2)
 
-    file = os.path.join(warcprox_.options.crawl_log_dir, 'test_crawl_log_1.log')
+    file = os.path.join(
+            warcprox_.options.crawl_log_dir,
+            'test_crawl_log_1-%s-%s.log' % (hostname, port))
     assert os.path.exists(file)
     assert os.stat(file).st_size > 0
-    assert os.path.exists(os.path.join(
-        warcprox_.options.crawl_log_dir, 'crawl.log'))
+    assert os.path.exists(default_crawl_log_path)
 
-    crawl_log = open(os.path.join(
-        warcprox_.options.crawl_log_dir, 'crawl.log'), 'rb').read()
+    crawl_log = open(default_crawl_log_path, 'rb').read()
     # tests will fail in year 3000 :)
     assert re.match(b'\A2[^\n]+\n\Z', crawl_log)
     assert crawl_log[24:31] == b'   200 '
@@ -1766,8 +1772,7 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
             'contentSize', 'warcFilename', 'warcFileOffset'}
     assert extra_info['contentSize'] == 145
 
-    crawl_log_1 = open(os.path.join(
-        warcprox_.options.crawl_log_dir, 'test_crawl_log_1.log'), 'rb').read()
+    crawl_log_1 = open(file, 'rb').read()
     assert re.match(b'\A2[^\n]+\n\Z', crawl_log_1)
     assert crawl_log_1[24:31] == b'   200 '
     assert crawl_log_1[31:42] == b'        54 '
@@ -1798,7 +1803,9 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
     # wait for postfetch chain
     wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 3)
 
-    file = os.path.join(warcprox_.options.crawl_log_dir, 'test_crawl_log_2.log')
+    file = os.path.join(
+            warcprox_.options.crawl_log_dir,
+            'test_crawl_log_2-%s-%s.log' % (hostname, port))
 
     assert os.path.exists(file)
     assert os.stat(file).st_size > 0
@@ -1831,7 +1838,9 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
     # wait for postfetch chain
     wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 4)
 
-    file = os.path.join(warcprox_.options.crawl_log_dir, 'test_crawl_log_3.log')
+    file = os.path.join(
+            warcprox_.options.crawl_log_dir,
+            'test_crawl_log_3-%s-%s.log' % (hostname, port))
     assert os.path.exists(file)
 
     crawl_log_3 = open(file, 'rb').read()
@@ -1869,7 +1878,9 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
     # wait for postfetch chain
     wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 5)
 
-    file = os.path.join(warcprox_.options.crawl_log_dir, 'test_crawl_log_4.log')
+    file = os.path.join(
+            warcprox_.options.crawl_log_dir,
+            'test_crawl_log_4-%s-%s.log' % (hostname, port))
     assert os.path.exists(file)
 
     crawl_log_4 = open(file, 'rb').read()
diff --git a/warcprox/crawl_log.py b/warcprox/crawl_log.py
index 19dde96..2f7ea5e 100644
--- a/warcprox/crawl_log.py
+++ b/warcprox/crawl_log.py
@@ -24,11 +24,15 @@ import datetime
 import json
 import os
 import warcprox
+import socket
 
 class CrawlLogger(object):
     def __init__(self, dir_, options=warcprox.Options()):
         self.dir = dir_
         self.options = options
+        self.hostname = socket.gethostname().split('.', 1)[0]
+
+    def start(self):
         if not os.path.exists(self.dir):
             logging.info('creating directory %r', self.dir)
             os.mkdir(self.dir)
@@ -77,12 +81,11 @@ class CrawlLogger(object):
             pass
 
         line = b' '.join(fields) + b'\n'
-        if 'warc-prefix' in recorded_url.warcprox_meta:
-            filename = '%s.log' % recorded_url.warcprox_meta['warc-prefix']
-        else:
-            filename = 'crawl.log'
-
+        prefix = recorded_url.warcprox_meta.get('warc-prefix', 'crawl')
+        filename = '%s-%s-%s.log' % (
+                prefix, self.hostname, self.options.server_port)
         crawl_log_path = os.path.join(self.dir, filename)
+
         with open(crawl_log_path, 'ab') as f:
             f.write(line)
 
diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py
index f50691a..cfe2314 100644
--- a/warcprox/warcproxy.py
+++ b/warcprox/warcproxy.py
@@ -507,12 +507,15 @@ class WarcProxy(SingleThreadedWarcProxy, warcprox.mitmproxy.PooledMitmProxy):
 
     def server_activate(self):
         http_server.HTTPServer.server_activate(self)
-        self.logger.info(
+        self.logger.notice(
                 'listening on %s:%s', self.server_address[0],
                 self.server_address[1])
+        # take note of actual port in case running with --port=0 so that other
+        # parts of warcprox have easy access to it
+        self.options.server_port = self.server_address[1]
 
     def server_close(self):
-        self.logger.info('shutting down')
+        self.logger.notice('shutting down')
         http_server.HTTPServer.server_close(self)
         self.remote_connection_pool.clear()
 
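Note: for anyone consuming these logs outside of warcprox, here is a minimal
standalone sketch of the filename scheme this patch introduces. It is an
illustration only: crawl_log_path() and its default prefix/port arguments are
hypothetical stand-ins; in warcprox itself the prefix comes from
warcprox_meta['warc-prefix'] (falling back to 'crawl') and the port is the
actual listening port that server_activate() records in options.server_port,
even when warcprox is started with --port=0.

    import os
    import socket

    def crawl_log_path(dir_, prefix='crawl', port=8000):
        # short hostname, as in CrawlLogger: everything before the first dot
        hostname = socket.gethostname().split('.', 1)[0]
        # e.g. 'crawl-examplehost-8000.log': one log file per warcprox
        # instance, so two instances on the same host listening on different
        # ports no longer clobber each other's crawl logs
        return os.path.join(dir_, '%s-%s-%s.log' % (prefix, hostname, port))

This guarantees distinct filenames only among instances on one host, which is
presumably why the commit message hedges with "in most cases": two machines
that share a short hostname and port could still write colliding log names
into a shared --crawl-log-dir.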