Merge pull request #105 from nlevitt/host-port-in-log-name

Include warcprox host and port in crawl log filenames
This commit is contained in:
Noah Levitt 2018-09-19 13:03:19 -07:00 committed by GitHub
commit d8edc551ba
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 35 additions and 18 deletions

View File

@ -1716,8 +1716,14 @@ def test_slash_in_warc_prefix(warcprox_, http_daemon, archiving_proxies):
def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
urls_before = warcprox_.proxy.running_stats.urls
hostname = socket.gethostname().split('.', 1)[0]
port = warcprox_.proxy.server_port
default_crawl_log_path = os.path.join(
warcprox_.options.crawl_log_dir,
'crawl-%s-%s.log' % (hostname, port))
try:
os.unlink(os.path.join(warcprox_.options.crawl_log_dir, 'crawl.log'))
os.unlink(default_crawl_log_path)
except:
pass
@ -1738,14 +1744,14 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
# wait for postfetch chain
wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 2)
file = os.path.join(warcprox_.options.crawl_log_dir, 'test_crawl_log_1.log')
file = os.path.join(
warcprox_.options.crawl_log_dir,
'test_crawl_log_1-%s-%s.log' % (hostname, port))
assert os.path.exists(file)
assert os.stat(file).st_size > 0
assert os.path.exists(os.path.join(
warcprox_.options.crawl_log_dir, 'crawl.log'))
assert os.path.exists(default_crawl_log_path)
crawl_log = open(os.path.join(
warcprox_.options.crawl_log_dir, 'crawl.log'), 'rb').read()
crawl_log = open(default_crawl_log_path, 'rb').read()
# tests will fail in year 3000 :)
assert re.match(b'\A2[^\n]+\n\Z', crawl_log)
assert crawl_log[24:31] == b' 200 '
@ -1766,8 +1772,7 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
'contentSize', 'warcFilename', 'warcFileOffset'}
assert extra_info['contentSize'] == 145
crawl_log_1 = open(os.path.join(
warcprox_.options.crawl_log_dir, 'test_crawl_log_1.log'), 'rb').read()
crawl_log_1 = open(file, 'rb').read()
assert re.match(b'\A2[^\n]+\n\Z', crawl_log_1)
assert crawl_log_1[24:31] == b' 200 '
assert crawl_log_1[31:42] == b' 54 '
@ -1798,7 +1803,9 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
# wait for postfetch chain
wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 3)
file = os.path.join(warcprox_.options.crawl_log_dir, 'test_crawl_log_2.log')
file = os.path.join(
warcprox_.options.crawl_log_dir,
'test_crawl_log_2-%s-%s.log' % (hostname, port))
assert os.path.exists(file)
assert os.stat(file).st_size > 0
@ -1831,7 +1838,9 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
# wait for postfetch chain
wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 4)
file = os.path.join(warcprox_.options.crawl_log_dir, 'test_crawl_log_3.log')
file = os.path.join(
warcprox_.options.crawl_log_dir,
'test_crawl_log_3-%s-%s.log' % (hostname, port))
assert os.path.exists(file)
crawl_log_3 = open(file, 'rb').read()
@ -1869,7 +1878,9 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
# wait for postfetch chain
wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 5)
file = os.path.join(warcprox_.options.crawl_log_dir, 'test_crawl_log_4.log')
file = os.path.join(
warcprox_.options.crawl_log_dir,
'test_crawl_log_4-%s-%s.log' % (hostname, port))
assert os.path.exists(file)
crawl_log_4 = open(file, 'rb').read()

View File

@ -24,11 +24,15 @@ import datetime
import json
import os
import warcprox
import socket
class CrawlLogger(object):
def __init__(self, dir_, options=warcprox.Options()):
self.dir = dir_
self.options = options
self.hostname = socket.gethostname().split('.', 1)[0]
def start(self):
if not os.path.exists(self.dir):
logging.info('creating directory %r', self.dir)
os.mkdir(self.dir)
@ -77,12 +81,11 @@ class CrawlLogger(object):
pass
line = b' '.join(fields) + b'\n'
if 'warc-prefix' in recorded_url.warcprox_meta:
filename = '%s.log' % recorded_url.warcprox_meta['warc-prefix']
else:
filename = 'crawl.log'
prefix = recorded_url.warcprox_meta.get('warc-prefix', 'crawl')
filename = '%s-%s-%s.log' % (
prefix, self.hostname, self.options.server_port)
crawl_log_path = os.path.join(self.dir, filename)
with open(crawl_log_path, 'ab') as f:
f.write(line)

View File

@ -507,12 +507,15 @@ class WarcProxy(SingleThreadedWarcProxy, warcprox.mitmproxy.PooledMitmProxy):
def server_activate(self):
http_server.HTTPServer.server_activate(self)
self.logger.info(
self.logger.notice(
'listening on %s:%s', self.server_address[0],
self.server_address[1])
# take note of actual port in case running with --port=0 so that other
# parts of warcprox have easy access to it
self.options.server_port = self.server_address[1]
def server_close(self):
self.logger.info('shutting down')
self.logger.notice('shutting down')
http_server.HTTPServer.server_close(self)
self.remote_connection_pool.clear()