Mirror of https://github.com/internetarchive/warcprox.git
include warcprox host and port in filenames

when using --crawl-log-dir, to avoid collisions with crawl logs written by other warcprox instances (collisions that happen outside of warcprox itself, in most cases)
This commit is contained in:
parent
45aed2e4f6
commit
269e9604c1
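
In short: the crawl log filename now embeds the short hostname and the listening port. A minimal standalone sketch of the naming scheme (the helper function here is hypothetical, written only to illustrate the pattern in the diff below; warcprox itself builds the name inline in CrawlLogger):

import socket

def crawl_log_filename(warc_prefix=None, server_port=8000):
    # hypothetical helper: <prefix>-<shorthostname>-<port>.log,
    # defaulting the prefix to 'crawl' as the diff below does
    prefix = warc_prefix or 'crawl'
    hostname = socket.gethostname().split('.', 1)[0]
    return '%s-%s-%s.log' % (prefix, hostname, server_port)

# e.g. crawl_log_filename()             -> 'crawl-myhost-8000.log'
# e.g. crawl_log_filename('job1', 8001) -> 'job1-myhost-8001.log'

Two warcprox instances on the same host then write to different files as long as they listen on different ports, and instances on different hosts are distinguished by hostname.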
tests/test_warcprox.py
@@ -1716,8 +1716,14 @@ def test_slash_in_warc_prefix(warcprox_, http_daemon, archiving_proxies):
 def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
     urls_before = warcprox_.proxy.running_stats.urls
 
+    hostname = socket.gethostname().split('.', 1)[0]
+    port = warcprox_.proxy.server_port
+    default_crawl_log_path = os.path.join(
+            warcprox_.options.crawl_log_dir,
+            'crawl-%s-%s.log' % (hostname, port))
+
     try:
-        os.unlink(os.path.join(warcprox_.options.crawl_log_dir, 'crawl.log'))
+        os.unlink(default_crawl_log_path)
     except:
         pass
 
@@ -1738,14 +1744,14 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
     # wait for postfetch chain
     wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 2)
 
-    file = os.path.join(warcprox_.options.crawl_log_dir, 'test_crawl_log_1.log')
+    file = os.path.join(
+            warcprox_.options.crawl_log_dir,
+            'test_crawl_log_1-%s-%s.log' % (hostname, port))
     assert os.path.exists(file)
     assert os.stat(file).st_size > 0
-    assert os.path.exists(os.path.join(
-        warcprox_.options.crawl_log_dir, 'crawl.log'))
+    assert os.path.exists(default_crawl_log_path)
 
-    crawl_log = open(os.path.join(
-        warcprox_.options.crawl_log_dir, 'crawl.log'), 'rb').read()
+    crawl_log = open(default_crawl_log_path, 'rb').read()
     # tests will fail in year 3000 :)
     assert re.match(b'\A2[^\n]+\n\Z', crawl_log)
     assert crawl_log[24:31] == b'   200 '
@@ -1766,8 +1772,7 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
             'contentSize', 'warcFilename', 'warcFileOffset'}
     assert extra_info['contentSize'] == 145
 
-    crawl_log_1 = open(os.path.join(
-        warcprox_.options.crawl_log_dir, 'test_crawl_log_1.log'), 'rb').read()
+    crawl_log_1 = open(file, 'rb').read()
     assert re.match(b'\A2[^\n]+\n\Z', crawl_log_1)
     assert crawl_log_1[24:31] == b'   200 '
     assert crawl_log_1[31:42] == b'        54 '
@@ -1798,7 +1803,9 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
     # wait for postfetch chain
     wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 3)
 
-    file = os.path.join(warcprox_.options.crawl_log_dir, 'test_crawl_log_2.log')
+    file = os.path.join(
+            warcprox_.options.crawl_log_dir,
+            'test_crawl_log_2-%s-%s.log' % (hostname, port))
     assert os.path.exists(file)
     assert os.stat(file).st_size > 0
 
@@ -1831,7 +1838,9 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
     # wait for postfetch chain
     wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 4)
 
-    file = os.path.join(warcprox_.options.crawl_log_dir, 'test_crawl_log_3.log')
+    file = os.path.join(
+            warcprox_.options.crawl_log_dir,
+            'test_crawl_log_3-%s-%s.log' % (hostname, port))
 
     assert os.path.exists(file)
     crawl_log_3 = open(file, 'rb').read()
@@ -1869,7 +1878,9 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
     # wait for postfetch chain
     wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 5)
 
-    file = os.path.join(warcprox_.options.crawl_log_dir, 'test_crawl_log_4.log')
+    file = os.path.join(
+            warcprox_.options.crawl_log_dir,
+            'test_crawl_log_4-%s-%s.log' % (hostname, port))
     assert os.path.exists(file)
     crawl_log_4 = open(file, 'rb').read()
 
warcprox/crawl_log.py
@@ -24,11 +24,15 @@ import datetime
 import json
 import os
 import warcprox
+import socket
 
 class CrawlLogger(object):
     def __init__(self, dir_, options=warcprox.Options()):
         self.dir = dir_
         self.options = options
+        self.hostname = socket.gethostname().split('.', 1)[0]
 
     def start(self):
         if not os.path.exists(self.dir):
             logging.info('creating directory %r', self.dir)
             os.mkdir(self.dir)
@@ -77,12 +81,11 @@ class CrawlLogger(object):
             pass
         line = b' '.join(fields) + b'\n'
 
-        if 'warc-prefix' in recorded_url.warcprox_meta:
-            filename = '%s.log' % recorded_url.warcprox_meta['warc-prefix']
-        else:
-            filename = 'crawl.log'
-
+        prefix = recorded_url.warcprox_meta.get('warc-prefix', 'crawl')
+        filename = '%s-%s-%s.log' % (
+                prefix, self.hostname, self.options.server_port)
         crawl_log_path = os.path.join(self.dir, filename)
+
         with open(crawl_log_path, 'ab') as f:
             f.write(line)
 
warcprox/warcproxy.py
@@ -507,12 +507,15 @@ class WarcProxy(SingleThreadedWarcProxy, warcprox.mitmproxy.PooledMitmProxy):
 
     def server_activate(self):
         http_server.HTTPServer.server_activate(self)
-        self.logger.info(
+        self.logger.notice(
                 'listening on %s:%s', self.server_address[0],
                 self.server_address[1])
+        # take note of actual port in case running with --port=0 so that other
+        # parts of warcprox have easy access to it
+        self.options.server_port = self.server_address[1]
 
     def server_close(self):
-        self.logger.info('shutting down')
+        self.logger.notice('shutting down')
         http_server.HTTPServer.server_close(self)
         self.remote_connection_pool.clear()
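The comment added to server_activate() refers to a standard socket idiom: binding port 0 asks the OS for a free ephemeral port, and the port actually bound can then be read back from server_address. A minimal sketch using only the standard library (independent of warcprox):

import http.server

# port 0: let the OS pick a free ephemeral port
server = http.server.HTTPServer(
        ('127.0.0.1', 0), http.server.BaseHTTPRequestHandler)
actual_port = server.server_address[1]  # the real port, e.g. 49152
print('listening on port %s' % actual_port)
server.server_close()

Stashing that value on options.server_port is what lets CrawlLogger (previous hunk) name its log files after the real port even when warcprox was started with --port=0.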