Mirror of https://github.com/internetarchive/warcprox.git (synced 2025-01-18 13:22:09 +01:00)
include warcprox host and port in filenames

when using --crawl-log-dir, to avoid collisions with crawl logs written by other warcprox instances (such collisions arise outside of warcprox itself, in most cases)
parent 45aed2e4f6
commit 269e9604c1
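In short: where every instance previously wrote to a single shared file like crawl.log (or <warc-prefix>.log), each instance now writes crawl-<hostname>-<port>.log. A minimal sketch of the naming logic this commit introduces (the function name and arguments here are illustrative, not part of the diff):

    import os
    import socket

    def crawl_log_path(dir_, warcprox_meta, server_port):
        # short hostname only, e.g. 'host.example.org' -> 'host'
        hostname = socket.gethostname().split('.', 1)[0]
        # fall back to the 'crawl' prefix when no warc-prefix was requested
        prefix = warcprox_meta.get('warc-prefix', 'crawl')
        # e.g. 'crawl-host-8000.log' rather than the old 'crawl.log'
        return os.path.join(dir_, '%s-%s-%s.log' % (prefix, hostname, server_port))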
tests/test_warcprox.py

@@ -1716,8 +1716,14 @@ def test_slash_in_warc_prefix(warcprox_, http_daemon, archiving_proxies):
 def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
     urls_before = warcprox_.proxy.running_stats.urls
 
+    hostname = socket.gethostname().split('.', 1)[0]
+    port = warcprox_.proxy.server_port
+    default_crawl_log_path = os.path.join(
+            warcprox_.options.crawl_log_dir,
+            'crawl-%s-%s.log' % (hostname, port))
+
     try:
-        os.unlink(os.path.join(warcprox_.options.crawl_log_dir, 'crawl.log'))
+        os.unlink(default_crawl_log_path)
     except:
         pass
 
@@ -1738,14 +1744,14 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
     # wait for postfetch chain
     wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 2)
 
-    file = os.path.join(warcprox_.options.crawl_log_dir, 'test_crawl_log_1.log')
+    file = os.path.join(
+            warcprox_.options.crawl_log_dir,
+            'test_crawl_log_1-%s-%s.log' % (hostname, port))
     assert os.path.exists(file)
     assert os.stat(file).st_size > 0
-    assert os.path.exists(os.path.join(
-        warcprox_.options.crawl_log_dir, 'crawl.log'))
+    assert os.path.exists(default_crawl_log_path)
 
-    crawl_log = open(os.path.join(
-        warcprox_.options.crawl_log_dir, 'crawl.log'), 'rb').read()
+    crawl_log = open(default_crawl_log_path, 'rb').read()
     # tests will fail in year 3000 :)
     assert re.match(b'\A2[^\n]+\n\Z', crawl_log)
     assert crawl_log[24:31] == b' 200 '
@@ -1766,8 +1772,7 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
             'contentSize', 'warcFilename', 'warcFileOffset'}
     assert extra_info['contentSize'] == 145
 
-    crawl_log_1 = open(os.path.join(
-        warcprox_.options.crawl_log_dir, 'test_crawl_log_1.log'), 'rb').read()
+    crawl_log_1 = open(file, 'rb').read()
     assert re.match(b'\A2[^\n]+\n\Z', crawl_log_1)
     assert crawl_log_1[24:31] == b' 200 '
     assert crawl_log_1[31:42] == b' 54 '
@@ -1798,7 +1803,9 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
     # wait for postfetch chain
     wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 3)
 
-    file = os.path.join(warcprox_.options.crawl_log_dir, 'test_crawl_log_2.log')
+    file = os.path.join(
+            warcprox_.options.crawl_log_dir,
+            'test_crawl_log_2-%s-%s.log' % (hostname, port))
     assert os.path.exists(file)
     assert os.stat(file).st_size > 0
 
@@ -1831,7 +1838,9 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
     # wait for postfetch chain
     wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 4)
 
-    file = os.path.join(warcprox_.options.crawl_log_dir, 'test_crawl_log_3.log')
+    file = os.path.join(
+            warcprox_.options.crawl_log_dir,
+            'test_crawl_log_3-%s-%s.log' % (hostname, port))
 
     assert os.path.exists(file)
     crawl_log_3 = open(file, 'rb').read()
@@ -1869,7 +1878,9 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
     # wait for postfetch chain
     wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 5)
 
-    file = os.path.join(warcprox_.options.crawl_log_dir, 'test_crawl_log_4.log')
+    file = os.path.join(
+            warcprox_.options.crawl_log_dir,
+            'test_crawl_log_4-%s-%s.log' % (hostname, port))
     assert os.path.exists(file)
     crawl_log_4 = open(file, 'rb').read()
 
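Note that socket.gethostname() may return a fully qualified name; the split('.', 1)[0] used in the test (and in CrawlLogger below) keeps only the short host part, so filenames stay compact. A quick illustration with a made-up hostname:

    fqdn = 'crawler01.us.archive.org'   # example value, not from the diff
    assert fqdn.split('.', 1)[0] == 'crawler01'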
warcprox/crawl_log.py

@@ -24,11 +24,15 @@ import datetime
 import json
 import os
 import warcprox
+import socket
 
 class CrawlLogger(object):
     def __init__(self, dir_, options=warcprox.Options()):
         self.dir = dir_
         self.options = options
+        self.hostname = socket.gethostname().split('.', 1)[0]
+
+    def start(self):
         if not os.path.exists(self.dir):
             logging.info('creating directory %r', self.dir)
             os.mkdir(self.dir)
@@ -77,12 +81,11 @@ class CrawlLogger(object):
             pass
         line = b' '.join(fields) + b'\n'
 
-        if 'warc-prefix' in recorded_url.warcprox_meta:
-            filename = '%s.log' % recorded_url.warcprox_meta['warc-prefix']
-        else:
-            filename = 'crawl.log'
+        prefix = recorded_url.warcprox_meta.get('warc-prefix', 'crawl')
+        filename = '%s-%s-%s.log' % (
+                prefix, self.hostname, self.options.server_port)
 
         crawl_log_path = os.path.join(self.dir, filename)
 
         with open(crawl_log_path, 'ab') as f:
             f.write(line)
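The old if/else on 'warc-prefix' collapses into a single dict.get with a default, so the unprefixed case becomes just another prefix named 'crawl'. For example (hypothetical host and port):

    warcprox_meta = {}   # no warc-prefix requested for this recorded URL
    prefix = warcprox_meta.get('warc-prefix', 'crawl')
    filename = '%s-%s-%s.log' % (prefix, 'host1', 8000)
    assert filename == 'crawl-host1-8000.log'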
warcprox/warcproxy.py

@@ -507,12 +507,15 @@ class WarcProxy(SingleThreadedWarcProxy, warcprox.mitmproxy.PooledMitmProxy):
 
     def server_activate(self):
         http_server.HTTPServer.server_activate(self)
-        self.logger.info(
+        self.logger.notice(
             'listening on %s:%s', self.server_address[0],
             self.server_address[1])
+        # take note of actual port in case running with --port=0 so that other
+        # parts of warcprox have easy access to it
+        self.options.server_port = self.server_address[1]
 
     def server_close(self):
-        self.logger.info('shutting down')
+        self.logger.notice('shutting down')
         http_server.HTTPServer.server_close(self)
         self.remote_connection_pool.clear()
 
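Running with --port=0 lets the OS pick a free ephemeral port at bind time; after binding, server_address[1] holds the real port, which is why it is copied into options.server_port for CrawlLogger (and the tests) to read. A standalone sketch of the same stdlib behavior:

    from http.server import BaseHTTPRequestHandler, HTTPServer

    # bind to port 0; the OS assigns a free ephemeral port
    server = HTTPServer(('127.0.0.1', 0), BaseHTTPRequestHandler)
    print(server.server_address[1])   # the actual port that was assigned
    server.server_close()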