mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
hidden --profile option to enable profiling of warc writer thread and periodic logging of memory usage info; at shutdown, close stats db and unregister from service registry; logging improvements
This commit is contained in:
parent
7eb82ab8a2
commit
e3a5717446
1
.gitignore
vendored
1
.gitignore
vendored
@ -11,3 +11,4 @@ warcs
|
|||||||
build
|
build
|
||||||
dist
|
dist
|
||||||
.tox
|
.tox
|
||||||
|
out.*
|
||||||
|
@ -44,9 +44,6 @@ class WarcproxController(object):
|
|||||||
|
|
||||||
def debug_mem(self):
|
def debug_mem(self):
|
||||||
self.logger.info("self.proxy.recorded_url_q.qsize()=%s", self.proxy.recorded_url_q.qsize())
|
self.logger.info("self.proxy.recorded_url_q.qsize()=%s", self.proxy.recorded_url_q.qsize())
|
||||||
if self.proxy.stats_db and hasattr(self.proxy.stats_db, "_executor"):
|
|
||||||
self.logger.info("self.proxy.stats_db._executor._work_queue.qsize()=%s",
|
|
||||||
self.proxy.stats_db._executor._work_queue.qsize())
|
|
||||||
with open("/proc/self/status") as f:
|
with open("/proc/self/status") as f:
|
||||||
for line in f:
|
for line in f:
|
||||||
fields = line.split()
|
fields = line.split()
|
||||||
@ -118,6 +115,7 @@ class WarcproxController(object):
|
|||||||
'port': self.options.port,
|
'port': self.options.port,
|
||||||
}
|
}
|
||||||
status_info['load'] = 1.0 * self.proxy.recorded_url_q.qsize() / (self.proxy.recorded_url_q.maxsize or 100)
|
status_info['load'] = 1.0 * self.proxy.recorded_url_q.qsize() / (self.proxy.recorded_url_q.maxsize or 100)
|
||||||
|
status_info['queue_size'] = self.proxy.recorded_url_q.qsize()
|
||||||
|
|
||||||
self.status_info = self.service_registry.heartbeat(status_info)
|
self.status_info = self.service_registry.heartbeat(status_info)
|
||||||
self.logger.debug("status in service registry: %s", self.status_info)
|
self.logger.debug("status in service registry: %s", self.status_info)
|
||||||
@ -154,9 +152,9 @@ class WarcproxController(object):
|
|||||||
if self.service_registry and (not hasattr(self, "status_info") or (datetime.datetime.now(utc) - self.status_info["last_heartbeat"]).total_seconds() > self.HEARTBEAT_INTERVAL):
|
if self.service_registry and (not hasattr(self, "status_info") or (datetime.datetime.now(utc) - self.status_info["last_heartbeat"]).total_seconds() > self.HEARTBEAT_INTERVAL):
|
||||||
self._service_heartbeat()
|
self._service_heartbeat()
|
||||||
|
|
||||||
# if (datetime.datetime.utcnow() - last_mem_dbg).total_seconds() > 60:
|
if self.options.profile and (datetime.datetime.utcnow() - last_mem_dbg).total_seconds() > 60:
|
||||||
# self.debug_mem()
|
self.debug_mem()
|
||||||
# last_mem_dbg = datetime.datetime.utcnow()
|
last_mem_dbg = datetime.datetime.utcnow()
|
||||||
|
|
||||||
time.sleep(0.5)
|
time.sleep(0.5)
|
||||||
except:
|
except:
|
||||||
@ -176,10 +174,15 @@ class WarcproxController(object):
|
|||||||
# wait for threads to finish
|
# wait for threads to finish
|
||||||
self.warc_writer_thread.join()
|
self.warc_writer_thread.join()
|
||||||
|
|
||||||
if self.warc_writer_thread.dedup_db is not None:
|
if self.proxy.stats_db:
|
||||||
|
self.proxy.stats_db.close()
|
||||||
|
if self.warc_writer_thread.dedup_db:
|
||||||
self.warc_writer_thread.dedup_db.close()
|
self.warc_writer_thread.dedup_db.close()
|
||||||
|
|
||||||
proxy_thread.join()
|
proxy_thread.join()
|
||||||
if self.playback_proxy is not None:
|
if self.playback_proxy is not None:
|
||||||
playback_proxy_thread.join()
|
playback_proxy_thread.join()
|
||||||
|
|
||||||
|
if self.service_registry and hasattr(self, "status_info"):
|
||||||
|
self.service_registry.unregister(self.status_info["id"])
|
||||||
|
|
||||||
|
@ -80,6 +80,8 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
|
|||||||
default=None, help='kafka capture feed topic')
|
default=None, help='kafka capture feed topic')
|
||||||
arg_parser.add_argument('--queue-size', dest='queue_size', default=1000,
|
arg_parser.add_argument('--queue-size', dest='queue_size', default=1000,
|
||||||
help=argparse.SUPPRESS)
|
help=argparse.SUPPRESS)
|
||||||
|
arg_parser.add_argument('--profile', action='store_true', default=False,
|
||||||
|
help=argparse.SUPPRESS)
|
||||||
arg_parser.add_argument('--version', action='version',
|
arg_parser.add_argument('--version', action='version',
|
||||||
version="warcprox {}".format(warcprox.__version__))
|
version="warcprox {}".format(warcprox.__version__))
|
||||||
arg_parser.add_argument('-v', '--verbose', dest='verbose', action='store_true')
|
arg_parser.add_argument('-v', '--verbose', dest='verbose', action='store_true')
|
||||||
|
@ -16,6 +16,7 @@ import random
|
|||||||
import warcprox
|
import warcprox
|
||||||
import threading
|
import threading
|
||||||
import rethinkdb as r
|
import rethinkdb as r
|
||||||
|
import datetime
|
||||||
|
|
||||||
def _empty_bucket(bucket):
|
def _empty_bucket(bucket):
|
||||||
return {
|
return {
|
||||||
@ -160,7 +161,7 @@ class RethinkStatsDb:
|
|||||||
|
|
||||||
if not self._stop.is_set():
|
if not self._stop.is_set():
|
||||||
self._timer = threading.Timer(0.5, self._update_batch)
|
self._timer = threading.Timer(0.5, self._update_batch)
|
||||||
self._timer.name = "RethinkCaptures-batch-insert-timer"
|
self._timer.name = "RethinkStats-batch-update-timer-%s" % datetime.datetime.utcnow().isoformat()
|
||||||
self._timer.start()
|
self._timer.start()
|
||||||
else:
|
else:
|
||||||
self.logger.info("finished")
|
self.logger.info("finished")
|
||||||
|
@ -1,5 +1,4 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
# vim:set sw=4 et:
|
|
||||||
#
|
#
|
||||||
"""
|
"""
|
||||||
WARC writing MITM HTTP/S proxy
|
WARC writing MITM HTTP/S proxy
|
||||||
@ -151,8 +150,8 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
|
|||||||
key, limit = item
|
key, limit = item
|
||||||
bucket0, bucket1, bucket2 = key.rsplit(".", 2)
|
bucket0, bucket1, bucket2 = key.rsplit(".", 2)
|
||||||
value = self.server.stats_db.value(bucket0, bucket1, bucket2)
|
value = self.server.stats_db.value(bucket0, bucket1, bucket2)
|
||||||
# self.logger.debug("warcprox_meta['limits']=%s stats['%s']=%s recorded_url_q.qsize()=%s",
|
self.logger.debug("warcprox_meta['limits']=%s stats['%s']=%s recorded_url_q.qsize()=%s",
|
||||||
# warcprox_meta['limits'], key, value, self.server.recorded_url_q.qsize())
|
warcprox_meta['limits'], key, value, self.server.recorded_url_q.qsize())
|
||||||
if value and value >= limit:
|
if value and value >= limit:
|
||||||
body = "request rejected by warcprox: reached limit {}={}\n".format(key, limit).encode("utf-8")
|
body = "request rejected by warcprox: reached limit {}={}\n".format(key, limit).encode("utf-8")
|
||||||
self.send_response(420, "Reached limit")
|
self.send_response(420, "Reached limit")
|
||||||
@ -369,7 +368,7 @@ class SingleThreadedWarcProxy(http_server.HTTPServer):
|
|||||||
if recorded_url_q is not None:
|
if recorded_url_q is not None:
|
||||||
self.recorded_url_q = recorded_url_q
|
self.recorded_url_q = recorded_url_q
|
||||||
else:
|
else:
|
||||||
self.recorded_url_q = queue.Queue()
|
self.recorded_url_q = queue.Queue(maxsize=options.queue_size or 1000)
|
||||||
|
|
||||||
self.stats_db = stats_db
|
self.stats_db = stats_db
|
||||||
|
|
||||||
@ -383,6 +382,8 @@ class SingleThreadedWarcProxy(http_server.HTTPServer):
|
|||||||
self.logger.info('WarcProxy shutting down')
|
self.logger.info('WarcProxy shutting down')
|
||||||
http_server.HTTPServer.server_close(self)
|
http_server.HTTPServer.server_close(self)
|
||||||
|
|
||||||
|
def handle_error(self, request, client_address):
|
||||||
|
self.logger.warn("exception processing request %s from %s", request, client_address, exc_info=True)
|
||||||
|
|
||||||
class WarcProxy(socketserver.ThreadingMixIn, SingleThreadedWarcProxy):
|
class WarcProxy(socketserver.ThreadingMixIn, SingleThreadedWarcProxy):
|
||||||
pass
|
pass
|
||||||
|
@ -36,7 +36,10 @@ class WarcWriterThread(threading.Thread):
|
|||||||
self.idle = None
|
self.idle = None
|
||||||
|
|
||||||
def run(self):
|
def run(self):
|
||||||
cProfile.runctx('self._run()', globals(), locals(), sort='cumulative')
|
if self.options.profile:
|
||||||
|
cProfile.runctx('self._run()', globals(), locals(), sort='cumulative')
|
||||||
|
else:
|
||||||
|
self._run()
|
||||||
|
|
||||||
def _run(self):
|
def _run(self):
|
||||||
while not self.stop.is_set():
|
while not self.stop.is_set():
|
||||||
@ -44,6 +47,11 @@ class WarcWriterThread(threading.Thread):
|
|||||||
self.name = 'WarcWriterThread(tid={})'.format(warcprox.gettid())
|
self.name = 'WarcWriterThread(tid={})'.format(warcprox.gettid())
|
||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
|
if self.stop.is_set():
|
||||||
|
qsize = self.recorded_url_q.qsize()
|
||||||
|
if qsize % 50 == 0:
|
||||||
|
self.logger.info("%s urls left to write", qsize)
|
||||||
|
|
||||||
recorded_url = self.recorded_url_q.get(block=True, timeout=0.5)
|
recorded_url = self.recorded_url_q.get(block=True, timeout=0.5)
|
||||||
self.idle = None
|
self.idle = None
|
||||||
if self.dedup_db:
|
if self.dedup_db:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user