Mirror of https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
update stats in RethinkDb asynchronously, since profiling shows this to be a bottleneck in WarcWriterThread (which in turn makes it a bottleneck for the whole app)
parent 6b3cd9de2e
commit 95e611a5d0

setup.py: 14 lines changed
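In short, RethinkDB stat updates now happen on a small thread pool instead of inline in the WarcWriterThread's notify path. A minimal sketch of the pattern, with illustrative names (the real code is in the warcprox/stats.py hunks below):

    import concurrent.futures

    class AsyncStatsSketch:
        def __init__(self):
            # worker threads that absorb the slow database round trips
            self._executor = concurrent.futures.ThreadPoolExecutor(max_workers=10)

        def notify(self, item):
            # hot path: queue the work and return immediately
            self._executor.submit(self._tally, item)

        def _tally(self, item):
            # runs on a worker thread; catch everything, since nobody
            # ever calls .result() on the submitted future
            try:
                pass  # slow RethinkDB read-modify-write goes here
            except Exception:
                pass  # log and move on

        def close(self):
            # drain queued tallies before exiting so none are lost
            self._executor.shutdown(wait=True)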
setup.py
@@ -17,6 +17,12 @@ class PyTest(TestCommand):
         errno = pytest.main(self.test_args)
         sys.exit(errno)
 
+deps = ['certauth>=1.1.0', 'warctools', 'kafka-python', 'surt', 'rethinkstuff']
+try:
+    import concurrent.futures
+except:
+    deps.append('futures')
+
 setuptools.setup(name='warcprox',
         version='1.5.0',
         description='WARC writing MITM HTTP/S proxy',
@@ -26,13 +32,7 @@ setuptools.setup(name='warcprox',
         long_description=open('README.rst').read(),
         license='GPL',
         packages=['warcprox'],
-        install_requires=[
-            'certauth>=1.1.0',
-            'warctools',
-            'kafka-python',
-            'surt',
-            'rethinkstuff',
-            ],
+        install_requires=deps,
         tests_require=['requests>=2.0.1', 'pytest'],  # >=2.0.1 for https://github.com/kennethreitz/requests/pull/1636
         cmdclass = {'test': PyTest},
         test_suite='warcprox.tests',
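The setup.py change makes the new dependency conditional: concurrent.futures is in the standard library on Python 3.2+, while Python 2 needs the `futures` backport from PyPI. The probe boils down to this standalone sketch of the same idea:

    deps = ['certauth>=1.1.0', 'warctools', 'kafka-python', 'surt', 'rethinkstuff']
    try:
        import concurrent.futures   # stdlib on Python 3.2+
    except ImportError:
        deps.append('futures')      # PyPI backport for Python 2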
tests/test_warcprox.py
@@ -398,8 +398,12 @@ def test_dedup_http(http_daemon, warcprox_, archiving_proxies, playback_proxies)
 
     # wait for writer thread to process
     time.sleep(0.5)
-    while not warcprox_.warc_writer_thread.idle:
+    while (not warcprox_.warc_writer_thread.idle
+            or (warcprox_.proxy.stats_db
+                and hasattr(warcprox_.proxy.stats_db, "_executor")
+                and warcprox_.proxy.stats_db._executor._work_queue.qsize() > 0)):
         time.sleep(0.5)
+    time.sleep(0.5)
 
     # check in dedup db (no change from prev)
     dedup_lookup = warcprox_.warc_writer_thread.dedup_db.lookup(b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc')
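The tests used to wait only for the warc writer thread to go idle; with tallying moved to a background executor they also wait for that executor's work queue to drain. The same condition as a standalone helper (the helper name is made up for illustration; the final sleep covers a task that has already been dequeued but is still running, since qsize() only counts queued items):

    import time

    def wait_until_quiescent(warcprox_):
        # writer thread idle, and stats executor (if any) has nothing queued
        while (not warcprox_.warc_writer_thread.idle
                or (warcprox_.proxy.stats_db
                    and hasattr(warcprox_.proxy.stats_db, "_executor")
                    and warcprox_.proxy.stats_db._executor._work_queue.qsize() > 0)):
            time.sleep(0.5)
        time.sleep(0.5)  # margin for an in-flight task that qsize() can't see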
@@ -461,8 +465,13 @@ def test_dedup_https(https_daemon, warcprox_, archiving_proxies, playback_proxies)
 
     # wait for writer thread to process
     time.sleep(0.5)
-    while not warcprox_.warc_writer_thread.idle:
+    while (not warcprox_.warc_writer_thread.idle
+            or (warcprox_.proxy.stats_db
+                and hasattr(warcprox_.proxy.stats_db, "_executor")
+                and warcprox_.proxy.stats_db._executor._work_queue.qsize() > 0)):
         time.sleep(0.5)
+    time.sleep(0.5)
+
 
     # check in dedup db (no change from prev)
     dedup_lookup = warcprox_.warc_writer_thread.dedup_db.lookup(b'sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89')
@@ -491,8 +500,12 @@ def test_limits(http_daemon, warcprox_, archiving_proxies):
 
     # wait for writer thread to process
     time.sleep(0.5)
-    while not warcprox_.warc_writer_thread.idle:
+    while (not warcprox_.warc_writer_thread.idle
+            or (warcprox_.proxy.stats_db
+                and hasattr(warcprox_.proxy.stats_db, "_executor")
+                and warcprox_.proxy.stats_db._executor._work_queue.qsize() > 0)):
         time.sleep(0.5)
+    time.sleep(0.5)
 
     response = requests.get(url, proxies=archiving_proxies, headers=headers, stream=True)
     assert response.status_code == 420
@@ -515,8 +528,12 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies, playback_proxies)
 
     # wait for writer thread to process
     time.sleep(0.5)
-    while not warcprox_.warc_writer_thread.idle:
+    while (not warcprox_.warc_writer_thread.idle
+            or (warcprox_.proxy.stats_db
+                and hasattr(warcprox_.proxy.stats_db, "_executor")
+                and warcprox_.proxy.stats_db._executor._work_queue.qsize() > 0)):
         time.sleep(0.5)
+    time.sleep(0.5)
 
     # check url1 in dedup db bucket_a
     dedup_lookup = warcprox_.warc_writer_thread.dedup_db.lookup(b'sha1:bc3fac8847c9412f49d955e626fb58a76befbf81', bucket="bucket_a")
@@ -541,6 +558,7 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies, playback_proxies)
     time.sleep(0.5)
     while not warcprox_.warc_writer_thread.idle:
         time.sleep(0.5)
+    time.sleep(0.5)
 
     # check url2 in dedup db bucket_b
     dedup_lookup = warcprox_.warc_writer_thread.dedup_db.lookup(b'sha1:bc3fac8847c9412f49d955e626fb58a76befbf81', bucket="bucket_b")
@@ -568,6 +586,7 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies, playback_proxies)
     time.sleep(0.5)
     while not warcprox_.warc_writer_thread.idle:
         time.sleep(0.5)
+    time.sleep(0.5)
 
     # close the warc
     assert warcprox_.warc_writer_thread.writer_pool.warc_writers["test_dedup_buckets"]
@@ -575,14 +594,14 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies, playback_proxies)
     warc_path = os.path.join(writer.directory, writer._f_finalname)
     warcprox_.warc_writer_thread.writer_pool.warc_writers["test_dedup_buckets"].close_writer()
     assert os.path.exists(warc_path)
 
     # read the warc
     fh = warctools.ArchiveRecord.open_archive(warc_path)
     record_iter = fh.read_records(limit=None, offsets=True)
     try:
         (offset, record, errors) = next(record_iter)
         assert record.type == b'warcinfo'
 
         # url1 bucket_a
         (offset, record, errors) = next(record_iter)
         assert record.type == b'response'
warcprox/stats.py
@@ -14,6 +14,7 @@ import json
 from hanzo import warctools
 import random
 import warcprox
+import concurrent.futures
 
 def _empty_bucket(bucket):
     return {
@@ -95,7 +96,7 @@ class StatsDb:
             if b in self.db:
                 bucket_stats = json.loads(self.db[b].decode("utf-8"))
             else:
                 bucket_stats = _empty_bucket(b)
 
             bucket_stats["total"]["urls"] += 1
             bucket_stats["total"]["wire_bytes"] += recorded_url.size
@@ -119,6 +120,7 @@ class RethinkStatsDb:
         self.replicas = replicas or min(3, len(r.servers))
         self._ensure_db_table()
         self.options = options
+        self._executor = concurrent.futures.ThreadPoolExecutor(max_workers=10)
 
     def _ensure_db_table(self):
         dbs = self.r.db_list().run()
@@ -127,12 +129,15 @@
             self.r.db_create(self.r.dbname).run()
         tables = self.r.table_list().run()
         if not self.table in tables:
             self.logger.info("creating rethinkdb table %s in database %s shards=%s replicas=%s",
                     repr(self.table), repr(self.r.dbname), self.shards, self.replicas)
             self.r.table_create(self.table, primary_key="bucket", shards=self.shards, replicas=self.replicas).run()
 
     def close(self):
-        pass
+        self.logger.info("waiting for ~%s tasks to finish",
+                self._executor._work_queue.qsize() + (self._executor._max_workers/2))
+        self._executor.shutdown(wait=True)
+        self.logger.info("shut down complete")
 
     def sync(self):
         pass
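close() now blocks until the queued tallies are written. The logged count is only a rough estimate of outstanding work (queued tasks plus a guess at in-flight ones); the real guarantee comes from shutdown(wait=True). Roughly, with `executor` standing in for self._executor:

    # ThreadPoolExecutor.shutdown(wait=True): no new tasks are accepted,
    # queued and in-flight tasks still run, and the call returns only
    # after all of them have finished.
    executor.shutdown(wait=True)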
@@ -149,7 +154,32 @@
             return bucket0_stats[bucket1]
         return bucket0_stats
 
-    def tally(self, recorded_url, records):
+    def _tally(self, buckets, size, is_revisit):
+        try:
+            self.logger.info("starting task self._tally(%s)", (buckets, size, is_revisit))
+            for bucket in buckets:
+                bucket_stats = self.value(bucket) or _empty_bucket(bucket)
+
+                bucket_stats["total"]["urls"] += 1
+                bucket_stats["total"]["wire_bytes"] += size
+
+                if is_revisit:
+                    bucket_stats["revisit"]["urls"] += 1
+                    bucket_stats["revisit"]["wire_bytes"] += size
+                else:
+                    bucket_stats["new"]["urls"] += 1
+                    bucket_stats["new"]["wire_bytes"] += size
+
+                self.logger.debug("saving %s", bucket_stats)
+                result = self.r.table(self.table).insert(bucket_stats, conflict="replace").run()
+                if sorted(result.values()) != [0,0,0,0,0,1] or [result["deleted"],result["skipped"],result["errors"]] != [0,0,0]:
+                    raise Exception("unexpected result %s saving %s", result, record)
+
+            self.logger.info("finished task self._tally(%s)", (buckets, size, is_revisit))
+        except:
+            self.logger.error("unexpected problem tallying stats", exc_info=True)
+
+    def _extract_stats_info(self, recorded_url, records):
         buckets = ["__all__"]
 
         if (recorded_url.warcprox_meta
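The sanity check on the insert result is easier to read against the shape of a RethinkDB write result, which is a dict of six counters. A sketch of the expected outcomes for a single-document insert with conflict="replace" (`r` and `bucket_stats` as in the hunk above, table name illustrative):

    result = r.table("stats").insert(bucket_stats, conflict="replace").run()
    # healthy outcomes have exactly one counter set to 1:
    #   {"inserted": 1, "replaced": 0, "unchanged": 0,
    #    "deleted": 0, "skipped": 0, "errors": 0}   # brand new bucket
    #   {"inserted": 0, "replaced": 1, "unchanged": 0,
    #    "deleted": 0, "skipped": 0, "errors": 0}   # overwrote existing bucket
    # so sorted(result.values()) == [0, 0, 0, 0, 0, 1] means exactly one
    # counter fired, and checking deleted/skipped/errors == 0 ensures that
    # counter is one of inserted/replaced/unchanged.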
@@ -159,24 +189,15 @@
         else:
             buckets.append("__unspecified__")
 
-        for bucket in buckets:
-            bucket_stats = self.value(bucket) or _empty_bucket(bucket)
-
-            bucket_stats["total"]["urls"] += 1
-            bucket_stats["total"]["wire_bytes"] += recorded_url.size
-
-            if records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.REVISIT:
-                bucket_stats["revisit"]["urls"] += 1
-                bucket_stats["revisit"]["wire_bytes"] += recorded_url.size
-            else:
-                bucket_stats["new"]["urls"] += 1
-                bucket_stats["new"]["wire_bytes"] += recorded_url.size
-
-            self.logger.debug("saving %s", bucket_stats)
-            result = self.r.table(self.table).insert(bucket_stats, conflict="replace").run()
-            if sorted(result.values()) != [0,0,0,0,0,1] or [result["deleted"],result["skipped"],result["errors"]] != [0,0,0]:
-                raise Exception("unexpected result %s saving %s", result, record)
+        is_revisit = records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.REVISIT
+
+        return buckets, recorded_url.size, is_revisit
+
+    def tally(self, recorded_url, records):
+        self._tally(self._extract_stats_info(recorded_url, records))
 
     def notify(self, recorded_url, records):
-        self.tally(recorded_url, records)
+        args = self._extract_stats_info(recorded_url, records)
+        self.logger.info("submitting task self._tally(%s)", args)
+        self._executor.submit(self._tally, *args)
 
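Net effect on the hot path, condensed from the hunks above: notify() used to do the RethinkDB reads and writes inline on the WarcWriterThread, and now it only extracts (buckets, size, is_revisit) and queues the real work:

    # before: blocks the WarcWriterThread for one read+write per bucket
    def notify(self, recorded_url, records):
        self.tally(recorded_url, records)

    # after: cheap extraction on the caller's thread, slow part on the pool
    def notify(self, recorded_url, records):
        args = self._extract_stats_info(recorded_url, records)
        self._executor.submit(self._tally, *args)

Note that the synchronous tally() wrapper passes the extracted tuple as a single argument, whereas the notify() path unpacks it with *args.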