mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
rethinkstuff -> doublethink
This commit is contained in:
parent
3a80fde50c
commit
842bfd651c
4
setup.py
4
setup.py
@ -41,7 +41,7 @@ deps = [
|
|||||||
'warctools',
|
'warctools',
|
||||||
'kafka-python>=1.0.1',
|
'kafka-python>=1.0.1',
|
||||||
'surt>=0.3b4',
|
'surt>=0.3b4',
|
||||||
'rethinkstuff',
|
'doublethink>=0.2.0.dev69',
|
||||||
'PySocks',
|
'PySocks',
|
||||||
]
|
]
|
||||||
try:
|
try:
|
||||||
@ -51,7 +51,7 @@ except:
|
|||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='warcprox',
|
name='warcprox',
|
||||||
version='2.1b1.dev52',
|
version='2.1b1.dev53',
|
||||||
description='WARC writing MITM HTTP/S proxy',
|
description='WARC writing MITM HTTP/S proxy',
|
||||||
url='https://github.com/internetarchive/warcprox',
|
url='https://github.com/internetarchive/warcprox',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
@ -36,7 +36,7 @@ import requests
|
|||||||
import re
|
import re
|
||||||
import json
|
import json
|
||||||
import random
|
import random
|
||||||
import rethinkstuff
|
import doublethink
|
||||||
from hanzo import warctools
|
from hanzo import warctools
|
||||||
import warnings
|
import warnings
|
||||||
import pprint
|
import pprint
|
||||||
@ -245,15 +245,15 @@ def captures_db(request, rethinkdb_servers, rethinkdb_big_table):
|
|||||||
servers = rethinkdb_servers.split(",")
|
servers = rethinkdb_servers.split(",")
|
||||||
if rethinkdb_big_table:
|
if rethinkdb_big_table:
|
||||||
db = 'warcprox_test_captures_' + "".join(random.sample("abcdefghijklmnopqrstuvwxyz0123456789_",8))
|
db = 'warcprox_test_captures_' + "".join(random.sample("abcdefghijklmnopqrstuvwxyz0123456789_",8))
|
||||||
r = rethinkstuff.Rethinker(servers, db)
|
rr = doublethink.Rethinker(servers, db)
|
||||||
captures_db = warcprox.bigtable.RethinkCaptures(r)
|
captures_db = warcprox.bigtable.RethinkCaptures(rr)
|
||||||
captures_db.start()
|
captures_db.start()
|
||||||
|
|
||||||
def fin():
|
def fin():
|
||||||
if captures_db:
|
if captures_db:
|
||||||
captures_db.close()
|
captures_db.close()
|
||||||
# logging.info('dropping rethinkdb database {}'.format(db))
|
# logging.info('dropping rethinkdb database {}'.format(db))
|
||||||
# result = captures_db.r.db_drop(db).run()
|
# result = captures_db.rr.db_drop(db).run()
|
||||||
# logging.info("result=%s", result)
|
# logging.info("result=%s", result)
|
||||||
request.addfinalizer(fin)
|
request.addfinalizer(fin)
|
||||||
|
|
||||||
@ -268,15 +268,15 @@ def rethink_dedup_db(request, rethinkdb_servers, captures_db):
|
|||||||
else:
|
else:
|
||||||
servers = rethinkdb_servers.split(",")
|
servers = rethinkdb_servers.split(",")
|
||||||
db = 'warcprox_test_dedup_' + "".join(random.sample("abcdefghijklmnopqrstuvwxyz0123456789_",8))
|
db = 'warcprox_test_dedup_' + "".join(random.sample("abcdefghijklmnopqrstuvwxyz0123456789_",8))
|
||||||
r = rethinkstuff.Rethinker(servers, db)
|
rr = doublethink.Rethinker(servers, db)
|
||||||
ddb = warcprox.dedup.RethinkDedupDb(r)
|
ddb = warcprox.dedup.RethinkDedupDb(rr)
|
||||||
|
|
||||||
def fin():
|
def fin():
|
||||||
if rethinkdb_servers:
|
if rethinkdb_servers:
|
||||||
ddb.close()
|
ddb.close()
|
||||||
if not captures_db:
|
if not captures_db:
|
||||||
logging.info('dropping rethinkdb database {}'.format(db))
|
logging.info('dropping rethinkdb database {}'.format(db))
|
||||||
result = ddb.r.db_drop(db).run()
|
result = ddb.rr.db_drop(db).run()
|
||||||
logging.info("result=%s", result)
|
logging.info("result=%s", result)
|
||||||
request.addfinalizer(fin)
|
request.addfinalizer(fin)
|
||||||
|
|
||||||
@ -305,8 +305,8 @@ def stats_db(request, rethinkdb_servers):
|
|||||||
if rethinkdb_servers:
|
if rethinkdb_servers:
|
||||||
servers = rethinkdb_servers.split(",")
|
servers = rethinkdb_servers.split(",")
|
||||||
db = 'warcprox_test_stats_' + "".join(random.sample("abcdefghijklmnopqrstuvwxyz0123456789_",8))
|
db = 'warcprox_test_stats_' + "".join(random.sample("abcdefghijklmnopqrstuvwxyz0123456789_",8))
|
||||||
r = rethinkstuff.Rethinker(servers, db)
|
rr = doublethink.Rethinker(servers, db)
|
||||||
sdb = warcprox.stats.RethinkStatsDb(r)
|
sdb = warcprox.stats.RethinkStatsDb(rr)
|
||||||
sdb.start()
|
sdb.start()
|
||||||
else:
|
else:
|
||||||
f = tempfile.NamedTemporaryFile(prefix='warcprox-test-stats-', suffix='.db', delete=False)
|
f = tempfile.NamedTemporaryFile(prefix='warcprox-test-stats-', suffix='.db', delete=False)
|
||||||
@ -318,7 +318,7 @@ def stats_db(request, rethinkdb_servers):
|
|||||||
sdb.close()
|
sdb.close()
|
||||||
if rethinkdb_servers:
|
if rethinkdb_servers:
|
||||||
logging.info('dropping rethinkdb database {}'.format(db))
|
logging.info('dropping rethinkdb database {}'.format(db))
|
||||||
result = sdb.r.db_drop(db).run()
|
result = sdb.rr.db_drop(db).run()
|
||||||
logging.info("result=%s", result)
|
logging.info("result=%s", result)
|
||||||
else:
|
else:
|
||||||
logging.info('deleting file {}'.format(stats_db_file))
|
logging.info('deleting file {}'.format(stats_db_file))
|
||||||
@ -332,15 +332,15 @@ def service_registry(request, rethinkdb_servers):
|
|||||||
if rethinkdb_servers:
|
if rethinkdb_servers:
|
||||||
servers = rethinkdb_servers.split(",")
|
servers = rethinkdb_servers.split(",")
|
||||||
db = 'warcprox_test_services_' + "".join(random.sample("abcdefghijklmnopqrstuvwxyz0123456789_",8))
|
db = 'warcprox_test_services_' + "".join(random.sample("abcdefghijklmnopqrstuvwxyz0123456789_",8))
|
||||||
r = rethinkstuff.Rethinker(servers, db)
|
rr = doublethink.Rethinker(servers, db)
|
||||||
|
|
||||||
def fin():
|
def fin():
|
||||||
logging.info('dropping rethinkdb database {}'.format(db))
|
logging.info('dropping rethinkdb database {}'.format(db))
|
||||||
result = r.db_drop(db).run()
|
result = rr.db_drop(db).run()
|
||||||
logging.info("result=%s", result)
|
logging.info("result=%s", result)
|
||||||
request.addfinalizer(fin)
|
request.addfinalizer(fin)
|
||||||
|
|
||||||
return rethinkstuff.ServiceRegistry(r)
|
return doublethink.ServiceRegistry(rr)
|
||||||
else:
|
else:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@ -1265,7 +1265,7 @@ def test_dedup_ok_flag(
|
|||||||
|
|
||||||
# inspect what's in rethinkdb more closely
|
# inspect what's in rethinkdb more closely
|
||||||
rethink_captures = warcprox_.warc_writer_thread.dedup_db.captures_db
|
rethink_captures = warcprox_.warc_writer_thread.dedup_db.captures_db
|
||||||
results_iter = rethink_captures.r.table(rethink_captures.table).get_all(
|
results_iter = rethink_captures.rr.table(rethink_captures.table).get_all(
|
||||||
['FV7RGGA3SCRFNTS6L275N2OJQJXM5EDZ', 'response',
|
['FV7RGGA3SCRFNTS6L275N2OJQJXM5EDZ', 'response',
|
||||||
'test_dedup_ok_flag'], index='sha1_warc_type').order_by(
|
'test_dedup_ok_flag'], index='sha1_warc_type').order_by(
|
||||||
'timestamp').run()
|
'timestamp').run()
|
||||||
|
@ -34,19 +34,20 @@ import os
|
|||||||
import hashlib
|
import hashlib
|
||||||
import threading
|
import threading
|
||||||
import datetime
|
import datetime
|
||||||
import rethinkstuff
|
import doublethink
|
||||||
|
import rethinkdb as r
|
||||||
|
|
||||||
class RethinkCaptures:
|
class RethinkCaptures:
|
||||||
"""Inserts in batches every 0.5 seconds"""
|
"""Inserts in batches every 0.5 seconds"""
|
||||||
logger = logging.getLogger("warcprox.bigtable.RethinkCaptures")
|
logger = logging.getLogger("warcprox.bigtable.RethinkCaptures")
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self, r, table="captures", shards=None, replicas=None,
|
self, rr, table="captures", shards=None, replicas=None,
|
||||||
options=warcprox.Options()):
|
options=warcprox.Options()):
|
||||||
self.r = r
|
self.rr = rr
|
||||||
self.table = table
|
self.table = table
|
||||||
self.shards = shards or len(r.servers)
|
self.shards = shards or len(rr.servers)
|
||||||
self.replicas = replicas or min(3, len(r.servers))
|
self.replicas = replicas or min(3, len(rr.servers))
|
||||||
self.options = options
|
self.options = options
|
||||||
self._ensure_db_table()
|
self._ensure_db_table()
|
||||||
|
|
||||||
@ -64,7 +65,7 @@ class RethinkCaptures:
|
|||||||
try:
|
try:
|
||||||
with self._batch_lock:
|
with self._batch_lock:
|
||||||
if len(self._batch) > 0:
|
if len(self._batch) > 0:
|
||||||
result = self.r.table(self.table).insert(
|
result = self.rr.table(self.table).insert(
|
||||||
self._batch, conflict="replace").run()
|
self._batch, conflict="replace").run()
|
||||||
if (result["inserted"] + result["replaced"]
|
if (result["inserted"] + result["replaced"]
|
||||||
+ result["unchanged"] != len(self._batch)):
|
+ result["unchanged"] != len(self._batch)):
|
||||||
@ -99,16 +100,22 @@ class RethinkCaptures:
|
|||||||
self.logger.info("finished")
|
self.logger.info("finished")
|
||||||
|
|
||||||
def _ensure_db_table(self):
|
def _ensure_db_table(self):
|
||||||
dbs = self.r.db_list().run()
|
dbs = self.rr.db_list().run()
|
||||||
if not self.r.dbname in dbs:
|
if not self.rr.dbname in dbs:
|
||||||
self.logger.info("creating rethinkdb database %s", repr(self.r.dbname))
|
self.logger.info(
|
||||||
self.r.db_create(self.r.dbname).run()
|
"creating rethinkdb database %s", repr(self.rr.dbname))
|
||||||
tables = self.r.table_list().run()
|
self.rr.db_create(self.rr.dbname).run()
|
||||||
|
tables = self.rr.table_list().run()
|
||||||
if not self.table in tables:
|
if not self.table in tables:
|
||||||
self.logger.info("creating rethinkdb table %s in database %s", repr(self.table), repr(self.r.dbname))
|
self.logger.info(
|
||||||
self.r.table_create(self.table, shards=self.shards, replicas=self.replicas).run()
|
"creating rethinkdb table %s in database %s",
|
||||||
self.r.table(self.table).index_create("abbr_canon_surt_timestamp", [self.r.row["abbr_canon_surt"], self.r.row["timestamp"]]).run()
|
repr(self.table), repr(self.rr.dbname))
|
||||||
self.r.table(self.table).index_create("sha1_warc_type", [self.r.row["sha1base32"], self.r.row["warc_type"], self.r.row["bucket"]]).run()
|
self.rr.table_create(self.table, shards=self.shards, replicas=self.replicas).run()
|
||||||
|
self.rr.table(self.table).index_create(
|
||||||
|
"abbr_canon_surt_timestamp",
|
||||||
|
[r.row["abbr_canon_surt"], r.row["timestamp"]]).run()
|
||||||
|
self.rr.table(self.table).index_create("sha1_warc_type", [
|
||||||
|
r.row["sha1base32"], r.row["warc_type"], r.row["bucket"]]).run()
|
||||||
|
|
||||||
def find_response_by_digest(self, algo, raw_digest, bucket="__unspecified__"):
|
def find_response_by_digest(self, algo, raw_digest, bucket="__unspecified__"):
|
||||||
if algo != "sha1":
|
if algo != "sha1":
|
||||||
@ -116,10 +123,10 @@ class RethinkCaptures:
|
|||||||
"digest type is %s but big captures table is indexed by "
|
"digest type is %s but big captures table is indexed by "
|
||||||
"sha1" % algo)
|
"sha1" % algo)
|
||||||
sha1base32 = base64.b32encode(raw_digest).decode("utf-8")
|
sha1base32 = base64.b32encode(raw_digest).decode("utf-8")
|
||||||
results_iter = self.r.table(self.table).get_all(
|
results_iter = self.rr.table(self.table).get_all(
|
||||||
[sha1base32, "response", bucket],
|
[sha1base32, "response", bucket],
|
||||||
index="sha1_warc_type").filter(
|
index="sha1_warc_type").filter(
|
||||||
self.r.row["dedup_ok"], default=True).run()
|
r.row["dedup_ok"], default=True).run()
|
||||||
results = list(results_iter)
|
results = list(results_iter)
|
||||||
if len(results) > 0:
|
if len(results) > 0:
|
||||||
if len(results) > 1:
|
if len(results) > 1:
|
||||||
@ -162,7 +169,7 @@ class RethinkCaptures:
|
|||||||
"abbr_canon_surt": canon_surt[:150],
|
"abbr_canon_surt": canon_surt[:150],
|
||||||
"canon_surt": canon_surt,
|
"canon_surt": canon_surt,
|
||||||
"timestamp": recorded_url.timestamp.replace(
|
"timestamp": recorded_url.timestamp.replace(
|
||||||
tzinfo=rethinkstuff.UTC),
|
tzinfo=doublethink.UTC),
|
||||||
"url": recorded_url.url.decode("utf-8"),
|
"url": recorded_url.url.decode("utf-8"),
|
||||||
"offset": records[0].offset,
|
"offset": records[0].offset,
|
||||||
"filename": os.path.basename(records[0].warc_filename),
|
"filename": os.path.basename(records[0].warc_filename),
|
||||||
|
@ -1,23 +1,23 @@
|
|||||||
#
|
'''
|
||||||
# warcprox/dedup.py - identical payload digest deduplication
|
warcprox/dedup.py - identical payload digest deduplication
|
||||||
#
|
|
||||||
# Copyright (C) 2013-2016 Internet Archive
|
Copyright (C) 2013-2017 Internet Archive
|
||||||
#
|
|
||||||
# This program is free software; you can redistribute it and/or
|
This program is free software; you can redistribute it and/or
|
||||||
# modify it under the terms of the GNU General Public License
|
modify it under the terms of the GNU General Public License
|
||||||
# as published by the Free Software Foundation; either version 2
|
as published by the Free Software Foundation; either version 2
|
||||||
# of the License, or (at your option) any later version.
|
of the License, or (at your option) any later version.
|
||||||
#
|
|
||||||
# This program is distributed in the hope that it will be useful,
|
This program is distributed in the hope that it will be useful,
|
||||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
# GNU General Public License for more details.
|
GNU General Public License for more details.
|
||||||
#
|
|
||||||
# You should have received a copy of the GNU General Public License
|
You should have received a copy of the GNU General Public License
|
||||||
# along with this program; if not, write to the Free Software
|
along with this program; if not, write to the Free Software
|
||||||
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
|
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
|
||||||
# USA.
|
USA.
|
||||||
#
|
'''
|
||||||
|
|
||||||
from __future__ import absolute_import
|
from __future__ import absolute_import
|
||||||
|
|
||||||
@ -112,24 +112,29 @@ def decorate_with_dedup_info(dedup_db, recorded_url, base32=False):
|
|||||||
class RethinkDedupDb:
|
class RethinkDedupDb:
|
||||||
logger = logging.getLogger("warcprox.dedup.RethinkDedupDb")
|
logger = logging.getLogger("warcprox.dedup.RethinkDedupDb")
|
||||||
|
|
||||||
def __init__(self, r, table="dedup", shards=None, replicas=None, options=warcprox.Options()):
|
def __init__(self, rr, table="dedup", shards=None, replicas=None, options=warcprox.Options()):
|
||||||
self.r = r
|
self.rr = rr
|
||||||
self.table = table
|
self.table = table
|
||||||
self.shards = shards or len(r.servers)
|
self.shards = shards or len(rr.servers)
|
||||||
self.replicas = replicas or min(3, len(r.servers))
|
self.replicas = replicas or min(3, len(rr.servers))
|
||||||
self._ensure_db_table()
|
self._ensure_db_table()
|
||||||
self.options = options
|
self.options = options
|
||||||
|
|
||||||
def _ensure_db_table(self):
|
def _ensure_db_table(self):
|
||||||
dbs = self.r.db_list().run()
|
dbs = self.rr.db_list().run()
|
||||||
if not self.r.dbname in dbs:
|
if not self.rr.dbname in dbs:
|
||||||
self.logger.info("creating rethinkdb database %s", repr(self.r.dbname))
|
self.logger.info(
|
||||||
self.r.db_create(self.r.dbname).run()
|
"creating rethinkdb database %s", repr(self.rr.dbname))
|
||||||
tables = self.r.table_list().run()
|
self.rr.db_create(self.rr.dbname).run()
|
||||||
|
tables = self.rr.table_list().run()
|
||||||
if not self.table in tables:
|
if not self.table in tables:
|
||||||
self.logger.info("creating rethinkdb table %s in database %s shards=%s replicas=%s",
|
self.logger.info(
|
||||||
repr(self.table), repr(self.r.dbname), self.shards, self.replicas)
|
"creating rethinkdb table %s in database %s shards=%s "
|
||||||
self.r.table_create(self.table, primary_key="key", shards=self.shards, replicas=self.replicas).run()
|
"replicas=%s", repr(self.table), repr(self.rr.dbname),
|
||||||
|
self.shards, self.replicas)
|
||||||
|
self.rr.table_create(
|
||||||
|
self.table, primary_key="key", shards=self.shards,
|
||||||
|
replicas=self.replicas).run()
|
||||||
|
|
||||||
|
|
||||||
def start(self):
|
def start(self):
|
||||||
@ -151,7 +156,8 @@ class RethinkDedupDb:
|
|||||||
url = response_record.get_header(warctools.WarcRecord.URL).decode('latin1')
|
url = response_record.get_header(warctools.WarcRecord.URL).decode('latin1')
|
||||||
date = response_record.get_header(warctools.WarcRecord.DATE).decode('latin1')
|
date = response_record.get_header(warctools.WarcRecord.DATE).decode('latin1')
|
||||||
record = {'key':k,'url':url,'date':date,'id':record_id}
|
record = {'key':k,'url':url,'date':date,'id':record_id}
|
||||||
result = self.r.table(self.table).insert(record,conflict="replace").run()
|
result = self.rr.table(self.table).insert(
|
||||||
|
record, conflict="replace").run()
|
||||||
if sorted(result.values()) != [0,0,0,0,0,1] and [result["deleted"],result["skipped"],result["errors"]] != [0,0,0]:
|
if sorted(result.values()) != [0,0,0,0,0,1] and [result["deleted"],result["skipped"],result["errors"]] != [0,0,0]:
|
||||||
raise Exception("unexpected result %s saving %s", result, record)
|
raise Exception("unexpected result %s saving %s", result, record)
|
||||||
self.logger.debug('dedup db saved %s:%s', k, record)
|
self.logger.debug('dedup db saved %s:%s', k, record)
|
||||||
@ -159,7 +165,7 @@ class RethinkDedupDb:
|
|||||||
def lookup(self, digest_key, bucket=""):
|
def lookup(self, digest_key, bucket=""):
|
||||||
k = digest_key.decode("utf-8") if isinstance(digest_key, bytes) else digest_key
|
k = digest_key.decode("utf-8") if isinstance(digest_key, bytes) else digest_key
|
||||||
k = "{}|{}".format(k, bucket)
|
k = "{}|{}".format(k, bucket)
|
||||||
result = self.r.table(self.table).get(k).run()
|
result = self.rr.table(self.table).get(k).run()
|
||||||
if result:
|
if result:
|
||||||
for x in result:
|
for x in result:
|
||||||
result[x] = result[x].encode("utf-8")
|
result[x] = result[x].encode("utf-8")
|
||||||
@ -175,3 +181,4 @@ class RethinkDedupDb:
|
|||||||
self.save(digest_key, records[0], bucket=recorded_url.warcprox_meta["captures-bucket"])
|
self.save(digest_key, records[0], bucket=recorded_url.warcprox_meta["captures-bucket"])
|
||||||
else:
|
else:
|
||||||
self.save(digest_key, records[0])
|
self.save(digest_key, records[0])
|
||||||
|
|
||||||
|
@ -40,7 +40,7 @@ import threading
|
|||||||
import certauth.certauth
|
import certauth.certauth
|
||||||
import warcprox
|
import warcprox
|
||||||
import re
|
import re
|
||||||
import rethinkstuff
|
import doublethink
|
||||||
import cryptography.hazmat.backends.openssl
|
import cryptography.hazmat.backends.openssl
|
||||||
|
|
||||||
def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
|
def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
|
||||||
@ -149,13 +149,15 @@ def init_controller(args):
|
|||||||
|
|
||||||
listeners = []
|
listeners = []
|
||||||
if args.rethinkdb_servers:
|
if args.rethinkdb_servers:
|
||||||
r = rethinkstuff.Rethinker(args.rethinkdb_servers.split(","), args.rethinkdb_db)
|
rr = doublethink.Rethinker(
|
||||||
|
args.rethinkdb_servers.split(","), args.rethinkdb_db)
|
||||||
if args.rethinkdb_big_table:
|
if args.rethinkdb_big_table:
|
||||||
captures_db = warcprox.bigtable.RethinkCaptures(r, options=options)
|
captures_db = warcprox.bigtable.RethinkCaptures(rr, options=options)
|
||||||
dedup_db = warcprox.bigtable.RethinkCapturesDedup(captures_db, options=options)
|
dedup_db = warcprox.bigtable.RethinkCapturesDedup(
|
||||||
|
captures_db, options=options)
|
||||||
listeners.append(captures_db)
|
listeners.append(captures_db)
|
||||||
else:
|
else:
|
||||||
dedup_db = warcprox.dedup.RethinkDedupDb(r, options=options)
|
dedup_db = warcprox.dedup.RethinkDedupDb(rr, options=options)
|
||||||
listeners.append(dedup_db)
|
listeners.append(dedup_db)
|
||||||
elif args.dedup_db_file in (None, '', '/dev/null'):
|
elif args.dedup_db_file in (None, '', '/dev/null'):
|
||||||
logging.info('deduplication disabled')
|
logging.info('deduplication disabled')
|
||||||
@ -165,7 +167,7 @@ def init_controller(args):
|
|||||||
listeners.append(dedup_db)
|
listeners.append(dedup_db)
|
||||||
|
|
||||||
if args.rethinkdb_servers:
|
if args.rethinkdb_servers:
|
||||||
stats_db = warcprox.stats.RethinkStatsDb(r, options=options)
|
stats_db = warcprox.stats.RethinkStatsDb(rr, options=options)
|
||||||
listeners.append(stats_db)
|
listeners.append(stats_db)
|
||||||
elif args.stats_db_file in (None, '', '/dev/null'):
|
elif args.stats_db_file in (None, '', '/dev/null'):
|
||||||
logging.info('statistics tracking disabled')
|
logging.info('statistics tracking disabled')
|
||||||
@ -205,7 +207,7 @@ def init_controller(args):
|
|||||||
dedup_db=dedup_db, listeners=listeners, options=options)
|
dedup_db=dedup_db, listeners=listeners, options=options)
|
||||||
|
|
||||||
if args.rethinkdb_servers:
|
if args.rethinkdb_servers:
|
||||||
svcreg = rethinkstuff.ServiceRegistry(r)
|
svcreg = doublethink.ServiceRegistry(rr)
|
||||||
else:
|
else:
|
||||||
svcreg = None
|
svcreg = None
|
||||||
|
|
||||||
@ -286,17 +288,17 @@ def ensure_rethinkdb_tables():
|
|||||||
'%(asctime)s %(levelname)s %(name)s.%(funcName)s'
|
'%(asctime)s %(levelname)s %(name)s.%(funcName)s'
|
||||||
'(%(filename)s:%(lineno)d) %(message)s'))
|
'(%(filename)s:%(lineno)d) %(message)s'))
|
||||||
|
|
||||||
r = rethinkstuff.Rethinker(
|
rr = doublethink.Rethinker(
|
||||||
args.rethinkdb_servers.split(','), args.rethinkdb_db)
|
args.rethinkdb_servers.split(','), args.rethinkdb_db)
|
||||||
|
|
||||||
# services table
|
# services table
|
||||||
rethinkstuff.ServiceRegistry(r)
|
doublethink.ServiceRegistry(rr)
|
||||||
|
|
||||||
# stats table
|
# stats table
|
||||||
warcprox.stats.RethinkStatsDb(r)
|
warcprox.stats.RethinkStatsDb(rr)
|
||||||
|
|
||||||
# captures table
|
# captures table
|
||||||
warcprox.bigtable.RethinkCaptures(r)
|
warcprox.bigtable.RethinkCaptures(rr)
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
main()
|
main()
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
'''
|
'''
|
||||||
warcprox/stats.py - keeps statistics on what has been archived
|
warcprox/stats.py - keeps statistics on what has been archived
|
||||||
|
|
||||||
Copyright (C) 2013-2016 Internet Archive
|
Copyright (C) 2013-2017 Internet Archive
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or
|
This program is free software; you can redistribute it and/or
|
||||||
modify it under the terms of the GNU General Public License
|
modify it under the terms of the GNU General Public License
|
||||||
@ -176,10 +176,10 @@ class RethinkStatsDb(StatsDb):
|
|||||||
logger = logging.getLogger("warcprox.stats.RethinkStatsDb")
|
logger = logging.getLogger("warcprox.stats.RethinkStatsDb")
|
||||||
|
|
||||||
def __init__(self, rethinker, table="stats", shards=None, replicas=None, options=warcprox.Options()):
|
def __init__(self, rethinker, table="stats", shards=None, replicas=None, options=warcprox.Options()):
|
||||||
self.r = rethinker
|
self.rr = rethinker
|
||||||
self.table = table
|
self.table = table
|
||||||
self.shards = shards or 1 # 1 shard by default because it's probably a small table
|
self.shards = shards or 1 # 1 shard by default because it's probably a small table
|
||||||
self.replicas = replicas or min(3, len(self.r.servers))
|
self.replicas = replicas or min(3, len(self.rr.servers))
|
||||||
self._ensure_db_table()
|
self._ensure_db_table()
|
||||||
self.options = options
|
self.options = options
|
||||||
|
|
||||||
@ -194,7 +194,7 @@ class RethinkStatsDb(StatsDb):
|
|||||||
self._update_batch() # starts repeating timer
|
self._update_batch() # starts repeating timer
|
||||||
|
|
||||||
def _bucket_batch_update_reql(self, bucket):
|
def _bucket_batch_update_reql(self, bucket):
|
||||||
return self.r.table(self.table).get(bucket).replace(
|
return self.rr.table(self.table).get(bucket).replace(
|
||||||
lambda old: r.branch(
|
lambda old: r.branch(
|
||||||
old.eq(None), self._batch[bucket], old.merge({
|
old.eq(None), self._batch[bucket], old.merge({
|
||||||
"total": {
|
"total": {
|
||||||
@ -239,18 +239,18 @@ class RethinkStatsDb(StatsDb):
|
|||||||
self.logger.info("finished")
|
self.logger.info("finished")
|
||||||
|
|
||||||
def _ensure_db_table(self):
|
def _ensure_db_table(self):
|
||||||
dbs = self.r.db_list().run()
|
dbs = self.rr.db_list().run()
|
||||||
if not self.r.dbname in dbs:
|
if not self.rr.dbname in dbs:
|
||||||
self.logger.info(
|
self.logger.info(
|
||||||
"creating rethinkdb database %s", repr(self.r.dbname))
|
"creating rethinkdb database %s", repr(self.rr.dbname))
|
||||||
self.r.db_create(self.r.dbname).run()
|
self.rr.db_create(self.rr.dbname).run()
|
||||||
tables = self.r.table_list().run()
|
tables = self.rr.table_list().run()
|
||||||
if not self.table in tables:
|
if not self.table in tables:
|
||||||
self.logger.info(
|
self.logger.info(
|
||||||
"creating rethinkdb table %s in database %s shards=%s "
|
"creating rethinkdb table %s in database %s shards=%s "
|
||||||
"replicas=%s", repr(self.table), repr(self.r.dbname),
|
"replicas=%s", repr(self.table), repr(self.rr.dbname),
|
||||||
self.shards, self.replicas)
|
self.shards, self.replicas)
|
||||||
self.r.table_create(
|
self.rr.table_create(
|
||||||
self.table, primary_key="bucket", shards=self.shards,
|
self.table, primary_key="bucket", shards=self.shards,
|
||||||
replicas=self.replicas).run()
|
replicas=self.replicas).run()
|
||||||
|
|
||||||
@ -267,7 +267,7 @@ class RethinkStatsDb(StatsDb):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
def value(self, bucket0="__all__", bucket1=None, bucket2=None):
|
def value(self, bucket0="__all__", bucket1=None, bucket2=None):
|
||||||
bucket0_stats = self.r.table(self.table).get(bucket0).run()
|
bucket0_stats = self.rr.table(self.table).get(bucket0).run()
|
||||||
self.logger.debug(
|
self.logger.debug(
|
||||||
'stats db lookup of bucket=%s returned %s',
|
'stats db lookup of bucket=%s returned %s',
|
||||||
bucket0, bucket0_stats)
|
bucket0, bucket0_stats)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user