From 71221dbe54cb59d2dce85ed0b30efc57b0d14010 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 2 Nov 2018 18:05:18 +0000 Subject: [PATCH] minimize impact of down server The last approach was not good: the timeout of 0.1 seconds was too short. A bunch of stuff has to happen in the timeout period inside of rethinkdb.connect(), which doesn't offer a way to set only the socket timeout. Even a timeout of 0.5 seconds results in a noticeable error rate. The new approach is to put a server in the penalty box for 5 minutes when it errors. While the server is in the penalty box, we don't try to connect to it, unless all the servers are in the penalty box, in which case we try the server that errored least recently. --- doublethink/rethinker.py | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/doublethink/rethinker.py b/doublethink/rethinker.py index fcafa20..2e5b58e 100644 --- a/doublethink/rethinker.py +++ b/doublethink/rethinker.py @@ -113,21 +113,22 @@ class Rethinker(object): else: self.servers = servers self.dbname = db + self.last_error = {} # {server: time} # https://github.com/rethinkdb/rethinkdb-example-webpy-blog/blob/master/model.py # "Best practices: Managing connections: a connection per request" def _random_server_connection(self): retry_wait = 0.01 while True: - server = random.choice(self.servers) + server = random.choice(self._server_whitelist()) try: try: host, port = server.split(':') - return r.connect( - host=host, port=port, timeout=max(0.1, retry_wait)) + return r.connect(host=host, port=port) except ValueError: - return r.connect(host=server, timeout=max(0.1, retry_wait)) + return r.connect(host=server) except Exception as e: + self.last_error[server] = time.time() self.logger.warn( 'will keep trying after failure connecting to ' 'rethinkdb server at %s: %s (sleeping for %s sec)', @@ -135,6 +136,24 @@ class Rethinker(object): time.sleep(retry_wait) retry_wait = min(retry_wait * 2, 10.0) + # 
https://en.wikipedia.org/wiki/Penalty_(ice_hockey)#Major_penalty + PENALTY_BOX_TIME = 300 + def _server_whitelist(self): + ''' + Returns list of servers that have not errored in the last five minutes. + If all servers have errored in the last five minutes, returns list with + one item, the server that errored least recently. + ''' + whitelist = [] + for server in self.servers: + if (server not in self.last_error + or self.last_error[server] < time.time() - self.PENALTY_BOX_TIME): + whitelist.append(server) + if not whitelist: + whitelist.append(sorted( + self.last_error.items(), key=lambda kv: kv[1])[0][0]) + return whitelist + def wrap(self, delegate): if isinstance(delegate, (types.FunctionType, types.MethodType)): def wrapper(*args, **kwargs):