mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
Rethinker class moved to its own pyrethink project
This commit is contained in:
parent
2e482d67cc
commit
f90c3a6403
@ -1,5 +1,13 @@
|
|||||||
certauth>=1.1.0
|
certauth>=1.1.0
|
||||||
rethinkdb
|
rethinkdb
|
||||||
git+https://github.com/internetarchive/warctools.git
|
git+https://github.com/internetarchive/warctools.git
|
||||||
git+https://github.com/nlevitt/surt.git@py3
|
|
||||||
kafka-python
|
kafka-python
|
||||||
|
|
||||||
|
.
|
||||||
|
# -e .
|
||||||
|
|
||||||
|
git+https://github.com/nlevitt/surt.git@py3
|
||||||
|
# -e /home/nlevitt/workspace/surt
|
||||||
|
|
||||||
|
git+https://github.com/nlevitt/pyrethink.git
|
||||||
|
# -e /home/nlevitt/workspace/pyrethink
|
||||||
|
@ -17,44 +17,10 @@ class Options(_Namespace):
|
|||||||
except AttributeError:
|
except AttributeError:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
class Rethinker:
    """Run RethinkDB queries against a randomly chosen server, retrying on failure.

    A fresh connection is opened for every call to ``run`` and closed when the
    call finishes. Connection failures and the ReQL availability/timeout errors
    are logged and retried indefinitely.

    Attributes:
        servers: list of server addresses, each either ``"host"`` or
            ``"host:port"``.
        db: database name passed to ``query.run``; may be None.
    """

    # Imported in the class body so the block does not depend on a
    # module-level ``import logging``.
    import logging
    logger = logging.getLogger("warcprox.Rethinker")

    # Seconds to wait before retrying after a failed connection or query.
    _RETRY_DELAY = 0.5

    def __init__(self, servers=None, db=None):
        """Remember the server list and default database.

        Args:
            servers: list of ``"host"`` / ``"host:port"`` strings; defaults to
                ``["localhost"]``. A new list is created per instance so the
                default is never shared between instances.
            db: database name handed to every query, or None.
        """
        self.servers = servers if servers is not None else ["localhost"]
        self.db = db

    @staticmethod
    def _parse_server(server):
        """Split a server address into ``(host, port)``.

        Returns ``(host, int_port)`` for a ``"host:port"`` string with exactly
        one colon and a numeric port; otherwise returns ``(server, None)`` so
        the whole string is used as the host name.
        """
        host, sep, port = server.partition(":")
        if sep and ":" not in port and port.isdigit():
            return host, int(port)
        return server, None

    # https://github.com/rethinkdb/rethinkdb-example-webpy-blog/blob/master/model.py
    # "Best practices: Managing connections: a connection per request"
    def _random_server_connection(self):
        """Return an open connection to a randomly chosen server.

        Keeps trying (picking a new random server each time and sleeping
        ``_RETRY_DELAY`` seconds in between) until a connection succeeds.
        """
        import random
        import time
        import rethinkdb as r

        while True:
            server = random.choice(self.servers)
            # Parse outside the try block so that a ValueError raised by the
            # driver itself is not mistaken for "address has no port".
            host, port = self._parse_server(server)
            try:
                if port is None:
                    return r.connect(host=host)
                return r.connect(host=host, port=port)
            except Exception:
                self.logger.error("will keep trying to get a connection after failure connecting to %s", server, exc_info=True)
                time.sleep(self._RETRY_DELAY)

    def run(self, query):
        """Run ``query`` on a fresh connection and return its result.

        Retries on ``ReqlAvailabilityError`` and ``ReqlTimeoutError``, each
        time on a new connection; any other exception propagates to the
        caller.
        """
        import time
        import rethinkdb as r

        while True:
            with self._random_server_connection() as conn:
                try:
                    return query.run(conn, db=self.db)
                except (r.ReqlAvailabilityError, r.ReqlTimeoutError):
                    self.logger.error("will retry rethinkdb query/operation %s which failed like so:", query, exc_info=True)
            # Pause after the failed connection has been closed, so a
            # struggling cluster is not hammered in a tight loop.
            time.sleep(self._RETRY_DELAY)
|
|
||||||
|
|
||||||
version_bytes = _read_version_bytes().strip()
|
version_bytes = _read_version_bytes().strip()
|
||||||
version_str = version_bytes.decode('utf-8')
|
version_str = version_bytes.decode('utf-8')
|
||||||
|
|
||||||
|
# XXX linux-specific
|
||||||
def gettid():
|
def gettid():
|
||||||
try:
|
try:
|
||||||
import ctypes
|
import ctypes
|
||||||
@ -63,7 +29,7 @@ def gettid():
|
|||||||
tid = libc.syscall(SYS_gettid)
|
tid = libc.syscall(SYS_gettid)
|
||||||
return tid
|
return tid
|
||||||
except:
|
except:
|
||||||
logging.warn("gettid failed?")
|
logging.warn("gettid failed?", exc_info=True)
|
||||||
|
|
||||||
import warcprox.controller as controller
|
import warcprox.controller as controller
|
||||||
import warcprox.playback as playback
|
import warcprox.playback as playback
|
||||||
|
@ -12,12 +12,13 @@ import base64
|
|||||||
import surt
|
import surt
|
||||||
import os
|
import os
|
||||||
import hashlib
|
import hashlib
|
||||||
|
import pyrethink
|
||||||
|
|
||||||
class RethinkCaptures:
|
class RethinkCaptures:
|
||||||
logger = logging.getLogger("warcprox.bigtables.RethinkCaptures")
|
logger = logging.getLogger("warcprox.bigtables.RethinkCaptures")
|
||||||
|
|
||||||
def __init__(self, servers=["localhost"], db="warcprox", table="captures", shards=3, replicas=3, options=warcprox.Options()):
|
def __init__(self, servers=["localhost"], db="warcprox", table="captures", shards=3, replicas=3, options=warcprox.Options()):
|
||||||
self.r = warcprox.Rethinker(servers, db)
|
self.r = pyrethink.Rethinker(servers, db)
|
||||||
self.table = table
|
self.table = table
|
||||||
self.shards = shards
|
self.shards = shards
|
||||||
self.replicas = replicas
|
self.replicas = replicas
|
||||||
@ -40,8 +41,8 @@ class RethinkCaptures:
|
|||||||
if algo != "sha1":
|
if algo != "sha1":
|
||||||
raise Exception("digest type is {} but big capture table is indexed by sha1".format(algo))
|
raise Exception("digest type is {} but big capture table is indexed by sha1".format(algo))
|
||||||
sha1base32 = base64.b32encode(raw_digest).decode("utf-8")
|
sha1base32 = base64.b32encode(raw_digest).decode("utf-8")
|
||||||
cursor = self.r.run(r.table(self.table).get_all([sha1base32, "response", bucket], index="sha1_warc_type"))
|
results_iter = self.r.results_iter(r.table(self.table).get_all([sha1base32, "response", bucket], index="sha1_warc_type"))
|
||||||
results = list(cursor)
|
results = list(results_iter)
|
||||||
if len(results) > 1:
|
if len(results) > 1:
|
||||||
raise Exception("expected 0 or 1 but found %s results for sha1base32=%s", len(results), sha1base32)
|
raise Exception("expected 0 or 1 but found %s results for sha1base32=%s", len(results), sha1base32)
|
||||||
elif len(results) == 1:
|
elif len(results) == 1:
|
||||||
|
@ -18,6 +18,7 @@ import warcprox
|
|||||||
import rethinkdb
|
import rethinkdb
|
||||||
r = rethinkdb
|
r = rethinkdb
|
||||||
import random
|
import random
|
||||||
|
import pyrethink
|
||||||
|
|
||||||
class DedupDb(object):
|
class DedupDb(object):
|
||||||
logger = logging.getLogger("warcprox.dedup.DedupDb")
|
logger = logging.getLogger("warcprox.dedup.DedupDb")
|
||||||
@ -88,7 +89,7 @@ class RethinkDedupDb:
|
|||||||
logger = logging.getLogger("warcprox.dedup.RethinkDedupDb")
|
logger = logging.getLogger("warcprox.dedup.RethinkDedupDb")
|
||||||
|
|
||||||
def __init__(self, servers=["localhost"], db="warcprox", table="dedup", shards=3, replicas=3, options=warcprox.Options()):
|
def __init__(self, servers=["localhost"], db="warcprox", table="dedup", shards=3, replicas=3, options=warcprox.Options()):
|
||||||
self.r = warcprox.Rethinker(servers, db)
|
self.r = pyrethink.Rethinker(servers, db)
|
||||||
self.table = table
|
self.table = table
|
||||||
self.shards = shards
|
self.shards = shards
|
||||||
self.replicas = replicas
|
self.replicas = replicas
|
||||||
|
@ -16,6 +16,7 @@ import rethinkdb
|
|||||||
r = rethinkdb
|
r = rethinkdb
|
||||||
import random
|
import random
|
||||||
import warcprox
|
import warcprox
|
||||||
|
import pyrethink
|
||||||
|
|
||||||
def _empty_bucket(bucket):
|
def _empty_bucket(bucket):
|
||||||
return {
|
return {
|
||||||
@ -106,7 +107,7 @@ class RethinkStatsDb:
|
|||||||
logger = logging.getLogger("warcprox.stats.RethinkStatsDb")
|
logger = logging.getLogger("warcprox.stats.RethinkStatsDb")
|
||||||
|
|
||||||
def __init__(self, servers=["localhost"], db="warcprox", table="stats", shards=3, replicas=3, options=warcprox.Options()):
|
def __init__(self, servers=["localhost"], db="warcprox", table="stats", shards=3, replicas=3, options=warcprox.Options()):
|
||||||
self.r = warcprox.Rethinker(servers, db)
|
self.r = pyrethink.Rethinker(servers, db)
|
||||||
self.table = table
|
self.table = table
|
||||||
self.shards = shards
|
self.shards = shards
|
||||||
self.replicas = replicas
|
self.replicas = replicas
|
||||||
|
Loading…
x
Reference in New Issue
Block a user