mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
for big captures table, do insert with conflict="replace"
We're doing this because one time this happened: rethinkdb.errors.ReqlOpIndeterminateError: Cannot perform write: The primary replica isn't connected to a quorum of replicas.... and then on the next attempt this happened: {'errors': 1, 'inserted': 1, 'first_error': 'Duplicate primary key `id`: .... When we got the ReqlOpIndeterminateError, the write had actually partially succeeded: one of the records was inserted. After that the batch insert failed every time, because it kept trying to insert that same entry. With this change, a duplicate key no longer causes an error.
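For illustration (not part of the commit), a minimal sketch of the failure and of what conflict="replace" buys us, using the rethinkdb python driver; the connection settings and sample document are made up:

    import rethinkdb as r

    conn = r.connect("localhost", 28015)
    doc = {"id": "some-primary-key", "url": "http://example.com/"}

    # a plain insert() can raise ReqlOpIndeterminateError even though the
    # write actually landed; retrying it then fails with
    # "Duplicate primary key `id`"
    # with conflict="replace", the retry overwrites the existing row instead
    result = r.table("captures").insert(doc, conflict="replace").run(conn)
    # first attempt:   result["inserted"] == 1
    # retried attempt: result["replaced"] == 1, no duplicate-key error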
This commit is contained in:
parent
1671080755
commit
41bd6c72af
setup.py
@@ -51,7 +51,7 @@ except:
 setuptools.setup(
     name='warcprox',
-    version='2.0b2.dev34',
+    version='2.0b2.dev35',
     description='WARC writing MITM HTTP/S proxy',
     url='https://github.com/internetarchive/warcprox',
     author='Noah Levitt',
warcprox/bigtable.py
@@ -40,7 +40,9 @@ class RethinkCaptures:
     """Inserts in batches every 0.5 seconds"""
     logger = logging.getLogger("warcprox.bigtable.RethinkCaptures")
 
-    def __init__(self, r, table="captures", shards=None, replicas=None, options=warcprox.Options()):
+    def __init__(
+            self, r, table="captures", shards=None, replicas=None,
+            options=warcprox.Options()):
         self.r = r
         self.table = table
         self.shards = shards or len(r.servers)
@@ -62,15 +64,21 @@ class RethinkCaptures:
         try:
             with self._batch_lock:
                 if len(self._batch) > 0:
-                    result = self.r.table(self.table).insert(self._batch).run()
-                    if result["inserted"] != len(self._batch) or sorted(
-                            result.values()) != [0,0,0,0,0,len(self._batch)]:
+                    result = self.r.table(self.table).insert(
+                            self._batch, conflict="replace").run()
+                    if result["inserted"] + result["replaced"] != len(self._batch):
                         raise Exception(
                                 "unexpected result %s saving batch of %s "
                                 "entries", result, len(self._batch))
-                    self.logger.debug(
-                            "saved %s entries to big capture table db",
-                            len(self._batch))
+                    if result["replaced"] > 0:
+                        self.logger.warn(
+                                "inserted %s entries and replaced %s entries "
+                                "in big captures table", result["inserted"],
+                                result["replaced"])
+                    else:
+                        self.logger.debug(
+                                "inserted %s entries to big captures table",
+                                len(self._batch))
                     self._batch = []
         except BaseException as e:
             self.logger.error(
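Side note for readers (not part of the diff): insert() returns a summary with six numeric counters, which is what the old sorted(result.values()) check was comparing against; counts here are illustrative:

    # summary dict returned by a RethinkDB insert (illustrative counts)
    result = {
        "deleted": 0, "errors": 0, "inserted": 98,
        "replaced": 2, "skipped": 0, "unchanged": 0,
    }
    # old check: sorted(result.values()) == [0, 0, 0, 0, 0, len(batch)]
    # -- fails as soon as anything is replaced rather than inserted
    # new check accepts both outcomes:
    assert result["inserted"] + result["replaced"] == 100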
@@ -82,7 +90,8 @@ class RethinkCaptures:
             t = threading.Timer(0.5, self._insert_batch)
             t.name = "RethinkCaptures-batch-insert-timer-%s" % datetime.datetime.utcnow().isoformat()
             t.start()
-            # ensure self._timer joinable (already started) whenever close() happens to be called
+            # ensure self._timer joinable (already started) whenever
+            # close() happens to be called
             self._timer = t
         else:
             self.logger.info("finished")
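The comment being rewrapped above concerns shutdown: because _timer is only assigned after t.start(), close() can always join() it. A rough sketch of that idea (this close() body is an assumption, not warcprox's actual code):

    # threading.Timer is a Thread subclass, so a started timer is join()able
    def close(self):
        self._stopped = True  # assumed flag that stops _insert_batch rescheduling
        self._timer.join()    # safe: _timer always points at a started Timer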
@@ -101,7 +110,9 @@ class RethinkCaptures:
 
     def find_response_by_digest(self, algo, raw_digest, bucket="__unspecified__"):
         if algo != "sha1":
-            raise Exception("digest type is {} but big capture table is indexed by sha1".format(algo))
+            raise Exception(
+                    "digest type is %s but big captures table is indexed by "
+                    "sha1" % algo)
         sha1base32 = base64.b32encode(raw_digest).decode("utf-8")
         results_iter = self.r.table(self.table).get_all([sha1base32, "response", bucket], index="sha1_warc_type").run()
         results = list(results_iter)
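For reference, a sketch of how the index key used above is built (standard-library calls only; the payload is made up):

    import base64, hashlib

    raw_digest = hashlib.sha1(b"example payload").digest()    # 20 bytes
    sha1base32 = base64.b32encode(raw_digest).decode("utf-8") # 32 chars, no padding
    # the get_all() above looks this up in the compound secondary index:
    key = [sha1base32, "response", "__unspecified__"]         # index="sha1_warc_type"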
@@ -123,7 +134,7 @@ class RethinkCaptures:
                     ).decode("utf-8")
             else:
                 self.logger.warn(
-                        "digest type is %s but big capture table is indexed "
+                        "digest type is %s but big captures table is indexed "
                         "by sha1",
                         recorded_url.response_recorder.payload_digest.name)
         else: