From 255d359ad4090bd9d47f8fac02a24f4c8a465c00 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Tue, 24 Apr 2018 17:06:56 +0000 Subject: [PATCH] Use DedupableMixin in RethinkCapturesDedup I note that we didn't do any payload_size check at all here. --- warcprox/bigtable.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/warcprox/bigtable.py b/warcprox/bigtable.py index e6674a6..cb4671e 100644 --- a/warcprox/bigtable.py +++ b/warcprox/bigtable.py @@ -34,6 +34,7 @@ import threading import datetime import doublethink import rethinkdb as r +from warcprox.dedup import DedupableMixin class RethinkCaptures: """Inserts in batches every 0.5 seconds""" @@ -215,10 +216,11 @@ class RethinkCaptures: if self._timer: self._timer.join() -class RethinkCapturesDedup(warcprox.dedup.DedupDb): +class RethinkCapturesDedup(warcprox.dedup.DedupDb, DedupableMixin): logger = logging.getLogger("warcprox.dedup.RethinkCapturesDedup") def __init__(self, options=warcprox.Options()): + DedupableMixin.__init__(self, options) self.captures_db = RethinkCaptures(options=options) self.options = options @@ -251,5 +253,6 @@ class RethinkCapturesDedup(warcprox.dedup.DedupDb): self.captures_db.close() def notify(self, recorded_url, records): - self.captures_db.notify(recorded_url, records) - + if (records and records[0].type == b'response' + and self.should_dedup(recorded_url)): + self.captures_db.notify(recorded_url, records)