From f77aef91108c4398d56fb13aee885c236901e635 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Fri, 20 Oct 2017 21:59:43 +0000 Subject: [PATCH] Filter out warc/revisit records in CdxServerDedup --- warcprox/dedup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/warcprox/dedup.py b/warcprox/dedup.py index 41b9249..53b27c9 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -215,7 +215,8 @@ class CdxServerDedup(object): u = url.decode("utf-8") if isinstance(url, bytes) else url try: result = self.http_pool.request('GET', self.cdx_url, fields=dict( - url=u, fl="timestamp,digest", limit=-10)) + url=u, fl="timestamp,digest", filter="!mimetype:warc/revisit", + limit=-10)) assert result.status == 200 if isinstance(digest_key, bytes): dkey = digest_key