From 1c502353057a84cd77b9050ac84a3669a240ffcb Mon Sep 17 00:00:00 2001
From: Vangelis Banos <vangelis@archive.org>
Date: Fri, 19 Jan 2018 15:16:26 +0000
Subject: [PATCH] Add --cdxserver-dedup-cookies option

It is necessary to pass cookies to the CDX Server we use for deduplication.
To do this, we add the optional CLI argument
``--cdxserver-dedup-cookies="cookie1=val1;cookie2=val2"``; if it is
provided, it is sent as the `Cookie` HTTP header in CDX server requests.
---
 warcprox/controller.py | 2 +-
 warcprox/dedup.py      | 6 +++++-
 warcprox/main.py       | 3 +++
 3 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/warcprox/controller.py b/warcprox/controller.py
index dfd930b..fe9960a 100644
--- a/warcprox/controller.py
+++ b/warcprox/controller.py
@@ -47,7 +47,7 @@ class Factory:
             dedup_db = warcprox.dedup.TroughDedupDb(options)
         elif options.cdxserver_dedup:
             dedup_db = warcprox.dedup.CdxServerDedup(
-                cdx_url=options.cdxserver_dedup)
+                cdx_url=options.cdxserver_dedup, options=options)
         elif options.dedup_db_file in (None, '', '/dev/null'):
             logging.info('deduplication disabled')
             dedup_db = None
diff --git a/warcprox/dedup.py b/warcprox/dedup.py
index 950c110..cd3b397 100644
--- a/warcprox/dedup.py
+++ b/warcprox/dedup.py
@@ -201,12 +201,15 @@ class CdxServerDedup(DedupDb):
     """Query a CDX server to perform deduplication.
     """
     logger = logging.getLogger("warcprox.dedup.CdxServerDedup")
+    cookies = None
 
     def __init__(self, cdx_url="https://web.archive.org/cdx/search",
                  maxsize=200, options=warcprox.Options()):
         self.cdx_url = cdx_url
         self.options = options
         self.http_pool = urllib3.PoolManager(maxsize=maxsize)
+        if options.cdxserver_dedup_cookies:
+            self.cookies = options.cdxserver_dedup_cookies
 
     def start(self):
         pass
@@ -233,9 +236,10 @@ class CdxServerDedup(DedupDb):
         """
         u = url.decode("utf-8") if isinstance(url, bytes) else url
         try:
+            headers = {'Cookie': self.cookies} if self.cookies else {}
             result = self.http_pool.request('GET', self.cdx_url, fields=dict(
                 url=u, fl="timestamp,digest", filter="!mimetype:warc/revisit",
-                limit=-1))
+                limit=-1), headers=headers)
             assert result.status == 200
             if isinstance(digest_key, bytes):
                 dkey = digest_key
diff --git a/warcprox/main.py b/warcprox/main.py
index 59e4b4a..1f270a1 100644
--- a/warcprox/main.py
+++ b/warcprox/main.py
@@ -145,6 +145,9 @@ def _build_arg_parser(prog='warcprox'):
             '--rethinkdb-services-url', dest='rethinkdb_services_url', help=(
                 'rethinkdb service registry table url; if provided, warcprox '
                 'will create and heartbeat entry for itself'))
+    # optional cookie values to pass to CDX Server; e.g. "cookie1=val1;cookie2=val2"
+    arg_parser.add_argument('--cdxserver-dedup-cookies', dest='cdxserver_dedup_cookies',
+            help=argparse.SUPPRESS)
     arg_parser.add_argument('--queue-size', dest='queue_size', type=int,
             default=500, help=argparse.SUPPRESS)
     arg_parser.add_argument('--max-threads', dest='max_threads', type=int,