diff --git a/warcprox/controller.py b/warcprox/controller.py index dfd930b..fe9960a 100644 --- a/warcprox/controller.py +++ b/warcprox/controller.py @@ -47,7 +47,7 @@ class Factory: dedup_db = warcprox.dedup.TroughDedupDb(options) elif options.cdxserver_dedup: dedup_db = warcprox.dedup.CdxServerDedup( - cdx_url=options.cdxserver_dedup) + cdx_url=options.cdxserver_dedup, options=options) elif options.dedup_db_file in (None, '', '/dev/null'): logging.info('deduplication disabled') dedup_db = None diff --git a/warcprox/dedup.py b/warcprox/dedup.py index 950c110..cd3b397 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -201,12 +201,15 @@ class CdxServerDedup(DedupDb): """Query a CDX server to perform deduplication. """ logger = logging.getLogger("warcprox.dedup.CdxServerDedup") + cookies = None def __init__(self, cdx_url="https://web.archive.org/cdx/search", maxsize=200, options=warcprox.Options()): self.cdx_url = cdx_url self.options = options self.http_pool = urllib3.PoolManager(maxsize=maxsize) + if options.cdxserver_dedup_cookies: + self.cookies = options.cdxserver_dedup_cookies def start(self): pass @@ -233,9 +236,10 @@ class CdxServerDedup(DedupDb): """ u = url.decode("utf-8") if isinstance(url, bytes) else url try: + headers = {'Cookie': self.cookies} if self.cookies else {} result = self.http_pool.request('GET', self.cdx_url, fields=dict( url=u, fl="timestamp,digest", filter="!mimetype:warc/revisit", - limit=-1)) + limit=-1), headers=headers) assert result.status == 200 if isinstance(digest_key, bytes): dkey = digest_key diff --git a/warcprox/main.py b/warcprox/main.py index 59e4b4a..1f270a1 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -145,6 +145,9 @@ def _build_arg_parser(prog='warcprox'): '--rethinkdb-services-url', dest='rethinkdb_services_url', help=( 'rethinkdb service registry table url; if provided, warcprox ' 'will create and heartbeat entry for itself')) + # optional cookie values to pass to CDX Server; e.g. "cookie1=val1;cookie2=val2" + arg_parser.add_argument('--cdxserver-dedup-cookies', dest='cdxserver_dedup_cookies', + help=argparse.SUPPRESS) arg_parser.add_argument('--queue-size', dest='queue_size', type=int, default=500, help=argparse.SUPPRESS) arg_parser.add_argument('--max-threads', dest='max_threads', type=int,