mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
Merge pull request #53 from vbanos/cdx-dedup-cookies
Add --cdxserver-dedup-cookies option
This commit is contained in:
commit
5aafceaeb9
@ -47,7 +47,7 @@ class Factory:
|
||||
dedup_db = warcprox.dedup.TroughDedupDb(options)
|
||||
elif options.cdxserver_dedup:
|
||||
dedup_db = warcprox.dedup.CdxServerDedup(
|
||||
cdx_url=options.cdxserver_dedup)
|
||||
cdx_url=options.cdxserver_dedup, options=options)
|
||||
elif options.dedup_db_file in (None, '', '/dev/null'):
|
||||
logging.info('deduplication disabled')
|
||||
dedup_db = None
|
||||
|
@ -201,12 +201,15 @@ class CdxServerDedup(DedupDb):
|
||||
"""Query a CDX server to perform deduplication.
|
||||
"""
|
||||
logger = logging.getLogger("warcprox.dedup.CdxServerDedup")
|
||||
cookies = None
|
||||
|
||||
def __init__(self, cdx_url="https://web.archive.org/cdx/search",
|
||||
maxsize=200, options=warcprox.Options()):
|
||||
self.cdx_url = cdx_url
|
||||
self.options = options
|
||||
self.http_pool = urllib3.PoolManager(maxsize=maxsize)
|
||||
if options.cdxserver_dedup_cookies:
|
||||
self.cookies = options.cdxserver_dedup_cookies
|
||||
|
||||
def start(self):
|
||||
pass
|
||||
@ -233,9 +236,10 @@ class CdxServerDedup(DedupDb):
|
||||
"""
|
||||
u = url.decode("utf-8") if isinstance(url, bytes) else url
|
||||
try:
|
||||
headers = {'Cookie': self.cookies} if self.cookies else {}
|
||||
result = self.http_pool.request('GET', self.cdx_url, fields=dict(
|
||||
url=u, fl="timestamp,digest", filter="!mimetype:warc/revisit",
|
||||
limit=-1))
|
||||
limit=-1), headers=headers)
|
||||
assert result.status == 200
|
||||
if isinstance(digest_key, bytes):
|
||||
dkey = digest_key
|
||||
|
@ -145,6 +145,9 @@ def _build_arg_parser(prog='warcprox'):
|
||||
'--rethinkdb-services-url', dest='rethinkdb_services_url', help=(
|
||||
'rethinkdb service registry table url; if provided, warcprox '
|
||||
'will create and heartbeat entry for itself'))
|
||||
# optional cookie values to pass to CDX Server; e.g. "cookie1=val1;cookie2=val2"
|
||||
arg_parser.add_argument('--cdxserver-dedup-cookies', dest='cdxserver_dedup_cookies',
|
||||
help=argparse.SUPPRESS)
|
||||
arg_parser.add_argument('--queue-size', dest='queue_size', type=int,
|
||||
default=500, help=argparse.SUPPRESS)
|
||||
arg_parser.add_argument('--max-threads', dest='max_threads', type=int,
|
||||
|
Loading…
x
Reference in New Issue
Block a user