Fix bug with dedup_info date encoding

This commit is contained in:
Vangelis Banos 2017-10-19 22:54:34 +00:00
parent 59e995ccdf
commit a0821575b4

View File

@@ -209,7 +209,7 @@ class CdxServerDedup(object):
logger = logging.getLogger("warcprox.dedup.CdxServerDedup") logger = logging.getLogger("warcprox.dedup.CdxServerDedup")
http_pool = urllib3.PoolManager() http_pool = urllib3.PoolManager()
def __init__(self, cdx_url="https://web.archive.org/cdx/search/cdx", def __init__(self, cdx_url="https://web.archive.org/cdx/search",
options=warcprox.Options()): options=warcprox.Options()):
self.cdx_url = cdx_url self.cdx_url = cdx_url
self.options = options self.options = options
@@ -237,7 +237,7 @@ class CdxServerDedup(object):
u = url.decode("utf-8") if isinstance(url, bytes) else url u = url.decode("utf-8") if isinstance(url, bytes) else url
try: try:
result = self.http_pool.request('GET', self.cdx_url, fields=dict( result = self.http_pool.request('GET', self.cdx_url, fields=dict(
url=u, fl="timestamp,digest", limit=-1)) url=u, fl="timestamp,digest", limit=-10))
assert result.status == 200 assert result.status == 200
if isinstance(digest_key, bytes): if isinstance(digest_key, bytes):
dkey = digest_key dkey = digest_key
@@ -249,8 +249,8 @@ class CdxServerDedup(object):
(cdx_ts, cdx_digest) = line.split(b' ') (cdx_ts, cdx_digest) = line.split(b' ')
if cdx_digest == dkey: if cdx_digest == dkey:
dt = datetime(*_split_timestamp(cdx_ts.decode('ascii'))) dt = datetime(*_split_timestamp(cdx_ts.decode('ascii')))
return dict(url=url, date = dt.strftime('%Y-%m-%dT%H:%M:%SZ').encode('utf-8')
date=dt.strftime('%Y-%m-%dT%H:%M:%SZ')) return dict(url=url, date=date)
except (HTTPError, AssertionError, ValueError) as exc: except (HTTPError, AssertionError, ValueError) as exc:
self.logger.error('CdxServerDedup request failed for url=%s %s', self.logger.error('CdxServerDedup request failed for url=%s %s',
url, exc) url, exc)