Merge pull request #139 from vbanos/dedup-impr

Skip cdx dedup for volatile URLs with session params
This commit is contained in:
Noah Levitt 2019-09-20 14:20:54 -07:00 committed by GitHub
commit 8460a670b2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -266,6 +266,9 @@ class CdxServerDedup(DedupDb):
performance optimisation to handle that. limit < 0 is very inefficient
in general. Maybe it could be configurable in the future.
Skip dedup for URLs with session params. These URLs are certainly
unique and highly volatile, so we cannot dedup them.
:param digest_key: b'sha1:<KEY-VALUE>' (prefix is optional).
Example: b'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'
:param url: Target URL string
@ -274,6 +277,8 @@ class CdxServerDedup(DedupDb):
"""
u = url.decode("utf-8") if isinstance(url, bytes) else url
try:
if any(s in u for s in ('JSESSIONID=', 'session=', 'sess=')):
return None
result = self.http_pool.request('GET', self.cdx_url, fields=dict(
url=u, fl="timestamp,digest", filter="!mimetype:warc/revisit",
limit=-1))