mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
Merge pull request #139 from vbanos/dedup-impr
Skip cdx dedup for volatile URLs with session params
This commit is contained in:
commit
8460a670b2
@ -266,6 +266,9 @@ class CdxServerDedup(DedupDb):
|
||||
performance optimisation to handle that. limit < 0 is very inefficient
|
||||
in general. Maybe it could be configurable in the future.
|
||||
|
||||
Skip dedup for URLs with session params. These URLs are certainly
|
||||
unique and highly volatile, so we cannot dedup them.
|
||||
|
||||
:param digest_key: b'sha1:<KEY-VALUE>' (prefix is optional).
|
||||
Example: b'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'
|
||||
:param url: Target URL string
|
||||
@ -274,6 +277,8 @@ class CdxServerDedup(DedupDb):
|
||||
"""
|
||||
u = url.decode("utf-8") if isinstance(url, bytes) else url
|
||||
try:
|
||||
if any(s in u for s in ('JSESSIONID=', 'session=', 'sess=')):
|
||||
return None
|
||||
result = self.http_pool.request('GET', self.cdx_url, fields=dict(
|
||||
url=u, fl="timestamp,digest", filter="!mimetype:warc/revisit",
|
||||
limit=-1))
|
||||
|
Loading…
x
Reference in New Issue
Block a user