mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
Merge pull request #139 from vbanos/dedup-impr
Skip cdx dedup for volatile URLs with session params
This commit is contained in:
commit
8460a670b2
@ -266,6 +266,9 @@ class CdxServerDedup(DedupDb):
|
|||||||
performance optimisation to handle that. limit < 0 is very inefficient
|
performance optimisation to handle that. limit < 0 is very inefficient
|
||||||
in general. Maybe it could be configurable in the future.
|
in general. Maybe it could be configurable in the future.
|
||||||
|
|
||||||
|
Skip dedup for URLs with session params. These URLs are certainly
|
||||||
|
unique and highly volatile, so we cannot dedup them.
|
||||||
|
|
||||||
:param digest_key: b'sha1:<KEY-VALUE>' (prefix is optional).
|
:param digest_key: b'sha1:<KEY-VALUE>' (prefix is optional).
|
||||||
Example: b'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'
|
Example: b'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'
|
||||||
:param url: Target URL string
|
:param url: Target URL string
|
||||||
@ -274,6 +277,8 @@ class CdxServerDedup(DedupDb):
|
|||||||
"""
|
"""
|
||||||
u = url.decode("utf-8") if isinstance(url, bytes) else url
|
u = url.decode("utf-8") if isinstance(url, bytes) else url
|
||||||
try:
|
try:
|
||||||
|
if any(s in u for s in ('JSESSIONID=', 'session=', 'sess=')):
|
||||||
|
return None
|
||||||
result = self.http_pool.request('GET', self.cdx_url, fields=dict(
|
result = self.http_pool.request('GET', self.cdx_url, fields=dict(
|
||||||
url=u, fl="timestamp,digest", filter="!mimetype:warc/revisit",
|
url=u, fl="timestamp,digest", filter="!mimetype:warc/revisit",
|
||||||
limit=-1))
|
limit=-1))
|
||||||
|
Loading…
x
Reference in New Issue
Block a user