mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
Add hidden CLI option --dedup-only-with-bucket
When `--dedup-only-with-bucket` is set, dedup is performed only for requests whose `Warcprox-Meta` header contains the key `dedup-bucket`. Without the option, every request remains eligible for dedup (the default).
This commit is contained in:
parent
432e42803c
commit
abb54e42d1
@ -41,14 +41,16 @@ class DedupableMixin(object):
|
|||||||
def __init__(self, options=warcprox.Options()):
|
def __init__(self, options=warcprox.Options()):
|
||||||
self.min_text_size = options.dedup_min_text_size
|
self.min_text_size = options.dedup_min_text_size
|
||||||
self.min_binary_size = options.dedup_min_binary_size
|
self.min_binary_size = options.dedup_min_binary_size
|
||||||
|
self.dedup_only_with_bucket = options.dedup_only_with_bucket
|
||||||
|
|
||||||
def should_dedup(self, recorded_url):
|
def should_dedup(self, recorded_url):
|
||||||
"""Check if we should try to run dedup on resource based on payload
|
"""Check if we should try to run dedup on resource based on payload
|
||||||
size compared with min text/binary dedup size options.
|
size compared with min text/binary dedup size options.
|
||||||
`dedup-bucket` is required in Warcprox-Meta to perform dedup.
|
When we use option --dedup-only-with-bucket, `dedup-bucket` is required
|
||||||
|
in Warcprox-Meta to perform dedup.
|
||||||
Return Boolean.
|
Return Boolean.
|
||||||
"""
|
"""
|
||||||
if "dedup-bucket" not in recorded_url.warcprox_meta:
|
if self.dedup_only_with_bucket and "dedup-bucket" not in recorded_url.warcprox_meta:
|
||||||
return False
|
return False
|
||||||
if recorded_url.is_text():
|
if recorded_url.is_text():
|
||||||
return recorded_url.response_recorder.payload_size() > self.min_text_size
|
return recorded_url.response_recorder.payload_size() > self.min_text_size
|
||||||
|
@ -154,6 +154,10 @@ def _build_arg_parser(prog='warcprox'):
|
|||||||
arg_parser.add_argument('--dedup-min-binary-size', dest='dedup_min_binary_size',
|
arg_parser.add_argument('--dedup-min-binary-size', dest='dedup_min_binary_size',
|
||||||
type=int, default=0, help=(
|
type=int, default=0, help=(
|
||||||
'try to dedup binary resources with payload size over this limit in bytes'))
|
'try to dedup binary resources with payload size over this limit in bytes'))
|
||||||
|
# optionally, dedup request only when `dedup-bucket` is available in
|
||||||
|
# Warcprox-Meta HTTP header. By default, we dedup all requests.
|
||||||
|
arg_parser.add_argument('--dedup-only-with-bucket', dest='dedup_only_with_bucket',
|
||||||
|
action='store_true', default=False, help=argparse.SUPPRESS)
|
||||||
arg_parser.add_argument('--queue-size', dest='queue_size', type=int,
|
arg_parser.add_argument('--queue-size', dest='queue_size', type=int,
|
||||||
default=500, help=argparse.SUPPRESS)
|
default=500, help=argparse.SUPPRESS)
|
||||||
arg_parser.add_argument('--max-threads', dest='max_threads', type=int,
|
arg_parser.add_argument('--max-threads', dest='max_threads', type=int,
|
||||||
|
Loading…
x
Reference in New Issue
Block a user