Add hidden CLI option --dedup-only-with-bucket

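With `--dedup-only-with-bucket`, dedup is attempted only for requests whose
`Warcprox-Meta` header contains the key `dedup-bucket`. Without the option
(the default), requests are eligible for dedup whether or not that key is present.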
This commit is contained in:
Vangelis Banos 2018-05-04 20:50:54 +00:00
parent 432e42803c
commit abb54e42d1
2 changed files with 8 additions and 2 deletions

View File

@ -41,14 +41,16 @@ class DedupableMixin(object):
def __init__(self, options=warcprox.Options()): def __init__(self, options=warcprox.Options()):
self.min_text_size = options.dedup_min_text_size self.min_text_size = options.dedup_min_text_size
self.min_binary_size = options.dedup_min_binary_size self.min_binary_size = options.dedup_min_binary_size
self.dedup_only_with_bucket = options.dedup_only_with_bucket
def should_dedup(self, recorded_url): def should_dedup(self, recorded_url):
"""Check if we should try to run dedup on resource based on payload """Check if we should try to run dedup on resource based on payload
size compared with min text/binary dedup size options. size compared with min text/binary dedup size options.
`dedup-bucket` is required in Warcprox-Meta to perform dedup. When we use option --dedup-only-with-bucket, `dedup-bucket` is required
in Warcprox-Meta to perform dedup.
Return Boolean. Return Boolean.
""" """
if "dedup-bucket" not in recorded_url.warcprox_meta: if self.dedup_only_with_bucket and "dedup-bucket" not in recorded_url.warcprox_meta:
return False return False
if recorded_url.is_text(): if recorded_url.is_text():
return recorded_url.response_recorder.payload_size() > self.min_text_size return recorded_url.response_recorder.payload_size() > self.min_text_size

View File

@ -154,6 +154,10 @@ def _build_arg_parser(prog='warcprox'):
arg_parser.add_argument('--dedup-min-binary-size', dest='dedup_min_binary_size', arg_parser.add_argument('--dedup-min-binary-size', dest='dedup_min_binary_size',
type=int, default=0, help=( type=int, default=0, help=(
'try to dedup binary resources with payload size over this limit in bytes')) 'try to dedup binary resources with payload size over this limit in bytes'))
# optionally, dedup request only when `dedup-bucket` is available in
# Warcprox-Meta HTTP header. By default, we dedup all requests.
arg_parser.add_argument('--dedup-only-with-bucket', dest='dedup_only_with_bucket',
action='store_true', default=False, help=argparse.SUPPRESS)
arg_parser.add_argument('--queue-size', dest='queue_size', type=int, arg_parser.add_argument('--queue-size', dest='queue_size', type=int,
default=500, help=argparse.SUPPRESS) default=500, help=argparse.SUPPRESS)
arg_parser.add_argument('--max-threads', dest='max_threads', type=int, arg_parser.add_argument('--max-threads', dest='max_threads', type=int,