diff --git a/warcprox/dedup.py b/warcprox/dedup.py index ec03c7c..f979d97 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -41,14 +41,16 @@ class DedupableMixin(object): def __init__(self, options=warcprox.Options()): self.min_text_size = options.dedup_min_text_size self.min_binary_size = options.dedup_min_binary_size + self.dedup_only_with_bucket = options.dedup_only_with_bucket def should_dedup(self, recorded_url): """Check if we should try to run dedup on resource based on payload size compared with min text/binary dedup size options. - `dedup-bucket` is required in Warcprox-Meta to perform dedup. + When the --dedup-only-with-bucket option is set, `dedup-bucket` is required + in Warcprox-Meta to perform dedup. Return Boolean. """ - if "dedup-bucket" not in recorded_url.warcprox_meta: + if self.dedup_only_with_bucket and "dedup-bucket" not in recorded_url.warcprox_meta: return False if recorded_url.is_text(): return recorded_url.response_recorder.payload_size() > self.min_text_size diff --git a/warcprox/main.py b/warcprox/main.py index 3723445..6fb46ef 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -154,6 +154,10 @@ def _build_arg_parser(prog='warcprox'): arg_parser.add_argument('--dedup-min-binary-size', dest='dedup_min_binary_size', type=int, default=0, help=( 'try to dedup binary resources with payload size over this limit in bytes')) + # Optionally, dedup a request only when `dedup-bucket` is present in the + # Warcprox-Meta HTTP header. By default, all requests are deduped. + arg_parser.add_argument('--dedup-only-with-bucket', dest='dedup_only_with_bucket', + action='store_true', default=False, help=argparse.SUPPRESS) arg_parser.add_argument('--queue-size', dest='queue_size', type=int, default=500, help=argparse.SUPPRESS) arg_parser.add_argument('--max-threads', dest='max_threads', type=int,