From 9baa2e22d547fd72967e2a9df65980d5daf1be20 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Fri, 4 May 2018 13:26:38 +0000 Subject: [PATCH 1/3] Rename captures-bucket to dedup-bucket in Warcprox-Meta --- tests/test_warcprox.py | 12 ++++++------ warcprox/bigtable.py | 4 ++-- warcprox/dedup.py | 24 ++++++++++++------------ 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 8bb58ab..d9d7341 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -745,7 +745,7 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies, url2 = 'https://localhost:{}/k/l'.format(https_daemon.server_port) # archive url1 bucket_a - headers = {"Warcprox-Meta": json.dumps({"warc-prefix":"test_dedup_buckets","captures-bucket":"bucket_a"})} + headers = {"Warcprox-Meta": json.dumps({"warc-prefix":"test_dedup_buckets","dedup-bucket":"bucket_a"})} response = requests.get(url1, proxies=archiving_proxies, verify=False, headers=headers) assert response.status_code == 200 assert response.headers['warcprox-test-header'] == 'k!' @@ -771,7 +771,7 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies, assert dedup_lookup is None # archive url2 bucket_b - headers = {"Warcprox-Meta": json.dumps({"warc-prefix":"test_dedup_buckets","captures-bucket":"bucket_b"})} + headers = {"Warcprox-Meta": json.dumps({"warc-prefix":"test_dedup_buckets","dedup-bucket":"bucket_b"})} response = requests.get(url2, proxies=archiving_proxies, verify=False, headers=headers) assert response.status_code == 200 assert response.headers['warcprox-test-header'] == 'k!' @@ -790,7 +790,7 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies, dedup_date = dedup_lookup['date'] # archive url2 bucket_a - headers = {"Warcprox-Meta": json.dumps({"warc-prefix":"test_dedup_buckets","captures-bucket":"bucket_a"})} + headers = {"Warcprox-Meta": json.dumps({"warc-prefix":"test_dedup_buckets","dedup-bucket":"bucket_a"})} response = requests.get(url2, proxies=archiving_proxies, verify=False, headers=headers) assert response.status_code == 200 assert response.headers['warcprox-test-header'] == 'k!' @@ -800,7 +800,7 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies, wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 3) # archive url1 bucket_b - headers = {"Warcprox-Meta": json.dumps({"warc-prefix":"test_dedup_buckets","captures-bucket":"bucket_b"})} + headers = {"Warcprox-Meta": json.dumps({"warc-prefix":"test_dedup_buckets","dedup-bucket":"bucket_b"})} response = requests.get(url1, proxies=archiving_proxies, verify=False, headers=headers) assert response.status_code == 200 assert response.headers['warcprox-test-header'] == 'k!' @@ -1371,7 +1371,7 @@ def test_dedup_ok_flag( assert dedup_lookup is None # archive with dedup_ok:False - request_meta = {'captures-bucket':'test_dedup_ok_flag','dedup-ok':False} + request_meta = {'dedup-bucket':'test_dedup_ok_flag','dedup-ok':False} headers = {'Warcprox-Meta': json.dumps(request_meta)} response = requests.get( url, proxies=archiving_proxies, headers=headers, verify=False) @@ -1389,7 +1389,7 @@ def test_dedup_ok_flag( assert dedup_lookup is None # archive without dedup_ok:False - request_meta = {'captures-bucket':'test_dedup_ok_flag'} + request_meta = {'dedup-bucket':'test_dedup_ok_flag'} headers = {'Warcprox-Meta': json.dumps(request_meta)} response = requests.get( url, proxies=archiving_proxies, headers=headers, verify=False) diff --git a/warcprox/bigtable.py b/warcprox/bigtable.py index cb4671e..d8cd218 100644 --- a/warcprox/bigtable.py +++ b/warcprox/bigtable.py @@ -157,8 +157,8 @@ class RethinkCaptures: sha1base32 = base64.b32encode(digest.digest()).decode("utf-8") if (recorded_url.warcprox_meta - and "captures-bucket" in recorded_url.warcprox_meta): - bucket = recorded_url.warcprox_meta["captures-bucket"] + and "dedup-bucket" in recorded_url.warcprox_meta): + bucket = recorded_url.warcprox_meta["dedup-bucket"] else: bucket = "__unspecified__" diff --git a/warcprox/dedup.py b/warcprox/dedup.py index 17a4fd9..33eab16 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -62,9 +62,9 @@ class DedupLoader(warcprox.BaseStandardPostfetchProcessor, DedupableMixin): and recorded_url.payload_digest and self.should_dedup(recorded_url)): digest_key = warcprox.digest_str(recorded_url.payload_digest, self.options.base32) - if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta: + if recorded_url.warcprox_meta and "dedup-bucket" in recorded_url.warcprox_meta: recorded_url.dedup_info = self.dedup_db.lookup( - digest_key, recorded_url.warcprox_meta["captures-bucket"], + digest_key, recorded_url.warcprox_meta["dedup-bucket"], recorded_url.url) else: recorded_url.dedup_info = self.dedup_db.lookup( @@ -141,10 +141,10 @@ class DedupDb(DedupableMixin): and self.should_dedup(recorded_url)): digest_key = warcprox.digest_str( recorded_url.payload_digest, self.options.base32) - if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta: + if recorded_url.warcprox_meta and "dedup-bucket" in recorded_url.warcprox_meta: self.save( digest_key, records[0], - bucket=recorded_url.warcprox_meta["captures-bucket"]) + bucket=recorded_url.warcprox_meta["dedup-bucket"]) else: self.save(digest_key, records[0]) @@ -206,8 +206,8 @@ class RethinkDedupDb(DedupDb, DedupableMixin): and self.should_dedup(recorded_url)): digest_key = warcprox.digest_str( recorded_url.payload_digest, self.options.base32) - if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta: - self.save(digest_key, records[0], bucket=recorded_url.warcprox_meta["captures-bucket"]) + if recorded_url.warcprox_meta and "dedup-bucket" in recorded_url.warcprox_meta: + self.save(digest_key, records[0], bucket=recorded_url.warcprox_meta["dedup-bucket"]) else: self.save(digest_key, records[0]) @@ -337,8 +337,8 @@ class BatchTroughStorer(warcprox.BaseBatchPostfetchProcessor, DedupableMixin): and recorded_url.warc_records[0].type == b'response' and self.should_dedup(recorded_url)): if (recorded_url.warcprox_meta - and 'captures-bucket' in recorded_url.warcprox_meta): - bucket = recorded_url.warcprox_meta['captures-bucket'] + and 'dedup-bucket' in recorded_url.warcprox_meta): + bucket = recorded_url.warcprox_meta['dedup-bucket'] else: bucket = '__unspecified__' buckets[bucket].append(recorded_url) @@ -387,8 +387,8 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor, DedupableMixin): and recorded_url.payload_digest and self.should_dedup(recorded_url)): if (recorded_url.warcprox_meta - and 'captures-bucket' in recorded_url.warcprox_meta): - bucket = recorded_url.warcprox_meta['captures-bucket'] + and 'dedup-bucket' in recorded_url.warcprox_meta): + bucket = recorded_url.warcprox_meta['dedup-bucket'] else: bucket = '__unspecified__' buckets[bucket].append(recorded_url) @@ -538,9 +538,9 @@ class TroughDedupDb(DedupDb, DedupableMixin): and self.should_dedup(recorded_url)): digest_key = warcprox.digest_str( recorded_url.payload_digest, self.options.base32) - if recorded_url.warcprox_meta and 'captures-bucket' in recorded_url.warcprox_meta: + if recorded_url.warcprox_meta and 'dedup-bucket' in recorded_url.warcprox_meta: self.save( digest_key, records[0], - bucket=recorded_url.warcprox_meta['captures-bucket']) + bucket=recorded_url.warcprox_meta['dedup-bucket']) else: self.save(digest_key, records[0]) From 432e42803cba7e197c86aff4b5b2091986abb10a Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Fri, 4 May 2018 14:27:42 +0000 Subject: [PATCH 2/3] dedup-bucket is required in Warcprox-Meta to do dedup Modify `DedupableMixin.should_dedup` to check Warcprox-Meta for `dedup-bucket` in order to perform dedup. --- warcprox/dedup.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/warcprox/dedup.py b/warcprox/dedup.py index 33eab16..ec03c7c 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -44,8 +44,12 @@ class DedupableMixin(object): def should_dedup(self, recorded_url): """Check if we should try to run dedup on resource based on payload - size compared with min text/binary dedup size options. Return Boolean. + size compared with min text/binary dedup size options. + `dedup-bucket` is required in Warcprox-Meta to perform dedup. + Return Boolean. """ + if "dedup-bucket" not in recorded_url.warcprox_meta: + return False if recorded_url.is_text(): return recorded_url.response_recorder.payload_size() > self.min_text_size else: From abb54e42d1b80b2f405f8314c6dbf4c6e9a6511c Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Fri, 4 May 2018 20:50:54 +0000 Subject: [PATCH 3/3] Add hidden CLI option --dedup-only-with-bucket When we use `--dedup-only-with-bucket`, dedup will be done only when a request has key `dedup-bucket` in `Warcprox-Meta`. --- warcprox/dedup.py | 6 ++++-- warcprox/main.py | 4 ++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/warcprox/dedup.py b/warcprox/dedup.py index ec03c7c..f979d97 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -41,14 +41,16 @@ class DedupableMixin(object): def __init__(self, options=warcprox.Options()): self.min_text_size = options.dedup_min_text_size self.min_binary_size = options.dedup_min_binary_size + self.dedup_only_with_bucket = options.dedup_only_with_bucket def should_dedup(self, recorded_url): """Check if we should try to run dedup on resource based on payload size compared with min text/binary dedup size options. - `dedup-bucket` is required in Warcprox-Meta to perform dedup. + When we use option --dedup-only-with-bucket, `dedup-bucket` is required + in Warcprox-Meta to perform dedup. Return Boolean. """ - if "dedup-bucket" not in recorded_url.warcprox_meta: + if self.dedup_only_with_bucket and "dedup-bucket" not in recorded_url.warcprox_meta: return False if recorded_url.is_text(): return recorded_url.response_recorder.payload_size() > self.min_text_size diff --git a/warcprox/main.py b/warcprox/main.py index 3723445..6fb46ef 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -154,6 +154,10 @@ def _build_arg_parser(prog='warcprox'): arg_parser.add_argument('--dedup-min-binary-size', dest='dedup_min_binary_size', type=int, default=0, help=( 'try to dedup binary resources with payload size over this limit in bytes')) + # optionally, dedup request only when `dedup-bucket` is available in + # Warcprox-Meta HTTP header. By default, we dedup all requests. + arg_parser.add_argument('--dedup-only-with-bucket', dest='dedup_only_with_bucket', + action='store_true', default=False, help=argparse.SUPPRESS) arg_parser.add_argument('--queue-size', dest='queue_size', type=int, default=500, help=argparse.SUPPRESS) arg_parser.add_argument('--max-threads', dest='max_threads', type=int,