From e61099ff5f64fa6205013658804559f3a0b9411b Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 27 Apr 2021 10:26:45 -0700 Subject: [PATCH 01/15] idna==2.10 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 190ac54..d76e45e 100755 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ deps = [ 'requests>=2.0.1', 'PySocks>=1.6.8', 'cryptography>=2.3', - 'idna>=2.5', + 'idna==2.10', 'PyYAML>=5.1', 'cachetools', ] From 1476bfec8c71854ad92083295270240fc4e7d93d Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 2 Dec 2021 11:09:17 -0800 Subject: [PATCH 02/15] discard batch hash+url match --- warcprox/dedup.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/warcprox/dedup.py b/warcprox/dedup.py index 77bc7c6..2648bf1 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -1,7 +1,7 @@ ''' warcprox/dedup.py - identical payload digest deduplication using sqlite db -Copyright (C) 2013-2018 Internet Archive +Copyright (C) 2013-2021 Internet Archive This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License @@ -412,10 +412,14 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor): ''' buckets = collections.defaultdict(list) discards = [] + # for duplicate checks, see https://webarchive.jira.com/browse/WT-31 + hash_plus_urls = set() for recorded_url in batch: if (recorded_url.response_recorder and recorded_url.payload_digest - and self.trough_dedup_db.should_dedup(recorded_url)): + and self.trough_dedup_db.should_dedup(recorded_url) + and f'{recorded_url.payload_digest}{recorded_url.url}' not in hash_plus_urls): + hash_plus_urls.add(f'{recorded_url.payload_digest}{recorded_url.url}') if (recorded_url.warcprox_meta and 'dedup-buckets' in recorded_url.warcprox_meta): for bucket, bucket_mode in recorded_url.warcprox_meta["dedup-buckets"].items(): @@ -423,6 +427,9 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor): else: buckets['__unspecified__'].append(recorded_url) else: + if f'{recorded_url.payload_digest}{recorded_url.url}' in hash_plus_urls: + self.logger.debug( + f'discarding duplicate {recorded_url.payload_digest} {recorded_url.url}') discards.append( warcprox.digest_str( recorded_url.payload_digest, self.options.base32) From e74407591352ebb25a99c5c4a29229d0c47a6c4f Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 2 Dec 2021 11:46:30 -0800 Subject: [PATCH 03/15] python 3.5 version, mostly --- warcprox/dedup.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/warcprox/dedup.py b/warcprox/dedup.py index 2648bf1..98cfea1 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -418,8 +418,8 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor): if (recorded_url.response_recorder and recorded_url.payload_digest and self.trough_dedup_db.should_dedup(recorded_url) - and f'{recorded_url.payload_digest}{recorded_url.url}' not in hash_plus_urls): - hash_plus_urls.add(f'{recorded_url.payload_digest}{recorded_url.url}') + and '{}{}'.format(recorded_url.payload_digest, recorded_url.url) not in hash_plus_urls): + hash_plus_urls.add('{}{}'.format(recorded_url.payload_digest, recorded_url.url)) if (recorded_url.warcprox_meta and 'dedup-buckets' in recorded_url.warcprox_meta): for bucket, bucket_mode in recorded_url.warcprox_meta["dedup-buckets"].items(): @@ -427,9 +427,9 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor): else: buckets['__unspecified__'].append(recorded_url) else: - if f'{recorded_url.payload_digest}{recorded_url.url}' in hash_plus_urls: + if recorded_url.payload_digest and '{}{}'.format(recorded_url.payload_digest, recorded_url.url) in hash_plus_urls: self.logger.debug( - f'discarding duplicate {recorded_url.payload_digest} {recorded_url.url}') + 'discarding duplicate {} {}'.format(recorded_url.payload_digest, recorded_url.url)) discards.append( warcprox.digest_str( recorded_url.payload_digest, self.options.base32) From e6a1a7dd7e5633249abd4ad865dad48b9aef8241 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Mon, 6 Dec 2021 17:29:02 -0800 Subject: [PATCH 04/15] increase trough dedup batch window --- warcprox/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/warcprox/__init__.py b/warcprox/__init__.py index 9cd09a8..9fe3a74 100644 --- a/warcprox/__init__.py +++ b/warcprox/__init__.py @@ -1,7 +1,7 @@ """ warcprox/__init__.py - warcprox package main file, contains some utility code -Copyright (C) 2013-2019 Internet Archive +Copyright (C) 2013-2021 Internet Archive This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License @@ -175,8 +175,8 @@ class BaseStandardPostfetchProcessor(BasePostfetchProcessor): class BaseBatchPostfetchProcessor(BasePostfetchProcessor): MAX_BATCH_SIZE = 500 - MAX_BATCH_SEC = 10 - MIN_BATCH_SEC = 2.0 + MAX_BATCH_SEC = 30 + MIN_BATCH_SEC = 10 def _get_process_put(self): batch = [] From b67f1ad0f302489afc5bf4511fbc99179dfc736c Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Mon, 6 Dec 2021 17:29:27 -0800 Subject: [PATCH 05/15] add logging --- warcprox/dedup.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/warcprox/dedup.py b/warcprox/dedup.py index 98cfea1..e8e95c7 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -384,6 +384,9 @@ class BatchTroughStorer(warcprox.BaseBatchPostfetchProcessor): self.trough_dedup_db.batch_save, buckets[bucket], bucket) fs[future] = bucket + logging.debug( + 'storing dedup info for %s urls ' + 'in bucket %s', len(buckets[bucket]), bucket) # wait for results try: @@ -434,6 +437,8 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor): warcprox.digest_str( recorded_url.payload_digest, self.options.base32) if recorded_url.payload_digest else 'n/a') + self.logger.debug( + 'hash_plus_urls: {}'.format(hash_plus_urls)) self.logger.debug( 'len(batch)=%s len(discards)=%s buckets=%s', len(batch), len(discards), From 5e5a74f204cc77b4429df0e1ed24385963a0c538 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Mon, 6 Dec 2021 19:32:35 -0800 Subject: [PATCH 06/15] str, not object --- warcprox/dedup.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/warcprox/dedup.py b/warcprox/dedup.py index e8e95c7..09f5996 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -418,11 +418,14 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor): # for duplicate checks, see https://webarchive.jira.com/browse/WT-31 hash_plus_urls = set() for recorded_url in batch: + if recorded_url.payload_digest: + hash_plus_url = ''.join((warcprox.digest_str( + recorded_url.payload_digest, self.options.base32), recorded_url.url.decode())) if (recorded_url.response_recorder and recorded_url.payload_digest and self.trough_dedup_db.should_dedup(recorded_url) and '{}{}'.format(recorded_url.payload_digest, recorded_url.url) not in hash_plus_urls): - hash_plus_urls.add('{}{}'.format(recorded_url.payload_digest, recorded_url.url)) + hash_plus_urls.add(hash_plus_url) if (recorded_url.warcprox_meta and 'dedup-buckets' in recorded_url.warcprox_meta): for bucket, bucket_mode in recorded_url.warcprox_meta["dedup-buckets"].items(): @@ -430,9 +433,9 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor): else: buckets['__unspecified__'].append(recorded_url) else: - if recorded_url.payload_digest and '{}{}'.format(recorded_url.payload_digest, recorded_url.url) in hash_plus_urls: + if hash_plus_url in hash_plus_urls: self.logger.debug( - 'discarding duplicate {} {}'.format(recorded_url.payload_digest, recorded_url.url)) + 'discarding duplicate {}'.format(hash_plus_url) discards.append( warcprox.digest_str( recorded_url.payload_digest, self.options.base32) From 3eeccd00166b750c1713bc2687359c0a2e319ca5 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Mon, 6 Dec 2021 19:43:27 -0800 Subject: [PATCH 07/15] more hash_plus_url --- warcprox/dedup.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/warcprox/dedup.py b/warcprox/dedup.py index 09f5996..336747f 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -424,7 +424,7 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor): if (recorded_url.response_recorder and recorded_url.payload_digest and self.trough_dedup_db.should_dedup(recorded_url) - and '{}{}'.format(recorded_url.payload_digest, recorded_url.url) not in hash_plus_urls): + and hash_plus_url not in hash_plus_urls): hash_plus_urls.add(hash_plus_url) if (recorded_url.warcprox_meta and 'dedup-buckets' in recorded_url.warcprox_meta): @@ -435,13 +435,13 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor): else: if hash_plus_url in hash_plus_urls: self.logger.debug( - 'discarding duplicate {}'.format(hash_plus_url) + 'discarding duplicate {}'.format(hash_plus_url)) discards.append( warcprox.digest_str( recorded_url.payload_digest, self.options.base32) if recorded_url.payload_digest else 'n/a') self.logger.debug( - 'hash_plus_urls: {}'.format(hash_plus_urls)) + 'hash_plus_urls: {}...'.format(hash_plus_urls[0])) self.logger.debug( 'len(batch)=%s len(discards)=%s buckets=%s', len(batch), len(discards), From da089e0a92d03f28a52fe4983240c30bcef76481 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Mon, 6 Dec 2021 20:33:16 -0800 Subject: [PATCH 08/15] bytes not str --- warcprox/dedup.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/warcprox/dedup.py b/warcprox/dedup.py index 336747f..44319ea 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -419,8 +419,8 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor): hash_plus_urls = set() for recorded_url in batch: if recorded_url.payload_digest: - hash_plus_url = ''.join((warcprox.digest_str( - recorded_url.payload_digest, self.options.base32), recorded_url.url.decode())) + hash_plus_url = b''.join((warcprox.digest_str( + recorded_url.payload_digest, self.options.base32), recorded_url.url)) if (recorded_url.response_recorder and recorded_url.payload_digest and self.trough_dedup_db.should_dedup(recorded_url) @@ -441,7 +441,7 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor): recorded_url.payload_digest, self.options.base32) if recorded_url.payload_digest else 'n/a') self.logger.debug( - 'hash_plus_urls: {}...'.format(hash_plus_urls[0])) + 'hash_plus_urls: {}'.format(len(hash_plus_urls))) self.logger.debug( 'len(batch)=%s len(discards)=%s buckets=%s', len(batch), len(discards), From 7d4c8dcb4ec8c8afd8f65753f38630cc2776e065 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Wed, 8 Dec 2021 11:04:09 -0800 Subject: [PATCH 09/15] recorded_url.do_not_archive = True --- warcprox/dedup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/warcprox/dedup.py b/warcprox/dedup.py index 44319ea..2ceb876 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -435,7 +435,8 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor): else: if hash_plus_url in hash_plus_urls: self.logger.debug( - 'discarding duplicate {}'.format(hash_plus_url)) + 'discarding duplicate {}, setting do_not_archive'.format(hash_plus_url)) + recorded_url.do_not_archive = True discards.append( warcprox.digest_str( recorded_url.payload_digest, self.options.base32) From bcaf293081fe8d9dc5359019b1c9232e7b39c34b Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 9 Dec 2021 12:19:45 -0800 Subject: [PATCH 10/15] better logging --- warcprox/dedup.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/warcprox/dedup.py b/warcprox/dedup.py index 2ceb876..26319d3 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -435,14 +435,14 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor): else: if hash_plus_url in hash_plus_urls: self.logger.debug( - 'discarding duplicate {}, setting do_not_archive'.format(hash_plus_url)) + 'discarding duplicate and setting do_not_archive for %, hash %'.format( + recorded_url.url, warcprox.digest_str( + recorded_url.payload_digest, self.options.base32))) recorded_url.do_not_archive = True discards.append( warcprox.digest_str( recorded_url.payload_digest, self.options.base32) if recorded_url.payload_digest else 'n/a') - self.logger.debug( - 'hash_plus_urls: {}'.format(len(hash_plus_urls))) self.logger.debug( 'len(batch)=%s len(discards)=%s buckets=%s', len(batch), len(discards), From d7aec77597d911f7bf22b6e36dc0297ffc29461d Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 16 Dec 2021 18:36:00 -0800 Subject: [PATCH 11/15] faster, likely --- warcprox/dedup.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/warcprox/dedup.py b/warcprox/dedup.py index 26319d3..43286d7 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -418,13 +418,15 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor): # for duplicate checks, see https://webarchive.jira.com/browse/WT-31 hash_plus_urls = set() for recorded_url in batch: - if recorded_url.payload_digest: - hash_plus_url = b''.join((warcprox.digest_str( - recorded_url.payload_digest, self.options.base32), recorded_url.url)) + if not recorded_url.payload_digest: + discards.append('n/a') + continue + payload_hash = warcprox.digest_str( + recorded_url.payload_digest, self.options.base32) + hash_plus_url = b''.join((payload_hash, recorded_url.url)) if (recorded_url.response_recorder - and recorded_url.payload_digest - and self.trough_dedup_db.should_dedup(recorded_url) - and hash_plus_url not in hash_plus_urls): + and hash_plus_url not in hash_plus_urls + and self.trough_dedup_db.should_dedup(recorded_url)): hash_plus_urls.add(hash_plus_url) if (recorded_url.warcprox_meta and 'dedup-buckets' in recorded_url.warcprox_meta): @@ -436,13 +438,9 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor): if hash_plus_url in hash_plus_urls: self.logger.debug( 'discarding duplicate and setting do_not_archive for %, hash %'.format( - recorded_url.url, warcprox.digest_str( - recorded_url.payload_digest, self.options.base32))) + recorded_url.url, payload_hash)) recorded_url.do_not_archive = True - discards.append( - warcprox.digest_str( - recorded_url.payload_digest, self.options.base32) - if recorded_url.payload_digest else 'n/a') + discards.append(payload_hash) self.logger.debug( 'len(batch)=%s len(discards)=%s buckets=%s', len(batch), len(discards), From aeecb6515f04aa3ba64a8a51dd4cd3d9e474ce2e Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 28 Dec 2021 11:58:30 -0800 Subject: [PATCH 12/15] bump version --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 190ac54..60da37c 100755 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ ''' setup.py - setuptools installation configuration for warcprox -Copyright (C) 2013-2020 Internet Archive +Copyright (C) 2013-2021 Internet Archive This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License @@ -43,7 +43,7 @@ except: setuptools.setup( name='warcprox', - version='2.4.27', + version='2.4.28', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', From 5d8fbf7038f8e419322effc2a678a0a145c35fa2 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Wed, 29 Dec 2021 10:25:04 -0800 Subject: [PATCH 13/15] fix logging buglet --- warcprox/dedup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/warcprox/dedup.py b/warcprox/dedup.py index 43286d7..167fdba 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -437,7 +437,7 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor): else: if hash_plus_url in hash_plus_urls: self.logger.debug( - 'discarding duplicate and setting do_not_archive for %, hash %'.format( + 'discarding duplicate and setting do_not_archive for %s, hash %s'.format( recorded_url.url, payload_hash)) recorded_url.do_not_archive = True discards.append(payload_hash) From bc3d1e6d0049c1854e06a81b26dec1885c5ec960 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Wed, 29 Dec 2021 11:55:39 -0800 Subject: [PATCH 14/15] fix logging buglet ii --- warcprox/dedup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/warcprox/dedup.py b/warcprox/dedup.py index 167fdba..18d6532 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -437,8 +437,8 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor): else: if hash_plus_url in hash_plus_urls: self.logger.debug( - 'discarding duplicate and setting do_not_archive for %s, hash %s'.format( - recorded_url.url, payload_hash)) + 'discarding duplicate and setting do_not_archive for + %s, hash %s', recorded_url.url, payload_hash) recorded_url.do_not_archive = True discards.append(payload_hash) self.logger.debug( From 9e8ea5bb456335542782515a27c0d3926b9ab98b Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Wed, 29 Dec 2021 12:06:18 -0800 Subject: [PATCH 15/15] fix logging buglet iii --- warcprox/dedup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/warcprox/dedup.py b/warcprox/dedup.py index 18d6532..705b74a 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -437,8 +437,8 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor): else: if hash_plus_url in hash_plus_urls: self.logger.debug( - 'discarding duplicate and setting do_not_archive for - %s, hash %s', recorded_url.url, payload_hash) + 'discarding duplicate and setting do_not_archive for %s, hash %s', + recorded_url.url, payload_hash) recorded_url.do_not_archive = True discards.append(payload_hash) self.logger.debug(