Merge branch 'master' into adds-hop-path-logging

This commit is contained in:
Adam Miller 2022-02-09 18:18:32 +00:00
commit 366ed5155f
3 changed files with 26 additions and 12 deletions

View File

@@ -2,7 +2,7 @@
''' '''
setup.py - setuptools installation configuration for warcprox setup.py - setuptools installation configuration for warcprox
Copyright (C) 2013-2020 Internet Archive Copyright (C) 2013-2021 Internet Archive
This program is free software; you can redistribute it and/or This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License modify it under the terms of the GNU General Public License
@@ -32,7 +32,7 @@ deps = [
'requests>=2.0.1', 'requests>=2.0.1',
'PySocks>=1.6.8', 'PySocks>=1.6.8',
'cryptography>=2.3', 'cryptography>=2.3',
'idna>=2.5', 'idna==2.10',
'PyYAML>=5.1', 'PyYAML>=5.1',
'cachetools', 'cachetools',
] ]
@@ -43,7 +43,7 @@ except:
setuptools.setup( setuptools.setup(
name='warcprox', name='warcprox',
version='2.4.27', version='2.4.28',
description='WARC writing MITM HTTP/S proxy', description='WARC writing MITM HTTP/S proxy',
url='https://github.com/internetarchive/warcprox', url='https://github.com/internetarchive/warcprox',
author='Noah Levitt', author='Noah Levitt',

View File

@@ -1,7 +1,7 @@
""" """
warcprox/__init__.py - warcprox package main file, contains some utility code warcprox/__init__.py - warcprox package main file, contains some utility code
Copyright (C) 2013-2019 Internet Archive Copyright (C) 2013-2021 Internet Archive
This program is free software; you can redistribute it and/or This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License modify it under the terms of the GNU General Public License
@@ -175,8 +175,8 @@ class BaseStandardPostfetchProcessor(BasePostfetchProcessor):
class BaseBatchPostfetchProcessor(BasePostfetchProcessor): class BaseBatchPostfetchProcessor(BasePostfetchProcessor):
MAX_BATCH_SIZE = 500 MAX_BATCH_SIZE = 500
MAX_BATCH_SEC = 10 MAX_BATCH_SEC = 30
MIN_BATCH_SEC = 2.0 MIN_BATCH_SEC = 10
def _get_process_put(self): def _get_process_put(self):
batch = [] batch = []

View File

@@ -1,7 +1,7 @@
''' '''
warcprox/dedup.py - identical payload digest deduplication using sqlite db warcprox/dedup.py - identical payload digest deduplication using sqlite db
Copyright (C) 2013-2018 Internet Archive Copyright (C) 2013-2021 Internet Archive
This program is free software; you can redistribute it and/or This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License modify it under the terms of the GNU General Public License
@@ -384,6 +384,9 @@ class BatchTroughStorer(warcprox.BaseBatchPostfetchProcessor):
self.trough_dedup_db.batch_save, self.trough_dedup_db.batch_save,
buckets[bucket], bucket) buckets[bucket], bucket)
fs[future] = bucket fs[future] = bucket
logging.debug(
'storing dedup info for %s urls '
'in bucket %s', len(buckets[bucket]), bucket)
# wait for results # wait for results
try: try:
@@ -412,10 +415,19 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor):
''' '''
buckets = collections.defaultdict(list) buckets = collections.defaultdict(list)
discards = [] discards = []
# for duplicate checks, see https://webarchive.jira.com/browse/WT-31
hash_plus_urls = set()
for recorded_url in batch: for recorded_url in batch:
if not recorded_url.payload_digest:
discards.append('n/a')
continue
payload_hash = warcprox.digest_str(
recorded_url.payload_digest, self.options.base32)
hash_plus_url = b''.join((payload_hash, recorded_url.url))
if (recorded_url.response_recorder if (recorded_url.response_recorder
and recorded_url.payload_digest and hash_plus_url not in hash_plus_urls
and self.trough_dedup_db.should_dedup(recorded_url)): and self.trough_dedup_db.should_dedup(recorded_url)):
hash_plus_urls.add(hash_plus_url)
if (recorded_url.warcprox_meta if (recorded_url.warcprox_meta
and 'dedup-buckets' in recorded_url.warcprox_meta): and 'dedup-buckets' in recorded_url.warcprox_meta):
for bucket, bucket_mode in recorded_url.warcprox_meta["dedup-buckets"].items(): for bucket, bucket_mode in recorded_url.warcprox_meta["dedup-buckets"].items():
@@ -423,10 +435,12 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor):
else: else:
buckets['__unspecified__'].append(recorded_url) buckets['__unspecified__'].append(recorded_url)
else: else:
discards.append( if hash_plus_url in hash_plus_urls:
warcprox.digest_str( self.logger.debug(
recorded_url.payload_digest, self.options.base32) 'discarding duplicate and setting do_not_archive for %s, hash %s',
if recorded_url.payload_digest else 'n/a') recorded_url.url, payload_hash)
recorded_url.do_not_archive = True
discards.append(payload_hash)
self.logger.debug( self.logger.debug(
'len(batch)=%s len(discards)=%s buckets=%s', 'len(batch)=%s len(discards)=%s buckets=%s',
len(batch), len(discards), len(batch), len(discards),