mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
Merge branch 'master' into adds-hop-path-logging
This commit is contained in:
commit
366ed5155f
6
setup.py
6
setup.py
@ -2,7 +2,7 @@
|
|||||||
'''
|
'''
|
||||||
setup.py - setuptools installation configuration for warcprox
|
setup.py - setuptools installation configuration for warcprox
|
||||||
|
|
||||||
Copyright (C) 2013-2020 Internet Archive
|
Copyright (C) 2013-2021 Internet Archive
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or
|
This program is free software; you can redistribute it and/or
|
||||||
modify it under the terms of the GNU General Public License
|
modify it under the terms of the GNU General Public License
|
||||||
@ -32,7 +32,7 @@ deps = [
|
|||||||
'requests>=2.0.1',
|
'requests>=2.0.1',
|
||||||
'PySocks>=1.6.8',
|
'PySocks>=1.6.8',
|
||||||
'cryptography>=2.3',
|
'cryptography>=2.3',
|
||||||
'idna>=2.5',
|
'idna==2.10',
|
||||||
'PyYAML>=5.1',
|
'PyYAML>=5.1',
|
||||||
'cachetools',
|
'cachetools',
|
||||||
]
|
]
|
||||||
@ -43,7 +43,7 @@ except:
|
|||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='warcprox',
|
name='warcprox',
|
||||||
version='2.4.27',
|
version='2.4.28',
|
||||||
description='WARC writing MITM HTTP/S proxy',
|
description='WARC writing MITM HTTP/S proxy',
|
||||||
url='https://github.com/internetarchive/warcprox',
|
url='https://github.com/internetarchive/warcprox',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
"""
|
"""
|
||||||
warcprox/__init__.py - warcprox package main file, contains some utility code
|
warcprox/__init__.py - warcprox package main file, contains some utility code
|
||||||
|
|
||||||
Copyright (C) 2013-2019 Internet Archive
|
Copyright (C) 2013-2021 Internet Archive
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or
|
This program is free software; you can redistribute it and/or
|
||||||
modify it under the terms of the GNU General Public License
|
modify it under the terms of the GNU General Public License
|
||||||
@ -175,8 +175,8 @@ class BaseStandardPostfetchProcessor(BasePostfetchProcessor):
|
|||||||
|
|
||||||
class BaseBatchPostfetchProcessor(BasePostfetchProcessor):
|
class BaseBatchPostfetchProcessor(BasePostfetchProcessor):
|
||||||
MAX_BATCH_SIZE = 500
|
MAX_BATCH_SIZE = 500
|
||||||
MAX_BATCH_SEC = 10
|
MAX_BATCH_SEC = 30
|
||||||
MIN_BATCH_SEC = 2.0
|
MIN_BATCH_SEC = 10
|
||||||
|
|
||||||
def _get_process_put(self):
|
def _get_process_put(self):
|
||||||
batch = []
|
batch = []
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
'''
|
'''
|
||||||
warcprox/dedup.py - identical payload digest deduplication using sqlite db
|
warcprox/dedup.py - identical payload digest deduplication using sqlite db
|
||||||
|
|
||||||
Copyright (C) 2013-2018 Internet Archive
|
Copyright (C) 2013-2021 Internet Archive
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or
|
This program is free software; you can redistribute it and/or
|
||||||
modify it under the terms of the GNU General Public License
|
modify it under the terms of the GNU General Public License
|
||||||
@ -384,6 +384,9 @@ class BatchTroughStorer(warcprox.BaseBatchPostfetchProcessor):
|
|||||||
self.trough_dedup_db.batch_save,
|
self.trough_dedup_db.batch_save,
|
||||||
buckets[bucket], bucket)
|
buckets[bucket], bucket)
|
||||||
fs[future] = bucket
|
fs[future] = bucket
|
||||||
|
logging.debug(
|
||||||
|
'storing dedup info for %s urls '
|
||||||
|
'in bucket %s', len(buckets[bucket]), bucket)
|
||||||
|
|
||||||
# wait for results
|
# wait for results
|
||||||
try:
|
try:
|
||||||
@ -412,10 +415,19 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor):
|
|||||||
'''
|
'''
|
||||||
buckets = collections.defaultdict(list)
|
buckets = collections.defaultdict(list)
|
||||||
discards = []
|
discards = []
|
||||||
|
# for duplicate checks, see https://webarchive.jira.com/browse/WT-31
|
||||||
|
hash_plus_urls = set()
|
||||||
for recorded_url in batch:
|
for recorded_url in batch:
|
||||||
|
if not recorded_url.payload_digest:
|
||||||
|
discards.append('n/a')
|
||||||
|
continue
|
||||||
|
payload_hash = warcprox.digest_str(
|
||||||
|
recorded_url.payload_digest, self.options.base32)
|
||||||
|
hash_plus_url = b''.join((payload_hash, recorded_url.url))
|
||||||
if (recorded_url.response_recorder
|
if (recorded_url.response_recorder
|
||||||
and recorded_url.payload_digest
|
and hash_plus_url not in hash_plus_urls
|
||||||
and self.trough_dedup_db.should_dedup(recorded_url)):
|
and self.trough_dedup_db.should_dedup(recorded_url)):
|
||||||
|
hash_plus_urls.add(hash_plus_url)
|
||||||
if (recorded_url.warcprox_meta
|
if (recorded_url.warcprox_meta
|
||||||
and 'dedup-buckets' in recorded_url.warcprox_meta):
|
and 'dedup-buckets' in recorded_url.warcprox_meta):
|
||||||
for bucket, bucket_mode in recorded_url.warcprox_meta["dedup-buckets"].items():
|
for bucket, bucket_mode in recorded_url.warcprox_meta["dedup-buckets"].items():
|
||||||
@ -423,10 +435,12 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor):
|
|||||||
else:
|
else:
|
||||||
buckets['__unspecified__'].append(recorded_url)
|
buckets['__unspecified__'].append(recorded_url)
|
||||||
else:
|
else:
|
||||||
discards.append(
|
if hash_plus_url in hash_plus_urls:
|
||||||
warcprox.digest_str(
|
self.logger.debug(
|
||||||
recorded_url.payload_digest, self.options.base32)
|
'discarding duplicate and setting do_not_archive for %s, hash %s',
|
||||||
if recorded_url.payload_digest else 'n/a')
|
recorded_url.url, payload_hash)
|
||||||
|
recorded_url.do_not_archive = True
|
||||||
|
discards.append(payload_hash)
|
||||||
self.logger.debug(
|
self.logger.debug(
|
||||||
'len(batch)=%s len(discards)=%s buckets=%s',
|
'len(batch)=%s len(discards)=%s buckets=%s',
|
||||||
len(batch), len(discards),
|
len(batch), len(discards),
|
||||||
|
Loading…
x
Reference in New Issue
Block a user