mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
Configurable min dedupable size for text/binary resources
New `--dedup-min-text-size` and `--dedup-min-binary-size` CLI options, both with a default value of `0`. New `DedupableMixin`, which can be used in any dedup class; it is currently used only in CDX dedup. Instead of checking `payload_size() > 0`, we now use `.is_dedupable(recorded_url)`. New utility method `RecordedUrl.is_text`.
This commit is contained in:
parent
ebf5453c2f
commit
d32bf743bd
@ -37,6 +37,20 @@ from concurrent import futures
|
||||
|
||||
urllib3.disable_warnings()
|
||||
|
||||
class DedupableMixin(object):
    """Mixin that decides whether a recorded URL is worth deduplicating.

    The decision is based on the response payload size compared with a
    minimum size, configured separately for text and for binary resources.
    """

    def __init__(self, options=warcprox.Options()):
        """Read the minimum dedupable sizes from `options`.

        Uses `options.dedup_min_text_size` and
        `options.dedup_min_binary_size`.
        """
        self.min_text_size = options.dedup_min_text_size
        self.min_binary_size = options.dedup_min_binary_size

    def is_dedupable(self, recorded_url):
        """Check if we should try to run dedup on resource based on payload
        size compared with min text/binary dedup size options.

        `recorded_url` must provide `is_text()` and a `response_recorder`
        with `payload_size()`. The comparison is strict: a payload whose
        size equals the configured minimum is not dedupable.

        Return Boolean.
        """
        if recorded_url.is_text():
            return (recorded_url.response_recorder.payload_size()
                    > self.min_text_size)
        else:
            return (recorded_url.response_recorder.payload_size()
                    > self.min_binary_size)
|
||||
|
||||
class DedupLoader(warcprox.BaseStandardPostfetchProcessor):
|
||||
def __init__(self, dedup_db, options=warcprox.Options()):
|
||||
warcprox.BaseStandardPostfetchProcessor.__init__(self, options=options)
|
||||
@ -273,9 +287,10 @@ class CdxServerDedup(DedupDb):
|
||||
"""
|
||||
pass
|
||||
|
||||
class CdxServerDedupLoader(warcprox.BaseBatchPostfetchProcessor):
|
||||
class CdxServerDedupLoader(warcprox.BaseBatchPostfetchProcessor, DedupableMixin):
|
||||
def __init__(self, cdx_dedup, options=warcprox.Options()):
|
||||
warcprox.BaseBatchPostfetchProcessor.__init__(self, options)
|
||||
DedupableMixin.__init__(self, options)
|
||||
self.pool = futures.ThreadPoolExecutor(max_workers=400)
|
||||
self.batch = set()
|
||||
self.cdx_dedup = cdx_dedup
|
||||
@ -284,7 +299,7 @@ class CdxServerDedupLoader(warcprox.BaseBatchPostfetchProcessor):
|
||||
recorded_url = self.inq.get(block=True, timeout=0.5)
|
||||
if (recorded_url.response_recorder
|
||||
and recorded_url.payload_digest
|
||||
and recorded_url.response_recorder.payload_size() > 0):
|
||||
and self.is_dedupable(recorded_url)):
|
||||
self.batch.add(recorded_url)
|
||||
self.pool.submit(self._process_url, recorded_url)
|
||||
else:
|
||||
|
@ -148,6 +148,12 @@ def _build_arg_parser(prog='warcprox'):
|
||||
# optional cookie values to pass to CDX Server; e.g. "cookie1=val1;cookie2=val2"
|
||||
arg_parser.add_argument('--cdxserver-dedup-cookies', dest='cdxserver_dedup_cookies',
|
||||
help=argparse.SUPPRESS)
|
||||
arg_parser.add_argument('--dedup-min-text-size', dest='dedup_min_text_size',
|
||||
type=int, default=0,
|
||||
help=('try to dedup text resources with payload size over this limit in bytes'))
|
||||
arg_parser.add_argument('--dedup-min-binary-size', dest='dedup_min_binary_size',
|
||||
type=int, default=0, help=(
|
||||
'try to dedup binary resources with payload size over this limit in bytes'))
|
||||
arg_parser.add_argument('--queue-size', dest='queue_size', type=int,
|
||||
default=500, help=argparse.SUPPRESS)
|
||||
arg_parser.add_argument('--max-threads', dest='max_threads', type=int,
|
||||
|
@ -376,6 +376,18 @@ class RecordedUrl:
|
||||
self.warc_records = warc_records
|
||||
self.do_not_archive = do_not_archive
|
||||
|
||||
def is_text(self):
    """Return True if `self.mimetype` denotes a text resource.

    A mimetype counts as text when it starts with "text/" or is one of a
    fixed set of text-based types (XML, JavaScript, JSON, XHTML,
    TypeScript, SVG). The match is exact and case-sensitive. A missing or
    empty mimetype yields False.

    Ref: https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Complete_list_of_MIME_types
    Alternative method: try to decode('ascii') first N bytes to make sure
    it's text.
    """
    if self.mimetype:
        return self.mimetype[:5] == "text/" or self.mimetype in (
            "application/xml", "application/javascript", "application/json",
            "application/xhtml+xml", "application/typescript",
            "image/svg+xml")
    return False
|
||||
|
||||
# inherit from object so that multiple inheritance from this class works
|
||||
# properly in python 2
|
||||
# http://stackoverflow.com/questions/1713038/super-fails-with-error-typeerror-argument-1-must-be-type-not-classobj#18392639
|
||||
|
Loading…
x
Reference in New Issue
Block a user