Configurable min dedupable size for text/binary resources

New `--dedup-min-text-size` and `--dedup-min-binary-size` CLI options,
each with a default value of `0`.

New `DedupableMixin`, which can be used in any dedup class. It is
currently used only in CDX dedup. Instead of checking `payload_size() >
0`, we now use `.is_dedupable(recorded_url)`.

New utility method `RecordedUrl.is_text`.
This commit is contained in:
Vangelis Banos 2018-04-09 15:52:44 +00:00
parent ebf5453c2f
commit d32bf743bd
3 changed files with 35 additions and 2 deletions

View File

@ -37,6 +37,20 @@ from concurrent import futures
urllib3.disable_warnings()
class DedupableMixin(object):
    """Mixin that decides whether a recorded URL should go through dedup.

    The decision compares the response payload size against a minimum
    size threshold, with separate thresholds for text and binary
    resources.
    """

    def __init__(self, options=warcprox.Options()):
        """Read the minimum dedupable sizes from `options`.

        `options` must expose `dedup_min_text_size` and
        `dedup_min_binary_size`. A falsy value is treated as 0 so that
        `is_dedupable` never compares an int payload size with `None`,
        which raises TypeError on Python 3.
        """
        # NOTE(review): presumably these attributes are None when the
        # options object was not built by the arg parser (e.g. the bare
        # `warcprox.Options()` default above) -- confirm against Options.
        self.min_text_size = options.dedup_min_text_size or 0
        self.min_binary_size = options.dedup_min_binary_size or 0

    def is_dedupable(self, recorded_url):
        """Check if we should try to run dedup on resource based on payload
        size compared with min text/binary dedup size options. Return Boolean.

        `recorded_url` must provide `is_text()` and a `response_recorder`
        with a `payload_size()` method. The payload has to be strictly
        larger than the applicable threshold to be dedupable.
        """
        if recorded_url.is_text():
            min_size = self.min_text_size
        else:
            min_size = self.min_binary_size
        return recorded_url.response_recorder.payload_size() > min_size
class DedupLoader(warcprox.BaseStandardPostfetchProcessor):
def __init__(self, dedup_db, options=warcprox.Options()):
warcprox.BaseStandardPostfetchProcessor.__init__(self, options=options)
@ -273,9 +287,10 @@ class CdxServerDedup(DedupDb):
"""
pass
class CdxServerDedupLoader(warcprox.BaseBatchPostfetchProcessor):
class CdxServerDedupLoader(warcprox.BaseBatchPostfetchProcessor, DedupableMixin):
def __init__(self, cdx_dedup, options=warcprox.Options()):
warcprox.BaseBatchPostfetchProcessor.__init__(self, options)
DedupableMixin.__init__(self, options)
self.pool = futures.ThreadPoolExecutor(max_workers=400)
self.batch = set()
self.cdx_dedup = cdx_dedup
@ -284,7 +299,7 @@ class CdxServerDedupLoader(warcprox.BaseBatchPostfetchProcessor):
recorded_url = self.inq.get(block=True, timeout=0.5)
if (recorded_url.response_recorder
and recorded_url.payload_digest
and recorded_url.response_recorder.payload_size() > 0):
and self.is_dedupable(recorded_url)):
self.batch.add(recorded_url)
self.pool.submit(self._process_url, recorded_url)
else:

View File

@ -148,6 +148,12 @@ def _build_arg_parser(prog='warcprox'):
# optional cookie values to pass to CDX Server; e.g. "cookie1=val1;cookie2=val2"
arg_parser.add_argument('--cdxserver-dedup-cookies', dest='cdxserver_dedup_cookies',
help=argparse.SUPPRESS)
arg_parser.add_argument('--dedup-min-text-size', dest='dedup_min_text_size',
type=int, default=0,
help=('try to dedup text resources with payload size over this limit in bytes'))
arg_parser.add_argument('--dedup-min-binary-size', dest='dedup_min_binary_size',
type=int, default=0, help=(
'try to dedup binary resources with payload size over this limit in bytes'))
arg_parser.add_argument('--queue-size', dest='queue_size', type=int,
default=500, help=argparse.SUPPRESS)
arg_parser.add_argument('--max-threads', dest='max_threads', type=int,

View File

@ -376,6 +376,18 @@ class RecordedUrl:
self.warc_records = warc_records
self.do_not_archive = do_not_archive
def is_text(self):
    """Return True if `self.mimetype` denotes a textual resource.

    A resource counts as text when its media type is `text/*` or one of
    a short list of well-known textual `application/*` / `image/*` types.
    Ref: https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Complete_list_of_MIME_types
    Alternative method: try to decode('ascii') first N bytes to make sure
    it's text.

    Returns False when `self.mimetype` is empty or None.
    """
    if not self.mimetype:
        return False
    # Media types are case-insensitive and may carry parameters
    # (e.g. "application/json; charset=utf-8"), so compare only the
    # lower-cased type/subtype part.
    media_type = self.mimetype.split(";", 1)[0].strip().lower()
    return media_type.startswith("text/") or media_type in (
        "application/xml", "application/javascript", "application/json",
        "application/xhtml+xml", "application/typescript",
        "image/svg+xml")
# inherit from object so that multiple inheritance from this class works
# properly in python 2
# http://stackoverflow.com/questions/1713038/super-fails-with-error-typeerror-argument-1-must-be-type-not-classobj#18392639