From d32bf743bd1de9ababeb465b45f08d86234d5bcd Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Mon, 9 Apr 2018 15:52:44 +0000 Subject: [PATCH] Configurable min dedupable size for text/binary resources New `--dedup-min-text-size` and `--dedup-min-binary-size` cli options with default value = `0`. New `DedupableMixin` which can be used in any dedup class. It is currently used only in CDX dedup. Instead of checking `payload_size() > 0`, we now use `.is_dedupable(recorded_url)` New utility method `RecordedUrl.is_text`. --- warcprox/dedup.py | 19 +++++++++++++++++-- warcprox/main.py | 6 ++++++ warcprox/warcproxy.py | 12 ++++++++++++ 3 files changed, 35 insertions(+), 2 deletions(-) diff --git a/warcprox/dedup.py b/warcprox/dedup.py index 5db8e34..7e19150 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -37,6 +37,20 @@ from concurrent import futures urllib3.disable_warnings() +class DedupableMixin(object): + def __init__(self, options=warcprox.Options()): + self.min_text_size = options.dedup_min_text_size + self.min_binary_size = options.dedup_min_binary_size + + def is_dedupable(self, recorded_url): + """Check if we should try to run dedup on resource based on payload + size compared with min text/binary dedup size options. Return Boolean. + """ + if recorded_url.is_text(): + return recorded_url.response_recorder.payload_size() > self.min_text_size + else: + return recorded_url.response_recorder.payload_size() > self.min_binary_size + class DedupLoader(warcprox.BaseStandardPostfetchProcessor): def __init__(self, dedup_db, options=warcprox.Options()): warcprox.BaseStandardPostfetchProcessor.__init__(self, options=options) @@ -273,9 +287,10 @@ class CdxServerDedup(DedupDb): """ pass -class CdxServerDedupLoader(warcprox.BaseBatchPostfetchProcessor): +class CdxServerDedupLoader(warcprox.BaseBatchPostfetchProcessor, DedupableMixin): def __init__(self, cdx_dedup, options=warcprox.Options()): warcprox.BaseBatchPostfetchProcessor.__init__(self, options) + DedupableMixin.__init__(self, options) self.pool = futures.ThreadPoolExecutor(max_workers=400) self.batch = set() self.cdx_dedup = cdx_dedup @@ -284,7 +299,7 @@ class CdxServerDedupLoader(warcprox.BaseBatchPostfetchProcessor): recorded_url = self.inq.get(block=True, timeout=0.5) if (recorded_url.response_recorder and recorded_url.payload_digest - and recorded_url.response_recorder.payload_size() > 0): + and self.is_dedupable(recorded_url)): self.batch.add(recorded_url) self.pool.submit(self._process_url, recorded_url) else: diff --git a/warcprox/main.py b/warcprox/main.py index 8ff466b..3723445 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -148,6 +148,12 @@ def _build_arg_parser(prog='warcprox'): # optional cookie values to pass to CDX Server; e.g. "cookie1=val1;cookie2=val2" arg_parser.add_argument('--cdxserver-dedup-cookies', dest='cdxserver_dedup_cookies', help=argparse.SUPPRESS) + arg_parser.add_argument('--dedup-min-text-size', dest='dedup_min_text_size', + type=int, default=0, + help=('try to dedup text resources with payload size over this limit in bytes')) + arg_parser.add_argument('--dedup-min-binary-size', dest='dedup_min_binary_size', + type=int, default=0, help=( + 'try to dedup binary resources with payload size over this limit in bytes')) arg_parser.add_argument('--queue-size', dest='queue_size', type=int, default=500, help=argparse.SUPPRESS) arg_parser.add_argument('--max-threads', dest='max_threads', type=int, diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index b6a0943..2477a06 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -376,6 +376,18 @@ class RecordedUrl: self.warc_records = warc_records self.do_not_archive = do_not_archive + def is_text(self): + """Ref: https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Complete_list_of_MIME_types + Alternative method: try to decode('ascii') first N bytes to make sure + its text. + """ + if self.mimetype: + return self.mimetype[:5] == "text/" or self.mimetype in ( + "application/xml", "application/javascript", "application/json", + "application/xhtml+xml", "application/typescript", + "image/svg+xml") + return False + # inherit from object so that multiple inheritance from this class works # properly in python 2 # http://stackoverflow.com/questions/1713038/super-fails-with-error-typeerror-argument-1-must-be-type-not-classobj#18392639