From ddf3207e40b36f6fb8d432f6801be1221e1904d4 Mon Sep 17 00:00:00 2001 From: Lukey3332 Date: Wed, 27 Jan 2021 02:06:18 +0100 Subject: [PATCH] Add configuration options for dedup (#597) * Add configuration options for dedup Signed-off-by: Lukas Straub * Add documentation for new dedup_index configuration options Signed-off-by: Lukas Straub --- docs/manual/configuring.rst | 15 ++++++++++++++- pywb/apps/frontendapp.py | 31 +++++++++++++++++++++++++++++-- 2 files changed, 43 insertions(+), 3 deletions(-) diff --git a/docs/manual/configuring.rst b/docs/manual/configuring.rst index e5922948..2c72f319 100644 --- a/docs/manual/configuring.rst +++ b/docs/manual/configuring.rst @@ -267,7 +267,6 @@ The full set of configurable options (with their default settings) is as follows filename_template: my-warc-{timestamp}-{hostname}-{random}.warc.gz source_filter: live - The required ``source_coll`` setting specifies the source collection from which to load content that will be recorded. Most likely this will be the :ref:`live-web` collection, which should also be defined. However, it could be any other collection, allowing for "extraction" from other collections or remote web archives. @@ -295,6 +294,20 @@ If running with auto indexing, the WARC will also get automatically indexed and As a shortcut, ``recorder: live`` can also be used to specify only the ``source_coll`` option. +Optionally, a ``dedup_index`` key can be placed under the ``recorder`` key to enable deduplication of responses via an index:: + + recorder: + ... + dedup_index: + type: redis + dupe_policy: revisit + redis_url: 'redis://localhost/2/{coll}:cdxj' + +For ``type`` currently only ``redis`` is supported. + +The ``dupe_policy`` key specifies what will happen when a duplicate response is found. Can be ``duplicate``, to write duplicate responses, ``revisit``, to write a revisit record or ``skip`` to ignore duplicates and not write anything to the WARC. 
+ +The ``redis_url`` key specifies which redis database to use and the template for the sorted-set key to use. .. _auto-fetch: diff --git a/pywb/apps/frontendapp.py b/pywb/apps/frontendapp.py index eab2e67f..be6e9df9 100644 --- a/pywb/apps/frontendapp.py +++ b/pywb/apps/frontendapp.py @@ -10,6 +10,8 @@ from wsgiprox.wsgiprox import WSGIProxMiddleware from pywb.recorder.multifilewarcwriter import MultiFileWARCWriter from pywb.recorder.recorderapp import RecorderApp +from pywb.recorder.filters import SkipDupePolicy, WriteDupePolicy, WriteRevisitDupePolicy +from pywb.recorder.redisindexer import WritableRedisIndexer from pywb.utils.loaders import load_yaml_config from pywb.utils.geventserver import GeventServer @@ -207,8 +209,33 @@ class FrontEndApp(object): else: recorder_coll = recorder_config['source_coll'] - # TODO: support dedup - dedup_index = None + if 'dedup_index' in recorder_config: + dedup_config = recorder_config['dedup_index'] + else: + dedup_config = None + + if dedup_config: + type = dedup_config.get('type') + if type != 'redis': + msg = 'Invalid option for dedup_index: type: {0}' + raise Exception(msg.format(type)) + + dupe_policy = dedup_config.get('dupe_policy') + if dupe_policy == 'duplicate': + dupe_policy = WriteDupePolicy() + elif dupe_policy == 'revisit': + dupe_policy = WriteRevisitDupePolicy() + elif dupe_policy == 'skip': + dupe_policy = SkipDupePolicy() + else: + msg = 'Invalid option for dedup_index: dupe_policy: {0}' + raise Exception(msg.format(dupe_policy)) + + dedup_index = WritableRedisIndexer(redis_url=dedup_config.get('redis_url'), + dupe_policy=dupe_policy) + else: + dedup_index = None + warc_writer = MultiFileWARCWriter(self.warcserver.archive_paths, max_size=int(recorder_config.get('rollover_size', 1000000000)), max_idle_secs=int(recorder_config.get('rollover_idle_secs', 600)),