mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
Add configuration options for dedup (#597)
* Add configuration options for dedup Signed-off-by: Lukas Straub <lukasstraub2@web.de> * Add documentation for new dedup_index configuration options Signed-off-by: Lukas Straub <lukasstraub2@web.de>
This commit is contained in:
parent
04d0586244
commit
ddf3207e40
@ -267,7 +267,6 @@ The full set of configurable options (with their default settings) is as follows
|
||||
filename_template: my-warc-{timestamp}-{hostname}-{random}.warc.gz
|
||||
source_filter: live
|
||||
|
||||
|
||||
The required ``source_coll`` setting specifies the source collection from which to load content that will be recorded.
|
||||
Most likely this will be the :ref:`live-web` collection, which should also be defined.
|
||||
However, it could be any other collection, allowing for "extraction" from other collections or remote web archives.
|
||||
@ -295,6 +294,20 @@ If running with auto indexing, the WARC will also get automatically indexed and
|
||||
|
||||
As a shortcut, ``recorder: live`` can also be used to specify only the ``source_coll`` option.
|
||||
|
||||
Optionally, a ``dedup_index`` key can be placed under the ``recorder`` key to enable deduplication of responses via an index::
|
||||
|
||||
recorder:
|
||||
...
|
||||
dedup_index:
|
||||
type: redis
|
||||
dupe_policy: revisit
|
||||
redis_url: 'redis://localhost/2/{coll}:cdxj'
|
||||
|
||||
Currently, ``redis`` is the only supported value for ``type``.
|
||||
|
||||
The ``dupe_policy`` key specifies what will happen when a duplicate response is found. It can be ``duplicate``, to write duplicate responses; ``revisit``, to write a revisit record; or ``skip``, to ignore duplicates and write nothing to the WARC.
|
||||
|
||||
The ``redis_url`` key specifies which Redis database to use and the template for the sorted-set key.
|
||||
|
||||
.. _auto-fetch:
|
||||
|
||||
|
@ -10,6 +10,8 @@ from wsgiprox.wsgiprox import WSGIProxMiddleware
|
||||
|
||||
from pywb.recorder.multifilewarcwriter import MultiFileWARCWriter
|
||||
from pywb.recorder.recorderapp import RecorderApp
|
||||
from pywb.recorder.filters import SkipDupePolicy, WriteDupePolicy, WriteRevisitDupePolicy
|
||||
from pywb.recorder.redisindexer import WritableRedisIndexer
|
||||
|
||||
from pywb.utils.loaders import load_yaml_config
|
||||
from pywb.utils.geventserver import GeventServer
|
||||
@ -207,8 +209,33 @@ class FrontEndApp(object):
|
||||
else:
|
||||
recorder_coll = recorder_config['source_coll']
|
||||
|
||||
# TODO: support dedup
|
||||
dedup_index = None
|
||||
if 'dedup_index' in recorder_config:
|
||||
dedup_config = recorder_config['dedup_index']
|
||||
else:
|
||||
dedup_config = None
|
||||
|
||||
if dedup_config:
|
||||
type = dedup_config.get('type')
|
||||
if type != 'redis':
|
||||
msg = 'Invalid option for dedup_index: type: {0}'
|
||||
raise Exception(msg.format(type))
|
||||
|
||||
dupe_policy = dedup_config.get('dupe_policy')
|
||||
if dupe_policy == 'duplicate':
|
||||
dupe_policy = WriteDupePolicy()
|
||||
elif dupe_policy == 'revisit':
|
||||
dupe_policy = WriteRevisitDupePolicy()
|
||||
elif dupe_policy == 'skip':
|
||||
dupe_policy = SkipDupePolicy()
|
||||
else:
|
||||
msg = 'Invalid option for dedup_index: dupe_policy: {0}'
|
||||
raise Exception(msg.format(dupe_policy))
|
||||
|
||||
dedup_index = WritableRedisIndexer(redis_url=dedup_config.get('redis_url'),
|
||||
dupe_policy=dupe_policy)
|
||||
else:
|
||||
dedup_index = None
|
||||
|
||||
warc_writer = MultiFileWARCWriter(self.warcserver.archive_paths,
|
||||
max_size=int(recorder_config.get('rollover_size', 1000000000)),
|
||||
max_idle_secs=int(recorder_config.get('rollover_idle_secs', 600)),
|
||||
|
Loading…
x
Reference in New Issue
Block a user