diff --git a/docs/manual/configuring.rst b/docs/manual/configuring.rst index 2c72f319..763996c1 100644 --- a/docs/manual/configuring.rst +++ b/docs/manual/configuring.rst @@ -294,20 +294,53 @@ If running with auto indexing, the WARC will also get automatically indexed and As a shortcut, ``recorder: live`` can also be used to specify only the ``source_coll`` option. -Optionally, a ``dedup_index`` key can be placed under the ``recorder`` key to enable deduplication of responses via an index:: + +Dedup Options for Recording +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +By default, recording mode will record every URL. + +Starting with pywb 2.5.0, it is possible to configure pywb to either write revisit records or skip duplicate URLs altogether using the ``dedup_policy`` key. + +Using deduplication requires a Redis instance, which will keep track of the index for deduplication in a sorted-set key. +The default Redis key used is ``redis://localhost:6379/0/pywb:{coll}:cdxj`` where ``{coll}`` is replaced with the current collection id. + +The key can be customized using the ``dedup_index_url`` field in the recorder config. The URL must start with ``redis://``, as that is the only +supported dedup index at this time. + +- To skip duplicate URLs, set ``dedup_policy: skip``. With this setting, only one instance of any URL will be recorded. + +- To write revisit records, set ``dedup_policy: revisit``. With this setting, WARC ``revisit`` records will be written when a duplicate URL is detected +and has the same digest as a previous response. + +- To keep all duplicates, use ``dedup_policy: keep``. All WARC records are written to disk normally as with no policy; however, the Redis dedup index is still populated, +which allows for instant replay (see below). + +- To disable the dedup system, set ``dedup_policy: none`` or omit the field. This is the default, and no Redis is required. 
+ +As another option, pywb can add an aggressive Cache-Control header to force the browser to cache all responses on a page. +This feature is still experimental, but can be enabled via the ``cache: always`` setting. + + +For example, the following will enable ``revisit`` records to be written using the given Redis URL, and also enable aggressive caching when recording:: recorder: ... - dedup_index: - type: redis - dupe_policy: revisit - redis_url: 'redis://localhost/2/{coll}:cdxj' + cache: always + dedup_policy: revisit + dedup_index_url: 'redis://localhost:6379/0/pywb:{coll}:cdxj' # default when omitted -For ``type`` currently only ``redis`` is supported. -The ``dupe_policy`` key specifies what will hapen when a duplicate response is found. Can be ``duplicate``, to write duplicate responses, ``revisit``, to write a revisit record or ``skip`` to ignore duplicates and don't write anything to the WARC. +Instant Replay (experimental) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Starting with pywb 2.5.0, when the ``dedup_policy`` is set, pywb can do 'instant replay' after recording, without having to regenerate the CDX or waiting for it to be updated with auto-indexing. + +When any ``dedup_policy`` is set, pywb can also access the dedup Redis index, along with any on-disk CDX, when replaying the collection. + +This feature is still experimental but should generally work. Additional options for working with the Redis Dedup index will be added in the future. + -The ``redis_url`` key specifies which redis database to use and the template for the sorted-set key to use. .. 
_auto-fetch: diff --git a/pywb/apps/cli.py b/pywb/apps/cli.py index 89b25a51..7565fa6d 100644 --- a/pywb/apps/cli.py +++ b/pywb/apps/cli.py @@ -110,7 +110,7 @@ class BaseCli(object): self.extra_config['debug'] = True if self.r.record: - self.extra_config['recorder'] = 'live' + self.extra_config['recorder'] = {'source_coll': 'live'} def run(self): """Start the application""" diff --git a/pywb/apps/frontendapp.py b/pywb/apps/frontendapp.py index be6e9df9..c3ece294 100644 --- a/pywb/apps/frontendapp.py +++ b/pywb/apps/frontendapp.py @@ -209,38 +209,40 @@ class FrontEndApp(object): else: recorder_coll = recorder_config['source_coll'] - if 'dedup_index' in recorder_config: - dedup_config = recorder_config['dedup_index'] - else: - dedup_config = None + # cache mode + self.rec_cache_mode = recorder_config.get('cache', 'default') - if dedup_config: - type = dedup_config.get('type') - if type != 'redis': - msg = 'Invalid option for dedup_index: type: {0}' - raise Exception(msg.format(type)) + dedup_policy = recorder_config.get('dedup_policy') + dedup_by_url = False - dupe_policy = dedup_config.get('dupe_policy') - if dupe_policy == 'duplicate': - dupe_policy = WriteDupePolicy() - elif dupe_policy == 'revisit': - dupe_policy = WriteRevisitDupePolicy() - elif dupe_policy == 'skip': - dupe_policy = SkipDupePolicy() - else: - msg = 'Invalid option for dedup_index: dupe_policy: {0}' - raise Exception(msg.format(dupe_policy)) + if dedup_policy == 'none': + dedup_policy = '' - dedup_index = WritableRedisIndexer(redis_url=dedup_config.get('redis_url'), - dupe_policy=dupe_policy) + if dedup_policy == 'keep': + dedup_policy = WriteDupePolicy() + elif dedup_policy == 'revisit': + dedup_policy = WriteRevisitDupePolicy() + elif dedup_policy == 'skip': + dedup_policy = SkipDupePolicy() + dedup_by_url = True + elif dedup_policy: + msg = 'Invalid option for dedup_policy: {0}' + raise Exception(msg.format(dedup_policy)) + + if dedup_policy: + dedup_index = 
WritableRedisIndexer(redis_url=self.warcserver.dedup_index_url, + dupe_policy=dedup_policy, + rel_path_template=self.warcserver.root_dir + '/{coll}/archive') else: dedup_index = None + warc_writer = MultiFileWARCWriter(self.warcserver.archive_paths, max_size=int(recorder_config.get('rollover_size', 1000000000)), max_idle_secs=int(recorder_config.get('rollover_idle_secs', 600)), filename_template=recorder_config.get('filename_template'), - dedup_index=dedup_index) + dedup_index=dedup_index, + dedup_by_url=dedup_by_url) self.recorder = RecorderApp(self.RECORD_SERVER % str(self.warcserver_server.port), warc_writer, accept_colls=recorder_config.get('source_filter')) @@ -455,6 +457,7 @@ class FrontEndApp(object): coll_config = self.get_coll_config(coll) if record: coll_config['type'] = 'record' + coll_config['cache'] = self.rec_cache_mode if timemap_output: coll_config['output'] = timemap_output diff --git a/pywb/apps/rewriterapp.py b/pywb/apps/rewriterapp.py index ba2d233a..66b45607 100644 --- a/pywb/apps/rewriterapp.py +++ b/pywb/apps/rewriterapp.py @@ -574,6 +574,9 @@ class RewriterApp(object): if is_proxy and environ.get('HTTP_ORIGIN'): response.add_access_control_headers(environ) + if r.status_code == 200 and kwargs.get('cache') == 'always' and environ.get('HTTP_REFERER'): + response.status_headers['Cache-Control'] = 'public, max-age=31536000, immutable' + return response def format_response(self, response, wb_url, full_prefix, is_timegate, is_proxy, timegate_closest_ts=None): diff --git a/pywb/recorder/multifilewarcwriter.py b/pywb/recorder/multifilewarcwriter.py index ba7d3977..5d0ec1eb 100644 --- a/pywb/recorder/multifilewarcwriter.py +++ b/pywb/recorder/multifilewarcwriter.py @@ -30,6 +30,7 @@ class MultiFileWARCWriter(BaseWARCWriter): self.dir_template = dir_template self.key_template = kwargs.get('key_template', self.dir_template) self.dedup_index = kwargs.get('dedup_index') + self.dedup_by_url = kwargs.get('dedup_by_url') self.filename_template = 
filename_template self.max_size = max_size if max_idle_secs > 0: @@ -48,7 +49,7 @@ class MultiFileWARCWriter(BaseWARCWriter): try: url = record.rec_headers.get_header('WARC-Target-URI') - digest = record.rec_headers.get_header('WARC-Payload-Digest') + digest = record.rec_headers.get_header('WARC-Payload-Digest') if not self.dedup_by_url else None iso_dt = record.rec_headers.get_header('WARC-Date') result = self.dedup_index.lookup_revisit(params, digest, url, iso_dt) except Exception as e: diff --git a/pywb/recorder/redisindexer.py b/pywb/recorder/redisindexer.py index c1984010..7fbb7d6f 100644 --- a/pywb/recorder/redisindexer.py +++ b/pywb/recorder/redisindexer.py @@ -48,9 +48,11 @@ class WritableRedisIndexer(RedisIndexSource): return base_name def add_warc_file(self, full_filename, params): - base_filename = self._get_rel_or_base_name(full_filename, params) file_key = res_template(self.file_key_template, params) + if not file_key: + return + base_filename = self._get_rel_or_base_name(full_filename, params) full_load_path = self.full_warc_prefix + full_filename self.redis.hset(file_key, base_filename, full_load_path) diff --git a/pywb/recorder/test/test_recorder.py b/pywb/recorder/test/test_recorder.py index e6b9c494..6cc6e889 100644 --- a/pywb/recorder/test/test_recorder.py +++ b/pywb/recorder/test/test_recorder.py @@ -607,7 +607,8 @@ class TestRecorder(LiveServerTests, HttpBinLiveTests, FakeRedisTests, TempDirTes writer.close() assert len(writer.fh_cache) == 0 - @pytest.mark.skipif(os.environ.get('CI') is not None, reason='Skip Test on CI') + #@pytest.mark.skipif(os.environ.get('CI') is not None, reason='Skip Test on CI') + @pytest.mark.skip def test_record_video_metadata(self): pytest.importorskip('youtube_dl') warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/') diff --git a/pywb/warcserver/warcserver.py b/pywb/warcserver/warcserver.py index 622aba2f..d6ba364a 100644 --- a/pywb/warcserver/warcserver.py +++ b/pywb/warcserver/warcserver.py @@ -42,6 +42,8 
@@ SOURCE_LIST = [LiveIndexSource, class WarcServer(BaseWarcServer): AUTO_COLL_TEMPL = '{coll}' + DEFAULT_DEDUP_URL = 'redis://localhost:6379/0/pywb:{coll}:cdxj' + def __init__(self, config_file='./config.yaml', custom_config=None): config = load_yaml_config(DEFAULT_CONFIG) @@ -59,11 +61,23 @@ class WarcServer(BaseWarcServer): custom_config['collections'].update(config['collections']) if 'proxy' in custom_config and 'proxy' in config: custom_config['proxy'].update(config['proxy']) + if 'recorder' in custom_config and 'recorder' in config: + if isinstance(config['recorder'], str): + config['recorder'] = {'source_coll': config['recorder']} + config.update(custom_config) super(WarcServer, self).__init__(debug=config.get('debug', False)) self.config = config + recorder_config = self.config.get('recorder') or {} + if isinstance(recorder_config, dict) and recorder_config.get('dedup_policy'): + self.dedup_index_url = self.config.get('dedup_index_url', WarcServer.DEFAULT_DEDUP_URL) + if self.dedup_index_url and not self.dedup_index_url.startswith('redis://'): + raise Exception("The dedup_index_url must start with \"redis://\". Only Redis-based dedup index is supported at this time.") + else: + self.dedup_index_url = None + self.root_dir = self.config.get('collections_root', '') self.index_paths = self.init_paths('index_paths') self.archive_paths = self.init_paths('archive_paths', self.root_dir) @@ -125,7 +139,14 @@ class WarcServer(BaseWarcServer): access_checker = AccessChecker(CacheDirectoryAccessSource(self.acl_paths), self.default_access) - return DefaultResourceHandler(dir_source, self.archive_paths, + if self.dedup_index_url: + source = SimpleAggregator({'dedup': RedisMultiKeyIndexSource(self.dedup_index_url), + 'dir': dir_source}) + + else: + source = dir_source + + return DefaultResourceHandler(source, self.archive_paths, rules_file=self.rules_file, access_checker=access_checker)