# Patch overview (three files):
#
#   * pywb/warcserver/warcserver.py -- in the hunk inside WarcServer, when
#     both custom_config and config contain a 'recorder' entry, a plain-string
#     value on either side is wrapped as {'source_coll': <value>}; the custom
#     recorder settings are then merged over the loaded ones and the merged
#     dict is stored back into custom_config, so the config.update(custom_config)
#     call that follows keeps the merged settings.
#   * tests/config_test_record_dedup.yaml -- new config whose recorder section
#     sets source_coll to 'live' and dedup_policy to 'skip'.
#   * tests/test_record_dedup.py -- new test module: requests the same
#     httpbin URL through /test-dedup/record/ twice (1.2 s apart), then asserts
#     that the redis key "pywb:test-dedup:cdxj" holds one entry and that the
#     single WARC file contains exactly one response/request pair.
#
# NOTE(review): line breaks and leading whitespace in this patch were
# reconstructed from a copy in which they had been collapsed. The hunk line
# counts match the headers, but confirm the indentation of the warcserver.py
# context lines (and which '+' lines are blank) against the target file
# before applying.
diff --git a/pywb/warcserver/warcserver.py b/pywb/warcserver/warcserver.py
index d6ba364a..c162437a 100644
--- a/pywb/warcserver/warcserver.py
+++ b/pywb/warcserver/warcserver.py
@@ -62,9 +62,15 @@ class WarcServer(BaseWarcServer):
             if 'proxy' in custom_config and 'proxy' in config:
                 custom_config['proxy'].update(config['proxy'])
             if 'recorder' in custom_config and 'recorder' in config:
+                if isinstance(custom_config['recorder'], str):
+                    custom_config['recorder'] = {'source_coll': custom_config['recorder']}
+
                 if isinstance(config['recorder'], str):
                     config['recorder'] = {'source_coll': config['recorder']}
+
                 config['recorder'].update(custom_config['recorder'])
+                custom_config['recorder'] = config['recorder']
+
             config.update(custom_config)
 
         super(WarcServer, self).__init__(debug=config.get('debug', False))
diff --git a/tests/config_test_record_dedup.yaml b/tests/config_test_record_dedup.yaml
new file mode 100644
index 00000000..af466146
--- /dev/null
+++ b/tests/config_test_record_dedup.yaml
@@ -0,0 +1,12 @@
+debug: true
+
+collections_root: _test_colls
+
+recorder:
+    source_coll: live
+    dedup_policy: skip
+
+collections:
+    'live': '$live'
+
+
diff --git a/tests/test_record_dedup.py b/tests/test_record_dedup.py
new file mode 100644
index 00000000..724a32cd
--- /dev/null
+++ b/tests/test_record_dedup.py
@@ -0,0 +1,52 @@
+from .base_config_test import BaseConfigTest, CollsDirMixin, BaseTestClass
+from pywb.manager.manager import main as manager
+from pywb.warcserver.test.testutils import to_path, HttpBinLiveTests, FakeRedisTests
+
+from fakeredis import FakeStrictRedis
+
+from warcio import ArchiveIterator
+
+import os
+import time
+import json
+
+import pytest
+
+
+# ============================================================================
+class TestRecordDedup(HttpBinLiveTests, CollsDirMixin, BaseConfigTest, FakeRedisTests, BaseTestClass):
+    @classmethod
+    def setup_class(cls):
+        super(TestRecordDedup, cls).setup_class('config_test_record_dedup.yaml', custom_config={'recorder': 'live'})
+        cls.redis = FakeStrictRedis.from_url("redis://localhost/0")
+
+    def test_init_coll(self):
+        manager(['init', 'test-dedup'])
+        assert os.path.isdir(os.path.join(self.root_dir, '_test_colls', 'test-dedup', 'archive'))
+
+    def test_record_1(self):
+        res = self.testapp.get('/test-dedup/record/mp_/http://httpbin.org/get?A=B', headers={"Referer": "http://httpbin.org/"})
+        assert '"A": "B"' in res.text
+
+        time.sleep(1.2)
+
+        res = self.testapp.get('/test-dedup/record/mp_/http://httpbin.org/get?A=B', headers={"Referer": "http://httpbin.org/"})
+        assert '"A": "B"' in res.text
+
+    def test_single_redis_entry(self):
+        res = self.redis.zrange("pywb:test-dedup:cdxj", 0, -1)
+        assert len(res) == 1
+
+    def test_single_warc_record(self):
+        dir_name = os.path.join(self.root_dir, '_test_colls', 'test-dedup', 'archive')
+        files = os.listdir(dir_name)
+        assert len(files) == 1
+
+        records = []
+
+        with open(os.path.join(dir_name, files[0]), 'rb') as fh:
+            for record in ArchiveIterator(fh):
+                records.append(record.rec_type)
+
+        # ensure only one response/request pair written
+        assert records == ['response', 'request']