diff --git a/docs/manual/configuring.rst b/docs/manual/configuring.rst index d65d07aa..625e0e08 100644 --- a/docs/manual/configuring.rst +++ b/docs/manual/configuring.rst @@ -334,6 +334,7 @@ The full set of configurable options (with their default settings) is as follows rollover_size: 100000000 rollover_idle_secs: 600 filename_template: my-warc-{timestamp}-{hostname}-{random}.warc.gz + source_filter: live The required ``source_coll`` setting specifies the source collection from which to load content that will be recorded. @@ -349,6 +350,9 @@ subsequent requests. This allows the WARC size to be more manageable and prevent The ``filename-template`` specifies the naming convention for WARC files, and allows a timestamp, current hostname, and random string to be inserted into the filename. +When using an aggregate collection or sequential fallback collection as the source, recording can be limited to pages +fetched from certain child collections by specifying ``source_filter`` as a regex matching the name of the sub-collection. 
+ For example, if recording with the above config into a collection called ``my-coll``, the user would access: ``http://my-archive.example.com/my-coll/record/http://example.com/``, which would load ``http://example.com/`` from the live web diff --git a/pywb/apps/frontendapp.py b/pywb/apps/frontendapp.py index 06ce556d..3b0b46dc 100644 --- a/pywb/apps/frontendapp.py +++ b/pywb/apps/frontendapp.py @@ -133,7 +133,9 @@ class FrontEndApp(object): filename_template=recorder_config.get('filename_template'), dedup_index=dedup_index) - self.recorder = RecorderApp(self.RECORD_SERVER % str(self.warcserver_server.port), warc_writer) + self.recorder = RecorderApp(self.RECORD_SERVER % str(self.warcserver_server.port), warc_writer, + accept_colls=recorder_config.get('source_filter')) + recorder_server = GeventServer(self.recorder, port=0) diff --git a/tests/test_record_replay.py b/tests/test_record_replay.py index 9caa2e80..d6a3f583 100644 --- a/tests/test_record_replay.py +++ b/tests/test_record_replay.py @@ -1,7 +1,7 @@ from .base_config_test import BaseConfigTest, fmod, CollsDirMixin from pywb.manager.manager import main as manager from pywb.manager.autoindex import AutoIndexer -from pywb.warcserver.test.testutils import to_path, HttpBinLiveTests +from pywb.warcserver.test.testutils import to_path, HttpBinLiveTests, TEST_WARC_PATH, TEST_CDX_PATH import os import time @@ -154,3 +154,48 @@ class TestRecordCustomConfig(HttpBinLiveTests, CollsDirMixin, BaseConfigTest): assert names[0].endswith('.warcgz') +# ============================================================================ +class TestRecordFilter(HttpBinLiveTests, CollsDirMixin, BaseConfigTest): + + @classmethod + def setup_class(cls): + rec_custom = {'collections': {'fallback': {'sequence': [ + { + 'index_paths': os.path.join(TEST_CDX_PATH, 'example.cdxj'), + 'archive_paths': TEST_WARC_PATH, + 'name': 'example' + },{ + 'index':'$live', + 'name': 'live' + }]}}, + 'recorder': {'source_coll': 'fallback', + 'source_filter': 
'live', + } + } + super(TestRecordFilter, cls).setup_class('config_test_record.yaml', custom_config=rec_custom) + manager(['init', 'test-new']) + + def test_skip_existing(self): + dir_name = os.path.join(self.root_dir, '_test_colls', 'test-new', 'archive') + assert os.path.isdir(dir_name) + res = self.testapp.get('/fallback/cdx?url=http://example.com/?example=1') + assert res.text != '' + + res = self.testapp.get('/test-new/record/mp_/http://example.com/?example=1') + assert 'Example Domain' in res.text + assert os.listdir(dir_name) == [] + + def test_record_new(self): + dir_name = os.path.join(self.root_dir, '_test_colls', 'test-new', 'archive') + assert os.path.isdir(dir_name) + res = self.testapp.get('/fallback/cdx?url=http://httpbin.org/get?A=B') + assert res.text == '' + + res = self.testapp.get('/test-new/record/mp_/http://httpbin.org/get?A=B') + assert res.json['args']['A'] == 'B' + names = os.listdir(dir_name) + assert len(names) == 1 + assert names[0].endswith('.warc.gz') + + +