1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Add recorder option to filter source collection (#368)

* Add source_filter option to recorder.

* Add test and docs for source_filter option.

* Update test_record_replay.py - Split source_filter test into skip existing and new recording
This commit is contained in:
eszense 2018-08-25 08:57:47 +08:00 committed by Ilya Kreymer
parent 9c44739bae
commit 6a2423e754
3 changed files with 53 additions and 2 deletions

View File

@ -334,6 +334,7 @@ The full set of configurable options (with their default settings) is as follows
rollover_size: 100000000
rollover_idle_secs: 600
filename_template: my-warc-{timestamp}-{hostname}-{random}.warc.gz
source_filter: live
The required ``source_coll`` setting specifies the source collection from which to load content that will be recorded.
@ -349,6 +350,9 @@ subsequent requests. This allows the WARC size to be more manageable and prevent
The ``filename_template`` setting specifies the naming convention for WARC files, and allows a timestamp, the current hostname, and
a random string to be inserted into the filename.
When using an aggregate collection or sequential fallback collection as the source, recording can be limited to pages
fetched from certain child collections by specifying ``source_filter`` as a regex matching the name of the sub-collection.
For example, if recording with the above config into a collection called ``my-coll``, the user would access:
``http://my-archive.example.com/my-coll/record/http://example.com/``, which would load ``http://example.com/`` from the live web

View File

@ -133,7 +133,9 @@ class FrontEndApp(object):
filename_template=recorder_config.get('filename_template'),
dedup_index=dedup_index)
self.recorder = RecorderApp(self.RECORD_SERVER % str(self.warcserver_server.port), warc_writer)
self.recorder = RecorderApp(self.RECORD_SERVER % str(self.warcserver_server.port), warc_writer,
accept_colls=recorder_config.get('source_filter'))
recorder_server = GeventServer(self.recorder, port=0)

View File

@ -1,7 +1,7 @@
from .base_config_test import BaseConfigTest, fmod, CollsDirMixin
from pywb.manager.manager import main as manager
from pywb.manager.autoindex import AutoIndexer
from pywb.warcserver.test.testutils import to_path, HttpBinLiveTests
from pywb.warcserver.test.testutils import to_path, HttpBinLiveTests, TEST_WARC_PATH, TEST_CDX_PATH
import os
import time
@ -154,3 +154,48 @@ class TestRecordCustomConfig(HttpBinLiveTests, CollsDirMixin, BaseConfigTest):
assert names[0].endswith('.warcgz')
# ============================================================================
class TestRecordFilter(HttpBinLiveTests, CollsDirMixin, BaseConfigTest):
    """Recorder tests for the ``source_filter`` option.

    The recorder's source collection, ``fallback``, is a sequence of a
    local archive source named ``example`` followed by a ``$live`` source
    named ``live``. The recorder is configured with ``source_filter`` set
    to ``live``.
    """

    @classmethod
    def setup_class(cls):
        """Start the app with the custom recorder config and init ``test-new``."""
        archived_source = {
            'index_paths': os.path.join(TEST_CDX_PATH, 'example.cdxj'),
            'archive_paths': TEST_WARC_PATH,
            'name': 'example',
        }
        live_source = {
            'index': '$live',
            'name': 'live',
        }

        custom_config = {
            'collections': {
                'fallback': {'sequence': [archived_source, live_source]},
            },
            'recorder': {
                'source_coll': 'fallback',
                'source_filter': 'live',
            },
        }

        super(TestRecordFilter, cls).setup_class('config_test_record.yaml',
                                                 custom_config=custom_config)

        # Create the collection that the record requests below write into.
        manager(['init', 'test-new'])

    def test_skip_existing(self):
        """A URL already indexed in ``fallback`` is served but no WARC is written."""
        archive_dir = os.path.join(self.root_dir, '_test_colls', 'test-new', 'archive')
        assert os.path.isdir(archive_dir)

        # The source collection already has an index entry for this URL.
        cdx_res = self.testapp.get('/fallback/cdx?url=http://example.com/?example=1')
        assert cdx_res.text != ''

        page_res = self.testapp.get('/test-new/record/mp_/http://example.com/?example=1')
        assert 'Example Domain' in page_res.text

        # Nothing may have been recorded: the archive directory must still
        # be empty at this point.
        assert not os.listdir(archive_dir)

    def test_record_new(self):
        """A URL with no index entry in ``fallback`` is recorded to one WARC."""
        archive_dir = os.path.join(self.root_dir, '_test_colls', 'test-new', 'archive')
        assert os.path.isdir(archive_dir)

        # The source collection has no index entry for this URL.
        cdx_res = self.testapp.get('/fallback/cdx?url=http://httpbin.org/get?A=B')
        assert cdx_res.text == ''

        live_res = self.testapp.get('/test-new/record/mp_/http://httpbin.org/get?A=B')
        assert live_res.json['args']['A'] == 'B'

        # Exactly one WARC file should now exist in the archive directory.
        warc_names = os.listdir(archive_dir)
        assert len(warc_names) == 1
        assert warc_names[0].endswith('.warc.gz')