mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
Add recorder option to filter source collection (#368)
* Add source_filter option to recorder. * Add test and docs for source_filter option. * Update test_record_replay.py - Split source_filter test into skip existing and new recording
This commit is contained in:
parent
9c44739bae
commit
6a2423e754
@ -334,6 +334,7 @@ The full set of configurable options (with their default settings) is as follows
|
||||
rollover_size: 100000000
|
||||
rollover_idle_secs: 600
|
||||
filename_template: my-warc-{timestamp}-{hostname}-{random}.warc.gz
|
||||
source_filter: live
|
||||
|
||||
|
||||
The required ``source_coll`` setting specifies the source collection from which to load content that will be recorded.
|
||||
@ -349,6 +350,9 @@ subsequent requests. This allows the WARC size to be more manageable and prevent
|
||||
The ``filename_template`` setting specifies the naming convention for WARC files, and allows a timestamp, current hostname, and
|
||||
random string to be inserted into the filename.
|
||||
|
||||
When using an aggregate collection or sequential fallback collection as the source, recording can be limited to pages
|
||||
fetched from a certain child collection by specifying ``source_filter`` as a regex matching the name of the sub-collection.
|
||||
|
||||
For example, if recording with the above config into a collection called ``my-coll``, the user would access:
|
||||
|
||||
``http://my-archive.example.com/my-coll/record/http://example.com/``, which would load ``http://example.com/`` from the live web
|
||||
|
@ -133,7 +133,9 @@ class FrontEndApp(object):
|
||||
filename_template=recorder_config.get('filename_template'),
|
||||
dedup_index=dedup_index)
|
||||
|
||||
self.recorder = RecorderApp(self.RECORD_SERVER % str(self.warcserver_server.port), warc_writer)
|
||||
self.recorder = RecorderApp(self.RECORD_SERVER % str(self.warcserver_server.port), warc_writer,
|
||||
accept_colls=recorder_config.get('source_filter'))
|
||||
|
||||
|
||||
recorder_server = GeventServer(self.recorder, port=0)
|
||||
|
||||
|
@ -1,7 +1,7 @@
|
||||
from .base_config_test import BaseConfigTest, fmod, CollsDirMixin
|
||||
from pywb.manager.manager import main as manager
|
||||
from pywb.manager.autoindex import AutoIndexer
|
||||
from pywb.warcserver.test.testutils import to_path, HttpBinLiveTests
|
||||
from pywb.warcserver.test.testutils import to_path, HttpBinLiveTests, TEST_WARC_PATH, TEST_CDX_PATH
|
||||
|
||||
import os
|
||||
import time
|
||||
@ -154,3 +154,48 @@ class TestRecordCustomConfig(HttpBinLiveTests, CollsDirMixin, BaseConfigTest):
|
||||
assert names[0].endswith('.warcgz')
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class TestRecordFilter(HttpBinLiveTests, CollsDirMixin, BaseConfigTest):
    """Recording tests for the recorder ``source_filter`` option.

    The app is configured with a ``fallback`` sequence collection made of
    a source named ``example`` (backed by the test CDXJ index and WARCs)
    followed by a source named ``live``; the recorder reads from
    ``fallback`` with ``source_filter`` set to ``live``.
    """

    @classmethod
    def setup_class(cls):
        """Build the custom config, start the test app and init ``test-new``."""
        archived_source = {
            'index_paths': os.path.join(TEST_CDX_PATH, 'example.cdxj'),
            'archive_paths': TEST_WARC_PATH,
            'name': 'example',
        }
        live_source = {
            'index': '$live',
            'name': 'live',
        }
        custom_config = {
            'collections': {
                'fallback': {'sequence': [archived_source, live_source]},
            },
            'recorder': {
                'source_coll': 'fallback',
                'source_filter': 'live',
            },
        }

        super(TestRecordFilter, cls).setup_class('config_test_record.yaml',
                                                 custom_config=custom_config)
        manager(['init', 'test-new'])

    def test_skip_existing(self):
        """A URL already listed by the ``fallback`` index leaves the archive dir empty."""
        archive_dir = os.path.join(self.root_dir, '_test_colls', 'test-new', 'archive')
        assert os.path.isdir(archive_dir)

        # The CDX query returns a non-empty result for this URL.
        cdx_res = self.testapp.get('/fallback/cdx?url=http://example.com/?example=1')
        assert cdx_res.text != ''

        # The page is still served through the record endpoint ...
        page_res = self.testapp.get('/test-new/record/mp_/http://example.com/?example=1')
        assert 'Example Domain' in page_res.text

        # ... but no WARC file has been written.
        assert os.listdir(archive_dir) == []

    def test_record_new(self):
        """A URL absent from the ``fallback`` index produces exactly one WARC file."""
        archive_dir = os.path.join(self.root_dir, '_test_colls', 'test-new', 'archive')
        assert os.path.isdir(archive_dir)

        # The CDX query returns nothing for this URL.
        cdx_res = self.testapp.get('/fallback/cdx?url=http://httpbin.org/get?A=B')
        assert cdx_res.text == ''

        page_res = self.testapp.get('/test-new/record/mp_/http://httpbin.org/get?A=B')
        assert page_res.json['args']['A'] == 'B'

        warc_names = os.listdir(archive_dir)
        assert len(warc_names) == 1
        assert warc_names[0].endswith('.warc.gz')
|
||||
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user