mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
Add recorder option to filter source collection (#368)
* Add source_filter option to recorder. * Add test and docs for source_filter option. * Update test_record_replay.py - Split source_filter test into skip existing and new recording
This commit is contained in:
parent
9c44739bae
commit
6a2423e754
@ -334,6 +334,7 @@ The full set of configurable options (with their default settings) is as follows
|
|||||||
rollover_size: 100000000
|
rollover_size: 100000000
|
||||||
rollover_idle_secs: 600
|
rollover_idle_secs: 600
|
||||||
filename_template: my-warc-{timestamp}-{hostname}-{random}.warc.gz
|
filename_template: my-warc-{timestamp}-{hostname}-{random}.warc.gz
|
||||||
|
source_filter: live
|
||||||
|
|
||||||
|
|
||||||
The required ``source_coll`` setting specifies the source collection from which to load content that will be recorded.
|
The required ``source_coll`` setting specifies the source collection from which to load content that will be recorded.
|
||||||
@ -349,6 +350,9 @@ subsequent requests. This allows the WARC size to be more manageable and prevent
|
|||||||
The ``filename_template`` setting specifies the naming convention for WARC files, and allows a timestamp, current hostname, and
|
The ``filename_template`` setting specifies the naming convention for WARC files, and allows a timestamp, current hostname, and
|
||||||
random string to be inserted into the filename.
|
random string to be inserted into the filename.
|
||||||
|
|
||||||
|
When using an aggregate collection or sequential fallback collection as the source, recording can be limited to pages
|
||||||
|
fetched from certain child collections by specifying ``source_filter`` as a regex matching the name of the sub-collection.
|
||||||
|
|
||||||
For example, if recording with the above config into a collection called ``my-coll``, the user would access:
|
For example, if recording with the above config into a collection called ``my-coll``, the user would access:
|
||||||
|
|
||||||
``http://my-archive.example.com/my-coll/record/http://example.com/``, which would load ``http://example.com/`` from the live web
|
``http://my-archive.example.com/my-coll/record/http://example.com/``, which would load ``http://example.com/`` from the live web
|
||||||
|
@ -133,7 +133,9 @@ class FrontEndApp(object):
|
|||||||
filename_template=recorder_config.get('filename_template'),
|
filename_template=recorder_config.get('filename_template'),
|
||||||
dedup_index=dedup_index)
|
dedup_index=dedup_index)
|
||||||
|
|
||||||
self.recorder = RecorderApp(self.RECORD_SERVER % str(self.warcserver_server.port), warc_writer)
|
self.recorder = RecorderApp(self.RECORD_SERVER % str(self.warcserver_server.port), warc_writer,
|
||||||
|
accept_colls=recorder_config.get('source_filter'))
|
||||||
|
|
||||||
|
|
||||||
recorder_server = GeventServer(self.recorder, port=0)
|
recorder_server = GeventServer(self.recorder, port=0)
|
||||||
|
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
from .base_config_test import BaseConfigTest, fmod, CollsDirMixin
|
from .base_config_test import BaseConfigTest, fmod, CollsDirMixin
|
||||||
from pywb.manager.manager import main as manager
|
from pywb.manager.manager import main as manager
|
||||||
from pywb.manager.autoindex import AutoIndexer
|
from pywb.manager.autoindex import AutoIndexer
|
||||||
from pywb.warcserver.test.testutils import to_path, HttpBinLiveTests
|
from pywb.warcserver.test.testutils import to_path, HttpBinLiveTests, TEST_WARC_PATH, TEST_CDX_PATH
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
@ -154,3 +154,48 @@ class TestRecordCustomConfig(HttpBinLiveTests, CollsDirMixin, BaseConfigTest):
|
|||||||
assert names[0].endswith('.warcgz')
|
assert names[0].endswith('.warcgz')
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
class TestRecordFilter(HttpBinLiveTests, CollsDirMixin, BaseConfigTest):
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def setup_class(cls):
|
||||||
|
rec_custom = {'collections': {'fallback': {'sequence': [
|
||||||
|
{
|
||||||
|
'index_paths': os.path.join(TEST_CDX_PATH, 'example.cdxj'),
|
||||||
|
'archive_paths': TEST_WARC_PATH,
|
||||||
|
'name': 'example'
|
||||||
|
},{
|
||||||
|
'index':'$live',
|
||||||
|
'name': 'live'
|
||||||
|
}]}},
|
||||||
|
'recorder': {'source_coll': 'fallback',
|
||||||
|
'source_filter': 'live',
|
||||||
|
}
|
||||||
|
}
|
||||||
|
super(TestRecordFilter, cls).setup_class('config_test_record.yaml', custom_config=rec_custom)
|
||||||
|
manager(['init', 'test-new'])
|
||||||
|
|
||||||
|
def test_skip_existing(self):
|
||||||
|
dir_name = os.path.join(self.root_dir, '_test_colls', 'test-new', 'archive')
|
||||||
|
assert os.path.isdir(dir_name)
|
||||||
|
res = self.testapp.get('/fallback/cdx?url=http://example.com/?example=1')
|
||||||
|
assert res.text != ''
|
||||||
|
|
||||||
|
res = self.testapp.get('/test-new/record/mp_/http://example.com/?example=1')
|
||||||
|
assert 'Example Domain' in res.text
|
||||||
|
assert os.listdir(dir_name) == []
|
||||||
|
|
||||||
|
def test_record_new(self):
|
||||||
|
dir_name = os.path.join(self.root_dir, '_test_colls', 'test-new', 'archive')
|
||||||
|
assert os.path.isdir(dir_name)
|
||||||
|
res = self.testapp.get('/fallback/cdx?url=http://httpbin.org/get?A=B')
|
||||||
|
assert res.text == ''
|
||||||
|
|
||||||
|
res = self.testapp.get('/test-new/record/mp_/http://httpbin.org/get?A=B')
|
||||||
|
assert res.json['args']['A'] == 'B'
|
||||||
|
names = os.listdir(dir_name)
|
||||||
|
assert len(names) == 1
|
||||||
|
assert names[0].endswith('.warc.gz')
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user