1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

aggregator: support 'invert_sources' option to treat the source list as an exclusion list, rather than an inclusion list

can be set explicitly or via a '!' prefix on the sources list value
tests: test invert sources
filters: pass params through to the skip_response() filter
warc headers: change headers for recording from other source to: WARC-Source-URI and WARC-Creation-Date
This commit is contained in:
Ilya Kreymer 2017-06-01 07:45:02 -07:00
parent f2c2829f49
commit 06b1134be5
7 changed files with 77 additions and 22 deletions

View File

@ -63,7 +63,7 @@ class SkipNothingFilter(object):
def skip_request(self, path, req_headers):
return False
def skip_response(self, path, req_headers, resp_headers):
def skip_response(self, path, req_headers, resp_headers, params):
return False
@ -85,7 +85,7 @@ class CollectionFilter(SkipNothingFilter):
return False
def skip_response(self, path, req_headers, resp_headers):
def skip_response(self, path, req_headers, resp_headers, params):
if resp_headers.get('Recorder-Skip') == '1':
return True

View File

@ -138,8 +138,8 @@ class MultiFileWARCWriter(BaseWARCWriter):
to_rec.rec_headers.add_header(name, header)
def _do_write_req_resp(self, req, resp, params):
self._copy_header(resp, req, 'WARC-Recorded-From-URI')
self._copy_header(resp, req, 'WARC-Recorded-On-Date')
self._copy_header(resp, req, 'WARC-Source-URI')
self._copy_header(resp, req, 'WARC-Creation-Date')
resp = self._check_revisit(resp, params)
if not resp:

View File

@ -302,7 +302,8 @@ class RespWrapper(Wrapper):
else:
skipping = any(x.skip_response(self.path,
self.req.headers,
self.headers)
self.headers,
self.params)
for x in self.skip_filters)
if not skipping:

View File

@ -95,8 +95,8 @@ class BaseAggregator(object):
raise NotImplemented()
def get_source_list(self, params):
srcs = self._iter_sources(params)
result = [(name, str(value)) for name, value in srcs]
sources = self._iter_sources(params)
result = [(name, str(value)) for name, value in sources]
result = {'sources': dict(result)}
return result
@ -105,30 +105,51 @@ class BaseAggregator(object):
class BaseSourceListAggregator(BaseAggregator):
def __init__(self, sources, **kwargs):
self.sources = sources
self.sources_key = kwargs.get('sources_key', 'sources')
self.invert_sources = kwargs.get('invert_sources', False)
def get_all_sources(self, params):
return self.sources
def _iter_sources(self, params):
invert_sources = self.invert_sources
sel_sources = params.get(self.sources_key)
if sel_sources and sel_sources[0] == '!':
invert_sources = True
sel_sources = sel_sources[1:]
if not sel_sources or sel_sources == '*':
if not invert_sources:
return six.iteritems(self.get_all_sources(params))
else:
return iter([])
if not invert_sources:
return self.yield_sources(sel_sources, params)
else:
return self.yield_invert_sources(sel_sources, params)
def yield_sources(self, sel_sources, params):
sources = self.get_all_sources(params)
srcs_list = params.get('sources')
if not srcs_list or srcs_list == '*':
return sources.items()
sel_sources = tuple(sel_sources.split(','))
for name in sel_sources:
if name in sources:
yield (name, sources[name])
sel_sources = tuple(srcs_list.split(','))
def yield_sources(sources, sel_sources, params):
for name in sel_sources:
elif ':' in name:
name, param = name.split(':', 1)
if name in sources:
params['param.' + name + '.src_coll'] = param
yield (name, sources[name])
elif ':' in name:
name, param = name.split(':', 1)
if name in sources:
params['param.' + name + '.src_coll'] = param
yield (name, sources[name])
def yield_invert_sources(self, sel_sources, params):
sources = self.get_all_sources(params)
sel_sources = tuple([src.split(':', 1)[0]
for src in sel_sources.split(',')])
return yield_sources(sources, sel_sources, params)
for name in six.iterkeys(sources):
if name not in sel_sources:
yield (name, sources[name])
#=============================================================================

View File

@ -438,8 +438,8 @@ class LiveWebLoader(BaseLoader):
if not cdx.get('is_live'):
now = datetime.datetime.utcnow()
warc_headers['WARC-Recorded-From-URI'] = cdx.get('load_url')
warc_headers['WARC-Recorded-On-Date'] = datetime_to_iso_date(now)
warc_headers['WARC-Source-URI'] = cdx.get('load_url')
warc_headers['WARC-Creation-Date'] = datetime_to_iso_date(now)
if remote_ip:
warc_headers['WARC-IP-Address'] = remote_ip

View File

@ -29,6 +29,10 @@ aggs = {'simple': SimpleAggregator(sources),
'gevent': GeventTimeoutAggregator(sources, timeout=5.0),
}
aggs_inv = {'simple': SimpleAggregator(sources, invert_sources=True),
'gevent': GeventTimeoutAggregator(sources, invert_sources=True, timeout=5.0),
}
agg_tm = {'gevent': GeventTimeoutAggregator(sources, timeout=0.0)}
nf = {'notfound': FileIndexSource(to_path('testdata/not-found-x'))}
@ -104,6 +108,30 @@ class TestMemAgg(MementoOverrideTests, BaseTestClass):
assert(errs == {})
@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
@patch('pywb.webagg.indexsource.MementoIndexSource.get_timegate_links', MementoOverrideTests.mock_link_header('agg_test_5'))
def test_mem_agg_index_5(self, agg):
url = 'http://vvork.com/'
res, errs = agg(dict(url=url, closest='20141001', limit=2, sources='!rhiz,ait'))
exp = [{'timestamp': '20141018133107', 'load_url': 'http://web.archive.org/web/20141018133107id_/http://vvork.com/', 'source': 'ia'}]
assert(to_json_list(res) == exp)
assert(errs == {'bl': "NotFoundException('http://www.webarchive.org.uk/wayback/archive/http://vvork.com/',)"})
@pytest.mark.parametrize("agg", list(aggs_inv.values()), ids=list(aggs_inv.keys()))
@patch('pywb.webagg.indexsource.MementoIndexSource.get_timegate_links', MementoOverrideTests.mock_link_header('agg_test_5'))
def test_mem_agg_index_5_inverse_preset(self, agg):
url = 'http://vvork.com/'
res, errs = agg(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait'))
exp = [{'timestamp': '20141018133107', 'load_url': 'http://web.archive.org/web/20141018133107id_/http://vvork.com/', 'source': 'ia'}]
assert(to_json_list(res) == exp)
assert(errs == {'bl': "NotFoundException('http://www.webarchive.org.uk/wayback/archive/http://vvork.com/',)"})
@pytest.mark.parametrize("agg", list(agg_nf.values()), ids=list(agg_nf.keys()))
def test_mem_agg_not_found(self, agg):
url = 'http://vvork.com/'

View File

@ -28,6 +28,11 @@ agg_test_4:
'http://webenact.rhizome.org/vvork/{url}': '<http://webenact.rhizome.org/vvork/20141006184357/http://www.vvork.com/>; rel="memento"; datetime="Mon, 06 Oct 2014 18:43:57 GMT", <http://www.vvork.com/>; rel="original", <http://webenact.rhizome.org/vvork/timemap/*/http://www.vvork.com/>; rel="timemap"; type="application/link-format"'
agg_test_5:
'http://web.archive.org/web/{url}': '<http://vvork.com/>; rel="original", <http://web.archive.org/web/20141018133107/http://www.vvork.com/>; rel="memento"; datetime="Sat, 18 Oct 2014 13:31:07 GMT", <http://web.archive.org/web/timemap/link/http://vvork.com/>; rel="timemap"; type="application/link-format"'
select_mem_1:
'http://web.archive.org/web/{url}': '<http://vvork.com/>; rel="original", <http://web.archive.org/web/timemap/link/http://vvork.com/>; rel="timemap"; type="application/link-format", <http://web.archive.org/web/20020727091331/http://vvork.com/>; rel="first memento"; datetime="Sat, 27 Jul 2002 09:13:31 GMT", <http://web.archive.org/web/20140806161228/http://vvork.com/>; rel="prev memento"; datetime="Wed, 06 Aug 2014 16:12:28 GMT", <http://web.archive.org/web/20141018133107/http://vvork.com/>; rel="memento"; datetime="Sat, 18 Oct 2014 13:31:07 GMT", <http://web.archive.org/web/20141020161243/http://vvork.com/>; rel="next memento"; datetime="Mon, 20 Oct 2014 16:12:43 GMT", <http://web.archive.org/web/20161027001353/http://vvork.com/>; rel="last memento"; datetime="Thu, 27 Oct 2016 00:13:53 GMT"'