diff --git a/pywb/recorder/filters.py b/pywb/recorder/filters.py
index 1a1e7fd4..dd8cb45c 100644
--- a/pywb/recorder/filters.py
+++ b/pywb/recorder/filters.py
@@ -59,16 +59,22 @@ class WriteDupePolicy(object):
# ============================================================================
# Skip Record Filters
# ============================================================================
-class SkipNothingFilter(object):
+class SkipDefaultFilter(object):
def skip_request(self, path, req_headers):
+ if req_headers.get('Recorder-Skip') == '1':
+ return True
+
return False
- def skip_response(self, path, req_headers, resp_headers):
+ def skip_response(self, path, req_headers, resp_headers, params):
+ if resp_headers.get('Recorder-Skip') == '1':
+ return True
+
return False
# ============================================================================
-class CollectionFilter(SkipNothingFilter):
+class CollectionFilter(SkipDefaultFilter):
def __init__(self, accept_colls):
self.rx_accept_map = {}
@@ -79,14 +85,9 @@ class CollectionFilter(SkipNothingFilter):
for name in accept_colls:
self.rx_accept_map[name] = re.compile(accept_colls[name])
- def skip_request(self, path, req_headers):
- if req_headers.get('Recorder-Skip') == '1':
- return True
-
- return False
-
- def skip_response(self, path, req_headers, resp_headers):
- if resp_headers.get('Recorder-Skip') == '1':
+ def skip_response(self, path, req_headers, resp_headers, params):
+ if super(CollectionFilter, self).skip_response(path, req_headers,
+ resp_headers, params):
return True
path = path[1:].split('/', 1)[0]
@@ -102,8 +103,12 @@ class CollectionFilter(SkipNothingFilter):
# ============================================================================
-class SkipRangeRequestFilter(SkipNothingFilter):
+class SkipRangeRequestFilter(SkipDefaultFilter):
def skip_request(self, path, req_headers):
+ if super(SkipRangeRequestFilter, self).skip_request(path,
+ req_headers):
+ return True
+
range_ = req_headers.get('Range')
if range_ and not range_.lower().startswith('bytes=0-'):
return True
diff --git a/pywb/recorder/multifilewarcwriter.py b/pywb/recorder/multifilewarcwriter.py
index 266c2a14..f0518de3 100644
--- a/pywb/recorder/multifilewarcwriter.py
+++ b/pywb/recorder/multifilewarcwriter.py
@@ -138,8 +138,8 @@ class MultiFileWARCWriter(BaseWARCWriter):
to_rec.rec_headers.add_header(name, header)
def _do_write_req_resp(self, req, resp, params):
- self._copy_header(resp, req, 'WARC-Recorded-From-URI')
- self._copy_header(resp, req, 'WARC-Recorded-On-Date')
+ self._copy_header(resp, req, 'WARC-Source-URI')
+ self._copy_header(resp, req, 'WARC-Creation-Date')
resp = self._check_revisit(resp, params)
if not resp:
diff --git a/pywb/recorder/recorderapp.py b/pywb/recorder/recorderapp.py
index 3233962d..8e164b24 100644
--- a/pywb/recorder/recorderapp.py
+++ b/pywb/recorder/recorderapp.py
@@ -225,7 +225,13 @@ class RecorderApp(object):
req_stream.out.close()
return self.send_error(e, start_response)
- start_response('200 OK', list(res.headers.items()))
+ if not skipping:
+ skipping = any(x.skip_response(path,
+ req_stream.headers,
+ res.headers,
+ params)
+ for x in self.skip_filters)
+
if not skipping:
resp_stream = RespWrapper(res.raw,
@@ -233,14 +239,15 @@ class RecorderApp(object):
req_stream,
params,
self.write_queue,
- self.skip_filters,
path,
self.create_buff_func)
+
else:
resp_stream = res.raw
resp_iter = StreamIter(resp_stream)
+ start_response('200 OK', list(res.headers.items()))
return resp_iter
@@ -267,13 +274,12 @@ class Wrapper(object):
#==============================================================================
class RespWrapper(Wrapper):
def __init__(self, stream, headers, req,
- params, queue, skip_filters, path, create_func):
+ params, queue, path, create_func):
super(RespWrapper, self).__init__(stream, params, create_func)
self.headers = headers
self.req = req
self.queue = queue
- self.skip_filters = skip_filters
self.path = path
def close(self):
@@ -299,11 +305,6 @@ class RespWrapper(Wrapper):
try:
if self.interrupted:
skipping = True
- else:
- skipping = any(x.skip_response(self.path,
- self.req.headers,
- self.headers)
- for x in self.skip_filters)
if not skipping:
entry = (self.req.headers, self.req.out,
diff --git a/pywb/warcserver/index/aggregator.py b/pywb/warcserver/index/aggregator.py
index fb80f02f..90f396cf 100644
--- a/pywb/warcserver/index/aggregator.py
+++ b/pywb/warcserver/index/aggregator.py
@@ -94,8 +94,8 @@ class BaseAggregator(object):
raise NotImplemented()
def get_source_list(self, params):
- srcs = self._iter_sources(params)
- result = [(name, str(value)) for name, value in srcs]
+ sources = self._iter_sources(params)
+ result = [(name, str(value)) for name, value in sources]
result = {'sources': dict(result)}
return result
@@ -104,30 +104,51 @@ class BaseAggregator(object):
class BaseSourceListAggregator(BaseAggregator):
def __init__(self, sources, **kwargs):
self.sources = sources
+ self.sources_key = kwargs.get('sources_key', 'sources')
+ self.invert_sources = kwargs.get('invert_sources', False)
def get_all_sources(self, params):
return self.sources
def _iter_sources(self, params):
+ invert_sources = self.invert_sources
+ sel_sources = params.get(self.sources_key)
+ if sel_sources and sel_sources[0] == '!':
+ invert_sources = True
+ sel_sources = sel_sources[1:]
+
+ if not sel_sources or sel_sources == '*':
+ if not invert_sources:
+ return six.iteritems(self.get_all_sources(params))
+ else:
+ return iter([])
+
+ if not invert_sources:
+ return self.yield_sources(sel_sources, params)
+ else:
+ return self.yield_invert_sources(sel_sources, params)
+
+ def yield_sources(self, sel_sources, params):
sources = self.get_all_sources(params)
- srcs_list = params.get('sources')
- if not srcs_list or srcs_list == '*':
- return sources.items()
+ sel_sources = tuple(sel_sources.split(','))
+ for name in sel_sources:
+ if name in sources:
+ yield (name, sources[name])
- sel_sources = tuple(srcs_list.split(','))
-
- def yield_sources(sources, sel_sources, params):
- for name in sel_sources:
+ elif ':' in name:
+ name, param = name.split(':', 1)
if name in sources:
+ params['param.' + name + '.src_coll'] = param
yield (name, sources[name])
- elif ':' in name:
- name, param = name.split(':', 1)
- if name in sources:
- params['param.' + name + '.src_coll'] = param
- yield (name, sources[name])
+ def yield_invert_sources(self, sel_sources, params):
+ sources = self.get_all_sources(params)
+ sel_sources = tuple([src.split(':', 1)[0]
+ for src in sel_sources.split(',')])
- return yield_sources(sources, sel_sources, params)
+ for name in six.iterkeys(sources):
+ if name not in sel_sources:
+ yield (name, sources[name])
#=============================================================================
diff --git a/pywb/warcserver/index/indexsource.py b/pywb/warcserver/index/indexsource.py
index 133d37e5..4bd89f23 100644
--- a/pywb/warcserver/index/indexsource.py
+++ b/pywb/warcserver/index/indexsource.py
@@ -260,6 +260,7 @@ class RedisIndexSource(BaseIndexSource):
key = res_template(member_key, params)
keys = self.redis.smembers(key)
+ params['scan:' + key] = keys
match_templ = match_templ.encode('utf-8')
diff --git a/pywb/warcserver/index/test/test_memento_agg.py b/pywb/warcserver/index/test/test_memento_agg.py
index 0c28ee71..0125106c 100644
--- a/pywb/warcserver/index/test/test_memento_agg.py
+++ b/pywb/warcserver/index/test/test_memento_agg.py
@@ -30,6 +30,10 @@ aggs = {'simple': SimpleAggregator(sources),
'gevent': GeventTimeoutAggregator(sources, timeout=5.0),
}
+aggs_inv = {'simple': SimpleAggregator(sources, invert_sources=True),
+ 'gevent': GeventTimeoutAggregator(sources, invert_sources=True, timeout=5.0),
+ }
+
agg_tm = {'gevent': GeventTimeoutAggregator(sources, timeout=0.0)}
nf = {'notfound': FileIndexSource(to_path('testdata/not-found-x'))}
@@ -105,6 +109,30 @@ class TestMemAgg(MementoOverrideTests, BaseTestClass):
assert(errs == {})
+ @pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
+ @patch('pywb.warcserver.index.indexsource.MementoIndexSource.get_timegate_links', MementoOverrideTests.mock_link_header('agg_test_5'))
+ def test_mem_agg_index_5(self, agg):
+ url = 'http://vvork.com/'
+ res, errs = agg(dict(url=url, closest='20141001', limit=2, sources='!rhiz,ait'))
+
+
+ exp = [{'timestamp': '20141018133107', 'load_url': 'http://web.archive.org/web/20141018133107id_/http://vvork.com/', 'source': 'ia'}]
+
+ assert(to_json_list(res) == exp)
+ assert(errs == {'bl': "NotFoundException('http://www.webarchive.org.uk/wayback/archive/http://vvork.com/',)"})
+
+ @pytest.mark.parametrize("agg", list(aggs_inv.values()), ids=list(aggs_inv.keys()))
+ @patch('pywb.warcserver.index.indexsource.MementoIndexSource.get_timegate_links', MementoOverrideTests.mock_link_header('agg_test_5'))
+ def test_mem_agg_index_5_inverse_preset(self, agg):
+ url = 'http://vvork.com/'
+ res, errs = agg(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait'))
+
+
+ exp = [{'timestamp': '20141018133107', 'load_url': 'http://web.archive.org/web/20141018133107id_/http://vvork.com/', 'source': 'ia'}]
+
+ assert(to_json_list(res) == exp)
+ assert(errs == {'bl': "NotFoundException('http://www.webarchive.org.uk/wayback/archive/http://vvork.com/',)"})
+
@pytest.mark.parametrize("agg", list(agg_nf.values()), ids=list(agg_nf.keys()))
def test_mem_agg_not_found(self, agg):
url = 'http://vvork.com/'
diff --git a/pywb/warcserver/resource/responseloader.py b/pywb/warcserver/resource/responseloader.py
index 5ad353e8..c4ff4fdf 100644
--- a/pywb/warcserver/resource/responseloader.py
+++ b/pywb/warcserver/resource/responseloader.py
@@ -48,11 +48,12 @@ class BaseLoader(object):
out_headers['WebAgg-Type'] = 'warc'
out_headers['Content-Type'] = 'application/warc-record'
- out_headers['WebAgg-Cdx'] = to_native_str(cdx.to_cdxj().rstrip())
- out_headers['WebAgg-Source-Coll'] = source
-
if params.get('recorder_skip'):
out_headers['Recorder-Skip'] = '1'
+ cdx['recorder_skip'] = '1'
+
+ out_headers['WebAgg-Cdx'] = to_native_str(cdx.to_cdxj().rstrip())
+ out_headers['WebAgg-Source-Coll'] = source
if not warc_headers:
if other_headers:
@@ -371,8 +372,8 @@ class LiveWebLoader(BaseLoader):
if not cdx.get('is_live'):
now = datetime.datetime.utcnow()
- warc_headers['WARC-Recorded-From-URI'] = cdx.get('load_url')
- warc_headers['WARC-Recorded-On-Date'] = datatime_to_iso_date(now)
+ warc_headers['WARC-Source-URI'] = cdx.get('load_url')
+ warc_headers['WARC-Creation-Date'] = datetime_to_iso_date(now)
if remote_ip:
warc_headers['WARC-IP-Address'] = remote_ip
diff --git a/pywb/warcserver/test/test_upstream.py b/pywb/warcserver/test/test_upstream.py
index 6b3b689d..1de91710 100644
--- a/pywb/warcserver/test/test_upstream.py
+++ b/pywb/warcserver/test/test_upstream.py
@@ -5,6 +5,10 @@ import webtest
from io import BytesIO
import requests
+from pywb.warcserver.handlers import DefaultResourceHandler
+from pywb.warcserver.index.aggregator import SimpleAggregator
+from pywb.warcserver.upstreamindexsource import UpstreamMementoIndexSource, UpstreamAggIndexSource
+
from warcio.recordloader import ArcWarcRecordLoader
from pywb.warcserver.handlers import DefaultResourceHandler
diff --git a/requirements.txt b/requirements.txt
index 61b4653a..bf94b46a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
six
-warcio==1.3
+warcio==1.3.3
chardet
requests
redis
diff --git a/sample_archive/text_content/link_headers.yaml b/sample_archive/text_content/link_headers.yaml
index c629f587..3216193b 100644
--- a/sample_archive/text_content/link_headers.yaml
+++ b/sample_archive/text_content/link_headers.yaml
@@ -28,6 +28,11 @@ agg_test_4:
'http://webenact.rhizome.org/vvork/{url}': '; rel="memento"; datetime="Mon, 06 Oct 2014 18:43:57 GMT", ; rel="original", ; rel="timemap"; type="application/link-format"'
+
+agg_test_5:
+ 'http://web.archive.org/web/{url}': '; rel="original", ; rel="memento"; datetime="Sat, 18 Oct 2014 13:31:07 GMT", ; rel="timemap"; type="application/link-format"'
+
+
select_mem_1:
'http://web.archive.org/web/{url}': '; rel="original", ; rel="timemap"; type="application/link-format", ; rel="first memento"; datetime="Sat, 27 Jul 2002 09:13:31 GMT", ; rel="prev memento"; datetime="Wed, 06 Aug 2014 16:12:28 GMT", ; rel="memento"; datetime="Sat, 18 Oct 2014 13:31:07 GMT", ; rel="next memento"; datetime="Mon, 20 Oct 2014 16:12:43 GMT", ; rel="last memento"; datetime="Thu, 27 Oct 2016 00:13:53 GMT"'