diff --git a/pywb/recorder/filters.py b/pywb/recorder/filters.py index 1a1e7fd4..dd8cb45c 100644 --- a/pywb/recorder/filters.py +++ b/pywb/recorder/filters.py @@ -59,16 +59,22 @@ class WriteDupePolicy(object): # ============================================================================ # Skip Record Filters # ============================================================================ -class SkipNothingFilter(object): +class SkipDefaultFilter(object): def skip_request(self, path, req_headers): + if req_headers.get('Recorder-Skip') == '1': + return True + return False - def skip_response(self, path, req_headers, resp_headers): + def skip_response(self, path, req_headers, resp_headers, params): + if resp_headers.get('Recorder-Skip') == '1': + return True + return False # ============================================================================ -class CollectionFilter(SkipNothingFilter): +class CollectionFilter(SkipDefaultFilter): def __init__(self, accept_colls): self.rx_accept_map = {} @@ -79,14 +85,9 @@ class CollectionFilter(SkipNothingFilter): for name in accept_colls: self.rx_accept_map[name] = re.compile(accept_colls[name]) - def skip_request(self, path, req_headers): - if req_headers.get('Recorder-Skip') == '1': - return True - - return False - - def skip_response(self, path, req_headers, resp_headers): - if resp_headers.get('Recorder-Skip') == '1': + def skip_response(self, path, req_headers, resp_headers, params): + if super(CollectionFilter, self).skip_response(path, req_headers, + resp_headers, params): return True path = path[1:].split('/', 1)[0] @@ -102,8 +103,12 @@ class CollectionFilter(SkipNothingFilter): # ============================================================================ -class SkipRangeRequestFilter(SkipNothingFilter): +class SkipRangeRequestFilter(SkipDefaultFilter): def skip_request(self, path, req_headers): + if super(SkipRangeRequestFilter, self).skip_request(path, + req_headers): + return True + range_ = req_headers.get('Range') if range_ and not range_.lower().startswith('bytes=0-'): return True diff --git a/pywb/recorder/multifilewarcwriter.py b/pywb/recorder/multifilewarcwriter.py index 266c2a14..f0518de3 100644 --- a/pywb/recorder/multifilewarcwriter.py +++ b/pywb/recorder/multifilewarcwriter.py @@ -138,8 +138,8 @@ class MultiFileWARCWriter(BaseWARCWriter): to_rec.rec_headers.add_header(name, header) def _do_write_req_resp(self, req, resp, params): - self._copy_header(resp, req, 'WARC-Recorded-From-URI') - self._copy_header(resp, req, 'WARC-Recorded-On-Date') + self._copy_header(resp, req, 'WARC-Source-URI') + self._copy_header(resp, req, 'WARC-Creation-Date') resp = self._check_revisit(resp, params) if not resp: diff --git a/pywb/recorder/recorderapp.py b/pywb/recorder/recorderapp.py index 3233962d..8e164b24 100644 --- a/pywb/recorder/recorderapp.py +++ b/pywb/recorder/recorderapp.py @@ -225,7 +225,13 @@ class RecorderApp(object): req_stream.out.close() return self.send_error(e, start_response) - start_response('200 OK', list(res.headers.items())) + if not skipping: + skipping = any(x.skip_response(path, + req_stream.headers, + res.headers, + params) + for x in self.skip_filters) + if not skipping: resp_stream = RespWrapper(res.raw, @@ -233,14 +239,15 @@ class RecorderApp(object): req_stream, params, self.write_queue, - self.skip_filters, path, self.create_buff_func) + else: resp_stream = res.raw resp_iter = StreamIter(resp_stream) + start_response('200 OK', list(res.headers.items())) return resp_iter @@ -267,13 +274,12 @@ class Wrapper(object): #============================================================================== class RespWrapper(Wrapper): def __init__(self, stream, headers, req, - params, queue, skip_filters, path, create_func): + params, queue, path, create_func): super(RespWrapper, self).__init__(stream, params, create_func) self.headers = headers self.req = req self.queue = queue - self.skip_filters = skip_filters self.path = path def close(self): @@ -299,11 +305,6 @@ class RespWrapper(Wrapper): try: if self.interrupted: skipping = True - else: - skipping = any(x.skip_response(self.path, - self.req.headers, - self.headers) - for x in self.skip_filters) if not skipping: entry = (self.req.headers, self.req.out, diff --git a/pywb/warcserver/index/aggregator.py b/pywb/warcserver/index/aggregator.py index fb80f02f..90f396cf 100644 --- a/pywb/warcserver/index/aggregator.py +++ b/pywb/warcserver/index/aggregator.py @@ -94,8 +94,8 @@ class BaseAggregator(object): raise NotImplemented() def get_source_list(self, params): - srcs = self._iter_sources(params) - result = [(name, str(value)) for name, value in srcs] + sources = self._iter_sources(params) + result = [(name, str(value)) for name, value in sources] result = {'sources': dict(result)} return result @@ -104,30 +104,51 @@ class BaseAggregator(object): class BaseSourceListAggregator(BaseAggregator): def __init__(self, sources, **kwargs): self.sources = sources + self.sources_key = kwargs.get('sources_key', 'sources') + self.invert_sources = kwargs.get('invert_sources', False) def get_all_sources(self, params): return self.sources def _iter_sources(self, params): + invert_sources = self.invert_sources + sel_sources = params.get(self.sources_key) + if sel_sources and sel_sources[0] == '!': + invert_sources = True + sel_sources = sel_sources[1:] + + if not sel_sources or sel_sources == '*': + if not invert_sources: + return six.iteritems(self.get_all_sources(params)) + else: + return iter([]) + + if not invert_sources: + return self.yield_sources(sel_sources, params) + else: + return self.yield_invert_sources(sel_sources, params) + + def yield_sources(self, sel_sources, params): sources = self.get_all_sources(params) - srcs_list = params.get('sources') - if not srcs_list or srcs_list == '*': - return sources.items() + sel_sources = tuple(sel_sources.split(',')) + for name in sel_sources: + if name in sources: + yield (name, sources[name]) - sel_sources = tuple(srcs_list.split(',')) - - def yield_sources(sources, sel_sources, params): - for name in sel_sources: + elif ':' in name: + name, param = name.split(':', 1) if name in sources: + params['param.' + name + '.src_coll'] = param yield (name, sources[name]) - elif ':' in name: - name, param = name.split(':', 1) - if name in sources: - params['param.' + name + '.src_coll'] = param - yield (name, sources[name]) + def yield_invert_sources(self, sel_sources, params): + sources = self.get_all_sources(params) + sel_sources = tuple([src.split(':', 1)[0] + for src in sel_sources.split(',')]) - return yield_sources(sources, sel_sources, params) + for name in six.iterkeys(sources): + if name not in sel_sources: + yield (name, sources[name]) #============================================================================= diff --git a/pywb/warcserver/index/indexsource.py b/pywb/warcserver/index/indexsource.py index 133d37e5..4bd89f23 100644 --- a/pywb/warcserver/index/indexsource.py +++ b/pywb/warcserver/index/indexsource.py @@ -260,6 +260,7 @@ class RedisIndexSource(BaseIndexSource): key = res_template(member_key, params) keys = self.redis.smembers(key) + params['scan:' + key] = keys match_templ = match_templ.encode('utf-8') diff --git a/pywb/warcserver/index/test/test_memento_agg.py b/pywb/warcserver/index/test/test_memento_agg.py index 0c28ee71..0125106c 100644 --- a/pywb/warcserver/index/test/test_memento_agg.py +++ b/pywb/warcserver/index/test/test_memento_agg.py @@ -30,6 +30,10 @@ aggs = {'simple': SimpleAggregator(sources), 'gevent': GeventTimeoutAggregator(sources, timeout=5.0), } +aggs_inv = {'simple': SimpleAggregator(sources, invert_sources=True), + 'gevent': GeventTimeoutAggregator(sources, invert_sources=True, timeout=5.0), + } + agg_tm = {'gevent': GeventTimeoutAggregator(sources, timeout=0.0)} nf = {'notfound': FileIndexSource(to_path('testdata/not-found-x'))} @@ -105,6 +109,30 @@ class TestMemAgg(MementoOverrideTests, BaseTestClass): assert(errs == {}) + @pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys())) + @patch('pywb.webagg.indexsource.MementoIndexSource.get_timegate_links', MementoOverrideTests.mock_link_header('agg_test_5')) + def test_mem_agg_index_5(self, agg): + url = 'http://vvork.com/' + res, errs = agg(dict(url=url, closest='20141001', limit=2, sources='!rhiz,ait')) + + + exp = [{'timestamp': '20141018133107', 'load_url': 'http://web.archive.org/web/20141018133107id_/http://vvork.com/', 'source': 'ia'}] + + assert(to_json_list(res) == exp) + assert(errs == {'bl': "NotFoundException('http://www.webarchive.org.uk/wayback/archive/http://vvork.com/',)"}) + + @pytest.mark.parametrize("agg", list(aggs_inv.values()), ids=list(aggs_inv.keys())) + @patch('pywb.webagg.indexsource.MementoIndexSource.get_timegate_links', MementoOverrideTests.mock_link_header('agg_test_5')) + def test_mem_agg_index_5_inverse_preset(self, agg): + url = 'http://vvork.com/' + res, errs = agg(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait')) + + + exp = [{'timestamp': '20141018133107', 'load_url': 'http://web.archive.org/web/20141018133107id_/http://vvork.com/', 'source': 'ia'}] + + assert(to_json_list(res) == exp) + assert(errs == {'bl': "NotFoundException('http://www.webarchive.org.uk/wayback/archive/http://vvork.com/',)"}) + @pytest.mark.parametrize("agg", list(agg_nf.values()), ids=list(agg_nf.keys())) def test_mem_agg_not_found(self, agg): url = 'http://vvork.com/' diff --git a/pywb/warcserver/resource/responseloader.py b/pywb/warcserver/resource/responseloader.py index 5ad353e8..c4ff4fdf 100644 --- a/pywb/warcserver/resource/responseloader.py +++ b/pywb/warcserver/resource/responseloader.py @@ -48,11 +48,12 @@ class BaseLoader(object): out_headers['WebAgg-Type'] = 'warc' out_headers['Content-Type'] = 'application/warc-record' - out_headers['WebAgg-Cdx'] = to_native_str(cdx.to_cdxj().rstrip()) - out_headers['WebAgg-Source-Coll'] = source - if params.get('recorder_skip'): out_headers['Recorder-Skip'] = '1' + cdx['recorder_skip'] = '1' + + out_headers['WebAgg-Cdx'] = to_native_str(cdx.to_cdxj().rstrip()) + out_headers['WebAgg-Source-Coll'] = source if not warc_headers: if other_headers: @@ -371,8 +372,8 @@ class LiveWebLoader(BaseLoader): if not cdx.get('is_live'): now = datetime.datetime.utcnow() - warc_headers['WARC-Recorded-From-URI'] = cdx.get('load_url') - warc_headers['WARC-Recorded-On-Date'] = datatime_to_iso_date(now) + warc_headers['WARC-Source-URI'] = cdx.get('load_url') + warc_headers['WARC-Creation-Date'] = datetime_to_iso_date(now) if remote_ip: warc_headers['WARC-IP-Address'] = remote_ip diff --git a/pywb/warcserver/test/test_upstream.py b/pywb/warcserver/test/test_upstream.py index 6b3b689d..1de91710 100644 --- a/pywb/warcserver/test/test_upstream.py +++ b/pywb/warcserver/test/test_upstream.py @@ -5,6 +5,10 @@ import webtest from io import BytesIO import requests +from pywb.webagg.handlers import DefaultResourceHandler +from pywb.webagg.aggregator import SimpleAggregator +from pywb.webagg.upstreamindexsource import UpstreamMementoIndexSource, UpstreamAggIndexSource + from warcio.recordloader import ArcWarcRecordLoader from pywb.warcserver.handlers import DefaultResourceHandler diff --git a/requirements.txt b/requirements.txt index 61b4653a..bf94b46a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ six -warcio==1.3 +warcio==1.3.3 chardet requests redis diff --git a/sample_archive/text_content/link_headers.yaml b/sample_archive/text_content/link_headers.yaml index c629f587..3216193b 100644 --- a/sample_archive/text_content/link_headers.yaml +++ b/sample_archive/text_content/link_headers.yaml @@ -28,6 +28,11 @@ agg_test_4: 'http://webenact.rhizome.org/vvork/{url}': '; rel="memento"; datetime="Mon, 06 Oct 2014 18:43:57 GMT", ; rel="original", ; rel="timemap"; type="application/link-format"' + +agg_test_5: + 'http://web.archive.org/web/{url}': '; rel="original", ; rel="memento"; datetime="Sat, 18 Oct 2014 13:31:07 GMT", ; rel="timemap"; type="application/link-format"' + + select_mem_1: 'http://web.archive.org/web/{url}': '; rel="original", ; rel="timemap"; type="application/link-format", ; rel="first memento"; datetime="Sat, 27 Jul 2002 09:13:31 GMT", ; rel="prev memento"; datetime="Wed, 06 Aug 2014 16:12:28 GMT", ; rel="memento"; datetime="Sat, 18 Oct 2014 13:31:07 GMT", ; rel="next memento"; datetime="Mon, 20 Oct 2014 16:12:43 GMT", ; rel="last memento"; datetime="Thu, 27 Oct 2016 00:13:53 GMT"'