From 685804919af5d70f6aa72d2c0432d09a68b689ed Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 20 May 2017 02:11:04 -0700 Subject: [PATCH 1/8] aggregator improvements: - support for 'WARC-Provenance' header added to response - aggregator supports source collection: if 'name:coll', coll parsed out and stored in 'param..src_coll' field, available for use in remote index, included in provenance - remoteindexsource: support interpolating '{src_coll}' in api_url and replay_url to allow handling src_coll - recorder: CollectionFilter supports dict of prefixes to filter regexs, and catch-all '*' prefix - recorder: provenance written to paired request record - rename: ProxyIndexSource -> UpstreamIndexSource to avoid confusion with actual proxy - autoapp: register_source() supports adding source classes at beginning of list --- pywb/recorder/filters.py | 27 ++++++++++++++----- pywb/recorder/multifilewarcwriter.py | 4 +++ pywb/recorder/recorderapp.py | 12 ++++++--- pywb/webagg/aggregator.py | 22 ++++++++++++--- pywb/webagg/autoapp.py | 7 +++-- pywb/webagg/indexsource.py | 14 ++++++---- pywb/webagg/responseloader.py | 12 ++++++++- pywb/webagg/test/test_upstream.py | 6 +++-- ...yindexsource.py => upstreamindexsource.py} | 10 +++---- 9 files changed, 86 insertions(+), 28 deletions(-) rename pywb/webagg/{proxyindexsource.py => upstreamindexsource.py} (85%) diff --git a/pywb/recorder/filters.py b/pywb/recorder/filters.py index b9ccd540..aaedc596 100644 --- a/pywb/recorder/filters.py +++ b/pywb/recorder/filters.py @@ -60,26 +60,39 @@ class WriteDupePolicy(object): # Skip Record Filters # ============================================================================ class SkipNothingFilter(object): - def skip_request(self, req_headers): + def skip_request(self, path, req_headers): return False - def skip_response(self, req_headers, resp_headers): + def skip_response(self, path, req_headers, resp_headers): return False # ============================================================================ class CollectionFilter(SkipNothingFilter): def __init__(self, accept_colls): - self.rx_accept_colls = re.compile(accept_colls) + self.rx_accept_map = {} - def skip_request(self, req_headers): + if isinstance(accept_colls, str): + self.rx_accept_map = {'*': re.compile(accept_colls)} + + elif isinstance(accept_colls, dict): + for name in accept_colls: + self.rx_accept_map[name] = re.compile(accept_colls[name]) + + def skip_request(self, path, req_headers): if req_headers.get('Recorder-Skip') == '1': return True return False - def skip_response(self, req_headers, resp_headers): - if not self.rx_accept_colls.match(resp_headers.get('WebAgg-Source-Coll', '')): + def skip_response(self, path, req_headers, resp_headers): + path = path[1:].split('/', 1)[0] + + rx = self.rx_accept_map.get(path) + if not rx: + rx = self.rx_accept_map.get('*') + + if rx and not rx.match(resp_headers.get('WebAgg-Source-Coll', '')): return True return False @@ -87,7 +100,7 @@ class CollectionFilter(SkipNothingFilter): # ============================================================================ class SkipRangeRequestFilter(SkipNothingFilter): - def skip_request(self, req_headers): + def skip_request(self, path, req_headers): range_ = req_headers.get('Range') if range_ and not range_.lower().startswith('bytes=0-'): return True diff --git a/pywb/recorder/multifilewarcwriter.py b/pywb/recorder/multifilewarcwriter.py index 06f980bb..806cc009 100644 --- a/pywb/recorder/multifilewarcwriter.py +++ b/pywb/recorder/multifilewarcwriter.py @@ -133,6 +133,10 @@ class MultiFileWARCWriter(BaseWARCWriter): self._do_write_req_resp(None, record, params) def _do_write_req_resp(self, req, resp, params): + prov = resp.rec_headers.get_header('WARC-Provenance') + if prov: + req.rec_headers.add_header('WARC-Provenance', prov) + resp = self._check_revisit(resp, params) if not resp: print('Skipping due to dedup') diff --git a/pywb/recorder/recorderapp.py b/pywb/recorder/recorderapp.py index 56d897ef..b1fdecb5 100644 --- a/pywb/recorder/recorderapp.py +++ b/pywb/recorder/recorderapp.py @@ -186,6 +186,8 @@ class RecorderApp(object): method = input_req.get_req_method() + path = environ['PATH_INFO'] + # write request body as metadata/resource put_record = params.get('put_record') if put_record and method in ('PUT', 'POST'): @@ -196,7 +198,7 @@ class RecorderApp(object): params, start_response) - skipping = any(x.skip_request(headers) for x in self.skip_filters) + skipping = any(x.skip_request(path, headers) for x in self.skip_filters) if not skipping: req_stream = ReqWrapper(input_buff, @@ -232,6 +234,7 @@ class RecorderApp(object): params, self.write_queue, self.skip_filters, + path, self.create_buff_func) else: resp_stream = res.raw @@ -264,13 +267,14 @@ class Wrapper(object): #============================================================================== class RespWrapper(Wrapper): def __init__(self, stream, headers, req, - params, queue, skip_filters, create_func): + params, queue, skip_filters, path, create_func): super(RespWrapper, self).__init__(stream, params, create_func) self.headers = headers self.req = req self.queue = queue self.skip_filters = skip_filters + self.path = path def close(self): try: @@ -296,7 +300,9 @@ class RespWrapper(Wrapper): if self.interrupted: skipping = True else: - skipping = any(x.skip_response(self.req.headers, self.headers) + skipping = any(x.skip_response(self.path, + self.req.headers, + self.headers) for x in self.skip_filters) if not skipping: diff --git a/pywb/webagg/aggregator.py b/pywb/webagg/aggregator.py index c5fe9739..656ac4cf 100644 --- a/pywb/webagg/aggregator.py +++ b/pywb/webagg/aggregator.py @@ -42,6 +42,7 @@ class BaseAggregator(object): def load_child_source(self, name, source, params): try: + params['_name'] = name params['_formatter'] = ParamFormatter(params, name) res = source.load_index(params) if isinstance(res, tuple): @@ -62,6 +63,10 @@ class BaseAggregator(object): return cdx if params.get('nosource') != 'true': + src_coll = params.get('param.' + name + '.src_coll') + if src_coll: + name += ':' + src_coll + cdx_iter = (add_name(cdx, name) for cdx in cdx_iter) return cdx_iter, err_list @@ -107,12 +112,23 @@ class BaseSourceListAggregator(BaseAggregator): def _iter_sources(self, params): sources = self.get_all_sources(params) srcs_list = params.get('sources') - if not srcs_list: + if not srcs_list or srcs_list == '*': return sources.items() sel_sources = tuple(srcs_list.split(',')) - return [(name, sources[name]) for name in sources.keys() if name in sel_sources] + def yield_sources(sources, sel_sources, params): + for name in sel_sources: + if name in sources: + yield (name, sources[name]) + + elif ':' in name: + name, param = name.split(':', 1) + if name in sources: + params['param.' + name + '.src_coll'] = param + yield (name, sources[name]) + + return yield_sources(sources, sel_sources, params) #============================================================================= @@ -320,7 +336,7 @@ class BaseRedisMultiKeyIndexSource(BaseAggregator, RedisIndexSource): return RedisIndexSource(None, self.redis, key) def __str__(self): - return 'redis' + return 'redis-multikey' #============================================================================= diff --git a/pywb/webagg/autoapp.py b/pywb/webagg/autoapp.py index 9388bd0a..b9106469 100644 --- a/pywb/webagg/autoapp.py +++ b/pywb/webagg/autoapp.py @@ -215,8 +215,11 @@ def init_index_source(value, source_list=None): # ============================================================================ -def register_source(source_cls): - SOURCE_LIST.append(source_cls) +def register_source(source_cls, end=False): + if not end: + SOURCE_LIST.insert(0, source_cls) + else: + SOURCE_LIST.append(source_cls) # ============================================================================ diff --git a/pywb/webagg/indexsource.py b/pywb/webagg/indexsource.py index 2ddb63bc..c2a7f673 100644 --- a/pywb/webagg/indexsource.py +++ b/pywb/webagg/indexsource.py @@ -117,16 +117,20 @@ class RemoteIndexSource(BaseIndexSource): continue cdx = CDXObject(line) - self._set_load_url(cdx) + self._set_load_url(cdx, params) yield cdx return do_load(lines) - def _set_load_url(self, cdx): - cdx[self.url_field] = self.replay_url.format( - timestamp=cdx['timestamp'], - url=cdx['url']) + def _set_load_url(self, cdx, params): + source_coll = '' + name = params.get('_name') + if name: + source_coll = params.get('param.' + name + '.src_coll', '') + cdx[self.url_field] = self.replay_url.format(url=cdx['url'], + timestamp=cdx['timestamp'], + src_coll=source_coll) def __repr__(self): return '{0}({1}, {2})'.format(self.__class__.__name__, self.api_url, diff --git a/pywb/webagg/responseloader.py b/pywb/webagg/responseloader.py index 4a41392c..c07de57f 100644 --- a/pywb/webagg/responseloader.py +++ b/pywb/webagg/responseloader.py @@ -41,12 +41,14 @@ class BaseLoader(object): warc_headers, other_headers, stream = entry + source = self._get_provenance(cdx) + out_headers = {} out_headers['WebAgg-Type'] = 'warc' - out_headers['WebAgg-Source-Coll'] = quote(cdx.get('source', ''), safe=':/') out_headers['Content-Type'] = 'application/warc-record' out_headers['WebAgg-Cdx'] = to_native_str(cdx.to_cdxj().rstrip()) + out_headers['WebAgg-Source-Coll'] = source if not warc_headers: if other_headers: @@ -60,6 +62,7 @@ class BaseLoader(object): target_uri = warc_headers.get_header('WARC-Target-URI') out_headers['WARC-Target-URI'] = target_uri + out_headers['Link'] = MementoUtils.make_link(target_uri, 'original') memento_dt = iso_date_to_datetime(warc_headers.get_header('WARC-Date')) @@ -88,6 +91,9 @@ class BaseLoader(object): return out_headers, streamiter + def _get_provenance(self, cdx): + return quote(cdx.get('source', ''), safe=':/') + def _set_content_len(self, content_len_str, headers, existing_len): # Try to set content-length, if it is available and valid try: @@ -424,6 +430,10 @@ class LiveWebLoader(BaseLoader): warc_headers['WARC-Record-ID'] = self._make_warc_id() warc_headers['WARC-Target-URI'] = cdx['url'] warc_headers['WARC-Date'] = datetime_to_iso_date(dt) + + if not cdx.get('is_live'): + warc_headers['WARC-Provenance'] = self._get_provenance(cdx) + if remote_ip: warc_headers['WARC-IP-Address'] = remote_ip diff --git a/pywb/webagg/test/test_upstream.py b/pywb/webagg/test/test_upstream.py index 037a9ea8..30367ba9 100644 --- a/pywb/webagg/test/test_upstream.py +++ b/pywb/webagg/test/test_upstream.py @@ -1,3 +1,5 @@ +from gevent import monkey; monkey.patch_all(thread=False) + import webtest from io import BytesIO @@ -6,7 +8,7 @@ import requests from pywb.webagg.handlers import DefaultResourceHandler from pywb.webagg.aggregator import SimpleAggregator -from pywb.webagg.proxyindexsource import ProxyMementoIndexSource, UpstreamAggIndexSource +from pywb.webagg.upstreamindexsource import UpstreamMementoIndexSource, UpstreamAggIndexSource from warcio.recordloader import ArcWarcRecordLoader @@ -26,7 +28,7 @@ class TestUpstream(LiveServerTests, BaseTestClass): app.add_route('/upstream_opt', DefaultResourceHandler(SimpleAggregator( - {'upstream_opt': ProxyMementoIndexSource.upstream_resource(base_url + '/live')}) + {'upstream_opt': UpstreamMementoIndexSource.upstream_resource(base_url + '/live')}) ) ) diff --git a/pywb/webagg/proxyindexsource.py b/pywb/webagg/upstreamindexsource.py similarity index 85% rename from pywb/webagg/proxyindexsource.py rename to pywb/webagg/upstreamindexsource.py index 60b0f195..9b0ba05d 100644 --- a/pywb/webagg/proxyindexsource.py +++ b/pywb/webagg/upstreamindexsource.py @@ -13,14 +13,14 @@ class UpstreamAggIndexSource(RemoteIndexSource): proxy_url = base_url + '/resource?url={url}&closest={timestamp}' super(UpstreamAggIndexSource, self).__init__(api_url, proxy_url, 'filename') - def _set_load_url(self, cdx): - super(UpstreamAggIndexSource, self)._set_load_url(cdx) + def _set_load_url(self, cdx, params): + super(UpstreamAggIndexSource, self)._set_load_url(cdx, params) cdx['offset'] = '0' cdx.pop('load_url', '') #============================================================================= -class ProxyMementoIndexSource(BaseIndexSource): +class UpstreamMementoIndexSource(BaseIndexSource): def __init__(self, proxy_url='{url}'): self.proxy_url = proxy_url self.loader = LiveWebLoader() @@ -45,10 +45,10 @@ class ProxyMementoIndexSource(BaseIndexSource): yield cdx def __str__(self): - return 'proxy' + return 'upstream' @staticmethod def upstream_resource(base_url): - return ProxyMementoIndexSource(base_url + '/resource?url={url}&closest={closest}') + return UpstreamMementoIndexSource(base_url + '/resource?url={url}&closest={closest}') From f0fdc50574ed7e1a5aef09a676b3406a554e7db6 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 25 May 2017 13:06:24 -0700 Subject: [PATCH 2/8] fuzzymatcher: don't modify original params, instad create new fuzzy_params for fuzzy query --- pywb/webagg/fuzzymatcher.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/pywb/webagg/fuzzymatcher.py b/pywb/webagg/fuzzymatcher.py index 656889a1..1885c53d 100644 --- a/pywb/webagg/fuzzymatcher.py +++ b/pywb/webagg/fuzzymatcher.py @@ -4,6 +4,7 @@ from pywb.utils.loaders import load_yaml_config import re import os +from six import iterkeys from six.moves.urllib.parse import urlsplit from collections import namedtuple @@ -20,7 +21,8 @@ class FuzzyMatcher(object): DEFAULT_MATCH_TYPE = 'prefix' DEFAULT_REPLACE_AFTER = '?' - REMOVE_PARAMS = ['alt_url', 'reverse', 'closest', 'end_key'] + FUZZY_SKIP_PARAMS = ('alt_url', 'reverse', 'closest', 'end_key', + 'url', 'matchType', 'filter') def __init__(self, filename): config = load_yaml_config(filename) @@ -103,14 +105,15 @@ class FuzzyMatcher(object): host = urlsplit(url).netloc url = host.split('.', 1)[1] - params.update({'url': url, - 'matchType': matched_rule.match_type, - 'filter': filters}) + fuzzy_params = {'url': url, + 'matchType': matched_rule.match_type, + 'filter': filters} - for param in self.REMOVE_PARAMS: - params.pop(param, '') + for key in iterkeys(params): + if key not in self.FUZZY_SKIP_PARAMS: + fuzzy_params[key] = params[key] - return matched_rule + return matched_rule, fuzzy_params def make_regex(self, config): if isinstance(config, list): @@ -148,11 +151,13 @@ class FuzzyMatcher(object): url = params['url'] - rule = self.get_fuzzy_match(params) - if not rule: + res = self.get_fuzzy_match(params) + if not res: return - new_iter, errs = index_source(params) + rule, fuzzy_params = res + + new_iter, errs = index_source(fuzzy_params) for cdx in new_iter: if self.allow_fuzzy_result(rule, url, cdx): From afbe2478cb4067adcb81c5dec0e82e82ebf99414 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 25 May 2017 13:07:25 -0700 Subject: [PATCH 3/8] recorder: filters: check for 'Recorder-Skip: 1' on record response also --- pywb/recorder/filters.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pywb/recorder/filters.py b/pywb/recorder/filters.py index aaedc596..1a1e7fd4 100644 --- a/pywb/recorder/filters.py +++ b/pywb/recorder/filters.py @@ -86,6 +86,9 @@ class CollectionFilter(SkipNothingFilter): return False def skip_response(self, path, req_headers, resp_headers): + if resp_headers.get('Recorder-Skip') == '1': + return True + path = path[1:].split('/', 1)[0] rx = self.rx_accept_map.get(path) From 630911ef23f1621e3d295aafa98b6d0acd529f42 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 25 May 2017 13:10:24 -0700 Subject: [PATCH 4/8] provenance improvement: don't store source id as provenance, instead write full url to WARC-Recorded-From-URI, current datetime to WARC-Recorded-On-Date warcwriter: ensure WARC-Recorded-* headers copied to request record as well --- pywb/recorder/multifilewarcwriter.py | 10 +++++++--- pywb/webagg/responseloader.py | 12 +++++++++--- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/pywb/recorder/multifilewarcwriter.py b/pywb/recorder/multifilewarcwriter.py index 806cc009..9c206213 100644 --- a/pywb/recorder/multifilewarcwriter.py +++ b/pywb/recorder/multifilewarcwriter.py @@ -132,10 +132,14 @@ class MultiFileWARCWriter(BaseWARCWriter): params = params or {} self._do_write_req_resp(None, record, params) + def _copy_header(self, from_rec, to_rec, name): + header = from_rec.rec_headers.get_header(name) + if header: + to_rec.rec_headers.add_header(name, header) + def _do_write_req_resp(self, req, resp, params): - prov = resp.rec_headers.get_header('WARC-Provenance') - if prov: - req.rec_headers.add_header('WARC-Provenance', prov) + self._copy_header(resp, req, 'WARC-Recorded-From-URI') + self._copy_header(resp, req, 'WARC-Recorded-On-Date') resp = self._check_revisit(resp, params) if not resp: diff --git a/pywb/webagg/responseloader.py b/pywb/webagg/responseloader.py index c07de57f..566c1f88 100644 --- a/pywb/webagg/responseloader.py +++ b/pywb/webagg/responseloader.py @@ -22,6 +22,7 @@ import six import itertools import json import glob +import datetime from requests.models import PreparedRequest from requests.packages import urllib3 @@ -41,7 +42,7 @@ class BaseLoader(object): warc_headers, other_headers, stream = entry - source = self._get_provenance(cdx) + source = self._get_source_id(cdx) out_headers = {} out_headers['WebAgg-Type'] = 'warc' @@ -50,6 +51,9 @@ class BaseLoader(object): out_headers['WebAgg-Cdx'] = to_native_str(cdx.to_cdxj().rstrip()) out_headers['WebAgg-Source-Coll'] = source + if params.get('recorder_skip'): + out_headers['Recorder-Skip'] = '1' + if not warc_headers: if other_headers: out_headers['Link'] = other_headers.get('Link') @@ -91,7 +95,7 @@ class BaseLoader(object): return out_headers, streamiter - def _get_provenance(self, cdx): + def _get_source_id(self, cdx): return quote(cdx.get('source', ''), safe=':/') def _set_content_len(self, content_len_str, headers, existing_len): @@ -432,7 +436,9 @@ class LiveWebLoader(BaseLoader): warc_headers['WARC-Date'] = datetime_to_iso_date(dt) if not cdx.get('is_live'): - warc_headers['WARC-Provenance'] = self._get_provenance(cdx) + now = datetime.datetime.utcnow() + warc_headers['WARC-Recorded-From-URI'] = cdx.get('load_url') + warc_headers['WARC-Recorded-On-Date'] = datatime_to_iso_date(now) if remote_ip: warc_headers['WARC-IP-Address'] = remote_ip From 481bc40ccc6b7278b5b42f67d646bc819b064f5d Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 25 May 2017 13:28:57 -0700 Subject: [PATCH 5/8] fix typo! --- pywb/webagg/responseloader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pywb/webagg/responseloader.py b/pywb/webagg/responseloader.py index 566c1f88..9e3e5ad2 100644 --- a/pywb/webagg/responseloader.py +++ b/pywb/webagg/responseloader.py @@ -438,7 +438,7 @@ class LiveWebLoader(BaseLoader): if not cdx.get('is_live'): now = datetime.datetime.utcnow() warc_headers['WARC-Recorded-From-URI'] = cdx.get('load_url') - warc_headers['WARC-Recorded-On-Date'] = datatime_to_iso_date(now) + warc_headers['WARC-Recorded-On-Date'] = datetime_to_iso_date(now) if remote_ip: warc_headers['WARC-IP-Address'] = remote_ip From f2c2829f494da6873f54060716ce7dc6846d369c Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 31 May 2017 16:05:57 -0700 Subject: [PATCH 6/8] misc improvements: redis multi-key source: store member listing from hgetall 'scan:' key add 'recorder-skip' to cdx line also use latest warcio (1.3.3) --- pywb/webagg/indexsource.py | 1 + pywb/webagg/responseloader.py | 7 ++++--- requirements.txt | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/pywb/webagg/indexsource.py b/pywb/webagg/indexsource.py index c2a7f673..c8fa4705 100644 --- a/pywb/webagg/indexsource.py +++ b/pywb/webagg/indexsource.py @@ -259,6 +259,7 @@ class RedisIndexSource(BaseIndexSource): key = res_template(member_key, params) keys = self.redis.smembers(key) + params['scan:' + key] = keys match_templ = match_templ.encode('utf-8') diff --git a/pywb/webagg/responseloader.py b/pywb/webagg/responseloader.py index 9e3e5ad2..9153a93d 100644 --- a/pywb/webagg/responseloader.py +++ b/pywb/webagg/responseloader.py @@ -48,11 +48,12 @@ class BaseLoader(object): out_headers['WebAgg-Type'] = 'warc' out_headers['Content-Type'] = 'application/warc-record' - out_headers['WebAgg-Cdx'] = to_native_str(cdx.to_cdxj().rstrip()) - out_headers['WebAgg-Source-Coll'] = source - if params.get('recorder_skip'): out_headers['Recorder-Skip'] = '1' + cdx['recorder_skip'] = '1' + + out_headers['WebAgg-Cdx'] = to_native_str(cdx.to_cdxj().rstrip()) + out_headers['WebAgg-Source-Coll'] = source if not warc_headers: if other_headers: diff --git a/requirements.txt b/requirements.txt index 61b4653a..bf94b46a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ six -warcio==1.3 +warcio==1.3.3 chardet requests redis From 06b1134be5ecee1e9cec8a25851338e4f3acb639 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 1 Jun 2017 07:45:02 -0700 Subject: [PATCH 7/8] aggregator: support 'invert_sources' option to exclude source list, rather than include can be set explicitly or via '!' on the sources list tests: test invert sources filters: include params to skip_response() filter warc headers: change headers for recording from other source to: WARC-Source-URI and WARC-Created-Date --- pywb/recorder/filters.py | 4 +- pywb/recorder/multifilewarcwriter.py | 4 +- pywb/recorder/recorderapp.py | 3 +- pywb/webagg/aggregator.py | 51 +++++++++++++------ pywb/webagg/responseloader.py | 4 +- pywb/webagg/test/test_memento_agg.py | 28 ++++++++++ sample_archive/text_content/link_headers.yaml | 5 ++ 7 files changed, 77 insertions(+), 22 deletions(-) diff --git a/pywb/recorder/filters.py b/pywb/recorder/filters.py index 1a1e7fd4..a001741c 100644 --- a/pywb/recorder/filters.py +++ b/pywb/recorder/filters.py @@ -63,7 +63,7 @@ class SkipNothingFilter(object): def skip_request(self, path, req_headers): return False - def skip_response(self, path, req_headers, resp_headers): + def skip_response(self, path, req_headers, resp_headers, params): return False @@ -85,7 +85,7 @@ class CollectionFilter(SkipNothingFilter): return False - def skip_response(self, path, req_headers, resp_headers): + def skip_response(self, path, req_headers, resp_headers, params): if resp_headers.get('Recorder-Skip') == '1': return True diff --git a/pywb/recorder/multifilewarcwriter.py b/pywb/recorder/multifilewarcwriter.py index 9c206213..1adfdf60 100644 --- a/pywb/recorder/multifilewarcwriter.py +++ b/pywb/recorder/multifilewarcwriter.py @@ -138,8 +138,8 @@ class MultiFileWARCWriter(BaseWARCWriter): to_rec.rec_headers.add_header(name, header) def _do_write_req_resp(self, req, resp, params): - self._copy_header(resp, req, 'WARC-Recorded-From-URI') - self._copy_header(resp, req, 'WARC-Recorded-On-Date') + self._copy_header(resp, req, 'WARC-Source-URI') + self._copy_header(resp, req, 'WARC-Creation-Date') resp = self._check_revisit(resp, params) if not resp: diff --git a/pywb/recorder/recorderapp.py b/pywb/recorder/recorderapp.py index b1fdecb5..4febed12 100644 --- a/pywb/recorder/recorderapp.py +++ b/pywb/recorder/recorderapp.py @@ -302,7 +302,8 @@ class RespWrapper(Wrapper): else: skipping = any(x.skip_response(self.path, self.req.headers, - self.headers) + self.headers, + self.params) for x in self.skip_filters) if not skipping: diff --git a/pywb/webagg/aggregator.py b/pywb/webagg/aggregator.py index 656ac4cf..f7d9db75 100644 --- a/pywb/webagg/aggregator.py +++ b/pywb/webagg/aggregator.py @@ -95,8 +95,8 @@ class BaseAggregator(object): raise NotImplemented() def get_source_list(self, params): - srcs = self._iter_sources(params) - result = [(name, str(value)) for name, value in srcs] + sources = self._iter_sources(params) + result = [(name, str(value)) for name, value in sources] result = {'sources': dict(result)} return result @@ -105,30 +105,51 @@ class BaseAggregator(object): class BaseSourceListAggregator(BaseAggregator): def __init__(self, sources, **kwargs): self.sources = sources + self.sources_key = kwargs.get('sources_key', 'sources') + self.invert_sources = kwargs.get('invert_sources', False) def get_all_sources(self, params): return self.sources def _iter_sources(self, params): + invert_sources = self.invert_sources + sel_sources = params.get(self.sources_key) + if sel_sources and sel_sources[0] == '!': + invert_sources = True + sel_sources = sel_sources[1:] + + if not sel_sources or sel_sources == '*': + if not invert_sources: + return six.iteritems(self.get_all_sources(params)) + else: + return iter([]) + + if not invert_sources: + return self.yield_sources(sel_sources, params) + else: + return self.yield_invert_sources(sel_sources, params) + + def yield_sources(self, sel_sources, params): sources = self.get_all_sources(params) - srcs_list = params.get('sources') - if not srcs_list or srcs_list == '*': - return sources.items() + sel_sources = tuple(sel_sources.split(',')) + for name in sel_sources: + if name in sources: + yield (name, sources[name]) - sel_sources = tuple(srcs_list.split(',')) - - def yield_sources(sources, sel_sources, params): - for name in sel_sources: + elif ':' in name: + name, param = name.split(':', 1) if name in sources: + params['param.' + name + '.src_coll'] = param yield (name, sources[name]) - elif ':' in name: - name, param = name.split(':', 1) - if name in sources: - params['param.' + name + '.src_coll'] = param - yield (name, sources[name]) + def yield_invert_sources(self, sel_sources, params): + sources = self.get_all_sources(params) + sel_sources = tuple([src.split(':', 1)[0] + for src in sel_sources.split(',')]) - return yield_sources(sources, sel_sources, params) + for name in six.iterkeys(sources): + if name not in sel_sources: + yield (name, sources[name]) #============================================================================= diff --git a/pywb/webagg/responseloader.py b/pywb/webagg/responseloader.py index 9153a93d..374c2934 100644 --- a/pywb/webagg/responseloader.py +++ b/pywb/webagg/responseloader.py @@ -438,8 +438,8 @@ class LiveWebLoader(BaseLoader): if not cdx.get('is_live'): now = datetime.datetime.utcnow() - warc_headers['WARC-Recorded-From-URI'] = cdx.get('load_url') - warc_headers['WARC-Recorded-On-Date'] = datetime_to_iso_date(now) + warc_headers['WARC-Source-URI'] = cdx.get('load_url') + warc_headers['WARC-Creation-Date'] = datetime_to_iso_date(now) if remote_ip: warc_headers['WARC-IP-Address'] = remote_ip diff --git a/pywb/webagg/test/test_memento_agg.py b/pywb/webagg/test/test_memento_agg.py index e1e5673b..ef08d78e 100644 --- a/pywb/webagg/test/test_memento_agg.py +++ b/pywb/webagg/test/test_memento_agg.py @@ -29,6 +29,10 @@ aggs = {'simple': SimpleAggregator(sources), 'gevent': GeventTimeoutAggregator(sources, timeout=5.0), } +aggs_inv = {'simple': SimpleAggregator(sources, invert_sources=True), + 'gevent': GeventTimeoutAggregator(sources, invert_sources=True, timeout=5.0), + } + agg_tm = {'gevent': GeventTimeoutAggregator(sources, timeout=0.0)} nf = {'notfound': FileIndexSource(to_path('testdata/not-found-x'))} @@ -104,6 +108,30 @@ class TestMemAgg(MementoOverrideTests, BaseTestClass): assert(errs == {}) + @pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys())) + @patch('pywb.webagg.indexsource.MementoIndexSource.get_timegate_links', MementoOverrideTests.mock_link_header('agg_test_5')) + def test_mem_agg_index_5(self, agg): + url = 'http://vvork.com/' + res, errs = agg(dict(url=url, closest='20141001', limit=2, sources='!rhiz,ait')) + + + exp = [{'timestamp': '20141018133107', 'load_url': 'http://web.archive.org/web/20141018133107id_/http://vvork.com/', 'source': 'ia'}] + + assert(to_json_list(res) == exp) + assert(errs == {'bl': "NotFoundException('http://www.webarchive.org.uk/wayback/archive/http://vvork.com/',)"}) + + @pytest.mark.parametrize("agg", list(aggs_inv.values()), ids=list(aggs_inv.keys())) + @patch('pywb.webagg.indexsource.MementoIndexSource.get_timegate_links', MementoOverrideTests.mock_link_header('agg_test_5')) + def test_mem_agg_index_5_inverse_preset(self, agg): + url = 'http://vvork.com/' + res, errs = agg(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait')) + + + exp = [{'timestamp': '20141018133107', 'load_url': 'http://web.archive.org/web/20141018133107id_/http://vvork.com/', 'source': 'ia'}] + + assert(to_json_list(res) == exp) + assert(errs == {'bl': "NotFoundException('http://www.webarchive.org.uk/wayback/archive/http://vvork.com/',)"}) + @pytest.mark.parametrize("agg", list(agg_nf.values()), ids=list(agg_nf.keys())) def test_mem_agg_not_found(self, agg): url = 'http://vvork.com/' diff --git a/sample_archive/text_content/link_headers.yaml b/sample_archive/text_content/link_headers.yaml index c629f587..3216193b 100644 --- a/sample_archive/text_content/link_headers.yaml +++ b/sample_archive/text_content/link_headers.yaml @@ -28,6 +28,11 @@ agg_test_4: 'http://webenact.rhizome.org/vvork/{url}': '; rel="memento"; datetime="Mon, 06 Oct 2014 18:43:57 GMT", ; rel="original", ; rel="timemap"; type="application/link-format"' + +agg_test_5: + 'http://web.archive.org/web/{url}': '; rel="original", ; rel="memento"; datetime="Sat, 18 Oct 2014 13:31:07 GMT", ; rel="timemap"; type="application/link-format"' + + select_mem_1: 'http://web.archive.org/web/{url}': '; rel="original", ; rel="timemap"; type="application/link-format", ; rel="first memento"; datetime="Sat, 27 Jul 2002 09:13:31 GMT", ; rel="prev memento"; datetime="Wed, 06 Aug 2014 16:12:28 GMT", ; rel="memento"; datetime="Sat, 18 Oct 2014 13:31:07 GMT", ; rel="next memento"; datetime="Mon, 20 Oct 2014 16:12:43 GMT", ; rel="last memento"; datetime="Thu, 27 Oct 2016 00:13:53 GMT"' From eac5d18985f6704ef9e7fa7a67f972bdb7fafa1e Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 1 Jun 2017 14:03:56 -0700 Subject: [PATCH 8/8] recorder: move skip_response() check to occur before response is sent, rather than at the end filters: replace SkipNothingFilter with SkipDefaultFilter which checks for 'Recorder-Skip', call base filter checks on all filters --- pywb/recorder/filters.py | 25 +++++++++++++++---------- pywb/recorder/recorderapp.py | 20 ++++++++++---------- 2 files changed, 25 insertions(+), 20 deletions(-) diff --git a/pywb/recorder/filters.py b/pywb/recorder/filters.py index a001741c..dd8cb45c 100644 --- a/pywb/recorder/filters.py +++ b/pywb/recorder/filters.py @@ -59,16 +59,22 @@ class WriteDupePolicy(object): # ============================================================================ # Skip Record Filters # ============================================================================ -class SkipNothingFilter(object): +class SkipDefaultFilter(object): def skip_request(self, path, req_headers): + if req_headers.get('Recorder-Skip') == '1': + return True + return False def skip_response(self, path, req_headers, resp_headers, params): + if resp_headers.get('Recorder-Skip') == '1': + return True + return False # ============================================================================ -class CollectionFilter(SkipNothingFilter): +class CollectionFilter(SkipDefaultFilter): def __init__(self, accept_colls): self.rx_accept_map = {} @@ -79,14 +85,9 @@ class CollectionFilter(SkipNothingFilter): for name in accept_colls: self.rx_accept_map[name] = re.compile(accept_colls[name]) - def skip_request(self, path, req_headers): - if req_headers.get('Recorder-Skip') == '1': - return True - - return False - def skip_response(self, path, req_headers, resp_headers, params): - if resp_headers.get('Recorder-Skip') == '1': + if super(CollectionFilter, self).skip_response(path, req_headers, + resp_headers, params): return True path = path[1:].split('/', 1)[0] @@ -102,8 +103,12 @@ class CollectionFilter(SkipNothingFilter): # ============================================================================ -class SkipRangeRequestFilter(SkipNothingFilter): +class SkipRangeRequestFilter(SkipDefaultFilter): def skip_request(self, path, req_headers): + if super(SkipRangeRequestFilter, self).skip_request(path, + req_headers): + return True + range_ = req_headers.get('Range') if range_ and not range_.lower().startswith('bytes=0-'): return True diff --git a/pywb/recorder/recorderapp.py b/pywb/recorder/recorderapp.py index 4febed12..f9332412 100644 --- a/pywb/recorder/recorderapp.py +++ b/pywb/recorder/recorderapp.py @@ -225,7 +225,13 @@ class RecorderApp(object): req_stream.out.close() return self.send_error(e, start_response) - start_response('200 OK', list(res.headers.items())) + if not skipping: + skipping = any(x.skip_response(path, + req_stream.headers, + res.headers, + params) + for x in self.skip_filters) + if not skipping: resp_stream = RespWrapper(res.raw, @@ -233,14 +239,15 @@ class RecorderApp(object): req_stream, params, self.write_queue, - self.skip_filters, path, self.create_buff_func) + else: resp_stream = res.raw resp_iter = StreamIter(resp_stream) + start_response('200 OK', list(res.headers.items())) return resp_iter @@ -267,13 +274,12 @@ class Wrapper(object): #============================================================================== class RespWrapper(Wrapper): def __init__(self, stream, headers, req, - params, queue, skip_filters, path, create_func): + params, queue, path, create_func): super(RespWrapper, self).__init__(stream, params, create_func) self.headers = headers self.req = req self.queue = queue - self.skip_filters = skip_filters self.path = path def close(self): @@ -299,12 +305,6 @@ class RespWrapper(Wrapper): try: if self.interrupted: skipping = True - else: - skipping = any(x.skip_response(self.path, - self.req.headers, - self.headers, - self.params) - for x in self.skip_filters) if not skipping: entry = (self.req.headers, self.req.out,