mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
aggregator improvements:
- support for 'WARC-Provenance' header added to response
- aggregator supports source collection: if a source is given as 'name:coll', coll is parsed out and stored in the 'param.<name>.src_coll' field, available for use in remote index and included in provenance
- remoteindexsource: support interpolating '{src_coll}' in api_url and replay_url to allow handling src_coll
- recorder: CollectionFilter supports a dict of prefixes to filter regexes, and a catch-all '*' prefix
- recorder: provenance written to paired request record
- rename: ProxyIndexSource -> UpstreamIndexSource to avoid confusion with actual proxy
- autoapp: register_source() supports adding source classes at beginning of list
This commit is contained in:
parent
d8f035642b
commit
685804919a
@ -60,26 +60,39 @@ class WriteDupePolicy(object):
|
|||||||
# Skip Record Filters
|
# Skip Record Filters
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
class SkipNothingFilter(object):
|
class SkipNothingFilter(object):
|
||||||
def skip_request(self, req_headers):
|
def skip_request(self, path, req_headers):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def skip_response(self, req_headers, resp_headers):
|
def skip_response(self, path, req_headers, resp_headers):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
class CollectionFilter(SkipNothingFilter):
|
class CollectionFilter(SkipNothingFilter):
|
||||||
def __init__(self, accept_colls):
|
def __init__(self, accept_colls):
|
||||||
self.rx_accept_colls = re.compile(accept_colls)
|
self.rx_accept_map = {}
|
||||||
|
|
||||||
def skip_request(self, req_headers):
|
if isinstance(accept_colls, str):
|
||||||
|
self.rx_accept_map = {'*': re.compile(accept_colls)}
|
||||||
|
|
||||||
|
elif isinstance(accept_colls, dict):
|
||||||
|
for name in accept_colls:
|
||||||
|
self.rx_accept_map[name] = re.compile(accept_colls[name])
|
||||||
|
|
||||||
|
def skip_request(self, path, req_headers):
|
||||||
if req_headers.get('Recorder-Skip') == '1':
|
if req_headers.get('Recorder-Skip') == '1':
|
||||||
return True
|
return True
|
||||||
|
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def skip_response(self, req_headers, resp_headers):
|
def skip_response(self, path, req_headers, resp_headers):
|
||||||
if not self.rx_accept_colls.match(resp_headers.get('WebAgg-Source-Coll', '')):
|
path = path[1:].split('/', 1)[0]
|
||||||
|
|
||||||
|
rx = self.rx_accept_map.get(path)
|
||||||
|
if not rx:
|
||||||
|
rx = self.rx_accept_map.get('*')
|
||||||
|
|
||||||
|
if rx and not rx.match(resp_headers.get('WebAgg-Source-Coll', '')):
|
||||||
return True
|
return True
|
||||||
|
|
||||||
return False
|
return False
|
||||||
@ -87,7 +100,7 @@ class CollectionFilter(SkipNothingFilter):
|
|||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
class SkipRangeRequestFilter(SkipNothingFilter):
|
class SkipRangeRequestFilter(SkipNothingFilter):
|
||||||
def skip_request(self, req_headers):
|
def skip_request(self, path, req_headers):
|
||||||
range_ = req_headers.get('Range')
|
range_ = req_headers.get('Range')
|
||||||
if range_ and not range_.lower().startswith('bytes=0-'):
|
if range_ and not range_.lower().startswith('bytes=0-'):
|
||||||
return True
|
return True
|
||||||
|
@ -133,6 +133,10 @@ class MultiFileWARCWriter(BaseWARCWriter):
|
|||||||
self._do_write_req_resp(None, record, params)
|
self._do_write_req_resp(None, record, params)
|
||||||
|
|
||||||
def _do_write_req_resp(self, req, resp, params):
|
def _do_write_req_resp(self, req, resp, params):
|
||||||
|
prov = resp.rec_headers.get_header('WARC-Provenance')
|
||||||
|
if prov:
|
||||||
|
req.rec_headers.add_header('WARC-Provenance', prov)
|
||||||
|
|
||||||
resp = self._check_revisit(resp, params)
|
resp = self._check_revisit(resp, params)
|
||||||
if not resp:
|
if not resp:
|
||||||
print('Skipping due to dedup')
|
print('Skipping due to dedup')
|
||||||
|
@ -186,6 +186,8 @@ class RecorderApp(object):
|
|||||||
|
|
||||||
method = input_req.get_req_method()
|
method = input_req.get_req_method()
|
||||||
|
|
||||||
|
path = environ['PATH_INFO']
|
||||||
|
|
||||||
# write request body as metadata/resource
|
# write request body as metadata/resource
|
||||||
put_record = params.get('put_record')
|
put_record = params.get('put_record')
|
||||||
if put_record and method in ('PUT', 'POST'):
|
if put_record and method in ('PUT', 'POST'):
|
||||||
@ -196,7 +198,7 @@ class RecorderApp(object):
|
|||||||
params,
|
params,
|
||||||
start_response)
|
start_response)
|
||||||
|
|
||||||
skipping = any(x.skip_request(headers) for x in self.skip_filters)
|
skipping = any(x.skip_request(path, headers) for x in self.skip_filters)
|
||||||
|
|
||||||
if not skipping:
|
if not skipping:
|
||||||
req_stream = ReqWrapper(input_buff,
|
req_stream = ReqWrapper(input_buff,
|
||||||
@ -232,6 +234,7 @@ class RecorderApp(object):
|
|||||||
params,
|
params,
|
||||||
self.write_queue,
|
self.write_queue,
|
||||||
self.skip_filters,
|
self.skip_filters,
|
||||||
|
path,
|
||||||
self.create_buff_func)
|
self.create_buff_func)
|
||||||
else:
|
else:
|
||||||
resp_stream = res.raw
|
resp_stream = res.raw
|
||||||
@ -264,13 +267,14 @@ class Wrapper(object):
|
|||||||
#==============================================================================
|
#==============================================================================
|
||||||
class RespWrapper(Wrapper):
|
class RespWrapper(Wrapper):
|
||||||
def __init__(self, stream, headers, req,
|
def __init__(self, stream, headers, req,
|
||||||
params, queue, skip_filters, create_func):
|
params, queue, skip_filters, path, create_func):
|
||||||
|
|
||||||
super(RespWrapper, self).__init__(stream, params, create_func)
|
super(RespWrapper, self).__init__(stream, params, create_func)
|
||||||
self.headers = headers
|
self.headers = headers
|
||||||
self.req = req
|
self.req = req
|
||||||
self.queue = queue
|
self.queue = queue
|
||||||
self.skip_filters = skip_filters
|
self.skip_filters = skip_filters
|
||||||
|
self.path = path
|
||||||
|
|
||||||
def close(self):
|
def close(self):
|
||||||
try:
|
try:
|
||||||
@ -296,7 +300,9 @@ class RespWrapper(Wrapper):
|
|||||||
if self.interrupted:
|
if self.interrupted:
|
||||||
skipping = True
|
skipping = True
|
||||||
else:
|
else:
|
||||||
skipping = any(x.skip_response(self.req.headers, self.headers)
|
skipping = any(x.skip_response(self.path,
|
||||||
|
self.req.headers,
|
||||||
|
self.headers)
|
||||||
for x in self.skip_filters)
|
for x in self.skip_filters)
|
||||||
|
|
||||||
if not skipping:
|
if not skipping:
|
||||||
|
@ -42,6 +42,7 @@ class BaseAggregator(object):
|
|||||||
|
|
||||||
def load_child_source(self, name, source, params):
|
def load_child_source(self, name, source, params):
|
||||||
try:
|
try:
|
||||||
|
params['_name'] = name
|
||||||
params['_formatter'] = ParamFormatter(params, name)
|
params['_formatter'] = ParamFormatter(params, name)
|
||||||
res = source.load_index(params)
|
res = source.load_index(params)
|
||||||
if isinstance(res, tuple):
|
if isinstance(res, tuple):
|
||||||
@ -62,6 +63,10 @@ class BaseAggregator(object):
|
|||||||
return cdx
|
return cdx
|
||||||
|
|
||||||
if params.get('nosource') != 'true':
|
if params.get('nosource') != 'true':
|
||||||
|
src_coll = params.get('param.' + name + '.src_coll')
|
||||||
|
if src_coll:
|
||||||
|
name += ':' + src_coll
|
||||||
|
|
||||||
cdx_iter = (add_name(cdx, name) for cdx in cdx_iter)
|
cdx_iter = (add_name(cdx, name) for cdx in cdx_iter)
|
||||||
|
|
||||||
return cdx_iter, err_list
|
return cdx_iter, err_list
|
||||||
@ -107,12 +112,23 @@ class BaseSourceListAggregator(BaseAggregator):
|
|||||||
def _iter_sources(self, params):
|
def _iter_sources(self, params):
|
||||||
sources = self.get_all_sources(params)
|
sources = self.get_all_sources(params)
|
||||||
srcs_list = params.get('sources')
|
srcs_list = params.get('sources')
|
||||||
if not srcs_list:
|
if not srcs_list or srcs_list == '*':
|
||||||
return sources.items()
|
return sources.items()
|
||||||
|
|
||||||
sel_sources = tuple(srcs_list.split(','))
|
sel_sources = tuple(srcs_list.split(','))
|
||||||
|
|
||||||
return [(name, sources[name]) for name in sources.keys() if name in sel_sources]
|
def yield_sources(sources, sel_sources, params):
|
||||||
|
for name in sel_sources:
|
||||||
|
if name in sources:
|
||||||
|
yield (name, sources[name])
|
||||||
|
|
||||||
|
elif ':' in name:
|
||||||
|
name, param = name.split(':', 1)
|
||||||
|
if name in sources:
|
||||||
|
params['param.' + name + '.src_coll'] = param
|
||||||
|
yield (name, sources[name])
|
||||||
|
|
||||||
|
return yield_sources(sources, sel_sources, params)
|
||||||
|
|
||||||
|
|
||||||
#=============================================================================
|
#=============================================================================
|
||||||
@ -320,7 +336,7 @@ class BaseRedisMultiKeyIndexSource(BaseAggregator, RedisIndexSource):
|
|||||||
return RedisIndexSource(None, self.redis, key)
|
return RedisIndexSource(None, self.redis, key)
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return 'redis'
|
return 'redis-multikey'
|
||||||
|
|
||||||
|
|
||||||
#=============================================================================
|
#=============================================================================
|
||||||
|
@ -215,8 +215,11 @@ def init_index_source(value, source_list=None):
|
|||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
def register_source(source_cls):
|
def register_source(source_cls, end=False):
|
||||||
SOURCE_LIST.append(source_cls)
|
if not end:
|
||||||
|
SOURCE_LIST.insert(0, source_cls)
|
||||||
|
else:
|
||||||
|
SOURCE_LIST.append(source_cls)
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
|
@ -117,16 +117,20 @@ class RemoteIndexSource(BaseIndexSource):
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
cdx = CDXObject(line)
|
cdx = CDXObject(line)
|
||||||
self._set_load_url(cdx)
|
self._set_load_url(cdx, params)
|
||||||
yield cdx
|
yield cdx
|
||||||
|
|
||||||
return do_load(lines)
|
return do_load(lines)
|
||||||
|
|
||||||
def _set_load_url(self, cdx):
|
def _set_load_url(self, cdx, params):
|
||||||
cdx[self.url_field] = self.replay_url.format(
|
source_coll = ''
|
||||||
timestamp=cdx['timestamp'],
|
name = params.get('_name')
|
||||||
url=cdx['url'])
|
if name:
|
||||||
|
source_coll = params.get('param.' + name + '.src_coll', '')
|
||||||
|
|
||||||
|
cdx[self.url_field] = self.replay_url.format(url=cdx['url'],
|
||||||
|
timestamp=cdx['timestamp'],
|
||||||
|
src_coll=source_coll)
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return '{0}({1}, {2})'.format(self.__class__.__name__,
|
return '{0}({1}, {2})'.format(self.__class__.__name__,
|
||||||
self.api_url,
|
self.api_url,
|
||||||
|
@ -41,12 +41,14 @@ class BaseLoader(object):
|
|||||||
|
|
||||||
warc_headers, other_headers, stream = entry
|
warc_headers, other_headers, stream = entry
|
||||||
|
|
||||||
|
source = self._get_provenance(cdx)
|
||||||
|
|
||||||
out_headers = {}
|
out_headers = {}
|
||||||
out_headers['WebAgg-Type'] = 'warc'
|
out_headers['WebAgg-Type'] = 'warc'
|
||||||
out_headers['WebAgg-Source-Coll'] = quote(cdx.get('source', ''), safe=':/')
|
|
||||||
out_headers['Content-Type'] = 'application/warc-record'
|
out_headers['Content-Type'] = 'application/warc-record'
|
||||||
|
|
||||||
out_headers['WebAgg-Cdx'] = to_native_str(cdx.to_cdxj().rstrip())
|
out_headers['WebAgg-Cdx'] = to_native_str(cdx.to_cdxj().rstrip())
|
||||||
|
out_headers['WebAgg-Source-Coll'] = source
|
||||||
|
|
||||||
if not warc_headers:
|
if not warc_headers:
|
||||||
if other_headers:
|
if other_headers:
|
||||||
@ -60,6 +62,7 @@ class BaseLoader(object):
|
|||||||
target_uri = warc_headers.get_header('WARC-Target-URI')
|
target_uri = warc_headers.get_header('WARC-Target-URI')
|
||||||
|
|
||||||
out_headers['WARC-Target-URI'] = target_uri
|
out_headers['WARC-Target-URI'] = target_uri
|
||||||
|
|
||||||
out_headers['Link'] = MementoUtils.make_link(target_uri, 'original')
|
out_headers['Link'] = MementoUtils.make_link(target_uri, 'original')
|
||||||
|
|
||||||
memento_dt = iso_date_to_datetime(warc_headers.get_header('WARC-Date'))
|
memento_dt = iso_date_to_datetime(warc_headers.get_header('WARC-Date'))
|
||||||
@ -88,6 +91,9 @@ class BaseLoader(object):
|
|||||||
|
|
||||||
return out_headers, streamiter
|
return out_headers, streamiter
|
||||||
|
|
||||||
|
def _get_provenance(self, cdx):
|
||||||
|
return quote(cdx.get('source', ''), safe=':/')
|
||||||
|
|
||||||
def _set_content_len(self, content_len_str, headers, existing_len):
|
def _set_content_len(self, content_len_str, headers, existing_len):
|
||||||
# Try to set content-length, if it is available and valid
|
# Try to set content-length, if it is available and valid
|
||||||
try:
|
try:
|
||||||
@ -424,6 +430,10 @@ class LiveWebLoader(BaseLoader):
|
|||||||
warc_headers['WARC-Record-ID'] = self._make_warc_id()
|
warc_headers['WARC-Record-ID'] = self._make_warc_id()
|
||||||
warc_headers['WARC-Target-URI'] = cdx['url']
|
warc_headers['WARC-Target-URI'] = cdx['url']
|
||||||
warc_headers['WARC-Date'] = datetime_to_iso_date(dt)
|
warc_headers['WARC-Date'] = datetime_to_iso_date(dt)
|
||||||
|
|
||||||
|
if not cdx.get('is_live'):
|
||||||
|
warc_headers['WARC-Provenance'] = self._get_provenance(cdx)
|
||||||
|
|
||||||
if remote_ip:
|
if remote_ip:
|
||||||
warc_headers['WARC-IP-Address'] = remote_ip
|
warc_headers['WARC-IP-Address'] = remote_ip
|
||||||
|
|
||||||
|
@ -1,3 +1,5 @@
|
|||||||
|
from gevent import monkey; monkey.patch_all(thread=False)
|
||||||
|
|
||||||
import webtest
|
import webtest
|
||||||
|
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
@ -6,7 +8,7 @@ import requests
|
|||||||
|
|
||||||
from pywb.webagg.handlers import DefaultResourceHandler
|
from pywb.webagg.handlers import DefaultResourceHandler
|
||||||
from pywb.webagg.aggregator import SimpleAggregator
|
from pywb.webagg.aggregator import SimpleAggregator
|
||||||
from pywb.webagg.proxyindexsource import ProxyMementoIndexSource, UpstreamAggIndexSource
|
from pywb.webagg.upstreamindexsource import UpstreamMementoIndexSource, UpstreamAggIndexSource
|
||||||
|
|
||||||
from warcio.recordloader import ArcWarcRecordLoader
|
from warcio.recordloader import ArcWarcRecordLoader
|
||||||
|
|
||||||
@ -26,7 +28,7 @@ class TestUpstream(LiveServerTests, BaseTestClass):
|
|||||||
|
|
||||||
app.add_route('/upstream_opt',
|
app.add_route('/upstream_opt',
|
||||||
DefaultResourceHandler(SimpleAggregator(
|
DefaultResourceHandler(SimpleAggregator(
|
||||||
{'upstream_opt': ProxyMementoIndexSource.upstream_resource(base_url + '/live')})
|
{'upstream_opt': UpstreamMementoIndexSource.upstream_resource(base_url + '/live')})
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -13,14 +13,14 @@ class UpstreamAggIndexSource(RemoteIndexSource):
|
|||||||
proxy_url = base_url + '/resource?url={url}&closest={timestamp}'
|
proxy_url = base_url + '/resource?url={url}&closest={timestamp}'
|
||||||
super(UpstreamAggIndexSource, self).__init__(api_url, proxy_url, 'filename')
|
super(UpstreamAggIndexSource, self).__init__(api_url, proxy_url, 'filename')
|
||||||
|
|
||||||
def _set_load_url(self, cdx):
|
def _set_load_url(self, cdx, params):
|
||||||
super(UpstreamAggIndexSource, self)._set_load_url(cdx)
|
super(UpstreamAggIndexSource, self)._set_load_url(cdx, params)
|
||||||
cdx['offset'] = '0'
|
cdx['offset'] = '0'
|
||||||
cdx.pop('load_url', '')
|
cdx.pop('load_url', '')
|
||||||
|
|
||||||
|
|
||||||
#=============================================================================
|
#=============================================================================
|
||||||
class ProxyMementoIndexSource(BaseIndexSource):
|
class UpstreamMementoIndexSource(BaseIndexSource):
|
||||||
def __init__(self, proxy_url='{url}'):
|
def __init__(self, proxy_url='{url}'):
|
||||||
self.proxy_url = proxy_url
|
self.proxy_url = proxy_url
|
||||||
self.loader = LiveWebLoader()
|
self.loader = LiveWebLoader()
|
||||||
@ -45,10 +45,10 @@ class ProxyMementoIndexSource(BaseIndexSource):
|
|||||||
yield cdx
|
yield cdx
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return 'proxy'
|
return 'upstream'
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def upstream_resource(base_url):
|
def upstream_resource(base_url):
|
||||||
return ProxyMementoIndexSource(base_url + '/resource?url={url}&closest={closest}')
|
return UpstreamMementoIndexSource(base_url + '/resource?url={url}&closest={closest}')
|
||||||
|
|
||||||
|
|
Loading…
x
Reference in New Issue
Block a user