1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

drop process/thread mixin support (doesn't work as well on py2) could readd processes only if need arises, but for now focusing on gevent

rename header Source-Coll -> WebAgg-Source-Coll
This commit is contained in:
Ilya Kreymer 2016-03-08 10:56:03 -08:00
parent 348fb133e0
commit 3477cb0bb5
5 changed files with 20 additions and 80 deletions

View File

@ -132,7 +132,7 @@ class TestResAgg(object):
headers = {'foo': 'bar'} headers = {'foo': 'bar'}
resp = self.testapp.get('/live/resource?url=http://httpbin.org/get?foo=bar', headers=headers) resp = self.testapp.get('/live/resource?url=http://httpbin.org/get?foo=bar', headers=headers)
assert resp.headers['Source-Coll'] == 'live' assert resp.headers['WebAgg-Source-Coll'] == 'live'
self._check_uri_date(resp, 'http://httpbin.org/get?foo=bar', True) self._check_uri_date(resp, 'http://httpbin.org/get?foo=bar', True)
@ -148,7 +148,7 @@ class TestResAgg(object):
resp = self.testapp.post('/live/resource?url=http://httpbin.org/post', resp = self.testapp.post('/live/resource?url=http://httpbin.org/post',
OrderedDict([('foo', 'bar')])) OrderedDict([('foo', 'bar')]))
assert resp.headers['Source-Coll'] == 'live' assert resp.headers['WebAgg-Source-Coll'] == 'live'
self._check_uri_date(resp, 'http://httpbin.org/post', True) self._check_uri_date(resp, 'http://httpbin.org/post', True)
@ -163,7 +163,7 @@ class TestResAgg(object):
def test_agg_select_mem_1(self): def test_agg_select_mem_1(self):
resp = self.testapp.get('/many/resource?url=http://vvork.com/&closest=20141001') resp = self.testapp.get('/many/resource?url=http://vvork.com/&closest=20141001')
assert resp.headers['Source-Coll'] == 'rhiz' assert resp.headers['WebAgg-Source-Coll'] == 'rhiz'
self._check_uri_date(resp, 'http://www.vvork.com/', '2014-10-06T18:43:57Z') self._check_uri_date(resp, 'http://www.vvork.com/', '2014-10-06T18:43:57Z')
@ -177,7 +177,7 @@ class TestResAgg(object):
def test_agg_select_mem_2(self): def test_agg_select_mem_2(self):
resp = self.testapp.get('/many/resource?url=http://vvork.com/&closest=20151231') resp = self.testapp.get('/many/resource?url=http://vvork.com/&closest=20151231')
assert resp.headers['Source-Coll'] == 'ia' assert resp.headers['WebAgg-Source-Coll'] == 'ia'
self._check_uri_date(resp, 'http://vvork.com/', '2016-01-10T13:48:55Z') self._check_uri_date(resp, 'http://vvork.com/', '2016-01-10T13:48:55Z')
@ -191,7 +191,7 @@ class TestResAgg(object):
def test_agg_select_live(self): def test_agg_select_live(self):
resp = self.testapp.get('/many/resource?url=http://vvork.com/&closest=2016') resp = self.testapp.get('/many/resource?url=http://vvork.com/&closest=2016')
assert resp.headers['Source-Coll'] == 'live' assert resp.headers['WebAgg-Source-Coll'] == 'live'
self._check_uri_date(resp, 'http://vvork.com/', True) self._check_uri_date(resp, 'http://vvork.com/', True)
@ -203,7 +203,7 @@ class TestResAgg(object):
def test_agg_select_local(self): def test_agg_select_local(self):
resp = self.testapp.get('/many/resource?url=http://iana.org/&closest=20140126200624') resp = self.testapp.get('/many/resource?url=http://iana.org/&closest=20140126200624')
assert resp.headers['Source-Coll'] == 'local' assert resp.headers['WebAgg-Source-Coll'] == 'local'
self._check_uri_date(resp, 'http://www.iana.org/', '2014-01-26T20:06:24Z') self._check_uri_date(resp, 'http://www.iana.org/', '2014-01-26T20:06:24Z')
@ -222,7 +222,7 @@ Host: iana.org
resp = self.testapp.post('/many/resource/postreq?url=http://iana.org/&closest=20140126200624', req_data) resp = self.testapp.post('/many/resource/postreq?url=http://iana.org/&closest=20140126200624', req_data)
assert resp.headers['Source-Coll'] == 'local' assert resp.headers['WebAgg-Source-Coll'] == 'local'
self._check_uri_date(resp, 'http://www.iana.org/', '2014-01-26T20:06:24Z') self._check_uri_date(resp, 'http://www.iana.org/', '2014-01-26T20:06:24Z')
@ -241,7 +241,7 @@ Host: httpbin.org
resp = self.testapp.post('/many/resource/postreq?url=http://httpbin.org/get?foo=bar&closest=now', req_data) resp = self.testapp.post('/many/resource/postreq?url=http://httpbin.org/get?foo=bar&closest=now', req_data)
assert resp.headers['Source-Coll'] == 'live' assert resp.headers['WebAgg-Source-Coll'] == 'live'
self._check_uri_date(resp, 'http://httpbin.org/get?foo=bar', True) self._check_uri_date(resp, 'http://httpbin.org/get?foo=bar', True)
@ -266,7 +266,7 @@ foo=bar&test=abc"""
resp = self.testapp.post('/posttest/resource/postreq?url=http://httpbin.org/post', req_data) resp = self.testapp.post('/posttest/resource/postreq?url=http://httpbin.org/post', req_data)
assert resp.headers['Source-Coll'] == 'post' assert resp.headers['WebAgg-Source-Coll'] == 'post'
self._check_uri_date(resp, 'http://httpbin.org/post', True) self._check_uri_date(resp, 'http://httpbin.org/post', True)
@ -285,7 +285,7 @@ foo=bar&test=abc"""
resp = self.testapp.post('/fallback/resource?url=http://httpbin.org/post', req_data) resp = self.testapp.post('/fallback/resource?url=http://httpbin.org/post', req_data)
assert resp.headers['Source-Coll'] == 'post' assert resp.headers['WebAgg-Source-Coll'] == 'post'
self._check_uri_date(resp, 'http://httpbin.org/post', True) self._check_uri_date(resp, 'http://httpbin.org/post', True)
@ -301,7 +301,7 @@ foo=bar&test=abc"""
def test_agg_seq_fallback_1(self): def test_agg_seq_fallback_1(self):
resp = self.testapp.get('/fallback/resource?url=http://www.iana.org/') resp = self.testapp.get('/fallback/resource?url=http://www.iana.org/')
assert resp.headers['Source-Coll'] == 'live' assert resp.headers['WebAgg-Source-Coll'] == 'live'
self._check_uri_date(resp, 'http://www.iana.org/', True) self._check_uri_date(resp, 'http://www.iana.org/', True)
@ -314,7 +314,7 @@ foo=bar&test=abc"""
def test_agg_seq_fallback_2(self): def test_agg_seq_fallback_2(self):
resp = self.testapp.get('/fallback/resource?url=http://www.example.com/') resp = self.testapp.get('/fallback/resource?url=http://www.example.com/')
assert resp.headers['Source-Coll'] == 'example' assert resp.headers['WebAgg-Source-Coll'] == 'example'
self._check_uri_date(resp, 'http://example.com/', '2016-02-25T04:23:29Z') self._check_uri_date(resp, 'http://example.com/', '2016-02-25T04:23:29Z')
@ -336,7 +336,7 @@ foo=bar&test=abc"""
def test_agg_local_revisit(self): def test_agg_local_revisit(self):
resp = self.testapp.get('/many/resource?url=http://www.example.com/&closest=20140127171251&sources=local') resp = self.testapp.get('/many/resource?url=http://www.example.com/&closest=20140127171251&sources=local')
assert resp.headers['Source-Coll'] == 'local' assert resp.headers['WebAgg-Source-Coll'] == 'local'
buff = BytesIO(resp.body) buff = BytesIO(resp.body)
status_headers = StatusAndHeadersParser(['WARC/1.0']).parse(buff) status_headers = StatusAndHeadersParser(['WARC/1.0']).parse(buff)

View File

@ -1,7 +1,7 @@
from gevent import monkey; monkey.patch_all(thread=False) from gevent import monkey; monkey.patch_all(thread=False)
from webagg.aggregator import SimpleAggregator, GeventTimeoutAggregator from webagg.aggregator import SimpleAggregator, GeventTimeoutAggregator
from webagg.aggregator import ThreadedTimeoutAggregator, BaseAggregator from webagg.aggregator import BaseAggregator
from webagg.indexsource import FileIndexSource, RemoteIndexSource, MementoIndexSource from webagg.indexsource import FileIndexSource, RemoteIndexSource, MementoIndexSource
from .testutils import json_list, to_path from .testutils import json_list, to_path
@ -25,29 +25,15 @@ sources = {
aggs = {'simple': SimpleAggregator(sources), aggs = {'simple': SimpleAggregator(sources),
'gevent': GeventTimeoutAggregator(sources, timeout=5.0), 'gevent': GeventTimeoutAggregator(sources, timeout=5.0),
'threaded': ThreadedTimeoutAggregator(sources, timeout=5.0),
'processes': ThreadedTimeoutAggregator(sources, timeout=5.0, use_processes=True),
} }
agg_tm = {'gevent': GeventTimeoutAggregator(sources, timeout=0.0), agg_tm = {'gevent': GeventTimeoutAggregator(sources, timeout=0.0)}
'threaded': ThreadedTimeoutAggregator(sources, timeout=0.0),
'processes': ThreadedTimeoutAggregator(sources, timeout=0.0, use_processes=True)}
nf = {'notfound': FileIndexSource(to_path('testdata/not-found-x'))} nf = {'notfound': FileIndexSource(to_path('testdata/not-found-x'))}
agg_nf = {'simple': SimpleAggregator(nf), agg_nf = {'simple': SimpleAggregator(nf),
'gevent': GeventTimeoutAggregator(nf, timeout=5.0), 'gevent': GeventTimeoutAggregator(nf, timeout=5.0),
'threaded': ThreadedTimeoutAggregator(nf, timeout=5.0),
'processes': ThreadedTimeoutAggregator(nf, timeout=5.0, use_processes=True),
} }
if six.PY2:
del aggs['threaded']
del aggs['processes']
del agg_tm['threaded']
del agg_tm['processes']
del agg_nf['threaded']
del agg_nf['processes']
@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys())) @pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
def test_mem_agg_index_1(agg): def test_mem_agg_index_1(agg):

View File

@ -83,7 +83,7 @@ class TestUpstream(object):
def test_live_1(self): def test_live_1(self):
resp = requests.get(self.base_url + '/live/resource?url=http://httpbin.org/get', stream=True) resp = requests.get(self.base_url + '/live/resource?url=http://httpbin.org/get', stream=True)
assert resp.headers['Source-Coll'] == 'live' assert resp.headers['WebAgg-Source-Coll'] == 'live'
record = ArcWarcRecordLoader().parse_record_stream(resp.raw, no_record_parse=False) record = ArcWarcRecordLoader().parse_record_stream(resp.raw, no_record_parse=False)
assert record.rec_headers.get_header('WARC-Target-URI') == 'http://httpbin.org/get' assert record.rec_headers.get_header('WARC-Target-URI') == 'http://httpbin.org/get'
@ -91,7 +91,7 @@ class TestUpstream(object):
def test_upstream_1(self): def test_upstream_1(self):
resp = self.testapp.get('/upstream/resource?url=http://httpbin.org/get') resp = self.testapp.get('/upstream/resource?url=http://httpbin.org/get')
assert resp.headers['Source-Coll'] == 'upstream:live' assert resp.headers['WebAgg-Source-Coll'] == 'upstream:live'
raw = BytesIO(resp.body) raw = BytesIO(resp.body)
@ -101,7 +101,7 @@ class TestUpstream(object):
def test_upstream_2(self): def test_upstream_2(self):
resp = self.testapp.get('/upstream_opt/resource?url=http://httpbin.org/get') resp = self.testapp.get('/upstream_opt/resource?url=http://httpbin.org/get')
assert resp.headers['Source-Coll'] == 'upstream_opt:live', resp.headers assert resp.headers['WebAgg-Source-Coll'] == 'upstream_opt:live', resp.headers
raw = BytesIO(resp.body) raw = BytesIO(resp.body)

View File

@ -37,10 +37,6 @@ class BaseAggregator(object):
cdx_iter = process_cdx(cdx_iter, query) cdx_iter = process_cdx(cdx_iter, query)
return cdx_iter, dict(errs) return cdx_iter, dict(errs)
def load_child_source_list(self, name, source, params):
res = self.load_child_source(name, source, params)
return list(res[0]), res[1]
def load_child_source(self, name, source, params): def load_child_source(self, name, source, params):
try: try:
params['_formatter'] = ParamFormatter(params, name) params['_formatter'] = ParamFormatter(params, name)
@ -205,48 +201,6 @@ class GeventTimeoutAggregator(TimeoutMixin, GeventMixin, BaseSourceListAggregato
pass pass
#=============================================================================
class ConcurrentMixin(object):
def __init__(self, *args, **kwargs):
super(ConcurrentMixin, self).__init__(*args, **kwargs)
if kwargs.get('use_processes'):
self.pool_class = futures.ThreadPoolExecutor
else:
self.pool_class = futures.ProcessPoolExecutor
self.timeout = kwargs.get('timeout', 5.0)
self.size = kwargs.get('size')
def _load_all(self, params):
params['_timeout'] = self.timeout
sources = list(self._iter_sources(params))
with self.pool_class(max_workers=self.size) as executor:
def do_spawn(name, source):
return executor.submit(self.load_child_source_list,
name, source, params), name
jobs = dict([do_spawn(name, source) for name, source in sources])
res_done, res_not_done = futures.wait(jobs.keys(), timeout=self.timeout)
results = []
for job in res_done:
results.append(job.result())
for job in res_not_done:
name = jobs[job]
results.append((iter([]), [(name, 'timeout')]))
self._on_source_error(name)
return results
#=============================================================================
class ThreadedTimeoutAggregator(TimeoutMixin, ConcurrentMixin, BaseSourceListAggregator):
pass
#============================================================================= #=============================================================================
class BaseDirectoryIndexSource(BaseAggregator): class BaseDirectoryIndexSource(BaseAggregator):
CDX_EXT = ('.cdx', '.cdxj') CDX_EXT = ('.cdx', '.cdxj')

View File

@ -70,7 +70,7 @@ class BaseLoader(object):
out_headers = {} out_headers = {}
out_headers['WebAgg-Type'] = 'warc' out_headers['WebAgg-Type'] = 'warc'
out_headers['Source-Coll'] = cdx.get('source', '') out_headers['WebAgg-Source-Coll'] = cdx.get('source', '')
out_headers['Content-Type'] = 'application/warc-record' out_headers['Content-Type'] = 'application/warc-record'
if not warc_headers: if not warc_headers:
@ -237,7 +237,7 @@ class LiveWebLoader(BaseLoader):
agg_type = upstream_res.headers.get('WebAgg-Type') agg_type = upstream_res.headers.get('WebAgg-Type')
if agg_type == 'warc': if agg_type == 'warc':
cdx['source'] = upstream_res.headers.get('Source-Coll') cdx['source'] = upstream_res.headers.get('WebAgg-Source-Coll')
return None, upstream_res.headers, upstream_res.raw return None, upstream_res.headers, upstream_res.raw
http_headers_buff = recorder.get_headers_buff() http_headers_buff = recorder.get_headers_buff()