Merge branch 'do_not_archive' into qa

This commit is contained in:
Barbara Miller 2018-02-27 22:25:20 -08:00
commit 1edab7a0ca
4 changed files with 55 additions and 3 deletions

View File

@ -1426,11 +1426,19 @@ def test_controller_with_defaults():
assert not wwp.writer_pool.default_warc_writer.record_builder.base32
assert wwp.writer_pool.default_warc_writer.record_builder.digest_algorithm == 'sha1'
class MyEarlyPlugin(warcprox.BaseStandardPostfetchProcessor):
    """No-op postfetch processor that asks to be placed early in the chain.

    test_load_plugin loads this class by its dotted name to check that a
    plugin declaring ``CHAIN_POSITION = 'early'`` ends up at the front of
    the controller's postfetch chain.
    """

    # Marker read by the plugin loader to decide where the plugin is placed.
    CHAIN_POSITION = 'early'

    def _process_put(self):
        """Do nothing; the test only cares about where the plugin is placed."""
        return None
def test_load_plugin():
options = warcprox.Options(port=0, plugins=[
'warcprox.stats.RunningStats',
'warcprox.BaseStandardPostfetchProcessor',
'warcprox.BaseBatchPostfetchProcessor',])
'warcprox.BaseBatchPostfetchProcessor',
'%s.%s' % (__name__, MyEarlyPlugin.__name__),])
controller = warcprox.controller.WarcproxController(options)
assert isinstance(
controller._postfetch_chain[-1],
@ -1451,6 +1459,10 @@ def test_load_plugin():
assert isinstance(
controller._postfetch_chain[-4].listener,
warcprox.stats.RunningStats)
# MyEarlyPlugin
assert isinstance(
controller._postfetch_chain[0],
warcprox.BaseStandardPostfetchProcessor)
def test_choose_a_port_for_me(warcprox_):
options = warcprox.Options()

View File

@ -163,6 +163,47 @@ def test_special_dont_write_prefix():
wwt.join()
def test_do_not_archive():
    """Check that a RecordedUrl flagged ``do_not_archive`` gets no WARC records.

    Two urls are pushed through a single-threaded WarcWriterProcessor: the
    first with default settings (expected to come out with ``warc_records``
    populated) and the second with ``do_not_archive=True`` (expected to come
    out with no ``warc_records``).

    The processor runs with the temporary directory as the working
    directory; the previous working directory is restored before the
    temporary directory is removed, so the process is never left sitting in
    a deleted directory.
    """
    try:
        original_cwd = os.getcwd()
    except OSError:
        # The current directory may already have been removed (for example
        # by an earlier test that chdir'd into a temporary directory); fall
        # back to a directory that exists.
        original_cwd = tempfile.gettempdir()

    with tempfile.TemporaryDirectory() as tmpdir:
        logging.debug('cd %s', tmpdir)
        os.chdir(tmpdir)

        wwt = warcprox.writerthread.WarcWriterProcessor(
                Options(writer_threads=1))
        wwt.inq = warcprox.TimestampedQueue(maxsize=1)
        wwt.outq = warcprox.TimestampedQueue(maxsize=1)
        try:
            wwt.start()

            # to be written -- default do_not_archive False
            recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None)
            recorder.read()
            wwt.inq.put(RecordedUrl(
                url='http://example.com/yes', content_type='text/plain',
                status=200, client_ip='127.0.0.2', request_data=b'abc',
                response_recorder=recorder, remote_ip='127.0.0.3',
                timestamp=datetime.utcnow(),
                payload_digest=recorder.block_digest))

            # not to be written -- do_not_archive set True
            # NOTE(review): 'warc-prefix': '-' looks like the special
            # "don't write" prefix exercised by
            # test_special_dont_write_prefix; if so, it could make this
            # case pass even without do_not_archive -- confirm it is needed.
            recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None)
            recorder.read()
            wwt.inq.put(RecordedUrl(
                url='http://example.com/no', content_type='text/plain',
                status=200, client_ip='127.0.0.2', request_data=b'abc',
                response_recorder=recorder, remote_ip='127.0.0.3',
                timestamp=datetime.utcnow(),
                payload_digest=recorder.block_digest,
                warcprox_meta={'warc-prefix': '-'},
                do_not_archive=True))

            # Results come out in the order the urls were put in.
            recorded_url = wwt.outq.get(timeout=10)
            assert recorded_url.warc_records
            recorded_url = wwt.outq.get(timeout=10)
            assert not recorded_url.warc_records
            assert wwt.outq.empty()
        finally:
            # Stop the writer before leaving its working directory, then
            # step out of tmpdir so TemporaryDirectory can remove it.
            wwt.stop.set()
            wwt.join()
            os.chdir(original_cwd)
def test_warc_writer_filename(tmpdir):
"""Test if WarcWriter is writing WARC files with custom filenames.
"""

View File

@ -213,7 +213,7 @@ class WarcproxController(object):
warcprox.ListenerPostfetchProcessor(
plugin, self.options))
elif hasattr(plugin, 'CHAIN_POSITION') and plugin.CHAIN_POSITION == 'early':
self._postfetch_chain.insert(0, plugin) # or insert early but later than 0?
self._postfetch_chain.insert(0, plugin)
else:
self._postfetch_chain.append(plugin)

View File

@ -1 +0,0 @@
1.4-20160105052702-f79e744