Merge pull request #68 from internetarchive/do_not_archive

add support for do_not_archive attribute and for plugin CHAIN_POSITION...
This commit is contained in:
Noah Levitt 2018-02-28 15:42:19 -08:00 committed by GitHub
commit 1b4fbef26a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 60 additions and 3 deletions

View File

@ -1426,11 +1426,19 @@ def test_controller_with_defaults():
assert not wwp.writer_pool.default_warc_writer.record_builder.base32
assert wwp.writer_pool.default_warc_writer.record_builder.digest_algorithm == 'sha1'
class EarlyPlugin(warcprox.BaseStandardPostfetchProcessor):
    """No-op postfetch processor used to test 'early' chain placement.

    Declaring ``CHAIN_POSITION = 'early'`` asks the controller to insert this
    plugin at the front of the postfetch chain instead of appending it.
    """

    CHAIN_POSITION = 'early'

    def _process_url(self, recorded_url=None):
        """Do nothing with the url; this plugin only exists to be positioned.

        NOTE(review): the processor loop presumably calls this hook with the
        recorded url as its argument -- confirm against
        BaseStandardPostfetchProcessor. The parameter is optional so that a
        bare ``_process_url()`` call keeps working as before.
        """
        pass
def test_load_plugin():
options = warcprox.Options(port=0, plugins=[
'warcprox.stats.RunningStats',
'warcprox.BaseStandardPostfetchProcessor',
'warcprox.BaseBatchPostfetchProcessor',])
'warcprox.BaseBatchPostfetchProcessor',
'%s.%s' % (__name__, EarlyPlugin.__name__),])
controller = warcprox.controller.WarcproxController(options)
assert isinstance(
controller._postfetch_chain[-1],
@ -1451,6 +1459,9 @@ def test_load_plugin():
assert isinstance(
controller._postfetch_chain[-4].listener,
warcprox.stats.RunningStats)
assert isinstance(
controller._postfetch_chain[0],
EarlyPlugin)
def test_choose_a_port_for_me(warcprox_):
options = warcprox.Options()

View File

@ -163,6 +163,47 @@ def test_special_dont_write_prefix():
wwt.join()
def test_do_not_archive():
    """Check that a url flagged ``do_not_archive=True`` produces no records.

    Two urls are fed through a single-threaded WarcWriterProcessor: the first
    uses the default ``do_not_archive`` (False) and must come out with warc
    records, the second sets ``do_not_archive=True`` and must come out with
    none.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        logging.debug('cd %s', tmpdir)
        os.chdir(tmpdir)

        wwt = warcprox.writerthread.WarcWriterProcessor(
                Options(writer_threads=1))
        wwt.inq = warcprox.TimestampedQueue(maxsize=1)
        wwt.outq = warcprox.TimestampedQueue(maxsize=1)
        try:
            wwt.start()

            # to be written -- default do_not_archive False
            recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None)
            recorder.read()
            wwt.inq.put(RecordedUrl(
                url='http://example.com/yes', content_type='text/plain',
                status=200, client_ip='127.0.0.2', request_data=b'abc',
                response_recorder=recorder, remote_ip='127.0.0.3',
                timestamp=datetime.utcnow(),
                payload_digest=recorder.block_digest))

            # not to be written -- do_not_archive set True
            # Deliberately no {'warc-prefix': '-'} here: that special prefix
            # suppresses archiving on its own, which would let this test pass
            # even if the do_not_archive flag were ignored.
            recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None)
            recorder.read()
            wwt.inq.put(RecordedUrl(
                url='http://example.com/no', content_type='text/plain',
                status=200, client_ip='127.0.0.2', request_data=b'abc',
                response_recorder=recorder, remote_ip='127.0.0.3',
                timestamp=datetime.utcnow(),
                payload_digest=recorder.block_digest,
                do_not_archive=True))

            # urls are expected back in submission order
            recorded_url = wwt.outq.get(timeout=10)
            assert recorded_url.warc_records
            recorded_url = wwt.outq.get(timeout=10)
            assert not recorded_url.warc_records
            assert wwt.outq.empty()
        finally:
            # always shut the writer thread down, even if an assertion failed
            wwt.stop.set()
            wwt.join()
def test_warc_writer_filename(tmpdir):
"""Test if WarcWriter is writing WARC files with custom filenames.
"""

View File

@ -212,6 +212,8 @@ class WarcproxController(object):
self._postfetch_chain.append(
warcprox.ListenerPostfetchProcessor(
plugin, self.options))
elif hasattr(plugin, 'CHAIN_POSITION') and plugin.CHAIN_POSITION == 'early':
self._postfetch_chain.insert(0, plugin)
else:
self._postfetch_chain.append(plugin)

View File

@ -331,7 +331,8 @@ class RecordedUrl:
warcprox_meta=None, content_type=None, custom_type=None,
status=None, size=None, client_ip=None, method=None,
timestamp=None, host=None, duration=None, referer=None,
payload_digest=None, truncated=None, warc_records=None):
payload_digest=None, truncated=None, warc_records=None,
do_not_archive=False):
# XXX should test what happens with non-ascii url (when does
# url-encoding happen?)
if type(url) is not bytes:
@ -372,6 +373,7 @@ class RecordedUrl:
self.payload_digest = payload_digest
self.truncated = truncated
self.warc_records = warc_records
self.do_not_archive = do_not_archive
# inherit from object so that multiple inheritance from this class works
# properly in python 2

View File

@ -84,7 +84,8 @@ class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor):
and 'warc-prefix' in recorded_url.warcprox_meta
else self.options.prefix)
# special warc name prefix '-' means "don't archive"
return prefix != '-' and self._filter_accepts(recorded_url)
return (prefix != '-' and not recorded_url.do_not_archive
and self._filter_accepts(recorded_url))
def _log(self, recorded_url, records):
try: