mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
Merge pull request #68 from internetarchive/do_not_archive
add support for do_not_archive attribute and for plugin CHAIN_POSITION...
This commit is contained in:
commit
1b4fbef26a
@ -1426,11 +1426,19 @@ def test_controller_with_defaults():
|
|||||||
assert not wwp.writer_pool.default_warc_writer.record_builder.base32
|
assert not wwp.writer_pool.default_warc_writer.record_builder.base32
|
||||||
assert wwp.writer_pool.default_warc_writer.record_builder.digest_algorithm == 'sha1'
|
assert wwp.writer_pool.default_warc_writer.record_builder.digest_algorithm == 'sha1'
|
||||||
|
|
||||||
|
|
||||||
|
class EarlyPlugin(warcprox.BaseStandardPostfetchProcessor):
    """No-op plugin used to exercise ``CHAIN_POSITION = 'early'`` loading.

    The test that loads this plugin only checks where the controller places
    it in the postfetch chain; the processing hook deliberately does nothing.
    """

    # Marker the controller inspects to put this plugin at the front of the
    # postfetch chain instead of appending it at the end.
    CHAIN_POSITION = 'early'

    def _process_url(self, recorded_url=None):
        """Accept a URL and do nothing with it.

        NOTE(review): the postfetch chain presumably calls this hook with the
        recorded URL being processed -- confirm against the base class. The
        parameter is optional so both the old zero-argument call and a call
        with a URL succeed instead of raising ``TypeError``.
        """
        pass
|
||||||
|
|
||||||
|
|
||||||
def test_load_plugin():
|
def test_load_plugin():
|
||||||
options = warcprox.Options(port=0, plugins=[
|
options = warcprox.Options(port=0, plugins=[
|
||||||
'warcprox.stats.RunningStats',
|
'warcprox.stats.RunningStats',
|
||||||
'warcprox.BaseStandardPostfetchProcessor',
|
'warcprox.BaseStandardPostfetchProcessor',
|
||||||
'warcprox.BaseBatchPostfetchProcessor',])
|
'warcprox.BaseBatchPostfetchProcessor',
|
||||||
|
'%s.%s' % (__name__, EarlyPlugin.__name__),])
|
||||||
controller = warcprox.controller.WarcproxController(options)
|
controller = warcprox.controller.WarcproxController(options)
|
||||||
assert isinstance(
|
assert isinstance(
|
||||||
controller._postfetch_chain[-1],
|
controller._postfetch_chain[-1],
|
||||||
@ -1451,6 +1459,9 @@ def test_load_plugin():
|
|||||||
assert isinstance(
|
assert isinstance(
|
||||||
controller._postfetch_chain[-4].listener,
|
controller._postfetch_chain[-4].listener,
|
||||||
warcprox.stats.RunningStats)
|
warcprox.stats.RunningStats)
|
||||||
|
assert isinstance(
|
||||||
|
controller._postfetch_chain[0],
|
||||||
|
EarlyPlugin)
|
||||||
|
|
||||||
def test_choose_a_port_for_me(warcprox_):
|
def test_choose_a_port_for_me(warcprox_):
|
||||||
options = warcprox.Options()
|
options = warcprox.Options()
|
||||||
|
@ -163,6 +163,47 @@ def test_special_dont_write_prefix():
|
|||||||
wwt.join()
|
wwt.join()
|
||||||
|
|
||||||
|
|
||||||
|
def test_do_not_archive():
    """Check that ``RecordedUrl.do_not_archive`` alone suppresses WARC writing.

    Two URLs are pushed through a ``WarcWriterProcessor``:

    * the first leaves ``do_not_archive`` at its default (``False``) and is
      expected to come out with ``warc_records`` populated;
    * the second sets ``do_not_archive=True`` and is expected to come out
      with no ``warc_records``.

    The second URL intentionally carries no ``warc-prefix`` of ``'-'``: that
    prefix already means "don't archive" on its own, so combining the two
    would let the test pass even if ``do_not_archive`` were ignored.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        logging.debug('cd %s', tmpdir)
        # Work inside a throwaway directory so any files the writer creates
        # relative to the current directory are removed with it.
        os.chdir(tmpdir)

        wwt = warcprox.writerthread.WarcWriterProcessor(
                Options(writer_threads=1))
        wwt.inq = warcprox.TimestampedQueue(maxsize=1)
        wwt.outq = warcprox.TimestampedQueue(maxsize=1)
        try:
            wwt.start()

            # to be written -- default do_not_archive False
            recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None)
            recorder.read()
            wwt.inq.put(RecordedUrl(
                url='http://example.com/yes', content_type='text/plain',
                status=200, client_ip='127.0.0.2', request_data=b'abc',
                response_recorder=recorder, remote_ip='127.0.0.3',
                timestamp=datetime.utcnow(),
                payload_digest=recorder.block_digest))

            # not to be written -- do_not_archive set True, and nothing
            # else (such as the special '-' prefix) that would also block
            # archiving
            recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None)
            recorder.read()
            wwt.inq.put(RecordedUrl(
                url='http://example.com/no', content_type='text/plain',
                status=200, client_ip='127.0.0.2', request_data=b'abc',
                response_recorder=recorder, remote_ip='127.0.0.3',
                timestamp=datetime.utcnow(),
                payload_digest=recorder.block_digest,
                do_not_archive=True))

            # Results come out in the order the URLs were queued.
            recorded_url = wwt.outq.get(timeout=10)
            assert recorded_url.warc_records
            recorded_url = wwt.outq.get(timeout=10)
            assert not recorded_url.warc_records
            assert wwt.outq.empty()
        finally:
            # Always shut the writer down, even if an assertion failed.
            wwt.stop.set()
            wwt.join()
|
||||||
|
|
||||||
|
|
||||||
def test_warc_writer_filename(tmpdir):
|
def test_warc_writer_filename(tmpdir):
|
||||||
"""Test if WarcWriter is writing WARC files with custom filenames.
|
"""Test if WarcWriter is writing WARC files with custom filenames.
|
||||||
"""
|
"""
|
||||||
|
@ -212,6 +212,8 @@ class WarcproxController(object):
|
|||||||
self._postfetch_chain.append(
|
self._postfetch_chain.append(
|
||||||
warcprox.ListenerPostfetchProcessor(
|
warcprox.ListenerPostfetchProcessor(
|
||||||
plugin, self.options))
|
plugin, self.options))
|
||||||
|
elif hasattr(plugin, 'CHAIN_POSITION') and plugin.CHAIN_POSITION == 'early':
|
||||||
|
self._postfetch_chain.insert(0, plugin)
|
||||||
else:
|
else:
|
||||||
self._postfetch_chain.append(plugin)
|
self._postfetch_chain.append(plugin)
|
||||||
|
|
||||||
|
@ -331,7 +331,8 @@ class RecordedUrl:
|
|||||||
warcprox_meta=None, content_type=None, custom_type=None,
|
warcprox_meta=None, content_type=None, custom_type=None,
|
||||||
status=None, size=None, client_ip=None, method=None,
|
status=None, size=None, client_ip=None, method=None,
|
||||||
timestamp=None, host=None, duration=None, referer=None,
|
timestamp=None, host=None, duration=None, referer=None,
|
||||||
payload_digest=None, truncated=None, warc_records=None):
|
payload_digest=None, truncated=None, warc_records=None,
|
||||||
|
do_not_archive=False):
|
||||||
# XXX should test what happens with non-ascii url (when does
|
# XXX should test what happens with non-ascii url (when does
|
||||||
# url-encoding happen?)
|
# url-encoding happen?)
|
||||||
if type(url) is not bytes:
|
if type(url) is not bytes:
|
||||||
@ -372,6 +373,7 @@ class RecordedUrl:
|
|||||||
self.payload_digest = payload_digest
|
self.payload_digest = payload_digest
|
||||||
self.truncated = truncated
|
self.truncated = truncated
|
||||||
self.warc_records = warc_records
|
self.warc_records = warc_records
|
||||||
|
self.do_not_archive = do_not_archive
|
||||||
|
|
||||||
# inherit from object so that multiple inheritance from this class works
|
# inherit from object so that multiple inheritance from this class works
|
||||||
# properly in python 2
|
# properly in python 2
|
||||||
|
@ -84,7 +84,8 @@ class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor):
|
|||||||
and 'warc-prefix' in recorded_url.warcprox_meta
|
and 'warc-prefix' in recorded_url.warcprox_meta
|
||||||
else self.options.prefix)
|
else self.options.prefix)
|
||||||
# special warc name prefix '-' means "don't archive"
|
# special warc name prefix '-' means "don't archive"
|
||||||
return prefix != '-' and self._filter_accepts(recorded_url)
|
return (prefix != '-' and not recorded_url.do_not_archive
|
||||||
|
and self._filter_accepts(recorded_url))
|
||||||
|
|
||||||
def _log(self, recorded_url, records):
|
def _log(self, recorded_url, records):
|
||||||
try:
|
try:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user