test for special warc prefix "-" which means "do not archive"

This commit is contained in:
Noah Levitt 2017-12-21 14:31:54 -08:00
parent 399853dea0
commit 9784c91459
2 changed files with 119 additions and 11 deletions

View File

@ -52,7 +52,7 @@ except:
setuptools.setup(
name='warcprox',
version='2.3.1b4.dev128',
version='2.3.1b4.dev129',
description='WARC writing MITM HTTP/S proxy',
url='https://github.com/internetarchive/warcprox',
author='Noah Levitt',

View File

@ -1,3 +1,24 @@
'''
tests/test_writer.py - warcprox warc writing tests
Copyright (C) 2017 Internet Archive
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
USA.
'''
import os
import fcntl
from multiprocessing import Process, Queue
@ -7,16 +28,11 @@ from warcprox.mitmproxy import ProxyingRecorder
from warcprox.warcproxy import RecordedUrl
from warcprox.writer import WarcWriter
from warcprox import Options
recorder = ProxyingRecorder(None, None, 'sha1', url='http://example.com')
recorded_url = RecordedUrl(url='http://example.com', content_type='text/plain',
status=200, client_ip='127.0.0.2',
request_data=b'abc',
response_recorder=recorder,
remote_ip='127.0.0.3',
timestamp=datetime.utcnow())
import time
import warcprox
import io
import tempfile
import logging
def lock_file(queue, filename):
"""Try to lock file and return 1 if successful, else return 0.
@ -36,6 +52,13 @@ def test_warc_writer_locking(tmpdir):
When we don't have the .open suffix, WarcWriter locks the file and the
external process trying to ``lock_file`` fails (result=0).
"""
recorder = ProxyingRecorder(None, None, 'sha1', url='http://example.com')
recorded_url = RecordedUrl(
url='http://example.com', content_type='text/plain', status=200,
client_ip='127.0.0.2', request_data=b'abc',
response_recorder=recorder, remote_ip='127.0.0.3',
timestamp=datetime.utcnow())
dirname = os.path.dirname(str(tmpdir.mkdir('test-warc-writer')))
wwriter = WarcWriter(Options(directory=dirname, no_warc_open_suffix=True))
wwriter.write_records(recorded_url)
@ -55,3 +78,88 @@ def test_warc_writer_locking(tmpdir):
p.start()
p.join()
assert queue.get() == 'OBTAINED LOCK'
def wait(callback, timeout, interval=0.5):
    """Poll ``callback`` until it returns a truthy value or time runs out.

    Args:
        callback: zero-argument callable, invoked repeatedly.
        timeout: maximum number of seconds to keep polling.
        interval: seconds to sleep between polls (defaults to 0.5).

    Returns:
        None, as soon as ``callback()`` returns a truthy value.

    Raises:
        Exception: if ``callback()`` has not returned a truthy value by the
            time ``timeout`` seconds have elapsed.
    """
    # Monotonic clock: a wall-clock adjustment while we are waiting must not
    # shorten or lengthen the timeout.
    deadline = time.monotonic() + timeout
    while True:
        # Checking before looking at the clock guarantees at least one
        # evaluation (even with timeout <= 0) and one last evaluation after
        # the final sleep, so a condition that became true during that sleep
        # is not misreported as a timeout.
        if callback():
            return
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        # Never sleep past the deadline.
        time.sleep(min(interval, remaining))
    raise Exception('timed out waiting for %s to return truthy' % callback)
def test_special_dont_write_prefix():
    """Check that the special warc prefix "-" means "do not archive".

    Two writer threads are exercised in turn from inside a scratch directory:

    * one whose options set ``prefix='-'``: a plain url is expected to yield
      no records, while a url carrying its own ``warc-prefix`` in
      warcprox-meta is expected to yield some;
    * one started without options: a plain url is expected to yield records,
      while a url whose warcprox-meta ``warc-prefix`` is ``'-'`` is expected
      to yield none.

    In both cases the listener is expected to be notified once per url,
    whether or not anything was written for it.
    """
    class NotifyMe:
        """Listener that remembers every ``(recorded_url, records)`` pair."""

        def __init__(self):
            self.the_list = []

        def notify(self, recorded_url, records):
            self.the_list.append((recorded_url, records))

    def enqueue(q, url, **extra):
        """Record a small payload for ``url`` and put it on ``q``.

        ``extra`` is forwarded verbatim to ``RecordedUrl`` (used here only
        for ``warcprox_meta``).
        """
        recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None)
        recorder.read()
        q.put(RecordedUrl(
            url=url, content_type='text/plain',
            status=200, client_ip='127.0.0.2', request_data=b'abc',
            response_recorder=recorder, remote_ip='127.0.0.3',
            timestamp=datetime.utcnow(),
            payload_digest=recorder.block_digest, **extra))

    def run_writer_thread(urls, **thread_kwargs):
        """Push ``urls`` through a fresh writer thread.

        ``urls`` is a sequence of ``(url, extra_kwargs)`` pairs and
        ``thread_kwargs`` is forwarded to ``WarcWriterThread``. Returns the
        listener's list of notifications once one has arrived per url; the
        thread is always stopped and joined before returning or raising.
        """
        q = warcprox.TimestampedQueue(maxsize=1)
        listener = NotifyMe()
        wwt = warcprox.writerthread.WarcWriterThread(
            recorded_url_q=q, listeners=[listener], **thread_kwargs)
        try:
            wwt.start()
            for url, extra in urls:
                enqueue(q, url, **extra)
            wait(lambda: len(listener.the_list) == len(urls), 10.0)
        finally:
            wwt.stop.set()
            wwt.join()
        return listener.the_list

    previous_cwd = os.getcwd()
    with tempfile.TemporaryDirectory() as tmpdir:
        logging.debug('cd %s', tmpdir)
        os.chdir(tmpdir)
        try:
            # Writer thread configured with the "do not archive" prefix.
            notified = run_writer_thread(
                [
                    # not to be written: no prefix of its own, so the
                    # thread-level prefix '-' applies
                    ('http://example.com/no', {}),
                    # to be written due to warcprox-meta prefix
                    ('http://example.com/yes',
                     {'warcprox_meta': {'warc-prefix': 'normal-warc-prefix'}}),
                ],
                options=Options(prefix='-'))
            assert not notified[0][1]
            assert notified[1][1]

            # Writer thread started without options (default prefix).
            notified = run_writer_thread(
                [
                    # to be written due to default prefix
                    ('http://example.com/yes', {}),
                    # not to be written due to warcprox-meta prefix
                    ('http://example.com/no',
                     {'warcprox_meta': {'warc-prefix': '-'}}),
                ])
            assert notified[0][1]
            assert not notified[1][1]
        finally:
            # Leave the scratch directory before TemporaryDirectory removes
            # it; otherwise the process (and every later test) is left with
            # a deleted current working directory.
            os.chdir(previous_cwd)