From 9784c91459f8b0a68bf0a9c769a53a10ecbb1afd Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 21 Dec 2017 14:31:54 -0800 Subject: [PATCH] test for special warc prefix "-" which means "do not archive" --- setup.py | 2 +- tests/test_writer.py | 128 +++++++++++++++++++++++++++++++++++++++---- 2 files changed, 119 insertions(+), 11 deletions(-) diff --git a/setup.py b/setup.py index d9ca672..4d1bc35 100755 --- a/setup.py +++ b/setup.py @@ -52,7 +52,7 @@ except: setuptools.setup( name='warcprox', - version='2.3.1b4.dev128', + version='2.3.1b4.dev129', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/tests/test_writer.py b/tests/test_writer.py index 9ce0e13..4474f82 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -1,3 +1,24 @@ +''' +tests/test_writer.py - warcprox warc writing tests + +Copyright (C) 2017 Internet Archive + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +USA. +''' + import os import fcntl from multiprocessing import Process, Queue @@ -7,16 +28,11 @@ from warcprox.mitmproxy import ProxyingRecorder from warcprox.warcproxy import RecordedUrl from warcprox.writer import WarcWriter from warcprox import Options - -recorder = ProxyingRecorder(None, None, 'sha1', url='http://example.com') - -recorded_url = RecordedUrl(url='http://example.com', content_type='text/plain', - status=200, client_ip='127.0.0.2', - request_data=b'abc', - response_recorder=recorder, - remote_ip='127.0.0.3', - timestamp=datetime.utcnow()) - +import time +import warcprox +import io +import tempfile +import logging def lock_file(queue, filename): """Try to lock file and return 1 if successful, else return 0. @@ -36,6 +52,13 @@ def test_warc_writer_locking(tmpdir): When we don't have the .open suffix, WarcWriter locks the file and the external process trying to ``lock_file`` fails (result=0). """ + recorder = ProxyingRecorder(None, None, 'sha1', url='http://example.com') + recorded_url = RecordedUrl( + url='http://example.com', content_type='text/plain', status=200, + client_ip='127.0.0.2', request_data=b'abc', + response_recorder=recorder, remote_ip='127.0.0.3', + timestamp=datetime.utcnow()) + dirname = os.path.dirname(str(tmpdir.mkdir('test-warc-writer'))) wwriter = WarcWriter(Options(directory=dirname, no_warc_open_suffix=True)) wwriter.write_records(recorded_url) @@ -55,3 +78,88 @@ def test_warc_writer_locking(tmpdir): p.start() p.join() assert queue.get() == 'OBTAINED LOCK' + +def wait(callback, timeout): + start = time.time() + while time.time() - start < timeout: + if callback(): + return + time.sleep(0.5) + raise Exception('timed out waiting for %s to return truthy' % callback) + +def test_special_dont_write_prefix(): + class NotifyMe: + def __init__(self): + self.the_list = [] + def notify(self, recorded_url, records): + self.the_list.append((recorded_url, records)) + + with tempfile.TemporaryDirectory() as tmpdir: + logging.debug('cd %s', tmpdir) + os.chdir(tmpdir) + + q = warcprox.TimestampedQueue(maxsize=1) + listener = NotifyMe() + wwt = warcprox.writerthread.WarcWriterThread( + recorded_url_q=q, options=Options(prefix='-'), + listeners=[listener]) + try: + wwt.start() + # not to be written due to default prefix + recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None) + recorder.read() + q.put(RecordedUrl( + url='http://example.com/no', content_type='text/plain', + status=200, client_ip='127.0.0.2', request_data=b'abc', + response_recorder=recorder, remote_ip='127.0.0.3', + timestamp=datetime.utcnow(), + payload_digest=recorder.block_digest)) + # to be written due to warcprox-meta prefix + recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None) + recorder.read() + q.put(RecordedUrl( + url='http://example.com/yes', content_type='text/plain', + status=200, client_ip='127.0.0.2', request_data=b'abc', + response_recorder=recorder, remote_ip='127.0.0.3', + timestamp=datetime.utcnow(), + payload_digest=recorder.block_digest, + warcprox_meta={'warc-prefix': 'normal-warc-prefix'})) + wait(lambda: len(listener.the_list) == 2, 10.0) + assert not listener.the_list[0][1] + assert listener.the_list[1][1] + finally: + wwt.stop.set() + wwt.join() + + q = warcprox.TimestampedQueue(maxsize=1) + listener = NotifyMe() + wwt = warcprox.writerthread.WarcWriterThread( + recorded_url_q=q, listeners=[listener]) + try: + wwt.start() + # to be written due to default prefix + recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None) + recorder.read() + q.put(RecordedUrl( + url='http://example.com/yes', content_type='text/plain', + status=200, client_ip='127.0.0.2', request_data=b'abc', + response_recorder=recorder, remote_ip='127.0.0.3', + timestamp=datetime.utcnow(), + payload_digest=recorder.block_digest)) + # not to be written due to warcprox-meta prefix + recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None) + recorder.read() + q.put(RecordedUrl( + url='http://example.com/no', content_type='text/plain', + status=200, client_ip='127.0.0.2', request_data=b'abc', + response_recorder=recorder, remote_ip='127.0.0.3', + timestamp=datetime.utcnow(), + payload_digest=recorder.block_digest, + warcprox_meta={'warc-prefix': '-'})) + wait(lambda: len(listener.the_list) == 2, 10.0) + assert listener.the_list[0][1] + assert not listener.the_list[1][1] + finally: + wwt.stop.set() + wwt.join() +