test for special warc prefix "-" which means "do not archive"

This commit is contained in:
Noah Levitt 2017-12-21 14:31:54 -08:00
parent 399853dea0
commit 9784c91459
2 changed files with 119 additions and 11 deletions

View File

@ -52,7 +52,7 @@ except:
setuptools.setup( setuptools.setup(
name='warcprox', name='warcprox',
version='2.3.1b4.dev128', version='2.3.1b4.dev129',
description='WARC writing MITM HTTP/S proxy', description='WARC writing MITM HTTP/S proxy',
url='https://github.com/internetarchive/warcprox', url='https://github.com/internetarchive/warcprox',
author='Noah Levitt', author='Noah Levitt',

View File

@ -1,3 +1,24 @@
'''
tests/test_writer.py - warcprox warc writing tests
Copyright (C) 2017 Internet Archive
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
USA.
'''
import os import os
import fcntl import fcntl
from multiprocessing import Process, Queue from multiprocessing import Process, Queue
@ -7,16 +28,11 @@ from warcprox.mitmproxy import ProxyingRecorder
from warcprox.warcproxy import RecordedUrl from warcprox.warcproxy import RecordedUrl
from warcprox.writer import WarcWriter from warcprox.writer import WarcWriter
from warcprox import Options from warcprox import Options
import time
recorder = ProxyingRecorder(None, None, 'sha1', url='http://example.com') import warcprox
import io
recorded_url = RecordedUrl(url='http://example.com', content_type='text/plain', import tempfile
status=200, client_ip='127.0.0.2', import logging
request_data=b'abc',
response_recorder=recorder,
remote_ip='127.0.0.3',
timestamp=datetime.utcnow())
def lock_file(queue, filename): def lock_file(queue, filename):
"""Try to lock file and return 1 if successful, else return 0. """Try to lock file and return 1 if successful, else return 0.
@ -36,6 +52,13 @@ def test_warc_writer_locking(tmpdir):
When we don't have the .open suffix, WarcWriter locks the file and the When we don't have the .open suffix, WarcWriter locks the file and the
external process trying to ``lock_file`` fails (result=0). external process trying to ``lock_file`` fails (result=0).
""" """
recorder = ProxyingRecorder(None, None, 'sha1', url='http://example.com')
recorded_url = RecordedUrl(
url='http://example.com', content_type='text/plain', status=200,
client_ip='127.0.0.2', request_data=b'abc',
response_recorder=recorder, remote_ip='127.0.0.3',
timestamp=datetime.utcnow())
dirname = os.path.dirname(str(tmpdir.mkdir('test-warc-writer'))) dirname = os.path.dirname(str(tmpdir.mkdir('test-warc-writer')))
wwriter = WarcWriter(Options(directory=dirname, no_warc_open_suffix=True)) wwriter = WarcWriter(Options(directory=dirname, no_warc_open_suffix=True))
wwriter.write_records(recorded_url) wwriter.write_records(recorded_url)
@ -55,3 +78,88 @@ def test_warc_writer_locking(tmpdir):
p.start() p.start()
p.join() p.join()
assert queue.get() == 'OBTAINED LOCK' assert queue.get() == 'OBTAINED LOCK'
def wait(callback, timeout, poll_interval=0.5):
    """Poll `callback` until it returns a truthy value or `timeout` elapses.

    Args:
        callback: zero-argument callable, invoked repeatedly.
        timeout: maximum number of seconds to keep polling.
        poll_interval: maximum number of seconds to sleep between calls to
            `callback` (defaults to 0.5).

    Returns:
        None, as soon as `callback()` returns a truthy value.

    Raises:
        Exception: if `callback` has not returned truthy by the deadline.
    """
    # monotonic clock: elapsed time is immune to system clock adjustments
    deadline = time.monotonic() + timeout
    while True:
        # checked before the deadline test so that callback is always
        # evaluated at least once, even with a zero or negative timeout
        if callback():
            return
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        # never sleep past the deadline, so the last check happens right at it
        time.sleep(min(poll_interval, remaining))
    raise Exception('timed out waiting for %s to return truthy' % callback)
def test_special_dont_write_prefix():
    """Check that the special warc prefix "-" means "do not archive".

    Two scenarios are run, each with its own WarcWriterThread and listener:

    1. The thread is started with Options(prefix='-'). A url without
       warcprox-meta must reach the listener with falsy records, and a url
       whose warcprox-meta sets 'warc-prefix' to a normal value must reach
       it with truthy records.
    2. The thread is started with default options. A url without
       warcprox-meta must reach the listener with truthy records, and a url
       whose warcprox-meta sets 'warc-prefix' to '-' must reach it with
       falsy records.
    """
    class NotifyMe:
        # listener that remembers every (recorded_url, records) notification
        def __init__(self):
            self.the_list = []

        def notify(self, recorded_url, records):
            self.the_list.append((recorded_url, records))

    def make_recorded_url(url, **extra):
        # Build the RecordedUrl fixture shared by every case below; `extra`
        # carries optional keyword arguments such as warcprox_meta, which is
        # only passed when a case actually supplies it.
        recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None)
        # read() is called before block_digest is taken -- presumably so the
        # digest reflects the payload; confirm against ProxyingRecorder
        recorder.read()
        return RecordedUrl(
                url=url, content_type='text/plain', status=200,
                client_ip='127.0.0.2', request_data=b'abc',
                response_recorder=recorder, remote_ip='127.0.0.3',
                timestamp=datetime.utcnow(),
                payload_digest=recorder.block_digest, **extra)

    original_cwd = os.getcwd()
    with tempfile.TemporaryDirectory() as tmpdir:
        logging.debug('cd %s', tmpdir)
        # presumably so that anything the writer thread writes relative to
        # the working directory lands in the temporary directory
        os.chdir(tmpdir)
        try:
            q = warcprox.TimestampedQueue(maxsize=1)
            listener = NotifyMe()
            wwt = warcprox.writerthread.WarcWriterThread(
                    recorded_url_q=q, options=Options(prefix='-'),
                    listeners=[listener])
            try:
                wwt.start()

                # not to be written due to default prefix
                q.put(make_recorded_url('http://example.com/no'))

                # to be written due to warcprox-meta prefix
                q.put(make_recorded_url(
                    'http://example.com/yes',
                    warcprox_meta={'warc-prefix': 'normal-warc-prefix'}))

                wait(lambda: len(listener.the_list) == 2, 10.0)
                assert not listener.the_list[0][1]
                assert listener.the_list[1][1]
            finally:
                wwt.stop.set()
                wwt.join()

            q = warcprox.TimestampedQueue(maxsize=1)
            listener = NotifyMe()
            wwt = warcprox.writerthread.WarcWriterThread(
                    recorded_url_q=q, listeners=[listener])
            try:
                wwt.start()

                # to be written due to default prefix
                q.put(make_recorded_url('http://example.com/yes'))

                # not to be written due to warcprox-meta prefix
                q.put(make_recorded_url(
                    'http://example.com/no',
                    warcprox_meta={'warc-prefix': '-'}))

                wait(lambda: len(listener.the_list) == 2, 10.0)
                assert listener.the_list[0][1]
                assert not listener.the_list[1][1]
            finally:
                wwt.stop.set()
                wwt.join()
        finally:
            # leave the temporary directory before it is deleted, so the
            # process is not left with a nonexistent working directory
            os.chdir(original_cwd)