mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
Merge pull request #43 from vbanos/no-warc-open-suffix
Add hidden --no-warc-open-suffix CLI option
This commit is contained in:
commit
35100581ee
57
tests/test_writer.py
Normal file
57
tests/test_writer.py
Normal file
@ -0,0 +1,57 @@
|
|||||||
|
import os
|
||||||
|
import fcntl
|
||||||
|
from multiprocessing import Process, Queue
|
||||||
|
from datetime import datetime
|
||||||
|
import pytest
|
||||||
|
from warcprox.mitmproxy import ProxyingRecorder
|
||||||
|
from warcprox.warcproxy import RecordedUrl
|
||||||
|
from warcprox.writer import WarcWriter
|
||||||
|
from warcprox import Options
|
||||||
|
|
||||||
|
recorder = ProxyingRecorder(None, None, 'sha1', url='http://example.com')
|
||||||
|
|
||||||
|
recorded_url = RecordedUrl(url='http://example.com', content_type='text/plain',
|
||||||
|
status=200, client_ip='127.0.0.2',
|
||||||
|
request_data=b'abc',
|
||||||
|
response_recorder=recorder,
|
||||||
|
remote_ip='127.0.0.3',
|
||||||
|
timestamp=datetime.utcnow())
|
||||||
|
|
||||||
|
|
||||||
|
def lock_file(queue, filename):
|
||||||
|
"""Try to lock file and return 1 if successful, else return 0.
|
||||||
|
It is necessary to run this method in a different process to test locking.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
fi = open(filename, 'ab')
|
||||||
|
fcntl.lockf(fi, fcntl.LOCK_EX | fcntl.LOCK_NB)
|
||||||
|
fi.close()
|
||||||
|
queue.put('OBTAINED LOCK')
|
||||||
|
except IOError:
|
||||||
|
queue.put('FAILED TO OBTAIN LOCK')
|
||||||
|
|
||||||
|
|
||||||
|
def test_warc_writer_locking(tmpdir):
|
||||||
|
"""Test if WarcWriter is locking WARC files.
|
||||||
|
When we don't have the .open suffix, WarcWriter locks the file and the
|
||||||
|
external process trying to ``lock_file`` fails (result=0).
|
||||||
|
"""
|
||||||
|
dirname = os.path.dirname(str(tmpdir.mkdir('test-warc-writer')))
|
||||||
|
wwriter = WarcWriter(Options(directory=dirname, no_warc_open_suffix=True))
|
||||||
|
wwriter.write_records(recorded_url)
|
||||||
|
warcs = [fn for fn in os.listdir(dirname) if fn.endswith('.warc')]
|
||||||
|
assert warcs
|
||||||
|
target_warc = os.path.join(dirname, warcs[0])
|
||||||
|
# launch another process and try to lock WARC file
|
||||||
|
queue = Queue()
|
||||||
|
p = Process(target=lock_file, args=(queue, target_warc))
|
||||||
|
p.start()
|
||||||
|
p.join()
|
||||||
|
assert queue.get() == 'FAILED TO OBTAIN LOCK'
|
||||||
|
wwriter.close_writer()
|
||||||
|
|
||||||
|
# locking must succeed after writer has closed the WARC file.
|
||||||
|
p = Process(target=lock_file, args=(queue, target_warc))
|
||||||
|
p.start()
|
||||||
|
p.join()
|
||||||
|
assert queue.get() == 'OBTAINED LOCK'
|
@ -78,6 +78,8 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
|
|||||||
default='./warcs', help='where to write warcs')
|
default='./warcs', help='where to write warcs')
|
||||||
arg_parser.add_argument('-z', '--gzip', dest='gzip', action='store_true',
|
arg_parser.add_argument('-z', '--gzip', dest='gzip', action='store_true',
|
||||||
help='write gzip-compressed warc records')
|
help='write gzip-compressed warc records')
|
||||||
|
arg_parser.add_argument('--no-warc-open-suffix', dest='no_warc_open_suffix',
|
||||||
|
default=False, action='store_true', help=argparse.SUPPRESS)
|
||||||
arg_parser.add_argument('-n', '--prefix', dest='prefix',
|
arg_parser.add_argument('-n', '--prefix', dest='prefix',
|
||||||
default='WARCPROX', help='WARC filename prefix')
|
default='WARCPROX', help='WARC filename prefix')
|
||||||
arg_parser.add_argument(
|
arg_parser.add_argument(
|
||||||
|
@ -24,6 +24,7 @@ from __future__ import absolute_import
|
|||||||
import logging
|
import logging
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from hanzo import warctools
|
from hanzo import warctools
|
||||||
|
import fcntl
|
||||||
import time
|
import time
|
||||||
import warcprox
|
import warcprox
|
||||||
import os
|
import os
|
||||||
@ -53,6 +54,7 @@ class WarcWriter:
|
|||||||
self._f = None
|
self._f = None
|
||||||
self._fpath = None
|
self._fpath = None
|
||||||
self._f_finalname = None
|
self._f_finalname = None
|
||||||
|
self._f_open_suffix = '' if options.no_warc_open_suffix else '.open'
|
||||||
self._serial = 0
|
self._serial = 0
|
||||||
self._lock = threading.RLock()
|
self._lock = threading.RLock()
|
||||||
|
|
||||||
@ -70,6 +72,12 @@ class WarcWriter:
|
|||||||
with self._lock:
|
with self._lock:
|
||||||
if self._fpath:
|
if self._fpath:
|
||||||
self.logger.info('closing %s', self._f_finalname)
|
self.logger.info('closing %s', self._f_finalname)
|
||||||
|
if self._f_open_suffix == '':
|
||||||
|
try:
|
||||||
|
fcntl.lockf(self._f, fcntl.LOCK_UN)
|
||||||
|
except IOError as exc:
|
||||||
|
self.logger.error('could not unlock file %s (%s)',
|
||||||
|
self._fpath, exc)
|
||||||
self._f.close()
|
self._f.close()
|
||||||
finalpath = os.path.sep.join(
|
finalpath = os.path.sep.join(
|
||||||
[self.directory, self._f_finalname])
|
[self.directory, self._f_finalname])
|
||||||
@ -91,9 +99,17 @@ class WarcWriter:
|
|||||||
self.prefix, self.timestamp17(), self._serial,
|
self.prefix, self.timestamp17(), self._serial,
|
||||||
self._randomtoken, '.gz' if self.gzip else '')
|
self._randomtoken, '.gz' if self.gzip else '')
|
||||||
self._fpath = os.path.sep.join([
|
self._fpath = os.path.sep.join([
|
||||||
self.directory, self._f_finalname + '.open'])
|
self.directory, self._f_finalname + self._f_open_suffix])
|
||||||
|
|
||||||
self._f = open(self._fpath, 'wb')
|
self._f = open(self._fpath, 'wb')
|
||||||
|
# if no '.open' suffix is used for WARC, acquire an exclusive
|
||||||
|
# file lock.
|
||||||
|
if self._f_open_suffix == '':
|
||||||
|
try:
|
||||||
|
fcntl.lockf(self._f, fcntl.LOCK_EX | fcntl.LOCK_NB)
|
||||||
|
except IOError as exc:
|
||||||
|
self.logger.error('could not lock file %s (%s)',
|
||||||
|
self._fpath, exc)
|
||||||
|
|
||||||
warcinfo_record = self.record_builder.build_warcinfo_record(
|
warcinfo_record = self.record_builder.build_warcinfo_record(
|
||||||
self._f_finalname)
|
self._f_finalname)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user