Merge pull request #48 from vbanos/configurable-warc-filename

Configurable WARC filenames
This commit is contained in:
Noah Levitt 2018-01-15 16:43:35 -08:00 committed by GitHub
commit 9260367831
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 55 additions and 4 deletions

View File

@ -24,6 +24,7 @@ import fcntl
from multiprocessing import Process, Queue from multiprocessing import Process, Queue
from datetime import datetime from datetime import datetime
import pytest import pytest
import re
from warcprox.mitmproxy import ProxyingRecorder from warcprox.mitmproxy import ProxyingRecorder
from warcprox.warcproxy import RecordedUrl from warcprox.warcproxy import RecordedUrl
from warcprox.writer import WarcWriter from warcprox.writer import WarcWriter
@ -163,3 +164,21 @@ def test_special_dont_write_prefix():
wwt.stop.set() wwt.stop.set()
wwt.join() wwt.join()
def test_warc_writer_filename(tmpdir):
    """Check that WarcWriter names WARC files from a custom template.

    A writer is configured with the ``warc_filename`` template
    ``{timestamp17}_{prefix}_{timestamp14}_{serialno}``; after one record is
    written, the path of the open WARC file must follow that template and
    the file must be present in the target directory.
    """
    recorder = ProxyingRecorder(None, None, 'sha1', url='http://example.com')
    recorded_url = RecordedUrl(
            url='http://example.com', content_type='text/plain', status=200,
            client_ip='127.0.0.2', request_data=b'abc',
            response_recorder=recorder, remote_ip='127.0.0.3',
            timestamp=datetime.utcnow())

    # dirname() of the new subdirectory is its parent, so that directory
    # always contains at least the 'test-warc-writer' entry.
    dirname = os.path.dirname(str(tmpdir.mkdir('test-warc-writer')))
    wwriter = WarcWriter(Options(directory=dirname, prefix='foo',
        warc_filename='{timestamp17}_{prefix}_{timestamp14}_{serialno}'))
    wwriter.write_records(recorded_url)

    # Look for the file the writer reports; a bare non-empty listing would
    # pass even if nothing had been written.
    assert os.path.basename(wwriter._fpath) in os.listdir(dirname)
    # Raw string avoids invalid-escape warnings for \d; the dots are escaped
    # so they match only a literal '.', and the match is anchored at the end.
    assert re.search(
            r'\d{17}_foo_\d{14}_00000\.warc\.open$', wwriter._fpath)

View File

@ -77,6 +77,9 @@ def _build_arg_parser(prog):
help='where to store and load generated certificates') help='where to store and load generated certificates')
arg_parser.add_argument('-d', '--dir', dest='directory', arg_parser.add_argument('-d', '--dir', dest='directory',
default='./warcs', help='where to write warcs') default='./warcs', help='where to write warcs')
arg_parser.add_argument('--warc-filename', dest='warc_filename',
default='{prefix}-{timestamp17}-{serialno}-{randomtoken}',
help='define custom WARC filename with variables {prefix}, {timestamp14}, {timestamp17}, {serialno}, {randomtoken}, {hostname}, {shorthostname}')
arg_parser.add_argument('-z', '--gzip', dest='gzip', action='store_true', arg_parser.add_argument('-z', '--gzip', dest='gzip', action='store_true',
help='write gzip-compressed warc records') help='write gzip-compressed warc records')
arg_parser.add_argument('--no-warc-open-suffix', dest='no_warc_open_suffix', arg_parser.add_argument('--no-warc-open-suffix', dest='no_warc_open_suffix',

View File

@ -28,6 +28,7 @@ import fcntl
import time import time
import warcprox import warcprox
import os import os
import socket
import string import string
import random import random
import threading import threading
@ -42,6 +43,8 @@ class WarcWriter:
self._last_activity = time.time() self._last_activity = time.time()
self.gzip = options.gzip or False self.gzip = options.gzip or False
self.warc_filename = options.warc_filename or \
'{prefix}-{timestamp17}-{randomtoken}-{serialno}'
digest_algorithm = options.digest_algorithm or 'sha1' digest_algorithm = options.digest_algorithm or 'sha1'
base32 = options.base32 base32 = options.base32
self.record_builder = warcprox.warc.WarcRecordBuilder( self.record_builder = warcprox.warc.WarcRecordBuilder(
@ -68,6 +71,10 @@ class WarcWriter:
now = datetime.utcnow() now = datetime.utcnow()
return '{:%Y%m%d%H%M%S}{:03d}'.format(now, now.microsecond//1000) return '{:%Y%m%d%H%M%S}{:03d}'.format(now, now.microsecond//1000)
def timestamp14(self):
    """Return the current UTC time as a 14-digit ``YYYYmmddHHMMSS`` string."""
    return datetime.utcnow().strftime('%Y%m%d%H%M%S')
def close_writer(self): def close_writer(self):
with self._lock: with self._lock:
if self._fpath: if self._fpath:
@ -86,8 +93,32 @@ class WarcWriter:
self._fpath = None self._fpath = None
self._f = None self._f = None
def serial(self):
    """Return ``self._serial`` as a decimal string zero-padded to 5 digits."""
    return format(self._serial, '05d')
# h3 default <!-- <property name="template" value="${prefix}-${timestamp17}-${serialno}-${heritrix.pid}~${heritrix.hostname}~${heritrix.port}" /> --> # h3 default <!-- <property name="template" value="${prefix}-${timestamp17}-${serialno}-${heritrix.pid}~${heritrix.hostname}~${heritrix.port}" /> -->
def _warc_filename(self):
    """Build the name of the next WARC file from ``self.warc_filename``.

    The template is a ``str.format`` pattern (set with the CLI parameter
    ``--warc-filename``) that may reference these fields: ``prefix``,
    ``timestamp14``, ``timestamp17``, ``serialno``, ``randomtoken``,
    ``hostname`` and ``shorthostname``.

    The extension is appended automatically: ``.warc.gz`` when ``self.gzip``
    is true, otherwise ``.warc``.
    """
    fqdn = socket.getfqdn()
    fields = dict(
        prefix=self.prefix,
        timestamp14=self.timestamp14(),
        timestamp17=self.timestamp17(),
        serialno=self.serial(),
        randomtoken=self._randomtoken,
        hostname=fqdn,
        # first dot-separated label of the fully qualified name
        shorthostname=fqdn.split('.')[0],
    )
    stem = self.warc_filename.format(**fields)
    return stem + ('.warc.gz' if self.gzip else '.warc')
def _writer(self): def _writer(self):
with self._lock: with self._lock:
if self._fpath and os.path.getsize( if self._fpath and os.path.getsize(
@ -95,9 +126,7 @@ class WarcWriter:
self.close_writer() self.close_writer()
if self._f == None: if self._f == None:
self._f_finalname = '{}-{}-{:05d}-{}.warc{}'.format( self._f_finalname = self._warc_filename()
self.prefix, self.timestamp17(), self._serial,
self._randomtoken, '.gz' if self.gzip else '')
self._fpath = os.path.sep.join([ self._fpath = os.path.sep.join([
self.directory, self._f_finalname + self._f_open_suffix]) self.directory, self._f_finalname + self._f_open_suffix])