mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
Merge pull request #48 from vbanos/configurable-warc-filename
Configurable WARC filenames
This commit is contained in:
commit
9260367831
@ -24,6 +24,7 @@ import fcntl
|
||||
from multiprocessing import Process, Queue
|
||||
from datetime import datetime
|
||||
import pytest
|
||||
import re
|
||||
from warcprox.mitmproxy import ProxyingRecorder
|
||||
from warcprox.warcproxy import RecordedUrl
|
||||
from warcprox.writer import WarcWriter
|
||||
@ -163,3 +164,21 @@ def test_special_dont_write_prefix():
|
||||
wwt.stop.set()
|
||||
wwt.join()
|
||||
|
||||
|
||||
def test_warc_writer_filename(tmpdir):
    """Check that WarcWriter honours a custom ``warc_filename`` template.

    One recorded URL is written using the template
    ``{timestamp17}_{prefix}_{timestamp14}_{serialno}``. The test then
    verifies that a file with a matching name exists in the output
    directory and that the writer's current path follows the same pattern.
    """
    recorder = ProxyingRecorder(None, None, 'sha1', url='http://example.com')
    recorded_url = RecordedUrl(
        url='http://example.com', content_type='text/plain', status=200,
        client_ip='127.0.0.2', request_data=b'abc',
        response_recorder=recorder, remote_ip='127.0.0.3',
        timestamp=datetime.utcnow())

    # Write into the freshly created, empty directory itself. Using its
    # parent would make the directory listing non-empty before anything is
    # written, so the existence check below could never fail.
    dirname = str(tmpdir.mkdir('test-warc-writer'))
    wwriter = WarcWriter(Options(directory=dirname, prefix='foo',
        warc_filename='{timestamp17}_{prefix}_{timestamp14}_{serialno}'))
    wwriter.write_records(recorded_url)

    # Raw string so ``\d`` is a regex class, not a string escape; dots are
    # escaped so they only match literal dots in the extension.
    expected = re.compile(r'\d{17}_foo_\d{14}_00000\.warc\.open$')
    warcs = os.listdir(dirname)
    assert any(expected.search(fn) for fn in warcs)
    assert expected.search(wwriter._fpath)
|
||||
|
@ -77,6 +77,9 @@ def _build_arg_parser(prog):
|
||||
help='where to store and load generated certificates')
|
||||
arg_parser.add_argument('-d', '--dir', dest='directory',
|
||||
default='./warcs', help='where to write warcs')
|
||||
arg_parser.add_argument('--warc-filename', dest='warc_filename',
|
||||
default='{prefix}-{timestamp17}-{serialno}-{randomtoken}',
|
||||
help='define custom WARC filename with variables {prefix}, {timestamp14}, {timestamp17}, {serialno}, {randomtoken}, {hostname}, {shorthostname}')
|
||||
arg_parser.add_argument('-z', '--gzip', dest='gzip', action='store_true',
|
||||
help='write gzip-compressed warc records')
|
||||
arg_parser.add_argument('--no-warc-open-suffix', dest='no_warc_open_suffix',
|
||||
|
@ -28,6 +28,7 @@ import fcntl
|
||||
import time
|
||||
import warcprox
|
||||
import os
|
||||
import socket
|
||||
import string
|
||||
import random
|
||||
import threading
|
||||
@ -42,6 +43,8 @@ class WarcWriter:
|
||||
self._last_activity = time.time()
|
||||
|
||||
self.gzip = options.gzip or False
|
||||
self.warc_filename = options.warc_filename or \
|
||||
'{prefix}-{timestamp17}-{randomtoken}-{serialno}'
|
||||
digest_algorithm = options.digest_algorithm or 'sha1'
|
||||
base32 = options.base32
|
||||
self.record_builder = warcprox.warc.WarcRecordBuilder(
|
||||
@ -68,6 +71,10 @@ class WarcWriter:
|
||||
now = datetime.utcnow()
|
||||
return '{:%Y%m%d%H%M%S}{:03d}'.format(now, now.microsecond//1000)
|
||||
|
||||
def timestamp14(self):
    """Return the current UTC time as a 14-digit ``YYYYmmddHHMMSS`` string."""
    return datetime.utcnow().strftime('%Y%m%d%H%M%S')
|
||||
|
||||
def close_writer(self):
|
||||
with self._lock:
|
||||
if self._fpath:
|
||||
@ -86,8 +93,32 @@ class WarcWriter:
|
||||
self._fpath = None
|
||||
self._f = None
|
||||
|
||||
def serial(self):
    """Return the writer's serial number zero-padded to five digits."""
    return format(self._serial, '05d')
|
||||
|
||||
# h3 default <!-- <property name="template" value="${prefix}-${timestamp17}-${serialno}-${heritrix.pid}~${heritrix.hostname}~${heritrix.port}" /> -->
|
||||
# ${prefix}-${timestamp17}-${randomtoken}-${serialno}.warc.gz
|
||||
def _warc_filename(self):
|
||||
"""WARC filename is configurable with CLI parameter --warc-filename.
|
||||
Default: '{prefix}-{timestamp17}-{serialno}-{randomtoken}'
|
||||
Available variables are: prefix, timestamp14, timestamp17, serialno,
|
||||
randomtoken, hostname, shorthostname.
|
||||
Extension ``.warc`` or ``.warc.gz`` is appended automatically.
|
||||
"""
|
||||
hostname = socket.getfqdn()
|
||||
shorthostname = hostname.split('.')[0]
|
||||
fname = self.warc_filename.format(prefix=self.prefix,
|
||||
timestamp14=self.timestamp14(),
|
||||
timestamp17=self.timestamp17(),
|
||||
serialno=self.serial(),
|
||||
randomtoken=self._randomtoken,
|
||||
hostname=hostname,
|
||||
shorthostname=shorthostname)
|
||||
if self.gzip:
|
||||
fname = fname + '.warc.gz'
|
||||
else:
|
||||
fname = fname + '.warc'
|
||||
return fname
|
||||
|
||||
def _writer(self):
|
||||
with self._lock:
|
||||
if self._fpath and os.path.getsize(
|
||||
@ -95,9 +126,7 @@ class WarcWriter:
|
||||
self.close_writer()
|
||||
|
||||
if self._f == None:
|
||||
self._f_finalname = '{}-{}-{:05d}-{}.warc{}'.format(
|
||||
self.prefix, self.timestamp17(), self._serial,
|
||||
self._randomtoken, '.gz' if self.gzip else '')
|
||||
self._f_finalname = self._warc_filename()
|
||||
self._fpath = os.path.sep.join([
|
||||
self.directory, self._f_finalname + self._f_open_suffix])
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user