mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
Merge pull request #48 from vbanos/configurable-warc-filename
Configurable WARC filenames
This commit is contained in:
commit
9260367831
@ -24,6 +24,7 @@ import fcntl
|
|||||||
from multiprocessing import Process, Queue
|
from multiprocessing import Process, Queue
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
import pytest
|
import pytest
|
||||||
|
import re
|
||||||
from warcprox.mitmproxy import ProxyingRecorder
|
from warcprox.mitmproxy import ProxyingRecorder
|
||||||
from warcprox.warcproxy import RecordedUrl
|
from warcprox.warcproxy import RecordedUrl
|
||||||
from warcprox.writer import WarcWriter
|
from warcprox.writer import WarcWriter
|
||||||
@ -163,3 +164,21 @@ def test_special_dont_write_prefix():
|
|||||||
wwt.stop.set()
|
wwt.stop.set()
|
||||||
wwt.join()
|
wwt.join()
|
||||||
|
|
||||||
|
|
||||||
|
def test_warc_writer_filename(tmpdir):
    """Test if WarcWriter is writing WARC files with custom filenames.

    A record is written through a ``WarcWriter`` configured with the template
    ``{timestamp17}_{prefix}_{timestamp14}_{serialno}``; the test then checks
    that a file with the expected name shows up in the output directory.
    """
    recorder = ProxyingRecorder(None, None, 'sha1', url='http://example.com')
    recorded_url = RecordedUrl(
            url='http://example.com', content_type='text/plain', status=200,
            client_ip='127.0.0.2', request_data=b'abc',
            response_recorder=recorder, remote_ip='127.0.0.3',
            timestamp=datetime.utcnow())

    # Write into the freshly created (empty) directory itself. Using its
    # parent would make the listing below always contain 'test-warc-writer',
    # so ``assert warcs`` could never fail.
    dirname = str(tmpdir.mkdir('test-warc-writer'))
    wwriter = WarcWriter(Options(directory=dirname, prefix='foo',
        warc_filename='{timestamp17}_{prefix}_{timestamp14}_{serialno}'))
    wwriter.write_records(recorded_url)

    warcs = os.listdir(dirname)
    assert warcs

    # Raw string avoids the invalid '\d' escape; dots are escaped and the
    # pattern is anchored so only the exact expected name is accepted.
    expected_name = re.compile(r'\d{17}_foo_\d{14}_00000\.warc\.open')
    assert re.search(
            r'\d{17}_foo_\d{14}_00000\.warc\.open$', wwriter._fpath)
    current_name = os.path.basename(wwriter._fpath)
    assert expected_name.fullmatch(current_name)
    assert current_name in warcs
|
||||||
|
@ -77,6 +77,9 @@ def _build_arg_parser(prog):
|
|||||||
help='where to store and load generated certificates')
|
help='where to store and load generated certificates')
|
||||||
arg_parser.add_argument('-d', '--dir', dest='directory',
|
arg_parser.add_argument('-d', '--dir', dest='directory',
|
||||||
default='./warcs', help='where to write warcs')
|
default='./warcs', help='where to write warcs')
|
||||||
|
arg_parser.add_argument('--warc-filename', dest='warc_filename',
|
||||||
|
default='{prefix}-{timestamp17}-{serialno}-{randomtoken}',
|
||||||
|
help='define custom WARC filename with variables {prefix}, {timestamp14}, {timestamp17}, {serialno}, {randomtoken}, {hostname}, {shorthostname}')
|
||||||
arg_parser.add_argument('-z', '--gzip', dest='gzip', action='store_true',
|
arg_parser.add_argument('-z', '--gzip', dest='gzip', action='store_true',
|
||||||
help='write gzip-compressed warc records')
|
help='write gzip-compressed warc records')
|
||||||
arg_parser.add_argument('--no-warc-open-suffix', dest='no_warc_open_suffix',
|
arg_parser.add_argument('--no-warc-open-suffix', dest='no_warc_open_suffix',
|
||||||
|
@ -28,6 +28,7 @@ import fcntl
|
|||||||
import time
|
import time
|
||||||
import warcprox
|
import warcprox
|
||||||
import os
|
import os
|
||||||
|
import socket
|
||||||
import string
|
import string
|
||||||
import random
|
import random
|
||||||
import threading
|
import threading
|
||||||
@ -42,6 +43,8 @@ class WarcWriter:
|
|||||||
self._last_activity = time.time()
|
self._last_activity = time.time()
|
||||||
|
|
||||||
self.gzip = options.gzip or False
|
self.gzip = options.gzip or False
|
||||||
|
self.warc_filename = options.warc_filename or \
|
||||||
|
'{prefix}-{timestamp17}-{randomtoken}-{serialno}'
|
||||||
digest_algorithm = options.digest_algorithm or 'sha1'
|
digest_algorithm = options.digest_algorithm or 'sha1'
|
||||||
base32 = options.base32
|
base32 = options.base32
|
||||||
self.record_builder = warcprox.warc.WarcRecordBuilder(
|
self.record_builder = warcprox.warc.WarcRecordBuilder(
|
||||||
@ -68,6 +71,10 @@ class WarcWriter:
|
|||||||
now = datetime.utcnow()
|
now = datetime.utcnow()
|
||||||
return '{:%Y%m%d%H%M%S}{:03d}'.format(now, now.microsecond//1000)
|
return '{:%Y%m%d%H%M%S}{:03d}'.format(now, now.microsecond//1000)
|
||||||
|
|
||||||
|
def timestamp14(self):
    """Return the current UTC time as a 14-digit ``YYYYmmddHHMMSS`` string."""
    return datetime.utcnow().strftime('%Y%m%d%H%M%S')
|
||||||
|
|
||||||
def close_writer(self):
|
def close_writer(self):
|
||||||
with self._lock:
|
with self._lock:
|
||||||
if self._fpath:
|
if self._fpath:
|
||||||
@ -86,8 +93,32 @@ class WarcWriter:
|
|||||||
self._fpath = None
|
self._fpath = None
|
||||||
self._f = None
|
self._f = None
|
||||||
|
|
||||||
|
def serial(self):
    """Return ``self._serial`` as a zero-padded string of at least 5 digits."""
    return format(self._serial, '05d')
|
||||||
|
|
||||||
# h3 default <!-- <property name="template" value="${prefix}-${timestamp17}-${serialno}-${heritrix.pid}~${heritrix.hostname}~${heritrix.port}" /> -->
|
# h3 default <!-- <property name="template" value="${prefix}-${timestamp17}-${serialno}-${heritrix.pid}~${heritrix.hostname}~${heritrix.port}" /> -->
|
||||||
# ${prefix}-${timestamp17}-${randomtoken}-${serialno}.warc.gz"
|
def _warc_filename(self):
    """Build a WARC file name from the ``self.warc_filename`` template.

    WARC filename is configurable with CLI parameter --warc-filename.
    Default: '{prefix}-{timestamp17}-{serialno}-{randomtoken}'
    Available variables are: prefix, timestamp14, timestamp17, serialno,
    randomtoken, hostname, shorthostname.
    Extension ``.warc`` or ``.warc.gz`` is appended automatically.

    ``timestamp14`` is taken from the first 14 digits of ``timestamp17`` so
    that both variables always describe the same instant.

    Returns:
        The expanded file name with its extension (no directory part).

    Raises:
        KeyError: if the template references a variable not listed above.
    """
    hostname = socket.getfqdn()
    shorthostname = hostname.split('.')[0]

    # Read the clock only once. timestamp17 is the 14-digit second-resolution
    # stamp followed by 3 millisecond digits, so slicing it yields exactly
    # the matching timestamp14. Two separate clock reads could straddle a
    # second boundary and produce inconsistent values in one file name.
    timestamp17 = self.timestamp17()
    timestamp14 = timestamp17[:14]

    fname = self.warc_filename.format(
        prefix=self.prefix,
        timestamp14=timestamp14,
        timestamp17=timestamp17,
        serialno=self.serial(),
        randomtoken=self._randomtoken,
        hostname=hostname,
        shorthostname=shorthostname)
    return fname + ('.warc.gz' if self.gzip else '.warc')
|
||||||
|
|
||||||
def _writer(self):
|
def _writer(self):
|
||||||
with self._lock:
|
with self._lock:
|
||||||
if self._fpath and os.path.getsize(
|
if self._fpath and os.path.getsize(
|
||||||
@ -95,9 +126,7 @@ class WarcWriter:
|
|||||||
self.close_writer()
|
self.close_writer()
|
||||||
|
|
||||||
if self._f == None:
|
if self._f == None:
|
||||||
self._f_finalname = '{}-{}-{:05d}-{}.warc{}'.format(
|
self._f_finalname = self._warc_filename()
|
||||||
self.prefix, self.timestamp17(), self._serial,
|
|
||||||
self._randomtoken, '.gz' if self.gzip else '')
|
|
||||||
self._fpath = os.path.sep.join([
|
self._fpath = os.path.sep.join([
|
||||||
self.directory, self._f_finalname + self._f_open_suffix])
|
self.directory, self._f_finalname + self._f_open_suffix])
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user