From ae23011d845021f28625be400b6c3ae895afa0e9 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Mon, 8 Jan 2018 12:13:05 +0000 Subject: [PATCH 1/9] Configurable WARC filenames New ``--warc-filename`` CLI parameter with default value: ``'{prefix}-{timestamp17}-{serialno}-{randomtoken}'`` (the previous hard-coded WARC filename format). Use variables: ``{prefix}, {timestamp14}, {timestamp17}, {serialno}, {randomtoken}, {hostname}, {shorthostname}`` to define custom WARC filenames. --- warcprox/main.py | 3 +++ warcprox/writer.py | 37 +++++++++++++++++++++++++++++++++---- 2 files changed, 36 insertions(+), 4 deletions(-) diff --git a/warcprox/main.py b/warcprox/main.py index 348dfbf..8bfc3c4 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -77,6 +77,9 @@ def _build_arg_parser(prog): help='where to store and load generated certificates') arg_parser.add_argument('-d', '--dir', dest='directory', default='./warcs', help='where to write warcs') + arg_parser.add_argument('--warc-filename', dest='warc_filename', + default='{prefix}-{timestamp17}-{serialno}-{randomtoken}', + help='define custom WARC filename with variables {prefix}, {timestamp14}, {timestamp17}, {serialno}, {randomtoken}, {hostname}, {shorthostname}') arg_parser.add_argument('-z', '--gzip', dest='gzip', action='store_true', help='write gzip-compressed warc records') arg_parser.add_argument('--no-warc-open-suffix', dest='no_warc_open_suffix', diff --git a/warcprox/writer.py b/warcprox/writer.py index 7a1032a..23dbafb 100644 --- a/warcprox/writer.py +++ b/warcprox/writer.py @@ -28,6 +28,7 @@ import fcntl import time import warcprox import os +import socket import string import random import threading @@ -42,6 +43,8 @@ class WarcWriter: self._last_activity = time.time() self.gzip = options.gzip or False + self.warc_filename = options.warc_filename or \ + '{prefix}-{timestamp17}-{randomtoken}-{serialno}.warc' digest_algorithm = options.digest_algorithm or 'sha1' base32 = options.base32 self.record_builder 
= warcprox.warc.WarcRecordBuilder( @@ -68,6 +71,10 @@ class WarcWriter: now = datetime.utcnow() return '{:%Y%m%d%H%M%S}{:03d}'.format(now, now.microsecond//1000) + def timestamp14(self): + now = datetime.utcnow() + return '{:%Y%m%d%H%M%S}'.format(now) + def close_writer(self): with self._lock: if self._fpath: @@ -86,8 +93,32 @@ class WarcWriter: self._fpath = None self._f = None + def serial(self): + return '{:05d}'.format(self._serial) + # h3 default - # ${prefix}-${timestamp17}-${randomtoken}-${serialno}.warc.gz" + def _warc_filename(self): + """WARC filename is configurable with CLI parameter --warc-filename. + Default: '{prefix}-{timestamp17}-{serialno}-{randomtoken}' + Available variables are: prefix, timestamp14, timestamp17, serialno, + randomtoken, hostname, shorthostname. + Extension ``.warc`` or ``.warc.gz`` is appended automatically. + """ + hostname = socket.getfqdn() + shorthostname = hostname.split(',')[0] + fname = self.warc_filename.format(prefix=self.prefix, + timestamp14=self.timestamp14(), + timestamp17=self.timestamp17(), + serialno=self.serial(), + randomtoken=self._randomtoken, + hostname=hostname, + shorthostname=shorthostname) + if self.gzip: + fname = fname + '.warc.gz' + else: + fname = fname + '.warc' + return fname + def _writer(self): with self._lock: if self._fpath and os.path.getsize( @@ -95,9 +126,7 @@ class WarcWriter: self.close_writer() if self._f == None: - self._f_finalname = '{}-{}-{:05d}-{}.warc{}'.format( - self.prefix, self.timestamp17(), self._serial, - self._randomtoken, '.gz' if self.gzip else '') + self._f_finalname = self._warc_filename() self._fpath = os.path.sep.join([ self.directory, self._f_finalname + self._f_open_suffix]) From ec86f2b3dfdef11e47869819c5f863d76e867521 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Tue, 9 Jan 2018 07:02:39 +0000 Subject: [PATCH 2/9] Fix warc_filename default value Remove redundant `.warc` --- warcprox/writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/warcprox/writer.py b/warcprox/writer.py index 23dbafb..44d21d3 100644 --- a/warcprox/writer.py +++ b/warcprox/writer.py @@ -44,7 +44,7 @@ class WarcWriter: self.gzip = options.gzip or False self.warc_filename = options.warc_filename or \ - '{prefix}-{timestamp17}-{randomtoken}-{serialno}.warc' + '{prefix}-{timestamp17}-{randomtoken}-{serialno}' digest_algorithm = options.digest_algorithm or 'sha1' base32 = options.base32 self.record_builder = warcprox.warc.WarcRecordBuilder( From d2ce61aec9afea6f72a2b4d0e9347bdb9a588afd Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Tue, 9 Jan 2018 12:54:42 +0000 Subject: [PATCH 3/9] Add WarcWriter warc_filename unit test Use custom ``warc_filename`` option and check that the created WARC filename follows the defined pattern. --- tests/test_writer.py | 24 ++++++++++++++++++++++++ warcprox/writer.py | 2 +- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/tests/test_writer.py b/tests/test_writer.py index 4474f82..9b6d53a 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -163,3 +163,27 @@ def test_special_dont_write_prefix(): wwt.stop.set() wwt.join() + +def test_warc_writer_filename(tmpdir): + """Test if WarcWriter is writing WARC files with custom filenames. 
+ """ + recorder = ProxyingRecorder(None, None, 'sha1', url='http://example.com') + recorded_url = RecordedUrl( + url='http://example.com', content_type='text/plain', status=200, + client_ip='127.0.0.2', request_data=b'abc', + response_recorder=recorder, remote_ip='127.0.0.3', + timestamp=datetime.utcnow()) + + dirname = os.path.dirname(str(tmpdir.mkdir('test-warc-writer'))) + wwriter = WarcWriter(Options(directory=dirname, prefix='foo', + warc_filename='{timestamp17}-{prefix}-{timestamp14}-{serialno}')) + wwriter.write_records(recorded_url) + warcs = [fn for fn in os.listdir(dirname)] + assert warcs + target_warc = os.path.join(dirname, warcs[0]) + assert target_warc + parts = warcs[0].split('-') + assert len(parts[0]) == 17 + assert parts[1] == 'foo' + assert len(parts[2]) == 14 + assert parts[3] == '00000.warc.open' diff --git a/warcprox/writer.py b/warcprox/writer.py index 44d21d3..56ff635 100644 --- a/warcprox/writer.py +++ b/warcprox/writer.py @@ -105,7 +105,7 @@ class WarcWriter: Extension ``.warc`` or ``.warc.gz`` is appended automatically. 
""" hostname = socket.getfqdn() - shorthostname = hostname.split(',')[0] + shorthostname = hostname.split('.')[0] fname = self.warc_filename.format(prefix=self.prefix, timestamp14=self.timestamp14(), timestamp17=self.timestamp17(), From 9d789cdae8b8b2e8c681813b99eead3314894fe9 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Wed, 10 Jan 2018 18:41:56 +0000 Subject: [PATCH 4/9] Fix writer unit test --- tests/test_writer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_writer.py b/tests/test_writer.py index 9b6d53a..7294ecc 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -176,13 +176,13 @@ def test_warc_writer_filename(tmpdir): dirname = os.path.dirname(str(tmpdir.mkdir('test-warc-writer'))) wwriter = WarcWriter(Options(directory=dirname, prefix='foo', - warc_filename='{timestamp17}-{prefix}-{timestamp14}-{serialno}')) + warc_filename='{timestamp17}_{prefix}_{timestamp14}_{serialno}')) wwriter.write_records(recorded_url) warcs = [fn for fn in os.listdir(dirname)] assert warcs target_warc = os.path.join(dirname, warcs[0]) assert target_warc - parts = warcs[0].split('-') + parts = warcs[0].split('_') assert len(parts[0]) == 17 assert parts[1] == 'foo' assert len(parts[2]) == 14 From deddd4f850598df68320baa3e4fa95f3dc317d1d Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Wed, 10 Jan 2018 18:52:59 +0000 Subject: [PATCH 5/9] Another fix for the unit test --- tests/test_writer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_writer.py b/tests/test_writer.py index 7294ecc..2ed4e1b 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -176,13 +176,13 @@ def test_warc_writer_filename(tmpdir): dirname = os.path.dirname(str(tmpdir.mkdir('test-warc-writer'))) wwriter = WarcWriter(Options(directory=dirname, prefix='foo', - warc_filename='{timestamp17}_{prefix}_{timestamp14}_{serialno}')) + warc_filename='{timestamp17}-{prefix}-{timestamp14}-{serialno}')) 
wwriter.write_records(recorded_url) warcs = [fn for fn in os.listdir(dirname)] assert warcs target_warc = os.path.join(dirname, warcs[0]) assert target_warc - parts = warcs[0].split('_') + parts = os.path.basename(warcs[0]).split('-') assert len(parts[0]) == 17 assert parts[1] == 'foo' assert len(parts[2]) == 14 From e737a30ec181730676c76da73f94645ffa9bd731 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Wed, 10 Jan 2018 19:29:22 +0000 Subject: [PATCH 6/9] fix github problem with unit test --- tests/test_writer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_writer.py b/tests/test_writer.py index 2ed4e1b..f9fb339 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -176,13 +176,13 @@ def test_warc_writer_filename(tmpdir): dirname = os.path.dirname(str(tmpdir.mkdir('test-warc-writer'))) wwriter = WarcWriter(Options(directory=dirname, prefix='foo', - warc_filename='{timestamp17}-{prefix}-{timestamp14}-{serialno}')) + warc_filename='{timestamp17}_{prefix}_{timestamp14}_{serialno}')) wwriter.write_records(recorded_url) warcs = [fn for fn in os.listdir(dirname)] assert warcs target_warc = os.path.join(dirname, warcs[0]) assert target_warc - parts = os.path.basename(warcs[0]).split('-') + parts = os.path.basename(warcs[0]).split('_') assert len(parts[0]) == 17 assert parts[1] == 'foo' assert len(parts[2]) == 14 From b2c47142de435a83553bbb0c77c9ba2d9419978e Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Wed, 10 Jan 2018 20:38:06 +0000 Subject: [PATCH 7/9] Change the writer unit test To be able to run in github. 
--- tests/test_writer.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/tests/test_writer.py b/tests/test_writer.py index f9fb339..5bce6b7 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -24,6 +24,7 @@ import fcntl from multiprocessing import Process, Queue from datetime import datetime import pytest +import re from warcprox.mitmproxy import ProxyingRecorder from warcprox.warcproxy import RecordedUrl from warcprox.writer import WarcWriter @@ -180,10 +181,4 @@ def test_warc_writer_filename(tmpdir): wwriter.write_records(recorded_url) warcs = [fn for fn in os.listdir(dirname)] assert warcs - target_warc = os.path.join(dirname, warcs[0]) - assert target_warc - parts = os.path.basename(warcs[0]).split('_') - assert len(parts[0]) == 17 - assert parts[1] == 'foo' - assert len(parts[2]) == 14 - assert parts[3] == '00000.warc.open' + assert re.match('\d{17}_foo_\d{14}_00000.warc.open', warcs[0]) From 47ea3110bee3d3c7ff0e39218db7c6ed68288bc5 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Wed, 10 Jan 2018 20:55:31 +0000 Subject: [PATCH 8/9] Yet another unit test fix --- tests/test_writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_writer.py b/tests/test_writer.py index 5bce6b7..9728ec9 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -181,4 +181,4 @@ def test_warc_writer_filename(tmpdir): wwriter.write_records(recorded_url) warcs = [fn for fn in os.listdir(dirname)] assert warcs - assert re.match('\d{17}_foo_\d{14}_00000.warc.open', warcs[0]) + assert re.search('\d{17}_foo_\d{14}_00000.warc.open', warcs[0]) From f73e625d6b227c9e5cdadef739ec6bd7c0b596f4 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Mon, 15 Jan 2018 20:17:22 +0000 Subject: [PATCH 9/9] Check wwriter._fpath in unit test For some reason this test previously failed in github. Maybe it has to do with the temporary files I need to create there...
in any case, I changed the assertion so that it now checks ``wwriter._fpath`` for the correct filename format. --- tests/test_writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_writer.py b/tests/test_writer.py index 9728ec9..61fe108 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -181,4 +181,4 @@ def test_warc_writer_filename(tmpdir): wwriter.write_records(recorded_url) warcs = [fn for fn in os.listdir(dirname)] assert warcs - assert re.search('\d{17}_foo_\d{14}_00000.warc.open', warcs[0]) + assert re.search('\d{17}_foo_\d{14}_00000.warc.open', wwriter._fpath)