diff --git a/setup.py b/setup.py index e90ac5f..242d5b6 100755 --- a/setup.py +++ b/setup.py @@ -52,7 +52,7 @@ except: setuptools.setup( name='warcprox', - version='2.3.1b4.dev137', + version='2.3.1b4.dev138', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/tests/test_dedup.py b/tests/test_dedup.py index 124efb5..eea3ccd 100644 --- a/tests/test_dedup.py +++ b/tests/test_dedup.py @@ -4,43 +4,43 @@ from warcprox.dedup import CdxServerDedup def test_cdx_dedup(): # Mock CDX Server responses to simulate found, not found and errors. - with mock.patch('warcprox.dedup.CdxServerDedup.http_pool.request') as request: - url = "http://example.com" - # not found case - result = mock.Mock() - result.status = 200 - result.data = b'20170101020405 test' - request.return_value = result - cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url") - res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", - url=url) - assert res is None + url = "http://example.com" + # not found case + result = mock.Mock() + result.status = 200 + result.data = b'20170101020405 test' + cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url") + cdx_server.http_pool.request = mock.MagicMock(return_value=result) + res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", + url=url) + assert res is None - # found case - result = mock.Mock() - result.status = 200 - result.data = b'20170203040503 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A' - request.return_value = result - cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url") - res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", - url=url) - assert res["date"] == b"2017-02-03T04:05:03Z" + # found case + result = mock.Mock() + result.status = 200 + result.data = b'20170203040503 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A' + cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url") + cdx_server.http_pool.request = mock.MagicMock(return_value=result) + res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", + url=url) + assert res["date"] == b"2017-02-03T04:05:03Z" - # invalid CDX result status code - result = mock.Mock() - result.status = 400 - result.data = b'20170101020405 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A' - request.return_value = result - cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url") - res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", - url=url) - assert res is None - # invalid CDX result content - result = mock.Mock() - result.status = 200 - result.data = b'InvalidExceptionResult' - request.return_value = result - cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url") - res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", - url=url) - assert res is None + # invalid CDX result status code + result = mock.Mock() + result.status = 400 + result.data = b'20170101020405 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A' + cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url") + cdx_server.http_pool.request = mock.MagicMock(return_value=result) + res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", + url=url) + assert res is None + + # invalid CDX result content + result = mock.Mock() + result.status = 200 + result.data = b'InvalidExceptionResult' + cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url") + cdx_server.http_pool.request = mock.MagicMock(return_value=result) + res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", + url=url) + assert res is None diff --git a/tests/test_writer.py b/tests/test_writer.py index 2d9505f..9c37424 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -24,6 +24,7 @@ import fcntl from multiprocessing import Process, Queue from datetime import datetime import pytest +import re from warcprox.mitmproxy import ProxyingRecorder from warcprox.warcproxy import RecordedUrl from warcprox.writer import WarcWriter @@ -159,3 +160,21 @@ def test_special_dont_write_prefix(): wwt.stop.set() wwt.join() + +def test_warc_writer_filename(tmpdir): + """Test if WarcWriter is writing WARC files with custom filenames. + """ + recorder = ProxyingRecorder(None, None, 'sha1', url='http://example.com') + recorded_url = RecordedUrl( + url='http://example.com', content_type='text/plain', status=200, + client_ip='127.0.0.2', request_data=b'abc', + response_recorder=recorder, remote_ip='127.0.0.3', + timestamp=datetime.utcnow()) + + dirname = os.path.dirname(str(tmpdir.mkdir('test-warc-writer'))) + wwriter = WarcWriter(Options(directory=dirname, prefix='foo', + warc_filename='{timestamp17}_{prefix}_{timestamp14}_{serialno}')) + wwriter.write_records(recorded_url) + warcs = [fn for fn in os.listdir(dirname)] + assert warcs + assert re.search('\d{17}_foo_\d{14}_00000.warc.open', wwriter._fpath) diff --git a/warcprox/dedup.py b/warcprox/dedup.py index b9e136e..8931db2 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -202,12 +202,12 @@ class CdxServerDedup(DedupDb): """Query a CDX server to perform deduplication. """ logger = logging.getLogger("warcprox.dedup.CdxServerDedup") - http_pool = urllib3.PoolManager() def __init__(self, cdx_url="https://web.archive.org/cdx/search", - options=warcprox.Options()): + maxsize=200, options=warcprox.Options()): self.cdx_url = cdx_url self.options = options + self.http_pool = urllib3.PoolManager(maxsize=maxsize) def start(self): pass diff --git a/warcprox/main.py b/warcprox/main.py index a55e682..860e816 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -77,6 +77,9 @@ def _build_arg_parser(prog='warcprox'): help='where to store and load generated certificates') arg_parser.add_argument('-d', '--dir', dest='directory', default='./warcs', help='where to write warcs') + arg_parser.add_argument('--warc-filename', dest='warc_filename', + default='{prefix}-{timestamp17}-{serialno}-{randomtoken}', + help='define custom WARC filename with variables {prefix}, {timestamp14}, {timestamp17}, {serialno}, {randomtoken}, {hostname}, {shorthostname}') arg_parser.add_argument('-z', '--gzip', dest='gzip', action='store_true', help='write gzip-compressed warc records') arg_parser.add_argument('--no-warc-open-suffix', dest='no_warc_open_suffix', diff --git a/warcprox/writer.py b/warcprox/writer.py index 7a1032a..56ff635 100644 --- a/warcprox/writer.py +++ b/warcprox/writer.py @@ -28,6 +28,7 @@ import fcntl import time import warcprox import os +import socket import string import random import threading @@ -42,6 +43,8 @@ class WarcWriter: self._last_activity = time.time() self.gzip = options.gzip or False + self.warc_filename = options.warc_filename or \ + '{prefix}-{timestamp17}-{randomtoken}-{serialno}' digest_algorithm = options.digest_algorithm or 'sha1' base32 = options.base32 self.record_builder = warcprox.warc.WarcRecordBuilder( @@ -68,6 +71,10 @@ class WarcWriter: now = datetime.utcnow() return '{:%Y%m%d%H%M%S}{:03d}'.format(now, now.microsecond//1000) + def timestamp14(self): + now = datetime.utcnow() + return '{:%Y%m%d%H%M%S}'.format(now) + def close_writer(self): with self._lock: if self._fpath: @@ -86,8 +93,32 @@ class WarcWriter: self._fpath = None self._f = None + def serial(self): + return '{:05d}'.format(self._serial) + # h3 default - # ${prefix}-${timestamp17}-${randomtoken}-${serialno}.warc.gz" + def _warc_filename(self): + """WARC filename is configurable with CLI parameter --warc-filename. + Default: '{prefix}-{timestamp17}-{serialno}-{randomtoken}' + Available variables are: prefix, timestamp14, timestamp17, serialno, + randomtoken, hostname, shorthostname. + Extension ``.warc`` or ``.warc.gz`` is appended automatically. + """ + hostname = socket.getfqdn() + shorthostname = hostname.split('.')[0] + fname = self.warc_filename.format(prefix=self.prefix, + timestamp14=self.timestamp14(), + timestamp17=self.timestamp17(), + serialno=self.serial(), + randomtoken=self._randomtoken, + hostname=hostname, + shorthostname=shorthostname) + if self.gzip: + fname = fname + '.warc.gz' + else: + fname = fname + '.warc' + return fname + def _writer(self): with self._lock: if self._fpath and os.path.getsize( @@ -95,9 +126,7 @@ class WarcWriter: self.close_writer() if self._f == None: - self._f_finalname = '{}-{}-{:05d}-{}.warc{}'.format( - self.prefix, self.timestamp17(), self._serial, - self._randomtoken, '.gz' if self.gzip else '') + self._f_finalname = self._warc_filename() self._fpath = os.path.sep.join([ self.directory, self._f_finalname + self._f_open_suffix])