Merge branch 'master' into wip-postfetch-chain

* master:
  fix running_stats thing
  Update CdxServerDedup unit test
  Chec writer._fname in unit test
  Configurable CdxServerDedup urllib3 connection pool size
  Yet another unit test fix
  Change the writer unit test
  fix github problem with unit test
  Another fix for the unit test
  Fix writer unit test
  Add WarcWriter warc_filename unit test
  Fix warc_filename default value
  Configurable WARC filenames
This commit is contained in:
Noah Levitt 2018-01-16 16:01:40 -08:00
commit 5354648512
6 changed files with 96 additions and 45 deletions

View File

@ -52,7 +52,7 @@ except:
setuptools.setup(
name='warcprox',
version='2.3.1b4.dev137',
version='2.3.1b4.dev138',
description='WARC writing MITM HTTP/S proxy',
url='https://github.com/internetarchive/warcprox',
author='Noah Levitt',

View File

@ -4,43 +4,43 @@ from warcprox.dedup import CdxServerDedup
def test_cdx_dedup():
# Mock CDX Server responses to simulate found, not found and errors.
with mock.patch('warcprox.dedup.CdxServerDedup.http_pool.request') as request:
url = "http://example.com"
# not found case
result = mock.Mock()
result.status = 200
result.data = b'20170101020405 test'
request.return_value = result
cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url")
res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
url=url)
assert res is None
url = "http://example.com"
# not found case
result = mock.Mock()
result.status = 200
result.data = b'20170101020405 test'
cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url")
cdx_server.http_pool.request = mock.MagicMock(return_value=result)
res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
url=url)
assert res is None
# found case
result = mock.Mock()
result.status = 200
result.data = b'20170203040503 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'
request.return_value = result
cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url")
res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
url=url)
assert res["date"] == b"2017-02-03T04:05:03Z"
# found case
result = mock.Mock()
result.status = 200
result.data = b'20170203040503 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'
cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url")
cdx_server.http_pool.request = mock.MagicMock(return_value=result)
res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
url=url)
assert res["date"] == b"2017-02-03T04:05:03Z"
# invalid CDX result status code
result = mock.Mock()
result.status = 400
result.data = b'20170101020405 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'
request.return_value = result
cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url")
res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
url=url)
assert res is None
# invalid CDX result content
result = mock.Mock()
result.status = 200
result.data = b'InvalidExceptionResult'
request.return_value = result
cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url")
res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
url=url)
assert res is None
# invalid CDX result status code
result = mock.Mock()
result.status = 400
result.data = b'20170101020405 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'
cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url")
cdx_server.http_pool.request = mock.MagicMock(return_value=result)
res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
url=url)
assert res is None
# invalid CDX result content
result = mock.Mock()
result.status = 200
result.data = b'InvalidExceptionResult'
cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url")
cdx_server.http_pool.request = mock.MagicMock(return_value=result)
res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
url=url)
assert res is None

View File

@ -24,6 +24,7 @@ import fcntl
from multiprocessing import Process, Queue
from datetime import datetime
import pytest
import re
from warcprox.mitmproxy import ProxyingRecorder
from warcprox.warcproxy import RecordedUrl
from warcprox.writer import WarcWriter
@ -159,3 +160,21 @@ def test_special_dont_write_prefix():
wwt.stop.set()
wwt.join()
def test_warc_writer_filename(tmpdir):
"""Test if WarcWriter is writing WARC files with custom filenames.
"""
recorder = ProxyingRecorder(None, None, 'sha1', url='http://example.com')
recorded_url = RecordedUrl(
url='http://example.com', content_type='text/plain', status=200,
client_ip='127.0.0.2', request_data=b'abc',
response_recorder=recorder, remote_ip='127.0.0.3',
timestamp=datetime.utcnow())
dirname = os.path.dirname(str(tmpdir.mkdir('test-warc-writer')))
wwriter = WarcWriter(Options(directory=dirname, prefix='foo',
warc_filename='{timestamp17}_{prefix}_{timestamp14}_{serialno}'))
wwriter.write_records(recorded_url)
warcs = [fn for fn in os.listdir(dirname)]
assert warcs
assert re.search('\d{17}_foo_\d{14}_00000.warc.open', wwriter._fpath)

View File

@ -202,12 +202,12 @@ class CdxServerDedup(DedupDb):
"""Query a CDX server to perform deduplication.
"""
logger = logging.getLogger("warcprox.dedup.CdxServerDedup")
http_pool = urllib3.PoolManager()
def __init__(self, cdx_url="https://web.archive.org/cdx/search",
options=warcprox.Options()):
maxsize=200, options=warcprox.Options()):
self.cdx_url = cdx_url
self.options = options
self.http_pool = urllib3.PoolManager(maxsize=maxsize)
def start(self):
pass

View File

@ -77,6 +77,9 @@ def _build_arg_parser(prog='warcprox'):
help='where to store and load generated certificates')
arg_parser.add_argument('-d', '--dir', dest='directory',
default='./warcs', help='where to write warcs')
arg_parser.add_argument('--warc-filename', dest='warc_filename',
default='{prefix}-{timestamp17}-{serialno}-{randomtoken}',
help='define custom WARC filename with variables {prefix}, {timestamp14}, {timestamp17}, {serialno}, {randomtoken}, {hostname}, {shorthostname}')
arg_parser.add_argument('-z', '--gzip', dest='gzip', action='store_true',
help='write gzip-compressed warc records')
arg_parser.add_argument('--no-warc-open-suffix', dest='no_warc_open_suffix',

View File

@ -28,6 +28,7 @@ import fcntl
import time
import warcprox
import os
import socket
import string
import random
import threading
@ -42,6 +43,8 @@ class WarcWriter:
self._last_activity = time.time()
self.gzip = options.gzip or False
self.warc_filename = options.warc_filename or \
'{prefix}-{timestamp17}-{randomtoken}-{serialno}'
digest_algorithm = options.digest_algorithm or 'sha1'
base32 = options.base32
self.record_builder = warcprox.warc.WarcRecordBuilder(
@ -68,6 +71,10 @@ class WarcWriter:
now = datetime.utcnow()
return '{:%Y%m%d%H%M%S}{:03d}'.format(now, now.microsecond//1000)
def timestamp14(self):
now = datetime.utcnow()
return '{:%Y%m%d%H%M%S}'.format(now)
def close_writer(self):
with self._lock:
if self._fpath:
@ -86,8 +93,32 @@ class WarcWriter:
self._fpath = None
self._f = None
def serial(self):
return '{:05d}'.format(self._serial)
# h3 default <!-- <property name="template" value="${prefix}-${timestamp17}-${serialno}-${heritrix.pid}~${heritrix.hostname}~${heritrix.port}" /> -->
# ${prefix}-${timestamp17}-${randomtoken}-${serialno}.warc.gz"
def _warc_filename(self):
"""WARC filename is configurable with CLI parameter --warc-filename.
Default: '{prefix}-{timestamp17}-{serialno}-{randomtoken}'
Available variables are: prefix, timestamp14, timestamp17, serialno,
randomtoken, hostname, shorthostname.
Extension ``.warc`` or ``.warc.gz`` is appended automatically.
"""
hostname = socket.getfqdn()
shorthostname = hostname.split('.')[0]
fname = self.warc_filename.format(prefix=self.prefix,
timestamp14=self.timestamp14(),
timestamp17=self.timestamp17(),
serialno=self.serial(),
randomtoken=self._randomtoken,
hostname=hostname,
shorthostname=shorthostname)
if self.gzip:
fname = fname + '.warc.gz'
else:
fname = fname + '.warc'
return fname
def _writer(self):
with self._lock:
if self._fpath and os.path.getsize(
@ -95,9 +126,7 @@ class WarcWriter:
self.close_writer()
if self._f == None:
self._f_finalname = '{}-{}-{:05d}-{}.warc{}'.format(
self.prefix, self.timestamp17(), self._serial,
self._randomtoken, '.gz' if self.gzip else '')
self._f_finalname = self._warc_filename()
self._fpath = os.path.sep.join([
self.directory, self._f_finalname + self._f_open_suffix])