mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
Merge branch 'master' into wip-postfetch-chain
* master: fix running_stats thing; Update CdxServerDedup unit test; Check writer._fname in unit test; Configurable CdxServerDedup urllib3 connection pool size; Yet another unit test fix; Change the writer unit test; fix github problem with unit test; Another fix for the unit test; Fix writer unit test; Add WarcWriter warc_filename unit test; Fix warc_filename default value; Configurable WARC filenames
This commit is contained in:
commit
5354648512
2
setup.py
2
setup.py
@ -52,7 +52,7 @@ except:
|
||||
|
||||
setuptools.setup(
|
||||
name='warcprox',
|
||||
version='2.3.1b4.dev137',
|
||||
version='2.3.1b4.dev138',
|
||||
description='WARC writing MITM HTTP/S proxy',
|
||||
url='https://github.com/internetarchive/warcprox',
|
||||
author='Noah Levitt',
|
||||
|
@ -4,43 +4,43 @@ from warcprox.dedup import CdxServerDedup
|
||||
|
||||
def test_cdx_dedup():
|
||||
# Mock CDX Server responses to simulate found, not found and errors.
|
||||
with mock.patch('warcprox.dedup.CdxServerDedup.http_pool.request') as request:
|
||||
url = "http://example.com"
|
||||
# not found case
|
||||
result = mock.Mock()
|
||||
result.status = 200
|
||||
result.data = b'20170101020405 test'
|
||||
request.return_value = result
|
||||
cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url")
|
||||
res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
|
||||
url=url)
|
||||
assert res is None
|
||||
url = "http://example.com"
|
||||
# not found case
|
||||
result = mock.Mock()
|
||||
result.status = 200
|
||||
result.data = b'20170101020405 test'
|
||||
cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url")
|
||||
cdx_server.http_pool.request = mock.MagicMock(return_value=result)
|
||||
res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
|
||||
url=url)
|
||||
assert res is None
|
||||
|
||||
# found case
|
||||
result = mock.Mock()
|
||||
result.status = 200
|
||||
result.data = b'20170203040503 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'
|
||||
request.return_value = result
|
||||
cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url")
|
||||
res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
|
||||
url=url)
|
||||
assert res["date"] == b"2017-02-03T04:05:03Z"
|
||||
# found case
|
||||
result = mock.Mock()
|
||||
result.status = 200
|
||||
result.data = b'20170203040503 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'
|
||||
cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url")
|
||||
cdx_server.http_pool.request = mock.MagicMock(return_value=result)
|
||||
res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
|
||||
url=url)
|
||||
assert res["date"] == b"2017-02-03T04:05:03Z"
|
||||
|
||||
# invalid CDX result status code
|
||||
result = mock.Mock()
|
||||
result.status = 400
|
||||
result.data = b'20170101020405 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'
|
||||
request.return_value = result
|
||||
cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url")
|
||||
res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
|
||||
url=url)
|
||||
assert res is None
|
||||
# invalid CDX result content
|
||||
result = mock.Mock()
|
||||
result.status = 200
|
||||
result.data = b'InvalidExceptionResult'
|
||||
request.return_value = result
|
||||
cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url")
|
||||
res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
|
||||
url=url)
|
||||
assert res is None
|
||||
# invalid CDX result status code
|
||||
result = mock.Mock()
|
||||
result.status = 400
|
||||
result.data = b'20170101020405 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'
|
||||
cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url")
|
||||
cdx_server.http_pool.request = mock.MagicMock(return_value=result)
|
||||
res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
|
||||
url=url)
|
||||
assert res is None
|
||||
|
||||
# invalid CDX result content
|
||||
result = mock.Mock()
|
||||
result.status = 200
|
||||
result.data = b'InvalidExceptionResult'
|
||||
cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url")
|
||||
cdx_server.http_pool.request = mock.MagicMock(return_value=result)
|
||||
res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
|
||||
url=url)
|
||||
assert res is None
|
||||
|
@ -24,6 +24,7 @@ import fcntl
|
||||
from multiprocessing import Process, Queue
|
||||
from datetime import datetime
|
||||
import pytest
|
||||
import re
|
||||
from warcprox.mitmproxy import ProxyingRecorder
|
||||
from warcprox.warcproxy import RecordedUrl
|
||||
from warcprox.writer import WarcWriter
|
||||
@ -159,3 +160,21 @@ def test_special_dont_write_prefix():
|
||||
wwt.stop.set()
|
||||
wwt.join()
|
||||
|
||||
|
||||
def test_warc_writer_filename(tmpdir):
    """Test if WarcWriter is writing WARC files with custom filenames.

    A single recorded URL is written through a ``WarcWriter`` configured
    with a custom ``warc_filename`` template; the test then checks that a
    file was created in the target directory and that the writer's open
    file path follows the template.
    """
    recorder = ProxyingRecorder(None, None, 'sha1', url='http://example.com')
    recorded_url = RecordedUrl(
            url='http://example.com', content_type='text/plain', status=200,
            client_ip='127.0.0.2', request_data=b'abc',
            response_recorder=recorder, remote_ip='127.0.0.3',
            timestamp=datetime.utcnow())

    # Write into the freshly created (empty) directory itself. Using its
    # parent would make the "directory is not empty" check below pass
    # trivially, because the parent always contains 'test-warc-writer'.
    dirname = str(tmpdir.mkdir('test-warc-writer'))
    wwriter = WarcWriter(Options(
        directory=dirname, prefix='foo',
        warc_filename='{timestamp17}_{prefix}_{timestamp14}_{serialno}'))
    wwriter.write_records(recorded_url)

    warcs = os.listdir(dirname)
    assert warcs
    # Raw string so ``\d`` is a regex class rather than an invalid string
    # escape; dots are escaped and the match is anchored at the end so only
    # the exact ``.warc.open`` suffix is accepted.
    assert re.search(
        r'\d{17}_foo_\d{14}_00000\.warc\.open$', wwriter._fpath)
|
||||
|
@ -202,12 +202,12 @@ class CdxServerDedup(DedupDb):
|
||||
"""Query a CDX server to perform deduplication.
|
||||
"""
|
||||
logger = logging.getLogger("warcprox.dedup.CdxServerDedup")
|
||||
http_pool = urllib3.PoolManager()
|
||||
|
||||
def __init__(self, cdx_url="https://web.archive.org/cdx/search",
|
||||
options=warcprox.Options()):
|
||||
maxsize=200, options=warcprox.Options()):
|
||||
self.cdx_url = cdx_url
|
||||
self.options = options
|
||||
self.http_pool = urllib3.PoolManager(maxsize=maxsize)
|
||||
|
||||
def start(self):
|
||||
pass
|
||||
|
@ -77,6 +77,9 @@ def _build_arg_parser(prog='warcprox'):
|
||||
help='where to store and load generated certificates')
|
||||
arg_parser.add_argument('-d', '--dir', dest='directory',
|
||||
default='./warcs', help='where to write warcs')
|
||||
arg_parser.add_argument('--warc-filename', dest='warc_filename',
|
||||
default='{prefix}-{timestamp17}-{serialno}-{randomtoken}',
|
||||
help='define custom WARC filename with variables {prefix}, {timestamp14}, {timestamp17}, {serialno}, {randomtoken}, {hostname}, {shorthostname}')
|
||||
arg_parser.add_argument('-z', '--gzip', dest='gzip', action='store_true',
|
||||
help='write gzip-compressed warc records')
|
||||
arg_parser.add_argument('--no-warc-open-suffix', dest='no_warc_open_suffix',
|
||||
|
@ -28,6 +28,7 @@ import fcntl
|
||||
import time
|
||||
import warcprox
|
||||
import os
|
||||
import socket
|
||||
import string
|
||||
import random
|
||||
import threading
|
||||
@ -42,6 +43,8 @@ class WarcWriter:
|
||||
self._last_activity = time.time()
|
||||
|
||||
self.gzip = options.gzip or False
|
||||
self.warc_filename = options.warc_filename or \
|
||||
'{prefix}-{timestamp17}-{randomtoken}-{serialno}'
|
||||
digest_algorithm = options.digest_algorithm or 'sha1'
|
||||
base32 = options.base32
|
||||
self.record_builder = warcprox.warc.WarcRecordBuilder(
|
||||
@ -68,6 +71,10 @@ class WarcWriter:
|
||||
now = datetime.utcnow()
|
||||
return '{:%Y%m%d%H%M%S}{:03d}'.format(now, now.microsecond//1000)
|
||||
|
||||
def timestamp14(self):
    """Return the current UTC time as a 14-digit ``YYYYmmddHHMMSS`` string."""
    return datetime.utcnow().strftime('%Y%m%d%H%M%S')
|
||||
|
||||
def close_writer(self):
|
||||
with self._lock:
|
||||
if self._fpath:
|
||||
@ -86,8 +93,32 @@ class WarcWriter:
|
||||
self._fpath = None
|
||||
self._f = None
|
||||
|
||||
def serial(self):
    """Return ``self._serial`` as a decimal string zero-padded to 5 digits."""
    return format(self._serial, '05d')
|
||||
|
||||
# h3 default <!-- <property name="template" value="${prefix}-${timestamp17}-${serialno}-${heritrix.pid}~${heritrix.hostname}~${heritrix.port}" /> -->
|
||||
# ${prefix}-${timestamp17}-${randomtoken}-${serialno}.warc.gz"
|
||||
def _warc_filename(self):
    """Build the name of the next WARC file from ``self.warc_filename``.

    The template is configurable with the CLI parameter --warc-filename
    (CLI default: '{prefix}-{timestamp17}-{serialno}-{randomtoken}').
    Available variables are: prefix, timestamp14, timestamp17, serialno,
    randomtoken, hostname, shorthostname.
    Extension ``.warc`` or ``.warc.gz`` is appended automatically,
    depending on ``self.gzip``.

    Returns:
        str: the bare file name (no directory, no "open" suffix).

    Raises:
        KeyError: if the template references a variable that is not in
            the list above (raised by ``str.format``).
    """
    hostname = socket.getfqdn()
    shorthostname = hostname.split('.')[0]

    # Read the clock only once: timestamp17 is YYYYmmddHHMMSS followed by
    # three millisecond digits, so its first 14 characters are exactly the
    # timestamp14 value for the same instant. Two independent clock reads
    # could straddle a second boundary and put two different instants into
    # one file name.
    timestamp17 = self.timestamp17()
    timestamp14 = timestamp17[:14]

    fname = self.warc_filename.format(
        prefix=self.prefix,
        timestamp14=timestamp14,
        timestamp17=timestamp17,
        serialno=self.serial(),
        randomtoken=self._randomtoken,
        hostname=hostname,
        shorthostname=shorthostname)

    extension = '.warc.gz' if self.gzip else '.warc'
    return fname + extension
|
||||
|
||||
def _writer(self):
|
||||
with self._lock:
|
||||
if self._fpath and os.path.getsize(
|
||||
@ -95,9 +126,7 @@ class WarcWriter:
|
||||
self.close_writer()
|
||||
|
||||
if self._f == None:
|
||||
self._f_finalname = '{}-{}-{:05d}-{}.warc{}'.format(
|
||||
self.prefix, self.timestamp17(), self._serial,
|
||||
self._randomtoken, '.gz' if self.gzip else '')
|
||||
self._f_finalname = self._warc_filename()
|
||||
self._fpath = os.path.sep.join([
|
||||
self.directory, self._f_finalname + self._f_open_suffix])
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user