Merge branch 'do_not_archive' into qa

commit 082b338b71
Barbara Miller, 2018-02-15 14:07:03 -08:00
14 changed files with 297 additions and 199 deletions

MANIFEST.in Normal file

@@ -0,0 +1 @@
+recursive-include tests *.py *.sh Dockerfile

setup.cfg Normal file

@@ -0,0 +1,6 @@
+[aliases]
+test=pytest
+
+[tool:pytest]
+addopts=-v
+testpaths=tests

setup.py

@@ -2,7 +2,7 @@
 '''
 setup.py - setuptools installation configuration for warcprox
-Copyright (C) 2013-2016 Internet Archive
+Copyright (C) 2013-2018 Internet Archive
 This program is free software; you can redistribute it and/or
 modify it under the terms of the GNU General Public License
@@ -22,18 +22,6 @@ USA.
 import sys
 import setuptools
-import setuptools.command.test
-
-class PyTest(setuptools.command.test.test):
-    def finalize_options(self):
-        setuptools.command.test.test.finalize_options(self)
-        self.test_args = []
-        self.test_suite = True
-
-    def run_tests(self):
-        # import here, because outside the eggs aren't loaded
-        import pytest
-        errno = pytest.main(self.test_args)
-        sys.exit(errno)

 deps = [
     'certauth==1.1.6',
@@ -52,7 +40,7 @@ except:
 setuptools.setup(
     name='warcprox',
-    version='2.4b1.dev144',
+    version='2.4b2.dev149',
     description='WARC writing MITM HTTP/S proxy',
     url='https://github.com/internetarchive/warcprox',
     author='Noah Levitt',
@@ -61,9 +49,8 @@ setuptools.setup(
     license='GPL',
     packages=['warcprox'],
     install_requires=deps,
-    setup_requires=['pytest-runner'],
     tests_require=['mock', 'pytest', 'warcio'],
-    cmdclass = {'test': PyTest},
-    test_suite='warcprox.tests',
     entry_points={
         'console_scripts': [
             'warcprox=warcprox.main:main',

tests/test_warcprox.py

@@ -249,6 +249,14 @@ class _TestHttpRequestHandler(http_server.BaseHTTPRequestHandler):
         elif self.path == '/empty-response':
             headers = b''
             payload = b''
+        elif self.path == '/slow-response':
+            time.sleep(6)
+            headers = (b'HTTP/1.1 200 OK\r\n'
+                       + b'Content-Type: text/plain\r\n'
+                       + b'\r\n')
+            payload = b'Test.'
+            actual_headers = (b'Content-Type: text/plain\r\n'
+                    + b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n')
         else:
             payload = b'404 Not Found\n'
             headers = (b'HTTP/1.1 404 Not Found\r\n'
@@ -356,7 +364,8 @@ def warcprox_(request):
             '--port=0',
             '--playback-port=0',
             '--onion-tor-socks-proxy=localhost:9050',
-            '--crawl-log-dir=crawl-logs']
+            '--crawl-log-dir=crawl-logs',
+            '--socket-timeout=4']
     if request.config.getoption('--rethinkdb-dedup-url'):
         argv.append('--rethinkdb-dedup-url=%s' % request.config.getoption('--rethinkdb-dedup-url'))
     # test these here only
@@ -758,10 +767,12 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies,
     wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 4)

     # close the warc
-    assert warcprox_.warc_writer_thread.writer_pool.warc_writers["test_dedup_buckets"]
-    writer = warcprox_.warc_writer_thread.writer_pool.warc_writers["test_dedup_buckets"]
-    warc_path = os.path.join(writer.directory, writer._f_finalname)
-    warcprox_.warc_writer_thread.writer_pool.warc_writers["test_dedup_buckets"].close_writer()
+    assert warcprox_.warc_writer_processor.writer_pool.warc_writers["test_dedup_buckets"]
+    writer = warcprox_.warc_writer_processor.writer_pool.warc_writers["test_dedup_buckets"]
+    warc = writer._available_warcs.queue[0]
+    warc_path = os.path.join(warc.directory, warc.finalname)
+    assert not os.path.exists(warc_path)
+    warcprox_.warc_writer_processor.writer_pool.warc_writers["test_dedup_buckets"].close_writer()
     assert os.path.exists(warc_path)

     # read the warc
@@ -1380,20 +1391,16 @@ def test_controller_with_defaults():
     assert controller.proxy.server_port == 8000
     assert controller.proxy.running_stats
     assert not controller.proxy.stats_db
-    wwt = controller.warc_writer_thread
-    assert wwt
-    assert wwt.inq
-    assert wwt.outq
-    assert wwt.writer_pool
-    assert wwt.writer_pool.default_warc_writer
-    assert wwt.writer_pool.default_warc_writer.directory == './warcs'
-    assert wwt.writer_pool.default_warc_writer.rollover_idle_time is None
-    assert wwt.writer_pool.default_warc_writer.rollover_size == 1000000000
-    assert wwt.writer_pool.default_warc_writer.prefix == 'warcprox'
-    assert wwt.writer_pool.default_warc_writer.gzip is False
-    assert wwt.writer_pool.default_warc_writer.record_builder
-    assert not wwt.writer_pool.default_warc_writer.record_builder.base32
-    assert wwt.writer_pool.default_warc_writer.record_builder.digest_algorithm == 'sha1'
+    wwp = controller.warc_writer_processor
+    assert wwp
+    assert wwp.inq
+    assert wwp.outq
+    assert wwp.writer_pool
+    assert wwp.writer_pool.default_warc_writer
+    assert wwp.writer_pool.default_warc_writer.gzip is False
+    assert wwp.writer_pool.default_warc_writer.record_builder
+    assert not wwp.writer_pool.default_warc_writer.record_builder.base32
+    assert wwp.writer_pool.default_warc_writer.record_builder.digest_algorithm == 'sha1'

 def test_load_plugin():
     options = warcprox.Options(port=0, plugins=[
@@ -1473,7 +1480,7 @@ def test_via_response_header(warcprox_, http_daemon, archiving_proxies, playback
     assert response.status_code == 200
     assert not 'via' in playback_response

-    warc = warcprox_.warc_writer_thread.writer_pool.default_warc_writer._fpath
+    warc = warcprox_.warc_writer_processor.writer_pool.default_warc_writer._available_warcs.queue[0].path
     with open(warc, 'rb') as f:
         for record in warcio.archiveiterator.ArchiveIterator(f):
             if record.rec_headers.get_header('warc-target-uri') == url:
@@ -1691,10 +1698,11 @@ def test_long_warcprox_meta(
     wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1)

     # check that warcprox-meta was parsed and honored ("warc-prefix" param)
-    assert warcprox_.warc_writer_thread.writer_pool.warc_writers["test_long_warcprox_meta"]
-    writer = warcprox_.warc_writer_thread.writer_pool.warc_writers["test_long_warcprox_meta"]
-    warc_path = os.path.join(writer.directory, writer._f_finalname)
-    warcprox_.warc_writer_thread.writer_pool.warc_writers["test_long_warcprox_meta"].close_writer()
+    assert warcprox_.warc_writer_processor.writer_pool.warc_writers["test_long_warcprox_meta"]
+    writer = warcprox_.warc_writer_processor.writer_pool.warc_writers["test_long_warcprox_meta"]
+    warc = writer._available_warcs.queue[0]
+    warc_path = os.path.join(warc.directory, warc.finalname)
+    warcprox_.warc_writer_processor.writer_pool.warc_writers["test_long_warcprox_meta"].close_writer()
     assert os.path.exists(warc_path)

     # read the warc
@@ -1711,6 +1719,16 @@ def test_long_warcprox_meta(
     with pytest.raises(StopIteration):
         next(rec_iter)

+def test_socket_timeout_response(
+        warcprox_, http_daemon, https_daemon, archiving_proxies,
+        playback_proxies):
+    """Response will timeout because we use --socket-timeout=4 whereas the
+    target URL will return after 6 sec.
+    """
+    url = 'http://localhost:%s/slow-response' % http_daemon.server_port
+    response = requests.get(url, proxies=archiving_proxies, verify=False)
+    assert response.status_code == 502
+
 def test_empty_response(
         warcprox_, http_daemon, https_daemon, archiving_proxies,
         playback_proxies):
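A standalone sketch of the behavior the new test pins down (the throwaway server and port here are hypothetical, not part of the diff): a socket with a 4-second timeout gives up on a peer that stalls for 6 seconds, raising socket.timeout, which warcprox surfaces to the proxy client as a 502.

import socket
import threading
import time

def slow_server(srv):
    # accept one connection, then stall longer than the client's timeout
    conn, _ = srv.accept()
    time.sleep(6)
    conn.close()

srv = socket.socket()
srv.bind(('127.0.0.1', 0))
srv.listen(1)
threading.Thread(target=slow_server, args=(srv,), daemon=True).start()

client = socket.socket()
client.settimeout(4)  # mirrors --socket-timeout=4
client.connect(srv.getsockname())
try:
    client.recv(1)  # no data arrives within 4 seconds
except socket.timeout:
    print('timed out, as the 502 test expects')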

tests/test_writer.py

@@ -61,7 +61,8 @@ def test_warc_writer_locking(tmpdir):
         timestamp=datetime.utcnow())

     dirname = os.path.dirname(str(tmpdir.mkdir('test-warc-writer')))
-    wwriter = WarcWriter(Options(directory=dirname, no_warc_open_suffix=True))
+    wwriter = WarcWriter(Options(
+        directory=dirname, no_warc_open_suffix=True, writer_threads=1))
     wwriter.write_records(recorded_url)
     warcs = [fn for fn in os.listdir(dirname) if fn.endswith('.warc')]
     assert warcs
@@ -93,7 +94,8 @@ def test_special_dont_write_prefix():
     logging.debug('cd %s', tmpdir)
     os.chdir(tmpdir)

-    wwt = warcprox.writerthread.WarcWriterThread(Options(prefix='-'))
+    wwt = warcprox.writerthread.WarcWriterProcessor(
+        Options(prefix='-', writer_threads=1))
     wwt.inq = warcprox.TimestampedQueue(maxsize=1)
     wwt.outq = warcprox.TimestampedQueue(maxsize=1)
     try:
@@ -126,7 +128,8 @@ def test_special_dont_write_prefix():
     wwt.stop.set()
     wwt.join()

-    wwt = warcprox.writerthread.WarcWriterThread()
+    wwt = warcprox.writerthread.WarcWriterProcessor(
+        Options(writer_threads=1))
     wwt.inq = warcprox.TimestampedQueue(maxsize=1)
     wwt.outq = warcprox.TimestampedQueue(maxsize=1)
     try:
@@ -172,8 +175,11 @@ def test_warc_writer_filename(tmpdir):
     dirname = os.path.dirname(str(tmpdir.mkdir('test-warc-writer')))
     wwriter = WarcWriter(Options(directory=dirname, prefix='foo',
-        warc_filename='{timestamp17}_{prefix}_{timestamp14}_{serialno}'))
+        warc_filename='{timestamp17}_{prefix}_{timestamp14}_{serialno}',
+        writer_threads=1))
     wwriter.write_records(recorded_url)
     warcs = [fn for fn in os.listdir(dirname)]
     assert warcs
-    assert re.search('\d{17}_foo_\d{14}_00000.warc.open', wwriter._fpath)
+    assert re.search(
+        r'\d{17}_foo_\d{14}_00000.warc.open',
+        wwriter._available_warcs.queue[0].path)

warcprox/__init__.py

@@ -237,6 +237,14 @@ class ListenerPostfetchProcessor(BaseStandardPostfetchProcessor):
                 self.logger.error(
                         '%s raised exception', listener.stop, exc_info=True)

+def timestamp17():
+    now = datetime.datetime.utcnow()
+    return '{:%Y%m%d%H%M%S}{:03d}'.format(now, now.microsecond//1000)
+
+def timestamp14():
+    now = datetime.datetime.utcnow()
+    return '{:%Y%m%d%H%M%S}'.format(now)
+
 # monkey-patch log levels TRACE and NOTICE
 TRACE = 5
 def _logger_trace(self, msg, *args, **kwargs):

warcprox/controller.py

@ -57,7 +57,6 @@ class Factory:
@staticmethod @staticmethod
def stats_processor(options): def stats_processor(options):
# return warcprox.stats.StatsProcessor(options)
if options.rethinkdb_stats_url: if options.rethinkdb_stats_url:
stats_processor = warcprox.stats.RethinkStatsProcessor(options) stats_processor = warcprox.stats.RethinkStatsProcessor(options)
elif options.stats_db_file in (None, '', '/dev/null'): elif options.stats_db_file in (None, '', '/dev/null'):
@ -68,8 +67,8 @@ class Factory:
return stats_processor return stats_processor
@staticmethod @staticmethod
def warc_writer(options): def warc_writer_processor(options):
return warcprox.writerthread.WarcWriterThread(options) return warcprox.writerthread.WarcWriterProcessor(options)
@staticmethod @staticmethod
def playback_proxy(ca, options): def playback_proxy(ca, options):
@ -142,6 +141,12 @@ class WarcproxController(object):
self.playback_proxy = Factory.playback_proxy( self.playback_proxy = Factory.playback_proxy(
self.proxy.ca, self.options) self.proxy.ca, self.options)
# default number of warc writer threads = sqrt(proxy.max_threads)
# pulled out of thin air because it strikes me as reasonable
# 1=>1 2=>1 5=>2 10=>3 50=>7 100=>10 200=>14 500=>22 1000=>32 2000=>45
if not self.options.writer_threads:
self.options.writer_threads = int(self.proxy.max_threads ** 0.5)
self.build_postfetch_chain(self.proxy.recorded_url_q) self.build_postfetch_chain(self.proxy.recorded_url_q)
self.service_registry = Factory.service_registry(options) self.service_registry = Factory.service_registry(options)
@ -181,8 +186,8 @@ class WarcproxController(object):
if self.dedup_db: if self.dedup_db:
self._postfetch_chain.append(self.dedup_db.loader()) self._postfetch_chain.append(self.dedup_db.loader())
self.warc_writer_thread = Factory.warc_writer(self.options) self.warc_writer_processor = Factory.warc_writer_processor(self.options)
self._postfetch_chain.append(self.warc_writer_thread) self._postfetch_chain.append(self.warc_writer_processor)
if self.dedup_db: if self.dedup_db:
self._postfetch_chain.append(self.dedup_db.storer()) self._postfetch_chain.append(self.dedup_db.storer())
@ -207,6 +212,8 @@ class WarcproxController(object):
self._postfetch_chain.append( self._postfetch_chain.append(
warcprox.ListenerPostfetchProcessor( warcprox.ListenerPostfetchProcessor(
plugin, self.options)) plugin, self.options))
elif hasattr(plugin, 'CHAIN_POSITION') and plugin.CHAIN_POSITION == 'early':
self._postfetch_chain.insert(0, plugin) # or insert early but later than 0?
else: else:
self._postfetch_chain.append(plugin) self._postfetch_chain.append(plugin)
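The new 'early' chain slot pairs naturally with the do_not_archive flag added in warcprox/warcproxy.py below. A sketch of a plugin that would use it — the class name and the robots.txt rule are hypothetical; only CHAIN_POSITION and do_not_archive come from this commit:

import warcprox

class SkipRobotsTxt(warcprox.BaseStandardPostfetchProcessor):
    # build_postfetch_chain() sees this attribute and inserts the plugin
    # at position 0, ahead of the warc writer
    CHAIN_POSITION = 'early'

    def _process_url(self, recorded_url):
        # runs before WarcWriterProcessor, so setting the flag here keeps
        # the response out of the warcs
        if recorded_url.url.endswith(b'/robots.txt'):
            recorded_url.do_not_archive = True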

warcprox/dedup.py

@@ -206,9 +206,14 @@ class CdxServerDedup(DedupDb):

     def __init__(self, cdx_url="https://web.archive.org/cdx/search",
                  maxsize=200, options=warcprox.Options()):
+        """Initialize cdx server connection pool and related parameters.
+        Use low timeout value and no retries to avoid blocking warcprox
+        operation by a slow CDX server.
+        """
         self.cdx_url = cdx_url
         self.options = options
-        self.http_pool = urllib3.PoolManager(maxsize=maxsize)
+        self.http_pool = urllib3.PoolManager(maxsize=maxsize, retries=0,
+                                             timeout=2.0)
         if options.cdxserver_dedup_cookies:
             self.cookies = options.cdxserver_dedup_cookies
@@ -271,7 +276,7 @@ class CdxServerDedup(DedupDb):
 class CdxServerDedupLoader(warcprox.BaseBatchPostfetchProcessor):
     def __init__(self, cdx_dedup, options=warcprox.Options()):
         warcprox.BaseBatchPostfetchProcessor.__init__(self, options)
-        self.pool = futures.ThreadPoolExecutor(max_workers=50)
+        self.pool = futures.ThreadPoolExecutor(max_workers=200)
         self.batch = set()
         self.cdx_dedup = cdx_dedup
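A standalone look at what the new pool configuration buys (the query fields are illustrative): with retries=0 and a 2-second timeout, a hung CDX server costs one failed lookup rather than a blocked postfetch thread.

import urllib3

http_pool = urllib3.PoolManager(maxsize=200, retries=0, timeout=2.0)
try:
    response = http_pool.request(
            'GET', 'https://web.archive.org/cdx/search',
            fields={'url': 'http://example.com/', 'limit': '1'})
except urllib3.exceptions.HTTPError as exc:
    # urllib3 raises instead of retrying; the dedup lookup is simply skipped
    print('cdx lookup failed fast: %s' % exc)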

warcprox/main.py

@@ -162,6 +162,10 @@ def _build_arg_parser(prog='warcprox'):
             default=None, help=(
                 'host:port of tor socks proxy, used only to connect to '
                 '.onion sites'))
+    # Configurable connection socket timeout, default is 60 sec.
+    arg_parser.add_argument(
+            '--socket-timeout', dest='socket_timeout', type=float,
+            default=None, help=argparse.SUPPRESS)
     arg_parser.add_argument(
             '--crawl-log-dir', dest='crawl_log_dir', default=None, help=(
                 'if specified, write crawl log files in the specified '

warcprox/mitmproxy.py

@@ -205,12 +205,13 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
     and records the bytes in transit as it proxies them.
     '''
     logger = logging.getLogger("warcprox.mitmproxy.MitmProxyHandler")
+    _socket_timeout = 60

     def __init__(self, request, client_address, server):
         threading.current_thread().name = 'MitmProxyHandler(tid={},started={},client={}:{})'.format(warcprox.gettid(), datetime.datetime.utcnow().isoformat(), client_address[0], client_address[1])
         self.is_connect = False
         self._headers_buffer = []
-        request.settimeout(60)  # XXX what value should this have?
+        request.settimeout(self._socket_timeout)
         http_server.BaseHTTPRequestHandler.__init__(self, request, client_address, server)

     def _determine_host_port(self):
@@ -247,8 +248,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
         self._remote_server_sock = socket.socket()
         self._remote_server_sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)

-        # XXX what value should this timeout have?
-        self._remote_server_sock.settimeout(60)
+        self._remote_server_sock.settimeout(self._socket_timeout)
         self._remote_server_sock.connect((self.hostname, int(self.port)))

         # Wrap socket if SSL is required

warcprox/trough.py

@@ -103,17 +103,13 @@ class TroughClient(object):
         elif isinstance(x, bool):
             return int(x)
         elif isinstance(x, str) or isinstance(x, bytes):
-            # py3: repr(u'abc') => 'abc'
-            #      repr(b'abc') => b'abc'
-            # py2: repr(u'abc') => u'abc'
-            #      repr(b'abc') => 'abc'
-            # Repr gives us a prefix we don't want in different situations
-            # depending on whether this is py2 or py3. Chop it off either way.
-            r = repr(x)
-            if r[:1] == "'":
-                return r
-            else:
-                return r[1:]
+            # the only character that needs escaped in sqlite string literals
+            # is single-quote, which is escaped as two single-quotes
+            if isinstance(x, bytes):
+                s = x.decode('utf-8')
+            else:
+                s = x
+            return "'" + s.replace("'", "''") + "'"
         elif isinstance(x, (int, float)):
             return x
         else:
@@ -196,7 +192,7 @@ class TroughClient(object):
                     response.status_code, response.reason, response.text,
                     write_url, sql)
             return
-        self.logger.debug('posted %r to %s', sql, write_url)
+        self.logger.debug('posted to %s: %r', write_url, sql)

     def read(self, segment_id, sql_tmpl, values=()):
         read_url = self.read_url(segment_id)
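The new escaping rule is easy to sanity-check in isolation. A standalone re-implementation for illustration (TroughClient.sql_value is the real method, which also handles None, bool, and datetime):

def sql_value(x):
    # single-quote is the only character that needs escaping in sqlite
    # string literals; it is doubled
    if isinstance(x, (str, bytes)):
        s = x.decode('utf-8') if isinstance(x, bytes) else x
        return "'" + s.replace("'", "''") + "'"
    return x

assert sql_value("it's") == "'it''s'"
assert sql_value(b'abc') == "'abc'"
assert sql_value(42) == 42

The repr()-based version it replaces appears to have emitted double-quoted output for strings containing a single quote, which sqlite parses as an identifier rather than a string, so this looks like a correctness fix as well as a cleanup.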

warcprox/warcproxy.py

@@ -330,7 +330,7 @@ class RecordedUrl:
             warcprox_meta=None, content_type=None, custom_type=None,
             status=None, size=None, client_ip=None, method=None,
             timestamp=None, host=None, duration=None, referer=None,
-            payload_digest=None, warc_records=None):
+            payload_digest=None, warc_records=None, do_not_archive=False):
         # XXX should test what happens with non-ascii url (when does
         # url-encoding happen?)
         if type(url) is not bytes:
@@ -370,6 +370,7 @@ class RecordedUrl:
         self.referer = referer
         self.payload_digest = payload_digest
         self.warc_records = warc_records
+        self.do_not_archive = do_not_archive

 # inherit from object so that multiple inheritance from this class works
 # properly in python 2
@@ -397,6 +398,9 @@ class SingleThreadedWarcProxy(http_server.HTTPServer, object):
         WarcProxyHandler.onion_tor_socks_proxy_host = options.onion_tor_socks_proxy
         WarcProxyHandler.onion_tor_socks_proxy_port = None

+        if options.socket_timeout:
+            WarcProxyHandler._socket_timeout = options.socket_timeout
+
         http_server.HTTPServer.__init__(
                 self, server_address, WarcProxyHandler, bind_and_activate=True)
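The timeout plumbing is a plain class-attribute override: handler instances read _socket_timeout through the class, so one assignment before the server starts affects every handler. A minimal sketch with hypothetical names:

import socketserver

class Handler(socketserver.BaseRequestHandler):
    _socket_timeout = 60  # class-level default, as on MitmProxyHandler

    def setup(self):
        # each instance reads the (possibly overridden) class attribute
        self.request.settimeout(self._socket_timeout)

Handler._socket_timeout = 4  # what --socket-timeout=4 effectively does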

warcprox/writer.py

@@ -22,133 +22,172 @@ USA.
 from __future__ import absolute_import

 import logging
-from datetime import datetime
 from hanzo import warctools
 import fcntl
 import time
 import warcprox
 import os
 import socket
-import string
 import random
 import threading
+try:
+    import queue
+except ImportError:
+    import Queue as queue
+import contextlib

-class WarcWriter:
-    logger = logging.getLogger('warcprox.writer.WarcWriter')
+class _OneWritableWarc:
+    logger = logging.getLogger('warcprox.writer._OneWritableWarc')

-    def __init__(self, options=warcprox.Options()):
+    '''
+    Utility class used by WarcWriter
+    '''
+    def __init__(self, options=warcprox.Options(), randomtoken='0'):
+        self.f = None
+        self.path = None
+        self.finalname = None
+        self.gzip = options.gzip or False
+        self.prefix = options.prefix or 'warcprox'
+        self.open_suffix = '' if options.no_warc_open_suffix else '.open'
+        self.randomtoken = randomtoken
         self.rollover_size = options.rollover_size or 1000000000
         self.rollover_idle_time = options.rollover_idle_time or None
-        self._last_activity = time.time()
-        self.gzip = options.gzip or False
-        self.warc_filename = options.warc_filename or \
-                '{prefix}-{timestamp17}-{randomtoken}-{serialno}'
-        digest_algorithm = options.digest_algorithm or 'sha1'
-        base32 = options.base32
-        self.record_builder = warcprox.warc.WarcRecordBuilder(
-                digest_algorithm=digest_algorithm, base32=base32)
-
-        # warc path and filename stuff
         self.directory = options.directory or './warcs'
-        self.prefix = options.prefix or 'warcprox'
-
-        self._f = None
-        self._fpath = None
-        self._f_finalname = None
-        self._f_open_suffix = '' if options.no_warc_open_suffix else '.open'
-
-        self._serial = 0
-        self._lock = threading.RLock()
-
-        self._randomtoken = "".join(random.Random().sample(string.digits + string.ascii_lowercase, 8))
-
-        if not os.path.exists(self.directory):
-            self.logger.info("warc destination directory {} doesn't exist, creating it".format(self.directory))
-            os.mkdir(self.directory)
-
-    def timestamp17(self):
-        now = datetime.utcnow()
-        return '{:%Y%m%d%H%M%S}{:03d}'.format(now, now.microsecond//1000)
-
-    def timestamp14(self):
-        now = datetime.utcnow()
-        return '{:%Y%m%d%H%M%S}'.format(now)
-
-    def close_writer(self):
-        with self._lock:
-            if self._fpath:
-                self.logger.info('closing %s', self._f_finalname)
-                if self._f_open_suffix == '':
-                    try:
-                        fcntl.lockf(self._f, fcntl.LOCK_UN)
-                    except IOError as exc:
-                        self.logger.error('could not unlock file %s (%s)',
-                                          self._fpath, exc)
-                self._f.close()
-                finalpath = os.path.sep.join(
-                        [self.directory, self._f_finalname])
-                os.rename(self._fpath, finalpath)
-                self._fpath = None
-                self._f = None
-
-    def serial(self):
-        return '{:05d}'.format(self._serial)
+        self.filename_template = options.warc_filename or \
+                '{prefix}-{timestamp17}-{randomtoken}-{serialno}'
+        self.last_activity = time.time()

     # h3 default <!-- <property name="template" value="${prefix}-${timestamp17}-${serialno}-${heritrix.pid}~${heritrix.hostname}~${heritrix.port}" /> -->
-    def _warc_filename(self):
+    def next_filename(self, serial):
         """WARC filename is configurable with CLI parameter --warc-filename.
-        Default: '{prefix}-{timestamp17}-{serialno}-{randomtoken}'
+        Default: '{prefix}-{timestamp17}-{randomtoken}-{serialno}'
         Available variables are: prefix, timestamp14, timestamp17, serialno,
         randomtoken, hostname, shorthostname.
         Extension ``.warc`` or ``.warc.gz`` is appended automatically.
         """
         hostname = socket.getfqdn()
         shorthostname = hostname.split('.')[0]
-        fname = self.warc_filename.format(prefix=self.prefix,
-                timestamp14=self.timestamp14(),
-                timestamp17=self.timestamp17(),
-                serialno=self.serial(),
-                randomtoken=self._randomtoken,
-                hostname=hostname,
-                shorthostname=shorthostname)
+        fname = self.filename_template.format(
+                prefix=self.prefix, timestamp14=warcprox.timestamp14(),
+                timestamp17=warcprox.timestamp17(),
+                serialno='{:05d}'.format(serial),
+                randomtoken=self.randomtoken, hostname=hostname,
+                shorthostname=shorthostname)
         if self.gzip:
             fname = fname + '.warc.gz'
         else:
             fname = fname + '.warc'
         return fname

-    def _writer(self):
-        with self._lock:
-            if self._fpath and os.path.getsize(
-                    self._fpath) > self.rollover_size:
-                self.close_writer()
+    def open(self, serial):
+        if not os.path.exists(self.directory):
+            self.logger.info(
+                    "warc destination directory %s doesn't exist, creating it",
+                    self.directory)
+            os.mkdir(self.directory)

-            if self._f == None:
-                self._f_finalname = self._warc_filename()
-                self._fpath = os.path.sep.join([
-                    self.directory, self._f_finalname + self._f_open_suffix])
+        self.finalname = self.next_filename(serial)
+        self.path = os.path.sep.join(
+                [self.directory, self.finalname + self.open_suffix])

-                self._f = open(self._fpath, 'wb')
-                # if no '.open' suffix is used for WARC, acquire an exclusive
-                # file lock.
-                if self._f_open_suffix == '':
-                    try:
-                        fcntl.lockf(self._f, fcntl.LOCK_EX | fcntl.LOCK_NB)
-                    except IOError as exc:
-                        self.logger.error('could not lock file %s (%s)',
-                                          self._fpath, exc)
+        self.f = open(self.path, 'wb')
+        # if no '.open' suffix is used for WARC, acquire an exclusive
+        # file lock.
+        if self.open_suffix == '':
+            try:
+                fcntl.lockf(self.f, fcntl.LOCK_EX | fcntl.LOCK_NB)
+            except IOError as exc:
+                self.logger.error(
+                        'could not lock file %s (%s)', self.path, exc)
+        return self.f

-                warcinfo_record = self.record_builder.build_warcinfo_record(
-                        self._f_finalname)
-                self.logger.debug(
-                        'warcinfo_record.headers=%s', warcinfo_record.headers)
-                warcinfo_record.write_to(self._f, gzip=self.gzip)
-                self._serial += 1
+    def close(self):
+        if self.path:
+            self.logger.trace('closing %s', self.finalname)
+            if self.open_suffix == '':
+                try:
+                    fcntl.lockf(self.f, fcntl.LOCK_UN)
+                except IOError as exc:
+                    self.logger.error(
+                            'could not unlock file %s (%s)', self.path, exc)
+            self.f.close()
+            finalpath = os.path.sep.join(
+                    [self.directory, self.finalname])
+            os.rename(self.path, finalpath)
+            self.path = None
+            self.f = None

-            return self._f
+    def maybe_idle_rollover(self):
+        if (self.path and self.rollover_idle_time
+                and self.rollover_idle_time > 0
+                and time.time() - self.last_activity > self.rollover_idle_time):
+            self.logger.info(
+                    'rolling over %s after %0.1f seconds idle',
+                    self.finalname, time.time() - self.last_activity)
+            self.close()
+
+    def maybe_size_rollover(self):
+        if self.path and os.path.getsize(self.path) > self.rollover_size:
+            self.logger.info(
+                    'rolling over %s because it has reached %s bytes in size',
+                    self.finalname, os.path.getsize(self.path))
+            self.close()
+
+class WarcWriter:
+    logger = logging.getLogger('warcprox.writer.WarcWriter')
+
+    def __init__(self, options=warcprox.Options()):
+        self.options = options
+        self.gzip = options.gzip or False
+        self.record_builder = warcprox.warc.WarcRecordBuilder(
+                digest_algorithm=options.digest_algorithm or 'sha1',
+                base32=options.base32)
+        self._available_warcs = queue.Queue()
+        self._warc_count = 0
+        self._warc_count_lock = threading.Lock()
+        self._serial = 0
+        self._serial_lock = threading.Lock()
+        self._randomtoken = ''.join(
+                random.sample('abcdefghijklmnopqrstuvwxyz0123456789', 8))
+
+    def _bespeak_warc(self):
+        try:
+            return self._available_warcs.get(block=False)
+        except queue.Empty:
+            with self._warc_count_lock:
+                if self._warc_count < self.options.writer_threads:
+                    self._warc_count += 1
+                    return _OneWritableWarc(self.options, self._randomtoken)
+            # else we're maxed out, wait for one to free up
+            return self._available_warcs.get(block=True)
+
+    @contextlib.contextmanager
+    def _warc(self):
+        warc = self._bespeak_warc()
+
+        warc.maybe_size_rollover()
+
+        # lazy file open
+        if warc.f == None:
+            with self._serial_lock:
+                serial = self._serial
+                self._serial += 1
+            warc.open(serial)
+            warcinfo = self.record_builder.build_warcinfo_record(warc.finalname)
+            self.logger.debug('warcinfo.headers=%s', warcinfo.headers)
+            warcinfo.write_to(warc.f, gzip=self.gzip)
+
+        yield warc
+
+        # __exit__()
+        warc.f.flush()
+        warc.last_activity = time.time()
+        self._available_warcs.put(warc)

     def write_records(self, recorded_url):
         """Returns tuple of records written, which are instances of
@@ -156,44 +195,47 @@ class WarcWriter:
         "offset" attributes."""
         records = self.record_builder.build_warc_records(recorded_url)

-        with self._lock:
-            writer = self._writer()
-
+        with self._warc() as warc:
             for record in records:
-                offset = writer.tell()
-                record.write_to(writer, gzip=self.gzip)
+                offset = warc.f.tell()
+                record.write_to(warc.f, gzip=self.gzip)
                 record.offset = offset
-                record.length = writer.tell() - offset
-                record.warc_filename = self._f_finalname
+                record.length = warc.f.tell() - offset
+                record.warc_filename = warc.finalname
                 self.logger.debug(
                         'wrote warc record: warc_type=%s content_length=%s '
                         'url=%s warc=%s offset=%d',
                         record.get_header(warctools.WarcRecord.TYPE),
                         record.get_header(warctools.WarcRecord.CONTENT_LENGTH),
                         record.get_header(warctools.WarcRecord.URL),
-                        self._fpath, record.offset)
-
-            self._f.flush()
-            self._last_activity = time.time()
+                        warc.path, record.offset)

         return records

     def maybe_idle_rollover(self):
-        with self._lock:
-            if (self._fpath is not None
-                    and self.rollover_idle_time is not None
-                    and self.rollover_idle_time > 0
-                    and time.time() - self._last_activity > self.rollover_idle_time):
-                self.logger.info(
-                        'rolling over %s after %s seconds idle',
-                        self._f_finalname, time.time() - self._last_activity)
-                self.close_writer()
+        warcs = []
+        while True:
+            try:
+                warc = self._available_warcs.get(block=False)
+                warcs.append(warc)
+            except queue.Empty:
+                break
+        for warc in warcs:
+            warc.maybe_idle_rollover()
+            self._available_warcs.put(warc)
+
+    def close_writer(self):
+        while self._warc_count > 0:
+            with self._warc_count_lock:
+                warc = self._available_warcs.get()
+                warc.close()
+                self._warc_count -= 1

 class WarcWriterPool:
     logger = logging.getLogger("warcprox.writer.WarcWriterPool")

     def __init__(self, options=warcprox.Options()):
-        self.default_warc_writer = WarcWriter(options=options)
+        self.default_warc_writer = WarcWriter(options)
         self.warc_writers = {}  # {prefix:WarcWriter}
         self.options = options
         self._lock = threading.RLock()
@@ -208,8 +250,7 @@ class WarcWriterPool:
             options.prefix = recorded_url.warcprox_meta["warc-prefix"]
         with self._lock:
             if not options.prefix in self.warc_writers:
-                self.warc_writers[options.prefix] = WarcWriter(
-                        options=options)
+                self.warc_writers[options.prefix] = WarcWriter(options)
             w = self.warc_writers[options.prefix]
         return w
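A minimal sketch of driving the rewritten writer directly; directory, prefix, and thread count are illustrative, and write_records() is commented out because it needs a RecordedUrl:

import warcprox
from warcprox.writer import WarcWriter

options = warcprox.Options(
        directory='./warcs', prefix='example', writer_threads=2)
writer = WarcWriter(options)

# write_records() checks a _OneWritableWarc out of _available_warcs
# (creating up to writer_threads of them), lazily opens the file and
# writes the warcinfo record, appends the records, then returns the
# warc to the queue:
# records = writer.write_records(recorded_url)

writer.close_writer()  # closes each warc, renaming away the '.open' suffix

Under the controller change above, writer_threads defaults to sqrt(proxy.max_threads) when not set explicitly.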

warcprox/writerthread.py

@@ -30,33 +30,44 @@ except ImportError:
 import logging
 import time
 import warcprox
+from concurrent import futures

-class WarcWriterThread(warcprox.BaseStandardPostfetchProcessor):
-    logger = logging.getLogger("warcprox.writerthread.WarcWriterThread")
+class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor):
+    logger = logging.getLogger("warcprox.writerthread.WarcWriterProcessor")

     _ALWAYS_ACCEPT = {'WARCPROX_WRITE_RECORD'}

     def __init__(self, options=warcprox.Options()):
         warcprox.BaseStandardPostfetchProcessor.__init__(self, options=options)
+        self.options = options
         self.writer_pool = warcprox.writer.WarcWriterPool(options)
         self.method_filter = set(method.upper() for method in self.options.method_filter or [])
+        self.pool = futures.ThreadPoolExecutor(max_workers=options.writer_threads or 1)
+        self.batch = set()

     def _get_process_put(self):
-        try:
-            warcprox.BaseStandardPostfetchProcessor._get_process_put(self)
-        finally:
-            self.writer_pool.maybe_idle_rollover()
+        recorded_url = self.inq.get(block=True, timeout=0.5)
+        self.batch.add(recorded_url)
+        self.pool.submit(self._process_url, recorded_url)

     def _process_url(self, recorded_url):
-        records = []
-        if self._should_archive(recorded_url):
-            records = self.writer_pool.write_records(recorded_url)
-        recorded_url.warc_records = records
-        self._log(recorded_url, records)
-        # try to release resources in a timely fashion
-        if recorded_url.response_recorder and recorded_url.response_recorder.tempfile:
-            recorded_url.response_recorder.tempfile.close()
+        try:
+            records = []
+            if self._should_archive(recorded_url):
+                records = self.writer_pool.write_records(recorded_url)
+            recorded_url.warc_records = records
+            self._log(recorded_url, records)
+            # try to release resources in a timely fashion
+            if recorded_url.response_recorder and recorded_url.response_recorder.tempfile:
+                recorded_url.response_recorder.tempfile.close()
+        except:
+            logging.error(
+                    'caught exception processing %s', recorded_url.url,
+                    exc_info=True)
+        finally:
+            self.batch.remove(recorded_url)
+            if self.outq:
+                self.outq.put(recorded_url)
+            self.writer_pool.maybe_idle_rollover()

     def _filter_accepts(self, recorded_url):
         if not self.method_filter:
@@ -70,8 +81,12 @@ class WarcWriterThread(warcprox.BaseStandardPostfetchProcessor):
                   if recorded_url.warcprox_meta
                   and 'warc-prefix' in recorded_url.warcprox_meta
                   else self.options.prefix)
+        do_not_archive = (recorded_url.do_not_archive
+                          if recorded_url.do_not_archive
+                          else False)
         # special warc name prefix '-' means "don't archive"
-        return prefix != '-' and self._filter_accepts(recorded_url)
+        return (prefix != '-' and (not do_not_archive)
+                and self._filter_accepts(recorded_url))

     def _log(self, recorded_url, records):
         try: