Merge pull request #17 from internetarchive/2.x

2.x
Noah Levitt 2016-10-19 15:34:49 -07:00 committed by GitHub
commit de3c81fdc8
32 changed files with 4359 additions and 1306 deletions

1
.gitignore vendored

@ -11,3 +11,4 @@ warcs
build
dist
.tox
out.*


@ -1,21 +1,36 @@
# vim: set sw=4 et:
#
# tox approach stolen from
# https://github.com/pypa/pip/blob/abdb597dbfb51b21cc76c1cff068b72c80f3a77d/.travis.yml
#
language: python
python:
- 3.5
- 3.4
- 2.7
- nightly
- pypy
- pypy3
env:
- TOXENV=py27
- TOXENV=py34
matrix:
allow_failures:
- python: pypy
- python: pypy3
addons:
apt:
packages:
- python-gdbm
- python3-gdbm
- tor
services:
- docker
before_install:
- sudo apt-get update
- sudo apt-get -y install python-gdbm python3-gdbm
- sudo service docker restart ; sleep 10 # https://github.com/travis-ci/travis-ci/issues/4778
- docker run -d --publish=28015:28015 rethinkdb
before_script:
- pip install tox
- pip install . pytest requests
script: tox
script:
- py.test -v -s tests
- py.test -v -s --rethinkdb-servers=localhost tests
- py.test -v -s --rethinkdb-servers=localhost --rethinkdb-big-table tests


@ -1,15 +1,11 @@
warcprox - WARC writing MITM HTTP/S proxy
-----------------------------------------
.. image:: https://travis-ci.org/internetarchive/warcprox.png?branch=master
:target: https://travis-ci.org/internetarchive/warcprox
Based on the excellent and simple pymiproxy by Nadeem Douba.
https://github.com/allfro/pymiproxy
License: because pymiproxy is GPL and warcprox is a derivative work of
pymiproxy, warcprox is also GPL.
Install
~~~~~~~
@ -19,6 +15,7 @@ To install latest release run:
::
# apt-get install libffi-dev libssl-dev python3-gdbm
pip install warcprox
You can also install the latest bleeding edge code:
@ -45,10 +42,15 @@ Usage
usage: warcprox [-h] [-p PORT] [-b ADDRESS] [-c CACERT]
[--certs-dir CERTS_DIR] [-d DIRECTORY] [-z] [-n PREFIX]
[-s SIZE] [--rollover-idle-time ROLLOVER_IDLE_TIME]
[-g DIGEST_ALGORITHM] [--base32] [-j DEDUP_DB_FILE]
[-P PLAYBACK_PORT]
[--playback-index-db-file PLAYBACK_INDEX_DB_FILE] [--version]
[-v] [-q]
[-g DIGEST_ALGORITHM] [--base32]
[--stats-db-file STATS_DB_FILE] [-P PLAYBACK_PORT]
[--playback-index-db-file PLAYBACK_INDEX_DB_FILE]
[-j DEDUP_DB_FILE | --rethinkdb-servers RETHINKDB_SERVERS]
[--rethinkdb-db RETHINKDB_DB] [--rethinkdb-big-table]
[--kafka-broker-list KAFKA_BROKER_LIST]
[--kafka-capture-feed-topic KAFKA_CAPTURE_FEED_TOPIC]
[--onion-tor-socks-proxy ONION_TOR_SOCKS_PROXY]
[--version] [-v] [-q]
warcprox - WARC writing MITM HTTP/S proxy
@ -58,84 +60,91 @@ Usage
-b ADDRESS, --address ADDRESS
address to listen on (default: localhost)
-c CACERT, --cacert CACERT
CA certificate file; if file does not exist, it will
be created (default: ./desktop-nlevitt-warcprox-
ca.pem)
CA certificate file; if file does not exist, it
will be created (default: ./MacBook-Pro.local-
warcprox-ca.pem)
--certs-dir CERTS_DIR
where to store and load generated certificates
(default: ./desktop-nlevitt-warcprox-ca)
(default: ./MacBook-Pro.local-warcprox-ca)
-d DIRECTORY, --dir DIRECTORY
where to write warcs (default: ./warcs)
-z, --gzip write gzip-compressed warc records (default: False)
-z, --gzip write gzip-compressed warc records (default:
False)
-n PREFIX, --prefix PREFIX
WARC filename prefix (default: WARCPROX)
-s SIZE, --size SIZE WARC file rollover size threshold in bytes (default:
1000000000)
-s SIZE, --size SIZE WARC file rollover size threshold in bytes
(default: 1000000000)
--rollover-idle-time ROLLOVER_IDLE_TIME
WARC file rollover idle time threshold in seconds (so
that Friday's last open WARC doesn't sit there all
weekend waiting for more data) (default: None)
WARC file rollover idle time threshold in seconds
(so that Friday's last open WARC doesn't sit there
all weekend waiting for more data) (default: None)
-g DIGEST_ALGORITHM, --digest-algorithm DIGEST_ALGORITHM
digest algorithm, one of sha384, sha512, md5, sha224,
sha256, sha1 (default: sha1)
digest algorithm, one of sha1, sha256, md5,
sha224, sha512, sha384 (default: sha1)
--base32 write digests in Base32 instead of hex (default:
False)
-j DEDUP_DB_FILE, --dedup-db-file DEDUP_DB_FILE
persistent deduplication database file; empty string
or /dev/null disables deduplication (default:
./warcprox-dedup.db)
--stats-db-file STATS_DB_FILE
persistent statistics database file; empty string
or /dev/null disables statistics tracking
(default: ./warcprox-stats.db)
-P PLAYBACK_PORT, --playback-port PLAYBACK_PORT
port to listen on for instant playback (default: None)
port to listen on for instant playback (default:
None)
--playback-index-db-file PLAYBACK_INDEX_DB_FILE
playback index database file (only used if --playback-
port is specified) (default: ./warcprox-playback-
index.db)
playback index database file (only used if
--playback-port is specified) (default:
./warcprox-playback-index.db)
-j DEDUP_DB_FILE, --dedup-db-file DEDUP_DB_FILE
persistent deduplication database file; empty
string or /dev/null disables deduplication
(default: ./warcprox-dedup.db)
--rethinkdb-servers RETHINKDB_SERVERS
rethinkdb servers, used for dedup and stats if
specified; e.g.
db0.foo.org,db0.foo.org:38015,db1.foo.org
(default: None)
--rethinkdb-db RETHINKDB_DB
rethinkdb database name (ignored unless
--rethinkdb-servers is specified) (default:
warcprox)
--rethinkdb-big-table
use a big rethinkdb table called "captures",
instead of a small table called "dedup"; table is
suitable for use as index for playback (ignored
unless --rethinkdb-servers is specified) (default:
False)
--kafka-broker-list KAFKA_BROKER_LIST
kafka broker list for capture feed (default: None)
--kafka-capture-feed-topic KAFKA_CAPTURE_FEED_TOPIC
kafka capture feed topic (default: None)
--onion-tor-socks-proxy ONION_TOR_SOCKS_PROXY
host:port of tor socks proxy, used only to connect
to .onion sites (default: None)
--version show program's version number and exit
-v, --verbose
-q, --quiet
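Once warcprox is listening, any http client that supports proxies can be
pointed at it. A minimal sketch using the requests library (assumptions: a
warcprox instance on localhost:8000, the default port; verify=False stands in
for trusting the warcprox CA cert)::

    import requests

    proxies = {'http': 'http://localhost:8000', 'https': 'http://localhost:8000'}
    # warcprox man-in-the-middles https, so the client must either trust the
    # generated CA cert or skip verification
    response = requests.get('http://example.com/', proxies=proxies, verify=False)
    print(response.status_code, len(response.content))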
To do
~~~~~
* (partly done) integration tests, unit tests
* (done) url-agnostic deduplication
* unchunk and/or ungzip before storing payload, or alter request to
discourage server from chunking/gzipping
* check certs from proxied website, like browser does, and present
browser-like warning if appropriate
* keep statistics, produce reports
* write cdx while crawling?
* performance testing
* (done) base32 sha1 like heritrix?
* configurable timeouts and stuff
* evaluate ipv6 support
* (done) more explicit handling of connection closed exception
during transfer
* dns cache?? the system already does a fine job I'm thinking
* keepalive with remote servers?
* (done) python3
* special handling for 304 not-modified (write nothing or write revisit
record... and/or modify request so server never responds with 304)
* (done) instant playback on a second proxy port
* special url for downloading ca cert e.g. http(s)://warcprox./ca.pem
* special url for other stuff, some status info or something?
* browser plugin for warcprox mode
  - accept warcprox CA cert only when in warcprox mode
  - separate temporary cookie store, like incognito
  - "careful! your activity is being archived" banner
  - easy switch between archiving and instant playback proxy port

To not do
^^^^^^^^^
The features below could also be part of warcprox. But maybe they don't
belong here, since this is a proxy, not a crawler/robot. It can be used
by a human with a browser, or by something automated, i.e. a robot. My
feeling is that it's more appropriate to implement these in the robot.

* politeness, i.e. throttle requests per server
* fetch and obey robots.txt
* alter user-agent, maybe insert something like "warcprox mitm
  archiving proxy; +http://archive.org/details/archive.org\_bot"

License
~~~~~~~
Warcprox is a derivative work of pymiproxy, which is GPL. Thus warcprox is also
GPL.

Copyright (C) 2012 Cygnos Corporation
Copyright (C) 2013-2016 Internet Archive

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.


@ -0,0 +1 @@
aiohttp

172
benchmarks/run-benchmarks.py Executable file

@ -0,0 +1,172 @@
#!/usr/bin/env python
#
# run-benchmarks.py - some benchmarking code for warcprox
#
# Copyright (C) 2015-2016 Internet Archive
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
# USA.
#
import sys
import aiohttp
import aiohttp.server
import asyncio
import ssl
import tempfile
import OpenSSL.crypto
import OpenSSL.SSL
import random
import os
import threading
import time
import logging
import warcprox.main
logging.basicConfig(stream=sys.stdout, level=logging.INFO,
format='%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')
def self_signed_cert():
key = OpenSSL.crypto.PKey()
key.generate_key(OpenSSL.crypto.TYPE_RSA, 2048)
cert = OpenSSL.crypto.X509()
cert.set_serial_number(random.randint(0, 2 ** 64 - 1))
cert.get_subject().CN = 'localhost'
cert.set_version(2)
cert.gmtime_adj_notBefore(0)
cert.gmtime_adj_notAfter(10 * 365 * 24 * 60 * 60)
cert.set_issuer(cert.get_subject())
cert.set_pubkey(key)
cert.sign(key, "sha1")
return key, cert
class HttpRequestHandler(aiohttp.server.ServerHttpProtocol):
@asyncio.coroutine
def handle_request(self, message, payload):
response = aiohttp.Response(
self.writer, 200, http_version=message.version
)
n = int(message.path.partition('/')[2])
response.add_header('Content-Type', 'text/plain')
# response.add_header('Content-Length', '18')
response.send_headers()
for i in range(n):
response.write(b'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\n')
yield from response.write_eof()
def run_servers():
loop.run_forever()
def start_servers():
loop = asyncio.get_event_loop()
http = loop.create_server(lambda: HttpRequestHandler(debug=True, keep_alive=75), '127.0.0.1', '8080')
sslcontext = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
key, cert = self_signed_cert()
with tempfile.NamedTemporaryFile(delete=False) as certfile:
certfile.write(OpenSSL.crypto.dump_privatekey(OpenSSL.SSL.FILETYPE_PEM, key))
certfile.write(OpenSSL.crypto.dump_certificate(OpenSSL.SSL.FILETYPE_PEM, cert))
sslcontext.load_cert_chain(certfile.name)
os.remove(certfile.name)
https = loop.create_server(lambda: HttpRequestHandler(debug=True, keep_alive=75), '127.0.0.1', '8443', ssl=sslcontext)
srv = loop.run_until_complete(http)
srv = loop.run_until_complete(https)
logging.info('serving on http://127.0.0.1:8080 and https://127.0.0.1:8443')
class AsyncClient(object):
def __init__(self, proxy=None):
self.n_urls = 0
self.n_bytes = 0
self.proxy = proxy
if proxy:
self.connector = aiohttp.connector.ProxyConnector(proxy, verify_ssl=False)
else:
self.connector = aiohttp.connector.TCPConnector(verify_ssl=False)
@asyncio.coroutine
def read_response(self, r, url):
# time.sleep(random.random() * 10)
while True:
chunk = yield from r.content.read(2**16)
self.n_bytes += len(chunk)
if not chunk:
self.n_urls += 1
logging.debug("finished reading from %s", url)
r.close()
break
@asyncio.coroutine
def one_request(self, url):
logging.debug("issuing request to %s", url)
r = yield from aiohttp.get(url, connector=self.connector)
logging.debug("issued request to %s", url)
yield from self.read_response(r, url)
def benchmark(client):
try:
start = time.time()
tasks_https = [client.one_request('https://localhost:8443/%s' % int(1.1**i)) for i in range(80)]
asyncio.get_event_loop().run_until_complete(asyncio.wait(tasks_https))
tasks_http = [client.one_request('http://localhost:8080/%s' % int(1.1**i)) for i in range(80)]
asyncio.get_event_loop().run_until_complete(asyncio.wait(tasks_http))
finally:
finish = time.time()
logging.info("proxy=%s: %s urls totaling %s bytes in %s seconds", client.proxy, client.n_urls, client.n_bytes, (finish - start))
if __name__ == '__main__':
args = warcprox.main.parse_args()
start_servers()
baseline_client = AsyncClient()
logging.info("===== baseline benchmark starting (no proxy) =====")
benchmark(baseline_client)
logging.info("===== baseline benchmark finished =====")
# Queue size of 1 makes warcprox behave as though it were synchronous (each
# request blocks until the warc writer starts working on the last request).
# This gives us a better sense of sustained max throughput. The
# asynchronous nature of warcprox helps with bursty traffic, as long as the
# average throughput stays below the sustained max.
with tempfile.TemporaryDirectory() as tmpdir:
args.queue_size = 1
args.cacert = os.path.join(tmpdir, "benchmark-warcprox-ca.pem")
args.certs_dir = os.path.join(tmpdir, "benchmark-warcprox-ca")
args.directory = os.path.join(tmpdir, "warcs")
args.gzip = True
args.base32 = True
args.stats_db_file = os.path.join(tmpdir, "stats.db")
args.dedup_db_file = os.path.join(tmpdir, "dedup.db")
warcprox_controller = warcprox.main.init_controller(args)
warcprox_controller_thread = threading.Thread(target=warcprox_controller.run_until_shutdown)
warcprox_controller_thread.start()
proxy = "http://%s:%s" % (args.address, args.port)
proxied_client = AsyncClient(proxy=proxy)
logging.info("===== warcprox benchmark starting =====")
benchmark(proxied_client)
logging.info("===== warcprox benchmark finished =====")
warcprox_controller.stop.set()
warcprox_controller_thread.join()
asyncio.get_event_loop().stop()
logging.info("finished")


@ -1,8 +0,0 @@
#!/usr/bin/env python
# vim: set sw=4 et:
from __future__ import absolute_import
import warcprox.main
warcprox.main.main()


@ -1,44 +1,57 @@
#!/usr/bin/env python
# vim: set sw=4 et:
'''
setup.py - setuptools installation configuration for warcprox
Copyright (C) 2013-2016 Internet Archive
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
USA.
'''
from setuptools.command.test import test as TestCommand
import sys
import setuptools
VERSION_BYTES = b'1.4'
def full_version_bytes():
import subprocess, time
try:
commit_bytes = subprocess.check_output(['git', 'log', '-1', '--pretty=format:%h'])
t_bytes = subprocess.check_output(['git', 'log', '-1', '--pretty=format:%ct'])
t = int(t_bytes.strip().decode('utf-8'))
tm = time.gmtime(t)
timestamp_utc = time.strftime("%Y%m%d%H%M%S", time.gmtime(t))
return VERSION_BYTES + b'-' + timestamp_utc.encode('utf-8') + b'-' + commit_bytes.strip()
except subprocess.CalledProcessError:
return VERSION_BYTES
version_bytes = full_version_bytes()
with open('warcprox/version.txt', 'wb') as out:
out.write(version_bytes)
out.write(b'\n');
import setuptools
import setuptools.command.test
# special class needs to be added to support the pytest written dump-anydbm tests
class PyTest(TestCommand):
class PyTest(setuptools.command.test.test):
def finalize_options(self):
TestCommand.finalize_options(self)
setuptools.command.test.test.finalize_options(self)
self.test_args = []
self.test_suite = True
def run_tests(self):
#import here, cause outside the eggs aren't loaded
# import here, because outside the eggs aren't loaded
import pytest
errno = pytest.main(self.test_args)
sys.exit(errno)
setuptools.setup(name='warcprox',
version=version_bytes.decode('utf-8'),
deps = [
'certauth>=1.1.0',
'warctools',
'kafka-python>=1.0.1',
'surt>=0.3b4',
'rethinkstuff',
'PySocks',
]
try:
import concurrent.futures
except:
deps.append('futures')
setuptools.setup(
name='warcprox',
version='2.0b2.dev32',
description='WARC writing MITM HTTP/S proxy',
url='https://github.com/internetarchive/warcprox',
author='Noah Levitt',
@ -46,13 +59,18 @@ setuptools.setup(name='warcprox',
long_description=open('README.rst').read(),
license='GPL',
packages=['warcprox'],
package_data={'warcprox':['version.txt']},
install_requires=['certauth>=1.1.0', 'warctools>=4.8.3'], # gdbm not in pip :(
dependency_links=['git+https://github.com/internetarchive/warctools.git#egg=warctools-4.8.3'],
install_requires=deps,
tests_require=['requests>=2.0.1', 'pytest'], # >=2.0.1 for https://github.com/kennethreitz/requests/pull/1636
cmdclass = {'test': PyTest},
test_suite='warcprox.tests',
scripts=['bin/dump-anydbm', 'bin/warcprox'],
entry_points={
'console_scripts': [
'warcprox=warcprox.main:main',
('warcprox-ensure-rethinkdb-tables='
'warcprox.main:ensure_rethinkdb_tables'),
'dump-anydbm=warcprox.dump_anydbm:main',
],
},
zip_safe=False,
classifiers=[
'Development Status :: 5 - Production/Stable',
@ -60,6 +78,7 @@ setuptools.setup(name='warcprox',
'License :: OSI Approved :: GNU General Public License (GPL)',
'Programming Language :: Python :: 2.7',
'Programming Language :: Python :: 3.4',
'Programming Language :: Python :: 3.5',
'Topic :: Internet :: Proxy Servers',
'Topic :: Internet :: WWW/HTTP',
'Topic :: Software Development :: Libraries :: Python Modules',

49
tests/Dockerfile Normal file

@ -0,0 +1,49 @@
#
# Dockerfile for warcprox tests
#
# Copyright (C) 2015-2016 Internet Archive
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
# USA.
#
FROM phusion/baseimage
MAINTAINER Noah Levitt <nlevitt@archive.org>
# see https://github.com/stuartpb/rethinkdb-dockerfiles/blob/master/trusty/2.1.3/Dockerfile
ENV LANG=C.UTF-8
RUN apt-get update && apt-get --auto-remove -y dist-upgrade
# Add the RethinkDB repository and public key
# "RethinkDB Packaging <packaging@rethinkdb.com>" http://download.rethinkdb.com/apt/pubkey.gpg
RUN apt-key adv --keyserver pgp.mit.edu --recv-keys 1614552E5765227AEC39EFCFA7E00EF33A8F2399 \
&& echo "deb http://download.rethinkdb.com/apt trusty main" > /etc/apt/sources.list.d/rethinkdb.list \
&& apt-get update && apt-get -y install rethinkdb
RUN mkdir -vp /etc/service/rethinkdb \
&& echo "#!/bin/sh\nrethinkdb --bind 0.0.0.0 --directory /tmp/rethink-data --runuser rethinkdb --rungroup rethinkdb\n" > /etc/service/rethinkdb/run \
&& chmod a+x /etc/service/rethinkdb/run
RUN apt-get -y install python-virtualenv git
RUN apt-get -y install python-gdbm python3-gdbm libpython2.7-dev libpython3.4-dev libffi-dev libssl-dev
RUN pip install devpi-client
RUN apt-get -y install tor
RUN mkdir -vp /etc/service/tor \
&& echo "#!/bin/sh\ntor\n" > /etc/service/tor/run \
&& chmod a+x /etc/service/tor/run

39
tests/conftest.py Normal file

@ -0,0 +1,39 @@
#
# tests/conftest.py - command line options for warcprox tests
#
# Copyright (C) 2015-2016 Internet Archive
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
# USA.
#
import pytest
def pytest_addoption(parser):
parser.addoption('--rethinkdb-servers', dest='rethinkdb_servers',
help='rethink db servers for dedup, e.g. db0.foo.org,db0.foo.org:38015,db1.foo.org')
parser.addoption('--rethinkdb-big-table',
dest='rethinkdb_big_table', action='store_true', default=False,
help='use a big rethinkdb table called "captures", instead of a small table called "dedup"; table is suitable for use as index for playback (ignored unless --rethinkdb-servers is specified)')
@pytest.fixture(scope="module")
def rethinkdb_servers(request):
return request.config.getoption("--rethinkdb-servers")
@pytest.fixture(scope="module")
def rethinkdb_big_table(request):
return request.config.getoption("--rethinkdb-big-table")

48
tests/run-tests.sh Executable file

@ -0,0 +1,48 @@
#!/bin/bash
#
# tests/run-tests.sh - Runs tests in a docker container. Also runs a temporary
# instance of rethinkdb inside the container. The tests run with rethinkdb
# features enabled, against that instance of rethinkdb, and also run without
# rethinkdb features enabled. With python 2.7 and 3.4.
#
#
# Copyright (C) 2015-2016 Internet Archive
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
# USA.
#
# 😬
#
set -e
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
docker build -t internetarchive/warcprox-tests $script_dir
for python in python2.7 python3.4
do
docker run --rm --volume="$script_dir/..:/warcprox" internetarchive/warcprox-tests /sbin/my_init -- \
bash -x -c "cd /tmp && git clone /warcprox && cd /tmp/warcprox \
&& (cd /warcprox && git diff) | patch -p1 \
&& virtualenv -p $python /tmp/venv \
&& source /tmp/venv/bin/activate \
&& pip --log-file /tmp/pip.log install . pytest requests \
&& py.test -s tests \
&& py.test -s --rethinkdb-servers=localhost tests \
&& py.test -s --rethinkdb-servers=localhost --rethinkdb-big-table tests"
done

102
tests/single-threaded-proxy.py Executable file

@ -0,0 +1,102 @@
#!/usr/bin/env python
"""
tests/single-threaded-proxy.py - single-threaded MITM proxy, useful for
debugging, does not write warcs
Copyright (C) 2015-2016 Internet Archive
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
USA.
"""
from __future__ import absolute_import
import warcprox
import logging
import sys
import argparse
import certauth
import queue
import socket
import os
class FakeQueue(object):
logger = logging.getLogger("FakeQueue")
def __init__(self, maxsize=0): pass
def join(self): pass
def qsize(self): return 0
def empty(self): return True
def full(self): return False
def get(self, block=True, timeout=None): raise queue.Empty
def put_nowait(self, item): return self.put(item, block=False)
def get_nowait(self): return self.get(block=False)
def put(self, recorded_url, block=True, timeout=None):
logging.info("{} {} {} {} {} size={} {}".format(
recorded_url.client_ip, recorded_url.status, recorded_url.method,
recorded_url.url.decode("utf-8"), recorded_url.mimetype,
recorded_url.size, warcprox.digest_str(recorded_url.response_recorder.payload_digest, False).decode('utf-8')))
def parse_args():
prog = os.path.basename(sys.argv[0])
arg_parser = argparse.ArgumentParser(prog=prog,
description='%s - single threaded mitm http/s proxy, for debugging' % prog,
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
arg_parser.add_argument('-p', '--port', dest='port', default='8000',
type=int, help='port to listen on')
arg_parser.add_argument('-b', '--address', dest='address',
default='localhost', help='address to listen on')
arg_parser.add_argument('-c', '--cacert', dest='cacert',
default='./{0}-warcprox-ca.pem'.format(socket.gethostname()),
help='CA certificate file; if file does not exist, it will be created')
arg_parser.add_argument('--certs-dir', dest='certs_dir',
default='./{0}-warcprox-ca'.format(socket.gethostname()),
help='where to store and load generated certificates')
arg_parser.add_argument('--onion-tor-socks-proxy', dest='onion_tor_socks_proxy',
default=None, help='host:port of tor socks proxy, used only to connect to .onion sites')
arg_parser.add_argument('--version', action='version',
version="warcprox {}".format(warcprox.__version__))
arg_parser.add_argument('-v', '--verbose', dest='verbose', action='store_true')
arg_parser.add_argument('-q', '--quiet', dest='quiet', action='store_true')
return arg_parser.parse_args(args=sys.argv[1:])
def init_logging(verbose):
if args.verbose:
loglevel = logging.DEBUG
elif args.quiet:
loglevel = logging.WARNING
else:
loglevel = logging.INFO
logging.basicConfig(stream=sys.stdout, level=loglevel,
format='%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')
# format='%(asctime)s %(funcName) 21s() %(filename)15s:%(lineno)05d %(message)s')
def init_proxy(args):
ca_name = 'Warcprox CA on {}'.format(socket.gethostname())[:64]
ca = certauth.certauth.CertificateAuthority(args.cacert, args.certs_dir,
ca_name=ca_name)
options = warcprox.Options(**vars(args))
proxy = warcprox.warcproxy.SingleThreadedWarcProxy(ca,
recorded_url_q=FakeQueue(), options=options)
return proxy
if __name__ == "__main__":
args = parse_args()
init_logging(args.verbose)
proxy = init_proxy(args)
proxy.serve_forever()


@ -1,4 +1,24 @@
#!/usr/bin/env python
#
# tests/test_dump-anydbm.py - tests for dump-anydbm
#
# Copyright (C) 2013-2016 Internet Archive
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
# USA.
#
import pytest
import os
@ -6,6 +26,7 @@ import tempfile
import subprocess # to access the script from shell
import sys
import glob
import distutils
# will try as python 3 then default to python 2 modules
try:
@ -38,7 +59,7 @@ val1 = 'very first value'
val2 = 'second value'
py = sys.executable
dump_anydbm_loc = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "bin/dump-anydbm")
dump_anydbm_loc = distutils.spawn.find_executable("dump-anydbm")
@pytest.fixture(scope="function")
def gdbm_test_db(request):

1150
tests/test_warcprox.py Executable file

File diff suppressed because it is too large

13
tox.ini

@ -1,13 +0,0 @@
# Tox (http://tox.testrun.org/) is a tool for running tests
# in multiple virtualenvs. This configuration file will run the
# test suite on all supported python versions. To use it, "pip install tox"
# and then run "tox" from this directory.
[tox]
envlist = py27, py34
[testenv]
commands = py.test warcprox
deps =
pytest
requests


@ -1,8 +1,141 @@
def _read_version_bytes():
    import os
    version_txt = os.path.sep.join(__file__.split(os.path.sep)[:-1] + ['version.txt'])
    with open(version_txt, 'rb') as fin:
        return fin.read().strip()
version_bytes = _read_version_bytes().strip()
version_str = version_bytes.decode('utf-8')
"""
warcprox/__init__.py - warcprox package main file, contains some utility code

Copyright (C) 2013-2016 Internet Archive

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
USA.
"""
from argparse import Namespace as _Namespace
from pkg_resources import get_distribution as _get_distribution
__version__ = _get_distribution('warcprox').version
def digest_str(hash_obj, base32):
import base64
return hash_obj.name.encode('utf-8') + b':' + (
base64.b32encode(hash_obj.digest()) if base32
else hash_obj.hexdigest().encode('ascii'))
class Options(_Namespace):
def __getattr__(self, name):
try:
return super(Options, self).__getattr__(self, name)
except AttributeError:
return None
# XXX linux-specific
def gettid():
try:
import ctypes
libc = ctypes.cdll.LoadLibrary('libc.so.6')
SYS_gettid = 186
tid = libc.syscall(SYS_gettid)
return tid
except:
return "n/a"
class RequestBlockedByRule(Exception):
"""
An exception raised when a request should be blocked to respect a
Warcprox-Meta rule.
"""
def __init__(self, msg):
self.msg = msg
def __str__(self):
return "%s: %s" % (self.__class__.__name__, self.msg)
class Url:
'''
Utility class
'''
def __init__(self, url):
self.url = url
self._surt = None
self._host = None
@property
def surt(self):
if not self._surt:
import surt
hurl = surt.handyurl.parse(self.url)
surt.GoogleURLCanonicalizer.canonicalize(hurl)
hurl.query = None
hurl.hash = None
self._surt = hurl.getURLString(surt=True, trailing_comma=True)
return self._surt
@property
def host(self):
if not self._host:
import surt
self._host = surt.handyurl.parse(self.url).host
return self._host
def matches_ip_or_domain(self, ip_or_domain):
return host_matches_ip_or_domain(self.host, ip_or_domain)
def normalize_host(host):
# normalize host (punycode and lowercase)
return host.encode('idna').decode('ascii').lower()
def host_matches_ip_or_domain(host, ip_or_domain):
'''
Returns true if
- ip_or_domain is an ip address and host is the same ip address
- ip_or_domain is a domain and host is the same domain
- ip_or_domain is a domain and host is a subdomain of it
'''
_host = normalize_host(host)
_ip_or_domain = normalize_host(ip_or_domain)
if _ip_or_domain == _host:
return True
# if either _ip_or_domain or host are ip addresses, and they're not
# identical (previous check), not a match
try:
ipaddress.ip_address(_ip_or_domain)
return False
except:
pass
try:
ipaddress.ip_address(_host)
return False
except:
pass
# if we get here, we're looking at two hostnames
domain_parts = _ip_or_domain.split(".")
host_parts = _host.split(".")
result = host_parts[-len(domain_parts):] == domain_parts
return result
# logging level more fine-grained than logging.DEBUG==10
TRACE = 5
import warcprox.controller as controller
import warcprox.playback as playback
import warcprox.dedup as dedup
import warcprox.warcproxy as warcproxy
import warcprox.mitmproxy as mitmproxy
import warcprox.writer as writer
import warcprox.warc as warc
import warcprox.writerthread as writerthread
import warcprox.stats as stats
import warcprox.bigtable as bigtable
import warcprox.kafkafeed as kafkafeed
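The matching rules spelled out in host_matches_ip_or_domain's docstring can be
illustrated with a few assertions (a sketch; it assumes these helpers are
importable from the warcprox package, and the function body relies on the
ipaddress module, presumably imported in context lines this hunk doesn't show):

    import warcprox

    # same domain, case-folded and punycoded via normalize_host()
    assert warcprox.host_matches_ip_or_domain('Example.COM', 'example.com')
    # subdomain of the domain
    assert warcprox.host_matches_ip_or_domain('www.example.com', 'example.com')
    # similar-looking but not a subdomain
    assert not warcprox.host_matches_ip_or_domain('badexample.com', 'example.com')
    # ip addresses match only when identical
    assert warcprox.host_matches_ip_or_domain('192.168.1.1', '192.168.1.1')
    assert not warcprox.host_matches_ip_or_domain('192.168.1.1', '192.168.1.2')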

218
warcprox/bigtable.py Normal file

@ -0,0 +1,218 @@
"""
warcprox/bigtable.py - module for "big" RethinkDB table for deduplication;
the table is "big" in the sense that it is designed to be usable as an index
for playback software outside of warcprox, and contains information not
needed merely for deduplication
Copyright (C) 2015-2016 Internet Archive
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
USA.
"""
from __future__ import absolute_import
import logging
from hanzo import warctools
import random
import warcprox
import base64
import surt
import os
import hashlib
import threading
import datetime
import rethinkstuff
class RethinkCaptures:
"""Inserts in batches every 0.5 seconds"""
logger = logging.getLogger("warcprox.bigtable.RethinkCaptures")
def __init__(self, r, table="captures", shards=None, replicas=None, options=warcprox.Options()):
self.r = r
self.table = table
self.shards = shards or len(r.servers)
self.replicas = replicas or min(3, len(r.servers))
self.options = options
self._ensure_db_table()
self._stop = threading.Event()
self._batch_lock = threading.RLock()
with self._batch_lock:
self._batch = []
self._timer = None
def start(self):
"""Starts batch insert repeating timer"""
self._insert_batch()
def _insert_batch(self):
try:
with self._batch_lock:
if len(self._batch) > 0:
result = self.r.table(self.table).insert(self._batch).run()
if result["inserted"] != len(self._batch) or sorted(
result.values()) != [0,0,0,0,0,len(self._batch)]:
raise Exception(
"unexpected result %s saving batch of %s "
"entries", result, len(self._batch))
self.logger.debug(
"saved %s entries to big capture table db",
len(self._batch))
self._batch = []
except BaseException as e:
self.logger.error(
"caught exception trying to save %s entries, they will "
"be included in the next batch", len(self._batch),
exc_info=True)
finally:
if not self._stop.is_set():
t = threading.Timer(0.5, self._insert_batch)
t.name = "RethinkCaptures-batch-insert-timer-%s" % datetime.datetime.utcnow().isoformat()
t.start()
# ensure self._timer joinable (already started) whenever close() happens to be called
self._timer = t
else:
self.logger.info("finished")
def _ensure_db_table(self):
dbs = self.r.db_list().run()
if not self.r.dbname in dbs:
self.logger.info("creating rethinkdb database %s", repr(self.r.dbname))
self.r.db_create(self.r.dbname).run()
tables = self.r.table_list().run()
if not self.table in tables:
self.logger.info("creating rethinkdb table %s in database %s", repr(self.table), repr(self.r.dbname))
self.r.table_create(self.table, shards=self.shards, replicas=self.replicas).run()
self.r.table(self.table).index_create("abbr_canon_surt_timestamp", [self.r.row["abbr_canon_surt"], self.r.row["timestamp"]]).run()
self.r.table(self.table).index_create("sha1_warc_type", [self.r.row["sha1base32"], self.r.row["warc_type"], self.r.row["bucket"]]).run()
def find_response_by_digest(self, algo, raw_digest, bucket="__unspecified__"):
if algo != "sha1":
raise Exception("digest type is {} but big capture table is indexed by sha1".format(algo))
sha1base32 = base64.b32encode(raw_digest).decode("utf-8")
results_iter = self.r.table(self.table).get_all([sha1base32, "response", bucket], index="sha1_warc_type").run()
results = list(results_iter)
if len(results) > 0:
if len(results) > 1:
self.logger.debug("expected 0 or 1 but found %s results for sha1base32=%s bucket=%s (will use first result)", len(results), sha1base32, bucket)
result = results[0]
else:
result = None
self.logger.debug("returning %s for sha1base32=%s bucket=%s",
result, sha1base32, bucket)
return result
def _assemble_entry(self, recorded_url, records):
if recorded_url.response_recorder:
if recorded_url.response_recorder.payload_digest.name == "sha1":
sha1base32 = base64.b32encode(
recorded_url.response_recorder.payload_digest.digest()
).decode("utf-8")
else:
self.logger.warn(
"digest type is %s but big capture table is indexed "
"by sha1",
recorded_url.response_recorder.payload_digest.name)
else:
digest = hashlib.new("sha1", records[0].content[1])
sha1base32 = base64.b32encode(digest.digest()).decode("utf-8")
if (recorded_url.warcprox_meta
and "captures-bucket" in recorded_url.warcprox_meta):
bucket = recorded_url.warcprox_meta["captures-bucket"]
else:
bucket = "__unspecified__"
canon_surt = surt.surt(recorded_url.url.decode("utf-8"),
trailing_comma=True, host_massage=False, with_scheme=True)
entry = {
# id only specified for rethinkdb partitioning
"id": "{} {}".format(
canon_surt[:20], records[0].id.decode("utf-8")[10:-1]),
"abbr_canon_surt": canon_surt[:150],
"canon_surt": canon_surt,
"timestamp": recorded_url.timestamp.replace(
tzinfo=rethinkstuff.UTC),
"url": recorded_url.url.decode("utf-8"),
"offset": records[0].offset,
"filename": os.path.basename(records[0].warc_filename),
"warc_type": records[0].type.decode("utf-8"),
"warc_id": records[0].id.decode("utf-8"),
"sha1base32": sha1base32,
"content_type": recorded_url.mimetype,
"response_code": recorded_url.status,
"http_method": recorded_url.method,
"bucket": bucket,
"length": records[0].length,
}
if (recorded_url.warcprox_meta and
"captures-table-extra-fields" in recorded_url.warcprox_meta):
extras = recorded_url.warcprox_meta["captures-table-extra-fields"]
for extra_field in extras:
entry[extra_field] = extras[extra_field]
return entry
def notify(self, recorded_url, records):
entry = self._assemble_entry(recorded_url, records)
with self._batch_lock:
self._batch.append(entry)
def close(self):
self.stop()
def stop(self):
self.logger.info("closing rethinkdb captures table")
self._stop.set()
if self._timer:
self._timer.join()
class RethinkCapturesDedup:
logger = logging.getLogger("warcprox.dedup.RethinkCapturesDedup")
def __init__(self, captures_db, options=warcprox.Options()):
self.captures_db = captures_db
self.options = options
def lookup(self, digest_key, bucket="__unspecified__"):
k = digest_key.decode("utf-8") if isinstance(digest_key, bytes) else digest_key
algo, value_str = k.split(":")
if self.options.base32:
raw_digest = base64.b32decode(value_str, casefold=True)
else:
raw_digest = base64.b16decode(value_str, casefold=True)
entry = self.captures_db.find_response_by_digest(algo, raw_digest, bucket)
if entry:
dedup_info = {
"url": entry["url"].encode("utf-8"),
"date": entry["timestamp"].strftime("%Y-%m-%dT%H:%M:%SZ").encode("utf-8"),
}
if "warc_id" in entry:
dedup_info["id"] = entry["warc_id"].encode("utf-8")
return dedup_info
else:
return None
def start(self):
self.captures_db.start()
def stop(self):
self.captures_db.stop()
def close(self):
self.captures_db.close()
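RethinkCaptures' batching approach (accumulate entries under a lock, flush on
a repeating half-second threading.Timer, and re-arm the timer in a finally
block so a failed flush is retried with the next batch) can be reduced to a
minimal generic sketch, with print() standing in for the rethinkdb insert:

    import threading

    class BatchFlusher(object):
        def __init__(self, interval=0.5):
            self.interval = interval
            self._batch = []
            self._batch_lock = threading.RLock()
            self._stop = threading.Event()
            self._timer = None

        def start(self):
            self._flush()  # arms the repeating timer

        def _flush(self):
            try:
                with self._batch_lock:
                    if self._batch:
                        # stand-in for table.insert(batch).run()
                        print('flushing %s entries' % len(self._batch))
                        self._batch = []
            finally:
                # re-arm even if the flush failed, so pending entries ride
                # along with the next batch
                if not self._stop.is_set():
                    self._timer = threading.Timer(self.interval, self._flush)
                    self._timer.start()

        def notify(self, entry):
            with self._batch_lock:
                self._batch.append(entry)

        def stop(self):
            self._stop.set()
            if self._timer:
                self._timer.join()  # wait for the last armed timer to fire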


@ -1,19 +1,45 @@
# vim: set sw=4 et:
'''
warcprox/controller.py - contains WarcproxController class, responsible for
starting up and shutting down the various components of warcprox, and for
sending heartbeats to the service registry if configured to do so; also has
some memory profiling capabilities
Copyright (C) 2013-2016 Internet Archive
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
USA.
'''
from __future__ import absolute_import
import logging
import threading
import signal
import time
import warcprox.warcprox
import warcprox.warcwriter
import warcprox
import sys
import gc
import datetime
class WarcproxController(object):
logger = logging.getLogger("warcprox.controller.WarcproxController")
def __init__(self, proxy=None, warc_writer_thread=None, playback_proxy=None):
HEARTBEAT_INTERVAL = 20.0
def __init__(self, proxy=None, warc_writer_thread=None,
playback_proxy=None, service_registry=None,
options=warcprox.Options()):
"""
Create warcprox controller.
@ -34,44 +60,129 @@ class WarcproxController(object):
else:
self.warc_writer_thread = warcprox.warcwriter.WarcWriterThread(recorded_url_q=self.proxy.recorded_url_q)
self.proxy_thread = None
self.playback_proxy_thread = None
self.playback_proxy = playback_proxy
self.service_registry = service_registry
self.options = options
def run_until_shutdown(self):
"""Start warcprox and run until shut down.
If running in the main thread, SIGTERM initiates a graceful shutdown.
Otherwise, call warcprox_controller.stop.set().
"""
proxy_thread = threading.Thread(target=self.proxy.serve_forever, name='ProxyThread')
proxy_thread.start()
self.warc_writer_thread.start()
if self.playback_proxy is not None:
playback_proxy_thread = threading.Thread(target=self.playback_proxy.serve_forever, name='PlaybackProxyThread')
playback_proxy_thread.start()
self._last_rss = None
self.stop = threading.Event()
self._start_stop_lock = threading.Lock()
try:
signal.signal(signal.SIGTERM, self.stop.set)
self.logger.info('SIGTERM will initiate graceful shutdown')
except ValueError:
pass
def debug_mem(self):
self.logger.info("self.proxy.recorded_url_q.qsize()=%s", self.proxy.recorded_url_q.qsize())
with open("/proc/self/status") as f:
for line in f:
fields = line.split()
if len(fields) >= 2:
k, v = fields[0:2]
if k == "VmHWM:":
hwm = int(v)
elif k == "VmRSS:":
rss = int(v)
elif k == "VmData:":
data = int(v)
elif k == "VmStk:":
stk = int(v)
self.logger.info("rss=%s data=%s stack=%s hwm=%s", rss, data, stk, hwm)
self._last_rss = self._last_rss or rss # to set initial value
if rss - self._last_rss > 1024:
num_unreachable = gc.collect()
all_objects = gc.get_objects()
total_size = 0
summary = {}
biggest_objects = [None] * 10
for obj in all_objects:
size = sys.getsizeof(obj)
total_size += size
if not type(obj) in summary:
summary[type(obj)] = {"count":0,"size":0}
summary[type(obj)]["count"] += 1
summary[type(obj)]["size"] += size
if size > sys.getsizeof(biggest_objects[-1]):
for i in range(len(biggest_objects)):
if size > sys.getsizeof(biggest_objects[i]):
index = i
break
biggest_objects[index+1:] = biggest_objects[index:-1]
biggest_objects[index] = obj
self.logger.info("%s objects totaling %s bytes", len(all_objects), total_size)
self.logger.info("=== biggest types ===")
for item in sorted(summary.items(), key=lambda item: item[1]["size"], reverse=True)[:10]:
self.logger.info("%s bytes in %s instances of %s", item[1]["size"], item[1]["count"], item[0])
self.logger.info("=== warcprox types ===")
for t in (t for t in summary if str(t).find("warcprox") >= 0):
self.logger.info("%s bytes in %s instances of %s", summary[t]["size"], summary[t]["count"], t)
for i in range(len(biggest_objects)):
obj = biggest_objects[i]
try:
value = repr(bytes(obj.getbuffer()[:100]))
except:
try:
value = repr(obj)[:100]
except BaseException as e:
value = "<{} getting value>".format(e)
self.logger.info("#%s (%s) (%s bytes) (%s refs) (id=%s): %s", i+1, type(obj), sys.getsizeof(obj), sys.getrefcount(obj), id(obj), value)
self.logger.info("%s unreachable objects totaling %s bytes", len(gc.garbage), sum(sys.getsizeof(x) for x in gc.garbage))
self._last_rss = rss
def _service_heartbeat(self):
if hasattr(self, 'status_info'):
status_info = self.status_info
else:
status_info = {
'role': 'warcprox',
'heartbeat_interval': self.HEARTBEAT_INTERVAL,
'port': self.options.port,
}
status_info['load'] = 1.0 * self.proxy.recorded_url_q.qsize() / (self.proxy.recorded_url_q.maxsize or 100)
status_info['queue_size'] = self.proxy.recorded_url_q.qsize()
self.status_info = self.service_registry.heartbeat(status_info)
self.logger.log(
warcprox.TRACE, "status in service registry: %s",
self.status_info)
def start(self):
with self._start_stop_lock:
if self.proxy_thread and self.proxy_thread.is_alive():
self.logger.info('warcprox is already running')
return
if self.proxy.stats_db:
self.proxy.stats_db.start()
self.proxy_thread = threading.Thread(
target=self.proxy.serve_forever, name='ProxyThread')
self.proxy_thread.start()
if self.warc_writer_thread.dedup_db:
self.warc_writer_thread.dedup_db.start()
self.warc_writer_thread.start()
if self.playback_proxy is not None:
self.playback_proxy_thread = threading.Thread(
target=self.playback_proxy.serve_forever,
name='PlaybackProxyThread')
self.playback_proxy_thread.start()
def shutdown(self):
with self._start_stop_lock:
if not self.proxy_thread or not self.proxy_thread.is_alive():
self.logger.info('warcprox is not running')
return
try:
while not self.stop.is_set():
time.sleep(0.5)
except:
pass
finally:
self.warc_writer_thread.stop.set()
self.proxy.shutdown()
self.proxy.server_close()
if self.warc_writer_thread.warc_writer.dedup_db is not None:
self.warc_writer_thread.warc_writer.dedup_db.close()
if self.playback_proxy is not None:
self.playback_proxy.shutdown()
self.playback_proxy.server_close()
@ -80,7 +191,59 @@ class WarcproxController(object):
# wait for threads to finish
self.warc_writer_thread.join()
proxy_thread.join()
if self.playback_proxy is not None:
playback_proxy_thread.join()
if self.proxy.stats_db:
self.proxy.stats_db.stop()
if self.warc_writer_thread.dedup_db:
self.warc_writer_thread.dedup_db.close()
self.proxy_thread.join()
if self.playback_proxy is not None:
self.playback_proxy_thread.join()
if self.service_registry and hasattr(self, "status_info"):
self.service_registry.unregister(self.status_info["id"])
def run_until_shutdown(self):
"""
Start warcprox and run until shut down. Call
warcprox_controller.stop.set() to initiate graceful shutdown.
"""
self.start()
last_mem_dbg = datetime.datetime.utcfromtimestamp(0)
try:
utc = datetime.timezone.utc
except AttributeError:
# python2 :-\
class UTC(datetime.tzinfo):
def tzname(self, dt): return "UTC+00:00"
def dst(self, dt): return datetime.timedelta(0)
def utcoffset(self, dt): return datetime.timedelta(0)
utc = UTC()
try:
while not self.stop.is_set():
if self.service_registry and (
not hasattr(self, "status_info") or (
datetime.datetime.now(utc)
- self.status_info["last_heartbeat"]
).total_seconds() > self.HEARTBEAT_INTERVAL):
self._service_heartbeat()
if self.options.profile and (
datetime.datetime.utcnow() - last_mem_dbg
).total_seconds() > 60:
self.debug_mem()
last_mem_dbg = datetime.datetime.utcnow()
time.sleep(0.5)
except:
self.logger.critical(
"shutting down in response to fatal exception",
exc_info=True)
pass
finally:
self.shutdown()
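The load figure reported in each heartbeat is just queue fullness, with a
fallback divisor for unbounded queues; worked with illustrative numbers:

    # illustrative values only
    qsize, maxsize = 250, 500
    load = 1.0 * qsize / (maxsize or 100)  # 0.5; `or 100` guards against maxsize == 0 (unbounded queue)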


@ -1,30 +1,58 @@
# vim:set sw=4 et:
#
# warcprox/dedup.py - identical payload digest deduplication
#
# Copyright (C) 2013-2016 Internet Archive
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
# USA.
#
from __future__ import absolute_import
try:
import dbm.gnu as dbm_gnu
except ImportError:
try:
import gdbm as dbm_gnu
except ImportError:
import anydbm as dbm_gnu
import logging
import os
import json
from hanzo import warctools
import warcprox
import random
class DedupDb(object):
logger = logging.getLogger("warcprox.dedup.DedupDb")
def __init__(self, dbm_file='./warcprox-dedup.db'):
def __init__(self, dbm_file='./warcprox-dedup.db', options=warcprox.Options()):
try:
import dbm.gnu as dbm_gnu
except ImportError:
try:
import gdbm as dbm_gnu
except ImportError:
import anydbm as dbm_gnu
if os.path.exists(dbm_file):
self.logger.info('opening existing deduplication database {}'.format(dbm_file))
else:
self.logger.info('creating new deduplication database {}'.format(dbm_file))
self.db = dbm_gnu.open(dbm_file, 'c')
self.options = options
def start(self):
pass
def stop(self):
self.close()
def close(self):
self.db.close()
@ -35,26 +63,115 @@ class DedupDb(object):
except:
pass
def save(self, key, response_record, offset):
def save(self, digest_key, response_record, bucket=""):
record_id = response_record.get_header(warctools.WarcRecord.ID).decode('latin1')
url = response_record.get_header(warctools.WarcRecord.URL).decode('latin1')
date = response_record.get_header(warctools.WarcRecord.DATE).decode('latin1')
py_value = {'i':record_id, 'u':url, 'd':date}
key = digest_key + b"|" + bucket.encode("utf-8")
py_value = {'id':record_id, 'url':url, 'date':date}
json_value = json.dumps(py_value, separators=(',',':'))
self.db[key] = json_value.encode('utf-8')
self.logger.debug('dedup db saved {}:{}'.format(key, json_value))
self.logger.debug('dedup db saved %s:%s', key, json_value)
def lookup(self, key):
def lookup(self, digest_key, bucket=""):
result = None
key = digest_key + b"|" + bucket.encode("utf-8")
if key in self.db:
json_result = self.db[key]
result = json.loads(json_result.decode('utf-8'))
result['i'] = result['i'].encode('latin1')
result['u'] = result['u'].encode('latin1')
result['d'] = result['d'].encode('latin1')
return result
result['id'] = result['id'].encode('latin1')
result['url'] = result['url'].encode('latin1')
result['date'] = result['date'].encode('latin1')
self.logger.debug('dedup db lookup of key=%s returning %s', key, result)
return result
def notify(self, recorded_url, records):
if (records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.RESPONSE
and recorded_url.response_recorder.payload_size() > 0):
digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest,
self.options.base32)
if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta:
self.save(digest_key, records[0], bucket=recorded_url.warcprox_meta["captures-bucket"])
else:
self.save(digest_key, records[0])
def decorate_with_dedup_info(dedup_db, recorded_url, base32=False):
if (recorded_url.response_recorder
and recorded_url.response_recorder.payload_digest
and recorded_url.response_recorder.payload_size() > 0):
digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest, base32)
if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta:
recorded_url.dedup_info = dedup_db.lookup(digest_key, recorded_url.warcprox_meta["captures-bucket"])
else:
return None
recorded_url.dedup_info = dedup_db.lookup(digest_key)
class RethinkDedupDb:
logger = logging.getLogger("warcprox.dedup.RethinkDedupDb")
def __init__(self, r, table="dedup", shards=None, replicas=None, options=warcprox.Options()):
self.r = r
self.table = table
self.shards = shards or len(r.servers)
self.replicas = replicas or min(3, len(r.servers))
self._ensure_db_table()
self.options = options
def _ensure_db_table(self):
dbs = self.r.db_list().run()
if not self.r.dbname in dbs:
self.logger.info("creating rethinkdb database %s", repr(self.r.dbname))
self.r.db_create(self.r.dbname).run()
tables = self.r.table_list().run()
if not self.table in tables:
self.logger.info("creating rethinkdb table %s in database %s shards=%s replicas=%s",
repr(self.table), repr(self.r.dbname), self.shards, self.replicas)
self.r.table_create(self.table, primary_key="key", shards=self.shards, replicas=self.replicas).run()
def start(self):
pass
def stop(self):
pass
def close(self):
pass
def sync(self):
pass
def save(self, digest_key, response_record, bucket=""):
k = digest_key.decode("utf-8") if isinstance(digest_key, bytes) else digest_key
k = "{}|{}".format(k, bucket)
record_id = response_record.get_header(warctools.WarcRecord.ID).decode('latin1')
url = response_record.get_header(warctools.WarcRecord.URL).decode('latin1')
date = response_record.get_header(warctools.WarcRecord.DATE).decode('latin1')
record = {'key':k,'url':url,'date':date,'id':record_id}
result = self.r.table(self.table).insert(record,conflict="replace").run()
if sorted(result.values()) != [0,0,0,0,0,1] and [result["deleted"],result["skipped"],result["errors"]] != [0,0,0]:
raise Exception("unexpected result %s saving %s", result, record)
self.logger.debug('dedup db saved %s:%s', k, record)
def lookup(self, digest_key, bucket=""):
k = digest_key.decode("utf-8") if isinstance(digest_key, bytes) else digest_key
k = "{}|{}".format(k, bucket)
result = self.r.table(self.table).get(k).run()
if result:
for x in result:
result[x] = result[x].encode("utf-8")
self.logger.debug('dedup db lookup of key=%s returning %s', k, result)
return result
def notify(self, recorded_url, records):
if (records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.RESPONSE
and recorded_url.response_recorder.payload_size() > 0):
digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest,
self.options.base32)
if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta:
self.save(digest_key, records[0], bucket=recorded_url.warcprox_meta["captures-bucket"])
else:
self.save(digest_key, records[0])
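Putting warcprox.digest_str together with the bucketed key scheme used by
DedupDb.save() and lookup() above, a key is derived like this (a sketch; the
payload and bucket name are made up):

    import hashlib

    payload_digest = hashlib.sha1(b'hello')
    # hex flavor of warcprox.digest_str(payload_digest, base32=False)
    digest_key = payload_digest.name.encode('utf-8') + b':' + payload_digest.hexdigest().encode('ascii')
    bucket = 'example-bucket'
    key = digest_key + b'|' + bucket.encode('utf-8')
    print(key)  # b'sha1:aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d|example-bucket'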


@ -1,12 +1,28 @@
#!/usr/bin/env python
# vim:set sw=4 et:
#
'''
dump-anydbm - dumps contents of dbm file to stdout

Dump contents of database to stdout. Database can be any file that the anydbm
module can read. Included with warcprox because it's useful for inspecting a
deduplication database or a playback index database, but it is a generic tool.

Copyright (C) 2013-2016 Internet Archive

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
USA.
'''
try:
import dbm
@ -14,7 +30,7 @@ try:
whichdb = dbm.whichdb
except:
import anydbm
dbm = anydbm
from whichdb import whichdb
@ -22,6 +38,9 @@ import sys
import os.path
if __name__ == "__main__":
main()
def main():
if len(sys.argv) != 2:
sys.stderr.write("usage: {} DBM_FILE\n".format(sys.argv[0]))
exit(1)
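# Example (path hypothetical): `dump-anydbm ./warcprox-dedup.db` writes each
# key and value in the dbm file to stdout.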

101
warcprox/kafkafeed.py Normal file
View File

@ -0,0 +1,101 @@
'''
warcprox/kafkafeed.py - support for publishing information about archived
urls to apache kafka
Copyright (C) 2015-2016 Internet Archive
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
USA.
'''
import kafka
import datetime
import json
import logging
from hanzo import warctools
class CaptureFeed:
logger = logging.getLogger('warcprox.kafkafeed.CaptureFeed')
def __init__(self, broker_list, topic=None):
self.broker_list = broker_list
self.topic = topic
self.__producer = None
self._connection_exception = None
def _producer(self):
if not self.__producer:
try:
# acks=0 to avoid ever blocking
self.__producer = kafka.KafkaProducer(
bootstrap_servers=self.broker_list, acks=0)
if self._connection_exception:
logging.info('connected to kafka successfully!')
self._connection_exception = None
except Exception as e:
if not self._connection_exception:
self._connection_exception = e
logging.error('problem connecting to kafka', exc_info=True)
return self.__producer
def notify(self, recorded_url, records):
if records[0].type not in (b'revisit', b'response'):
return
topic = recorded_url.warcprox_meta.get('capture-feed-topic', self.topic)
if not topic:
return
try:
payload_digest = records[0].get_header(warctools.WarcRecord.PAYLOAD_DIGEST).decode('utf-8')
except:
payload_digest = '-'
# {"status_code":200,"content_digest":"sha1:3VU56HI3BTMDZBL2TP7SQYXITT7VEAJQ","host":"www.kaosgl.com","via":"http://www.kaosgl.com/sayfa.php?id=4427","account_id":"877","seed":"http://www.kaosgl.com/","warc_filename":"ARCHIVEIT-6003-WEEKLY-JOB171310-20150903100014694-00002.warc.gz","url":"http://www.kaosgl.com/resim/HomofobiKarsitiBulusma/trabzon05.jpg","size":29700,"start_time_plus_duration":"20150903175709637+1049","timestamp":"2015-09-03T17:57:10.707Z","mimetype":"image/jpeg","collection_id":"6003","is_test_crawl":"false","job_name":"6003-20150902172136074","warc_offset":856320200,"thread":6,"hop_path":"RLLLLLE","extra_info":{},"annotations":"duplicate:digest","content_length":29432}
now = datetime.datetime.utcnow()
d = {
'timestamp': '{:%Y-%m-%dT%H:%M:%S}.{:03d}Z'.format(now, now.microsecond//1000),
'size': recorded_url.size,
'status_code': recorded_url.status,
'url': recorded_url.url.decode('utf-8'),
'mimetype': recorded_url.mimetype,
'content_digest': payload_digest,
'warc_filename': records[0].warc_filename,
'warc_offset': records[0].offset,
'host': recorded_url.host,
'annotations': 'duplicate:digest' if records[0].type == b'revisit' else '',
'content_length': recorded_url.response_recorder.len - recorded_url.response_recorder.payload_offset,
'start_time_plus_duration': '{:%Y%m%d%H%M%S}{:03d}+{}'.format(
recorded_url.timestamp, recorded_url.timestamp.microsecond//1000,
int(recorded_url.duration.total_seconds() * 1000)),
# 'hop_path': ? # only used for seed redirects, which are n/a to brozzler (?)
# 'via': ?
# 'thread': ? # not needed
}
# fields expected to be populated here are (for archive-it):
# account_id, collection_id, is_test_crawl, seed, job_name
if recorded_url.warcprox_meta and 'capture-feed-extra-fields' in recorded_url.warcprox_meta:
for (k,v) in recorded_url.warcprox_meta['capture-feed-extra-fields'].items():
d[k] = v
msg = json.dumps(d, separators=(',', ':')).encode('utf-8')
self.logger.debug('feeding kafka topic=%s msg=%s', repr(topic), msg)
p = self._producer()
if p:
p.send(topic, msg)
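# A minimal usage sketch (broker/topic names hypothetical):
#
#   feed = CaptureFeed(['kafka0.foo.org:9092'], topic='captures')
#   # register `feed` as a listener so notify() runs for each archived url;
#   # a Warcprox-Meta 'capture-feed-topic' value overrides the default topic
#   # per request.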

View File

@ -1,5 +1,25 @@
#!/usr/bin/env python
# vim:set sw=4 et:
'''
warcprox/main.py - entrypoint for warcprox executable, parses command line
arguments, initializes components, starts controller, handles signals
Copyright (C) 2013-2016 Internet Archive
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
USA.
'''
from __future__ import absolute_import
@ -14,21 +34,21 @@ import hashlib
import argparse
import os
import socket
import traceback
import signal
import threading
import certauth.certauth
import warcprox
import re
import rethinkstuff
import cryptography.hazmat.backends.openssl
def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
arg_parser = argparse.ArgumentParser(prog=prog,
description='warcprox - WARC writing MITM HTTP/S proxy',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
arg_parser.add_argument('-p', '--port', dest='port', default='8000',
type=int, help='port to listen on')
arg_parser.add_argument('-b', '--address', dest='address',
default='localhost', help='address to listen on')
arg_parser.add_argument('-c', '--cacert', dest='cacert',
@ -44,10 +64,10 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
arg_parser.add_argument('-n', '--prefix', dest='prefix',
default='WARCPROX', help='WARC filename prefix')
arg_parser.add_argument('-s', '--size', dest='size',
default=1000*1000*1000, type=int,
help='WARC file rollover size threshold in bytes')
arg_parser.add_argument('--rollover-idle-time',
dest='rollover_idle_time', default=None, type=int,
help="WARC file rollover idle time threshold in seconds (so that Friday's last open WARC doesn't sit there all weekend waiting for more data)")
try:
hash_algos = hashlib.algorithms_guaranteed
@ -57,30 +77,171 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
default='sha1', help='digest algorithm, one of {}'.format(', '.join(hash_algos)))
arg_parser.add_argument('--base32', dest='base32', action='store_true',
default=False, help='write digests in Base32 instead of hex')
arg_parser.add_argument('--stats-db-file', dest='stats_db_file',
default='./warcprox-stats.db', help='persistent statistics database file; empty string or /dev/null disables statistics tracking')
arg_parser.add_argument('-P', '--playback-port', dest='playback_port',
type=int, default=None, help='port to listen on for instant playback')
arg_parser.add_argument('--playback-index-db-file', dest='playback_index_db_file',
default='./warcprox-playback-index.db',
help='playback index database file (only used if --playback-port is specified)')
group = arg_parser.add_mutually_exclusive_group()
group.add_argument('-j', '--dedup-db-file', dest='dedup_db_file',
default='./warcprox-dedup.db', help='persistent deduplication database file; empty string or /dev/null disables deduplication')
group.add_argument('--rethinkdb-servers', dest='rethinkdb_servers',
help='rethinkdb servers, used for dedup and stats if specified; e.g. db0.foo.org,db0.foo.org:38015,db1.foo.org')
arg_parser.add_argument('--rethinkdb-db', dest='rethinkdb_db', default='warcprox',
help='rethinkdb database name (ignored unless --rethinkdb-servers is specified)')
arg_parser.add_argument('--rethinkdb-big-table',
dest='rethinkdb_big_table', action='store_true', default=False,
help='use a big rethinkdb table called "captures", instead of a small table called "dedup"; table is suitable for use as index for playback (ignored unless --rethinkdb-servers is specified)')
arg_parser.add_argument('--kafka-broker-list', dest='kafka_broker_list',
default=None, help='kafka broker list for capture feed')
arg_parser.add_argument('--kafka-capture-feed-topic', dest='kafka_capture_feed_topic',
default=None, help='kafka capture feed topic')
arg_parser.add_argument('--queue-size', dest='queue_size', default=500,
help=argparse.SUPPRESS)
arg_parser.add_argument('--max-threads', dest='max_threads',
help=argparse.SUPPRESS)
arg_parser.add_argument('--profile', action='store_true', default=False,
help=argparse.SUPPRESS)
arg_parser.add_argument('--onion-tor-socks-proxy', dest='onion_tor_socks_proxy',
default=None, help='host:port of tor socks proxy, used only to connect to .onion sites')
arg_parser.add_argument('--version', action='version',
version="warcprox {}".format(warcprox.version_str))
version="warcprox {}".format(warcprox.__version__))
arg_parser.add_argument('-v', '--verbose', dest='verbose', action='store_true')
arg_parser.add_argument('--trace', dest='trace', action='store_true')
arg_parser.add_argument('-q', '--quiet', dest='quiet', action='store_true')
# [--ispartof=warcinfo ispartof]
# [--description=warcinfo description]
# [--operator=warcinfo operator]
# [--httpheader=warcinfo httpheader]
return arg_parser
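# Example invocation exercising the new options (hostnames hypothetical):
#
#   warcprox --rethinkdb-servers db0.foo.org,db1.foo.org --rethinkdb-big-table \
#       --kafka-broker-list kafka0.foo.org:9092 \
#       --kafka-capture-feed-topic captures \
#       --onion-tor-socks-proxy localhost:9050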
def dump_state(signum=None, frame=None):
'''
Signal handler, logs stack traces of active threads.
'''
state_strs = []
for th in threading.enumerate():
try:
state_strs.append(str(th))
except AssertionError:
state_strs.append('<n/a:AssertionError>')
stack = traceback.format_stack(sys._current_frames()[th.ident])
state_strs.append(''.join(stack))
logging.warn(
'dumping state (caught signal %s)\n%s',
signum, '\n'.join(state_strs))
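# dump_state is registered as the SIGQUIT handler in real_main() below, so
# e.g. `kill -QUIT <warcprox pid>` logs every thread's stack without stopping
# the process.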
def init_controller(args):
'''
Creates a warcprox.controller.WarcproxController configured according to
the supplied arguments (normally the result of parse_args(sys.argv)).
'''
options = warcprox.Options(**vars(args))
try:
hashlib.new(args.digest_algorithm)
except Exception as e:
logging.fatal(e)
exit(1)
listeners = []
if args.rethinkdb_servers:
r = rethinkstuff.Rethinker(args.rethinkdb_servers.split(","), args.rethinkdb_db)
if args.rethinkdb_big_table:
captures_db = warcprox.bigtable.RethinkCaptures(r, options=options)
dedup_db = warcprox.bigtable.RethinkCapturesDedup(captures_db, options=options)
listeners.append(captures_db)
else:
dedup_db = warcprox.dedup.RethinkDedupDb(r, options=options)
listeners.append(dedup_db)
elif args.dedup_db_file in (None, '', '/dev/null'):
logging.info('deduplication disabled')
dedup_db = None
else:
dedup_db = warcprox.dedup.DedupDb(args.dedup_db_file, options=options)
listeners.append(dedup_db)
if args.rethinkdb_servers:
stats_db = warcprox.stats.RethinkStatsDb(r, options=options)
listeners.append(stats_db)
elif args.stats_db_file in (None, '', '/dev/null'):
logging.info('statistics tracking disabled')
stats_db = None
else:
stats_db = warcprox.stats.StatsDb(args.stats_db_file, options=options)
listeners.append(stats_db)
if args.kafka_broker_list:
kafka_capture_feed = warcprox.kafkafeed.CaptureFeed(
args.kafka_broker_list, args.kafka_capture_feed_topic)
listeners.append(kafka_capture_feed)
recorded_url_q = queue.Queue(maxsize=args.queue_size)
ca_name = 'Warcprox CA on {}'.format(socket.gethostname())[:64]
ca = certauth.certauth.CertificateAuthority(args.cacert, args.certs_dir,
ca_name=ca_name)
proxy = warcprox.warcproxy.WarcProxy(ca=ca, recorded_url_q=recorded_url_q,
stats_db=stats_db, options=options)
if args.playback_port is not None:
playback_index_db = warcprox.playback.PlaybackIndexDb(args.playback_index_db_file, options=options)
playback_proxy = warcprox.playback.PlaybackProxy(
server_address=(args.address, args.playback_port), ca=ca,
playback_index_db=playback_index_db, warcs_dir=args.directory,
options=options)
listeners.append(playback_index_db)
else:
playback_index_db = None
playback_proxy = None
writer_pool = warcprox.writer.WarcWriterPool(options=options)
warc_writer_thread = warcprox.writerthread.WarcWriterThread(
recorded_url_q=recorded_url_q, writer_pool=writer_pool,
dedup_db=dedup_db, listeners=listeners, options=options)
if args.rethinkdb_servers:
svcreg = rethinkstuff.ServiceRegistry(r)
else:
svcreg = None
controller = warcprox.controller.WarcproxController(proxy,
warc_writer_thread, playback_proxy, service_registry=svcreg,
options=options)
return controller
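# A sketch of driving warcprox programmatically (argv values hypothetical):
#
#   args = parse_args(['warcprox', '-p', '8000', '-d', './warcs'])
#   controller = init_controller(args)
#   controller.run_until_shutdown()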
def real_main(args):
# see https://github.com/pyca/cryptography/issues/2911
cryptography.hazmat.backends.openssl.backend.activate_builtin_random()
controller = init_controller(args)
signal.signal(signal.SIGTERM, lambda a,b: controller.stop.set())
signal.signal(signal.SIGINT, lambda a,b: controller.stop.set())
signal.signal(signal.SIGQUIT, dump_state)
controller.run_until_shutdown()
def parse_args(argv=sys.argv):
'''
Parses command line arguments with argparse.
'''
arg_parser = _build_arg_parser(prog=os.path.basename(argv[0]))
args = arg_parser.parse_args(args=argv[1:])
return args
def main(argv=sys.argv):
'''
Main method, entry point of warcprox command.
'''
args = parse_args(argv)
if args.trace:
loglevel = warcprox.TRACE
elif args.verbose:
loglevel = logging.DEBUG
elif args.quiet:
loglevel = logging.WARNING
@ -90,51 +251,50 @@ def main(argv=sys.argv):
logging.basicConfig(stream=sys.stdout, level=loglevel,
format='%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')
real_main(args)
def ensure_rethinkdb_tables():
'''
Creates rethinkdb tables if they don't already exist. Warcprox normally
creates the tables it needs on demand at startup, but if multiple instances
are starting up at the same time, you can end up with duplicate broken
tables. So it's a good idea to use this utility at an early step when
spinning up a cluster.
'''
arg_parser = argparse.ArgumentParser(
prog=os.path.basename(sys.argv[0]),
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
arg_parser.add_argument(
'--rethinkdb-servers', dest='rethinkdb_servers', default='localhost',
help='rethinkdb servers e.g. db0.foo.org,db0.foo.org:38015,db1.foo.org')
arg_parser.add_argument(
'--rethinkdb-db', dest='rethinkdb_db', default='warcprox',
help='rethinkdb database name')
arg_parser.add_argument(
'-q', '--quiet', dest='log_level',
action='store_const', default=logging.INFO, const=logging.WARN)
arg_parser.add_argument(
'-v', '--verbose', dest='log_level',
action='store_const', default=logging.INFO, const=logging.DEBUG)
args = arg_parser.parse_args(args=sys.argv[1:])
logging.basicConfig(
stream=sys.stdout, level=args.log_level,
format=(
'%(asctime)s %(levelname)s %(name)s.%(funcName)s'
'(%(filename)s:%(lineno)d) %(message)s'))
r = rethinkstuff.Rethinker(
args.rethinkdb_servers.split(','), args.rethinkdb_db)
# services table
rethinkstuff.ServiceRegistry(r)
# stats table
warcprox.stats.RethinkStatsDb(r)
# captures table
warcprox.bigtable.RethinkCaptures(r)
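# Typical early deploy step when spinning up a cluster (hostnames
# hypothetical; assumes this function is wired up as a console script):
#
#   ensure-rethinkdb-tables --rethinkdb-servers db0.foo.org,db1.foo.org \
#       --rethinkdb-db warcprox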
if __name__ == '__main__':
main()

View File

@ -1,4 +1,28 @@
# vim:set sw=4 et:
'''
warcprox/mitmproxy.py - man-in-the-middle http/s proxy code, handles http
CONNECT method by creating a snakeoil certificate for the requested site,
calling ssl.wrap_socket() on the client connection; connects to remote
(proxied) host, possibly using tor if host tld is .onion and tor proxy is
configured
Copyright (C) 2012 Cygnos Corporation
Copyright (C) 2013-2016 Internet Archive
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
USA.
'''
from __future__ import absolute_import
@ -11,46 +35,194 @@ try:
import urllib.parse as urllib_parse
except ImportError:
import urlparse as urllib_parse
try:
import http.client as http_client
except ImportError:
import httplib as http_client
import socket
import logging
import ssl
import warcprox
import threading
import datetime
import socks
import tempfile
import hashlib
try:
import socketserver
except ImportError:
import SocketServer as socketserver
import resource
import concurrent.futures
class ProxyingRecorder(object):
"""
Wraps a socket._fileobject, recording the bytes as they are read,
calculating digests, and sending them on to the proxy client.
"""
logger = logging.getLogger("warcprox.mitmproxy.ProxyingRecorder")
def __init__(self, fp, proxy_client, digest_algorithm='sha1', url=None):
self.fp = fp
# "The file has no name, and will cease to exist when it is closed."
self.tempfile = tempfile.SpooledTemporaryFile(max_size=512*1024)
self.digest_algorithm = digest_algorithm
self.block_digest = hashlib.new(digest_algorithm)
self.payload_offset = None
self.payload_digest = None
self.proxy_client = proxy_client
self._proxy_client_conn_open = True
self.len = 0
self.url = url
def payload_starts_now(self):
self.payload_digest = hashlib.new(self.digest_algorithm)
self.payload_offset = self.len
def _update_payload_digest(self, hunk):
if self.payload_digest:
self.payload_digest.update(hunk)
def _update(self, hunk):
self._update_payload_digest(hunk)
self.block_digest.update(hunk)
self.tempfile.write(hunk)
if self.payload_digest and self._proxy_client_conn_open:
try:
self.proxy_client.sendall(hunk)
except BaseException as e:
self._proxy_client_conn_open = False
self.logger.warn(
'%s sending data to proxy client for url %s',
e, self.url)
self.logger.info(
'will continue downloading from remote server without '
'sending to client %s', self.url)
self.len += len(hunk)
def read(self, size=-1):
hunk = self.fp.read(size)
self._update(hunk)
return hunk
def readinto(self, b):
n = self.fp.readinto(b)
self._update(b[:n])
return n
def readline(self, size=-1):
# XXX depends on implementation details of self.fp.readline(), in
# particular that it doesn't call self.fp.read()
hunk = self.fp.readline(size)
self._update(hunk)
return hunk
def flush(self):
return self.fp.flush()
def close(self):
return self.fp.close()
def __len__(self):
return self.len
def payload_size(self):
if self.payload_offset is not None:
return self.len - self.payload_offset
else:
return 0
class ProxyingRecordingHTTPResponse(http_client.HTTPResponse):
'''
Implementation of HTTPResponse that uses a ProxyingRecorder to read the
response from the remote web server and send it on to the proxy client,
while recording the bytes in transit.
'''
def __init__(
self, sock, debuglevel=0, method=None, proxy_client=None,
digest_algorithm='sha1', url=None):
http_client.HTTPResponse.__init__(
self, sock, debuglevel=debuglevel, method=method)
self.proxy_client = proxy_client
self.url = url
# Keep around extra reference to self.fp because HTTPResponse sets
# self.fp=None after it finishes reading, but we still need it
self.recorder = ProxyingRecorder(
self.fp, proxy_client, digest_algorithm, url=url)
self.fp = self.recorder
def begin(self):
http_client.HTTPResponse.begin(self) # reads status line, headers
status_and_headers = 'HTTP/1.1 {} {}\r\n'.format(
self.status, self.reason)
for k,v in self.msg.items():
if k.lower() not in (
'connection', 'proxy-connection', 'keep-alive',
'proxy-authenticate', 'proxy-authorization', 'upgrade',
'strict-transport-security'):
status_and_headers += '{}: {}\r\n'.format(k, v)
status_and_headers += 'Connection: close\r\n\r\n'
self.proxy_client.sendall(status_and_headers.encode('latin1'))
self.recorder.payload_starts_now()
class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
'''
An http proxy implementation of BaseHTTPRequestHandler, that acts as a
man-in-the-middle in order to peek at the content of https transactions,
and records the bytes in transit as it proxies them.
'''
logger = logging.getLogger("warcprox.mitmproxy.MitmProxyHandler")
def __init__(self, request, client_address, server):
threading.current_thread().name = 'MitmProxyHandler(tid={},started={},client={}:{})'.format(warcprox.gettid(), datetime.datetime.utcnow().isoformat(), client_address[0], client_address[1])
self.is_connect = False
self._headers_buffer = []
request.settimeout(60) # XXX what value should this have?
http_server.BaseHTTPRequestHandler.__init__(self, request, client_address, server)
def _determine_host_port(self):
# Get hostname and port to connect to
if self.is_connect:
host, self.port = self.path.split(':')
else:
self.url = self.path
u = urllib_parse.urlparse(self.url)
if u.scheme != 'http':
raise Exception(
'unable to parse request %s as a proxy request' % (
repr(self.requestline)))
host = u.hostname
self.port = u.port or 80
self.path = urllib_parse.urlunparse(
urllib_parse.ParseResult(
scheme='', netloc='', params=u.params, path=u.path or '/',
query=u.query, fragment=u.fragment))
self.hostname = warcprox.normalize_host(host)
def _connect_to_remote_server(self):
# Connect to destination
if self.onion_tor_socks_proxy_host and self.hostname.lower().endswith('.onion'):
self.logger.info("using tor socks proxy at %s:%s to connect to %s",
self.onion_tor_socks_proxy_host,
self.onion_tor_socks_proxy_port or 1080,
self.hostname)
self._remote_server_sock = socks.socksocket()
self._remote_server_sock.set_proxy(
socks.SOCKS5, addr=self.onion_tor_socks_proxy_host,
port=self.onion_tor_socks_proxy_port, rdns=True)
else:
self._remote_server_sock = socket.socket()
# XXX what value should this timeout have?
self._remote_server_sock.settimeout(60)
self._remote_server_sock.connect((self.hostname, int(self.port)))
# Wrap socket if SSL is required
if self.is_connect:
@ -58,24 +230,44 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
context = ssl.create_default_context()
context.check_hostname = False
context.verify_mode = ssl.CERT_NONE
self._remote_server_sock = context.wrap_socket(
self._remote_server_sock, server_hostname=self.hostname)
except AttributeError:
try:
self._remote_server_sock = ssl.wrap_socket(
self._remote_server_sock)
except ssl.SSLError:
self.logger.warn("failed to establish ssl connection to {}; python ssl library does not support SNI, considering upgrading to python >= 2.7.9 or python 3.4".format(self.hostname))
self.logger.warn(
"failed to establish ssl connection to %s; python "
"ssl library does not support SNI, considering "
"upgrading to python >= 2.7.9 or python 3.4",
self.hostname)
raise
return self._remote_server_sock
def _transition_to_ssl(self):
self.request = self.connection = ssl.wrap_socket(self.connection,
server_side=True, certfile=self.server.ca.cert_for_host(self.hostname))
def do_CONNECT(self):
'''
Handles a http CONNECT request.
The CONNECT method is meant to "convert the request connection to a
transparent TCP/IP tunnel, usually to facilitate SSL-encrypted
communication (HTTPS) through an unencrypted HTTP proxy" (Wikipedia).
do_CONNECT is where the man-in-the-middle logic happens. In do_CONNECT
the proxy transitions the proxy client connection to ssl while
masquerading as the remote web server using a generated certificate.
Meanwhile it makes its own separate ssl connection to the remote web
server. Then it calls self.handle_one_request() again to handle the
request intended for the remote server.
'''
self.is_connect = True
try:
# Connect to destination first
self._determine_host_port()
self._connect_to_remote_server()
# If successful, let's do this!
self.send_response(200, 'Connection established')
@ -83,6 +275,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
self._transition_to_ssl()
except Exception as e:
try:
self.logger.error("problem handling {}: {}".format(repr(self.requestline), e))
if type(e) is socket.timeout:
self.send_error(504, str(e))
else:
@ -115,35 +308,162 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
return result
def do_COMMAND(self):
# if self.is_connect we already connected in do_CONNECT
if self.is_connect:
self.url = self._construct_tunneled_url()
else:
self._determine_host_port()
assert self.url
try:
# Connect to destination
self._connect_to_remote_server()
except warcprox.RequestBlockedByRule as e:
# limit enforcers have already sent the appropriate response
self.logger.info("%s: %s", repr(self.requestline), e)
return
except Exception as e:
self.logger.error("problem processing request {}: {}".format(repr(self.requestline), e), exc_info=True)
self.send_error(500, str(e))
return
try:
self._proxy_request()
except:
self.logger.error("exception proxying request", exc_info=True)
raise
def _proxy_request(self):
'''
Sends the request to the remote server, then uses a ProxyingRecorder to
read the response and send it to the proxy client, while recording the
bytes in transit. Returns a tuple (request, response) where request is
the raw request bytes, and response is a ProxyingRecorder.
'''
# Build request
req_str = '{} {} {}\r\n'.format(
self.command, self.path, self.request_version)
# Swallow headers that don't make sense to forward on, i.e. most
# hop-by-hop headers, see
# http://tools.ietf.org/html/rfc2616#section-13.5.
# self.headers is an email.message.Message, which is case-insensitive
# and doesn't throw KeyError in __delitem__
for key in (
'Connection', 'Proxy-Connection', 'Keep-Alive',
'Proxy-Authenticate', 'Proxy-Authorization', 'Upgrade'):
del self.headers[key]
# Add headers to the request
# XXX in at least python3.3 str(self.headers) uses \n not \r\n :(
req_str += '\r\n'.join(
'{}: {}'.format(k,v) for (k,v) in self.headers.items())
req = req_str.encode('latin1') + b'\r\n\r\n'
# Append message body if present to the request
if 'Content-Length' in self.headers:
req += self.rfile.read(int(self.headers['Content-Length']))
prox_rec_res = None  # so the finally clause below can't hit an unbound name
try:
self.logger.debug('sending to remote server req=%s', repr(req))
# Send it down the pipe!
self._remote_server_sock.sendall(req)
prox_rec_res = ProxyingRecordingHTTPResponse(
self._remote_server_sock, proxy_client=self.connection,
digest_algorithm=self.server.digest_algorithm,
url=self.url)
prox_rec_res.begin()
buf = prox_rec_res.read(8192)
while buf != b'':
buf = prox_rec_res.read(8192)
self.log_request(prox_rec_res.status, prox_rec_res.recorder.len)
except socket.timeout as e:
self.logger.warn(
"%s proxying %s %s", repr(e), self.command, self.url)
except BaseException as e:
self.logger.error(
"%s proxying %s %s", repr(e), self.command, self.url,
exc_info=True)
finally:
# Let's close off the remote end
if prox_rec_res:
prox_rec_res.close()
self._remote_server_sock.close()
return req, prox_rec_res
def __getattr__(self, item):
if item.startswith('do_'):
return self.do_COMMAND
def log_error(self, fmt, *args):
self.logger.error("{0} - - [{1}] {2}".format(self.address_string(),
self.log_date_time_string(), fmt % args))
self.logger.warn(fmt, *args)
def log_message(self, fmt, *args):
self.logger.info("{} {} - - [{}] {}".format(self.__class__.__name__,
self.address_string(), self.log_date_time_string(), fmt % args))
class PooledMixIn(socketserver.ThreadingMixIn):
logger = logging.getLogger("warcprox.mitmproxy.PooledMixIn")
def __init__(self, max_threads=None):
'''
If max_threads is not supplied, calculates a reasonable value based
on system resource limits.
'''
if not max_threads:
# man getrlimit: "RLIMIT_NPROC The maximum number of processes (or,
# more precisely on Linux, threads) that can be created for the
# real user ID of the calling process."
rlimit_nproc = resource.getrlimit(resource.RLIMIT_NPROC)[0]
rlimit_nofile = resource.getrlimit(resource.RLIMIT_NOFILE)[0]
max_threads = min(rlimit_nofile // 10, rlimit_nproc // 2)
self.logger.info(
"max_threads=%s (rlimit_nproc=%s, rlimit_nofile=%s)",
max_threads, rlimit_nproc, rlimit_nofile)
self.pool = concurrent.futures.ThreadPoolExecutor(max_threads)
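# For example, with rlimit_nofile=4096 and rlimit_nproc=15000, the default
# works out to max_threads = min(4096 // 10, 15000 // 2) = 409.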
def process_request(self, request, client_address):
self.pool.submit(self.process_request_thread, request, client_address)
class MitmProxy(http_server.HTTPServer):
def finish_request(self, request, client_address):
'''
We override socketserver.BaseServer.finish_request to get at
MitmProxyHandler's self.request. A normal socket server's self.request
is set to `request` and never changes, but in our case, it may be
replaced with an SSL socket. The caller of this method (e.g.
self.process_request or PooledMitmProxy.process_request_thread) needs
to get a hold of that socket so it can close it.
'''
req_handler = self.RequestHandlerClass(request, client_address, self)
return req_handler.request
def process_request(self, request, client_address):
'''
This is an almost verbatim copy/paste of
socketserver.BaseServer.process_request.
The only difference is that it expects self.finish_request to return
the request (i.e. the socket). This new value of request is passed on
to self.shutdown_request. See the comment on self.finish_request for
the rationale.
'''
request = self.finish_request(request, client_address)
self.shutdown_request(request)
class PooledMitmProxy(PooledMixIn, MitmProxy):
def process_request_thread(self, request, client_address):
'''
This is an almost verbatim copy/paste of
socketserver.ThreadingMixIn.process_request_thread.
The only difference is that it expects self.finish_request to return
the request (i.e. the socket). This new value of request is passed on
to self.shutdown_request. See the comment on MitmProxy.finish_request
for the rationale.
'''
try:
request = self.finish_request(request, client_address)
self.shutdown_request(request)
except:
self.handle_error(request, client_address)
self.shutdown_request(request)

View File

@ -1,4 +1,24 @@
# vim:set sw=4 et:
'''
warcprox/playback.py - rudimentary support for playback of urls archived by
warcprox (not much used or maintained)
Copyright (C) 2013-2016 Internet Archive
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
USA.
'''
from __future__ import absolute_import
@ -12,14 +32,6 @@ try:
except ImportError:
import SocketServer as socketserver
import logging
import os
from hanzo import warctools
@ -27,13 +39,14 @@ import json
import traceback
import re
from warcprox.mitmproxy import MitmProxyHandler
import warcprox
class PlaybackProxyHandler(MitmProxyHandler):
logger = logging.getLogger("warcprox.playback.PlaybackProxyHandler")
# @Override
def _connect_to_remote_server(self):
# don't connect to any remote server!
pass
@ -180,13 +193,14 @@ class PlaybackProxyHandler(MitmProxyHandler):
class PlaybackProxy(socketserver.ThreadingMixIn, http_server.HTTPServer):
logger = logging.getLogger("warcprox.playback.PlaybackProxy")
def __init__(self, ca=None, playback_index_db=None, options=warcprox.Options()):
server_address = (options.address or 'localhost', options.playback_port if options.playback_port is not None else 8001)
http_server.HTTPServer.__init__(self, server_address, PlaybackProxyHandler, bind_and_activate=True)
self.ca = ca
self.playback_index_db = playback_index_db
self.warcs_dir = warcs_dir
self.warcs_dir = options.directory
self.options = options
def server_activate(self):
http_server.HTTPServer.server_activate(self)
@ -201,6 +215,14 @@ class PlaybackIndexDb(object):
logger = logging.getLogger("warcprox.playback.PlaybackIndexDb")
def __init__(self, dbm_file='./warcprox-playback-index.db'):
try:
import dbm.gnu as dbm_gnu
except ImportError:
try:
import gdbm as dbm_gnu
except ImportError:
import anydbm as dbm_gnu
if os.path.exists(dbm_file):
self.logger.info('opening existing playback index database {}'.format(dbm_file))
else:
@ -217,6 +239,9 @@ class PlaybackIndexDb(object):
except:
pass
def notify(self, recorded_url, records):
self.save(records[0].warc_filename, records, records[0].offset)
def save(self, warcfile, recordset, offset):
response_record = recordset[0]
# XXX canonicalize url?

303
warcprox/stats.py Normal file
View File

@ -0,0 +1,303 @@
'''
warcprox/stats.py - keeps statistics on what has been archived
Copyright (C) 2013-2016 Internet Archive
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
USA.
'''
from __future__ import absolute_import
import logging
import os
import json
from hanzo import warctools
import random
import warcprox
import threading
import rethinkdb as r
import datetime
import surt
def _empty_bucket(bucket):
return {
"bucket": bucket,
"total": {
"urls": 0,
"wire_bytes": 0,
},
"new": {
"urls": 0,
"wire_bytes": 0,
},
"revisit": {
"urls": 0,
"wire_bytes": 0,
},
}
class StatsDb:
logger = logging.getLogger("warcprox.stats.StatsDb")
def __init__(self, dbm_file='./warcprox-stats.db', options=warcprox.Options()):
try:
import dbm.gnu as dbm_gnu
except ImportError:
try:
import gdbm as dbm_gnu
except ImportError:
import anydbm as dbm_gnu
if os.path.exists(dbm_file):
self.logger.info('opening existing stats database {}'.format(dbm_file))
else:
self.logger.info('creating new stats database {}'.format(dbm_file))
self.db = dbm_gnu.open(dbm_file, 'c')
self.options = options
def start(self):
# method only exists to match RethinkStatsDb
pass
def stop(self):
self.close()
def close(self):
self.db.close()
def sync(self):
try:
self.db.sync()
except:
pass
def value(self, bucket0="__all__", bucket1=None, bucket2=None):
# Gdbm wants str/bytes keys in python2, str/unicode keys in python3.
# This ugliness deals with keys that arrive as unicode in py2.
b0 = bucket0.encode("utf-8") if bucket0 and not isinstance(bucket0, str) else bucket0
b1 = bucket1.encode("utf-8") if bucket1 and not isinstance(bucket1, str) else bucket1
b2 = bucket2.encode("utf-8") if bucket2 and not isinstance(bucket2, str) else bucket2
if b0 in self.db:
bucket0_stats = json.loads(self.db[b0].decode("utf-8"))
if b1:
if b2:
return bucket0_stats[b1][b2]
else:
return bucket0_stats[b1]
else:
return bucket0_stats
else:
return None
def notify(self, recorded_url, records):
self.tally(recorded_url, records)
def buckets(self, recorded_url):
'''
Unravels bucket definitions in Warcprox-Meta header. Each bucket
definition can either be a string, which signifies the name of the
bucket, or a dict. If a dict it is expected to have at least an item
with key 'bucket' whose value is the name of the bucket. The other
currently recognized item is 'tally-domains', which if supplied should
be a list of domains. This instructs warcprox to additionally tally
substats of the given bucket by domain. Host stats are stored in the
stats table under the key '{parent-bucket}:{domain(normalized)}'.
Example Warcprox-Meta header (a real one will likely have other
sections besides 'stats'):
Warcprox-Meta: {'stats':{'buckets':['bucket1',{'bucket':'bucket2','tally-domains':['foo.bar.com','192.168.10.20']}]}}
'''
buckets = ["__all__"]
if (recorded_url.warcprox_meta
and "stats" in recorded_url.warcprox_meta
and "buckets" in recorded_url.warcprox_meta["stats"]):
for bucket in recorded_url.warcprox_meta["stats"]["buckets"]:
if isinstance(bucket, dict):
if not 'bucket' in bucket:
self.logger.warn(
'ignoring invalid stats bucket in '
'warcprox-meta header %s', bucket)
continue
buckets.append(bucket['bucket'])
if bucket.get('tally-domains'):
url = warcprox.Url(recorded_url.url.decode('utf-8'))
for domain in bucket['tally-domains']:
if url.matches_ip_or_domain(domain):
buckets.append('%s:%s' % (
bucket['bucket'],
warcprox.normalize_host(domain)))
else:
buckets.append(bucket)
else:
buckets.append("__unspecified__")
return buckets
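# For the example Warcprox-Meta header in the docstring above, a url on host
# foo.bar.com would be tallied in buckets
# ['__all__', 'bucket1', 'bucket2', 'bucket2:foo.bar.com'].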
def tally(self, recorded_url, records):
for bucket in self.buckets(recorded_url):
# Gdbm wants str/bytes keys in python2, str/unicode keys in python3.
# This ugliness deals with keys that arrive as unicode in py2.
b = bucket.encode("utf-8") if bucket and not isinstance(bucket, str) else bucket
if b in self.db:
bucket_stats = json.loads(self.db[b].decode("utf-8"))
else:
bucket_stats = _empty_bucket(b)
bucket_stats["total"]["urls"] += 1
bucket_stats["total"]["wire_bytes"] += recorded_url.size
if records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.REVISIT:
bucket_stats["revisit"]["urls"] += 1
bucket_stats["revisit"]["wire_bytes"] += recorded_url.size
else:
bucket_stats["new"]["urls"] += 1
bucket_stats["new"]["wire_bytes"] += recorded_url.size
self.db[b] = json.dumps(bucket_stats, separators=(',',':')).encode("utf-8")
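# Reading tallies back, a quick sketch using the value() method above:
#
#   stats_db.value('__all__', 'new', 'urls')          # count of novel urls
#   stats_db.value('bucket1', 'total', 'wire_bytes')  # total bytes for bucket1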
class RethinkStatsDb(StatsDb):
"""Updates database in batch every 2.0 seconds"""
logger = logging.getLogger("warcprox.stats.RethinkStatsDb")
def __init__(self, rethinker, table="stats", shards=None, replicas=None, options=warcprox.Options()):
self.r = rethinker
self.table = table
self.shards = shards or 1 # 1 shard by default because it's probably a small table
self.replicas = replicas or min(3, len(self.r.servers))
self._ensure_db_table()
self.options = options
self._stop = threading.Event()
self._batch_lock = threading.RLock()
with self._batch_lock:
self._batch = {}
self._timer = None
def start(self):
"""Starts batch update repeating timer."""
self._update_batch() # starts repeating timer
def _bucket_batch_update_reql(self, bucket):
return self.r.table(self.table).get(bucket).replace(
lambda old: r.branch(
old.eq(None), self._batch[bucket], old.merge({
"total": {
"urls": old["total"]["urls"].add(
self._batch[bucket]["total"]["urls"]),
"wire_bytes": old["total"]["wire_bytes"].add(
self._batch[bucket]["total"]["wire_bytes"]),
},
"new": {
"urls": old["new"]["urls"].add(
self._batch[bucket]["new"]["urls"]),
"wire_bytes": old["new"]["wire_bytes"].add(
self._batch[bucket]["new"]["wire_bytes"]),
},
"revisit": {
"urls": old["revisit"]["urls"].add(
self._batch[bucket]["revisit"]["urls"]),
"wire_bytes": old["revisit"]["wire_bytes"].add(
self._batch[bucket]["revisit"]["wire_bytes"]),
},
})))
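# In words: if no row exists yet for this bucket (old.eq(None)), insert the
# batch's tallies wholesale; otherwise have the server add the batched
# counters to the stored totals in one atomic replace.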
def _update_batch(self):
with self._batch_lock:
if len(self._batch) > 0:
# XXX can all the buckets be done in one query?
for bucket in self._batch:
result = self._bucket_batch_update_reql(bucket).run()
if (not result["inserted"] and not result["replaced"]
or sorted(result.values()) != [0,0,0,0,0,1]):
raise Exception(
"unexpected result %s updating stats %s" % (
result, self._batch[bucket]))
self._batch = {}
if not self._stop.is_set():
self._timer = threading.Timer(2.0, self._update_batch)
self._timer.name = "RethinkStats-batch-update-timer-%s" % (
datetime.datetime.utcnow().isoformat())
self._timer.start()
else:
self.logger.info("finished")
def _ensure_db_table(self):
dbs = self.r.db_list().run()
if not self.r.dbname in dbs:
self.logger.info(
"creating rethinkdb database %s", repr(self.r.dbname))
self.r.db_create(self.r.dbname).run()
tables = self.r.table_list().run()
if not self.table in tables:
self.logger.info(
"creating rethinkdb table %s in database %s shards=%s "
"replicas=%s", repr(self.table), repr(self.r.dbname),
self.shards, self.replicas)
self.r.table_create(
self.table, primary_key="bucket", shards=self.shards,
replicas=self.replicas).run()
def close(self):
self.stop()
def stop(self):
self.logger.info("stopping rethinkdb stats table batch updates")
self._stop.set()
if self._timer:
self._timer.join()
def sync(self):
pass
def value(self, bucket0="__all__", bucket1=None, bucket2=None):
bucket0_stats = self.r.table(self.table).get(bucket0).run()
self.logger.debug(
'stats db lookup of bucket=%s returned %s',
bucket0, bucket0_stats)
if bucket0_stats:
if bucket1:
if bucket2:
return bucket0_stats[bucket1][bucket2]
else:
return bucket0_stats[bucket1]
return bucket0_stats
def tally(self, recorded_url, records):
buckets = self.buckets(recorded_url)
is_revisit = records[0].get_header(
warctools.WarcRecord.TYPE) == warctools.WarcRecord.REVISIT
with self._batch_lock:
for bucket in buckets:
bucket_stats = self._batch.setdefault(
bucket, _empty_bucket(bucket))
bucket_stats["total"]["urls"] += 1
bucket_stats["total"]["wire_bytes"] += recorded_url.size
if is_revisit:
bucket_stats["revisit"]["urls"] += 1
bucket_stats["revisit"]["wire_bytes"] += recorded_url.size
else:
bucket_stats["new"]["urls"] += 1
bucket_stats["new"]["wire_bytes"] += recorded_url.size
def notify(self, recorded_url, records):
self.tally(recorded_url, records)

View File

@ -1,414 +0,0 @@
#!/usr/bin/env python
# vim: set sw=4 et:
import unittest
import threading
import time
import logging
import sys
import ssl
import re
import tempfile
import OpenSSL
import os
import shutil
import requests
try:
import http.server as http_server
except ImportError:
import BaseHTTPServer as http_server
try:
import queue
except ImportError:
import Queue as queue
import certauth.certauth
import warcprox.controller
import warcprox.warcprox
import warcprox.playback
import warcprox.warcwriter
import warcprox.dedup
class TestHttpRequestHandler(http_server.BaseHTTPRequestHandler):
logger = logging.getLogger('TestHttpRequestHandler')
def do_GET(self):
self.logger.info('GET {}'.format(self.path))
m = re.match(r'^/([^/]+)/([^/]+)$', self.path)
if m is not None:
special_header = 'warcprox-test-header: {}!'.format(m.group(1)).encode('utf-8')
payload = 'I am the warcprox test payload! {}!\n'.format(10*m.group(2)).encode('utf-8')
headers = (b'HTTP/1.1 200 OK\r\n'
+ b'Content-Type: text/plain\r\n'
+ special_header + b'\r\n'
+ b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n'
+ b'\r\n')
else:
payload = b'404 Not Found\n'
headers = (b'HTTP/1.1 404 Not Found\r\n'
+ b'Content-Type: text/plain\r\n'
+ b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n'
+ b'\r\n')
self.connection.sendall(headers)
self.connection.sendall(payload)
class WarcproxTest(unittest.TestCase):
logger = logging.getLogger('WarcproxTest')
def __init__(self, methodName='runTest'):
self.__cert = None
unittest.TestCase.__init__(self, methodName)
@property
def _cert(self):
if self.__cert is None:
f = tempfile.NamedTemporaryFile(prefix='warcprox-test-https-', suffix='.pem', delete=False)
try:
key = OpenSSL.crypto.PKey()
key.generate_key(OpenSSL.crypto.TYPE_RSA, 2048)
req = OpenSSL.crypto.X509Req()
req.get_subject().CN = 'localhost'
req.set_pubkey(key)
req.sign(key, 'sha1')
cert = OpenSSL.crypto.X509()
cert.set_subject(req.get_subject())
cert.set_serial_number(0)
cert.gmtime_adj_notBefore(0)
cert.gmtime_adj_notAfter(2*60*60) # valid for 2hrs
cert.set_issuer(cert.get_subject())
cert.set_pubkey(req.get_pubkey())
cert.sign(key, 'sha1')
f.write(OpenSSL.crypto.dump_privatekey(OpenSSL.SSL.FILETYPE_PEM, key))
f.write(OpenSSL.crypto.dump_certificate(OpenSSL.SSL.FILETYPE_PEM, cert))
self.logger.info('generated self-signed certificate {}'.format(f.name))
self.__cert = f.name
finally:
f.close()
return self.__cert
def _start_http_servers(self):
self.http_daemon = http_server.HTTPServer(('localhost', 0),
RequestHandlerClass=TestHttpRequestHandler)
self.logger.info('starting http://{}:{}'.format(self.http_daemon.server_address[0], self.http_daemon.server_address[1]))
self.http_daemon_thread = threading.Thread(name='HttpdThread',
target=self.http_daemon.serve_forever)
self.http_daemon_thread.start()
# http://www.piware.de/2011/01/creating-an-https-server-in-python/
self.https_daemon = http_server.HTTPServer(('localhost', 0),
RequestHandlerClass=TestHttpRequestHandler)
# self.https_daemon.socket = ssl.wrap_socket(httpd.socket, certfile='path/to/localhost.pem', server_side=True)
self.https_daemon.socket = ssl.wrap_socket(self.https_daemon.socket, certfile=self._cert, server_side=True)
self.logger.info('starting https://{}:{}'.format(self.https_daemon.server_address[0], self.https_daemon.server_address[1]))
self.https_daemon_thread = threading.Thread(name='HttpdThread',
target=self.https_daemon.serve_forever)
self.https_daemon_thread.start()
def _start_warcprox(self):
f = tempfile.NamedTemporaryFile(prefix='warcprox-test-ca-', suffix='.pem', delete=True)
f.close() # delete it, or CertificateAuthority will try to read it
self._ca_file = f.name
self._ca_dir = tempfile.mkdtemp(prefix='warcprox-test-', suffix='-ca')
ca = certauth.certauth.CertificateAuthority(self._ca_file, self._ca_dir, 'warcprox-test')
recorded_url_q = queue.Queue()
proxy = warcprox.warcprox.WarcProxy(server_address=('localhost', 0), ca=ca,
recorded_url_q=recorded_url_q)
self._warcs_dir = tempfile.mkdtemp(prefix='warcprox-test-warcs-')
f = tempfile.NamedTemporaryFile(prefix='warcprox-test-playback-index-', suffix='.db', delete=False)
f.close()
self._playback_index_db_file = f.name
playback_index_db = warcprox.playback.PlaybackIndexDb(self._playback_index_db_file)
playback_proxy = warcprox.playback.PlaybackProxy(server_address=('localhost', 0), ca=ca,
playback_index_db=playback_index_db, warcs_dir=self._warcs_dir)
f = tempfile.NamedTemporaryFile(prefix='warcprox-test-dedup-', suffix='.db', delete=False)
f.close()
self._dedup_db_file = f.name
dedup_db = warcprox.dedup.DedupDb(self._dedup_db_file)
warc_writer = warcprox.warcwriter.WarcWriter(directory=self._warcs_dir,
port=proxy.server_port, dedup_db=dedup_db,
playback_index_db=playback_index_db)
warc_writer_thread = warcprox.warcwriter.WarcWriterThread(recorded_url_q=recorded_url_q,
warc_writer=warc_writer)
self.warcprox = warcprox.controller.WarcproxController(proxy, warc_writer_thread, playback_proxy)
self.logger.info('starting warcprox')
self.warcprox_thread = threading.Thread(name='WarcproxThread',
target=self.warcprox.run_until_shutdown)
self.warcprox_thread.start()
def setUp(self):
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG,
format='%(asctime)s %(levelname)s %(process)d %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')
self._start_http_servers()
self._start_warcprox()
archiving_proxy = 'http://localhost:{}'.format(self.warcprox.proxy.server_port)
self.archiving_proxies = {'http':archiving_proxy, 'https':archiving_proxy}
playback_proxy = 'http://localhost:{}'.format(self.warcprox.playback_proxy.server_port)
self.playback_proxies = {'http':playback_proxy, 'https':playback_proxy}
def tearDown(self):
self.logger.info('stopping warcprox')
self.warcprox.stop.set()
self.logger.info('stopping http and https daemons')
self.http_daemon.shutdown()
self.https_daemon.shutdown()
self.http_daemon.server_close()
self.https_daemon.server_close()
# Have to wait for threads to finish or the threads will try to use
# variables that no longer exist, resulting in errors like this:
# File "/usr/lib/python2.7/SocketServer.py", line 235, in serve_forever
# r, w, e = _eintr_retry(select.select, [self], [], [],
# AttributeError: 'NoneType' object has no attribute 'select'
self.http_daemon_thread.join()
self.https_daemon_thread.join()
self.warcprox_thread.join()
for f in (self.__cert, self._ca_file, self._ca_dir, self._warcs_dir, self._playback_index_db_file, self._dedup_db_file):
if os.path.isdir(f):
self.logger.info('deleting directory {}'.format(f))
shutil.rmtree(f)
else:
self.logger.info('deleting file {}'.format(f))
os.unlink(f)
def _test_httpds_no_proxy(self):
url = 'http://localhost:{}/'.format(self.http_daemon.server_port)
response = requests.get(url)
self.assertEqual(response.status_code, 404)
self.assertEqual(response.content, b'404 Not Found\n')
url = 'https://localhost:{}/'.format(self.https_daemon.server_port)
response = requests.get(url, verify=False)
self.assertEqual(response.status_code, 404)
self.assertEqual(response.content, b'404 Not Found\n')
url = 'http://localhost:{}/a/b'.format(self.http_daemon.server_port)
response = requests.get(url)
self.assertEqual(response.status_code, 200)
self.assertEqual(response.headers['warcprox-test-header'], 'a!')
self.assertEqual(response.content, b'I am the warcprox test payload! bbbbbbbbbb!\n')
url = 'https://localhost:{}/c/d'.format(self.https_daemon.server_port)
response = requests.get(url, verify=False)
self.assertEqual(response.status_code, 200)
self.assertEqual(response.headers['warcprox-test-header'], 'c!')
self.assertEqual(response.content, b'I am the warcprox test payload! dddddddddd!\n')
def poll_playback_until(self, url, status, timeout_sec):
start = time.time()
# check playback (warc writing is asynchronous, give it up to 10 sec)
while time.time() - start < timeout_sec:
response = requests.get(url, proxies=self.playback_proxies, verify=False)
if response.status_code == status:
break
time.sleep(0.5)
return response
def _test_archive_and_playback_http_url(self):
url = 'http://localhost:{}/a/b'.format(self.http_daemon.server_port)
# ensure playback fails before archiving
response = requests.get(url, proxies=self.playback_proxies)
self.assertEqual(response.status_code, 404)
self.assertEqual(response.content, b'404 Not in Archive\n')
# archive
response = requests.get(url, proxies=self.archiving_proxies)
self.assertEqual(response.status_code, 200)
self.assertEqual(response.headers['warcprox-test-header'], 'a!')
self.assertEqual(response.content, b'I am the warcprox test payload! bbbbbbbbbb!\n')
response = self.poll_playback_until(url, status=200, timeout_sec=10)
self.assertEqual(response.status_code, 200)
self.assertEqual(response.headers['warcprox-test-header'], 'a!')
self.assertEqual(response.content, b'I am the warcprox test payload! bbbbbbbbbb!\n')
def _test_archive_and_playback_https_url(self):
url = 'https://localhost:{}/c/d'.format(self.https_daemon.server_port)
# ensure playback fails before archiving
response = requests.get(url, proxies=self.playback_proxies, verify=False)
self.assertEqual(response.status_code, 404)
self.assertEqual(response.content, b'404 Not in Archive\n')
# fetch & archive response
response = requests.get(url, proxies=self.archiving_proxies, verify=False)
self.assertEqual(response.status_code, 200)
self.assertEqual(response.headers['warcprox-test-header'], 'c!')
self.assertEqual(response.content, b'I am the warcprox test payload! dddddddddd!\n')
# test playback
response = self.poll_playback_until(url, status=200, timeout_sec=10)
self.assertEqual(response.status_code, 200)
self.assertEqual(response.headers['warcprox-test-header'], 'c!')
self.assertEqual(response.content, b'I am the warcprox test payload! dddddddddd!\n')
# test dedup of same http url with same payload
def _test_dedup_http(self):
url = 'http://localhost:{}/e/f'.format(self.http_daemon.server_port)
# ensure playback fails before archiving
response = requests.get(url, proxies=self.playback_proxies, verify=False)
self.assertEqual(response.status_code, 404)
self.assertEqual(response.content, b'404 Not in Archive\n')
# check not in dedup db
dedup_lookup = self.warcprox.warc_writer_thread.warc_writer.dedup_db.lookup(b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc')
self.assertIsNone(dedup_lookup)
# archive
response = requests.get(url, proxies=self.archiving_proxies, verify=False)
self.assertEqual(response.status_code, 200)
self.assertEqual(response.headers['warcprox-test-header'], 'e!')
self.assertEqual(response.content, b'I am the warcprox test payload! ffffffffff!\n')
# test playback
response = self.poll_playback_until(url, status=200, timeout_sec=10)
self.assertEqual(response.status_code, 200)
self.assertEqual(response.headers['warcprox-test-header'], 'e!')
self.assertEqual(response.content, b'I am the warcprox test payload! ffffffffff!\n')
# check in dedup db
# {u'i': u'<urn:uuid:e691dc0f-4bb9-4ad8-9afb-2af836aa05e4>', u'u': u'https://localhost:62841/c/d', u'd': u'2013-11-22T00:14:37Z'}
dedup_lookup = self.warcprox.warc_writer_thread.warc_writer.dedup_db.lookup(b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc')
self.assertEqual(dedup_lookup['u'], url.encode('ascii'))
self.assertRegexpMatches(dedup_lookup['i'], br'^<urn:uuid:[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}>$')
self.assertRegexpMatches(dedup_lookup['d'], br'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$')
record_id = dedup_lookup['i']
dedup_date = dedup_lookup['d']
# need revisit to have a later timestamp than original, else playing
# back the latest record might not hit the revisit
time.sleep(1.5)
# fetch & archive revisit
response = requests.get(url, proxies=self.archiving_proxies, verify=False)
self.assertEqual(response.status_code, 200)
self.assertEqual(response.headers['warcprox-test-header'], 'e!')
self.assertEqual(response.content, b'I am the warcprox test payload! ffffffffff!\n')
# XXX need to give warc writer thread a chance, and we don't have any change to poll for :-\
time.sleep(2.0)
# check in dedup db (no change from prev)
dedup_lookup = self.warcprox.warc_writer_thread.warc_writer.dedup_db.lookup(b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc')
self.assertEqual(dedup_lookup['u'], url.encode('ascii'))
self.assertEqual(dedup_lookup['i'], record_id)
self.assertEqual(dedup_lookup['d'], dedup_date)
# test playback
self.logger.debug('testing playback of revisit of {}'.format(url))
response = self.poll_playback_until(url, status=200, timeout_sec=10)
self.assertEqual(response.status_code, 200)
self.assertEqual(response.headers['warcprox-test-header'], 'e!')
self.assertEqual(response.content, b'I am the warcprox test payload! ffffffffff!\n')
# XXX how to check dedup was used?
# test dedup of same https url with same payload
def _test_dedup_https(self):
url = 'https://localhost:{}/g/h'.format(self.https_daemon.server_port)
# ensure playback fails before archiving
response = requests.get(url, proxies=self.playback_proxies, verify=False)
self.assertEqual(response.status_code, 404)
self.assertEqual(response.content, b'404 Not in Archive\n')
# check not in dedup db
dedup_lookup = self.warcprox.warc_writer_thread.warc_writer.dedup_db.lookup(b'sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89')
self.assertIsNone(dedup_lookup)
# archive
response = requests.get(url, proxies=self.archiving_proxies, verify=False)
self.assertEqual(response.status_code, 200)
self.assertEqual(response.headers['warcprox-test-header'], 'g!')
self.assertEqual(response.content, b'I am the warcprox test payload! hhhhhhhhhh!\n')
# test playback
response = self.poll_playback_until(url, status=200, timeout_sec=10)
self.assertEqual(response.status_code, 200)
self.assertEqual(response.headers['warcprox-test-header'], 'g!')
self.assertEqual(response.content, b'I am the warcprox test payload! hhhhhhhhhh!\n')
# check in dedup db
# {u'i': u'<urn:uuid:e691dc0f-4bb9-4ad8-9afb-2af836aa05e4>', u'u': u'https://localhost:62841/c/d', u'd': u'2013-11-22T00:14:37Z'}
dedup_lookup = self.warcprox.warc_writer_thread.warc_writer.dedup_db.lookup(b'sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89')
self.assertEqual(dedup_lookup['u'], url.encode('ascii'))
self.assertRegexpMatches(dedup_lookup['i'], br'^<urn:uuid:[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}>$')
self.assertRegexpMatches(dedup_lookup['d'], br'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$')
record_id = dedup_lookup['i']
dedup_date = dedup_lookup['d']
# need revisit to have a later timestamp than original, else playing
# back the latest record might not hit the revisit
time.sleep(1.5)
# fetch & archive revisit
response = requests.get(url, proxies=self.archiving_proxies, verify=False)
self.assertEqual(response.status_code, 200)
self.assertEqual(response.headers['warcprox-test-header'], 'g!')
self.assertEqual(response.content, b'I am the warcprox test payload! hhhhhhhhhh!\n')
# XXX need to give warc writer thread a chance, and we don't have any change to poll for :-\
time.sleep(2.0)
# check in dedup db (no change from prev)
dedup_lookup = self.warcprox.warc_writer_thread.warc_writer.dedup_db.lookup(b'sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89')
self.assertEqual(dedup_lookup['u'], url.encode('ascii'))
self.assertEqual(dedup_lookup['i'], record_id)
self.assertEqual(dedup_lookup['d'], dedup_date)
# test playback
self.logger.debug('testing playback of revisit of {}'.format(url))
response = self.poll_playback_until(url, status=200, timeout_sec=10)
self.assertEqual(response.status_code, 200)
self.assertEqual(response.headers['warcprox-test-header'], 'g!')
self.assertEqual(response.content, b'I am the warcprox test payload! hhhhhhhhhh!\n')
# XXX how to check dedup was used?
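# one possibility (a sketch, not wired in; the warc filename is a
# hypothetical placeholder): scan the warc with warctools and assert that
# the second capture produced a revisit record, e.g.
#   from hanzo import warctools
#   fh = warctools.WarcRecord.open_archive('warcs/WARCPROX-XXX.warc.gz.open')
#   types = [record.type for (offset, record, errors) in fh.read_records(limit=None)]
#   assert warctools.WarcRecord.REVISIT in types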
# run everything from here, otherwise it wants to setUp() and tearDown()
# around each test
def runTest(self):
self._test_httpds_no_proxy()
self._test_archive_and_playback_http_url()
self._test_archive_and_playback_https_url()
self._test_dedup_http()
self._test_dedup_https()
# self._test_dedup_mixed_http()
# self._test_dedup_mixed_https()
if __name__ == '__main__':
unittest.main()
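
The hardcoded dedup keys in the tests above are just warcprox's digest_str
form of the payload's sha1. A sketch for regenerating one, assuming the
hex (non-base32) form the tests run with:

    import hashlib
    import warcprox

    payload = b'I am the warcprox test payload! ffffffffff!\n'
    key = warcprox.digest_str(hashlib.sha1(payload), False)
    # expected to reproduce the b'sha1:65e1216...' literal used above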

171
warcprox/warc.py Normal file
View File

@ -0,0 +1,171 @@
#
# warcprox/warc.py - assembles warc records
#
# Copyright (C) 2013-2016 Internet Archive
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
# USA.
#
from __future__ import absolute_import
import logging
import warcprox
import hashlib
import socket
import hanzo.httptools
from hanzo import warctools
import datetime
class WarcRecordBuilder:
logger = logging.getLogger("warcprox.warc.WarcRecordBuilder")
def __init__(self, digest_algorithm="sha1", base32=False):
self.digest_algorithm = digest_algorithm
self.base32 = base32
def _build_response_principal_record(self, recorded_url, warc_date):
"""Builds response or revisit record, whichever is appropriate."""
if hasattr(recorded_url, "dedup_info") and recorded_url.dedup_info:
# revisit record
recorded_url.response_recorder.tempfile.seek(0)
if recorded_url.response_recorder.payload_offset is not None:
response_header_block = recorded_url.response_recorder.tempfile.read(recorded_url.response_recorder.payload_offset)
else:
response_header_block = recorded_url.response_recorder.tempfile.read()
return self.build_warc_record(
url=recorded_url.url, warc_date=warc_date,
data=response_header_block,
warc_type=warctools.WarcRecord.REVISIT,
refers_to=recorded_url.dedup_info['id'],
refers_to_target_uri=recorded_url.dedup_info['url'],
refers_to_date=recorded_url.dedup_info['date'],
payload_digest=warcprox.digest_str(recorded_url.response_recorder.payload_digest, self.base32),
profile=warctools.WarcRecord.PROFILE_IDENTICAL_PAYLOAD_DIGEST,
content_type=hanzo.httptools.ResponseMessage.CONTENT_TYPE,
remote_ip=recorded_url.remote_ip)
else:
# response record
return self.build_warc_record(
url=recorded_url.url, warc_date=warc_date,
recorder=recorded_url.response_recorder,
warc_type=warctools.WarcRecord.RESPONSE,
content_type=hanzo.httptools.ResponseMessage.CONTENT_TYPE,
remote_ip=recorded_url.remote_ip)
def build_warc_records(self, recorded_url):
"""Returns a tuple of hanzo.warctools.warc.WarcRecord (principal_record, ...)"""
warc_date = warctools.warc.warc_datetime_str(recorded_url.timestamp)
if recorded_url.response_recorder:
principal_record = self._build_response_principal_record(recorded_url, warc_date)
request_record = self.build_warc_record(url=recorded_url.url,
warc_date=warc_date, data=recorded_url.request_data,
warc_type=warctools.WarcRecord.REQUEST,
content_type=hanzo.httptools.RequestMessage.CONTENT_TYPE,
concurrent_to=principal_record.id)
return principal_record, request_record
else:
principal_record = self.build_warc_record(url=recorded_url.url,
warc_date=warc_date, data=recorded_url.request_data,
warc_type=recorded_url.custom_type,
content_type=recorded_url.content_type.encode("latin1"))
return (principal_record,)
def build_warc_record(self, url, warc_date=None, recorder=None, data=None,
concurrent_to=None, warc_type=None, content_type=None, remote_ip=None,
profile=None, refers_to=None, refers_to_target_uri=None,
refers_to_date=None, payload_digest=None):
if warc_date is None:
warc_date = warctools.warc.warc_datetime_str(datetime.datetime.utcnow())
record_id = warctools.WarcRecord.random_warc_uuid()
headers = []
if warc_type is not None:
headers.append((warctools.WarcRecord.TYPE, warc_type))
headers.append((warctools.WarcRecord.ID, record_id))
headers.append((warctools.WarcRecord.DATE, warc_date))
headers.append((warctools.WarcRecord.URL, url))
if remote_ip is not None:
headers.append((warctools.WarcRecord.IP_ADDRESS, remote_ip))
if profile is not None:
headers.append((warctools.WarcRecord.PROFILE, profile))
if refers_to is not None:
headers.append((warctools.WarcRecord.REFERS_TO, refers_to))
if refers_to_target_uri is not None:
headers.append((warctools.WarcRecord.REFERS_TO_TARGET_URI, refers_to_target_uri))
if refers_to_date is not None:
headers.append((warctools.WarcRecord.REFERS_TO_DATE, refers_to_date))
if concurrent_to is not None:
headers.append((warctools.WarcRecord.CONCURRENT_TO, concurrent_to))
if content_type is not None:
headers.append((warctools.WarcRecord.CONTENT_TYPE, content_type))
if payload_digest is not None:
headers.append((warctools.WarcRecord.PAYLOAD_DIGEST, payload_digest))
if recorder is not None:
headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(recorder)).encode('latin1')))
headers.append((warctools.WarcRecord.BLOCK_DIGEST,
warcprox.digest_str(recorder.block_digest, self.base32)))
if recorder.payload_digest is not None:
headers.append((warctools.WarcRecord.PAYLOAD_DIGEST,
warcprox.digest_str(recorder.payload_digest, self.base32)))
recorder.tempfile.seek(0)
record = warctools.WarcRecord(headers=headers, content_file=recorder.tempfile)
else:
headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(data)).encode('latin1')))
digest = hashlib.new(self.digest_algorithm, data)
headers.append((warctools.WarcRecord.BLOCK_DIGEST,
warcprox.digest_str(digest, self.base32)))
if not payload_digest:
headers.append((warctools.WarcRecord.PAYLOAD_DIGEST,
warcprox.digest_str(digest, self.base32)))
content_tuple = content_type, data
record = warctools.WarcRecord(headers=headers, content=content_tuple)
return record
def build_warcinfo_record(self, filename):
warc_record_date = warctools.warc.warc_datetime_str(datetime.datetime.utcnow())
record_id = warctools.WarcRecord.random_warc_uuid()
headers = []
headers.append((warctools.WarcRecord.ID, record_id))
headers.append((warctools.WarcRecord.TYPE, warctools.WarcRecord.WARCINFO))
headers.append((warctools.WarcRecord.FILENAME, filename.encode('latin1')))
headers.append((warctools.WarcRecord.DATE, warc_record_date))
warcinfo_fields = []
warcinfo_fields.append(b'software: warcprox ' + warcprox.__version__.encode('latin1'))
hostname = socket.gethostname()
warcinfo_fields.append('hostname: {}'.format(hostname).encode('latin1'))
warcinfo_fields.append('ip: {}'.format(socket.gethostbyname(hostname)).encode('latin1'))
warcinfo_fields.append(b'format: WARC File Format 1.0')
# warcinfo_fields.append('robots: ignore')
# warcinfo_fields.append('description: {0}'.format(self.description))
# warcinfo_fields.append('isPartOf: {0}'.format(self.is_part_of))
data = b'\r\n'.join(warcinfo_fields) + b'\r\n'
record = warctools.WarcRecord(headers=headers, content=(b'application/warc-fields', data))
return record
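
A usage sketch for the builder above (url, payload, and output filename are
illustrative):

    from hanzo import warctools
    import warcprox.warc

    builder = warcprox.warc.WarcRecordBuilder(digest_algorithm='sha1', base32=False)
    record = builder.build_warc_record(
            url=b'http://example.com/', data=b'hello warc',
            warc_type=warctools.WarcRecord.METADATA,
            content_type=b'application/octet-stream')
    with open('out.warc', 'wb') as f:
        record.write_to(f, gzip=False)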

272
warcprox/warcprox.py
View File

@ -1,272 +0,0 @@
#!/usr/bin/env python
# vim:set sw=4 et:
#
"""
WARC writing MITM HTTP/S proxy
See README.rst or https://github.com/internetarchive/warcprox
"""
from __future__ import absolute_import
try:
import http.server as http_server
except ImportError:
import BaseHTTPServer as http_server
try:
import socketserver
except ImportError:
import SocketServer as socketserver
try:
import queue
except ImportError:
import Queue as queue
try:
import http.client as http_client
except ImportError:
import httplib as http_client
import logging
import re
import tempfile
import traceback
import hashlib
import json
import socket
from certauth.certauth import CertificateAuthority
import warcprox.mitmproxy
class ProxyingRecorder(object):
"""
Wraps a socket._fileobject, recording the bytes as they are read,
calculating digests, and sending them on to the proxy client.
"""
logger = logging.getLogger("warcprox.warcprox.ProxyingRecorder")
def __init__(self, fp, proxy_dest, digest_algorithm='sha1'):
self.fp = fp
# "The file has no name, and will cease to exist when it is closed."
self.tempfile = tempfile.SpooledTemporaryFile(max_size=512*1024)
self.digest_algorithm = digest_algorithm
self.block_digest = hashlib.new(digest_algorithm)
self.payload_offset = None
self.payload_digest = None
self.proxy_dest = proxy_dest
self._proxy_dest_conn_open = True
self._prev_hunk_last_two_bytes = b''
self.len = 0
def _update_payload_digest(self, hunk):
if self.payload_digest is None:
# convoluted handling of two newlines crossing hunks
# XXX write tests for this
if self._prev_hunk_last_two_bytes.endswith(b'\n'):
if hunk.startswith(b'\n'):
self.payload_digest = hashlib.new(self.digest_algorithm)
self.payload_digest.update(hunk[1:])
self.payload_offset = self.len + 1
elif hunk.startswith(b'\r\n'):
self.payload_digest = hashlib.new(self.digest_algorithm)
self.payload_digest.update(hunk[2:])
self.payload_offset = self.len + 2
elif self._prev_hunk_last_two_bytes == b'\n\r':
if hunk.startswith(b'\n'):
self.payload_digest = hashlib.new(self.digest_algorithm)
self.payload_digest.update(hunk[1:])
self.payload_offset = self.len + 1
else:
m = re.search(br'\n\r?\n', hunk)
if m is not None:
self.payload_digest = hashlib.new(self.digest_algorithm)
self.payload_digest.update(hunk[m.end():])
self.payload_offset = self.len + m.end()
# if we still haven't found start of payload hold on to these bytes
if self.payload_digest is None:
self._prev_hunk_last_two_bytes = hunk[-2:]
else:
self.payload_digest.update(hunk)
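# a quick sketch of the common case above, where the header/payload
# boundary arrives within a single hunk (ad hoc values):
#   import hashlib, re
#   hunk = b'HTTP/1.1 200 OK\r\nContent-Length: 5\r\n\r\nhello'
#   m = re.search(br'\n\r?\n', hunk)
#   payload_digest = hashlib.new('sha1', hunk[m.end():])  # digests b'hello'
#   payload_offset = m.end()  # 38, the length of the header block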
def _update(self, hunk):
self._update_payload_digest(hunk)
self.block_digest.update(hunk)
self.tempfile.write(hunk)
if self._proxy_dest_conn_open:
try:
self.proxy_dest.sendall(hunk)
except BaseException as e:
self._proxy_dest_conn_open = False
self.logger.warn('{} sending data to proxy client'.format(e))
self.logger.info('will continue downloading from remote server without sending to client')
self.len += len(hunk)
def read(self, size=-1):
hunk = self.fp.read(size)
self._update(hunk)
return hunk
def readinto(self, b):
n = self.fp.readinto(b)
self._update(b[:n])
return n
def readline(self, size=-1):
# XXX depends on implementation details of self.fp.readline(), in
# particular that it doesn't call self.fp.read()
hunk = self.fp.readline(size)
self._update(hunk)
return hunk
def close(self):
return self.fp.close()
def __len__(self):
return self.len
def payload_size(self):
if self.payload_offset is not None:
return self.len - self.payload_offset
else:
return 0
class ProxyingRecordingHTTPResponse(http_client.HTTPResponse):
def __init__(self, sock, debuglevel=0, method=None, proxy_dest=None, digest_algorithm='sha1'):
http_client.HTTPResponse.__init__(self, sock, debuglevel=debuglevel, method=method)
# Keep around extra reference to self.fp because HTTPResponse sets
# self.fp=None after it finishes reading, but we still need it
self.recorder = ProxyingRecorder(self.fp, proxy_dest, digest_algorithm)
self.fp = self.recorder
class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
logger = logging.getLogger("warcprox.warcprox.WarcProxyHandler")
def _proxy_request(self):
# Build request
req_str = '{} {} {}\r\n'.format(self.command, self.path, self.request_version)
warcprox_meta = self.headers.get('Warcprox-Meta')
# Swallow headers that don't make sense to forward on, i.e. most
# hop-by-hop headers, see http://tools.ietf.org/html/rfc2616#section-13.5
# self.headers is an email.message.Message, which is case-insensitive
# and doesn't throw KeyError in __delitem__
for h in ('Connection', 'Proxy-Connection', 'Keep-Alive',
'Proxy-Authenticate', 'Proxy-Authorization', 'Upgrade',
'Warcprox-Meta'):
del self.headers[h]
# Add headers to the request
# XXX in at least python3.3 str(self.headers) uses \n not \r\n :(
req_str += '\r\n'.join('{}: {}'.format(k,v) for (k,v) in self.headers.items())
req = req_str.encode('utf-8') + b'\r\n\r\n'
# Append message body if present to the request
if 'Content-Length' in self.headers:
req += self.rfile.read(int(self.headers['Content-Length']))
self.logger.debug('req={}'.format(repr(req)))
# Send it down the pipe!
self._proxy_sock.sendall(req)
# We want HTTPResponse's smarts about http and handling of
# non-compliant servers. But HTTPResponse.read() doesn't return the raw
# bytes read from the server, it unchunks them if they're chunked, and
# might do other stuff. We want to send the raw bytes back to the
# client. So we ignore the values returned by h.read() below. Instead
# the ProxyingRecordingHTTPResponse takes care of sending the raw bytes
# to the proxy client.
# Proxy and record the response
h = ProxyingRecordingHTTPResponse(self._proxy_sock,
proxy_dest=self.connection,
digest_algorithm=self.server.digest_algorithm)
h.begin()
buf = h.read(8192)
while buf != b'':
buf = h.read(8192)
self.log_request(h.status, h.recorder.len)
remote_ip = self._proxy_sock.getpeername()[0]
# Let's close off the remote end
h.close()
self._proxy_sock.close()
recorded_url = RecordedUrl(url=self.url, request_data=req,
response_recorder=h.recorder, remote_ip=remote_ip,
warcprox_meta=warcprox_meta)
self.server.recorded_url_q.put(recorded_url)
return recorded_url
class RecordedUrl(object):
def __init__(self, url, request_data, response_recorder, remote_ip, warcprox_meta=None):
# XXX should test what happens with non-ascii url (when does
# url-encoding happen?)
if type(url) is not bytes:
self.url = url.encode('ascii')
else:
self.url = url
if type(remote_ip) is not bytes:
self.remote_ip = remote_ip.encode('ascii')
else:
self.remote_ip = remote_ip
self.request_data = request_data
self.response_recorder = response_recorder
if warcprox_meta:
self.warcprox_meta = json.loads(warcprox_meta)
else:
self.warcprox_meta = {}
class WarcProxy(socketserver.ThreadingMixIn, http_server.HTTPServer):
logger = logging.getLogger("warcprox.warcprox.WarcProxy")
def __init__(self, server_address=('localhost', 8000),
req_handler_class=WarcProxyHandler, bind_and_activate=True,
ca=None, recorded_url_q=None, digest_algorithm='sha1'):
http_server.HTTPServer.__init__(self, server_address, req_handler_class, bind_and_activate)
self.digest_algorithm = digest_algorithm
if ca is not None:
self.ca = ca
else:
ca_name = 'Warcprox CA on {}'.format(socket.gethostname())[:64]
self.ca = CertificateAuthority(ca_file='warcprox-ca.pem',
certs_dir='./warcprox-ca',
ca_name=ca_name)
if recorded_url_q is not None:
self.recorded_url_q = recorded_url_q
else:
self.recorded_url_q = queue.Queue()
def server_activate(self):
http_server.HTTPServer.server_activate(self)
self.logger.info('WarcProxy listening on {0}:{1}'.format(self.server_address[0], self.server_address[1]))
def server_close(self):
self.logger.info('WarcProxy shutting down')
http_server.HTTPServer.server_close(self)

415
warcprox/warcproxy.py Normal file
View File

@ -0,0 +1,415 @@
'''
warcprox/warcproxy.py - recording proxy, extends mitmproxy to record traffic,
enqueue info on the recorded url queue
Copyright (C) 2013-2016 Internet Archive
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
USA.
'''
from __future__ import absolute_import
try:
import http.server as http_server
except ImportError:
import BaseHTTPServer as http_server
try:
import socketserver
except ImportError:
import SocketServer as socketserver
try:
import queue
except ImportError:
import Queue as queue
import logging
import re
import traceback
import json
import socket
from hanzo import warctools
from certauth.certauth import CertificateAuthority
import warcprox
import datetime
import ipaddress
import surt
class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
'''
XXX add more information.
Among other things, this class enforces limits specified in the
Warcprox-Meta request header. If a limit is deemed to have been reached, no
request will be made to the remote destination server. This implementation
detail has implications worth noting. For example, if a limit applies to
"new" (not deduplicated) bytes, and the limit has already been reached, no
request will be made, even if it would have resulted in duplicate content,
which would not count toward the limit. To reiterate, this is because the
limit enforcer does not know that the content would be deduplicated.
'''
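# for reference, an illustrative Warcprox-Meta header exercising the
# enforcement in this class (bucket names and values are hypothetical):
#   Warcprox-Meta: {"limits": {"job1:foo.com/total/urls": 1000},
#                   "soft-limits": {"job1/new/wire_bytes": 500000000},
#                   "blocks": [{"domain": "example.com"}]}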
# self.server is WarcProxy
logger = logging.getLogger("warcprox.warcprox.WarcProxyHandler")
# XXX nearly identical to brozzler.site.Site._scope_rule_applies() but
# there's no obvious common dependency where this code should go... TBD
def _scope_rule_applies(self, rule):
u = warcprox.Url(self.url)
if "domain" in rule and not u.matches_ip_or_domain(rule["domain"]):
return False
if "url_match" in rule:
if rule["url_match"] == "STRING_MATCH":
return u.url.find(rule["value"]) >= 0
elif rule["url_match"] == "REGEX_MATCH":
try:
return re.fullmatch(rule["value"], u.url)
except Exception as e:
self.logger.warn(
"caught exception matching against regex %s: %s",
rule["value"], e)
return False
elif rule["url_match"] == "SURT_MATCH":
return u.surt.startswith(rule["value"])
else:
self.logger.warn("invalid rule.url_match=%s", rule.url_match)
return False
else:
if "domain" in rule:
# we already know that it matches from earlier check
return True
else:
self.logger.warn("unable to make sense of scope rule %s", rule)
return False
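# illustrative rules this method understands (all values hypothetical):
#   {"domain": "example.com"}
#   {"url_match": "STRING_MATCH", "value": "/calendar"}
#   {"url_match": "REGEX_MATCH", "value": "^https?://example\\.com/.*$"}
#   {"url_match": "SURT_MATCH", "value": "http://(com,example,)/"}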
def _enforce_blocks(self, warcprox_meta):
"""
Sends a 403 response and raises warcprox.RequestBlockedByRule if the
url is blocked by a rule in warcprox_meta.
"""
if warcprox_meta and "blocks" in warcprox_meta:
for rule in warcprox_meta["blocks"]:
if self._scope_rule_applies(rule):
body = ("request rejected by warcprox: blocked by "
"rule found in Warcprox-Meta header: %s"
% rule).encode("utf-8")
self.send_response(403, "Forbidden")
self.send_header("Content-Type", "text/plain;charset=utf-8")
self.send_header("Connection", "close")
self.send_header("Content-Length", len(body))
response_meta = {"blocked-by-rule":rule}
self.send_header(
"Warcprox-Meta",
json.dumps(response_meta, separators=(",",":")))
self.end_headers()
if self.command != "HEAD":
self.wfile.write(body)
self.connection.close()
raise warcprox.RequestBlockedByRule(
"%s 403 %s %s -- blocked by rule in Warcprox-Meta "
"request header %s" % (
self.client_address[0], self.command,
self.url, rule))
def _enforce_limit(self, limit_key, limit_value, soft=False):
bucket0, bucket1, bucket2 = limit_key.rsplit("/", 2)
_limit_key = limit_key
# if limit_key looks like 'job1:foo.com/total/urls' then we only want
# to apply this rule if the requested url is within domain
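# (in that example bucket0='job1:foo.com', bucket1='total', bucket2='urls')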
bucket0_fields = bucket0.split(':')
if len(bucket0_fields) == 2:
if not warcprox.host_matches_ip_or_domain(
self.hostname, bucket0_fields[1]):
return # else host matches, go ahead and enforce the limit
bucket0 = '%s:%s' % (
bucket0_fields[0],
warcprox.normalize_host(bucket0_fields[1]))
_limit_key = '%s/%s/%s' % (bucket0, bucket1, bucket2)
value = self.server.stats_db.value(bucket0, bucket1, bucket2)
if value and value >= limit_value:
body = ("request rejected by warcprox: reached %s %s=%s\n" % (
"soft limit" if soft else "limit", _limit_key,
limit_value)).encode("utf-8")
if soft:
self.send_response(430, "Reached soft limit")
else:
self.send_response(420, "Reached limit")
self.send_header("Content-Type", "text/plain;charset=utf-8")
self.send_header("Connection", "close")
self.send_header("Content-Length", len(body))
response_meta = {
"stats": {bucket0:self.server.stats_db.value(bucket0)}
}
if soft:
response_meta["reached-soft-limit"] = {_limit_key:limit_value}
else:
response_meta["reached-limit"] = {_limit_key:limit_value}
self.send_header(
"Warcprox-Meta",
json.dumps(response_meta, separators=(",",":")))
self.end_headers()
if self.command != "HEAD":
self.wfile.write(body)
self.connection.close()
raise warcprox.RequestBlockedByRule(
"%s %s %s %s -- reached %s %s=%s" % (
self.client_address[0], 430 if soft else 420,
self.command, self.url,
"soft limit" if soft else "limit",
_limit_key, limit_value))
def _enforce_limits(self, warcprox_meta):
"""
Sends a 420 (hard limit) or 430 (soft limit) response and raises
warcprox.RequestBlockedByRule if a limit specified in warcprox_meta is
reached.
"""
if warcprox_meta and "limits" in warcprox_meta:
for item in warcprox_meta["limits"].items():
limit_key, limit_value = item
self._enforce_limit(limit_key, limit_value, soft=False)
if warcprox_meta and "soft-limits" in warcprox_meta:
for item in warcprox_meta["soft-limits"].items():
limit_key, limit_value = item
self._enforce_limit(limit_key, limit_value, soft=True)
def _connect_to_remote_server(self):
'''
Wraps MitmProxyHandler._connect_to_remote_server, first enforcing
limits and block rules in the Warcprox-Meta request header, if any.
Raises warcprox.RequestBlockedByRule if a rule has been enforced.
Otherwise calls MitmProxyHandler._connect_to_remote_server, which
initializes self._remote_server_sock.
'''
if 'Warcprox-Meta' in self.headers:
warcprox_meta = json.loads(self.headers['Warcprox-Meta'])
self._enforce_limits(warcprox_meta)
self._enforce_blocks(warcprox_meta)
return warcprox.mitmproxy.MitmProxyHandler._connect_to_remote_server(self)
def _proxy_request(self):
warcprox_meta = None
raw_warcprox_meta = self.headers.get('Warcprox-Meta')
self.logger.log(
warcprox.TRACE, 'request for %s Warcprox-Meta header: %s',
self.url, repr(raw_warcprox_meta))
if raw_warcprox_meta:
warcprox_meta = json.loads(raw_warcprox_meta)
del self.headers['Warcprox-Meta']
remote_ip = self._remote_server_sock.getpeername()[0]
timestamp = datetime.datetime.utcnow()
req, prox_rec_res = warcprox.mitmproxy.MitmProxyHandler._proxy_request(
self)
recorded_url = RecordedUrl(
url=self.url, request_data=req,
response_recorder=prox_rec_res.recorder, remote_ip=remote_ip,
warcprox_meta=warcprox_meta, status=prox_rec_res.status,
size=prox_rec_res.recorder.len,
client_ip=self.client_address[0],
content_type=prox_rec_res.getheader("Content-Type"),
method=self.command, timestamp=timestamp, host=self.hostname,
duration=datetime.datetime.utcnow()-timestamp)
self.server.recorded_url_q.put(recorded_url)
return recorded_url
# deprecated
def do_PUTMETA(self):
'''
Handles a special warcprox PUTMETA request (deprecated). A PUTMETA
request is equivalent to a WARCPROX_WRITE_RECORD request with
WARC-Type: metadata.
'''
self.do_WARCPROX_WRITE_RECORD(warc_type=warctools.WarcRecord.METADATA)
def do_WARCPROX_WRITE_RECORD(self, warc_type=None):
'''
Handles a request with http method WARCPROX_WRITE_RECORD, a special
type of request which tells warcprox to construct a warc record from
the request more or less verbatim, and write it to a warc.
To honor the request, this method creates a RecordedUrl and queues it for
the WarcWriterThread to process. The warc record headers Content-Type
and WARC-Type are taken from the request headers, as is the payload.
Example request:
WARCPROX_WRITE_RECORD screenshot:https://example.com/ HTTP/1.1
WARC-Type: metadata
Content-Type: image/png
Content-Length: 12345
Connection: close
<png image data>
'''
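# a client-side sketch of issuing such a request with the stdlib (host,
# port, and payload are hypothetical):
#   import http.client
#   payload = b'<png image data>'
#   conn = http.client.HTTPConnection('localhost', 8000)
#   conn.putrequest('WARCPROX_WRITE_RECORD', 'screenshot:https://example.com/')
#   conn.putheader('WARC-Type', 'metadata')
#   conn.putheader('Content-Type', 'image/png')
#   conn.putheader('Content-Length', str(len(payload)))
#   conn.endheaders(payload)
#   assert conn.getresponse().status == 204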
try:
self.url = self.path
if ('Content-Length' in self.headers and 'Content-Type' in self.headers
and (warc_type or 'WARC-Type' in self.headers)):
timestamp = datetime.datetime.utcnow()
# stream this?
request_data = self.rfile.read(int(self.headers['Content-Length']))
warcprox_meta = None
raw_warcprox_meta = self.headers.get('Warcprox-Meta')
if raw_warcprox_meta:
warcprox_meta = json.loads(raw_warcprox_meta)
rec_custom = RecordedUrl(url=self.url,
request_data=request_data,
response_recorder=None,
remote_ip=b'',
warcprox_meta=warcprox_meta,
content_type=self.headers['Content-Type'],
custom_type=warc_type or self.headers['WARC-Type'].encode('utf-8'),
status=204, size=len(request_data),
client_ip=self.client_address[0],
method=self.command, timestamp=timestamp)
self.server.recorded_url_q.put(rec_custom)
self.send_response(204, 'OK')
else:
self.send_error(400, 'Bad request')
self.end_headers()
except:
self.logger.error("uncaught exception in do_WARCPROX_WRITE_RECORD", exc_info=True)
raise
def log_message(self, fmt, *args):
# logging better handled elsewhere?
pass
class RecordedUrl:
logger = logging.getLogger("warcprox.warcproxy.RecordedUrl")
def __init__(self, url, request_data, response_recorder, remote_ip,
warcprox_meta=None, content_type=None, custom_type=None,
status=None, size=None, client_ip=None, method=None,
timestamp=None, host=None, duration=None):
# XXX should test what happens with non-ascii url (when does
# url-encoding happen?)
if type(url) is not bytes:
self.url = url.encode('ascii')
else:
self.url = url
if type(remote_ip) is not bytes:
self.remote_ip = remote_ip.encode('ascii')
else:
self.remote_ip = remote_ip
self.request_data = request_data
self.response_recorder = response_recorder
if warcprox_meta:
self.warcprox_meta = warcprox_meta
else:
self.warcprox_meta = {}
self.content_type = content_type
self.mimetype = content_type
if self.mimetype:
n = self.mimetype.find(";")
if n >= 0:
self.mimetype = self.mimetype[:n]
self.custom_type = custom_type
self.status = status
self.size = size
self.client_ip = client_ip
self.method = method
self.timestamp = timestamp
self.host = host
self.duration = duration
class SingleThreadedWarcProxy(http_server.HTTPServer):
logger = logging.getLogger("warcprox.warcproxy.WarcProxy")
def __init__(
self, ca=None, recorded_url_q=None, stats_db=None,
options=warcprox.Options()):
server_address = (
options.address or 'localhost',
options.port if options.port is not None else 8000)
if options.onion_tor_socks_proxy:
try:
host, port = options.onion_tor_socks_proxy.split(':')
WarcProxyHandler.onion_tor_socks_proxy_host = host
WarcProxyHandler.onion_tor_socks_proxy_port = int(port)
except ValueError:
WarcProxyHandler.onion_tor_socks_proxy_host = options.onion_tor_socks_proxy
WarcProxyHandler.onion_tor_socks_proxy_port = None
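# accepts e.g. --onion-tor-socks-proxy=localhost:9050, or a bare hostname
# (illustrative values)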
http_server.HTTPServer.__init__(
self, server_address, WarcProxyHandler, bind_and_activate=True)
self.digest_algorithm = options.digest_algorithm or 'sha1'
if ca is not None:
self.ca = ca
else:
ca_name = 'Warcprox CA on {}'.format(socket.gethostname())[:64]
self.ca = CertificateAuthority(ca_file='warcprox-ca.pem',
certs_dir='./warcprox-ca',
ca_name=ca_name)
if recorded_url_q is not None:
self.recorded_url_q = recorded_url_q
else:
self.recorded_url_q = queue.Queue(maxsize=options.queue_size or 1000)
self.stats_db = stats_db
self.options = options
class WarcProxy(SingleThreadedWarcProxy, warcprox.mitmproxy.PooledMitmProxy):
logger = logging.getLogger("warcprox.warcproxy.WarcProxy")
def __init__(
self, ca=None, recorded_url_q=None, stats_db=None,
options=warcprox.Options()):
if options.max_threads:
self.logger.info(
"max_threads=%s set by command line option",
options.max_threads)
warcprox.mitmproxy.PooledMitmProxy.__init__(self, options.max_threads)
SingleThreadedWarcProxy.__init__(
self, ca, recorded_url_q, stats_db, options)
def server_activate(self):
http_server.HTTPServer.server_activate(self)
self.logger.info(
'listening on %s:%s', self.server_address[0],
self.server_address[1])
def server_close(self):
self.logger.info('shutting down')
http_server.HTTPServer.server_close(self)
def handle_error(self, request, client_address):
self.logger.warn(
"exception processing request %s from %s", request,
client_address, exc_info=True)

301
warcprox/warcwriter.py
View File

@ -1,301 +0,0 @@
# vim:set sw=4 et:
from __future__ import absolute_import
try:
import queue
except ImportError:
import Queue as queue
import logging
import threading
import os
import hashlib
import time
import socket
import base64
from datetime import datetime
import hanzo.httptools
from hanzo import warctools
import warcprox
class WarcWriter:
logger = logging.getLogger("warcprox.warcwriter.WarcWriter")
# port is only used for warc filename
def __init__(self, directory='./warcs', rollover_size=1000000000,
gzip=False, prefix='WARCPROX', port=0,
digest_algorithm='sha1', base32=False, dedup_db=None,
playback_index_db=None):
self.rollover_size = rollover_size
self.gzip = gzip
self.digest_algorithm = digest_algorithm
self.base32 = base32
self.dedup_db = dedup_db
self.playback_index_db = playback_index_db
# warc path and filename stuff
self.directory = directory
self.prefix = prefix
self.port = port
self._f = None
self._fpath = None
self._serial = 0
if not os.path.exists(directory):
self.logger.info("warc destination directory {} doesn't exist, creating it".format(directory))
os.mkdir(directory)
# returns a tuple (principal_record, request_record) where principal_record is either a response or revisit record
def build_warc_records(self, recorded_url):
warc_date = warctools.warc.warc_datetime_str(datetime.utcnow())
dedup_info = None
if self.dedup_db is not None and recorded_url.response_recorder.payload_digest is not None:
key = self.digest_str(recorded_url.response_recorder.payload_digest)
dedup_info = self.dedup_db.lookup(key)
if dedup_info is not None:
# revisit record
recorded_url.response_recorder.tempfile.seek(0)
if recorded_url.response_recorder.payload_offset is not None:
response_header_block = recorded_url.response_recorder.tempfile.read(recorded_url.response_recorder.payload_offset)
else:
response_header_block = recorded_url.response_recorder.tempfile.read()
principal_record = self.build_warc_record(
url=recorded_url.url, warc_date=warc_date,
data=response_header_block,
warc_type=warctools.WarcRecord.REVISIT,
refers_to=dedup_info['i'],
refers_to_target_uri=dedup_info['u'],
refers_to_date=dedup_info['d'],
payload_digest=self.digest_str(recorded_url.response_recorder.payload_digest),
profile=warctools.WarcRecord.PROFILE_IDENTICAL_PAYLOAD_DIGEST,
content_type=hanzo.httptools.ResponseMessage.CONTENT_TYPE,
remote_ip=recorded_url.remote_ip)
else:
# response record
principal_record = self.build_warc_record(
url=recorded_url.url, warc_date=warc_date,
recorder=recorded_url.response_recorder,
warc_type=warctools.WarcRecord.RESPONSE,
content_type=hanzo.httptools.ResponseMessage.CONTENT_TYPE,
remote_ip=recorded_url.remote_ip)
request_record = self.build_warc_record(
url=recorded_url.url, warc_date=warc_date,
data=recorded_url.request_data,
warc_type=warctools.WarcRecord.REQUEST,
content_type=hanzo.httptools.RequestMessage.CONTENT_TYPE,
concurrent_to=principal_record.id)
return principal_record, request_record
def digest_str(self, hash_obj):
return hash_obj.name.encode('utf-8') + b':' + (base64.b32encode(hash_obj.digest()) if self.base32 else hash_obj.hexdigest().encode('ascii'))
def build_warc_record(self, url, warc_date=None, recorder=None, data=None,
concurrent_to=None, warc_type=None, content_type=None, remote_ip=None,
profile=None, refers_to=None, refers_to_target_uri=None,
refers_to_date=None, payload_digest=None):
if warc_date is None:
warc_date = warctools.warc.warc_datetime_str(datetime.utcnow())
record_id = warctools.WarcRecord.random_warc_uuid()
headers = []
if warc_type is not None:
headers.append((warctools.WarcRecord.TYPE, warc_type))
headers.append((warctools.WarcRecord.ID, record_id))
headers.append((warctools.WarcRecord.DATE, warc_date))
headers.append((warctools.WarcRecord.URL, url))
if remote_ip is not None:
headers.append((warctools.WarcRecord.IP_ADDRESS, remote_ip))
if profile is not None:
headers.append((warctools.WarcRecord.PROFILE, profile))
if refers_to is not None:
headers.append((warctools.WarcRecord.REFERS_TO, refers_to))
if refers_to_target_uri is not None:
headers.append((warctools.WarcRecord.REFERS_TO_TARGET_URI, refers_to_target_uri))
if refers_to_date is not None:
headers.append((warctools.WarcRecord.REFERS_TO_DATE, refers_to_date))
if concurrent_to is not None:
headers.append((warctools.WarcRecord.CONCURRENT_TO, concurrent_to))
if content_type is not None:
headers.append((warctools.WarcRecord.CONTENT_TYPE, content_type))
if payload_digest is not None:
headers.append((warctools.WarcRecord.PAYLOAD_DIGEST, payload_digest))
if recorder is not None:
headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(recorder)).encode('latin1')))
headers.append((warctools.WarcRecord.BLOCK_DIGEST,
self.digest_str(recorder.block_digest)))
if recorder.payload_digest is not None:
headers.append((warctools.WarcRecord.PAYLOAD_DIGEST,
self.digest_str(recorder.payload_digest)))
recorder.tempfile.seek(0)
record = warctools.WarcRecord(headers=headers, content_file=recorder.tempfile)
else:
headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(data)).encode('latin1')))
block_digest = hashlib.new(self.digest_algorithm, data)
headers.append((warctools.WarcRecord.BLOCK_DIGEST,
self.digest_str(block_digest)))
content_tuple = content_type, data
record = warctools.WarcRecord(headers=headers, content=content_tuple)
return record
def timestamp17(self):
now = datetime.utcnow()
return '{:%Y%m%d%H%M%S}{:03d}'.format(now, now.microsecond//1000)
def close_writer(self):
if self._fpath:
self.logger.info('closing {0}'.format(self._f_finalname))
self._f.close()
finalpath = os.path.sep.join([self.directory, self._f_finalname])
os.rename(self._fpath, finalpath)
self._fpath = None
self._f = None
def _build_warcinfo_record(self, filename):
warc_record_date = warctools.warc.warc_datetime_str(datetime.utcnow())
record_id = warctools.WarcRecord.random_warc_uuid()
headers = []
headers.append((warctools.WarcRecord.ID, record_id))
headers.append((warctools.WarcRecord.TYPE, warctools.WarcRecord.WARCINFO))
headers.append((warctools.WarcRecord.FILENAME, filename.encode('latin1')))
headers.append((warctools.WarcRecord.DATE, warc_record_date))
warcinfo_fields = []
warcinfo_fields.append(b'software: warcprox ' + warcprox.version_bytes)
hostname = socket.gethostname()
warcinfo_fields.append('hostname: {}'.format(hostname).encode('latin1'))
warcinfo_fields.append('ip: {0}'.format(socket.gethostbyname(hostname)).encode('latin1'))
warcinfo_fields.append(b'format: WARC File Format 1.0')
# warcinfo_fields.append('robots: ignore')
# warcinfo_fields.append('description: {0}'.format(self.description))
# warcinfo_fields.append('isPartOf: {0}'.format(self.is_part_of))
data = b'\r\n'.join(warcinfo_fields) + b'\r\n'
record = warctools.WarcRecord(headers=headers, content=(b'application/warc-fields', data))
return record
# <!-- <property name="template" value="${prefix}-${timestamp17}-${serialno}-${heritrix.pid}~${heritrix.hostname}~${heritrix.port}" /> -->
def _writer(self):
if self._fpath and os.path.getsize(self._fpath) > self.rollover_size:
self.close_writer()
if self._f == None:
self._f_finalname = '{}-{}-{:05d}-{}-{}-{}.warc{}'.format(
self.prefix, self.timestamp17(), self._serial, os.getpid(),
socket.gethostname(), self.port, '.gz' if self.gzip else '')
self._fpath = os.path.sep.join([self.directory, self._f_finalname + '.open'])
self._f = open(self._fpath, 'wb')
warcinfo_record = self._build_warcinfo_record(self._f_finalname)
self.logger.debug('warcinfo_record.headers={}'.format(warcinfo_record.headers))
warcinfo_record.write_to(self._f, gzip=self.gzip)
self._serial += 1
return self._f
def _final_tasks(self, recorded_url, recordset, recordset_offset):
if (self.dedup_db is not None
and recordset[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.RESPONSE
and recorded_url.response_recorder.payload_size() > 0):
key = self.digest_str(recorded_url.response_recorder.payload_digest)
self.dedup_db.save(key, recordset[0], recordset_offset)
if self.playback_index_db is not None:
self.playback_index_db.save(self._f_finalname, recordset, recordset_offset)
recorded_url.response_recorder.tempfile.close()
def write_records(self, recorded_url):
recordset = self.build_warc_records(recorded_url)
writer = self._writer()
recordset_offset = writer.tell()
for record in recordset:
offset = writer.tell()
record.write_to(writer, gzip=self.gzip)
self.logger.debug('wrote warc record: warc_type={} content_length={} url={} warc={} offset={}'.format(
record.get_header(warctools.WarcRecord.TYPE),
record.get_header(warctools.WarcRecord.CONTENT_LENGTH),
record.get_header(warctools.WarcRecord.URL),
self._fpath, offset))
self._f.flush()
self._final_tasks(recorded_url, recordset, recordset_offset)
class WarcWriterThread(threading.Thread):
logger = logging.getLogger("warcprox.warcwriter.WarcWriterThread")
def __init__(self, recorded_url_q=None, warc_writer=None, rollover_idle_time=None):
"""recorded_url_q is a queue.Queue of warcprox.warcprox.RecordedUrl."""
threading.Thread.__init__(self, name='WarcWriterThread')
self.recorded_url_q = recorded_url_q
self.rollover_idle_time = rollover_idle_time
self.stop = threading.Event()
if warc_writer:
self.warc_writer = warc_writer
else:
self.warc_writer = WarcWriter()
def run(self):
self.logger.info('WarcWriterThread starting, directory={} gzip={} rollover_size={} rollover_idle_time={} prefix={} port={}'.format(
os.path.abspath(self.warc_writer.directory), self.warc_writer.gzip, self.warc_writer.rollover_size,
self.rollover_idle_time, self.warc_writer.prefix, self.warc_writer.port))
self._last_sync = self._last_activity = time.time()
while not self.stop.is_set():
try:
recorded_url = self.recorded_url_q.get(block=True, timeout=0.5)
self.logger.info("recorded_url.warcprox_meta={} for {}".format(recorded_url.warcprox_meta, recorded_url.url))
self.warc_writer.write_records(recorded_url)
self._last_activity = time.time()
except queue.Empty:
if (self.warc_writer._fpath is not None
and self.rollover_idle_time is not None
and self.rollover_idle_time > 0
and time.time() - self._last_activity > self.rollover_idle_time):
self.logger.debug('rolling over warc file after {} seconds idle'.format(time.time() - self._last_activity))
self.warc_writer.close_writer()
if time.time() - self._last_sync > 60:
if self.warc_writer.dedup_db:
self.warc_writer.dedup_db.sync()
if self.warc_writer.playback_index_db:
self.warc_writer.playback_index_db.sync()
self._last_sync = time.time()
self.logger.info('WarcWriterThread shutting down')
self.warc_writer.close_writer();

168
warcprox/writer.py Normal file
View File

@ -0,0 +1,168 @@
#
# warcprox/writer.py - warc writer, manages and writes records to warc files
#
# Copyright (C) 2013-2016 Internet Archive
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
# USA.
#
from __future__ import absolute_import
import logging
from datetime import datetime
from hanzo import warctools
import time
import warcprox
import os
import socket
import string
import random
class WarcWriter:
logger = logging.getLogger('warcprox.writer.WarcWriter')
def __init__(self, options=warcprox.Options()):
self.rollover_size = options.rollover_size or 1000000000
self.rollover_idle_time = options.rollover_idle_time or None
self._last_activity = time.time()
self.gzip = options.gzip or False
digest_algorithm = options.digest_algorithm or 'sha1'
base32 = options.base32
self.record_builder = warcprox.warc.WarcRecordBuilder(digest_algorithm=digest_algorithm, base32=base32)
# warc path and filename stuff
self.directory = options.directory or './warcs'
self.prefix = options.prefix or 'warcprox'
self._f = None
self._fpath = None
self._f_finalname = None
self._serial = 0
self._randomtoken = "".join(random.Random().sample(string.digits + string.ascii_lowercase, 8))
if not os.path.exists(self.directory):
self.logger.info("warc destination directory {} doesn't exist, creating it".format(self.directory))
os.mkdir(self.directory)
def timestamp17(self):
now = datetime.utcnow()
return '{:%Y%m%d%H%M%S}{:03d}'.format(now, now.microsecond//1000)
def close_writer(self):
if self._fpath:
self.logger.info('closing {0}'.format(self._f_finalname))
self._f.close()
finalpath = os.path.sep.join([self.directory, self._f_finalname])
os.rename(self._fpath, finalpath)
self._fpath = None
self._f = None
# h3 default <!-- <property name="template" value="${prefix}-${timestamp17}-${serialno}-${heritrix.pid}~${heritrix.hostname}~${heritrix.port}" /> -->
# ${prefix}-${timestamp17}-${serialno}-${randomtoken}.warc.gz
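# e.g. warcprox-20161019143059123-00000-3f2a9bcd.warc.gz (illustrative)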
def _writer(self):
if self._fpath and os.path.getsize(self._fpath) > self.rollover_size:
self.close_writer()
if self._f is None:
self._f_finalname = '{}-{}-{:05d}-{}.warc{}'.format(
self.prefix, self.timestamp17(), self._serial, self._randomtoken, '.gz' if self.gzip else '')
self._fpath = os.path.sep.join([self.directory, self._f_finalname + '.open'])
self._f = open(self._fpath, 'wb')
warcinfo_record = self.record_builder.build_warcinfo_record(self._f_finalname)
self.logger.debug('warcinfo_record.headers={}'.format(warcinfo_record.headers))
warcinfo_record.write_to(self._f, gzip=self.gzip)
self._serial += 1
return self._f
def write_records(self, recorded_url):
"""Returns tuple of records written, which are instances of
hanzo.warctools.warc.WarcRecord, decorated with "warc_filename" and
"offset" attributes."""
records = self.record_builder.build_warc_records(recorded_url)
writer = self._writer()
recordset_offset = writer.tell()
for record in records:
offset = writer.tell()
record.write_to(writer, gzip=self.gzip)
record.offset = offset
record.length = writer.tell() - offset
record.warc_filename = self._f_finalname
self.logger.debug('wrote warc record: warc_type=%s content_length=%s url=%s warc=%s offset=%d',
record.get_header(warctools.WarcRecord.TYPE),
record.get_header(warctools.WarcRecord.CONTENT_LENGTH),
record.get_header(warctools.WarcRecord.URL),
self._fpath, record.offset)
self._f.flush()
self._last_activity = time.time()
return records
def maybe_idle_rollover(self):
if (self._fpath is not None
and self.rollover_idle_time is not None
and self.rollover_idle_time > 0
and time.time() - self._last_activity > self.rollover_idle_time):
self.logger.debug('rolling over {} after {} seconds idle'.format(self._f_finalname, time.time() - self._last_activity))
self.close_writer()
class WarcWriterPool:
logger = logging.getLogger("warcprox.writer.WarcWriterPool")
def __init__(self, options=warcprox.Options()):
self.default_warc_writer = WarcWriter(options=options)
self.warc_writers = {} # {prefix:WarcWriter}
self._last_sync = time.time()
self.options = options
# chooses writer for filename specified by warcprox_meta["warc-prefix"] if set
def _writer(self, recorded_url):
w = self.default_warc_writer
if recorded_url.warcprox_meta and "warc-prefix" in recorded_url.warcprox_meta:
# self.logger.info("recorded_url.warcprox_meta={} for {}".format(recorded_url.warcprox_meta, recorded_url.url))
options = warcprox.Options(**vars(self.options))
options.prefix = recorded_url.warcprox_meta["warc-prefix"]
if not options.prefix in self.warc_writers:
self.warc_writers[options.prefix] = WarcWriter(options=options)
w = self.warc_writers[options.prefix]
return w
def write_records(self, recorded_url):
"""Returns tuple of records written, which are instances of
hanzo.warctools.warc.WarcRecord, decorated with "warc_filename" and
"offset" attributes."""
return self._writer(recorded_url).write_records(recorded_url)
def maybe_idle_rollover(self):
self.default_warc_writer.maybe_idle_rollover()
for w in self.warc_writers.values():
w.maybe_idle_rollover()
def close_writers(self):
self.default_warc_writer.close_writer()
for w in self.warc_writers.values():
w.close_writer()
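
A minimal sketch of driving the pool directly (the Options fields and the
recorded_url stand-in are assumptions for illustration):

    import warcprox
    import warcprox.writer

    options = warcprox.Options(directory='./warcs', prefix='WARCPROX', gzip=True)
    pool = warcprox.writer.WarcWriterPool(options=options)
    # given some warcprox.warcproxy.RecordedUrl instance `recorded_url`:
    # a Warcprox-Meta {"warc-prefix": "job1"} routes it to job1-*.warc.gz,
    # everything else to WARCPROX-*.warc.gz
    records = pool.write_records(recorded_url)
    pool.close_writers()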

122
warcprox/writerthread.py Normal file
View File

@ -0,0 +1,122 @@
#
# warcprox/writerthread.py - warc writer thread, reads from the recorded url
# queue, writes warc records, runs final tasks after warc records are written
#
# Copyright (C) 2013-2016 Internet Archive
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
# USA.
#
from __future__ import absolute_import
try:
import queue
except ImportError:
import Queue as queue
import logging
import threading
import os
import hashlib
import time
import socket
import base64
from datetime import datetime
import hanzo.httptools
from hanzo import warctools
import warcprox
import cProfile
class WarcWriterThread(threading.Thread):
logger = logging.getLogger("warcprox.warcproxwriter.WarcWriterThread")
def __init__(self, recorded_url_q=None, writer_pool=None, dedup_db=None, listeners=None, options=warcprox.Options()):
"""recorded_url_q is a queue.Queue of warcprox.warcprox.RecordedUrl."""
threading.Thread.__init__(self, name='WarcWriterThread')
self.recorded_url_q = recorded_url_q
self.stop = threading.Event()
if writer_pool:
self.writer_pool = writer_pool
else:
self.writer_pool = WarcWriterPool()
self.dedup_db = dedup_db
self.listeners = listeners
self.options = options
self.idle = None
def run(self):
if self.options.profile:
cProfile.runctx('self._run()', globals(), locals(), sort='cumulative')
else:
self._run()
def _run(self):
while not self.stop.is_set():
try:
self.name = 'WarcWriterThread(tid={})'.format(warcprox.gettid())
while True:
try:
if self.stop.is_set():
qsize = self.recorded_url_q.qsize()
if qsize % 50 == 0:
self.logger.info("%s urls left to write", qsize)
recorded_url = self.recorded_url_q.get(block=True, timeout=0.5)
self.idle = None
if self.dedup_db:
warcprox.dedup.decorate_with_dedup_info(self.dedup_db,
recorded_url, base32=self.options.base32)
records = self.writer_pool.write_records(recorded_url)
self._final_tasks(recorded_url, records)
# try to release resources in a timely fashion
if recorded_url.response_recorder and recorded_url.response_recorder.tempfile:
recorded_url.response_recorder.tempfile.close()
except queue.Empty:
if self.stop.is_set():
break
self.idle = time.time()
self.writer_pool.maybe_idle_rollover()
self.logger.info('WarcWriterThread shutting down')
self.writer_pool.close_writers()
except:
self.logger.critical("WarcWriterThread will try to continue after unexpected error", exc_info=True)
time.sleep(0.5)
# closest thing we have to heritrix crawl log at the moment
def _log(self, recorded_url, records):
try:
payload_digest = records[0].get_header(warctools.WarcRecord.PAYLOAD_DIGEST).decode("utf-8")
except:
payload_digest = "-"
# 2015-07-17T22:32:23.672Z 1 58 dns:www.dhss.delaware.gov P http://www.dhss.delaware.gov/dhss/ text/dns #045 20150717223214881+316 sha1:63UTPB7GTWIHAGIK3WWL76E57BBTJGAK http://www.dhss.delaware.gov/dhss/ - {"warcFileOffset":2964,"warcFilename":"ARCHIVEIT-1303-WEEKLY-JOB165158-20150717223222113-00000.warc.gz"}
self.logger.info("{} {} {} {} {} size={} {} {} {} offset={}".format(
recorded_url.client_ip, recorded_url.status, recorded_url.method,
recorded_url.url.decode("utf-8"), recorded_url.mimetype,
recorded_url.size, payload_digest, records[0].type.decode("utf-8"),
records[0].warc_filename, records[0].offset))
def _final_tasks(self, recorded_url, records):
if self.listeners:
for listener in self.listeners:
try:
listener.notify(recorded_url, records)
except:
self.logger.error('%s raised exception',
listener.notify, exc_info=True)
self._log(recorded_url, records)
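
For reference, a minimal listener compatible with _final_tasks above (the
class name is hypothetical; the warc_filename/offset attributes come from
the decoration done in warcprox/writer.py):

    class WarcWrittenListener:
        def notify(self, recorded_url, records):
            # records are hanzo warctools WarcRecords, decorated with
            # warc_filename and offset by WarcWriter.write_records()
            print('%s written to %s at offset %s' % (
                recorded_url.url, records[0].warc_filename, records[0].offset))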