From af863c6dba906d123627a6cc03f106d2baca8124 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 15 May 2018 11:22:10 -0700 Subject: [PATCH 01/33] default values for dedup_min_text_size et al because they may be missing in case warcprox is used as a library --- setup.py | 2 +- warcprox/dedup.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 4579b12..a7fdb47 100755 --- a/setup.py +++ b/setup.py @@ -40,7 +40,7 @@ except: setuptools.setup( name='warcprox', - version='2.4b2.dev170', + version='2.4b2.dev171', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/warcprox/dedup.py b/warcprox/dedup.py index f979d97..5d5039f 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -39,9 +39,9 @@ urllib3.disable_warnings() class DedupableMixin(object): def __init__(self, options=warcprox.Options()): - self.min_text_size = options.dedup_min_text_size - self.min_binary_size = options.dedup_min_binary_size - self.dedup_only_with_bucket = options.dedup_only_with_bucket + self.min_text_size = options.dedup_min_text_size or 0 + self.min_binary_size = options.dedup_min_binary_size or 0 + self.dedup_only_with_bucket = options.dedup_only_with_bucket or False def should_dedup(self, recorded_url): """Check if we should try to run dedup on resource based on payload From e23af32e94d5b33f6527cfc58c798385b2cab79f Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 15 May 2018 15:33:52 -0700 Subject: [PATCH 02/33] we want to save all captures to the big "captures" table, even if we don't want to dedup against them --- setup.py | 2 +- warcprox/bigtable.py | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index a7fdb47..eefcdb4 100755 --- a/setup.py +++ b/setup.py @@ -40,7 +40,7 @@ except: setuptools.setup( name='warcprox', - version='2.4b2.dev171', + version='2.4b2.dev172', description='WARC writing MITM HTTP/S 
proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/warcprox/bigtable.py b/warcprox/bigtable.py index d8cd218..0d98270 100644 --- a/warcprox/bigtable.py +++ b/warcprox/bigtable.py @@ -253,6 +253,4 @@ class RethinkCapturesDedup(warcprox.dedup.DedupDb, DedupableMixin): self.captures_db.close() def notify(self, recorded_url, records): - if (records and records[0].type == b'response' - and self.should_dedup(recorded_url)): - self.captures_db.notify(recorded_url, records) + self.captures_db.notify(recorded_url, records) From 5f0c46d579a45a06768babbe8b72f83e98d73ce0 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 16 May 2018 10:52:04 -0700 Subject: [PATCH 03/33] rewrite test_dedup_min_size() to account for the fact that we always save a record to the big captures table, partly by adding a new check that --dedup-min-*-size is respected even if there is an entry in the dedup db for the sha1 --- tests/test_warcprox.py | 145 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 132 insertions(+), 13 deletions(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 269deee..2a12b22 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1940,7 +1940,7 @@ def test_trough_segment_promotion(warcprox_): time.sleep(3) assert promoted == [] -def test_dedup_min_size(http_daemon, warcprox_, archiving_proxies, playback_proxies): +def test_dedup_min_text_size(http_daemon, warcprox_, archiving_proxies): """We use options --dedup-min-text-size=3 --dedup-min-binary-size=5 and we try to download content smaller than these limits to make sure that it is not deduplicated. 
We create the digest_str with the following code: @@ -1950,36 +1950,155 @@ def test_dedup_min_size(http_daemon, warcprox_, archiving_proxies, playback_prox warcprox.digest_str(payload_digest) ``` """ + urls_before = warcprox_.proxy.running_stats.urls + + # start a fresh warc + warcprox_.warc_writer_processor.writer_pool.close_writers() + + # fetch small text url = 'http://localhost:%s/text-2bytes' % http_daemon.server_port response = requests.get( url, proxies=archiving_proxies, verify=False, timeout=10) assert len(response.content) == 2 + # wait for postfetch chain + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1) + # check no dedup was saved (except RethinkCapturesDedup which always saves) dedup_lookup = warcprox_.dedup_db.lookup( b'sha1:e0c9035898dd52fc65c41454cec9c4d2611bfb37') - assert dedup_lookup is None - time.sleep(3) + if not isinstance(warcprox_.dedup_db, warcprox.bigtable.RethinkCapturesDedup): + assert dedup_lookup is None + + # fetch again saving dedup info so that we can test dedup info ignored + orig_should_dedup = warcprox_.dedup_db.should_dedup + warcprox_.dedup_db.should_dedup = lambda *args, **kwargs: True + try: + response = requests.get( + url, proxies=archiving_proxies, verify=False, timeout=10) + assert len(response.content) == 2 + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 2) + # check dedup was saved + dedup_lookup = warcprox_.dedup_db.lookup( + b'sha1:e0c9035898dd52fc65c41454cec9c4d2611bfb37') + assert dedup_lookup + finally: + warcprox_.dedup_db.should_dedup = orig_should_dedup + else: + assert dedup_lookup + + # fetch again and check that it was not deduped + urls_before = warcprox_.proxy.running_stats.urls response = requests.get( url, proxies=archiving_proxies, verify=False, timeout=10) - dedup_lookup = warcprox_.dedup_db.lookup( - b'sha1:e0c9035898dd52fc65c41454cec9c4d2611bfb37') - # This would return dedup data if payload_size > dedup-min-text-size - assert dedup_lookup is None + assert 
len(response.content) == 2 + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1) + # check that response records were written + warc = warcprox_.warc_writer_processor.writer_pool.default_warc_writer._available_warcs.queue[0].path + with open(warc, 'rb') as f: + rec_iter = iter(warcio.archiveiterator.ArchiveIterator(f)) + record = next(rec_iter) + assert record.rec_type == 'warcinfo' + record = next(rec_iter) + assert record.rec_type == 'response' + assert record.rec_headers.get_header('warc-target-uri') == url + record = next(rec_iter) + assert record.rec_type == 'request' + assert record.rec_headers.get_header('warc-target-uri') == url + record = next(rec_iter) + assert record.rec_type == 'response' + assert record.rec_headers.get_header('warc-target-uri') == url + record = next(rec_iter) + assert record.rec_type == 'request' + assert record.rec_headers.get_header('warc-target-uri') == url + if not isinstance(warcprox_.dedup_db, warcprox.bigtable.RethinkCapturesDedup): + record = next(rec_iter) + assert record.rec_type == 'response' + assert record.rec_headers.get_header('warc-target-uri') == url + record = next(rec_iter) + assert record.rec_type == 'request' + assert record.rec_headers.get_header('warc-target-uri') == url + with pytest.raises(StopIteration): + next(rec_iter) + +def test_dedup_min_binary_size(http_daemon, warcprox_, archiving_proxies): + """We use options --dedup-min-text-size=3 --dedup-min-binary-size=5 and we + try to download content smaller than these limits to make sure that it is + not deduplicated. 
We create the digest_str with the following code: + ``` + payload_digest = hashlib.new('sha1') + payload_digest.update(b'aa') + warcprox.digest_str(payload_digest) + ``` + """ + urls_before = warcprox_.proxy.running_stats.urls + + # start a fresh warc + warcprox_.warc_writer_processor.writer_pool.close_writers() + + # fetch small binary url = 'http://localhost:%s/binary-4bytes' % http_daemon.server_port response = requests.get( url, proxies=archiving_proxies, verify=False, timeout=10) assert len(response.content) == 4 + # wait for postfetch chain + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1) + # check no dedup was saved (except RethinkCapturesDedup which always saves) dedup_lookup = warcprox_.dedup_db.lookup( b'sha1:70c881d4a26984ddce795f6f71817c9cf4480e79') - assert dedup_lookup is None - time.sleep(3) + if not isinstance(warcprox_.dedup_db, warcprox.bigtable.RethinkCapturesDedup): + assert dedup_lookup is None + + # fetch again saving dedup info so that we can test dedup info ignored + orig_should_dedup = warcprox_.dedup_db.should_dedup + warcprox_.dedup_db.should_dedup = lambda *args, **kwargs: True + try: + response = requests.get( + url, proxies=archiving_proxies, verify=False, timeout=10) + assert len(response.content) == 4 + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 2) + # check dedup was saved + dedup_lookup = warcprox_.dedup_db.lookup( + b'sha1:70c881d4a26984ddce795f6f71817c9cf4480e79') + assert dedup_lookup + finally: + warcprox_.dedup_db.should_dedup = orig_should_dedup + else: + assert dedup_lookup + + # fetch again and check that it was not deduped + urls_before = warcprox_.proxy.running_stats.urls response = requests.get( url, proxies=archiving_proxies, verify=False, timeout=10) - dedup_lookup = warcprox_.dedup_db.lookup( - b'sha1:70c881d4a26984ddce795f6f71817c9cf4480e79') - # This would return dedup data if payload_size > dedup-min-binary-size - assert dedup_lookup is None + assert len(response.content) 
== 4 + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1) + # check that response records were written + warc = warcprox_.warc_writer_processor.writer_pool.default_warc_writer._available_warcs.queue[0].path + with open(warc, 'rb') as f: + rec_iter = iter(warcio.archiveiterator.ArchiveIterator(f)) + record = next(rec_iter) + assert record.rec_type == 'warcinfo' + record = next(rec_iter) + assert record.rec_type == 'response' + assert record.rec_headers.get_header('warc-target-uri') == url + record = next(rec_iter) + assert record.rec_type == 'request' + assert record.rec_headers.get_header('warc-target-uri') == url + record = next(rec_iter) + assert record.rec_type == 'response' + assert record.rec_headers.get_header('warc-target-uri') == url + record = next(rec_iter) + assert record.rec_type == 'request' + assert record.rec_headers.get_header('warc-target-uri') == url + if not isinstance(warcprox_.dedup_db, warcprox.bigtable.RethinkCapturesDedup): + record = next(rec_iter) + assert record.rec_type == 'response' + assert record.rec_headers.get_header('warc-target-uri') == url + record = next(rec_iter) + assert record.rec_type == 'request' + assert record.rec_headers.get_header('warc-target-uri') == url + with pytest.raises(StopIteration): + next(rec_iter) if __name__ == '__main__': pytest.main() From 76ebaea944c7b117bc910c158e741f202ea2101d Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 16 May 2018 12:17:06 -0700 Subject: [PATCH 04/33] fix test_dedup_min_text_size failure? by waiting for postfetch chain in test_socket_timeout_response --- tests/test_warcprox.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 2a12b22..079fdd1 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1837,6 +1837,8 @@ def test_socket_timeout_response( """Response will timeout because we use --socket-timeout=4 whereas the target URL will return after 6 sec. 
""" + urls_before = warcprox_.proxy.running_stats.urls + url = 'http://localhost:%s/slow-response' % http_daemon.server_port response = requests.get(url, proxies=archiving_proxies, verify=False) assert response.status_code == 502 @@ -1849,6 +1851,8 @@ def test_socket_timeout_response( assert response.status_code == 404 assert response.content == b'404 Not Found\n' + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1) + def test_empty_response( warcprox_, http_daemon, https_daemon, archiving_proxies, playback_proxies): From 49f637af0584fb1ef3d0e61911b43f570ea43962 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 16 May 2018 13:48:04 -0700 Subject: [PATCH 05/33] fix trough deployment in Dockerfile --- tests/Dockerfile | 16 +++++++++------- tests/run-trough.sh | 2 +- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/tests/Dockerfile b/tests/Dockerfile index 5e380d8..6a97ac0 100644 --- a/tests/Dockerfile +++ b/tests/Dockerfile @@ -75,11 +75,13 @@ RUN mv -v /etc/hadoop/conf/hdfs-site.xml /etc/hadoop/conf/hdfs-site.xml.orig \ RUN echo '#!/bin/bash\nservice hadoop-hdfs-namenode start\nservice hadoop-hdfs-datanode start' > /etc/my_init.d/50_start_hdfs.sh \ && chmod a+x /etc/my_init.d/50_start_hdfs.sh +RUN apt-get install -y libsqlite3-dev + # trough itself RUN virtualenv -p python3 /opt/trough-ve3 \ && . 
/opt/trough-ve3/bin/activate \ && pip install git+https://github.com/jkafader/snakebite@feature/python3-version-string \ - && pip install git+https://github.com/nlevitt/trough.git@toward-warcprox-dedup + && pip install git+https://github.com/internetarchive/trough.git RUN mkdir -vp /etc/service/trough-sync-local \ && echo "#!/bin/bash\nsource /opt/trough-ve3/bin/activate\nexec sync.py >>/tmp/trough-sync-local.out 2>&1" > /etc/service/trough-sync-local/run \ @@ -97,11 +99,11 @@ RUN mkdir -vp /etc/service/trough-write \ && echo '#!/bin/bash\nvenv=/opt/trough-ve3\nsource $venv/bin/activate\nsleep 5\npython -c $"import doublethink ; from trough.settings import settings ; rr = doublethink.Rethinker(settings[\"RETHINKDB_HOSTS\"]) ; rr.db(\"trough_configuration\").wait().run()"\nexec uwsgi --venv=$venv --http :6222 --master --processes=2 --harakiri=240 --max-requests=50000 --vacuum --die-on-term --wsgi-file $venv/bin/writer.py >>/tmp/trough-write.out 2>&1' > /etc/service/trough-write/run \ && chmod a+x /etc/service/trough-write/run -RUN mkdir -vp /etc/service/trough-write-provisioner-local \ - && echo '#!/bin/bash\nvenv=/opt/trough-ve3\nsource $venv/bin/activate\nsleep 5\npython -c $"import doublethink ; from trough.settings import settings ; rr = doublethink.Rethinker(settings[\"RETHINKDB_HOSTS\"]) ; rr.db(\"trough_configuration\").wait().run()"\nexec uwsgi --venv=$venv --http :6112 --master --processes=2 --harakiri=20 --max-requests=50000 --vacuum --die-on-term --wsgi-file $venv/bin/write_provisioner_local.py >>/tmp/trough-write-provisioner-local.out 2>&1' > /etc/service/trough-write-provisioner-local/run \ - && chmod a+x /etc/service/trough-write-provisioner-local/run +RUN mkdir -vp /etc/service/trough-segment-manager-local \ + && echo '#!/bin/bash\nvenv=/opt/trough-ve3\nsource $venv/bin/activate\nsleep 5\npython -c $"import doublethink ; from trough.settings import settings ; rr = doublethink.Rethinker(settings[\"RETHINKDB_HOSTS\"]) ; 
rr.db(\"trough_configuration\").wait().run()"\nexec uwsgi --venv=$venv --http :6112 --master --processes=2 --harakiri=7200 --http-timeout=7200 --max-requests=50000 --vacuum --die-on-term --mount /=trough.wsgi.segment_manager:local >>/tmp/trough-segment-manager-local.out 2>&1' > /etc/service/trough-segment-manager-local/run \ + && chmod a+x /etc/service/trough-segment-manager-local/run -RUN mkdir -vp /etc/service/trough-write-provisioner-server \ - && echo '#!/bin/bash\nvenv=/opt/trough-ve3\nsource $venv/bin/activate\nsleep 5\npython -c $"import doublethink ; from trough.settings import settings ; rr = doublethink.Rethinker(settings[\"RETHINKDB_HOSTS\"]) ; rr.db(\"trough_configuration\").wait().run()"\nexec uwsgi --venv=$venv --http :6111 --master --processes=2 --harakiri=20 --max-requests=50000 --vacuum --die-on-term --wsgi-file $venv/bin/write_provisioner_server.py >>/tmp/trough-write-provisioner-server.out 2>&1' > /etc/service/trough-write-provisioner-server/run \ - && chmod a+x /etc/service/trough-write-provisioner-server/run +RUN mkdir -vp /etc/service/trough-segment-manager-server \ + && echo '#!/bin/bash\nvenv=/opt/trough-ve3\nsource $venv/bin/activate\nsleep 5\npython -c $"import doublethink ; from trough.settings import settings ; rr = doublethink.Rethinker(settings[\"RETHINKDB_HOSTS\"]) ; rr.db(\"trough_configuration\").wait().run()"\nexec uwsgi --venv=$venv --http :6111 --master --processes=2 --harakiri=7200 --http-timeout=7200 --max-requests=50000 --vacuum --die-on-term --mount /=trough.wsgi.segment_manager:server >>/tmp/trough-segment-manager-server.out 2>&1' > /etc/service/trough-segment-manager-server/run \ + && chmod a+x /etc/service/trough-segment-manager-server/run diff --git a/tests/run-trough.sh b/tests/run-trough.sh index c2319a0..81e0e68 100644 --- a/tests/run-trough.sh +++ b/tests/run-trough.sh @@ -4,7 +4,7 @@ # pip install git+https://github.com/jkafader/snakebite@feature/python3-version-string -pip install 
git+https://github.com/internetarchive/trough.git@toward-warcprox-dedup +pip install git+https://github.com/internetarchive/trough.git mkdir /etc/trough From d834ac3e5910e8354b9e74630b94aa4bab500c81 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 16 May 2018 14:21:18 -0700 Subject: [PATCH 06/33] only run tests in py3 --- tests/run-tests.sh | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/tests/run-tests.sh b/tests/run-tests.sh index a7a819c..12dd371 100755 --- a/tests/run-tests.sh +++ b/tests/run-tests.sh @@ -31,18 +31,15 @@ script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" docker build -t internetarchive/warcprox-tests $script_dir -for python in python3 python2.7 -do - docker run --rm --volume="$script_dir/..:/warcprox" internetarchive/warcprox-tests /sbin/my_init -- \ - bash -x -c "cd /tmp && git clone /warcprox && cd /tmp/warcprox \ - && (cd /warcprox && git diff HEAD) | patch -p1 \ - && virtualenv -p $python /tmp/venv \ - && source /tmp/venv/bin/activate \ - && pip --log-file /tmp/pip.log install . pytest mock requests warcio \ - && py.test -v tests \ - && py.test -v --rethinkdb-dedup-url=rethinkdb://localhost/test1/dedup tests \ - && py.test -v --rethinkdb-big-table-url=rethinkdb://localhost/test2/captures tests \ - && py.test -v --rethinkdb-trough-db-url=rethinkdb://localhost/trough_configuration tests \ - " -done +docker run --rm --volume="$script_dir/..:/warcprox" internetarchive/warcprox-tests /sbin/my_init -- \ + bash -x -c "cd /tmp && git clone /warcprox && cd /tmp/warcprox \ + && (cd /warcprox && git diff HEAD) | patch -p1 \ + && virtualenv -p python3 /tmp/venv \ + && source /tmp/venv/bin/activate \ + && pip --log-file /tmp/pip.log install . 
pytest mock requests warcio \ + && py.test -v tests \ + && py.test -v --rethinkdb-dedup-url=rethinkdb://localhost/test1/dedup tests \ + && py.test -v --rethinkdb-big-table-url=rethinkdb://localhost/test2/captures tests \ + && py.test -v --rethinkdb-trough-db-url=rethinkdb://localhost/trough_configuration tests \ + " From b762d6468b40fc740c4ec38a575b8896cac86689 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 16 May 2018 14:25:01 -0700 Subject: [PATCH 07/33] just one should_dedup() for trough dedup fixes failing test and clarifies things --- setup.py | 2 +- warcprox/dedup.py | 10 ++++------ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/setup.py b/setup.py index eefcdb4..8c38b62 100755 --- a/setup.py +++ b/setup.py @@ -40,7 +40,7 @@ except: setuptools.setup( name='warcprox', - version='2.4b2.dev172', + version='2.4b2.dev173', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/warcprox/dedup.py b/warcprox/dedup.py index 5d5039f..be91874 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -326,10 +326,9 @@ class CdxServerDedupLoader(warcprox.BaseBatchPostfetchProcessor, DedupableMixin) if self.outq: self.outq.put(recorded_url) -class BatchTroughStorer(warcprox.BaseBatchPostfetchProcessor, DedupableMixin): +class BatchTroughStorer(warcprox.BaseBatchPostfetchProcessor): def __init__(self, trough_dedup_db, options=warcprox.Options()): warcprox.BaseBatchPostfetchProcessor.__init__(self, options) - DedupableMixin.__init__(self, options) self.trough_dedup_db = trough_dedup_db def _filter_and_bucketize(self, batch): @@ -341,7 +340,7 @@ class BatchTroughStorer(warcprox.BaseBatchPostfetchProcessor, DedupableMixin): for recorded_url in batch: if (recorded_url.warc_records and recorded_url.warc_records[0].type == b'response' - and self.should_dedup(recorded_url)): + and self.trough_dedup_db.should_dedup(recorded_url)): if (recorded_url.warcprox_meta and 'dedup-bucket' in 
recorded_url.warcprox_meta): bucket = recorded_url.warcprox_meta['dedup-bucket'] @@ -373,10 +372,9 @@ class BatchTroughStorer(warcprox.BaseBatchPostfetchProcessor, DedupableMixin): logging.warn( 'timed out saving dedup info to trough', exc_info=True) -class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor, DedupableMixin): +class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor): def __init__(self, trough_dedup_db, options=warcprox.Options()): warcprox.BaseBatchPostfetchProcessor.__init__(self, options) - DedupableMixin.__init__(self, options) self.trough_dedup_db = trough_dedup_db def _startup(self): @@ -391,7 +389,7 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor, DedupableMixin): for recorded_url in batch: if (recorded_url.response_recorder and recorded_url.payload_digest - and self.should_dedup(recorded_url)): + and self.trough_dedup_db.should_dedup(recorded_url)): if (recorded_url.warcprox_meta and 'dedup-bucket' in recorded_url.warcprox_meta): bucket = recorded_url.warcprox_meta['dedup-bucket'] From 997d4341fe3d99c4d2e65845edff97eb817c130c Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 18 May 2018 17:29:38 -0700 Subject: [PATCH 08/33] add some debug logging in BatchTroughLoader --- setup.py | 2 +- warcprox/dedup.py | 21 ++++++++++++++++++++- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 8c38b62..9e7db14 100755 --- a/setup.py +++ b/setup.py @@ -40,7 +40,7 @@ except: setuptools.setup( name='warcprox', - version='2.4b2.dev173', + version='2.4b2.dev174', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/warcprox/dedup.py b/warcprox/dedup.py index be91874..83f3e0a 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -373,6 +373,8 @@ class BatchTroughStorer(warcprox.BaseBatchPostfetchProcessor): 'timed out saving dedup info to trough', exc_info=True) class 
BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor): + logger = logging.getLogger("warcprox.dedup.BatchTroughLoader") + def __init__(self, trough_dedup_db, options=warcprox.Options()): warcprox.BaseBatchPostfetchProcessor.__init__(self, options) self.trough_dedup_db = trough_dedup_db @@ -386,6 +388,7 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor): be looked up. ''' buckets = collections.defaultdict(list) + discards = [] for recorded_url in batch: if (recorded_url.response_recorder and recorded_url.payload_digest @@ -396,6 +399,13 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor): else: bucket = '__unspecified__' buckets[bucket].append(recorded_url) + else: + discards.append( + warcprox.digest_str( + recorded_url.payload_digest, self.options.base32) + if recorded_url.payload_digest else 'n/a') + self.logger.debug( + 'filtered out digests (not loading dedup): %r', discards) return buckets def _build_key_index(self, batch): @@ -443,10 +453,19 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor): 'problem looking up dedup info for %s urls ' 'in bucket %s', len(buckets[bucket]), bucket, exc_info=True) + + if self.logger.isEnabledFor(logging.DEBUG): + dups = sorted([e['digest_key'] for e in future.result()]) + novel = sorted([ + k for k in key_index.keys() if k not in dups]) + self.logger.debug( + 'bucket %s: dups=%r novel=%r', + bucket, dups, novel) + except futures.TimeoutError as e: # the remaining threads actually keep running in this case, # there's no way to stop them, but that should be harmless - logging.warn( + self.logger.warn( 'timed out loading dedup info from trough', exc_info=True) class TroughDedupDb(DedupDb, DedupableMixin): From b7ebc384915f87127dda0db554f7c6fb90194539 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 21 May 2018 22:18:28 +0000 Subject: [PATCH 09/33] rename README.rst -> readme.rst --- README.rst => readme.rst | 0 setup.py | 2 +- warcprox/main.py | 2 +- 3 files changed, 2 
insertions(+), 2 deletions(-) rename README.rst => readme.rst (100%) diff --git a/README.rst b/readme.rst similarity index 100% rename from README.rst rename to readme.rst diff --git a/setup.py b/setup.py index 9e7db14..4fc1cbf 100755 --- a/setup.py +++ b/setup.py @@ -45,7 +45,7 @@ setuptools.setup( url='https://github.com/internetarchive/warcprox', author='Noah Levitt', author_email='nlevitt@archive.org', - long_description=open('README.rst').read(), + long_description=open('readme.rst').read(), license='GPL', packages=['warcprox'], install_requires=deps, diff --git a/warcprox/main.py b/warcprox/main.py index 6fb46ef..5f45a13 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -193,7 +193,7 @@ def _build_arg_parser(prog='warcprox'): action='append', help=( 'Qualified name of plugin class, e.g. "mypkg.mymod.MyClass". ' 'May be used multiple times to register multiple plugins. ' - 'See README.rst for more information.')) + 'See readme.rst for more information.')) arg_parser.add_argument('--version', action='version', version="warcprox {}".format(warcprox.__version__)) arg_parser.add_argument('-v', '--verbose', dest='verbose', action='store_true') From efc51a43617eb791e9829dd5ecff00e8bbf6a946 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 22 May 2018 11:59:06 -0700 Subject: [PATCH 10/33] stubby api docs --- api.rst | 110 +++++++++++++++++++++++++++++++++++++++++++++++++++++ readme.rst | 23 ++++++++--- 2 files changed, 127 insertions(+), 6 deletions(-) create mode 100644 api.rst diff --git a/api.rst b/api.rst new file mode 100644 index 0000000..87b444f --- /dev/null +++ b/api.rst @@ -0,0 +1,110 @@ +warcprox API +************ + +Means of Interacting with warcprox over http, aside from simply proxying urls. + +`/status` url +============= + +If warcprox is running at localhost:8000, http://localhost:8000/status returns +a json blob with a bunch of status info. 
For example: + +:: + + $ curl -sS http://localhost:8000/status + { + "rates_5min": { + "warc_bytes_per_sec": 0.0, + "urls_per_sec": 0.0, + "actual_elapsed": 277.2983281612396 + }, + "version": "2.4b2.dev174", + "load": 0.0, + "seconds_behind": 0.0, + "threads": 100, + "warc_bytes_written": 0, + "port": 8000, + "postfetch_chain": [ + { + "queued_urls": 0, + "processor": "SkipFacebookCaptchas" + }, + { + "queued_urls": 0, + "processor": "BatchTroughLoader" + }, + { + "queued_urls": 0, + "processor": "WarcWriterProcessor" + }, + { + "queued_urls": 0, + "processor": "BatchTroughStorer" + }, + { + "queued_urls": 0, + "processor": "RethinkStatsProcessor" + }, + { + "queued_urls": 0, + "processor": "CrawlLogger" + }, + { + "queued_urls": 0, + "processor": "TroughFeed" + }, + { + "queued_urls": 0, + "processor": "RunningStats" + } + ], + "queue_max_size": 500, + "role": "warcprox", + "queued_urls": 0, + "active_requests": 1, + "host": "wbgrp-svc405.us.archive.org", + "rates_15min": { + "warc_bytes_per_sec": 0.0, + "urls_per_sec": 0.0, + "actual_elapsed": 876.9885368347168 + }, + "unaccepted_requests": 0, + "urls_processed": 0, + "pid": 18841, + "address": "127.0.0.1", + "rates_1min": { + "warc_bytes_per_sec": 0.0, + "urls_per_sec": 0.0, + "actual_elapsed": 54.92501664161682 + }, + "start_time": 1526690353.4060142 + } + +`WARCPROX_WRITE_RECORD` http method +=================================== + +:: + + $ echo -ne 'WARCPROX_WRITE_RECORD special://url/some?thing HTTP/1.1\r\nWARC-Type: resource\r\ncontent-type: text/plain;charset=utf-8\r\ncontent-length: 29\r\n\r\ni am a warc record payload!\r\n' | ncat 127.0.0.1 8000 + HTTP/1.0 204 OK + Server: BaseHTTP/0.6 Python/3.6.3 + Date: Mon, 21 May 2018 23:33:31 GMT + +:: + + WARC/1.0 + WARC-Type: resource + WARC-Record-ID: + WARC-Date: 2018-05-21T23:33:31Z + WARC-Target-URI: special://url/some?thing + WARC-Block-Digest: sha1:a282cfe127ab8d51b315ff3d31de18614979d0df + WARC-Payload-Digest: sha1:a282cfe127ab8d51b315ff3d31de18614979d0df 
+ Content-Type: text/plain;charset=utf-8 + Content-Length: 29 + + i am a warc record payload! + + +`Warcprox-Meta` http request header +=================================== + diff --git a/readme.rst b/readme.rst index 113099b..090130e 100644 --- a/readme.rst +++ b/readme.rst @@ -1,5 +1,5 @@ warcprox - WARC writing MITM HTTP/S proxy ------------------------------------------ +***************************************** .. image:: https://travis-ci.org/internetarchive/warcprox.svg?branch=master :target: https://travis-ci.org/internetarchive/warcprox @@ -7,7 +7,7 @@ Based on the excellent and simple pymiproxy by Nadeem Douba. https://github.com/allfro/pymiproxy Install -~~~~~~~ +======= Warcprox runs on python 3.4+. @@ -26,7 +26,7 @@ You can also install the latest bleeding edge code: Trusting the CA cert -~~~~~~~~~~~~~~~~~~~~ +==================== For best results while browsing through warcprox, you need to add the CA cert as a trusted cert in your browser. If you don't do that, you will @@ -34,8 +34,19 @@ get the warning when you visit each new site. But worse, any embedded https content on a different server will simply fail to load, because the browser will reject the certificate without telling you. +API +=== + +For interacting with a running instance of warcprox. + +* `/status` url +* `WARCPROX_WRITE_RECORD` http method +* `Warcprox-Meta` http request header + +See ``_. + Plugins -~~~~~~~ +======= Warcprox supports a limited notion of plugins by way of the `--plugin` command line argument. Plugin classes are loaded from the regular python module search @@ -49,7 +60,7 @@ specifying `--plugin` multiples times. `A minimal example `__ Usage -~~~~~ +===== :: @@ -162,7 +173,7 @@ Usage -q, --quiet License -~~~~~~~ +======= Warcprox is a derivative work of pymiproxy, which is GPL. Thus warcprox is also GPL. 
From 44ca939cb6a8ab28518748de5ac8f6bf0fbe12e5 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 22 May 2018 12:02:49 -0700 Subject: [PATCH 11/33] double the backticks --- api.rst | 6 +++--- readme.rst | 22 +++++++++++----------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/api.rst b/api.rst index 87b444f..77d7ca4 100644 --- a/api.rst +++ b/api.rst @@ -3,7 +3,7 @@ warcprox API Means of Interacting with warcprox over http, aside from simply proxying urls. -`/status` url +``/status`` url ============= If warcprox is running at localhost:8000, http://localhost:8000/status returns @@ -80,7 +80,7 @@ a json blob with a bunch of status info. For example: "start_time": 1526690353.4060142 } -`WARCPROX_WRITE_RECORD` http method +``WARCPROX_WRITE_RECORD`` http method =================================== :: @@ -105,6 +105,6 @@ a json blob with a bunch of status info. For example: i am a warc record payload! -`Warcprox-Meta` http request header +``Warcprox-Meta`` http request header =================================== diff --git a/readme.rst b/readme.rst index 090130e..ffeabca 100644 --- a/readme.rst +++ b/readme.rst @@ -39,23 +39,23 @@ API For interacting with a running instance of warcprox. -* `/status` url -* `WARCPROX_WRITE_RECORD` http method -* `Warcprox-Meta` http request header +* ``/status`` url +* ``WARCPROX_WRITE_RECORD`` http method +* ``Warcprox-Meta`` http request header See ``_. Plugins ======= -Warcprox supports a limited notion of plugins by way of the `--plugin` command -line argument. Plugin classes are loaded from the regular python module search -path. They will be instantiated with one argument, a `warcprox.Options`, which -holds the values of all the command line arguments. Legacy plugins with -constructors that take no arguments are also supported. Plugins should either -have a method `notify(self, recorded_url, records)` or should subclass -`warcprox.BasePostfetchProcessor`. 
More than one plugin can be configured by -specifying `--plugin` multiples times. +Warcprox supports a limited notion of plugins by way of the ``--plugin`` +command line argument. Plugin classes are loaded from the regular python module +search path. They will be instantiated with one argument, a +``warcprox.Options``, which holds the values of all the command line arguments. +Legacy plugins with constructors that take no arguments are also supported. +Plugins should either have a method ``notify(self, recorded_url, records)`` or +should subclass ``warcprox.BasePostfetchProcessor``. More than one plugin can +be configured by specifying ``--plugin`` multiples times. `A minimal example `__ From 36f6696552be5830e6ab823e563ea23b813bdb28 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 22 May 2018 15:00:10 -0700 Subject: [PATCH 12/33] fix failure message in test_return_capture_timestamp --- tests/test_warcprox.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 079fdd1..7c5253b 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -726,14 +726,16 @@ def test_return_capture_timestamp(http_daemon, warcprox_, archiving_proxies): response = requests.get(url, proxies=archiving_proxies, headers=headers, stream=True) assert response.status_code == 200 assert response.headers['Warcprox-Meta'] - data = json.loads(response.headers['Warcprox-Meta']) - assert data['capture-metadata'] + response_meta = json.loads(response.headers['Warcprox-Meta']) + assert response_meta['capture-metadata'] try: - dt = datetime.datetime.strptime(data['capture-metadata']['timestamp'], + dt = datetime.datetime.strptime(response_meta['capture-metadata']['timestamp'], '%Y-%m-%dT%H:%M:%SZ') assert dt except ValueError: - pytest.fail('Invalid capture-timestamp format %s', data['capture-timestamp']) + pytest.fail( + 'Invalid http response warcprox-meta["capture-metadata"]["timestamp"]: %r', + 
meta['capture-metadata']['timestamp']) # wait for postfetch chain (or subsequent test could fail) wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1) From b26a5d2d73355b5cbd271984f45ba5c93139508c Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 22 May 2018 15:00:36 -0700 Subject: [PATCH 13/33] starting to talk about warcprox-meta --- api.rst | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 61 insertions(+), 8 deletions(-) diff --git a/api.rst b/api.rst index 77d7ca4..43732aa 100644 --- a/api.rst +++ b/api.rst @@ -4,7 +4,7 @@ warcprox API Means of Interacting with warcprox over http, aside from simply proxying urls. ``/status`` url -============= +=============== If warcprox is running at localhost:8000, http://localhost:8000/status returns a json blob with a bunch of status info. For example: @@ -81,16 +81,27 @@ a json blob with a bunch of status info. For example: } ``WARCPROX_WRITE_RECORD`` http method -=================================== +===================================== -:: +To make warcprox write an arbitrary warc record you can send it a special +request with http method ``WARCPROX_WRITE_RECORD``. The http request must +include the headers ``WARC-Type``, ``Content-Type``, and ``Content-Length``. +Warcprox will use these to populate the warc record. For example:: - $ echo -ne 'WARCPROX_WRITE_RECORD special://url/some?thing HTTP/1.1\r\nWARC-Type: resource\r\ncontent-type: text/plain;charset=utf-8\r\ncontent-length: 29\r\n\r\ni am a warc record payload!\r\n' | ncat 127.0.0.1 8000 + $ ncat --crlf 127.0.0.1 8000 < WARCPROX_WRITE_RECORD special://url/some?thing HTTP/1.1 + > WARC-Type: resource + > Content-type: text/plain;charset=utf-8 + > Content-length: 29 + > + > i am a warc record payload! + > EOF HTTP/1.0 204 OK Server: BaseHTTP/0.6 Python/3.6.3 - Date: Mon, 21 May 2018 23:33:31 GMT + Date: Tue, 22 May 2018 19:21:02 GMT -:: +On success warcprox responds with http status 204. 
For the request above +warcprox will write a warc record that looks like this:: WARC/1.0 WARC-Type: resource @@ -104,7 +115,49 @@ a json blob with a bunch of status info. For example: i am a warc record payload! - ``Warcprox-Meta`` http request header -=================================== +===================================== +``Warcprox-Meta`` is a special http request header that can be used to pass +configuration information and metadata with each proxy request to warcprox. The +value is a json blob. There are several fields understood by warcprox, and +arbitrary additional fields can be included. If warcprox doesn't recognize a +field it simply ignores it. Warcprox plugins could make use of custom fields, +for example. + +Warcprox strips the ``warcprox-meta`` header out before sending the request to +remote server, and also does not write it in the warc request record. + +:: + + Warcprox-Meta: {} + +- warc-prefix +- stats + - buckets +- dedup-bucket +- blocks +- limits +- soft-limits +- metadata +- accept +- dedup-ok # deprecate? + +Brozzler knows about ``warcprox-meta``. For information on configuring +``warcprox-meta`` in brozzler, see https://github.com/internetarchive/brozzler/blob/master/job-conf.rst#warcprox-meta + +``Warcprox-Meta`` http response header +====================================== + +In some cases warcprox will add a ``Warcprox-Meta`` header in the http response +that it sends to the client. Like the request header, the value is a json blob. +It is only included if something in the ``warcprox-meta`` request header calls +for it. Those cases are described above in the "``Warcprox-Meta`` http request header" section. 
+ +### - blocked-by-rule +### - reached-limit +### - reached-soft-limit +### - stats +### - capture-metadata +### +### Response codes 420, 430 From b562170403f21e01ce341c5ff774678511b908fb Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 25 May 2018 10:32:42 -0700 Subject: [PATCH 14/33] explain deduplication --- readme.rst | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/readme.rst b/readme.rst index ffeabca..fd3cb8c 100644 --- a/readme.rst +++ b/readme.rst @@ -1,4 +1,4 @@ -warcprox - WARC writing MITM HTTP/S proxy +Warcprox - WARC writing MITM HTTP/S proxy ***************************************** .. image:: https://travis-ci.org/internetarchive/warcprox.svg?branch=master :target: https://travis-ci.org/internetarchive/warcprox @@ -6,9 +6,10 @@ warcprox - WARC writing MITM HTTP/S proxy Based on the excellent and simple pymiproxy by Nadeem Douba. https://github.com/allfro/pymiproxy +.. contents:: + Install ======= - Warcprox runs on python 3.4+. To install latest release run: @@ -27,27 +28,46 @@ You can also install the latest bleeding edge code: Trusting the CA cert ==================== - For best results while browsing through warcprox, you need to add the CA cert as a trusted cert in your browser. If you don't do that, you will get the warning when you visit each new site. But worse, any embedded https content on a different server will simply fail to load, because the browser will reject the certificate without telling you. +Deduplication +============= +Warcprox avoids archiving redundant content by "deduplicating" it. The process +for deduplication works similarly to heritrix and other web archiving tools. + +1. while fetching url, calculate payload content digest (typically sha1) +2. look up digest in deduplication database (warcprox supports a few different + ones) +3. if found write warc ``revisit`` record referencing the url and capture time + of the previous capture +4. else (if not found) + a. 
write warc ``response`` record with full payload + b. store entry in deduplication database + +The dedup database is partitioned into different "buckets". Urls are +deduplicated only against other captures in the same bucket. If specified, the +``dedup-bucket`` field of the ``Warcprox-Meta`` http request header determines +the bucket, otherwise the default bucket is used. + +Deduplication can be disabled entirely by starting warcprox with the argument +``--dedup-db-file=/dev/null``. + API === - For interacting with a running instance of warcprox. * ``/status`` url * ``WARCPROX_WRITE_RECORD`` http method -* ``Warcprox-Meta`` http request header +* ``Warcprox-Meta`` http request header and response header See ``_. Plugins ======= - Warcprox supports a limited notion of plugins by way of the ``--plugin`` command line argument. Plugin classes are loaded from the regular python module search path. They will be instantiated with one argument, a From 02e96188c3a743930992a5ba78b0eb509b62f647 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 25 May 2018 10:33:45 -0700 Subject: [PATCH 15/33] barely starting to flesh out warcprox-meta section --- api.rst | 57 ++++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 46 insertions(+), 11 deletions(-) diff --git a/api.rst b/api.rst index 43732aa..8937ade 100644 --- a/api.rst +++ b/api.rst @@ -1,7 +1,9 @@ warcprox API ************ -Means of Interacting with warcprox over http, aside from simply proxying urls. +Means of interacting with warcprox over http, aside from simply proxying urls. + +.. contents:: ``/status`` url =============== @@ -132,16 +134,49 @@ remote server, and also does not write it in the warc request record. Warcprox-Meta: {} -- warc-prefix -- stats - - buckets -- dedup-bucket -- blocks -- limits -- soft-limits -- metadata -- accept -- dedup-ok # deprecate? +Warcprox-Meta fields +------------------- + +``warc-prefix`` (string) +~~~~~~~~~~~~~~~~~~~~~~~~ +Specifies a warc filename prefix. 
Warcprox will write the warc record for this +capture, if any, to a warc named accordingly. + +Example:: + + Warcprox-Meta: {"warc-prefix": "special-warc"} + +``stats`` (dictionary) +~~~~~~~~~~~~~~~~~~~~~~ +* buckets + +Example:: + + Warcprox-Meta: {"stats":{"buckets":["my-stats-bucket","all-the-stats"]}} + +``dedup-bucket`` (string) +~~~~~~~~~~~~~~~~~~~~~~~~~ +Specifies the deduplication bucket. For more information about deduplication +see ``_. + +Example:: + + Warcprox-Meta: {"dedup-bucket":"my-dedup-bucket"} + +``blocks`` +~~~~~~~~~~ + +``limits`` +~~~~~~~~~~ + +``soft-limits`` +~~~~~~~~~~~~~~~ + +``metadata`` (dictionary) +~~~~~~~~~~~~~~~~~~~~~~~~~ + +``accept`` +~~~~~~~~~~ Brozzler knows about ``warcprox-meta``. For information on configuring ``warcprox-meta`` in brozzler, see https://github.com/internetarchive/brozzler/blob/master/job-conf.rst#warcprox-meta From 401de2260098981d86bf4bcc908e9f672cd55235 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 25 May 2018 14:46:19 -0700 Subject: [PATCH 16/33] short sectioni on stats --- readme.rst | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/readme.rst b/readme.rst index fd3cb8c..5cdd7cc 100644 --- a/readme.rst +++ b/readme.rst @@ -34,6 +34,16 @@ get the warning when you visit each new site. But worse, any embedded https content on a different server will simply fail to load, because the browser will reject the certificate without telling you. +API +=== +For interacting with a running instance of warcprox. + +* ``/status`` url +* ``WARCPROX_WRITE_RECORD`` http method +* ``Warcprox-Meta`` http request header and response header + +See ``_. + Deduplication ============= Warcprox avoids archiving redundant content by "deduplicating" it. The process @@ -56,15 +66,20 @@ the bucket, otherwise the default bucket is used. Deduplication can be disabled entirely by starting warcprox with the argument ``--dedup-db-file=/dev/null``. 
-API
-===
-For interacting with a running instance of warcprox.
+Statistics
+==========
+Warcprox keeps some crawl statistics and stores them in sqlite or rethinkdb.
+These are consulted when enforcing ``limits`` and ``soft-limits`` (see
+``_), and can also be consulted by other
+processes outside of warcprox, for reporting etc.
 
-* ``/status`` url
-* ``WARCPROX_WRITE_RECORD`` http method
-* ``Warcprox-Meta`` http request header and response header
+This is what they look like currently in sqlite, the default store::
 
-See ``_.
+    sqlite> select * from buckets_of_stats order by bucket desc;
+    bucket           stats
+    ---------------  ---------------------------------------------------------------------------------------------
+    __unspecified__  {"bucket":"__unspecified__","total":{"urls":37,"wire_bytes":1502781},"new":{"urls":15,"wire_bytes":1179906},"revisit":{"urls":22,"wire_bytes":322875}}
+    __all__          {"bucket":"__all__","total":{"urls":37,"wire_bytes":1502781},"new":{"urls":15,"wire_bytes":1179906},"revisit":{"urls":22,"wire_bytes":322875}}
 
 Plugins
 =======

From 4bd49b61a9929f9e89a1186ffccfb979257e3ac1 Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Fri, 25 May 2018 15:26:26 -0700
Subject: [PATCH 17/33] starting to explain some warcprox-meta fields

---
 api.rst           | 46 ++++++++++++++++++++++++++++++++++++++++++----
 warcprox/stats.py |  2 +-
 2 files changed, 43 insertions(+), 5 deletions(-)

diff --git a/api.rst b/api.rst
index 8937ade..3b2f0b3 100644
--- a/api.rst
+++ b/api.rst
@@ -134,6 +134,13 @@ remote server, and also does not write it in the warc request record.
 
     Warcprox-Meta: {}
 
+Brozzler knows about ``warcprox-meta``. For information on configuring
+it in brozzler, see
+`https://github.com/internetarchive/brozzler/blob/master/job-conf.rst#warcprox-meta`_.
+``Warcprox-Meta`` is often a very important part of brozzler job configuration.
+It is the way url and data quotas (limits) on jobs, seeds, and hosts are
+implemented, among other things.
+ Warcprox-Meta fields ------------------- @@ -148,11 +155,24 @@ Example:: ``stats`` (dictionary) ~~~~~~~~~~~~~~~~~~~~~~ -* buckets +``stats`` is a dictionary with only one field understood by warcprox, +``"buckets"``. The value of ``"buckets"`` is a list of strings and/or +dictionaries. A string signifies the name of the bucket; a dictionary is +expected to have at least an item with key ``"bucket"`` whose value is the name +of the bucket. The other currently recognized key is ``"tally-domains"``, which +if supplied should be a list of domains. This instructs warcprox to +additionally tally substats of the given bucket by domain. Host stats are +stored in the stats table under the key +``{parent-bucket}:{domain(normalized)}``, e.g. `"bucket2:foo.bar.com"` for the +example below. -Example:: +Examples:: Warcprox-Meta: {"stats":{"buckets":["my-stats-bucket","all-the-stats"]}} + Warcprox-Meta: {"stats":{"buckets":["bucket1",{"bucket":"bucket2","tally-domains":["foo.bar.com","192.168.10.20"}]}} + +See ``_ for more information on statistics kept by +warcprox. ``dedup-bucket`` (string) ~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -166,20 +186,38 @@ Example:: ``blocks`` ~~~~~~~~~~ +Example:: + + Warcprox-Meta: {"blocks": [{"ssurt": "com,example,//https:/"}, {"domain": "malware.us", "substring": "wp-login.php?action=logout"}]} + ``limits`` ~~~~~~~~~~ +Example:: + + {"stats": {"buckets": ["test_limits_bucket"]}, "limits": {"test_limits_bucket/total/urls": 10}} + ``soft-limits`` ~~~~~~~~~~~~~~~ +Example:: + + Warcprox-Meta: {"stats": {"buckets": [{"bucket": "test_domain_doc_limit_bucket", "tally-domains": ["foo.localhost"]}]}, "soft-limits": {"test_domain_doc_limit_bucket:foo.localhost/total/urls": 10}} + + ``metadata`` (dictionary) ~~~~~~~~~~~~~~~~~~~~~~~~~ +Example:: + + Warcprox-Meta: {"metadata": {"seed": "http://example.com/seed", "description": "here's some information about this crawl job. blah blah"} + ``accept`` ~~~~~~~~~~ -Brozzler knows about ``warcprox-meta``. 
For information on configuring -``warcprox-meta`` in brozzler, see https://github.com/internetarchive/brozzler/blob/master/job-conf.rst#warcprox-meta +Example:: + + request_meta = {"accept": ["capture-metadata"]} ``Warcprox-Meta`` http response header ====================================== diff --git a/warcprox/stats.py b/warcprox/stats.py index db2493c..4de5fef 100644 --- a/warcprox/stats.py +++ b/warcprox/stats.py @@ -166,7 +166,7 @@ class StatsProcessor(warcprox.BaseBatchPostfetchProcessor): Example Warcprox-Meta header (a real one will likely have other sections besides 'stats'): - Warcprox-Meta: {'stats':{'buckets':['bucket1',{'bucket':'bucket2','tally-domains':['foo.bar.com','192.168.10.20'}]}} + Warcprox-Meta: {"stats":{"buckets":["bucket1",{"bucket":"bucket2","tally-domains":["foo.bar.com","192.168.10.20"}]}} ''' buckets = ["__all__"] if (recorded_url.warcprox_meta From 2c850876e8207aaf9f5162898233d493aa392348 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 25 May 2018 16:06:12 -0700 Subject: [PATCH 18/33] explain warcprox-meta "blocks" --- api.rst | 42 +++++++++++++++++++++++++++++++++++++----- warcprox/warcproxy.py | 6 +++--- 2 files changed, 40 insertions(+), 8 deletions(-) diff --git a/api.rst b/api.rst index 3b2f0b3..d9419b3 100644 --- a/api.rst +++ b/api.rst @@ -138,8 +138,8 @@ Brozzler knows about ``warcprox-meta``. For information on configuring it in brozzler, see `https://github.com/internetarchive/brozzler/blob/master/job-conf.rst#warcprox-meta`_. ``Warcprox-Meta`` is often a very important part of brozzler job configuration. -It is the way url and data quotas (limits) on jobs, seeds, and hosts are -implemented, among other things. +It is the way url and data limits on jobs, seeds, and hosts are implemented, +among other things. Warcprox-Meta fields ------------------- @@ -183,12 +183,44 @@ Example:: Warcprox-Meta: {"dedup-bucket":"my-dedup-bucket"} -``blocks`` -~~~~~~~~~~ +``blocks`` (list) +~~~~~~~~~~~~~~~~~ +List of url match rules. 
Url match rules are somewhat described at +https://github.com/internetarchive/brozzler/blob/master/job-conf.rst#scoping +and https://github.com/iipc/urlcanon/blob/e2ab3524e/python/urlcanon/rules.py#L70. +(TODO: write a better doc and link to it) Example:: - Warcprox-Meta: {"blocks": [{"ssurt": "com,example,//https:/"}, {"domain": "malware.us", "substring": "wp-login.php?action=logout"}]} + Warcprox-Meta: {"blocks": [{"ssurt": "com,example,//http:/"}, {"domain": "malware.us", "substring": "wp-login.php?action=logout"}]} + +If any of the rules match the url being requested, warcprox aborts normal +processing and responds with a http 403. The http response includes +a ``Warcprox-Meta`` **response** header with one field, `"blocked-by-rule"`, +which reproduces the value of the match rule that resulted in the block. The +presence of the ``warcprox-meta`` response header can be used by the client to +distinguish this type of a response from a 403 from the remote url being +requested. + +For example:: + + $ curl -iksS --proxy localhost:8000 --header 'Warcprox-Meta: {"blocks": [{"ssurt": "com,example,//http:/"}, {"domain": "malware.us", "substring": "wp-login.php?action=logout"}]}' http://example.com/foo + HTTP/1.0 403 Forbidden + Server: BaseHTTP/0.6 Python/3.6.3 + Date: Fri, 25 May 2018 22:46:42 GMT + Content-Type: text/plain;charset=utf-8 + Connection: close + Content-Length: 111 + Warcprox-Meta: {"blocked-by-rule":{"ssurt":"com,example,//http:/"}} + + request rejected by warcprox: blocked by rule found in Warcprox-Meta header: {"ssurt": "com,example,//http:/"} + +You might be wondering why ``blocks`` is necessary. Why would the warcprox +client make a request that it should already know will be blocked by the proxy? +The answer is that the request may be initiated somewhere where it's not +possible, or at least not convenient, to evaluate the block rules. 
In +particular, this circumstance prevails when the browser controlled by brozzler +is requesting images, javascript, css, and so on, embedded in a page. ``limits`` ~~~~~~~~~~ diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 0d93e5c..2050807 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -72,13 +72,13 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): block_rule = urlcanon.MatchRule(**rule) if block_rule.applies(url): body = ("request rejected by warcprox: blocked by " - "rule found in Warcprox-Meta header: %s" - % rule).encode("utf-8") + "rule found in Warcprox-Meta header: %s\n" + % json.dumps(rule)).encode("utf-8") self.send_response(403, "Forbidden") self.send_header("Content-Type", "text/plain;charset=utf-8") self.send_header("Connection", "close") self.send_header("Content-Length", len(body)) - response_meta = {"blocked-by-rule":rule} + response_meta = {"blocked-by-rule": rule} self.send_header( "Warcprox-Meta", json.dumps(response_meta, separators=(",",":"))) From 1e76ed33027833e95f4a78eb664e7d2b0e545887 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 25 May 2018 16:38:19 -0700 Subject: [PATCH 19/33] working on "limits" and "soft-limits" --- api.rst | 52 +++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 45 insertions(+), 7 deletions(-) diff --git a/api.rst b/api.rst index d9419b3..71cc59a 100644 --- a/api.rst +++ b/api.rst @@ -222,20 +222,57 @@ possible, or at least not convenient, to evaluate the block rules. In particular, this circumstance prevails when the browser controlled by brozzler is requesting images, javascript, css, and so on, embedded in a page. 
-``limits``
-~~~~~~~~~~
+``limits`` (dictionary)
+~~~~~~~~~~~~~~~~~~~~~~~
 
 Example::
 
     {"stats": {"buckets": ["test_limits_bucket"]}, "limits": {"test_limits_bucket/total/urls": 10}}
 
-``soft-limits``
-~~~~~~~~~~~~~~~
+::
+
+    $ curl -iksS --proxy localhost:8000 --header 'Warcprox-Meta: {"stats": {"buckets": ["test_limits_bucket"]}, "limits": {"test_limits_bucket/total/urls": 10}}' http://example.com/foo
+    HTTP/1.0 420 Reached limit
+    Server: BaseHTTP/0.6 Python/3.6.3
+    Date: Fri, 25 May 2018 23:08:32 GMT
+    Content-Type: text/plain;charset=utf-8
+    Connection: close
+    Content-Length: 77
+    Warcprox-Meta: {"stats":{"test_limits_bucket":{"bucket":"test_limits_bucket","total":{"urls":10,"wire_bytes":15840},"new":{"urls":0,"wire_bytes":0},"revisit":{"urls":10,"wire_bytes":15840}}},"reached-limit":{"test_limits_bucket/total/urls":10}}
+
+    request rejected by warcprox: reached limit test_limits_bucket/total/urls=10
+
+``soft-limits`` (dictionary)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+From warcprox's perspective ``soft-limits`` work almost exactly the same way
+as ``limits``. The only difference is that when a soft limit is hit, warcprox
+responds with an http 430 "Reached soft limit" instead of http 420.
+
+Warcprox clients might treat a 430 very differently from a 420. From brozzler's
+perspective, for instance, ``soft-limits`` are very different from ``limits``.
+When brozzler receives a 420 from warcprox because a ``limit`` has been
+reached, this means that crawling for that seed is finished, and brozzler sets
+about finalizing the crawl of that seed. On the other hand, brozzler blissfully
+ignores 430 responses, because soft limits only apply to a particular bucket
+(like a domain), and don't have any effect on crawling of urls that don't fall
+in that bucket.
Example:: Warcprox-Meta: {"stats": {"buckets": [{"bucket": "test_domain_doc_limit_bucket", "tally-domains": ["foo.localhost"]}]}, "soft-limits": {"test_domain_doc_limit_bucket:foo.localhost/total/urls": 10}} +:: + + $ curl -iksS --proxy localhost:8000 --header 'Warcprox-Meta: {"stats": {"buckets": ["test_limits_bucket"]}, "soft-limits": {"test_limits_bucket/total/urls": 10}}' http://example.com/foo + HTTP/1.0 430 Reached soft limit + Server: BaseHTTP/0.6 Python/3.6.3 + Date: Fri, 25 May 2018 23:12:06 GMT + Content-Type: text/plain;charset=utf-8 + Connection: close + Content-Length: 82 + Warcprox-Meta: {"stats":{"test_limits_bucket":{"bucket":"test_limits_bucket","total":{"urls":10,"wire_bytes":15840},"new":{"urls":0,"wire_bytes":0},"revisit":{"urls":10,"wire_bytes":15840}}},"reached-soft-limit":{"test_limits_bucket/total/urls":10}} + + request rejected by warcprox: reached soft limit test_limits_bucket/total/urls=10 ``metadata`` (dictionary) ~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -244,8 +281,8 @@ Example:: Warcprox-Meta: {"metadata": {"seed": "http://example.com/seed", "description": "here's some information about this crawl job. blah blah"} -``accept`` -~~~~~~~~~~ +``accept`` (list) +~~~~~~~~~~~~~~~~~ Example:: @@ -257,7 +294,8 @@ Example:: In some cases warcprox will add a ``Warcprox-Meta`` header in the http response that it sends to the client. Like the request header, the value is a json blob. It is only included if something in the ``warcprox-meta`` request header calls -for it. Those cases are described above in the "``Warcprox-Meta`` http request header" section. +for it. Those cases are described above in the +`#warcprox-meta-http-request-header`_ section. 
### - blocked-by-rule ### - reached-limit From 195faa5cff2006811cab5b92c9151ab1605de1c4 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 25 May 2018 17:35:32 -0700 Subject: [PATCH 20/33] new checks exposing bug in limits enforcement --- tests/test_warcprox.py | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 7c5253b..0e60319 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -709,6 +709,7 @@ def test_limits(http_daemon, warcprox_, archiving_proxies): # wait for postfetch chain wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 10) + # next fetch hits the limit response = requests.get(url, proxies=archiving_proxies, headers=headers, stream=True) assert response.status_code == 420 assert response.reason == "Reached limit" @@ -717,6 +718,14 @@ def test_limits(http_daemon, warcprox_, archiving_proxies): assert response.headers["content-type"] == "text/plain;charset=utf-8" assert response.raw.data == b"request rejected by warcprox: reached limit test_limits_bucket/total/urls=10\n" + # make sure limit doesn't get applied to a different stats bucket + request_meta = {"stats":{"buckets":["no_limits_bucket"]},"limits":{"test_limits_bucket/total/urls":10}} + headers = {"Warcprox-Meta": json.dumps(request_meta)} + response = requests.get(url, proxies=archiving_proxies, headers=headers, stream=True) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'i!' + assert response.content == b'I am the warcprox test payload! 
jjjjjjjjjj!\n' + def test_return_capture_timestamp(http_daemon, warcprox_, archiving_proxies): urls_before = warcprox_.proxy.running_stats.urls @@ -999,6 +1008,7 @@ def test_domain_doc_soft_limit( http_daemon, https_daemon, warcprox_, archiving_proxies): urls_before = warcprox_.proxy.running_stats.urls + # ** comment is obsolete (server is multithreaded) but still useful ** # we need to clear the connection pool here because # - connection pool already may already have an open connection localhost # - we're about to make a connection to foo.localhost @@ -1134,6 +1144,20 @@ def test_domain_doc_soft_limit( assert response.headers["content-type"] == "text/plain;charset=utf-8" assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_doc_limit_bucket:foo.localhost/total/urls=10\n" + # make sure soft limit doesn't get applied to a different stats bucket + request_meta = { + "stats": {"buckets": [{"bucket":"no_limit_bucket","tally-domains":["foo.localhost"]}]}, + "soft-limits": {"test_domain_doc_limit_bucket:foo.localhost/total/urls":10}, + } + headers = {"Warcprox-Meta": json.dumps(request_meta)} + url = 'http://zuh.foo.localhost:{}/o/p'.format(http_daemon.server_port) + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True, + verify=False) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'o!' + assert response.content == b'I am the warcprox test payload! 
pppppppppp!\n' + def test_domain_data_soft_limit( http_daemon, https_daemon, warcprox_, archiving_proxies): urls_before = warcprox_.proxy.running_stats.urls @@ -1228,6 +1252,19 @@ def test_domain_data_soft_limit( ### assert response.headers["content-type"] == "text/plain;charset=utf-8" ### assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_data_limit_bucket:xn--zz-2ka.localhost/new/wire_bytes=200\n" + # make sure soft limit doesn't get applied to a different stats bucket + request_meta = { + "stats": {"buckets": [{"bucket":"no_limit_bucket","tally-domains":['ÞzZ.LOCALhost']}]}, + "soft-limits": {"test_domain_data_limit_bucket:ÞZZ.localhost/new/wire_bytes":200}, + } + headers = {"Warcprox-Meta": json.dumps(request_meta)} + url = 'http://ÞZz.localhost:{}/y/z'.format(http_daemon.server_port) + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'y!' + assert response.content == b'I am the warcprox test payload! zzzzzzzzzz!\n' + # XXX this test relies on a tor proxy running at localhost:9050 with a working # connection to the internet, and relies on a third party site (facebook) being # up and behaving a certain way From 07dc978f093d1a48eb90daf1e60a8dc232611bc8 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 25 May 2018 17:36:26 -0700 Subject: [PATCH 21/33] docs still in progress --- api.rst | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/api.rst b/api.rst index 71cc59a..f3f958a 100644 --- a/api.rst +++ b/api.rst @@ -136,7 +136,7 @@ remote server, and also does not write it in the warc request record. Brozzler knows about ``warcprox-meta``. For information on configuring it in brozzler, see -`https://github.com/internetarchive/brozzler/blob/master/job-conf.rst#warcprox-meta`_. 
+https://github.com/internetarchive/brozzler/blob/master/job-conf.rst#warcprox-meta. ``Warcprox-Meta`` is often a very important part of brozzler job configuration. It is the way url and data limits on jobs, seeds, and hosts are implemented, among other things. @@ -156,14 +156,14 @@ Example:: ``stats`` (dictionary) ~~~~~~~~~~~~~~~~~~~~~~ ``stats`` is a dictionary with only one field understood by warcprox, -``"buckets"``. The value of ``"buckets"`` is a list of strings and/or +``buckets``. The value of ``buckets`` is a list of strings and/or dictionaries. A string signifies the name of the bucket; a dictionary is -expected to have at least an item with key ``"bucket"`` whose value is the name -of the bucket. The other currently recognized key is ``"tally-domains"``, which +expected to have at least an item with key ``bucket`` whose value is the name +of the bucket. The other currently recognized key is ``tally-domains``, which if supplied should be a list of domains. This instructs warcprox to additionally tally substats of the given bucket by domain. Host stats are stored in the stats table under the key -``{parent-bucket}:{domain(normalized)}``, e.g. `"bucket2:foo.bar.com"` for the +``{parent-bucket}:{domain(normalized)}``, e.g. ``"bucket2:foo.bar.com"`` for the example below. Examples:: @@ -196,13 +196,13 @@ Example:: If any of the rules match the url being requested, warcprox aborts normal processing and responds with a http 403. The http response includes -a ``Warcprox-Meta`` **response** header with one field, `"blocked-by-rule"`, +a ``Warcprox-Meta`` **response** header with one field, ``blocked-by-rule``, which reproduces the value of the match rule that resulted in the block. The presence of the ``warcprox-meta`` response header can be used by the client to distinguish this type of a response from a 403 from the remote url being requested. 
-For example:: +An example:: $ curl -iksS --proxy localhost:8000 --header 'Warcprox-Meta: {"blocks": [{"ssurt": "com,example,//http:/"}, {"domain": "malware.us", "substring": "wp-login.php?action=logout"}]}' http://example.com/foo HTTP/1.0 403 Forbidden @@ -217,10 +217,10 @@ For example:: You might be wondering why ``blocks`` is necessary. Why would the warcprox client make a request that it should already know will be blocked by the proxy? -The answer is that the request may be initiated somewhere where it's not -possible, or at least not convenient, to evaluate the block rules. In -particular, this circumstance prevails when the browser controlled by brozzler -is requesting images, javascript, css, and so on, embedded in a page. +The answer is that the request may be initiated somewhere where it's difficult +to evaluate the block rules. In particular, this circumstance prevails when the +browser controlled by brozzler is requesting images, javascript, css, and so +on, embedded in a page. ``limits`` (dictionary) ~~~~~~~~~~~~~~~~~~~~~~~ From d9e0ed31f28111d295f2ec51594cac42566adced Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 29 May 2018 12:18:51 -0700 Subject: [PATCH 22/33] fix bug in limits enforcement enforce limit only if url is in stats bucket that limit applies to! --- tests/test_warcprox.py | 6 +++ warcprox/stats.py | 88 +++++++++++++++++++++++------------------- warcprox/warcproxy.py | 40 +++++++++---------- 3 files changed, 74 insertions(+), 60 deletions(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 0e60319..0deecc6 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1158,6 +1158,9 @@ def test_domain_doc_soft_limit( assert response.headers['warcprox-test-header'] == 'o!' assert response.content == b'I am the warcprox test payload! 
pppppppppp!\n' + # wait for postfetch chain + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 22) + def test_domain_data_soft_limit( http_daemon, https_daemon, warcprox_, archiving_proxies): urls_before = warcprox_.proxy.running_stats.urls @@ -1265,6 +1268,9 @@ def test_domain_data_soft_limit( assert response.headers['warcprox-test-header'] == 'y!' assert response.content == b'I am the warcprox test payload! zzzzzzzzzz!\n' + # wait for postfetch chain + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 5) + # XXX this test relies on a tor proxy running at localhost:9050 with a working # connection to the internet, and relies on a third party site (facebook) being # up and behaving a certain way diff --git a/warcprox/stats.py b/warcprox/stats.py index 4de5fef..85539e2 100644 --- a/warcprox/stats.py +++ b/warcprox/stats.py @@ -53,6 +53,53 @@ def _empty_bucket(bucket): }, } +def unravel_buckets(url, warcprox_meta): + ''' + Unravels bucket definitions in Warcprox-Meta header. Each bucket + definition can either be a string, which signifies the name of the + bucket, or a dict. If a dict it is expected to have at least an item + with key 'bucket' whose value is the name of the bucket. The other + currently recognized item is 'tally-domains', which if supplied should + be a list of domains. This instructs warcprox to additionally tally + substats of the given bucket by domain. Host stats are stored in the + stats table under the key '{parent-bucket}:{domain(normalized)}'. 
+ + Returns: + list of strings + + Example Warcprox-Meta header (a real one will likely have other + sections besides 'stats'): + + Warcprox-Meta: {"stats":{"buckets":["bucket1",{"bucket":"bucket2","tally-domains":["foo.bar.com","192.168.10.20"}]}} + + In this case the return value would be + ["bucket1","bucket2","bucket2:foo.bar.com","bucket2:192.168.10.20"] + ''' + buckets = ["__all__"] + if (warcprox_meta and "stats" in warcprox_meta + and "buckets" in warcprox_meta["stats"]): + for bucket in warcprox_meta["stats"]["buckets"]: + if isinstance(bucket, dict): + if not 'bucket' in bucket: + self.logger.warn( + 'ignoring invalid stats bucket in ' + 'warcprox-meta header %s', bucket) + continue + buckets.append(bucket['bucket']) + if bucket.get('tally-domains'): + canon_url = urlcanon.semantic(url) + for domain in bucket['tally-domains']: + domain = urlcanon.normalize_host(domain).decode('ascii') + if urlcanon.url_matches_domain(canon_url, domain): + buckets.append( + '%s:%s' % (bucket['bucket'], domain)) + else: + buckets.append(bucket) + else: + buckets.append("__unspecified__") + + return buckets + class StatsProcessor(warcprox.BaseBatchPostfetchProcessor): logger = logging.getLogger("warcprox.stats.StatsProcessor") @@ -153,46 +200,7 @@ class StatsProcessor(warcprox.BaseBatchPostfetchProcessor): return None def buckets(self, recorded_url): - ''' - Unravels bucket definitions in Warcprox-Meta header. Each bucket - definition can either be a string, which signifies the name of the - bucket, or a dict. If a dict it is expected to have at least an item - with key 'bucket' whose value is the name of the bucket. The other - currently recognized item is 'tally-domains', which if supplied should - be a list of domains. This instructs warcprox to additionally tally - substats of the given bucket by domain. Host stats are stored in the - stats table under the key '{parent-bucket}:{domain(normalized)}'. 
- - Example Warcprox-Meta header (a real one will likely have other - sections besides 'stats'): - - Warcprox-Meta: {"stats":{"buckets":["bucket1",{"bucket":"bucket2","tally-domains":["foo.bar.com","192.168.10.20"}]}} - ''' - buckets = ["__all__"] - if (recorded_url.warcprox_meta - and "stats" in recorded_url.warcprox_meta - and "buckets" in recorded_url.warcprox_meta["stats"]): - for bucket in recorded_url.warcprox_meta["stats"]["buckets"]: - if isinstance(bucket, dict): - if not 'bucket' in bucket: - self.logger.warn( - 'ignoring invalid stats bucket in ' - 'warcprox-meta header %s', bucket) - continue - buckets.append(bucket['bucket']) - if bucket.get('tally-domains'): - url = urlcanon.semantic(recorded_url.url) - for domain in bucket['tally-domains']: - domain = urlcanon.normalize_host(domain).decode('ascii') - if urlcanon.url_matches_domain(url, domain): - buckets.append( - '%s:%s' % (bucket['bucket'], domain)) - else: - buckets.append(bucket) - else: - buckets.append("__unspecified__") - - return buckets + return unravel_buckets(recorded_url.url, recorded_url.warcprox_meta) class RethinkStatsProcessor(StatsProcessor): logger = logging.getLogger("warcprox.stats.RethinkStatsProcessor") diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 2050807..417f450 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -92,26 +92,26 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): self.client_address[0], self.command, self.url, rule)) - def _enforce_limit(self, limit_key, limit_value, soft=False): + def _enforce_limit(self, buckets, limit_key, limit_value, soft=False): if not self.server.stats_db: return - bucket0, bucket1, bucket2 = limit_key.rsplit("/", 2) - _limit_key = limit_key - # if limit_key looks like 'job1:foo.com/total/urls' then we only want - # to apply this rule if the requested url is within domain - bucket0_fields = bucket0.split(':') - if len(bucket0_fields) == 2: - domain = urlcanon.normalize_host(bucket0_fields[1]) 
- if not urlcanon.host_matches_domain(self.hostname, domain): - return # else host matches, go ahead and enforce the limit - bucket0 = '%s:%s' % (bucket0_fields[0], domain.decode('ascii')) - _limit_key = '%s/%s/%s' % (bucket0, bucket1, bucket2) + # parse limit key + bucket0, bucket1, bucket2 = limit_key.rsplit("/", 2) + # normalize domain if part of bucket + if ":" in bucket0: + b, raw_domain = bucket0.split(":", 1) + domain = urlcanon.normalize_host(raw_domain).decode("ascii") + bucket0 = "%s:%s" % (b, domain) + limit_key = "%s/%s/%s" % (bucket0, bucket1, bucket2) + + if not bucket0 in buckets: + return value = self.server.stats_db.value(bucket0, bucket1, bucket2) if value and limit_value and limit_value > 0 and value >= limit_value: body = ("request rejected by warcprox: reached %s %s=%s\n" % ( - "soft limit" if soft else "limit", _limit_key, + "soft limit" if soft else "limit", limit_key, limit_value)).encode("utf-8") if soft: self.send_response(430, "Reached soft limit") @@ -124,12 +124,11 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): "stats": {bucket0:self.server.stats_db.value(bucket0)} } if soft: - response_meta["reached-soft-limit"] = {_limit_key:limit_value} + response_meta["reached-soft-limit"] = {limit_key:limit_value} else: - response_meta["reached-limit"] = {_limit_key:limit_value} + response_meta["reached-limit"] = {limit_key:limit_value} self.send_header( - "Warcprox-Meta", - json.dumps(response_meta, separators=(",",":"))) + "Warcprox-Meta", json.dumps(response_meta, separators=",:")) self.end_headers() if self.command != "HEAD": self.wfile.write(body) @@ -139,7 +138,7 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): self.client_address[0], 430 if soft else 420, self.command, self.url, "soft limit" if soft else "limit", - _limit_key, limit_value)) + limit_key, limit_value)) def _enforce_limits(self, warcprox_meta): """ @@ -147,14 +146,15 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): 
warcprox.RequestBlockedByRule if a limit specified in warcprox_meta is reached. """ + buckets = warcprox.stats.unravel_buckets(self.url, warcprox_meta) if warcprox_meta and "limits" in warcprox_meta: for item in warcprox_meta["limits"].items(): limit_key, limit_value = item - self._enforce_limit(limit_key, limit_value, soft=False) + self._enforce_limit(buckets, limit_key, limit_value, soft=False) if warcprox_meta and "soft-limits" in warcprox_meta: for item in warcprox_meta["soft-limits"].items(): limit_key, limit_value = item - self._enforce_limit(limit_key, limit_value, soft=True) + self._enforce_limit(buckets, limit_key, limit_value, soft=True) def _security_check(self, warcprox_meta): ''' From 6256ec6a07b1d402def96d02cfb5bff0f260f823 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 29 May 2018 13:08:34 -0700 Subject: [PATCH 23/33] add another "wait" to fix failing test --- tests/test_warcprox.py | 3 +++ warcprox/stats.py | 2 -- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 0deecc6..13b6bad 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -726,6 +726,9 @@ def test_limits(http_daemon, warcprox_, archiving_proxies): assert response.headers['warcprox-test-header'] == 'i!' assert response.content == b'I am the warcprox test payload! 
jjjjjjjjjj!\n' + # wait for postfetch chain + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 11) + def test_return_capture_timestamp(http_daemon, warcprox_, archiving_proxies): urls_before = warcprox_.proxy.running_stats.urls diff --git a/warcprox/stats.py b/warcprox/stats.py index 85539e2..64ff2d7 100644 --- a/warcprox/stats.py +++ b/warcprox/stats.py @@ -309,11 +309,9 @@ class RunningStats: need_ten_sec_snap = (now - self.ten_sec_snaps[0][0]) // 10 > (self.ten_sec_snaps[-1][0] - self.ten_sec_snaps[0][0]) // 10 if need_minute_snap: self.minute_snaps.append((now, self.urls, self.warc_bytes)) - logging.debug('added minute snap %r', self.minute_snaps[-1]) if need_ten_sec_snap: self.ten_sec_snaps.popleft() self.ten_sec_snaps.append((now, self.urls, self.warc_bytes)) - logging.trace('rotated in ten second snap %r', self.ten_sec_snaps[-1]) def _closest_ten_sec_snap(self, t): # it's a deque so iterating over it is faster than indexed lookup From 8877259b7d7421ea4323a396d392d958697c4b8b Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 29 May 2018 16:57:15 -0700 Subject: [PATCH 24/33] more progress on documenting "limits" --- api.rst | 4 ++++ readme.rst | 23 +++++++++++++++++++++-- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/api.rst b/api.rst index f3f958a..6104b53 100644 --- a/api.rst +++ b/api.rst @@ -224,6 +224,10 @@ on, embedded in a page. ``limits`` (dictionary) ~~~~~~~~~~~~~~~~~~~~~~~ +Specifies quantitative limits for warcprox to enforce. The structure of the +dictionary is ``{stats_key: numerical_limit, ...}`` where stats key has the +format ``"bucket/sub-bucket/statistic"``. See `readme.rst#statistics`_ for +further explanation of what "bucket", "sub-bucket", and "statistic" mean here. 
Example:: diff --git a/readme.rst b/readme.rst index 5cdd7cc..44ae1bb 100644 --- a/readme.rst +++ b/readme.rst @@ -69,11 +69,30 @@ Deduplication can be disabled entirely by starting warcprox with the argument Statistics ========== Warcprox keeps some crawl statistics and stores them in sqlite or rethinkdb. -These are consulting when enforcing ``limits`` and ``soft-limits`` (see +These are consulted for enforcing ``limits`` and ``soft-limits`` (see ``_), and can also be consulted by other processes outside of warcprox, for reporting etc. -This is what they look like currently in sqlite, the default store:: +Statistics are grouped by "bucket". Every capture is counted as part of the +``__all__`` bucket. Other buckets can be specified in the ``Warcprox-Meta`` +request header. The fallback bucket in case none is specified is called +``__unspecified__``. + +Within each bucket are three sub-buckets: +* "new" - tallies captures for which a complete record (usually a ``response`` + record) was written to warc +* "revisit" - tallies captures for which a ``revisit`` record was written to + warc +* "total" - includes all urls processed, even those not written to warc (so the + numbers may be greater than new + revisit) + +Within each of these sub-buckets we keep two statistics: +* urls - simple count of urls +* wire_bytes - sum of bytes received over the wire from the remote server for + each url + +For historical reasons, statistics are stored as json blobs in sqlite, the +default store:: sqlite> select * from buckets_of_stats order by bucket desc; bucket stats From 4a87a08230ed6ce07c105ca269e035c65ac03d5b Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 29 May 2018 17:09:14 -0700 Subject: [PATCH 25/33] fixlets --- readme.rst | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/readme.rst b/readme.rst index 44ae1bb..4f7044f 100644 --- a/readme.rst +++ b/readme.rst @@ -3,7 +3,7 @@ Warcprox - WARC writing MITM HTTP/S proxy .. 
image:: https://travis-ci.org/internetarchive/warcprox.svg?branch=master :target: https://travis-ci.org/internetarchive/warcprox -Based on the excellent and simple pymiproxy by Nadeem Douba. +Originally based on the excellent and simple pymiproxy by Nadeem Douba. https://github.com/allfro/pymiproxy .. contents:: @@ -52,9 +52,10 @@ for deduplication works similarly to heritrix and other web archiving tools. 1. while fetching url, calculate payload content digest (typically sha1) 2. look up digest in deduplication database (warcprox supports a few different ones) -3. if found write warc ``revisit`` record referencing the url and capture time +3. if found, write warc ``revisit`` record referencing the url and capture time of the previous capture -4. else (if not found) +4. else (if not found), + a. write warc ``response`` record with full payload b. store entry in deduplication database @@ -79,22 +80,24 @@ request header. The fallback bucket in case none is specified is called ``__unspecified__``. 
Within each bucket are three sub-buckets: -* "new" - tallies captures for which a complete record (usually a ``response`` + +* ``new`` - tallies captures for which a complete record (usually a ``response`` record) was written to warc -* "revisit" - tallies captures for which a ``revisit`` record was written to +* ``revisit`` - tallies captures for which a ``revisit`` record was written to warc -* "total" - includes all urls processed, even those not written to warc (so the +* ``total`` - includes all urls processed, even those not written to warc (so the numbers may be greater than new + revisit) Within each of these sub-buckets we keep two statistics: -* urls - simple count of urls -* wire_bytes - sum of bytes received over the wire from the remote server for - each url -For historical reasons, statistics are stored as json blobs in sqlite, the -default store:: +* ``urls`` - simple count of urls +* ``wire_bytes`` - sum of bytes received over the wire, including http headers, + from the remote server for each url - sqlite> select * from buckets_of_stats order by bucket desc; +For historical reasons, in sqlite, the default store, statistics are kept as +json blobs:: + + sqlite> select * from buckets_of_stats; bucket stats --------------- --------------------------------------------------------------------------------------------- __unspecified__ {"bucket":"__unspecified__","total":{"urls":37,"wire_bytes":1502781},"new":{"urls":15,"wire_bytes":1179906},"revisit":{"urls":22,"wire_bytes":322875}} From cd6e30fe36a2069b6b80ee6662c1061f63dbcc93 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 29 May 2018 17:28:04 -0700 Subject: [PATCH 26/33] describe the last two remaining fields --- api.rst | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/api.rst b/api.rst index 6104b53..cef9c7c 100644 --- a/api.rst +++ b/api.rst @@ -142,7 +142,7 @@ It is the way url and data limits on jobs, seeds, and hosts are implemented, among 
other things. Warcprox-Meta fields -------------------- +-------------------- ``warc-prefix`` (string) ~~~~~~~~~~~~~~~~~~~~~~~~ @@ -280,6 +280,11 @@ Example:: ``metadata`` (dictionary) ~~~~~~~~~~~~~~~~~~~~~~~~~ +An arbitrary dictionary. Warcprox mostly ignores this. The one exception is +that if it has a ``seed`` entry and crawl logs are enabled via the +``--crawl-log-dir`` command line option, the value of ``seed`` is written to +the crawl log as the 11th field on the line, simulating heritrix's "source +tag". Example:: @@ -287,24 +292,28 @@ Example:: ``accept`` (list) ~~~~~~~~~~~~~~~~~ +Specifies fields that the client would like to receive in the ``Warcprox-Meta`` +*response* header. Only one value is currently understood, +``capture-metadata``. Example:: - request_meta = {"accept": ["capture-metadata"]} + Warcprox-Meta: {"accept": ["capture-metadata"]} + +The response will include a ``Warcpro-Meta`` response header with one field +also called ``captured-metadata``. Currently warcprox reports one piece of +capture medata, ``timestamp``, which represents the time fetch began for the +resource and matches the ``WARC-Date`` written to the warc record. For +example:: + + Warcprox-Meta: {"capture-metadata":{"timestamp":"2018-05-30T00:22:49Z"}} ``Warcprox-Meta`` http response header ====================================== - In some cases warcprox will add a ``Warcprox-Meta`` header in the http response that it sends to the client. Like the request header, the value is a json blob. It is only included if something in the ``warcprox-meta`` request header calls for it. Those cases are described above in the -`#warcprox-meta-http-request-header`_ section. +`Warcprox-Meta http request header`_ section. 
+ -### - blocked-by-rule -### - reached-limit -### - reached-soft-limit -### - stats -### - capture-metadata -### -### Response codes 420, 430 From 68ede68e5f484401fc2cf3d78f204ca18ed81522 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 29 May 2018 17:35:33 -0700 Subject: [PATCH 27/33] little edits --- api.rst | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/api.rst b/api.rst index cef9c7c..ae12575 100644 --- a/api.rst +++ b/api.rst @@ -195,7 +195,7 @@ Example:: Warcprox-Meta: {"blocks": [{"ssurt": "com,example,//http:/"}, {"domain": "malware.us", "substring": "wp-login.php?action=logout"}]} If any of the rules match the url being requested, warcprox aborts normal -processing and responds with a http 403. The http response includes +processing and responds with a http ``403``. The http response includes a ``Warcprox-Meta`` **response** header with one field, ``blocked-by-rule``, which reproduces the value of the match rule that resulted in the block. The presence of the ``warcprox-meta`` response header can be used by the client to @@ -229,6 +229,11 @@ dictionary is ``{stats_key: numerical_limit, ...}`` where stats key has the format ``"bucket/sub-bucket/statistic"``. See `readme.rst#statistics`_ for further explanation of what "bucket", "sub-bucket", and "statistic" mean here. +If processing a request would result in exceeding a limit, warcprox aborts +normal processing and responds with a http ``420 Reached Limit``. The http +response includes a ``Warcprox-Meta`` **response** header with the complete set +of statistics for the bucket whose limit has been reached. + Example:: {"stats": {"buckets": ["test_limits_bucket"]}, "limits": {"test_limits_bucket/total/urls": 10}} @@ -250,16 +255,16 @@ Example:: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From warcprox's perspective ``soft-limits`` work almost exactly the same way as ``limits``. 
The only difference is that when a soft limit is hit, warcprox -response with an http 430 "Reached soft limit" instead of http 420. +response with an http ``430 Reached soft limit`` instead of http ``420``. -Warcprox clients might treat a 430 very differently from a 420. From brozzler's -perspective, for instance, ``soft-limits`` are very different from ``limits``. -When brozzler receives a 420 from warcprox because a ``limit`` has been -reached, this means that crawling for that seed is finished, and brozzler sets -about finalizing the crawl of that seed. On the other hand, brozzler blissfully -ignores 430 responses, because soft limits only apply to a particular bucket -(like a domain), and don't have any effect on crawling of urls that don't fall -in that bucket. +Warcprox clients might treat a 430 very differently from a ``420``. From +brozzler's perspective, for instance, ``soft-limits`` are very different from +``limits``. When brozzler receives a ``420`` from warcprox because a ``limit`` +has been reached, this means that crawling for that seed is finished, and +brozzler sets about finalizing the crawl of that seed. On the other hand, +brozzler blissfully ignores ``430`` responses, because soft limits only apply +to a particular bucket (like a domain), and don't have any effect on crawling +of urls that don't fall in that bucket. Example:: @@ -300,7 +305,7 @@ Example:: Warcprox-Meta: {"accept": ["capture-metadata"]} -The response will include a ``Warcpro-Meta`` response header with one field +The response will include a ``Warcprox-Meta`` response header with one field also called ``captured-metadata``. Currently warcprox reports one piece of capture medata, ``timestamp``, which represents the time fetch began for the resource and matches the ``WARC-Date`` written to the warc record. 
For From f5bcec20a92c675291acc9debe506b0ba1e9907e Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 30 May 2018 14:12:58 -0700 Subject: [PATCH 28/33] explain a bit about mitm --- readme.rst | 179 +++++++++++++++-------------------------------------- 1 file changed, 49 insertions(+), 130 deletions(-) diff --git a/readme.rst b/readme.rst index 4f7044f..6f53f66 100644 --- a/readme.rst +++ b/readme.rst @@ -3,36 +3,68 @@ Warcprox - WARC writing MITM HTTP/S proxy .. image:: https://travis-ci.org/internetarchive/warcprox.svg?branch=master :target: https://travis-ci.org/internetarchive/warcprox -Originally based on the excellent and simple pymiproxy by Nadeem Douba. -https://github.com/allfro/pymiproxy +Warcprox is a tool for archiving the web. It is an http proxy that stores its +traffic to disk in `WARC +`_ +format. Warcprox captures encrypted https traffic by using the +`"man-in-the-middle" `_ +technique (see the `Man-In-The_Middle`_ section for more info). + +The web pages that warcprox stores in WARC files can be played back using +software like `OpenWayback `_ or `pywb +`_. Warcprox has been developed in +parallel with `brozzler `_ and +together they make a comprehensive modern distributed archival web crawling +system. + +Warcprox was originally based on the excellent and simple pymiproxy by Nadeem +Douba. https://github.com/allfro/pymiproxy .. contents:: -Install -======= +Getting started +=============== Warcprox runs on python 3.4+. -To install latest release run: - -:: +To install latest release run:: # apt-get install libffi-dev libssl-dev pip install warcprox -You can also install the latest bleeding edge code: - -:: +You can also install the latest bleeding edge code:: pip install git+https://github.com/internetarchive/warcprox.git +To start warcprox run:: -Trusting the CA cert -==================== -For best results while browsing through warcprox, you need to add the CA -cert as a trusted cert in your browser. 
If you don't do that, you will -get the warning when you visit each new site. But worse, any embedded -https content on a different server will simply fail to load, because -the browser will reject the certificate without telling you. + warcprox + +Try ``warcprox --help`` for documentation on command line options. + +Man-In-The-Middle? +================== +Traffic to and from https sites is encrypted. Normally http proxies can't read +that traffic. The web client uses the http ``CONNECT`` method to establish a +tunnel through the proxy, and the proxy merely routes raw bytes between the +client and server. Since the bytes are encrypted, the proxy can't make sense of +the information it's proxying. Nonsensical encrypted bytes would not be very +useful to archive. + +In order to capture https traffic, warcprox acts as a "man-in-the-middle" +(MITM). When it receives a ``CONNECT`` directive from a client, it generates a +public key certificate for the requested site, presents to the client, and +proceeds to establish an encrypted connection. Then it makes a separate, normal +https connection to the remote site. It decrypts, archives, and re-encrypts +traffic in both directions. + +Although "man-in-the-middle" is often paired with "attack", there is nothing +malicious about what warcprox is doing. If you configure an instance of +warcprox as your browser's http proxy, you will see lots of certificate +warnings, since none of the certificates will be signed by trusted authorities. +To use warcprox effectively the client needs to disable certificate +verification, or add the CA cert generated by warcprox as a trusted authority. +(If you do this in your browser, make sure you undo it when you're done using +warcprox!) API === @@ -116,119 +148,6 @@ be configured by specifying ``--plugin`` multiples times. 
`A minimal example `__ -Usage -===== - -:: - - usage: warcprox [-h] [-p PORT] [-b ADDRESS] [-c CACERT] - [--certs-dir CERTS_DIR] [-d DIRECTORY] - [--warc-filename WARC_FILENAME] [-z] [-n PREFIX] - [-s ROLLOVER_SIZE] - [--rollover-idle-time ROLLOVER_IDLE_TIME] - [-g DIGEST_ALGORITHM] [--base32] - [--method-filter HTTP_METHOD] - [--stats-db-file STATS_DB_FILE | --rethinkdb-stats-url RETHINKDB_STATS_URL] - [-P PLAYBACK_PORT] - [-j DEDUP_DB_FILE | --rethinkdb-dedup-url RETHINKDB_DEDUP_URL | --rethinkdb-big-table-url RETHINKDB_BIG_TABLE_URL | --rethinkdb-trough-db-url RETHINKDB_TROUGH_DB_URL | --cdxserver-dedup CDXSERVER_DEDUP] - [--rethinkdb-services-url RETHINKDB_SERVICES_URL] - [--onion-tor-socks-proxy ONION_TOR_SOCKS_PROXY] - [--crawl-log-dir CRAWL_LOG_DIR] [--plugin PLUGIN_CLASS] - [--version] [-v] [--trace] [-q] - - warcprox - WARC writing MITM HTTP/S proxy - - optional arguments: - -h, --help show this help message and exit - -p PORT, --port PORT port to listen on (default: 8000) - -b ADDRESS, --address ADDRESS - address to listen on (default: localhost) - -c CACERT, --cacert CACERT - CA certificate file; if file does not exist, it - will be created (default: - ./ayutla.monkeybrains.net-warcprox-ca.pem) - --certs-dir CERTS_DIR - where to store and load generated certificates - (default: ./ayutla.monkeybrains.net-warcprox-ca) - -d DIRECTORY, --dir DIRECTORY - where to write warcs (default: ./warcs) - --warc-filename WARC_FILENAME - define custom WARC filename with variables - {prefix}, {timestamp14}, {timestamp17}, - {serialno}, {randomtoken}, {hostname}, - {shorthostname} (default: - {prefix}-{timestamp17}-{serialno}-{randomtoken}) - -z, --gzip write gzip-compressed warc records - -n PREFIX, --prefix PREFIX - default WARC filename prefix (default: WARCPROX) - -s ROLLOVER_SIZE, --size ROLLOVER_SIZE - WARC file rollover size threshold in bytes - (default: 1000000000) - --rollover-idle-time ROLLOVER_IDLE_TIME - WARC file rollover idle time threshold in seconds - (so 
that Friday's last open WARC doesn't sit there - all weekend waiting for more data) (default: None) - -g DIGEST_ALGORITHM, --digest-algorithm DIGEST_ALGORITHM - digest algorithm, one of sha384, sha224, md5, - sha256, sha512, sha1 (default: sha1) - --base32 write digests in Base32 instead of hex - --method-filter HTTP_METHOD - only record requests with the given http method(s) - (can be used more than once) (default: None) - --stats-db-file STATS_DB_FILE - persistent statistics database file; empty string - or /dev/null disables statistics tracking - (default: ./warcprox.sqlite) - --rethinkdb-stats-url RETHINKDB_STATS_URL - rethinkdb stats table url, e.g. rethinkdb://db0.fo - o.org,db1.foo.org:38015/my_warcprox_db/my_stats_ta - ble (default: None) - -P PLAYBACK_PORT, --playback-port PLAYBACK_PORT - port to listen on for instant playback (default: - None) - -j DEDUP_DB_FILE, --dedup-db-file DEDUP_DB_FILE - persistent deduplication database file; empty - string or /dev/null disables deduplication - (default: ./warcprox.sqlite) - --rethinkdb-dedup-url RETHINKDB_DEDUP_URL - rethinkdb dedup url, e.g. rethinkdb://db0.foo.org, - db1.foo.org:38015/my_warcprox_db/my_dedup_table - (default: None) - --rethinkdb-big-table-url RETHINKDB_BIG_TABLE_URL - rethinkdb big table url (table will be populated - with various capture information and is suitable - for use as index for playback), e.g. rethinkdb://d - b0.foo.org,db1.foo.org:38015/my_warcprox_db/captur - es (default: None) - --rethinkdb-trough-db-url RETHINKDB_TROUGH_DB_URL - 🐷 url pointing to trough configuration rethinkdb - database, e.g. rethinkdb://db0.foo.org,db1.foo.org - :38015/trough_configuration (default: None) - --cdxserver-dedup CDXSERVER_DEDUP - use a CDX Server URL for deduplication; e.g. 
- https://web.archive.org/cdx/search (default: None) - --rethinkdb-services-url RETHINKDB_SERVICES_URL - rethinkdb service registry table url; if provided, - warcprox will create and heartbeat entry for - itself (default: None) - --onion-tor-socks-proxy ONION_TOR_SOCKS_PROXY - host:port of tor socks proxy, used only to connect - to .onion sites (default: None) - --crawl-log-dir CRAWL_LOG_DIR - if specified, write crawl log files in the - specified directory; one crawl log is written per - warc filename prefix; crawl log format mimics - heritrix (default: None) - --plugin PLUGIN_CLASS - Qualified name of plugin class, e.g. - "mypkg.mymod.MyClass". May be used multiple times - to register multiple plugins. See README.rst for - more information. (default: None) - --version show program's version number and exit - -v, --verbose - --trace - -q, --quiet - License ======= From 9434a1ccd87f8f3b179175468ed02d4ada2f037d Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 30 May 2018 14:26:10 -0700 Subject: [PATCH 29/33] more little edits --- readme.rst | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/readme.rst b/readme.rst index 6f53f66..dbb1440 100644 --- a/readme.rst +++ b/readme.rst @@ -8,7 +8,7 @@ traffic to disk in `WARC `_ format. Warcprox captures encrypted https traffic by using the `"man-in-the-middle" `_ -technique (see the `Man-In-The_Middle`_ section for more info). +technique (see the `Man-in-the-middle`_ section for more info). The web pages that warcprox stores in WARC files can be played back using software like `OpenWayback `_ or `pywb @@ -41,21 +41,21 @@ To start warcprox run:: Try ``warcprox --help`` for documentation on command line options. -Man-In-The-Middle? -================== -Traffic to and from https sites is encrypted. Normally http proxies can't read -that traffic. 
The web client uses the http ``CONNECT`` method to establish a -tunnel through the proxy, and the proxy merely routes raw bytes between the -client and server. Since the bytes are encrypted, the proxy can't make sense of -the information it's proxying. Nonsensical encrypted bytes would not be very -useful to archive. +Man-in-the-middle +================= +Normally, http proxies can't read https traffic, because it's encrypted. The +browser uses the http ``CONNECT`` method to establish a tunnel through the +proxy, and the proxy merely routes raw bytes between the client and server. +Since the bytes are encrypted, the proxy can't make sense of the information +it's proxying. This nonsensical encrypted data would not be very useful to +archive. In order to capture https traffic, warcprox acts as a "man-in-the-middle" (MITM). When it receives a ``CONNECT`` directive from a client, it generates a public key certificate for the requested site, presents to the client, and -proceeds to establish an encrypted connection. Then it makes a separate, normal -https connection to the remote site. It decrypts, archives, and re-encrypts -traffic in both directions. +proceeds to establish an encrypted connection with the client. Then it makes a +separate, normal https connection to the remote site. It decrypts, archives, +and re-encrypts traffic in both directions. Although "man-in-the-middle" is often paired with "attack", there is nothing malicious about what warcprox is doing. 
If you configure an instance of From 6f43286b07681208728a93bead543368e3e47169 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 30 May 2018 14:46:14 -0700 Subject: [PATCH 30/33] more edits --- api.rst | 78 +++++++++++++++++++++++++++------------------------------ 1 file changed, 37 insertions(+), 41 deletions(-) diff --git a/api.rst b/api.rst index ae12575..bac642b 100644 --- a/api.rst +++ b/api.rst @@ -124,15 +124,11 @@ warcprox will write a warc record that looks like this:: configuration information and metadata with each proxy request to warcprox. The value is a json blob. There are several fields understood by warcprox, and arbitrary additional fields can be included. If warcprox doesn't recognize a -field it simply ignores it. Warcprox plugins could make use of custom fields, -for example. +field it simply ignores it. Custom fields may be useful for custom warcprox +plugins (see ``_). Warcprox strips the ``warcprox-meta`` header out before sending the request to -remote server, and also does not write it in the warc request record. - -:: - - Warcprox-Meta: {} +remote server, and does not write it in the warc request record. Brozzler knows about ``warcprox-meta``. For information on configuring it in brozzler, see @@ -153,27 +149,6 @@ Example:: Warcprox-Meta: {"warc-prefix": "special-warc"} -``stats`` (dictionary) -~~~~~~~~~~~~~~~~~~~~~~ -``stats`` is a dictionary with only one field understood by warcprox, -``buckets``. The value of ``buckets`` is a list of strings and/or -dictionaries. A string signifies the name of the bucket; a dictionary is -expected to have at least an item with key ``bucket`` whose value is the name -of the bucket. The other currently recognized key is ``tally-domains``, which -if supplied should be a list of domains. This instructs warcprox to -additionally tally substats of the given bucket by domain. Host stats are -stored in the stats table under the key -``{parent-bucket}:{domain(normalized)}``, e.g. 
``"bucket2:foo.bar.com"`` for the -example below. - -Examples:: - - Warcprox-Meta: {"stats":{"buckets":["my-stats-bucket","all-the-stats"]}} - Warcprox-Meta: {"stats":{"buckets":["bucket1",{"bucket":"bucket2","tally-domains":["foo.bar.com","192.168.10.20"}]}} - -See ``_ for more information on statistics kept by -warcprox. - ``dedup-bucket`` (string) ~~~~~~~~~~~~~~~~~~~~~~~~~ Specifies the deduplication bucket. For more information about deduplication @@ -196,11 +171,10 @@ Example:: If any of the rules match the url being requested, warcprox aborts normal processing and responds with a http ``403``. The http response includes -a ``Warcprox-Meta`` **response** header with one field, ``blocked-by-rule``, +a ``Warcprox-Meta`` response header with one field, ``blocked-by-rule``, which reproduces the value of the match rule that resulted in the block. The presence of the ``warcprox-meta`` response header can be used by the client to -distinguish this type of a response from a 403 from the remote url being -requested. +distinguish this type of a response from a 403 from the remote site. An example:: @@ -222,6 +196,29 @@ to evaluate the block rules. In particular, this circumstance prevails when the browser controlled by brozzler is requesting images, javascript, css, and so on, embedded in a page. +``stats`` (dictionary) +~~~~~~~~~~~~~~~~~~~~~~ +``stats`` is a dictionary with only one field understood by warcprox, +``buckets``. The value of ``buckets`` is a list of strings and/or +dictionaries. A string signifies the name of the bucket; a dictionary is +expected to have at least an item with key ``bucket`` whose value is the name +of the bucket. The other currently recognized key is ``tally-domains``, which +if supplied should be a list of domains. This instructs warcprox to +additionally tally substats of the given bucket by domain. + +See ``_ for more information on statistics kept by +warcprox. 
+ +Examples:: + + Warcprox-Meta: {"stats":{"buckets":["my-stats-bucket","all-the-stats"]}} + Warcprox-Meta: {"stats":{"buckets":["bucket1",{"bucket":"bucket2","tally-domains":["foo.bar.com","192.168.10.20"]}]}} + +Domain stats are stored in the stats table under the key +``"bucket2:foo.bar.com"`` for the latter example. See the following two +sections for more examples. The ``soft-limits`` section has an example of a +limit on a domain specified in ``tally-domains``. + ``limits`` (dictionary) ~~~~~~~~~~~~~~~~~~~~~~~ Specifies quantitative limits for warcprox to enforce. The structure of the @@ -231,12 +228,12 @@ further explanation of what "bucket", "sub-bucket", and "statistic" mean here. If processing a request would result in exceeding a limit, warcprox aborts normal processing and responds with a http ``420 Reached Limit``. The http -response includes a ``Warcprox-Meta`` **response** header with the complete set +response includes a ``Warcprox-Meta`` response header with the complete set of statistics for the bucket whose limit has been reached. Example:: - {"stats": {"buckets": ["test_limits_bucket"]}, "limits": {"test_limits_bucket/total/urls": 10}} + Warcprox-Meta: {"stats": {"buckets": ["test_limits_bucket"]}, "limits": {"test_limits_bucket/total/urls": 10}} :: @@ -257,7 +254,7 @@ From warcprox's perspective ``soft-limits`` work almost exactly the same way as ``limits``. The only difference is that when a soft limit is hit, warcprox responds with an http ``430 Reached soft limit`` instead of http ``420``. -Warcprox clients might treat a ``430`` very differently from a ``420``. From +Warcprox clients might treat a ``430`` very differently from a ``420``. From brozzler's perspective, for instance, ``soft-limits`` are very different from ``limits``. 
When brozzler receives a ``420`` from warcprox because a ``limit`` has been reached, this means that crawling for that seed is finished, and @@ -298,7 +295,7 @@ Example:: ``accept`` (list) ~~~~~~~~~~~~~~~~~ Specifies fields that the client would like to receive in the ``Warcprox-Meta`` -*response* header. Only one value is currently understood, +response header. Only one value is currently understood, ``capture-metadata``. Example:: @@ -315,10 +312,9 @@ example:: ``Warcprox-Meta`` http response header ====================================== -In some cases warcprox will add a ``Warcprox-Meta`` header in the http response -that it sends to the client. Like the request header, the value is a json blob. -It is only included if something in the ``warcprox-meta`` request header calls -for it. Those cases are described above in the -`Warcprox-Meta http request header`_ section. - +In some cases warcprox will add a ``Warcprox-Meta`` header to the http response +that it sends to the client. As with the request header, the value is a json +blob. It is only included if something in the ``warcprox-meta`` request header +calls for it. Those cases are described above in the `Warcprox-Meta http +request header`_ section. 
From e8cb3afa719e9ac2015d38bc1ad40a320abf3b67 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 31 May 2018 16:52:37 -0700 Subject: [PATCH 31/33] bump dev version after merge --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 4fc1cbf..81e46c0 100755 --- a/setup.py +++ b/setup.py @@ -40,7 +40,7 @@ except: setuptools.setup( name='warcprox', - version='2.4b2.dev174', + version='2.4b2.dev175', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', From e73cbcb6b3591492acdb496a45a90f10d6300ff2 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 31 May 2018 16:57:06 -0700 Subject: [PATCH 32/33] log stack trace in case batch postprocessor raises exception somehow --- warcprox/__init__.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/warcprox/__init__.py b/warcprox/__init__.py index 20f0de4..4825e29 100644 --- a/warcprox/__init__.py +++ b/warcprox/__init__.py @@ -122,14 +122,19 @@ class BasePostfetchProcessor(threading.Thread): self.profiler = None def run(self): - if self.options.profile: - import cProfile - self.profiler = cProfile.Profile() - self.profiler.enable() - self._run() - self.profiler.disable() - else: - self._run() + try: + if self.options.profile: + import cProfile + self.profiler = cProfile.Profile() + self.profiler.enable() + self._run() + self.profiler.disable() + else: + self._run() + except: + self.logger.critical( + '%s dying due to uncaught exception', + self.name, exc_info=True) def _get_process_put(self): ''' From ec7a0bf569a8d8fe4c54a7554419655a21036c70 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 31 May 2018 16:57:37 -0700 Subject: [PATCH 33/33] =?UTF-8?q?log=20exception=20and=20continue=20?= =?UTF-8?q?=F0=9F=A4=9E=20if=20schema=20reg=20fails?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit at trough dedup startup --- setup.py | 2 
+- warcprox/dedup.py | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 81e46c0..6ac73a1 100755 --- a/setup.py +++ b/setup.py @@ -40,7 +40,7 @@ except: setuptools.setup( name='warcprox', - version='2.4b2.dev175', + version='2.4b2.dev176', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/warcprox/dedup.py b/warcprox/dedup.py index 83f3e0a..81be2ea 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -497,7 +497,13 @@ class TroughDedupDb(DedupDb, DedupableMixin): return BatchTroughStorer(self, self.options) def start(self): - self._trough_cli.register_schema(self.SCHEMA_ID, self.SCHEMA_SQL) + try: + self._trough_cli.register_schema(self.SCHEMA_ID, self.SCHEMA_SQL) + except Exception as e: + # can happen. hopefully someone else has registered it + self.logger.critical( + 'will try to continue after problem registering schema %s', + self.SCHEMA_ID, exc_info=True) def save(self, digest_key, response_record, bucket='__unspecified__'): record_id = response_record.get_header(warctools.WarcRecord.ID)