From af863c6dba906d123627a6cc03f106d2baca8124 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 15 May 2018 11:22:10 -0700 Subject: [PATCH 01/33] default values for dedup_min_text_size et al because they may be missing in case warcprox is used as a library --- setup.py | 2 +- warcprox/dedup.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 4579b12..a7fdb47 100755 --- a/setup.py +++ b/setup.py @@ -40,7 +40,7 @@ except: setuptools.setup( name='warcprox', - version='2.4b2.dev170', + version='2.4b2.dev171', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/warcprox/dedup.py b/warcprox/dedup.py index f979d97..5d5039f 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -39,9 +39,9 @@ urllib3.disable_warnings() class DedupableMixin(object): def __init__(self, options=warcprox.Options()): - self.min_text_size = options.dedup_min_text_size - self.min_binary_size = options.dedup_min_binary_size - self.dedup_only_with_bucket = options.dedup_only_with_bucket + self.min_text_size = options.dedup_min_text_size or 0 + self.min_binary_size = options.dedup_min_binary_size or 0 + self.dedup_only_with_bucket = options.dedup_only_with_bucket or False def should_dedup(self, recorded_url): """Check if we should try to run dedup on resource based on payload From e23af32e94d5b33f6527cfc58c798385b2cab79f Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 15 May 2018 15:33:52 -0700 Subject: [PATCH 02/33] we want to save all captures to the big "captures" table, even if we don't want to dedup against them --- setup.py | 2 +- warcprox/bigtable.py | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index a7fdb47..eefcdb4 100755 --- a/setup.py +++ b/setup.py @@ -40,7 +40,7 @@ except: setuptools.setup( name='warcprox', - version='2.4b2.dev171', + version='2.4b2.dev172', description='WARC writing MITM HTTP/S 
proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/warcprox/bigtable.py b/warcprox/bigtable.py index d8cd218..0d98270 100644 --- a/warcprox/bigtable.py +++ b/warcprox/bigtable.py @@ -253,6 +253,4 @@ class RethinkCapturesDedup(warcprox.dedup.DedupDb, DedupableMixin): self.captures_db.close() def notify(self, recorded_url, records): - if (records and records[0].type == b'response' - and self.should_dedup(recorded_url)): - self.captures_db.notify(recorded_url, records) + self.captures_db.notify(recorded_url, records) From 5f0c46d579a45a06768babbe8b72f83e98d73ce0 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 16 May 2018 10:52:04 -0700 Subject: [PATCH 03/33] rewrite test_dedup_min_size() to account for the fact that we always save a record to the big captures table, partly by adding a new check that --dedup-min-*-size is respected even if there is an entry in the dedup db for the sha1 --- tests/test_warcprox.py | 145 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 132 insertions(+), 13 deletions(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 269deee..2a12b22 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1940,7 +1940,7 @@ def test_trough_segment_promotion(warcprox_): time.sleep(3) assert promoted == [] -def test_dedup_min_size(http_daemon, warcprox_, archiving_proxies, playback_proxies): +def test_dedup_min_text_size(http_daemon, warcprox_, archiving_proxies): """We use options --dedup-min-text-size=3 --dedup-min-binary-size=5 and we try to download content smaller than these limits to make sure that it is not deduplicated. 
We create the digest_str with the following code: @@ -1950,36 +1950,155 @@ def test_dedup_min_size(http_daemon, warcprox_, archiving_proxies, playback_prox warcprox.digest_str(payload_digest) ``` """ + urls_before = warcprox_.proxy.running_stats.urls + + # start a fresh warc + warcprox_.warc_writer_processor.writer_pool.close_writers() + + # fetch small text url = 'http://localhost:%s/text-2bytes' % http_daemon.server_port response = requests.get( url, proxies=archiving_proxies, verify=False, timeout=10) assert len(response.content) == 2 + # wait for postfetch chain + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1) + # check no dedup was saved (except RethinkCapturesDedup which always saves) dedup_lookup = warcprox_.dedup_db.lookup( b'sha1:e0c9035898dd52fc65c41454cec9c4d2611bfb37') - assert dedup_lookup is None - time.sleep(3) + if not isinstance(warcprox_.dedup_db, warcprox.bigtable.RethinkCapturesDedup): + assert dedup_lookup is None + + # fetch again saving dedup info so that we can test dedup info ignored + orig_should_dedup = warcprox_.dedup_db.should_dedup + warcprox_.dedup_db.should_dedup = lambda *args, **kwargs: True + try: + response = requests.get( + url, proxies=archiving_proxies, verify=False, timeout=10) + assert len(response.content) == 2 + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 2) + # check dedup was saved + dedup_lookup = warcprox_.dedup_db.lookup( + b'sha1:e0c9035898dd52fc65c41454cec9c4d2611bfb37') + assert dedup_lookup + finally: + warcprox_.dedup_db.should_dedup = orig_should_dedup + else: + assert dedup_lookup + + # fetch again and check that it was not deduped + urls_before = warcprox_.proxy.running_stats.urls response = requests.get( url, proxies=archiving_proxies, verify=False, timeout=10) - dedup_lookup = warcprox_.dedup_db.lookup( - b'sha1:e0c9035898dd52fc65c41454cec9c4d2611bfb37') - # This would return dedup data if payload_size > dedup-min-text-size - assert dedup_lookup is None + assert 
len(response.content) == 2 + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1) + # check that response records were written + warc = warcprox_.warc_writer_processor.writer_pool.default_warc_writer._available_warcs.queue[0].path + with open(warc, 'rb') as f: + rec_iter = iter(warcio.archiveiterator.ArchiveIterator(f)) + record = next(rec_iter) + assert record.rec_type == 'warcinfo' + record = next(rec_iter) + assert record.rec_type == 'response' + assert record.rec_headers.get_header('warc-target-uri') == url + record = next(rec_iter) + assert record.rec_type == 'request' + assert record.rec_headers.get_header('warc-target-uri') == url + record = next(rec_iter) + assert record.rec_type == 'response' + assert record.rec_headers.get_header('warc-target-uri') == url + record = next(rec_iter) + assert record.rec_type == 'request' + assert record.rec_headers.get_header('warc-target-uri') == url + if not isinstance(warcprox_.dedup_db, warcprox.bigtable.RethinkCapturesDedup): + record = next(rec_iter) + assert record.rec_type == 'response' + assert record.rec_headers.get_header('warc-target-uri') == url + record = next(rec_iter) + assert record.rec_type == 'request' + assert record.rec_headers.get_header('warc-target-uri') == url + with pytest.raises(StopIteration): + next(rec_iter) + +def test_dedup_min_binary_size(http_daemon, warcprox_, archiving_proxies): + """We use options --dedup-min-text-size=3 --dedup-min-binary-size=5 and we + try to download content smaller than these limits to make sure that it is + not deduplicated. 
We create the digest_str with the following code: + ``` + payload_digest = hashlib.new('sha1') + payload_digest.update(b'aa') + warcprox.digest_str(payload_digest) + ``` + """ + urls_before = warcprox_.proxy.running_stats.urls + + # start a fresh warc + warcprox_.warc_writer_processor.writer_pool.close_writers() + + # fetch small binary url = 'http://localhost:%s/binary-4bytes' % http_daemon.server_port response = requests.get( url, proxies=archiving_proxies, verify=False, timeout=10) assert len(response.content) == 4 + # wait for postfetch chain + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1) + # check no dedup was saved (except RethinkCapturesDedup which always saves) dedup_lookup = warcprox_.dedup_db.lookup( b'sha1:70c881d4a26984ddce795f6f71817c9cf4480e79') - assert dedup_lookup is None - time.sleep(3) + if not isinstance(warcprox_.dedup_db, warcprox.bigtable.RethinkCapturesDedup): + assert dedup_lookup is None + + # fetch again saving dedup info so that we can test dedup info ignored + orig_should_dedup = warcprox_.dedup_db.should_dedup + warcprox_.dedup_db.should_dedup = lambda *args, **kwargs: True + try: + response = requests.get( + url, proxies=archiving_proxies, verify=False, timeout=10) + assert len(response.content) == 4 + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 2) + # check dedup was saved + dedup_lookup = warcprox_.dedup_db.lookup( + b'sha1:70c881d4a26984ddce795f6f71817c9cf4480e79') + assert dedup_lookup + finally: + warcprox_.dedup_db.should_dedup = orig_should_dedup + else: + assert dedup_lookup + + # fetch again and check that it was not deduped + urls_before = warcprox_.proxy.running_stats.urls response = requests.get( url, proxies=archiving_proxies, verify=False, timeout=10) - dedup_lookup = warcprox_.dedup_db.lookup( - b'sha1:70c881d4a26984ddce795f6f71817c9cf4480e79') - # This would return dedup data if payload_size > dedup-min-binary-size - assert dedup_lookup is None + assert len(response.content) 
== 4 + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1) + # check that response records were written + warc = warcprox_.warc_writer_processor.writer_pool.default_warc_writer._available_warcs.queue[0].path + with open(warc, 'rb') as f: + rec_iter = iter(warcio.archiveiterator.ArchiveIterator(f)) + record = next(rec_iter) + assert record.rec_type == 'warcinfo' + record = next(rec_iter) + assert record.rec_type == 'response' + assert record.rec_headers.get_header('warc-target-uri') == url + record = next(rec_iter) + assert record.rec_type == 'request' + assert record.rec_headers.get_header('warc-target-uri') == url + record = next(rec_iter) + assert record.rec_type == 'response' + assert record.rec_headers.get_header('warc-target-uri') == url + record = next(rec_iter) + assert record.rec_type == 'request' + assert record.rec_headers.get_header('warc-target-uri') == url + if not isinstance(warcprox_.dedup_db, warcprox.bigtable.RethinkCapturesDedup): + record = next(rec_iter) + assert record.rec_type == 'response' + assert record.rec_headers.get_header('warc-target-uri') == url + record = next(rec_iter) + assert record.rec_type == 'request' + assert record.rec_headers.get_header('warc-target-uri') == url + with pytest.raises(StopIteration): + next(rec_iter) if __name__ == '__main__': pytest.main() From 76ebaea944c7b117bc910c158e741f202ea2101d Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 16 May 2018 12:17:06 -0700 Subject: [PATCH 04/33] fix test_dedup_min_text_size failure? by waiting for postfetch chain in test_socket_timeout_response --- tests/test_warcprox.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 2a12b22..079fdd1 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1837,6 +1837,8 @@ def test_socket_timeout_response( """Response will timeout because we use --socket-timeout=4 whereas the target URL will return after 6 sec. 
""" + urls_before = warcprox_.proxy.running_stats.urls + url = 'http://localhost:%s/slow-response' % http_daemon.server_port response = requests.get(url, proxies=archiving_proxies, verify=False) assert response.status_code == 502 @@ -1849,6 +1851,8 @@ def test_socket_timeout_response( assert response.status_code == 404 assert response.content == b'404 Not Found\n' + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1) + def test_empty_response( warcprox_, http_daemon, https_daemon, archiving_proxies, playback_proxies): From 49f637af0584fb1ef3d0e61911b43f570ea43962 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 16 May 2018 13:48:04 -0700 Subject: [PATCH 05/33] fix trough deployment in Dockerfile --- tests/Dockerfile | 16 +++++++++------- tests/run-trough.sh | 2 +- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/tests/Dockerfile b/tests/Dockerfile index 5e380d8..6a97ac0 100644 --- a/tests/Dockerfile +++ b/tests/Dockerfile @@ -75,11 +75,13 @@ RUN mv -v /etc/hadoop/conf/hdfs-site.xml /etc/hadoop/conf/hdfs-site.xml.orig \ RUN echo '#!/bin/bash\nservice hadoop-hdfs-namenode start\nservice hadoop-hdfs-datanode start' > /etc/my_init.d/50_start_hdfs.sh \ && chmod a+x /etc/my_init.d/50_start_hdfs.sh +RUN apt-get install -y libsqlite3-dev + # trough itself RUN virtualenv -p python3 /opt/trough-ve3 \ && . 
/opt/trough-ve3/bin/activate \ && pip install git+https://github.com/jkafader/snakebite@feature/python3-version-string \ - && pip install git+https://github.com/nlevitt/trough.git@toward-warcprox-dedup + && pip install git+https://github.com/internetarchive/trough.git RUN mkdir -vp /etc/service/trough-sync-local \ && echo "#!/bin/bash\nsource /opt/trough-ve3/bin/activate\nexec sync.py >>/tmp/trough-sync-local.out 2>&1" > /etc/service/trough-sync-local/run \ @@ -97,11 +99,11 @@ RUN mkdir -vp /etc/service/trough-write \ && echo '#!/bin/bash\nvenv=/opt/trough-ve3\nsource $venv/bin/activate\nsleep 5\npython -c $"import doublethink ; from trough.settings import settings ; rr = doublethink.Rethinker(settings[\"RETHINKDB_HOSTS\"]) ; rr.db(\"trough_configuration\").wait().run()"\nexec uwsgi --venv=$venv --http :6222 --master --processes=2 --harakiri=240 --max-requests=50000 --vacuum --die-on-term --wsgi-file $venv/bin/writer.py >>/tmp/trough-write.out 2>&1' > /etc/service/trough-write/run \ && chmod a+x /etc/service/trough-write/run -RUN mkdir -vp /etc/service/trough-write-provisioner-local \ - && echo '#!/bin/bash\nvenv=/opt/trough-ve3\nsource $venv/bin/activate\nsleep 5\npython -c $"import doublethink ; from trough.settings import settings ; rr = doublethink.Rethinker(settings[\"RETHINKDB_HOSTS\"]) ; rr.db(\"trough_configuration\").wait().run()"\nexec uwsgi --venv=$venv --http :6112 --master --processes=2 --harakiri=20 --max-requests=50000 --vacuum --die-on-term --wsgi-file $venv/bin/write_provisioner_local.py >>/tmp/trough-write-provisioner-local.out 2>&1' > /etc/service/trough-write-provisioner-local/run \ - && chmod a+x /etc/service/trough-write-provisioner-local/run +RUN mkdir -vp /etc/service/trough-segment-manager-local \ + && echo '#!/bin/bash\nvenv=/opt/trough-ve3\nsource $venv/bin/activate\nsleep 5\npython -c $"import doublethink ; from trough.settings import settings ; rr = doublethink.Rethinker(settings[\"RETHINKDB_HOSTS\"]) ; 
rr.db(\"trough_configuration\").wait().run()"\nexec uwsgi --venv=$venv --http :6112 --master --processes=2 --harakiri=7200 --http-timeout=7200 --max-requests=50000 --vacuum --die-on-term --mount /=trough.wsgi.segment_manager:local >>/tmp/trough-segment-manager-local.out 2>&1' > /etc/service/trough-segment-manager-local/run \ + && chmod a+x /etc/service/trough-segment-manager-local/run -RUN mkdir -vp /etc/service/trough-write-provisioner-server \ - && echo '#!/bin/bash\nvenv=/opt/trough-ve3\nsource $venv/bin/activate\nsleep 5\npython -c $"import doublethink ; from trough.settings import settings ; rr = doublethink.Rethinker(settings[\"RETHINKDB_HOSTS\"]) ; rr.db(\"trough_configuration\").wait().run()"\nexec uwsgi --venv=$venv --http :6111 --master --processes=2 --harakiri=20 --max-requests=50000 --vacuum --die-on-term --wsgi-file $venv/bin/write_provisioner_server.py >>/tmp/trough-write-provisioner-server.out 2>&1' > /etc/service/trough-write-provisioner-server/run \ - && chmod a+x /etc/service/trough-write-provisioner-server/run +RUN mkdir -vp /etc/service/trough-segment-manager-server \ + && echo '#!/bin/bash\nvenv=/opt/trough-ve3\nsource $venv/bin/activate\nsleep 5\npython -c $"import doublethink ; from trough.settings import settings ; rr = doublethink.Rethinker(settings[\"RETHINKDB_HOSTS\"]) ; rr.db(\"trough_configuration\").wait().run()"\nexec uwsgi --venv=$venv --http :6111 --master --processes=2 --harakiri=7200 --http-timeout=7200 --max-requests=50000 --vacuum --die-on-term --mount /=trough.wsgi.segment_manager:server >>/tmp/trough-segment-manager-server.out 2>&1' > /etc/service/trough-segment-manager-server/run \ + && chmod a+x /etc/service/trough-segment-manager-server/run diff --git a/tests/run-trough.sh b/tests/run-trough.sh index c2319a0..81e0e68 100644 --- a/tests/run-trough.sh +++ b/tests/run-trough.sh @@ -4,7 +4,7 @@ # pip install git+https://github.com/jkafader/snakebite@feature/python3-version-string -pip install 
git+https://github.com/internetarchive/trough.git@toward-warcprox-dedup +pip install git+https://github.com/internetarchive/trough.git mkdir /etc/trough From d834ac3e5910e8354b9e74630b94aa4bab500c81 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 16 May 2018 14:21:18 -0700 Subject: [PATCH 06/33] only run tests in py3 --- tests/run-tests.sh | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/tests/run-tests.sh b/tests/run-tests.sh index a7a819c..12dd371 100755 --- a/tests/run-tests.sh +++ b/tests/run-tests.sh @@ -31,18 +31,15 @@ script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" docker build -t internetarchive/warcprox-tests $script_dir -for python in python3 python2.7 -do - docker run --rm --volume="$script_dir/..:/warcprox" internetarchive/warcprox-tests /sbin/my_init -- \ - bash -x -c "cd /tmp && git clone /warcprox && cd /tmp/warcprox \ - && (cd /warcprox && git diff HEAD) | patch -p1 \ - && virtualenv -p $python /tmp/venv \ - && source /tmp/venv/bin/activate \ - && pip --log-file /tmp/pip.log install . pytest mock requests warcio \ - && py.test -v tests \ - && py.test -v --rethinkdb-dedup-url=rethinkdb://localhost/test1/dedup tests \ - && py.test -v --rethinkdb-big-table-url=rethinkdb://localhost/test2/captures tests \ - && py.test -v --rethinkdb-trough-db-url=rethinkdb://localhost/trough_configuration tests \ - " -done +docker run --rm --volume="$script_dir/..:/warcprox" internetarchive/warcprox-tests /sbin/my_init -- \ + bash -x -c "cd /tmp && git clone /warcprox && cd /tmp/warcprox \ + && (cd /warcprox && git diff HEAD) | patch -p1 \ + && virtualenv -p python3 /tmp/venv \ + && source /tmp/venv/bin/activate \ + && pip --log-file /tmp/pip.log install . 
pytest mock requests warcio \ + && py.test -v tests \ + && py.test -v --rethinkdb-dedup-url=rethinkdb://localhost/test1/dedup tests \ + && py.test -v --rethinkdb-big-table-url=rethinkdb://localhost/test2/captures tests \ + && py.test -v --rethinkdb-trough-db-url=rethinkdb://localhost/trough_configuration tests \ + " From b762d6468b40fc740c4ec38a575b8896cac86689 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 16 May 2018 14:25:01 -0700 Subject: [PATCH 07/33] just one should_dedup() for trough dedup fixes failing test and clarifies things --- setup.py | 2 +- warcprox/dedup.py | 10 ++++------ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/setup.py b/setup.py index eefcdb4..8c38b62 100755 --- a/setup.py +++ b/setup.py @@ -40,7 +40,7 @@ except: setuptools.setup( name='warcprox', - version='2.4b2.dev172', + version='2.4b2.dev173', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/warcprox/dedup.py b/warcprox/dedup.py index 5d5039f..be91874 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -326,10 +326,9 @@ class CdxServerDedupLoader(warcprox.BaseBatchPostfetchProcessor, DedupableMixin) if self.outq: self.outq.put(recorded_url) -class BatchTroughStorer(warcprox.BaseBatchPostfetchProcessor, DedupableMixin): +class BatchTroughStorer(warcprox.BaseBatchPostfetchProcessor): def __init__(self, trough_dedup_db, options=warcprox.Options()): warcprox.BaseBatchPostfetchProcessor.__init__(self, options) - DedupableMixin.__init__(self, options) self.trough_dedup_db = trough_dedup_db def _filter_and_bucketize(self, batch): @@ -341,7 +340,7 @@ class BatchTroughStorer(warcprox.BaseBatchPostfetchProcessor, DedupableMixin): for recorded_url in batch: if (recorded_url.warc_records and recorded_url.warc_records[0].type == b'response' - and self.should_dedup(recorded_url)): + and self.trough_dedup_db.should_dedup(recorded_url)): if (recorded_url.warcprox_meta and 'dedup-bucket' in 
recorded_url.warcprox_meta): bucket = recorded_url.warcprox_meta['dedup-bucket'] @@ -373,10 +372,9 @@ class BatchTroughStorer(warcprox.BaseBatchPostfetchProcessor, DedupableMixin): logging.warn( 'timed out saving dedup info to trough', exc_info=True) -class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor, DedupableMixin): +class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor): def __init__(self, trough_dedup_db, options=warcprox.Options()): warcprox.BaseBatchPostfetchProcessor.__init__(self, options) - DedupableMixin.__init__(self, options) self.trough_dedup_db = trough_dedup_db def _startup(self): @@ -391,7 +389,7 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor, DedupableMixin): for recorded_url in batch: if (recorded_url.response_recorder and recorded_url.payload_digest - and self.should_dedup(recorded_url)): + and self.trough_dedup_db.should_dedup(recorded_url)): if (recorded_url.warcprox_meta and 'dedup-bucket' in recorded_url.warcprox_meta): bucket = recorded_url.warcprox_meta['dedup-bucket'] From 997d4341fe3d99c4d2e65845edff97eb817c130c Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 18 May 2018 17:29:38 -0700 Subject: [PATCH 08/33] add some debug logging in BatchTroughLoader --- setup.py | 2 +- warcprox/dedup.py | 21 ++++++++++++++++++++- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 8c38b62..9e7db14 100755 --- a/setup.py +++ b/setup.py @@ -40,7 +40,7 @@ except: setuptools.setup( name='warcprox', - version='2.4b2.dev173', + version='2.4b2.dev174', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/warcprox/dedup.py b/warcprox/dedup.py index be91874..83f3e0a 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -373,6 +373,8 @@ class BatchTroughStorer(warcprox.BaseBatchPostfetchProcessor): 'timed out saving dedup info to trough', exc_info=True) class 
BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor): + logger = logging.getLogger("warcprox.dedup.BatchTroughLoader") + def __init__(self, trough_dedup_db, options=warcprox.Options()): warcprox.BaseBatchPostfetchProcessor.__init__(self, options) self.trough_dedup_db = trough_dedup_db @@ -386,6 +388,7 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor): be looked up. ''' buckets = collections.defaultdict(list) + discards = [] for recorded_url in batch: if (recorded_url.response_recorder and recorded_url.payload_digest @@ -396,6 +399,13 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor): else: bucket = '__unspecified__' buckets[bucket].append(recorded_url) + else: + discards.append( + warcprox.digest_str( + recorded_url.payload_digest, self.options.base32) + if recorded_url.payload_digest else 'n/a') + self.logger.debug( + 'filtered out digests (not loading dedup): %r', discards) return buckets def _build_key_index(self, batch): @@ -443,10 +453,19 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor): 'problem looking up dedup info for %s urls ' 'in bucket %s', len(buckets[bucket]), bucket, exc_info=True) + + if self.logger.isEnabledFor(logging.DEBUG): + dups = sorted([e['digest_key'] for e in future.result()]) + novel = sorted([ + k for k in key_index.keys() if k not in dups]) + self.logger.debug( + 'bucket %s: dups=%r novel=%r', + bucket, dups, novel) + except futures.TimeoutError as e: # the remaining threads actually keep running in this case, # there's no way to stop them, but that should be harmless - logging.warn( + self.logger.warn( 'timed out loading dedup info from trough', exc_info=True) class TroughDedupDb(DedupDb, DedupableMixin): From b7ebc384915f87127dda0db554f7c6fb90194539 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 21 May 2018 22:18:28 +0000 Subject: [PATCH 09/33] rename README.rst -> readme.rst --- README.rst => readme.rst | 0 setup.py | 2 +- warcprox/main.py | 2 +- 3 files changed, 2 
insertions(+), 2 deletions(-) rename README.rst => readme.rst (100%) diff --git a/README.rst b/readme.rst similarity index 100% rename from README.rst rename to readme.rst diff --git a/setup.py b/setup.py index 9e7db14..4fc1cbf 100755 --- a/setup.py +++ b/setup.py @@ -45,7 +45,7 @@ setuptools.setup( url='https://github.com/internetarchive/warcprox', author='Noah Levitt', author_email='nlevitt@archive.org', - long_description=open('README.rst').read(), + long_description=open('readme.rst').read(), license='GPL', packages=['warcprox'], install_requires=deps, diff --git a/warcprox/main.py b/warcprox/main.py index 6fb46ef..5f45a13 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -193,7 +193,7 @@ def _build_arg_parser(prog='warcprox'): action='append', help=( 'Qualified name of plugin class, e.g. "mypkg.mymod.MyClass". ' 'May be used multiple times to register multiple plugins. ' - 'See README.rst for more information.')) + 'See readme.rst for more information.')) arg_parser.add_argument('--version', action='version', version="warcprox {}".format(warcprox.__version__)) arg_parser.add_argument('-v', '--verbose', dest='verbose', action='store_true') From efc51a43617eb791e9829dd5ecff00e8bbf6a946 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 22 May 2018 11:59:06 -0700 Subject: [PATCH 10/33] stubby api docs --- api.rst | 110 +++++++++++++++++++++++++++++++++++++++++++++++++++++ readme.rst | 23 ++++++++--- 2 files changed, 127 insertions(+), 6 deletions(-) create mode 100644 api.rst diff --git a/api.rst b/api.rst new file mode 100644 index 0000000..87b444f --- /dev/null +++ b/api.rst @@ -0,0 +1,110 @@ +warcprox API +************ + +Means of Interacting with warcprox over http, aside from simply proxying urls. + +`/status` url +============= + +If warcprox is running at localhost:8000, http://localhost:8000/status returns +a json blob with a bunch of status info. 
For example: + +:: + + $ curl -sS http://localhost:8000/status + { + "rates_5min": { + "warc_bytes_per_sec": 0.0, + "urls_per_sec": 0.0, + "actual_elapsed": 277.2983281612396 + }, + "version": "2.4b2.dev174", + "load": 0.0, + "seconds_behind": 0.0, + "threads": 100, + "warc_bytes_written": 0, + "port": 8000, + "postfetch_chain": [ + { + "queued_urls": 0, + "processor": "SkipFacebookCaptchas" + }, + { + "queued_urls": 0, + "processor": "BatchTroughLoader" + }, + { + "queued_urls": 0, + "processor": "WarcWriterProcessor" + }, + { + "queued_urls": 0, + "processor": "BatchTroughStorer" + }, + { + "queued_urls": 0, + "processor": "RethinkStatsProcessor" + }, + { + "queued_urls": 0, + "processor": "CrawlLogger" + }, + { + "queued_urls": 0, + "processor": "TroughFeed" + }, + { + "queued_urls": 0, + "processor": "RunningStats" + } + ], + "queue_max_size": 500, + "role": "warcprox", + "queued_urls": 0, + "active_requests": 1, + "host": "wbgrp-svc405.us.archive.org", + "rates_15min": { + "warc_bytes_per_sec": 0.0, + "urls_per_sec": 0.0, + "actual_elapsed": 876.9885368347168 + }, + "unaccepted_requests": 0, + "urls_processed": 0, + "pid": 18841, + "address": "127.0.0.1", + "rates_1min": { + "warc_bytes_per_sec": 0.0, + "urls_per_sec": 0.0, + "actual_elapsed": 54.92501664161682 + }, + "start_time": 1526690353.4060142 + } + +`WARCPROX_WRITE_RECORD` http method +=================================== + +:: + + $ echo -ne 'WARCPROX_WRITE_RECORD special://url/some?thing HTTP/1.1\r\nWARC-Type: resource\r\ncontent-type: text/plain;charset=utf-8\r\ncontent-length: 29\r\n\r\ni am a warc record payload!\r\n' | ncat 127.0.0.1 8000 + HTTP/1.0 204 OK + Server: BaseHTTP/0.6 Python/3.6.3 + Date: Mon, 21 May 2018 23:33:31 GMT + +:: + + WARC/1.0 + WARC-Type: resource + WARC-Record-ID: + WARC-Date: 2018-05-21T23:33:31Z + WARC-Target-URI: special://url/some?thing + WARC-Block-Digest: sha1:a282cfe127ab8d51b315ff3d31de18614979d0df + WARC-Payload-Digest: sha1:a282cfe127ab8d51b315ff3d31de18614979d0df 
+ Content-Type: text/plain;charset=utf-8 + Content-Length: 29 + + i am a warc record payload! + + +`Warcprox-Meta` http request header +=================================== + diff --git a/readme.rst b/readme.rst index 113099b..090130e 100644 --- a/readme.rst +++ b/readme.rst @@ -1,5 +1,5 @@ warcprox - WARC writing MITM HTTP/S proxy ------------------------------------------ +***************************************** .. image:: https://travis-ci.org/internetarchive/warcprox.svg?branch=master :target: https://travis-ci.org/internetarchive/warcprox @@ -7,7 +7,7 @@ Based on the excellent and simple pymiproxy by Nadeem Douba. https://github.com/allfro/pymiproxy Install -~~~~~~~ +======= Warcprox runs on python 3.4+. @@ -26,7 +26,7 @@ You can also install the latest bleeding edge code: Trusting the CA cert -~~~~~~~~~~~~~~~~~~~~ +==================== For best results while browsing through warcprox, you need to add the CA cert as a trusted cert in your browser. If you don't do that, you will @@ -34,8 +34,19 @@ get the warning when you visit each new site. But worse, any embedded https content on a different server will simply fail to load, because the browser will reject the certificate without telling you. +API +=== + +For interacting with a running instance of warcprox. + +* `/status` url +* `WARCPROX_WRITE_RECORD` http method +* `Warcprox-Meta` http request header + +See ``_. + Plugins -~~~~~~~ +======= Warcprox supports a limited notion of plugins by way of the `--plugin` command line argument. Plugin classes are loaded from the regular python module search @@ -49,7 +60,7 @@ specifying `--plugin` multiples times. `A minimal example `__ Usage -~~~~~ +===== :: @@ -162,7 +173,7 @@ Usage -q, --quiet License -~~~~~~~ +======= Warcprox is a derivative work of pymiproxy, which is GPL. Thus warcprox is also GPL. 
From 44ca939cb6a8ab28518748de5ac8f6bf0fbe12e5 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 22 May 2018 12:02:49 -0700 Subject: [PATCH 11/33] double the backticks --- api.rst | 6 +++--- readme.rst | 22 +++++++++++----------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/api.rst b/api.rst index 87b444f..77d7ca4 100644 --- a/api.rst +++ b/api.rst @@ -3,7 +3,7 @@ warcprox API Means of Interacting with warcprox over http, aside from simply proxying urls. -`/status` url +``/status`` url ============= If warcprox is running at localhost:8000, http://localhost:8000/status returns @@ -80,7 +80,7 @@ a json blob with a bunch of status info. For example: "start_time": 1526690353.4060142 } -`WARCPROX_WRITE_RECORD` http method +``WARCPROX_WRITE_RECORD`` http method =================================== :: @@ -105,6 +105,6 @@ a json blob with a bunch of status info. For example: i am a warc record payload! -`Warcprox-Meta` http request header +``Warcprox-Meta`` http request header =================================== diff --git a/readme.rst b/readme.rst index 090130e..ffeabca 100644 --- a/readme.rst +++ b/readme.rst @@ -39,23 +39,23 @@ API For interacting with a running instance of warcprox. -* `/status` url -* `WARCPROX_WRITE_RECORD` http method -* `Warcprox-Meta` http request header +* ``/status`` url +* ``WARCPROX_WRITE_RECORD`` http method +* ``Warcprox-Meta`` http request header See ``_. Plugins ======= -Warcprox supports a limited notion of plugins by way of the `--plugin` command -line argument. Plugin classes are loaded from the regular python module search -path. They will be instantiated with one argument, a `warcprox.Options`, which -holds the values of all the command line arguments. Legacy plugins with -constructors that take no arguments are also supported. Plugins should either -have a method `notify(self, recorded_url, records)` or should subclass -`warcprox.BasePostfetchProcessor`. 
More than one plugin can be configured by -specifying `--plugin` multiples times. +Warcprox supports a limited notion of plugins by way of the ``--plugin`` +command line argument. Plugin classes are loaded from the regular python module +search path. They will be instantiated with one argument, a +``warcprox.Options``, which holds the values of all the command line arguments. +Legacy plugins with constructors that take no arguments are also supported. +Plugins should either have a method ``notify(self, recorded_url, records)`` or +should subclass ``warcprox.BasePostfetchProcessor``. More than one plugin can +be configured by specifying ``--plugin`` multiples times. `A minimal example `__ From 36f6696552be5830e6ab823e563ea23b813bdb28 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 22 May 2018 15:00:10 -0700 Subject: [PATCH 12/33] fix failure message in test_return_capture_timestamp --- tests/test_warcprox.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 079fdd1..7c5253b 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -726,14 +726,16 @@ def test_return_capture_timestamp(http_daemon, warcprox_, archiving_proxies): response = requests.get(url, proxies=archiving_proxies, headers=headers, stream=True) assert response.status_code == 200 assert response.headers['Warcprox-Meta'] - data = json.loads(response.headers['Warcprox-Meta']) - assert data['capture-metadata'] + response_meta = json.loads(response.headers['Warcprox-Meta']) + assert response_meta['capture-metadata'] try: - dt = datetime.datetime.strptime(data['capture-metadata']['timestamp'], + dt = datetime.datetime.strptime(response_meta['capture-metadata']['timestamp'], '%Y-%m-%dT%H:%M:%SZ') assert dt except ValueError: - pytest.fail('Invalid capture-timestamp format %s', data['capture-timestamp']) + pytest.fail( + 'Invalid http response warcprox-meta["capture-metadata"]["timestamp"]: %r', + 
meta['capture-metadata']['timestamp']) # wait for postfetch chain (or subsequent test could fail) wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1) From b26a5d2d73355b5cbd271984f45ba5c93139508c Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 22 May 2018 15:00:36 -0700 Subject: [PATCH 13/33] starting to talk about warcprox-meta --- api.rst | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 61 insertions(+), 8 deletions(-) diff --git a/api.rst b/api.rst index 77d7ca4..43732aa 100644 --- a/api.rst +++ b/api.rst @@ -4,7 +4,7 @@ warcprox API Means of Interacting with warcprox over http, aside from simply proxying urls. ``/status`` url -============= +=============== If warcprox is running at localhost:8000, http://localhost:8000/status returns a json blob with a bunch of status info. For example: @@ -81,16 +81,27 @@ a json blob with a bunch of status info. For example: } ``WARCPROX_WRITE_RECORD`` http method -=================================== +===================================== -:: +To make warcprox write an arbitrary warc record you can send it a special +request with http method ``WARCPROX_WRITE_RECORD``. The http request must +include the headers ``WARC-Type``, ``Content-Type``, and ``Content-Length``. +Warcprox will use these to populate the warc record. For example:: - $ echo -ne 'WARCPROX_WRITE_RECORD special://url/some?thing HTTP/1.1\r\nWARC-Type: resource\r\ncontent-type: text/plain;charset=utf-8\r\ncontent-length: 29\r\n\r\ni am a warc record payload!\r\n' | ncat 127.0.0.1 8000 + $ ncat --crlf 127.0.0.1 8000 < WARCPROX_WRITE_RECORD special://url/some?thing HTTP/1.1 + > WARC-Type: resource + > Content-type: text/plain;charset=utf-8 + > Content-length: 29 + > + > i am a warc record payload! + > EOF HTTP/1.0 204 OK Server: BaseHTTP/0.6 Python/3.6.3 - Date: Mon, 21 May 2018 23:33:31 GMT + Date: Tue, 22 May 2018 19:21:02 GMT -:: +On success warcprox responds with http status 204. 
For the request above +warcprox will write a warc record that looks like this:: WARC/1.0 WARC-Type: resource @@ -104,7 +115,49 @@ a json blob with a bunch of status info. For example: i am a warc record payload! - ``Warcprox-Meta`` http request header -=================================== +===================================== +``Warcprox-Meta`` is a special http request header that can be used to pass +configuration information and metadata with each proxy request to warcprox. The +value is a json blob. There are several fields understood by warcprox, and +arbitrary additional fields can be included. If warcprox doesn't recognize a +field it simply ignores it. Warcprox plugins could make use of custom fields, +for example. + +Warcprox strips the ``warcprox-meta`` header out before sending the request to +remote server, and also does not write it in the warc request record. + +:: + + Warcprox-Meta: {} + +- warc-prefix +- stats + - buckets +- dedup-bucket +- blocks +- limits +- soft-limits +- metadata +- accept +- dedup-ok # deprecate? + +Brozzler knows about ``warcprox-meta``. For information on configuring +``warcprox-meta`` in brozzler, see https://github.com/internetarchive/brozzler/blob/master/job-conf.rst#warcprox-meta + +``Warcprox-Meta`` http response header +====================================== + +In some cases warcprox will add a ``Warcprox-Meta`` header in the http response +that it sends to the client. Like the request header, the value is a json blob. +It is only included if something in the ``warcprox-meta`` request header calls +for it. Those cases are described above in the "``Warcprox-Meta`` http request header" section. 
+ +### - blocked-by-rule +### - reached-limit +### - reached-soft-limit +### - stats +### - capture-metadata +### +### Response codes 420, 430 From b562170403f21e01ce341c5ff774678511b908fb Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 25 May 2018 10:32:42 -0700 Subject: [PATCH 14/33] explain deduplication --- readme.rst | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/readme.rst b/readme.rst index ffeabca..fd3cb8c 100644 --- a/readme.rst +++ b/readme.rst @@ -1,4 +1,4 @@ -warcprox - WARC writing MITM HTTP/S proxy +Warcprox - WARC writing MITM HTTP/S proxy ***************************************** .. image:: https://travis-ci.org/internetarchive/warcprox.svg?branch=master :target: https://travis-ci.org/internetarchive/warcprox @@ -6,9 +6,10 @@ warcprox - WARC writing MITM HTTP/S proxy Based on the excellent and simple pymiproxy by Nadeem Douba. https://github.com/allfro/pymiproxy +.. contents:: + Install ======= - Warcprox runs on python 3.4+. To install latest release run: @@ -27,27 +28,46 @@ You can also install the latest bleeding edge code: Trusting the CA cert ==================== - For best results while browsing through warcprox, you need to add the CA cert as a trusted cert in your browser. If you don't do that, you will get the warning when you visit each new site. But worse, any embedded https content on a different server will simply fail to load, because the browser will reject the certificate without telling you. +Deduplication +============= +Warcprox avoids archiving redundant content by "deduplicating" it. The process +for deduplication works similarly to heritrix and other web archiving tools. + +1. while fetching url, calculate payload content digest (typically sha1) +2. look up digest in deduplication database (warcprox supports a few different + ones) +3. if found write warc ``revisit`` record referencing the url and capture time + of the previous capture +4. else (if not found) + a. 
write warc ``response`` record with full payload + b. store entry in deduplication database + +The dedup database is partitioned into different "buckets". Urls are +deduplicated only against other captures in the same bucket. If specified, the +``dedup-bucket`` field of the ``Warcprox-Meta`` http request header determines +the bucket, otherwise the default bucket is used. + +Deduplication can be disabled entirely by starting warcprox with the argument +``--dedup-db-file=/dev/null``. + API === - For interacting with a running instance of warcprox. * ``/status`` url * ``WARCPROX_WRITE_RECORD`` http method -* ``Warcprox-Meta`` http request header +* ``Warcprox-Meta`` http request header and response header See ``_. Plugins ======= - Warcprox supports a limited notion of plugins by way of the ``--plugin`` command line argument. Plugin classes are loaded from the regular python module search path. They will be instantiated with one argument, a From 02e96188c3a743930992a5ba78b0eb509b62f647 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 25 May 2018 10:33:45 -0700 Subject: [PATCH 15/33] barely starting to flesh out warcprox-meta section --- api.rst | 57 ++++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 46 insertions(+), 11 deletions(-) diff --git a/api.rst b/api.rst index 43732aa..8937ade 100644 --- a/api.rst +++ b/api.rst @@ -1,7 +1,9 @@ warcprox API ************ -Means of Interacting with warcprox over http, aside from simply proxying urls. +Means of interacting with warcprox over http, aside from simply proxying urls. + +.. contents:: ``/status`` url =============== @@ -132,16 +134,49 @@ remote server, and also does not write it in the warc request record. Warcprox-Meta: {} -- warc-prefix -- stats - - buckets -- dedup-bucket -- blocks -- limits -- soft-limits -- metadata -- accept -- dedup-ok # deprecate? +Warcprox-Meta fields +------------------- + +``warc-prefix`` (string) +~~~~~~~~~~~~~~~~~~~~~~~~ +Specifies a warc filename prefix. 
Warcprox will write the warc record for this +capture, if any, to a warc named accordingly. + +Example:: + + Warcprox-Meta: {"warc-prefix": "special-warc"} + +``stats`` (dictionary) +~~~~~~~~~~~~~~~~~~~~~~ +* buckets + +Example:: + + Warcprox-Meta: {"stats":{"buckets":["my-stats-bucket","all-the-stats"]}} + +``dedup-bucket`` (string) +~~~~~~~~~~~~~~~~~~~~~~~~~ +Specifies the deduplication bucket. For more information about deduplication +see ``_. + +Example:: + + Warcprox-Meta: {"dedup-bucket":"my-dedup-bucket"} + +``blocks`` +~~~~~~~~~~ + +``limits`` +~~~~~~~~~~ + +``soft-limits`` +~~~~~~~~~~~~~~~ + +``metadata`` (dictionary) +~~~~~~~~~~~~~~~~~~~~~~~~~ + +``accept`` +~~~~~~~~~~ Brozzler knows about ``warcprox-meta``. For information on configuring ``warcprox-meta`` in brozzler, see https://github.com/internetarchive/brozzler/blob/master/job-conf.rst#warcprox-meta From 401de2260098981d86bf4bcc908e9f672cd55235 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 25 May 2018 14:46:19 -0700 Subject: [PATCH 16/33] short sectioni on stats --- readme.rst | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/readme.rst b/readme.rst index fd3cb8c..5cdd7cc 100644 --- a/readme.rst +++ b/readme.rst @@ -34,6 +34,16 @@ get the warning when you visit each new site. But worse, any embedded https content on a different server will simply fail to load, because the browser will reject the certificate without telling you. +API +=== +For interacting with a running instance of warcprox. + +* ``/status`` url +* ``WARCPROX_WRITE_RECORD`` http method +* ``Warcprox-Meta`` http request header and response header + +See ``_. + Deduplication ============= Warcprox avoids archiving redundant content by "deduplicating" it. The process @@ -56,15 +66,20 @@ the bucket, otherwise the default bucket is used. Deduplication can be disabled entirely by starting warcprox with the argument ``--dedup-db-file=/dev/null``. 
-API
-===
-For interacting with a running instance of warcprox.
+Statistics
+==========
+Warcprox keeps some crawl statistics and stores them in sqlite or rethinkdb.
+These are consulted when enforcing ``limits`` and ``soft-limits`` (see
+``_), and can also be consulted by other
+processes outside of warcprox, for reporting etc.
 
-* ``/status`` url
-* ``WARCPROX_WRITE_RECORD`` http method
-* ``Warcprox-Meta`` http request header and response header
+This is what they look like currently in sqlite, the default store::
 
-See ``_.
+    sqlite> select * from buckets_of_stats order by bucket desc;
+    bucket           stats
+    ---------------  ---------------------------------------------------------------------------------------------
+    __unspecified__  {"bucket":"__unspecified__","total":{"urls":37,"wire_bytes":1502781},"new":{"urls":15,"wire_bytes":1179906},"revisit":{"urls":22,"wire_bytes":322875}}
+    __all__          {"bucket":"__all__","total":{"urls":37,"wire_bytes":1502781},"new":{"urls":15,"wire_bytes":1179906},"revisit":{"urls":22,"wire_bytes":322875}}
 
 Plugins
 =======

From 4bd49b61a9929f9e89a1186ffccfb979257e3ac1 Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Fri, 25 May 2018 15:26:26 -0700
Subject: [PATCH 17/33] starting to explain some warcprox-meta fields

---
 api.rst           | 46 ++++++++++++++++++++++++++++++++++++++++++----
 warcprox/stats.py |  2 +-
 2 files changed, 43 insertions(+), 5 deletions(-)

diff --git a/api.rst b/api.rst
index 8937ade..3b2f0b3 100644
--- a/api.rst
+++ b/api.rst
@@ -134,6 +134,13 @@ remote server, and also does not write it in the warc request record.
 
     Warcprox-Meta: {}
 
+Brozzler knows about ``warcprox-meta``. For information on configuring
+it in brozzler, see
+`https://github.com/internetarchive/brozzler/blob/master/job-conf.rst#warcprox-meta`_.
+``Warcprox-Meta`` is often a very important part of brozzler job configuration.
+It is the way url and data quotas (limits) on jobs, seeds, and hosts are
+implemented, among other things.
+ Warcprox-Meta fields ------------------- @@ -148,11 +155,24 @@ Example:: ``stats`` (dictionary) ~~~~~~~~~~~~~~~~~~~~~~ -* buckets +``stats`` is a dictionary with only one field understood by warcprox, +``"buckets"``. The value of ``"buckets"`` is a list of strings and/or +dictionaries. A string signifies the name of the bucket; a dictionary is +expected to have at least an item with key ``"bucket"`` whose value is the name +of the bucket. The other currently recognized key is ``"tally-domains"``, which +if supplied should be a list of domains. This instructs warcprox to +additionally tally substats of the given bucket by domain. Host stats are +stored in the stats table under the key +``{parent-bucket}:{domain(normalized)}``, e.g. `"bucket2:foo.bar.com"` for the +example below. -Example:: +Examples:: Warcprox-Meta: {"stats":{"buckets":["my-stats-bucket","all-the-stats"]}} + Warcprox-Meta: {"stats":{"buckets":["bucket1",{"bucket":"bucket2","tally-domains":["foo.bar.com","192.168.10.20"}]}} + +See ``_ for more information on statistics kept by +warcprox. ``dedup-bucket`` (string) ~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -166,20 +186,38 @@ Example:: ``blocks`` ~~~~~~~~~~ +Example:: + + Warcprox-Meta: {"blocks": [{"ssurt": "com,example,//https:/"}, {"domain": "malware.us", "substring": "wp-login.php?action=logout"}]} + ``limits`` ~~~~~~~~~~ +Example:: + + {"stats": {"buckets": ["test_limits_bucket"]}, "limits": {"test_limits_bucket/total/urls": 10}} + ``soft-limits`` ~~~~~~~~~~~~~~~ +Example:: + + Warcprox-Meta: {"stats": {"buckets": [{"bucket": "test_domain_doc_limit_bucket", "tally-domains": ["foo.localhost"]}]}, "soft-limits": {"test_domain_doc_limit_bucket:foo.localhost/total/urls": 10}} + + ``metadata`` (dictionary) ~~~~~~~~~~~~~~~~~~~~~~~~~ +Example:: + + Warcprox-Meta: {"metadata": {"seed": "http://example.com/seed", "description": "here's some information about this crawl job. blah blah"} + ``accept`` ~~~~~~~~~~ -Brozzler knows about ``warcprox-meta``. 
For information on configuring -``warcprox-meta`` in brozzler, see https://github.com/internetarchive/brozzler/blob/master/job-conf.rst#warcprox-meta +Example:: + + request_meta = {"accept": ["capture-metadata"]} ``Warcprox-Meta`` http response header ====================================== diff --git a/warcprox/stats.py b/warcprox/stats.py index db2493c..4de5fef 100644 --- a/warcprox/stats.py +++ b/warcprox/stats.py @@ -166,7 +166,7 @@ class StatsProcessor(warcprox.BaseBatchPostfetchProcessor): Example Warcprox-Meta header (a real one will likely have other sections besides 'stats'): - Warcprox-Meta: {'stats':{'buckets':['bucket1',{'bucket':'bucket2','tally-domains':['foo.bar.com','192.168.10.20'}]}} + Warcprox-Meta: {"stats":{"buckets":["bucket1",{"bucket":"bucket2","tally-domains":["foo.bar.com","192.168.10.20"}]}} ''' buckets = ["__all__"] if (recorded_url.warcprox_meta From 2c850876e8207aaf9f5162898233d493aa392348 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 25 May 2018 16:06:12 -0700 Subject: [PATCH 18/33] explain warcprox-meta "blocks" --- api.rst | 42 +++++++++++++++++++++++++++++++++++++----- warcprox/warcproxy.py | 6 +++--- 2 files changed, 40 insertions(+), 8 deletions(-) diff --git a/api.rst b/api.rst index 3b2f0b3..d9419b3 100644 --- a/api.rst +++ b/api.rst @@ -138,8 +138,8 @@ Brozzler knows about ``warcprox-meta``. For information on configuring it in brozzler, see `https://github.com/internetarchive/brozzler/blob/master/job-conf.rst#warcprox-meta`_. ``Warcprox-Meta`` is often a very important part of brozzler job configuration. -It is the way url and data quotas (limits) on jobs, seeds, and hosts are -implemented, among other things. +It is the way url and data limits on jobs, seeds, and hosts are implemented, +among other things. Warcprox-Meta fields ------------------- @@ -183,12 +183,44 @@ Example:: Warcprox-Meta: {"dedup-bucket":"my-dedup-bucket"} -``blocks`` -~~~~~~~~~~ +``blocks`` (list) +~~~~~~~~~~~~~~~~~ +List of url match rules. 
Url match rules are somewhat described at +https://github.com/internetarchive/brozzler/blob/master/job-conf.rst#scoping +and https://github.com/iipc/urlcanon/blob/e2ab3524e/python/urlcanon/rules.py#L70. +(TODO: write a better doc and link to it) Example:: - Warcprox-Meta: {"blocks": [{"ssurt": "com,example,//https:/"}, {"domain": "malware.us", "substring": "wp-login.php?action=logout"}]} + Warcprox-Meta: {"blocks": [{"ssurt": "com,example,//http:/"}, {"domain": "malware.us", "substring": "wp-login.php?action=logout"}]} + +If any of the rules match the url being requested, warcprox aborts normal +processing and responds with a http 403. The http response includes +a ``Warcprox-Meta`` **response** header with one field, `"blocked-by-rule"`, +which reproduces the value of the match rule that resulted in the block. The +presence of the ``warcprox-meta`` response header can be used by the client to +distinguish this type of a response from a 403 from the remote url being +requested. + +For example:: + + $ curl -iksS --proxy localhost:8000 --header 'Warcprox-Meta: {"blocks": [{"ssurt": "com,example,//http:/"}, {"domain": "malware.us", "substring": "wp-login.php?action=logout"}]}' http://example.com/foo + HTTP/1.0 403 Forbidden + Server: BaseHTTP/0.6 Python/3.6.3 + Date: Fri, 25 May 2018 22:46:42 GMT + Content-Type: text/plain;charset=utf-8 + Connection: close + Content-Length: 111 + Warcprox-Meta: {"blocked-by-rule":{"ssurt":"com,example,//http:/"}} + + request rejected by warcprox: blocked by rule found in Warcprox-Meta header: {"ssurt": "com,example,//http:/"} + +You might be wondering why ``blocks`` is necessary. Why would the warcprox +client make a request that it should already know will be blocked by the proxy? +The answer is that the request may be initiated somewhere where it's not +possible, or at least not convenient, to evaluate the block rules. 
In +particular, this circumstance prevails when the browser controlled by brozzler +is requesting images, javascript, css, and so on, embedded in a page. ``limits`` ~~~~~~~~~~ diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 0d93e5c..2050807 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -72,13 +72,13 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): block_rule = urlcanon.MatchRule(**rule) if block_rule.applies(url): body = ("request rejected by warcprox: blocked by " - "rule found in Warcprox-Meta header: %s" - % rule).encode("utf-8") + "rule found in Warcprox-Meta header: %s\n" + % json.dumps(rule)).encode("utf-8") self.send_response(403, "Forbidden") self.send_header("Content-Type", "text/plain;charset=utf-8") self.send_header("Connection", "close") self.send_header("Content-Length", len(body)) - response_meta = {"blocked-by-rule":rule} + response_meta = {"blocked-by-rule": rule} self.send_header( "Warcprox-Meta", json.dumps(response_meta, separators=(",",":"))) From 1e76ed33027833e95f4a78eb664e7d2b0e545887 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 25 May 2018 16:38:19 -0700 Subject: [PATCH 19/33] working on "limits" and "soft-limits" --- api.rst | 52 +++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 45 insertions(+), 7 deletions(-) diff --git a/api.rst b/api.rst index d9419b3..71cc59a 100644 --- a/api.rst +++ b/api.rst @@ -222,20 +222,57 @@ possible, or at least not convenient, to evaluate the block rules. In particular, this circumstance prevails when the browser controlled by brozzler is requesting images, javascript, css, and so on, embedded in a page. 
-``limits``
-~~~~~~~~~~
+``limits`` (dictionary)
+~~~~~~~~~~~~~~~~~~~~~~~
 
 Example::
 
     {"stats": {"buckets": ["test_limits_bucket"]}, "limits": {"test_limits_bucket/total/urls": 10}}
 
-``soft-limits``
-~~~~~~~~~~~~~~~
+::
+
+    $ curl -iksS --proxy localhost:8000 --header 'Warcprox-Meta: {"stats": {"buckets": ["test_limits_bucket"]}, "limits": {"test_limits_bucket/total/urls": 10}}' http://example.com/foo
+    HTTP/1.0 420 Reached limit
+    Server: BaseHTTP/0.6 Python/3.6.3
+    Date: Fri, 25 May 2018 23:08:32 GMT
+    Content-Type: text/plain;charset=utf-8
+    Connection: close
+    Content-Length: 77
+    Warcprox-Meta: {"stats":{"test_limits_bucket":{"bucket":"test_limits_bucket","total":{"urls":10,"wire_bytes":15840},"new":{"urls":0,"wire_bytes":0},"revisit":{"urls":10,"wire_bytes":15840}}},"reached-limit":{"test_limits_bucket/total/urls":10}}
+
+    request rejected by warcprox: reached limit test_limits_bucket/total/urls=10
+
+``soft-limits`` (dictionary)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+From warcprox's perspective ``soft-limits`` work almost exactly the same way
+as ``limits``. The only difference is that when a soft limit is hit, warcprox
+responds with an http 430 "Reached soft limit" instead of http 420.
+
+Warcprox clients might treat a 430 very differently from a 420. From brozzler's
+perspective, for instance, ``soft-limits`` are very different from ``limits``.
+When brozzler receives a 420 from warcprox because a ``limit`` has been
+reached, this means that crawling for that seed is finished, and brozzler sets
+about finalizing the crawl of that seed. On the other hand, brozzler blissfully
+ignores 430 responses, because soft limits only apply to a particular bucket
+(like a domain), and don't have any effect on crawling of urls that don't fall
+in that bucket.
Example:: Warcprox-Meta: {"stats": {"buckets": [{"bucket": "test_domain_doc_limit_bucket", "tally-domains": ["foo.localhost"]}]}, "soft-limits": {"test_domain_doc_limit_bucket:foo.localhost/total/urls": 10}} +:: + + $ curl -iksS --proxy localhost:8000 --header 'Warcprox-Meta: {"stats": {"buckets": ["test_limits_bucket"]}, "soft-limits": {"test_limits_bucket/total/urls": 10}}' http://example.com/foo + HTTP/1.0 430 Reached soft limit + Server: BaseHTTP/0.6 Python/3.6.3 + Date: Fri, 25 May 2018 23:12:06 GMT + Content-Type: text/plain;charset=utf-8 + Connection: close + Content-Length: 82 + Warcprox-Meta: {"stats":{"test_limits_bucket":{"bucket":"test_limits_bucket","total":{"urls":10,"wire_bytes":15840},"new":{"urls":0,"wire_bytes":0},"revisit":{"urls":10,"wire_bytes":15840}}},"reached-soft-limit":{"test_limits_bucket/total/urls":10}} + + request rejected by warcprox: reached soft limit test_limits_bucket/total/urls=10 ``metadata`` (dictionary) ~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -244,8 +281,8 @@ Example:: Warcprox-Meta: {"metadata": {"seed": "http://example.com/seed", "description": "here's some information about this crawl job. blah blah"} -``accept`` -~~~~~~~~~~ +``accept`` (list) +~~~~~~~~~~~~~~~~~ Example:: @@ -257,7 +294,8 @@ Example:: In some cases warcprox will add a ``Warcprox-Meta`` header in the http response that it sends to the client. Like the request header, the value is a json blob. It is only included if something in the ``warcprox-meta`` request header calls -for it. Those cases are described above in the "``Warcprox-Meta`` http request header" section. +for it. Those cases are described above in the +`#warcprox-meta-http-request-header`_ section. 
### - blocked-by-rule ### - reached-limit From 195faa5cff2006811cab5b92c9151ab1605de1c4 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 25 May 2018 17:35:32 -0700 Subject: [PATCH 20/33] new checks exposing bug in limits enforcement --- tests/test_warcprox.py | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 7c5253b..0e60319 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -709,6 +709,7 @@ def test_limits(http_daemon, warcprox_, archiving_proxies): # wait for postfetch chain wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 10) + # next fetch hits the limit response = requests.get(url, proxies=archiving_proxies, headers=headers, stream=True) assert response.status_code == 420 assert response.reason == "Reached limit" @@ -717,6 +718,14 @@ def test_limits(http_daemon, warcprox_, archiving_proxies): assert response.headers["content-type"] == "text/plain;charset=utf-8" assert response.raw.data == b"request rejected by warcprox: reached limit test_limits_bucket/total/urls=10\n" + # make sure limit doesn't get applied to a different stats bucket + request_meta = {"stats":{"buckets":["no_limits_bucket"]},"limits":{"test_limits_bucket/total/urls":10}} + headers = {"Warcprox-Meta": json.dumps(request_meta)} + response = requests.get(url, proxies=archiving_proxies, headers=headers, stream=True) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'i!' + assert response.content == b'I am the warcprox test payload! 
jjjjjjjjjj!\n' + def test_return_capture_timestamp(http_daemon, warcprox_, archiving_proxies): urls_before = warcprox_.proxy.running_stats.urls @@ -999,6 +1008,7 @@ def test_domain_doc_soft_limit( http_daemon, https_daemon, warcprox_, archiving_proxies): urls_before = warcprox_.proxy.running_stats.urls + # ** comment is obsolete (server is multithreaded) but still useful ** # we need to clear the connection pool here because # - connection pool already may already have an open connection localhost # - we're about to make a connection to foo.localhost @@ -1134,6 +1144,20 @@ def test_domain_doc_soft_limit( assert response.headers["content-type"] == "text/plain;charset=utf-8" assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_doc_limit_bucket:foo.localhost/total/urls=10\n" + # make sure soft limit doesn't get applied to a different stats bucket + request_meta = { + "stats": {"buckets": [{"bucket":"no_limit_bucket","tally-domains":["foo.localhost"]}]}, + "soft-limits": {"test_domain_doc_limit_bucket:foo.localhost/total/urls":10}, + } + headers = {"Warcprox-Meta": json.dumps(request_meta)} + url = 'http://zuh.foo.localhost:{}/o/p'.format(http_daemon.server_port) + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True, + verify=False) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'o!' + assert response.content == b'I am the warcprox test payload! 
pppppppppp!\n' + def test_domain_data_soft_limit( http_daemon, https_daemon, warcprox_, archiving_proxies): urls_before = warcprox_.proxy.running_stats.urls @@ -1228,6 +1252,19 @@ def test_domain_data_soft_limit( ### assert response.headers["content-type"] == "text/plain;charset=utf-8" ### assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_data_limit_bucket:xn--zz-2ka.localhost/new/wire_bytes=200\n" + # make sure soft limit doesn't get applied to a different stats bucket + request_meta = { + "stats": {"buckets": [{"bucket":"no_limit_bucket","tally-domains":['ÞzZ.LOCALhost']}]}, + "soft-limits": {"test_domain_data_limit_bucket:ÞZZ.localhost/new/wire_bytes":200}, + } + headers = {"Warcprox-Meta": json.dumps(request_meta)} + url = 'http://ÞZz.localhost:{}/y/z'.format(http_daemon.server_port) + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'y!' + assert response.content == b'I am the warcprox test payload! zzzzzzzzzz!\n' + # XXX this test relies on a tor proxy running at localhost:9050 with a working # connection to the internet, and relies on a third party site (facebook) being # up and behaving a certain way From 07dc978f093d1a48eb90daf1e60a8dc232611bc8 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 25 May 2018 17:36:26 -0700 Subject: [PATCH 21/33] docs still in progress --- api.rst | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/api.rst b/api.rst index 71cc59a..f3f958a 100644 --- a/api.rst +++ b/api.rst @@ -136,7 +136,7 @@ remote server, and also does not write it in the warc request record. Brozzler knows about ``warcprox-meta``. For information on configuring it in brozzler, see -`https://github.com/internetarchive/brozzler/blob/master/job-conf.rst#warcprox-meta`_. 
+https://github.com/internetarchive/brozzler/blob/master/job-conf.rst#warcprox-meta. ``Warcprox-Meta`` is often a very important part of brozzler job configuration. It is the way url and data limits on jobs, seeds, and hosts are implemented, among other things. @@ -156,14 +156,14 @@ Example:: ``stats`` (dictionary) ~~~~~~~~~~~~~~~~~~~~~~ ``stats`` is a dictionary with only one field understood by warcprox, -``"buckets"``. The value of ``"buckets"`` is a list of strings and/or +``buckets``. The value of ``buckets`` is a list of strings and/or dictionaries. A string signifies the name of the bucket; a dictionary is -expected to have at least an item with key ``"bucket"`` whose value is the name -of the bucket. The other currently recognized key is ``"tally-domains"``, which +expected to have at least an item with key ``bucket`` whose value is the name +of the bucket. The other currently recognized key is ``tally-domains``, which if supplied should be a list of domains. This instructs warcprox to additionally tally substats of the given bucket by domain. Host stats are stored in the stats table under the key -``{parent-bucket}:{domain(normalized)}``, e.g. `"bucket2:foo.bar.com"` for the +``{parent-bucket}:{domain(normalized)}``, e.g. ``"bucket2:foo.bar.com"`` for the example below. Examples:: @@ -196,13 +196,13 @@ Example:: If any of the rules match the url being requested, warcprox aborts normal processing and responds with a http 403. The http response includes -a ``Warcprox-Meta`` **response** header with one field, `"blocked-by-rule"`, +a ``Warcprox-Meta`` **response** header with one field, ``blocked-by-rule``, which reproduces the value of the match rule that resulted in the block. The presence of the ``warcprox-meta`` response header can be used by the client to distinguish this type of a response from a 403 from the remote url being requested. 
-For example:: +An example:: $ curl -iksS --proxy localhost:8000 --header 'Warcprox-Meta: {"blocks": [{"ssurt": "com,example,//http:/"}, {"domain": "malware.us", "substring": "wp-login.php?action=logout"}]}' http://example.com/foo HTTP/1.0 403 Forbidden @@ -217,10 +217,10 @@ For example:: You might be wondering why ``blocks`` is necessary. Why would the warcprox client make a request that it should already know will be blocked by the proxy? -The answer is that the request may be initiated somewhere where it's not -possible, or at least not convenient, to evaluate the block rules. In -particular, this circumstance prevails when the browser controlled by brozzler -is requesting images, javascript, css, and so on, embedded in a page. +The answer is that the request may be initiated somewhere where it's difficult +to evaluate the block rules. In particular, this circumstance prevails when the +browser controlled by brozzler is requesting images, javascript, css, and so +on, embedded in a page. ``limits`` (dictionary) ~~~~~~~~~~~~~~~~~~~~~~~ From d9e0ed31f28111d295f2ec51594cac42566adced Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 29 May 2018 12:18:51 -0700 Subject: [PATCH 22/33] fix bug in limits enforcement enforce limit only if url is in stats bucket that limit applies to! --- tests/test_warcprox.py | 6 +++ warcprox/stats.py | 88 +++++++++++++++++++++++------------------- warcprox/warcproxy.py | 40 +++++++++---------- 3 files changed, 74 insertions(+), 60 deletions(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 0e60319..0deecc6 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1158,6 +1158,9 @@ def test_domain_doc_soft_limit( assert response.headers['warcprox-test-header'] == 'o!' assert response.content == b'I am the warcprox test payload! 
pppppppppp!\n' + # wait for postfetch chain + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 22) + def test_domain_data_soft_limit( http_daemon, https_daemon, warcprox_, archiving_proxies): urls_before = warcprox_.proxy.running_stats.urls @@ -1265,6 +1268,9 @@ def test_domain_data_soft_limit( assert response.headers['warcprox-test-header'] == 'y!' assert response.content == b'I am the warcprox test payload! zzzzzzzzzz!\n' + # wait for postfetch chain + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 5) + # XXX this test relies on a tor proxy running at localhost:9050 with a working # connection to the internet, and relies on a third party site (facebook) being # up and behaving a certain way diff --git a/warcprox/stats.py b/warcprox/stats.py index 4de5fef..85539e2 100644 --- a/warcprox/stats.py +++ b/warcprox/stats.py @@ -53,6 +53,53 @@ def _empty_bucket(bucket): }, } +def unravel_buckets(url, warcprox_meta): + ''' + Unravels bucket definitions in Warcprox-Meta header. Each bucket + definition can either be a string, which signifies the name of the + bucket, or a dict. If a dict it is expected to have at least an item + with key 'bucket' whose value is the name of the bucket. The other + currently recognized item is 'tally-domains', which if supplied should + be a list of domains. This instructs warcprox to additionally tally + substats of the given bucket by domain. Host stats are stored in the + stats table under the key '{parent-bucket}:{domain(normalized)}'. 
+ + Returns: + list of strings + + Example Warcprox-Meta header (a real one will likely have other + sections besides 'stats'): + + Warcprox-Meta: {"stats":{"buckets":["bucket1",{"bucket":"bucket2","tally-domains":["foo.bar.com","192.168.10.20"}]}} + + In this case the return value would be + ["bucket1","bucket2","bucket2:foo.bar.com","bucket2:192.168.10.20"] + ''' + buckets = ["__all__"] + if (warcprox_meta and "stats" in warcprox_meta + and "buckets" in warcprox_meta["stats"]): + for bucket in warcprox_meta["stats"]["buckets"]: + if isinstance(bucket, dict): + if not 'bucket' in bucket: + self.logger.warn( + 'ignoring invalid stats bucket in ' + 'warcprox-meta header %s', bucket) + continue + buckets.append(bucket['bucket']) + if bucket.get('tally-domains'): + canon_url = urlcanon.semantic(url) + for domain in bucket['tally-domains']: + domain = urlcanon.normalize_host(domain).decode('ascii') + if urlcanon.url_matches_domain(canon_url, domain): + buckets.append( + '%s:%s' % (bucket['bucket'], domain)) + else: + buckets.append(bucket) + else: + buckets.append("__unspecified__") + + return buckets + class StatsProcessor(warcprox.BaseBatchPostfetchProcessor): logger = logging.getLogger("warcprox.stats.StatsProcessor") @@ -153,46 +200,7 @@ class StatsProcessor(warcprox.BaseBatchPostfetchProcessor): return None def buckets(self, recorded_url): - ''' - Unravels bucket definitions in Warcprox-Meta header. Each bucket - definition can either be a string, which signifies the name of the - bucket, or a dict. If a dict it is expected to have at least an item - with key 'bucket' whose value is the name of the bucket. The other - currently recognized item is 'tally-domains', which if supplied should - be a list of domains. This instructs warcprox to additionally tally - substats of the given bucket by domain. Host stats are stored in the - stats table under the key '{parent-bucket}:{domain(normalized)}'. 
- - Example Warcprox-Meta header (a real one will likely have other - sections besides 'stats'): - - Warcprox-Meta: {"stats":{"buckets":["bucket1",{"bucket":"bucket2","tally-domains":["foo.bar.com","192.168.10.20"}]}} - ''' - buckets = ["__all__"] - if (recorded_url.warcprox_meta - and "stats" in recorded_url.warcprox_meta - and "buckets" in recorded_url.warcprox_meta["stats"]): - for bucket in recorded_url.warcprox_meta["stats"]["buckets"]: - if isinstance(bucket, dict): - if not 'bucket' in bucket: - self.logger.warn( - 'ignoring invalid stats bucket in ' - 'warcprox-meta header %s', bucket) - continue - buckets.append(bucket['bucket']) - if bucket.get('tally-domains'): - url = urlcanon.semantic(recorded_url.url) - for domain in bucket['tally-domains']: - domain = urlcanon.normalize_host(domain).decode('ascii') - if urlcanon.url_matches_domain(url, domain): - buckets.append( - '%s:%s' % (bucket['bucket'], domain)) - else: - buckets.append(bucket) - else: - buckets.append("__unspecified__") - - return buckets + return unravel_buckets(recorded_url.url, recorded_url.warcprox_meta) class RethinkStatsProcessor(StatsProcessor): logger = logging.getLogger("warcprox.stats.RethinkStatsProcessor") diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 2050807..417f450 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -92,26 +92,26 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): self.client_address[0], self.command, self.url, rule)) - def _enforce_limit(self, limit_key, limit_value, soft=False): + def _enforce_limit(self, buckets, limit_key, limit_value, soft=False): if not self.server.stats_db: return - bucket0, bucket1, bucket2 = limit_key.rsplit("/", 2) - _limit_key = limit_key - # if limit_key looks like 'job1:foo.com/total/urls' then we only want - # to apply this rule if the requested url is within domain - bucket0_fields = bucket0.split(':') - if len(bucket0_fields) == 2: - domain = urlcanon.normalize_host(bucket0_fields[1]) 
- if not urlcanon.host_matches_domain(self.hostname, domain): - return # else host matches, go ahead and enforce the limit - bucket0 = '%s:%s' % (bucket0_fields[0], domain.decode('ascii')) - _limit_key = '%s/%s/%s' % (bucket0, bucket1, bucket2) + # parse limit key + bucket0, bucket1, bucket2 = limit_key.rsplit("/", 2) + # normalize domain if part of bucket + if ":" in bucket0: + b, raw_domain = bucket0.split(":", 1) + domain = urlcanon.normalize_host(raw_domain).decode("ascii") + bucket0 = "%s:%s" % (b, domain) + limit_key = "%s/%s/%s" % (bucket0, bucket1, bucket2) + + if not bucket0 in buckets: + return value = self.server.stats_db.value(bucket0, bucket1, bucket2) if value and limit_value and limit_value > 0 and value >= limit_value: body = ("request rejected by warcprox: reached %s %s=%s\n" % ( - "soft limit" if soft else "limit", _limit_key, + "soft limit" if soft else "limit", limit_key, limit_value)).encode("utf-8") if soft: self.send_response(430, "Reached soft limit") @@ -124,12 +124,11 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): "stats": {bucket0:self.server.stats_db.value(bucket0)} } if soft: - response_meta["reached-soft-limit"] = {_limit_key:limit_value} + response_meta["reached-soft-limit"] = {limit_key:limit_value} else: - response_meta["reached-limit"] = {_limit_key:limit_value} + response_meta["reached-limit"] = {limit_key:limit_value} self.send_header( - "Warcprox-Meta", - json.dumps(response_meta, separators=(",",":"))) + "Warcprox-Meta", json.dumps(response_meta, separators=",:")) self.end_headers() if self.command != "HEAD": self.wfile.write(body) @@ -139,7 +138,7 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): self.client_address[0], 430 if soft else 420, self.command, self.url, "soft limit" if soft else "limit", - _limit_key, limit_value)) + limit_key, limit_value)) def _enforce_limits(self, warcprox_meta): """ @@ -147,14 +146,15 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): 
warcprox.RequestBlockedByRule if a limit specified in warcprox_meta is reached. """ + buckets = warcprox.stats.unravel_buckets(self.url, warcprox_meta) if warcprox_meta and "limits" in warcprox_meta: for item in warcprox_meta["limits"].items(): limit_key, limit_value = item - self._enforce_limit(limit_key, limit_value, soft=False) + self._enforce_limit(buckets, limit_key, limit_value, soft=False) if warcprox_meta and "soft-limits" in warcprox_meta: for item in warcprox_meta["soft-limits"].items(): limit_key, limit_value = item - self._enforce_limit(limit_key, limit_value, soft=True) + self._enforce_limit(buckets, limit_key, limit_value, soft=True) def _security_check(self, warcprox_meta): ''' From 6256ec6a07b1d402def96d02cfb5bff0f260f823 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 29 May 2018 13:08:34 -0700 Subject: [PATCH 23/33] add another "wait" to fix failing test --- tests/test_warcprox.py | 3 +++ warcprox/stats.py | 2 -- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 0deecc6..13b6bad 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -726,6 +726,9 @@ def test_limits(http_daemon, warcprox_, archiving_proxies): assert response.headers['warcprox-test-header'] == 'i!' assert response.content == b'I am the warcprox test payload! 
jjjjjjjjjj!\n' + # wait for postfetch chain + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 11) + def test_return_capture_timestamp(http_daemon, warcprox_, archiving_proxies): urls_before = warcprox_.proxy.running_stats.urls diff --git a/warcprox/stats.py b/warcprox/stats.py index 85539e2..64ff2d7 100644 --- a/warcprox/stats.py +++ b/warcprox/stats.py @@ -309,11 +309,9 @@ class RunningStats: need_ten_sec_snap = (now - self.ten_sec_snaps[0][0]) // 10 > (self.ten_sec_snaps[-1][0] - self.ten_sec_snaps[0][0]) // 10 if need_minute_snap: self.minute_snaps.append((now, self.urls, self.warc_bytes)) - logging.debug('added minute snap %r', self.minute_snaps[-1]) if need_ten_sec_snap: self.ten_sec_snaps.popleft() self.ten_sec_snaps.append((now, self.urls, self.warc_bytes)) - logging.trace('rotated in ten second snap %r', self.ten_sec_snaps[-1]) def _closest_ten_sec_snap(self, t): # it's a deque so iterating over it is faster than indexed lookup From 8877259b7d7421ea4323a396d392d958697c4b8b Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 29 May 2018 16:57:15 -0700 Subject: [PATCH 24/33] more progress on documenting "limits" --- api.rst | 4 ++++ readme.rst | 23 +++++++++++++++++++++-- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/api.rst b/api.rst index f3f958a..6104b53 100644 --- a/api.rst +++ b/api.rst @@ -224,6 +224,10 @@ on, embedded in a page. ``limits`` (dictionary) ~~~~~~~~~~~~~~~~~~~~~~~ +Specifies quantitative limits for warcprox to enforce. The structure of the +dictionary is ``{stats_key: numerical_limit, ...}`` where stats key has the +format ``"bucket/sub-bucket/statistic"``. See `readme.rst#statistics`_ for +further explanation of what "bucket", "sub-bucket", and "statistic" mean here. 
Example:: diff --git a/readme.rst b/readme.rst index 5cdd7cc..44ae1bb 100644 --- a/readme.rst +++ b/readme.rst @@ -69,11 +69,30 @@ Deduplication can be disabled entirely by starting warcprox with the argument Statistics ========== Warcprox keeps some crawl statistics and stores them in sqlite or rethinkdb. -These are consulting when enforcing ``limits`` and ``soft-limits`` (see +These are consulted for enforcing ``limits`` and ``soft-limits`` (see ``_), and can also be consulted by other processes outside of warcprox, for reporting etc. -This is what they look like currently in sqlite, the default store:: +Statistics are grouped by "bucket". Every capture is counted as part of the +``__all__`` bucket. Other buckets can be specified in the ``Warcprox-Meta`` +request header. The fallback bucket in case none is specified is called +``__unspecified__``. + +Within each bucket are three sub-buckets: +* "new" - tallies captures for which a complete record (usually a ``response`` + record) was written to warc +* "revisit" - tallies captures for which a ``revisit`` record was written to + warc +* "total" - includes all urls processed, even those not written to warc (so the + numbers may be greater than new + revisit) + +Within each of these sub-buckets we keep two statistics: +* urls - simple count of urls +* wire_bytes - sum of bytes received over the wire from the remote server for + each url + +For historical reasons, statistics are stored as json blobs in sqlite, the +default store:: sqlite> select * from buckets_of_stats order by bucket desc; bucket stats From 4a87a08230ed6ce07c105ca269e035c65ac03d5b Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 29 May 2018 17:09:14 -0700 Subject: [PATCH 25/33] fixlets --- readme.rst | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/readme.rst b/readme.rst index 44ae1bb..4f7044f 100644 --- a/readme.rst +++ b/readme.rst @@ -3,7 +3,7 @@ Warcprox - WARC writing MITM HTTP/S proxy .. 
image:: https://travis-ci.org/internetarchive/warcprox.svg?branch=master :target: https://travis-ci.org/internetarchive/warcprox -Based on the excellent and simple pymiproxy by Nadeem Douba. +Originally based on the excellent and simple pymiproxy by Nadeem Douba. https://github.com/allfro/pymiproxy .. contents:: @@ -52,9 +52,10 @@ for deduplication works similarly to heritrix and other web archiving tools. 1. while fetching url, calculate payload content digest (typically sha1) 2. look up digest in deduplication database (warcprox supports a few different ones) -3. if found write warc ``revisit`` record referencing the url and capture time +3. if found, write warc ``revisit`` record referencing the url and capture time of the previous capture -4. else (if not found) +4. else (if not found), + a. write warc ``response`` record with full payload b. store entry in deduplication database @@ -79,22 +80,24 @@ request header. The fallback bucket in case none is specified is called ``__unspecified__``. 
Within each bucket are three sub-buckets: -* "new" - tallies captures for which a complete record (usually a ``response`` + +* ``new`` - tallies captures for which a complete record (usually a ``response`` record) was written to warc -* "revisit" - tallies captures for which a ``revisit`` record was written to +* ``revisit`` - tallies captures for which a ``revisit`` record was written to warc -* "total" - includes all urls processed, even those not written to warc (so the +* ``total`` - includes all urls processed, even those not written to warc (so the numbers may be greater than new + revisit) Within each of these sub-buckets we keep two statistics: -* urls - simple count of urls -* wire_bytes - sum of bytes received over the wire from the remote server for - each url -For historical reasons, statistics are stored as json blobs in sqlite, the -default store:: +* ``urls`` - simple count of urls +* ``wire_bytes`` - sum of bytes received over the wire, including http headers, + from the remote server for each url - sqlite> select * from buckets_of_stats order by bucket desc; +For historical reasons, in sqlite, the default store, statistics are kept as +json blobs:: + + sqlite> select * from buckets_of_stats; bucket stats --------------- --------------------------------------------------------------------------------------------- __unspecified__ {"bucket":"__unspecified__","total":{"urls":37,"wire_bytes":1502781},"new":{"urls":15,"wire_bytes":1179906},"revisit":{"urls":22,"wire_bytes":322875}} From cd6e30fe36a2069b6b80ee6662c1061f63dbcc93 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 29 May 2018 17:28:04 -0700 Subject: [PATCH 26/33] describe the last two remaining fields --- api.rst | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/api.rst b/api.rst index 6104b53..cef9c7c 100644 --- a/api.rst +++ b/api.rst @@ -142,7 +142,7 @@ It is the way url and data limits on jobs, seeds, and hosts are implemented, among 
other things. Warcprox-Meta fields -------------------- +-------------------- ``warc-prefix`` (string) ~~~~~~~~~~~~~~~~~~~~~~~~ @@ -280,6 +280,11 @@ Example:: ``metadata`` (dictionary) ~~~~~~~~~~~~~~~~~~~~~~~~~ +An arbitrary dictionary. Warcprox mostly ignores this. The one exception is +that if it has a ``seed`` entry and crawl logs are enabled via the +``--crawl-log-dir`` command line option, the value of ``seed`` is written to +the crawl log as the 11th field on the line, simulating heritrix's "source +tag". Example:: @@ -287,24 +292,28 @@ Example:: ``accept`` (list) ~~~~~~~~~~~~~~~~~ +Specifies fields that the client would like to receive in the ``Warcprox-Meta`` +*response* header. Only one value is currently understood, +``capture-metadata``. Example:: - request_meta = {"accept": ["capture-metadata"]} + Warcprox-Meta: {"accept": ["capture-metadata"]} + +The response will include a ``Warcpro-Meta`` response header with one field +also called ``captured-metadata``. Currently warcprox reports one piece of +capture medata, ``timestamp``, which represents the time fetch began for the +resource and matches the ``WARC-Date`` written to the warc record. For +example:: + + Warcprox-Meta: {"capture-metadata":{"timestamp":"2018-05-30T00:22:49Z"}} ``Warcprox-Meta`` http response header ====================================== - In some cases warcprox will add a ``Warcprox-Meta`` header in the http response that it sends to the client. Like the request header, the value is a json blob. It is only included if something in the ``warcprox-meta`` request header calls for it. Those cases are described above in the -`#warcprox-meta-http-request-header`_ section. +`Warcprox-Meta http request header`_ section. 
+ -### - blocked-by-rule -### - reached-limit -### - reached-soft-limit -### - stats -### - capture-metadata -### -### Response codes 420, 430 From 68ede68e5f484401fc2cf3d78f204ca18ed81522 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 29 May 2018 17:35:33 -0700 Subject: [PATCH 27/33] little edits --- api.rst | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/api.rst b/api.rst index cef9c7c..ae12575 100644 --- a/api.rst +++ b/api.rst @@ -195,7 +195,7 @@ Example:: Warcprox-Meta: {"blocks": [{"ssurt": "com,example,//http:/"}, {"domain": "malware.us", "substring": "wp-login.php?action=logout"}]} If any of the rules match the url being requested, warcprox aborts normal -processing and responds with a http 403. The http response includes +processing and responds with a http ``403``. The http response includes a ``Warcprox-Meta`` **response** header with one field, ``blocked-by-rule``, which reproduces the value of the match rule that resulted in the block. The presence of the ``warcprox-meta`` response header can be used by the client to @@ -229,6 +229,11 @@ dictionary is ``{stats_key: numerical_limit, ...}`` where stats key has the format ``"bucket/sub-bucket/statistic"``. See `readme.rst#statistics`_ for further explanation of what "bucket", "sub-bucket", and "statistic" mean here. +If processing a request would result in exceeding a limit, warcprox aborts +normal processing and responds with a http ``420 Reached Limit``. The http +response includes a ``Warcprox-Meta`` **response** header with the complete set +of statistics for the bucket whose limit has been reached. + Example:: {"stats": {"buckets": ["test_limits_bucket"]}, "limits": {"test_limits_bucket/total/urls": 10}} @@ -250,16 +255,16 @@ Example:: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From warcprox's perspective ``soft-limits`` work almost exactly the same way as ``limits``. 
The only difference is that when a soft limit is hit, warcprox -response with an http 430 "Reached soft limit" instead of http 420. +response with an http ``430 Reached soft limit`` instead of http ``420``. -Warcprox clients might treat a 430 very differently from a 420. From brozzler's -perspective, for instance, ``soft-limits`` are very different from ``limits``. -When brozzler receives a 420 from warcprox because a ``limit`` has been -reached, this means that crawling for that seed is finished, and brozzler sets -about finalizing the crawl of that seed. On the other hand, brozzler blissfully -ignores 430 responses, because soft limits only apply to a particular bucket -(like a domain), and don't have any effect on crawling of urls that don't fall -in that bucket. +Warcprox clients might treat a 430 very differently from a ``420``. From +brozzler's perspective, for instance, ``soft-limits`` are very different from +``limits``. When brozzler receives a ``420`` from warcprox because a ``limit`` +has been reached, this means that crawling for that seed is finished, and +brozzler sets about finalizing the crawl of that seed. On the other hand, +brozzler blissfully ignores ``430`` responses, because soft limits only apply +to a particular bucket (like a domain), and don't have any effect on crawling +of urls that don't fall in that bucket. Example:: @@ -300,7 +305,7 @@ Example:: Warcprox-Meta: {"accept": ["capture-metadata"]} -The response will include a ``Warcpro-Meta`` response header with one field +The response will include a ``Warcprox-Meta`` response header with one field also called ``captured-metadata``. Currently warcprox reports one piece of capture medata, ``timestamp``, which represents the time fetch began for the resource and matches the ``WARC-Date`` written to the warc record. 
For From f5bcec20a92c675291acc9debe506b0ba1e9907e Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 30 May 2018 14:12:58 -0700 Subject: [PATCH 28/33] explain a bit about mitm --- readme.rst | 179 +++++++++++++++-------------------------------------- 1 file changed, 49 insertions(+), 130 deletions(-) diff --git a/readme.rst b/readme.rst index 4f7044f..6f53f66 100644 --- a/readme.rst +++ b/readme.rst @@ -3,36 +3,68 @@ Warcprox - WARC writing MITM HTTP/S proxy .. image:: https://travis-ci.org/internetarchive/warcprox.svg?branch=master :target: https://travis-ci.org/internetarchive/warcprox -Originally based on the excellent and simple pymiproxy by Nadeem Douba. -https://github.com/allfro/pymiproxy +Warcprox is a tool for archiving the web. It is an http proxy that stores its +traffic to disk in `WARC +`_ +format. Warcprox captures encrypted https traffic by using the +`"man-in-the-middle" `_ +technique (see the `Man-In-The_Middle`_ section for more info). + +The web pages that warcprox stores in WARC files can be played back using +software like `OpenWayback `_ or `pywb +`_. Warcprox has been developed in +parallel with `brozzler `_ and +together they make a comprehensive modern distributed archival web crawling +system. + +Warcprox was originally based on the excellent and simple pymiproxy by Nadeem +Douba. https://github.com/allfro/pymiproxy .. contents:: -Install -======= +Getting started +=============== Warcprox runs on python 3.4+. -To install latest release run: - -:: +To install latest release run:: # apt-get install libffi-dev libssl-dev pip install warcprox -You can also install the latest bleeding edge code: - -:: +You can also install the latest bleeding edge code:: pip install git+https://github.com/internetarchive/warcprox.git +To start warcprox run:: -Trusting the CA cert -==================== -For best results while browsing through warcprox, you need to add the CA -cert as a trusted cert in your browser. 
If you don't do that, you will -get the warning when you visit each new site. But worse, any embedded -https content on a different server will simply fail to load, because -the browser will reject the certificate without telling you. + warcprox + +Try ``warcprox --help`` for documentation on command line options. + +Man-In-The-Middle? +================== +Traffic to and from https sites is encrypted. Normally http proxies can't read +that traffic. The web client uses the http ``CONNECT`` method to establish a +tunnel through the proxy, and the proxy merely routes raw bytes between the +client and server. Since the bytes are encrypted, the proxy can't make sense of +the information it's proxying. Nonsensical encrypted bytes would not be very +useful to archive. + +In order to capture https traffic, warcprox acts as a "man-in-the-middle" +(MITM). When it receives a ``CONNECT`` directive from a client, it generates a +public key certificate for the requested site, presents to the client, and +proceeds to establish an encrypted connection. Then it makes a separate, normal +https connection to the remote site. It decrypts, archives, and re-encrypts +traffic in both directions. + +Although "man-in-the-middle" is often paired with "attack", there is nothing +malicious about what warcprox is doing. If you configure an instance of +warcprox as your browser's http proxy, you will see lots of certificate +warnings, since none of the certificates will be signed by trusted authorities. +To use warcprox effectively the client needs to disable certificate +verification, or add the CA cert generated by warcprox as a trusted authority. +(If you do this in your browser, make sure you undo it when you're done using +warcprox!) API === @@ -116,119 +148,6 @@ be configured by specifying ``--plugin`` multiples times. 
`A minimal example `__ -Usage -===== - -:: - - usage: warcprox [-h] [-p PORT] [-b ADDRESS] [-c CACERT] - [--certs-dir CERTS_DIR] [-d DIRECTORY] - [--warc-filename WARC_FILENAME] [-z] [-n PREFIX] - [-s ROLLOVER_SIZE] - [--rollover-idle-time ROLLOVER_IDLE_TIME] - [-g DIGEST_ALGORITHM] [--base32] - [--method-filter HTTP_METHOD] - [--stats-db-file STATS_DB_FILE | --rethinkdb-stats-url RETHINKDB_STATS_URL] - [-P PLAYBACK_PORT] - [-j DEDUP_DB_FILE | --rethinkdb-dedup-url RETHINKDB_DEDUP_URL | --rethinkdb-big-table-url RETHINKDB_BIG_TABLE_URL | --rethinkdb-trough-db-url RETHINKDB_TROUGH_DB_URL | --cdxserver-dedup CDXSERVER_DEDUP] - [--rethinkdb-services-url RETHINKDB_SERVICES_URL] - [--onion-tor-socks-proxy ONION_TOR_SOCKS_PROXY] - [--crawl-log-dir CRAWL_LOG_DIR] [--plugin PLUGIN_CLASS] - [--version] [-v] [--trace] [-q] - - warcprox - WARC writing MITM HTTP/S proxy - - optional arguments: - -h, --help show this help message and exit - -p PORT, --port PORT port to listen on (default: 8000) - -b ADDRESS, --address ADDRESS - address to listen on (default: localhost) - -c CACERT, --cacert CACERT - CA certificate file; if file does not exist, it - will be created (default: - ./ayutla.monkeybrains.net-warcprox-ca.pem) - --certs-dir CERTS_DIR - where to store and load generated certificates - (default: ./ayutla.monkeybrains.net-warcprox-ca) - -d DIRECTORY, --dir DIRECTORY - where to write warcs (default: ./warcs) - --warc-filename WARC_FILENAME - define custom WARC filename with variables - {prefix}, {timestamp14}, {timestamp17}, - {serialno}, {randomtoken}, {hostname}, - {shorthostname} (default: - {prefix}-{timestamp17}-{serialno}-{randomtoken}) - -z, --gzip write gzip-compressed warc records - -n PREFIX, --prefix PREFIX - default WARC filename prefix (default: WARCPROX) - -s ROLLOVER_SIZE, --size ROLLOVER_SIZE - WARC file rollover size threshold in bytes - (default: 1000000000) - --rollover-idle-time ROLLOVER_IDLE_TIME - WARC file rollover idle time threshold in seconds - (so 
that Friday's last open WARC doesn't sit there - all weekend waiting for more data) (default: None) - -g DIGEST_ALGORITHM, --digest-algorithm DIGEST_ALGORITHM - digest algorithm, one of sha384, sha224, md5, - sha256, sha512, sha1 (default: sha1) - --base32 write digests in Base32 instead of hex - --method-filter HTTP_METHOD - only record requests with the given http method(s) - (can be used more than once) (default: None) - --stats-db-file STATS_DB_FILE - persistent statistics database file; empty string - or /dev/null disables statistics tracking - (default: ./warcprox.sqlite) - --rethinkdb-stats-url RETHINKDB_STATS_URL - rethinkdb stats table url, e.g. rethinkdb://db0.fo - o.org,db1.foo.org:38015/my_warcprox_db/my_stats_ta - ble (default: None) - -P PLAYBACK_PORT, --playback-port PLAYBACK_PORT - port to listen on for instant playback (default: - None) - -j DEDUP_DB_FILE, --dedup-db-file DEDUP_DB_FILE - persistent deduplication database file; empty - string or /dev/null disables deduplication - (default: ./warcprox.sqlite) - --rethinkdb-dedup-url RETHINKDB_DEDUP_URL - rethinkdb dedup url, e.g. rethinkdb://db0.foo.org, - db1.foo.org:38015/my_warcprox_db/my_dedup_table - (default: None) - --rethinkdb-big-table-url RETHINKDB_BIG_TABLE_URL - rethinkdb big table url (table will be populated - with various capture information and is suitable - for use as index for playback), e.g. rethinkdb://d - b0.foo.org,db1.foo.org:38015/my_warcprox_db/captur - es (default: None) - --rethinkdb-trough-db-url RETHINKDB_TROUGH_DB_URL - 🐷 url pointing to trough configuration rethinkdb - database, e.g. rethinkdb://db0.foo.org,db1.foo.org - :38015/trough_configuration (default: None) - --cdxserver-dedup CDXSERVER_DEDUP - use a CDX Server URL for deduplication; e.g. 
- https://web.archive.org/cdx/search (default: None) - --rethinkdb-services-url RETHINKDB_SERVICES_URL - rethinkdb service registry table url; if provided, - warcprox will create and heartbeat entry for - itself (default: None) - --onion-tor-socks-proxy ONION_TOR_SOCKS_PROXY - host:port of tor socks proxy, used only to connect - to .onion sites (default: None) - --crawl-log-dir CRAWL_LOG_DIR - if specified, write crawl log files in the - specified directory; one crawl log is written per - warc filename prefix; crawl log format mimics - heritrix (default: None) - --plugin PLUGIN_CLASS - Qualified name of plugin class, e.g. - "mypkg.mymod.MyClass". May be used multiple times - to register multiple plugins. See README.rst for - more information. (default: None) - --version show program's version number and exit - -v, --verbose - --trace - -q, --quiet - License ======= From 9434a1ccd87f8f3b179175468ed02d4ada2f037d Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 30 May 2018 14:26:10 -0700 Subject: [PATCH 29/33] more little edits --- readme.rst | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/readme.rst b/readme.rst index 6f53f66..dbb1440 100644 --- a/readme.rst +++ b/readme.rst @@ -8,7 +8,7 @@ traffic to disk in `WARC `_ format. Warcprox captures encrypted https traffic by using the `"man-in-the-middle" `_ -technique (see the `Man-In-The_Middle`_ section for more info). +technique (see the `Man-in-the-middle`_ section for more info). The web pages that warcprox stores in WARC files can be played back using software like `OpenWayback `_ or `pywb @@ -41,21 +41,21 @@ To start warcprox run:: Try ``warcprox --help`` for documentation on command line options. -Man-In-The-Middle? -================== -Traffic to and from https sites is encrypted. Normally http proxies can't read -that traffic. 
The web client uses the http ``CONNECT`` method to establish a -tunnel through the proxy, and the proxy merely routes raw bytes between the -client and server. Since the bytes are encrypted, the proxy can't make sense of -the information it's proxying. Nonsensical encrypted bytes would not be very -useful to archive. +Man-in-the-middle +================= +Normally, http proxies can't read https traffic, because it's encrypted. The +browser uses the http ``CONNECT`` method to establish a tunnel through the +proxy, and the proxy merely routes raw bytes between the client and server. +Since the bytes are encrypted, the proxy can't make sense of the information +it's proxying. This nonsensical encrypted data would not be very useful to +archive. In order to capture https traffic, warcprox acts as a "man-in-the-middle" (MITM). When it receives a ``CONNECT`` directive from a client, it generates a public key certificate for the requested site, presents to the client, and -proceeds to establish an encrypted connection. Then it makes a separate, normal -https connection to the remote site. It decrypts, archives, and re-encrypts -traffic in both directions. +proceeds to establish an encrypted connection with the client. Then it makes a +separate, normal https connection to the remote site. It decrypts, archives, +and re-encrypts traffic in both directions. Although "man-in-the-middle" is often paired with "attack", there is nothing malicious about what warcprox is doing. 
If you configure an instance of From 6f43286b07681208728a93bead543368e3e47169 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 30 May 2018 14:46:14 -0700 Subject: [PATCH 30/33] more edits --- api.rst | 78 +++++++++++++++++++++++++++------------------------------ 1 file changed, 37 insertions(+), 41 deletions(-) diff --git a/api.rst b/api.rst index ae12575..bac642b 100644 --- a/api.rst +++ b/api.rst @@ -124,15 +124,11 @@ warcprox will write a warc record that looks like this:: configuration information and metadata with each proxy request to warcprox. The value is a json blob. There are several fields understood by warcprox, and arbitrary additional fields can be included. If warcprox doesn't recognize a -field it simply ignores it. Warcprox plugins could make use of custom fields, -for example. +field it simply ignores it. Custom fields may be useful for custom warcprox +plugins (see ``_). Warcprox strips the ``warcprox-meta`` header out before sending the request to -remote server, and also does not write it in the warc request record. - -:: - - Warcprox-Meta: {} +remote server, and does not write it in the warc request record. Brozzler knows about ``warcprox-meta``. For information on configuring it in brozzler, see @@ -153,27 +149,6 @@ Example:: Warcprox-Meta: {"warc-prefix": "special-warc"} -``stats`` (dictionary) -~~~~~~~~~~~~~~~~~~~~~~ -``stats`` is a dictionary with only one field understood by warcprox, -``buckets``. The value of ``buckets`` is a list of strings and/or -dictionaries. A string signifies the name of the bucket; a dictionary is -expected to have at least an item with key ``bucket`` whose value is the name -of the bucket. The other currently recognized key is ``tally-domains``, which -if supplied should be a list of domains. This instructs warcprox to -additionally tally substats of the given bucket by domain. Host stats are -stored in the stats table under the key -``{parent-bucket}:{domain(normalized)}``, e.g. 
``"bucket2:foo.bar.com"`` for the -example below. - -Examples:: - - Warcprox-Meta: {"stats":{"buckets":["my-stats-bucket","all-the-stats"]}} - Warcprox-Meta: {"stats":{"buckets":["bucket1",{"bucket":"bucket2","tally-domains":["foo.bar.com","192.168.10.20"}]}} - -See ``_ for more information on statistics kept by -warcprox. - ``dedup-bucket`` (string) ~~~~~~~~~~~~~~~~~~~~~~~~~ Specifies the deduplication bucket. For more information about deduplication @@ -196,11 +171,10 @@ Example:: If any of the rules match the url being requested, warcprox aborts normal processing and responds with a http ``403``. The http response includes -a ``Warcprox-Meta`` **response** header with one field, ``blocked-by-rule``, +a ``Warcprox-Meta`` response header with one field, ``blocked-by-rule``, which reproduces the value of the match rule that resulted in the block. The presence of the ``warcprox-meta`` response header can be used by the client to -distinguish this type of a response from a 403 from the remote url being -requested. +distinguish this type of a response from a 403 from the remote site. An example:: @@ -222,6 +196,29 @@ to evaluate the block rules. In particular, this circumstance prevails when the browser controlled by brozzler is requesting images, javascript, css, and so on, embedded in a page. +``stats`` (dictionary) +~~~~~~~~~~~~~~~~~~~~~~ +``stats`` is a dictionary with only one field understood by warcprox, +``buckets``. The value of ``buckets`` is a list of strings and/or +dictionaries. A string signifies the name of the bucket; a dictionary is +expected to have at least an item with key ``bucket`` whose value is the name +of the bucket. The other currently recognized key is ``tally-domains``, which +if supplied should be a list of domains. This instructs warcprox to +additionally tally substats of the given bucket by domain. + +See ``_ for more information on statistics kept by +warcprox. 
+ +Examples:: + + Warcprox-Meta: {"stats":{"buckets":["my-stats-bucket","all-the-stats"]}} + Warcprox-Meta: {"stats":{"buckets":["bucket1",{"bucket":"bucket2","tally-domains":["foo.bar.com","192.168.10.20"]}]}} + +Domain stats are stored in the stats table under the key +``"bucket2:foo.bar.com"`` for the latter example. See the following two +sections for more examples. The ``soft-limits`` section has an example of a +limit on a domain specified in ``tally-domains``. + ``limits`` (dictionary) ~~~~~~~~~~~~~~~~~~~~~~~ Specifies quantitative limits for warcprox to enforce. The structure of the @@ -231,12 +228,12 @@ further explanation of what "bucket", "sub-bucket", and "statistic" mean here. If processing a request would result in exceeding a limit, warcprox aborts normal processing and responds with a http ``420 Reached Limit``. The http -response includes a ``Warcprox-Meta`` **response** header with the complete set +response includes a ``Warcprox-Meta`` response header with the complete set of statistics for the bucket whose limit has been reached. Example:: - {"stats": {"buckets": ["test_limits_bucket"]}, "limits": {"test_limits_bucket/total/urls": 10}} + Warcprox-Meta: {"stats": {"buckets": ["test_limits_bucket"]}, "limits": {"test_limits_bucket/total/urls": 10}} :: @@ -257,7 +254,7 @@ From warcprox's perspective ``soft-limits`` work almost exactly the same way as ``limits``. The only difference is that when a soft limit is hit, warcprox responds with an http ``430 Reached soft limit`` instead of http ``420``. -Warcprox clients might treat a ``430`` very differently from a ``420``. From +Warcprox clients might treat a ``430`` very differently from a ``420``. From brozzler's perspective, for instance, ``soft-limits`` are very different from ``limits``. 
When brozzler receives a ``420`` from warcprox because a ``limit`` has been reached, this means that crawling for that seed is finished, and @@ -298,7 +295,7 @@ Example:: ``accept`` (list) ~~~~~~~~~~~~~~~~~ Specifies fields that the client would like to receive in the ``Warcprox-Meta`` -*response* header. Only one value is currently understood, +response header. Only one value is currently understood, ``capture-metadata``. Example:: @@ -315,10 +312,9 @@ example:: ``Warcprox-Meta`` http response header ====================================== -In some cases warcprox will add a ``Warcprox-Meta`` header in the http response -that it sends to the client. Like the request header, the value is a json blob. -It is only included if something in the ``warcprox-meta`` request header calls -for it. Those cases are described above in the -`Warcprox-Meta http request header`_ section. - +In some cases warcprox will add a ``Warcprox-Meta`` header to the http response +that it sends to the client. As with the request header, the value is a json +blob. It is only included if something in the ``warcprox-meta`` request header +calls for it. Those cases are described above in the `Warcprox-Meta http +request header`_ section. 
From e8cb3afa719e9ac2015d38bc1ad40a320abf3b67 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 31 May 2018 16:52:37 -0700 Subject: [PATCH 31/33] bump dev version after merge --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 4fc1cbf..81e46c0 100755 --- a/setup.py +++ b/setup.py @@ -40,7 +40,7 @@ except: setuptools.setup( name='warcprox', - version='2.4b2.dev174', + version='2.4b2.dev175', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', From e73cbcb6b3591492acdb496a45a90f10d6300ff2 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 31 May 2018 16:57:06 -0700 Subject: [PATCH 32/33] log stack trace in case batch postprocessor raises exception somehow --- warcprox/__init__.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/warcprox/__init__.py b/warcprox/__init__.py index 20f0de4..4825e29 100644 --- a/warcprox/__init__.py +++ b/warcprox/__init__.py @@ -122,14 +122,19 @@ class BasePostfetchProcessor(threading.Thread): self.profiler = None def run(self): - if self.options.profile: - import cProfile - self.profiler = cProfile.Profile() - self.profiler.enable() - self._run() - self.profiler.disable() - else: - self._run() + try: + if self.options.profile: + import cProfile + self.profiler = cProfile.Profile() + self.profiler.enable() + self._run() + self.profiler.disable() + else: + self._run() + except: + self.logger.critical( + '%s dying due to uncaught exception', + self.name, exc_info=True) def _get_process_put(self): ''' From ec7a0bf569a8d8fe4c54a7554419655a21036c70 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 31 May 2018 16:57:37 -0700 Subject: [PATCH 33/33] =?UTF-8?q?log=20exception=20and=20continue=20?= =?UTF-8?q?=F0=9F=A4=9E=20if=20schema=20reg=20fails?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit at trough dedup startup --- setup.py | 2 
+- warcprox/dedup.py | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 81e46c0..6ac73a1 100755 --- a/setup.py +++ b/setup.py @@ -40,7 +40,7 @@ except: setuptools.setup( name='warcprox', - version='2.4b2.dev175', + version='2.4b2.dev176', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/warcprox/dedup.py b/warcprox/dedup.py index 83f3e0a..81be2ea 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -497,7 +497,13 @@ class TroughDedupDb(DedupDb, DedupableMixin): return BatchTroughStorer(self, self.options) def start(self): - self._trough_cli.register_schema(self.SCHEMA_ID, self.SCHEMA_SQL) + try: + self._trough_cli.register_schema(self.SCHEMA_ID, self.SCHEMA_SQL) + except Exception as e: + # can happen. hopefully someone else has registered it + self.logger.critical( + 'will try to continue after problem registering schema %s', + self.SCHEMA_ID, exc_info=True) def save(self, digest_key, response_record, bucket='__unspecified__'): record_id = response_record.get_header(warctools.WarcRecord.ID)