diff --git a/.gitignore b/.gitignore index 72e3644..1da5ebc 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,4 @@ warcs build dist .tox +out.* diff --git a/.travis.yml b/.travis.yml index 5b54afb..7bac95a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,21 +1,36 @@ -# vim: set sw=4 et: -# -# tox approach stolen from -# https://github.com/pypa/pip/blob/abdb597dbfb51b21cc76c1cff068b72c80f3a77d/.travis.yml -# - language: python +python: +- 3.5 +- 3.4 +- 2.7 +- nightly +- pypy +- pypy3 -env: - - TOXENV=py27 - - TOXENV=py34 +matrix: + allow_failures: + - python: pypy + - python: pypy3 + +addons: + apt: + packages: + - python-gdbm + - python3-gdbm + - tor + +services: +- docker before_install: - - sudo apt-get update - - sudo apt-get -y install python-gdbm python3-gdbm +- sudo service docker restart ; sleep 10 # https://github.com/travis-ci/travis-ci/issues/4778 +- docker run -d --publish=28015:28015 rethinkdb before_script: - - pip install tox +- pip install . pytest requests -script: tox +script: +- py.test -v -s tests +- py.test -v -s --rethinkdb-servers=localhost tests tests +- py.test -v -s --rethinkdb-servers=localhost --rethinkdb-big-table tests diff --git a/README.rst b/README.rst index ff6a6b0..388bcf5 100644 --- a/README.rst +++ b/README.rst @@ -1,15 +1,11 @@ warcprox - WARC writing MITM HTTP/S proxy ----------------------------------------- -.. image:: https://travis-ci.org/internetarchive/warcprox.png?branch=master +.. image:: https://travis-ci.org/internetarchive/warcprox.png?branch=master :target: https://travis-ci.org/internetarchive/warcprox Based on the excellent and simple pymiproxy by Nadeem Douba. https://github.com/allfro/pymiproxy -License: because pymiproxy is GPL and warcprox is a derivative work of -pymiproxy, warcprox is also GPL. 
- - Install ~~~~~~~ @@ -19,6 +15,7 @@ To install latest release run: :: + # apt-get install libffi-dev libssl-dev python3-gdbm pip install warcprox You can also install the latest bleeding edge code: @@ -45,10 +42,15 @@ Usage usage: warcprox [-h] [-p PORT] [-b ADDRESS] [-c CACERT] [--certs-dir CERTS_DIR] [-d DIRECTORY] [-z] [-n PREFIX] [-s SIZE] [--rollover-idle-time ROLLOVER_IDLE_TIME] - [-g DIGEST_ALGORITHM] [--base32] [-j DEDUP_DB_FILE] - [-P PLAYBACK_PORT] - [--playback-index-db-file PLAYBACK_INDEX_DB_FILE] [--version] - [-v] [-q] + [-g DIGEST_ALGORITHM] [--base32] + [--stats-db-file STATS_DB_FILE] [-P PLAYBACK_PORT] + [--playback-index-db-file PLAYBACK_INDEX_DB_FILE] + [-j DEDUP_DB_FILE | --rethinkdb-servers RETHINKDB_SERVERS] + [--rethinkdb-db RETHINKDB_DB] [--rethinkdb-big-table] + [--kafka-broker-list KAFKA_BROKER_LIST] + [--kafka-capture-feed-topic KAFKA_CAPTURE_FEED_TOPIC] + [--onion-tor-socks-proxy ONION_TOR_SOCKS_PROXY] + [--version] [-v] [-q] warcprox - WARC writing MITM HTTP/S proxy @@ -58,84 +60,91 @@ Usage -b ADDRESS, --address ADDRESS address to listen on (default: localhost) -c CACERT, --cacert CACERT - CA certificate file; if file does not exist, it will - be created (default: ./desktop-nlevitt-warcprox- - ca.pem) + CA certificate file; if file does not exist, it + will be created (default: ./MacBook-Pro.local- + warcprox-ca.pem) --certs-dir CERTS_DIR where to store and load generated certificates - (default: ./desktop-nlevitt-warcprox-ca) + (default: ./MacBook-Pro.local-warcprox-ca) -d DIRECTORY, --dir DIRECTORY where to write warcs (default: ./warcs) - -z, --gzip write gzip-compressed warc records (default: False) + -z, --gzip write gzip-compressed warc records (default: + False) -n PREFIX, --prefix PREFIX WARC filename prefix (default: WARCPROX) - -s SIZE, --size SIZE WARC file rollover size threshold in bytes (default: - 1000000000) + -s SIZE, --size SIZE WARC file rollover size threshold in bytes + (default: 1000000000) --rollover-idle-time 
ROLLOVER_IDLE_TIME - WARC file rollover idle time threshold in seconds (so - that Friday's last open WARC doesn't sit there all - weekend waiting for more data) (default: None) + WARC file rollover idle time threshold in seconds + (so that Friday's last open WARC doesn't sit there + all weekend waiting for more data) (default: None) -g DIGEST_ALGORITHM, --digest-algorithm DIGEST_ALGORITHM - digest algorithm, one of sha384, sha512, md5, sha224, - sha256, sha1 (default: sha1) + digest algorithm, one of sha1, sha256, md5, + sha224, sha512, sha384 (default: sha1) --base32 write digests in Base32 instead of hex (default: False) - -j DEDUP_DB_FILE, --dedup-db-file DEDUP_DB_FILE - persistent deduplication database file; empty string - or /dev/null disables deduplication (default: - ./warcprox-dedup.db) + --stats-db-file STATS_DB_FILE + persistent statistics database file; empty string + or /dev/null disables statistics tracking + (default: ./warcprox-stats.db) -P PLAYBACK_PORT, --playback-port PLAYBACK_PORT - port to listen on for instant playback (default: None) + port to listen on for instant playback (default: + None) --playback-index-db-file PLAYBACK_INDEX_DB_FILE - playback index database file (only used if --playback- - port is specified) (default: ./warcprox-playback- - index.db) + playback index database file (only used if + --playback-port is specified) (default: + ./warcprox-playback-index.db) + -j DEDUP_DB_FILE, --dedup-db-file DEDUP_DB_FILE + persistent deduplication database file; empty + string or /dev/null disables deduplication + (default: ./warcprox-dedup.db) + --rethinkdb-servers RETHINKDB_SERVERS + rethinkdb servers, used for dedup and stats if + specified; e.g. 
+ db0.foo.org,db0.foo.org:38015,db1.foo.org + (default: None) + --rethinkdb-db RETHINKDB_DB + rethinkdb database name (ignored unless + --rethinkdb-servers is specified) (default: + warcprox) + --rethinkdb-big-table + use a big rethinkdb table called "captures", + instead of a small table called "dedup"; table is + suitable for use as index for playback (ignored + unless --rethinkdb-servers is specified) (default: + False) + --kafka-broker-list KAFKA_BROKER_LIST + kafka broker list for capture feed (default: None) + --kafka-capture-feed-topic KAFKA_CAPTURE_FEED_TOPIC + kafka capture feed topic (default: None) + --onion-tor-socks-proxy ONION_TOR_SOCKS_PROXY + host:port of tor socks proxy, used only to connect + to .onion sites (default: None) --version show program's version number and exit -v, --verbose -q, --quiet -To do -~~~~~ -* (partly done) integration tests, unit tests -* (done) url-agnostic deduplication -* unchunk and/or ungzip before storing payload, or alter request to - discourage server from chunking/gzipping -* check certs from proxied website, like browser does, and present - browser-like warning if appropriate -* keep statistics, produce reports -* write cdx while crawling? -* performance testing -* (done) base32 sha1 like heritrix? -* configurable timeouts and stuff -* evaluate ipv6 support -* (done) more explicit handling of connection closed exception - during transfer -* dns cache?? the system already does a fine job I'm thinking -* keepalive with remote servers? -* (done) python3 -* special handling for 304 not-modified (write nothing or write revisit - record... and/or modify request so server never responds with 304) -* (done) instant playback on a second proxy port -* special url for downloading ca cert e.g. http(s)://warcprox./ca.pem -* special url for other stuff, some status info or something? 
-* browser plugin for warcprox mode +License +~~~~~~~ - - accept warcprox CA cert only when in warcprox mode - - separate temporary cookie store, like incognito - - "careful! your activity is being archived" banner - - easy switch between archiving and instant playback proxy port +Warcprox is a derivative work of pymiproxy, which is GPL. Thus warcprox is also +GPL. -To not do -^^^^^^^^^ +Copyright (C) 2012 Cygnos Corporation +Copyright (C) 2013-2016 Internet Archive -The features below could also be part of warcprox. But maybe they don't -belong here, since this is a proxy, not a crawler/robot. It can be used -by a human with a browser, or by something automated, i.e. a robot. My -feeling is that it's more appropriate to implement these in the robot. +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. -* politeness, i.e. throttle requests per server -* fetch and obey robots.txt -* alter user-agent, maybe insert something like "warcprox mitm - archiving proxy; +http://archive.org/details/archive.org\_bot" +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
diff --git a/benchmarks/requirements.txt b/benchmarks/requirements.txt new file mode 100644 index 0000000..ee4ba4f --- /dev/null +++ b/benchmarks/requirements.txt @@ -0,0 +1 @@ +aiohttp diff --git a/benchmarks/run-benchmarks.py b/benchmarks/run-benchmarks.py new file mode 100755 index 0000000..275581b --- /dev/null +++ b/benchmarks/run-benchmarks.py @@ -0,0 +1,172 @@ +#!/usr/bin/env python +# +# run-benchmarks.py - some benchmarking code for warcprox +# +# Copyright (C) 2015-2016 Internet Archive +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +# USA. 
+# + +import sys +import aiohttp +import aiohttp.server +import asyncio +import ssl +import tempfile +import OpenSSL.crypto +import OpenSSL.SSL +import random +import os +import threading +import time +import logging +import warcprox.main + +logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format='%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s') + +def self_signed_cert(): + key = OpenSSL.crypto.PKey() + key.generate_key(OpenSSL.crypto.TYPE_RSA, 2048) + + cert = OpenSSL.crypto.X509() + cert.set_serial_number(random.randint(0, 2 ** 64 - 1)) + cert.get_subject().CN = 'localhost' + + cert.set_version(2) + cert.gmtime_adj_notBefore(0) + cert.gmtime_adj_notAfter(10 * 365 * 24 * 60 * 60) + + cert.set_issuer(cert.get_subject()) + cert.set_pubkey(key) + cert.sign(key, "sha1") + + return key, cert + +class HttpRequestHandler(aiohttp.server.ServerHttpProtocol): + @asyncio.coroutine + def handle_request(self, message, payload): + response = aiohttp.Response( + self.writer, 200, http_version=message.version + ) + n = int(message.path.partition('/')[2]) + response.add_header('Content-Type', 'text/plain') + # response.add_header('Content-Length', '18') + response.send_headers() + for i in range(n): + response.write(b'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\n') + yield from response.write_eof() + +def run_servers(): + loop.run_forever() + +def start_servers(): + loop = asyncio.get_event_loop() + http = loop.create_server(lambda: HttpRequestHandler(debug=True, keep_alive=75), '127.0.0.1', '8080') + sslcontext = ssl.SSLContext(ssl.PROTOCOL_SSLv23) + key, cert = self_signed_cert() + with tempfile.NamedTemporaryFile(delete=False) as certfile: + certfile.write(OpenSSL.crypto.dump_privatekey(OpenSSL.SSL.FILETYPE_PEM, key)) + certfile.write(OpenSSL.crypto.dump_certificate(OpenSSL.SSL.FILETYPE_PEM, cert)) + sslcontext.load_cert_chain(certfile.name) + 
os.remove(certfile.name) + https = loop.create_server(lambda: HttpRequestHandler(debug=True, keep_alive=75), '127.0.0.1', '8443', ssl=sslcontext) + srv = loop.run_until_complete(http) + srv = loop.run_until_complete(https) + logging.info('serving on http://127.0.0.1:8080 and https://127.0.0.1:8443') + +class AsyncClient(object): + def __init__(self, proxy=None): + self.n_urls = 0 + self.n_bytes = 0 + self.proxy = proxy + if proxy: + self.connector = aiohttp.connector.ProxyConnector(proxy, verify_ssl=False) + else: + self.connector = aiohttp.connector.TCPConnector(verify_ssl=False) + + @asyncio.coroutine + def read_response(self, r, url): + # time.sleep(random.random() * 10) + while True: + chunk = yield from r.content.read(2**16) + self.n_bytes += len(chunk) + if not chunk: + self.n_urls += 1 + logging.debug("finished reading from %s", url) + r.close() + break + + @asyncio.coroutine + def one_request(self, url): + logging.debug("issuing request to %s", url) + r = yield from aiohttp.get(url, connector=self.connector) + logging.debug("issued request to %s", url) + yield from self.read_response(r, url) + +def benchmark(client): + try: + start = time.time() + tasks_https = [client.one_request('https://localhost:8443/%s' % int(1.1**i)) for i in range(80)] + asyncio.get_event_loop().run_until_complete(asyncio.wait(tasks_https)) + tasks_http = [client.one_request('http://localhost:8080/%s' % int(1.1**i)) for i in range(80)] + asyncio.get_event_loop().run_until_complete(asyncio.wait(tasks_http)) + finally: + finish = time.time() + logging.info("proxy=%s: %s urls totaling %s bytes in %s seconds", client.proxy, client.n_urls, client.n_bytes, (finish - start)) + +if __name__ == '__main__': + args = warcprox.main.parse_args() + + start_servers() + + baseline_client = AsyncClient() + logging.info("===== baseline benchmark starting (no proxy) =====") + benchmark(baseline_client) + logging.info("===== baseline benchmark finished =====") + + + # Queue size of 1 makes warcprox 
behave as though it were synchronous (each + # request blocks until the warc writer starts working on the last request). + # This gives us a better sense of sustained max throughput. The + # asynchronous nature of warcprox helps with bursty traffic, as long as the + # average throughput stays below the sustained max. + with tempfile.TemporaryDirectory() as tmpdir: + args.queue_size = 1 + args.cacert = os.path.join(tmpdir, "benchmark-warcprox-ca.pem") + args.certs_dir = os.path.join(tmpdir, "benchmark-warcprox-ca") + args.directory = os.path.join(tmpdir, "warcs") + args.gzip = True + args.base32 = True + args.stats_db_file = os.path.join(tmpdir, "stats.db") + args.dedup_db_file = os.path.join(tmpdir, "dedup.db") + + warcprox_controller = warcprox.main.init_controller(args) + warcprox_controller_thread = threading.Thread(target=warcprox_controller.run_until_shutdown) + warcprox_controller_thread.start() + proxy = "http://%s:%s" % (args.address, args.port) + proxied_client = AsyncClient(proxy=proxy) + + logging.info("===== warcprox benchmark starting =====") + benchmark(proxied_client) + logging.info("===== warcprox benchmark finished =====") + + warcprox_controller.stop.set() + warcprox_controller_thread.join() + + asyncio.get_event_loop().stop() + logging.info("finished") + diff --git a/bin/warcprox b/bin/warcprox deleted file mode 100755 index d978c53..0000000 --- a/bin/warcprox +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/env python -# vim: set sw=4 et: - -from __future__ import absolute_import - -import warcprox.main - -warcprox.main.main() diff --git a/setup.py b/setup.py index d8afc87..9e87a09 100755 --- a/setup.py +++ b/setup.py @@ -1,44 +1,57 @@ #!/usr/bin/env python -# vim: set sw=4 et: +''' +setup.py - setuptools installation configuration for warcprox + +Copyright (C) 2013-2016 Internet Archive + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software 
Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +USA. +''' -from setuptools.command.test import test as TestCommand import sys -import setuptools - -VERSION_BYTES = b'1.4' - -def full_version_bytes(): - import subprocess, time - try: - commit_bytes = subprocess.check_output(['git', 'log', '-1', '--pretty=format:%h']) - - t_bytes = subprocess.check_output(['git', 'log', '-1', '--pretty=format:%ct']) - t = int(t_bytes.strip().decode('utf-8')) - tm = time.gmtime(t) - timestamp_utc = time.strftime("%Y%m%d%H%M%S", time.gmtime(t)) - return VERSION_BYTES + b'-' + timestamp_utc.encode('utf-8') + b'-' + commit_bytes.strip() - except subprocess.CalledProcessError: - return VERSION_BYTES - -version_bytes = full_version_bytes() -with open('warcprox/version.txt', 'wb') as out: - out.write(version_bytes) - out.write(b'\n'); +import setuptools +import setuptools.command.test # special class needs to be added to support the pytest written dump-anydbm tests -class PyTest(TestCommand): +class PyTest(setuptools.command.test.test): def finalize_options(self): - TestCommand.finalize_options(self) + setuptools.command.test.test.finalize_options(self) self.test_args = [] self.test_suite = True def run_tests(self): - #import here, cause outside the eggs aren't loaded + # import here, because outside the eggs aren't loaded import pytest errno = pytest.main(self.test_args) sys.exit(errno) -setuptools.setup(name='warcprox', - version=version_bytes.decode('utf-8'), +deps = [ + 'certauth>=1.1.0', + 'warctools', + 
'kafka-python>=1.0.1', + 'surt>=0.3b4', + 'rethinkstuff', + 'PySocks', +] +try: + import concurrent.futures +except: + deps.append('futures') + +setuptools.setup( + name='warcprox', + version='2.0b2.dev32', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', @@ -46,13 +59,18 @@ setuptools.setup(name='warcprox', long_description=open('README.rst').read(), license='GPL', packages=['warcprox'], - package_data={'warcprox':['version.txt']}, - install_requires=['certauth>=1.1.0', 'warctools>=4.8.3'], # gdbm not in pip :( - dependency_links=['git+https://github.com/internetarchive/warctools.git#egg=warctools-4.8.3'], + install_requires=deps, tests_require=['requests>=2.0.1', 'pytest'], # >=2.0.1 for https://github.com/kennethreitz/requests/pull/1636 cmdclass = {'test': PyTest}, test_suite='warcprox.tests', - scripts=['bin/dump-anydbm', 'bin/warcprox'], + entry_points={ + 'console_scripts': [ + 'warcprox=warcprox.main:main', + ('warcprox-ensure-rethinkdb-tables=' + 'warcprox.main:ensure_rethinkdb_tables'), + 'dump-anydbm=warcprox.dump_anydbm:main', + ], + }, zip_safe=False, classifiers=[ 'Development Status :: 5 - Production/Stable', @@ -60,6 +78,7 @@ setuptools.setup(name='warcprox', 'License :: OSI Approved :: GNU General Public License (GPL)', 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3.4', + 'Programming Language :: Python :: 3.5', 'Topic :: Internet :: Proxy Servers', 'Topic :: Internet :: WWW/HTTP', 'Topic :: Software Development :: Libraries :: Python Modules', diff --git a/tests/Dockerfile b/tests/Dockerfile new file mode 100644 index 0000000..ab1f01a --- /dev/null +++ b/tests/Dockerfile @@ -0,0 +1,49 @@ +# +# Dockerfile for warcprox tests +# +# Copyright (C) 2015-2016 Internet Archive +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; 
either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +# USA. +# + +FROM phusion/baseimage +MAINTAINER Noah Levitt + +# see https://github.com/stuartpb/rethinkdb-dockerfiles/blob/master/trusty/2.1.3/Dockerfile + +ENV LANG=C.UTF-8 + +RUN apt-get update && apt-get --auto-remove -y dist-upgrade + +# Add the RethinkDB repository and public key +# "RethinkDB Packaging " http://download.rethinkdb.com/apt/pubkey.gpg +RUN apt-key adv --keyserver pgp.mit.edu --recv-keys 1614552E5765227AEC39EFCFA7E00EF33A8F2399 \ + && echo "deb http://download.rethinkdb.com/apt trusty main" > /etc/apt/sources.list.d/rethinkdb.list \ + && apt-get update && apt-get -y install rethinkdb + +RUN mkdir -vp /etc/service/rethinkdb \ + && echo "#!/bin/sh\nrethinkdb --bind 0.0.0.0 --directory /tmp/rethink-data --runuser rethinkdb --rungroup rethinkdb\n" > /etc/service/rethinkdb/run \ + && chmod a+x /etc/service/rethinkdb/run + +RUN apt-get -y install python-virtualenv git +RUN apt-get -y install python-gdbm python3-gdbm libpython2.7-dev libpython3.4-dev libffi-dev libssl-dev +RUN pip install devpi-client + +RUN apt-get -y install tor +RUN mkdir -vp /etc/service/tor \ + && echo "#!/bin/sh\ntor\n" > /etc/service/tor/run \ + && chmod a+x /etc/service/tor/run + diff --git a/warcprox/tests/__init__.py b/tests/__init__.py similarity index 100% rename from warcprox/tests/__init__.py rename to tests/__init__.py diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..27d4141 --- /dev/null +++ 
b/tests/conftest.py @@ -0,0 +1,39 @@ +# +# tests/conftest.py - command line options for warcprox tests +# +# Copyright (C) 2015-2016 Internet Archive +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +# USA. +# + +import pytest + +def pytest_addoption(parser): + parser.addoption('--rethinkdb-servers', dest='rethinkdb_servers', + help='rethink db servers for dedup, e.g. db0.foo.org,db0.foo.org:38015,db1.foo.org') + parser.addoption('--rethinkdb-big-table', + dest='rethinkdb_big_table', action='store_true', default=False, + help='use a big rethinkdb table called "captures", instead of a small table called "dedup"; table is suitable for use as index for playback (ignored unless --rethinkdb-servers is specified)') + +@pytest.fixture(scope="module") +def rethinkdb_servers(request): + return request.config.getoption("--rethinkdb-servers") + +@pytest.fixture(scope="module") +def rethinkdb_big_table(request): + return request.config.getoption("--rethinkdb-big-table") + + diff --git a/tests/run-tests.sh b/tests/run-tests.sh new file mode 100755 index 0000000..b28e606 --- /dev/null +++ b/tests/run-tests.sh @@ -0,0 +1,48 @@ +#!/bin/bash +# +# tests/run-tests.sh - Runs tests in a docker container. Also runs a temporary +# instance of rethinkdb inside the container. 
The tests run with rethinkdb +# features enabled, against that instance of rethinkdb, and also run without +# rethinkdb features enabled. With python 2.7 and 3.4. +# +# tests/conftest.py - command line options for warcprox tests +# +# Copyright (C) 2015-2016 Internet Archive +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +# USA. +# +# 😬 +# + +set -e + +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +docker build -t internetarchive/warcprox-tests $script_dir + +for python in python2.7 python3.4 +do + docker run --rm --volume="$script_dir/..:/warcprox" internetarchive/warcprox-tests /sbin/my_init -- \ + bash -x -c "cd /tmp && git clone /warcprox && cd /tmp/warcprox \ + && (cd /warcprox && git diff) | patch -p1 \ + && virtualenv -p $python /tmp/venv \ + && source /tmp/venv/bin/activate \ + && pip --log-file /tmp/pip.log install . 
pytest requests \ + && py.test -s tests \ + && py.test -s --rethinkdb-servers=localhost tests \ + && py.test -s --rethinkdb-servers=localhost --rethinkdb-big-table tests" +done + diff --git a/tests/single-threaded-proxy.py b/tests/single-threaded-proxy.py new file mode 100755 index 0000000..dd5e709 --- /dev/null +++ b/tests/single-threaded-proxy.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python +""" +tests/single-threaded-proxy.py - single-threaded MITM proxy, useful for +debugging, does not write warcs + +Copyright (C) 2015-2016 Internet Archive + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +USA. 
+""" + +from __future__ import absolute_import + +import warcprox +import logging +import sys +import argparse +import certauth +import queue +import socket +import os + +class FakeQueue(object): + logger = logging.getLogger("FakeQueue") + def __init__(self, maxsize=0): pass + def join(self): pass + def qsize(self): return 0 + def empty(self): return True + def full(self): return False + def get(self, block=True, timeout=None): raise queue.Empty + def put_nowait(self, item): return self.put(item, block=False) + def get_nowait(self): return self.get(block=False) + def put(self, recorded_url, block=True, timeout=None): + logging.info("{} {} {} {} {} size={} {}".format( + recorded_url.client_ip, recorded_url.status, recorded_url.method, + recorded_url.url.decode("utf-8"), recorded_url.mimetype, + recorded_url.size, warcprox.digest_str(recorded_url.response_recorder.payload_digest, False).decode('utf-8'))) + +def parse_args(): + prog = os.path.basename(sys.argv[0]) + arg_parser = argparse.ArgumentParser(prog=prog, + description='%s - single threaded mitm http/s proxy, for debugging' % prog, + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + arg_parser.add_argument('-p', '--port', dest='port', default='8000', + type=int, help='port to listen on') + arg_parser.add_argument('-b', '--address', dest='address', + default='localhost', help='address to listen on') + arg_parser.add_argument('-c', '--cacert', dest='cacert', + default='./{0}-warcprox-ca.pem'.format(socket.gethostname()), + help='CA certificate file; if file does not exist, it will be created') + arg_parser.add_argument('--certs-dir', dest='certs_dir', + default='./{0}-warcprox-ca'.format(socket.gethostname()), + help='where to store and load generated certificates') + arg_parser.add_argument('--onion-tor-socks-proxy', dest='onion_tor_socks_proxy', + default=None, help='host:port of tor socks proxy, used only to connect to .onion sites') + arg_parser.add_argument('--version', action='version', + 
version="warcprox {}".format(warcprox.__version__)) + arg_parser.add_argument('-v', '--verbose', dest='verbose', action='store_true') + arg_parser.add_argument('-q', '--quiet', dest='quiet', action='store_true') + + return arg_parser.parse_args(args=sys.argv[1:]) + +def init_logging(verbose): + if args.verbose: + loglevel = logging.DEBUG + elif args.quiet: + loglevel = logging.WARNING + else: + loglevel = logging.INFO + + logging.basicConfig(stream=sys.stdout, level=loglevel, + format='%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s') + # format='%(asctime)s %(funcName) 21s() %(filename)15s:%(lineno)05d %(message)s') + +def init_proxy(args): + ca_name = 'Warcprox CA on {}'.format(socket.gethostname())[:64] + ca = certauth.certauth.CertificateAuthority(args.cacert, args.certs_dir, + ca_name=ca_name) + options = warcprox.Options(**vars(args)) + proxy = warcprox.warcproxy.SingleThreadedWarcProxy(ca, + recorded_url_q=FakeQueue(), options=options) + return proxy + +if __name__ == "__main__": + args = parse_args() + init_logging(args.verbose) + proxy = init_proxy(args) + + proxy.serve_forever() + diff --git a/warcprox/tests/test_dump-anydbm.py b/tests/test_dump-anydbm.py similarity index 83% rename from warcprox/tests/test_dump-anydbm.py rename to tests/test_dump-anydbm.py index 4cca48d..1bc6ccc 100644 --- a/warcprox/tests/test_dump-anydbm.py +++ b/tests/test_dump-anydbm.py @@ -1,4 +1,24 @@ #!/usr/bin/env python +# +# tests/test_dump-anydbm.py - tests for dump-anydbm +# +# Copyright (C) 2013-2016 Internet Archive +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +# USA. +# import pytest import os @@ -6,6 +26,7 @@ import tempfile import subprocess # to access the script from shell import sys import glob +import distutils # will try as python 3 then default to python 2 modules try: @@ -38,7 +59,7 @@ val1 = 'very first value' val2 = 'second value' py = sys.executable -dump_anydbm_loc = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "bin/dump-anydbm") +dump_anydbm_loc = distutils.spawn.find_executable("dump-anydbm") @pytest.fixture(scope="function") def gdbm_test_db(request): diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py new file mode 100755 index 0000000..6d4b986 --- /dev/null +++ b/tests/test_warcprox.py @@ -0,0 +1,1150 @@ +#!/usr/bin/env python +# vim: set fileencoding=utf-8: +''' +tests/test_warcprox.py - automated tests for warcprox + +Copyright (C) 2013-2016 Internet Archive + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +USA. +''' + +import pytest +import threading +import time +import logging +import sys +import ssl +import re +import tempfile +import OpenSSL +import os +import shutil +import requests +import re +import json +import random +import rethinkstuff +from hanzo import warctools +import warnings +import pprint +import traceback +import signal +from collections import Counter +import socket + +try: + import http.server as http_server +except ImportError: + import BaseHTTPServer as http_server + +try: + import queue +except ImportError: + import Queue as queue + +import certauth.certauth + +import warcprox + +logging.basicConfig( + stream=sys.stdout, level=logging.INFO, # level=warcprox.TRACE, + format='%(asctime)s %(process)d %(levelname)s %(threadName)s ' + '%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s') +logging.getLogger("requests.packages.urllib3").setLevel(logging.WARN) +warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecureRequestWarning) +warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecurePlatformWarning) + +# monkey patch dns lookup so we can test domain inheritance on localhost +orig_getaddrinfo = socket.getaddrinfo +orig_gethostbyname = socket.gethostbyname +orig_socket_connect = socket.socket.connect + +def _getaddrinfo(host, port, family=0, type=0, proto=0, flags=0): + if host.endswith('.localhost'): + return orig_getaddrinfo('localhost', port, family, type, proto, flags) + else: + return orig_getaddrinfo(host, port, family, type, proto, flags) + +def _gethostbyname(host): + if host.endswith('.localhost'): + return orig_gethostbyname('localhost') + else: + return orig_gethostbyname(host) + +def _socket_connect(self, address): + if address[0].endswith('.localhost'): + return 
orig_socket_connect(self, ('localhost', address[1])) + else: + return orig_socket_connect(self, address) + +socket.gethostbyname = _gethostbyname +socket.getaddrinfo = _getaddrinfo +socket.socket.connect = _socket_connect + +def dump_state(signum=None, frame=None): + pp = pprint.PrettyPrinter(indent=4) + state_strs = [] + + for th in threading.enumerate(): + try: + state_strs.append(str(th)) + except AssertionError: + state_strs.append("") + stack = traceback.format_stack(sys._current_frames()[th.ident]) + state_strs.append("".join(stack)) + + logging.warn("dumping state (caught signal {})\n{}".format(signum, "\n".join(state_strs))) + +signal.signal(signal.SIGQUIT, dump_state) + +class _TestHttpRequestHandler(http_server.BaseHTTPRequestHandler): + def do_GET(self): + logging.info('GET {}'.format(self.path)) + + m = re.match(r'^/([^/]+)/([^/]+)$', self.path) + if m is not None: + special_header = 'warcprox-test-header: {}!'.format(m.group(1)).encode('utf-8') + payload = 'I am the warcprox test payload! {}!\n'.format(10*m.group(2)).encode('utf-8') + headers = (b'HTTP/1.1 200 OK\r\n' + + b'Content-Type: text/plain\r\n' + + special_header + b'\r\n' + + b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n' + + b'\r\n') + elif self.path == '/missing-content-length': + headers = (b'HTTP/1.1 200 OK\r\n' + + b'Content-Type: text/plain\r\n' + + b'\r\n') + payload = b'This response is missing a Content-Length http header.' 
+ else: + payload = b'404 Not Found\n' + headers = (b'HTTP/1.1 404 Not Found\r\n' + + b'Content-Type: text/plain\r\n' + + b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n' + + b'\r\n') + + self.connection.sendall(headers) + self.connection.sendall(payload) + +@pytest.fixture(scope="module") +def cert(request): + f = tempfile.NamedTemporaryFile(prefix='warcprox-test-https-', suffix='.pem', delete=False) + + def fin(): + logging.info("deleting file %s", f.name) + os.unlink(f.name) + request.addfinalizer(fin) + + try: + key = OpenSSL.crypto.PKey() + key.generate_key(OpenSSL.crypto.TYPE_RSA, 2048) + req = OpenSSL.crypto.X509Req() + req.get_subject().CN = 'localhost' + req.set_pubkey(key) + req.sign(key, 'sha1') + cert = OpenSSL.crypto.X509() + cert.set_subject(req.get_subject()) + cert.set_serial_number(0) + cert.gmtime_adj_notBefore(0) + cert.gmtime_adj_notAfter(2*60*60) # valid for 2hrs + cert.set_issuer(cert.get_subject()) + cert.set_pubkey(req.get_pubkey()) + cert.sign(key, 'sha1') + + f.write(OpenSSL.crypto.dump_privatekey(OpenSSL.SSL.FILETYPE_PEM, key)) + f.write(OpenSSL.crypto.dump_certificate(OpenSSL.SSL.FILETYPE_PEM, cert)) + + logging.info('generated self-signed certificate {}'.format(f.name)) + return f.name + finally: + f.close() + +@pytest.fixture(scope="module") +def http_daemon(request): + http_daemon = http_server.HTTPServer( + ('localhost', 0), RequestHandlerClass=_TestHttpRequestHandler) + logging.info('starting http://{}:{}'.format(http_daemon.server_address[0], http_daemon.server_address[1])) + http_daemon_thread = threading.Thread(name='HttpDaemonThread', + target=http_daemon.serve_forever) + http_daemon_thread.start() + + def fin(): + logging.info("stopping http daemon") + http_daemon.shutdown() + http_daemon.server_close() + http_daemon_thread.join() + request.addfinalizer(fin) + + return http_daemon + +@pytest.fixture(scope="module") +def https_daemon(request, cert): + # 
http://www.piware.de/2011/01/creating-an-https-server-in-python/ + https_daemon = http_server.HTTPServer(('localhost', 0), + RequestHandlerClass=_TestHttpRequestHandler) + # https_daemon.socket = ssl.wrap_socket(httpd.socket, certfile='path/to/localhost.pem', server_side=True) + https_daemon.socket = ssl.wrap_socket(https_daemon.socket, certfile=cert, server_side=True) + logging.info('starting https://{}:{}'.format(https_daemon.server_address[0], https_daemon.server_address[1])) + https_daemon_thread = threading.Thread(name='HttpsDaemonThread', + target=https_daemon.serve_forever) + https_daemon_thread.start() + + def fin(): + logging.info("stopping https daemon") + https_daemon.shutdown() + https_daemon.server_close() + https_daemon_thread.join() + request.addfinalizer(fin) + + return https_daemon + +# @pytest.fixture(scope="module") +# def options(request): +# return warcprox.Options(base32=True) + +@pytest.fixture(scope="module") +def captures_db(request, rethinkdb_servers, rethinkdb_big_table): + captures_db = None + if rethinkdb_servers: + servers = rethinkdb_servers.split(",") + if rethinkdb_big_table: + db = 'warcprox_test_captures_' + "".join(random.sample("abcdefghijklmnopqrstuvwxyz0123456789_",8)) + r = rethinkstuff.Rethinker(servers, db) + captures_db = warcprox.bigtable.RethinkCaptures(r) + captures_db.start() + + def fin(): + if captures_db: + captures_db.close() + # logging.info('dropping rethinkdb database {}'.format(db)) + # result = captures_db.r.db_drop(db).run() + # logging.info("result=%s", result) + request.addfinalizer(fin) + + return captures_db + +@pytest.fixture(scope="module") +def rethink_dedup_db(request, rethinkdb_servers, captures_db): + ddb = None + if rethinkdb_servers: + if captures_db: + ddb = warcprox.bigtable.RethinkCapturesDedup(captures_db) + else: + servers = rethinkdb_servers.split(",") + db = 'warcprox_test_dedup_' + "".join(random.sample("abcdefghijklmnopqrstuvwxyz0123456789_",8)) + r = rethinkstuff.Rethinker(servers, db) + 
ddb = warcprox.dedup.RethinkDedupDb(r) + + def fin(): + if rethinkdb_servers: + ddb.close() + if not captures_db: + logging.info('dropping rethinkdb database {}'.format(db)) + result = ddb.r.db_drop(db).run() + logging.info("result=%s", result) + request.addfinalizer(fin) + + return ddb + +@pytest.fixture(scope="module") +def dedup_db(request, rethink_dedup_db): + dedup_db_file = None + ddb = rethink_dedup_db + if not ddb: + f = tempfile.NamedTemporaryFile(prefix='warcprox-test-dedup-', suffix='.db', delete=False) + f.close() + dedup_db_file = f.name + ddb = warcprox.dedup.DedupDb(dedup_db_file) + + def fin(): + if dedup_db_file: + logging.info('deleting file {}'.format(dedup_db_file)) + os.unlink(dedup_db_file) + request.addfinalizer(fin) + + return ddb + +@pytest.fixture(scope="module") +def stats_db(request, rethinkdb_servers): + if rethinkdb_servers: + servers = rethinkdb_servers.split(",") + db = 'warcprox_test_stats_' + "".join(random.sample("abcdefghijklmnopqrstuvwxyz0123456789_",8)) + r = rethinkstuff.Rethinker(servers, db) + sdb = warcprox.stats.RethinkStatsDb(r) + sdb.start() + else: + f = tempfile.NamedTemporaryFile(prefix='warcprox-test-stats-', suffix='.db', delete=False) + f.close() + stats_db_file = f.name + sdb = warcprox.stats.StatsDb(stats_db_file) + + def fin(): + sdb.close() + if rethinkdb_servers: + logging.info('dropping rethinkdb database {}'.format(db)) + result = sdb.r.db_drop(db).run() + logging.info("result=%s", result) + else: + logging.info('deleting file {}'.format(stats_db_file)) + os.unlink(stats_db_file) + request.addfinalizer(fin) + + return sdb + +@pytest.fixture(scope="module") +def service_registry(request, rethinkdb_servers): + if rethinkdb_servers: + servers = rethinkdb_servers.split(",") + db = 'warcprox_test_services_' + "".join(random.sample("abcdefghijklmnopqrstuvwxyz0123456789_",8)) + r = rethinkstuff.Rethinker(servers, db) + + def fin(): + logging.info('dropping rethinkdb database {}'.format(db)) + result = 
r.db_drop(db).run() + logging.info("result=%s", result) + request.addfinalizer(fin) + + return rethinkstuff.ServiceRegistry(r) + else: + return None + +@pytest.fixture(scope="module") +def warcprox_(request, captures_db, dedup_db, stats_db, service_registry): + f = tempfile.NamedTemporaryFile(prefix='warcprox-test-ca-', suffix='.pem', delete=True) + f.close() # delete it, or CertificateAuthority will try to read it + ca_file = f.name + ca_dir = tempfile.mkdtemp(prefix='warcprox-test-', suffix='-ca') + ca = certauth.certauth.CertificateAuthority(ca_file, ca_dir, 'warcprox-test') + + recorded_url_q = queue.Queue() + + options = warcprox.Options(port=0, playback_port=0, + onion_tor_socks_proxy='localhost:9050') + proxy = warcprox.warcproxy.WarcProxy(ca=ca, recorded_url_q=recorded_url_q, + stats_db=stats_db, options=options) + options.port = proxy.server_port + + options.directory = tempfile.mkdtemp(prefix='warcprox-test-warcs-') + + f = tempfile.NamedTemporaryFile(prefix='warcprox-test-playback-index-', suffix='.db', delete=False) + f.close() + playback_index_db_file = f.name + playback_index_db = warcprox.playback.PlaybackIndexDb(playback_index_db_file) + playback_proxy = warcprox.playback.PlaybackProxy(ca=ca, + playback_index_db=playback_index_db, options=options) + options.playback_proxy = playback_proxy.server_port + + writer_pool = warcprox.writer.WarcWriterPool(options) + warc_writer_thread = warcprox.writerthread.WarcWriterThread( + recorded_url_q=recorded_url_q, writer_pool=writer_pool, + dedup_db=dedup_db, listeners=[captures_db or dedup_db, playback_index_db, stats_db]) + + warcprox_ = warcprox.controller.WarcproxController(proxy=proxy, + warc_writer_thread=warc_writer_thread, playback_proxy=playback_proxy, + service_registry=service_registry, options=options) + logging.info('starting warcprox') + warcprox_thread = threading.Thread(name='WarcproxThread', + target=warcprox_.run_until_shutdown) + warcprox_thread.start() + + def fin(): + logging.info('stopping 
warcprox') + warcprox_.stop.set() + warcprox_thread.join() + for f in (ca_file, ca_dir, options.directory, playback_index_db_file): + if os.path.isdir(f): + logging.info('deleting directory {}'.format(f)) + shutil.rmtree(f) + else: + logging.info('deleting file {}'.format(f)) + os.unlink(f) + request.addfinalizer(fin) + + return warcprox_ + +@pytest.fixture(scope="module") +def archiving_proxies(warcprox_): + archiving_proxy = 'http://localhost:{}'.format(warcprox_.proxy.server_port) + return {'http':archiving_proxy, 'https':archiving_proxy} + +@pytest.fixture(scope="module") +def playback_proxies(warcprox_): + playback_proxy = 'http://localhost:{}'.format(warcprox_.playback_proxy.server_port) + return {'http':playback_proxy, 'https':playback_proxy} + +def test_httpds_no_proxy(http_daemon, https_daemon): + url = 'http://localhost:{}/'.format(http_daemon.server_port) + response = requests.get(url) + assert response.status_code == 404 + assert response.content == b'404 Not Found\n' + + url = 'https://localhost:{}/'.format(https_daemon.server_port) + response = requests.get(url, verify=False) + assert response.status_code == 404 + assert response.content == b'404 Not Found\n' + + url = 'http://localhost:{}/a/b'.format(http_daemon.server_port) + response = requests.get(url) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'a!' + assert response.content == b'I am the warcprox test payload! bbbbbbbbbb!\n' + + url = 'https://localhost:{}/c/d'.format(https_daemon.server_port) + response = requests.get(url, verify=False) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'c!' + assert response.content == b'I am the warcprox test payload! 
dddddddddd!\n' + + # ensure monkey-patched dns resolution is working + url = 'https://foo.bar.localhost:{}/c/d'.format(https_daemon.server_port) + response = requests.get(url, verify=False) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'c!' + assert response.content == b'I am the warcprox test payload! dddddddddd!\n' + +def _poll_playback_until(playback_proxies, url, status, timeout_sec): + start = time.time() + # check playback (warc writing is asynchronous, give it up to 10 sec) + while time.time() - start < timeout_sec: + response = requests.get(url, proxies=playback_proxies, verify=False) + if response.status_code == status: + break + time.sleep(0.5) + return response + +def test_archive_and_playback_http_url(http_daemon, archiving_proxies, playback_proxies): + url = 'http://localhost:{}/a/b'.format(http_daemon.server_port) + + # ensure playback fails before archiving + response = requests.get(url, proxies=playback_proxies) + assert response.status_code == 404 + assert response.content == b'404 Not in Archive\n' + + # archive + response = requests.get(url, proxies=archiving_proxies) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'a!' + assert response.content == b'I am the warcprox test payload! bbbbbbbbbb!\n' + + response = _poll_playback_until(playback_proxies, url, status=200, timeout_sec=10) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'a!' + assert response.content == b'I am the warcprox test payload! 
bbbbbbbbbb!\n' + +def test_archive_and_playback_https_url(https_daemon, archiving_proxies, playback_proxies): + url = 'https://localhost:{}/c/d'.format(https_daemon.server_port) + + # ensure playback fails before archiving + response = requests.get(url, proxies=playback_proxies, verify=False) + assert response.status_code == 404 + assert response.content == b'404 Not in Archive\n' + + # fetch & archive response + response = requests.get(url, proxies=archiving_proxies, verify=False) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'c!' + assert response.content == b'I am the warcprox test payload! dddddddddd!\n' + + # test playback + response = _poll_playback_until(playback_proxies, url, status=200, timeout_sec=10) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'c!' + assert response.content == b'I am the warcprox test payload! dddddddddd!\n' + +# test dedup of same http url with same payload +def test_dedup_http(http_daemon, warcprox_, archiving_proxies, playback_proxies): + url = 'http://localhost:{}/e/f'.format(http_daemon.server_port) + + # ensure playback fails before archiving + response = requests.get(url, proxies=playback_proxies, verify=False) + assert response.status_code == 404 + assert response.content == b'404 Not in Archive\n' + + # check not in dedup db + dedup_lookup = warcprox_.warc_writer_thread.dedup_db.lookup(b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc') + assert dedup_lookup is None + + # archive + response = requests.get(url, proxies=archiving_proxies, verify=False) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'e!' + assert response.content == b'I am the warcprox test payload! ffffffffff!\n' + + # test playback + response = _poll_playback_until(playback_proxies, url, status=200, timeout_sec=10) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'e!' 
+ assert response.content == b'I am the warcprox test payload! ffffffffff!\n' + + # wait for writer thread to process + time.sleep(0.5) + while not warcprox_.warc_writer_thread.idle: + time.sleep(0.5) + time.sleep(0.5) + + # check in dedup db + # {u'id': u'', u'url': u'https://localhost:62841/c/d', u'date': u'2013-11-22T00:14:37Z'} + dedup_lookup = warcprox_.warc_writer_thread.dedup_db.lookup(b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc') + assert dedup_lookup['url'] == url.encode('ascii') + assert re.match(br'^$', dedup_lookup['id']) + assert re.match(br'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$', dedup_lookup['date']) + record_id = dedup_lookup['id'] + dedup_date = dedup_lookup['date'] + + # need revisit to have a later timestamp than original, else playing + # back the latest record might not hit the revisit + time.sleep(1.5) + + # fetch & archive revisit + response = requests.get(url, proxies=archiving_proxies, verify=False) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'e!' + assert response.content == b'I am the warcprox test payload! ffffffffff!\n' + + # wait for writer thread to process + time.sleep(0.5) + while not warcprox_.warc_writer_thread.idle: + time.sleep(0.5) + time.sleep(0.5) + + # check in dedup db (no change from prev) + dedup_lookup = warcprox_.warc_writer_thread.dedup_db.lookup(b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc') + assert dedup_lookup['url'] == url.encode('ascii') + assert dedup_lookup['id'] == record_id + assert dedup_lookup['date'] == dedup_date + + # test playback + logging.debug('testing playback of revisit of {}'.format(url)) + response = _poll_playback_until(playback_proxies, url, status=200, timeout_sec=10) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'e!' + assert response.content == b'I am the warcprox test payload! ffffffffff!\n' + # XXX how to check dedup was used? 
+
+# test dedup of same https url with same payload
+def test_dedup_https(https_daemon, warcprox_, archiving_proxies, playback_proxies):
+    '''
+    Archive the same https url twice and check the dedup db around it.
+
+    Fetches /g/h from the https test daemon through the archiving proxy
+    two times. Before the first fetch the hard-coded sha1 key must be
+    absent from the dedup db; after the first fetch an entry for the url
+    must exist; after the second fetch that entry must be unchanged.
+    Playback through the playback proxy is checked after each fetch.
+    '''
+    url = 'https://localhost:{}/g/h'.format(https_daemon.server_port)
+    expected_body = b'I am the warcprox test payload! hhhhhhhhhh!\n'
+    dedup_key = b'sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89'
+
+    def lookup_dedup_entry():
+        # go through warcprox_ on every call so each lookup sees whatever
+        # dedup db the writer thread holds at that moment
+        return warcprox_.warc_writer_thread.dedup_db.lookup(dedup_key)
+
+    def wait_for_writer_thread():
+        # pause, poll the writer thread's idle flag every half second,
+        # then pause once more
+        time.sleep(0.5)
+        while not warcprox_.warc_writer_thread.idle:
+            time.sleep(0.5)
+        time.sleep(0.5)
+
+    def assert_test_payload(resp):
+        # what the test daemon serves for /g/h: status, header, then body
+        assert resp.status_code == 200
+        assert resp.headers['warcprox-test-header'] == 'g!'
+        assert resp.content == expected_body
+
+    # ensure playback fails before archiving
+    resp = requests.get(url, proxies=playback_proxies, verify=False)
+    assert resp.status_code == 404
+    assert resp.content == b'404 Not in Archive\n'
+
+    # check not in dedup db
+    assert lookup_dedup_entry() is None
+
+    # archive
+    assert_test_payload(
+            requests.get(url, proxies=archiving_proxies, verify=False))
+
+    # test playback
+    assert_test_payload(_poll_playback_until(
+            playback_proxies, url, status=200, timeout_sec=10))
+
+    # wait for writer thread to process
+    wait_for_writer_thread()
+
+    # check in dedup db
+    # {u'id': u'', u'url': u'https://localhost:62841/c/d', u'date': u'2013-11-22T00:14:37Z'}
+    first_entry = lookup_dedup_entry()
+    assert first_entry['url'] == url.encode('ascii')
+    # NOTE(review): as written this pattern only accepts an empty id; it
+    # looks truncated -- confirm against the upstream test
+    assert re.match(br'^$', first_entry['id'])
+    assert re.match(br'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$', first_entry['date'])
+    original_id = first_entry['id']
+    original_date = first_entry['date']
+
+    # need revisit to have a later timestamp than original, else playing
+    # back the latest record might not hit the revisit
+    time.sleep(1.5)
+
+    # fetch & archive revisit
+    assert_test_payload(
+            requests.get(url, proxies=archiving_proxies, verify=False))
+
+    # wait for writer thread to process
+    wait_for_writer_thread()
+
+    # check in dedup db (no change from prev)
+    second_entry = lookup_dedup_entry()
+    assert second_entry['url'] == url.encode('ascii')
+    assert second_entry['id'] == original_id
+    assert second_entry['date'] == original_date
+
+    # test playback
+    logging.debug('testing playback of revisit of {}'.format(url))
+    assert_test_payload(_poll_playback_until(
+            playback_proxies, url, status=200, timeout_sec=10))
+    # XXX how to check dedup was used?
+ +def test_limits(http_daemon, warcprox_, archiving_proxies): + url = 'http://localhost:{}/i/j'.format(http_daemon.server_port) + request_meta = {"stats":{"buckets":["test_limits_bucket"]},"limits":{"test_limits_bucket/total/urls":10}} + headers = {"Warcprox-Meta": json.dumps(request_meta)} + + response = requests.get(url, proxies=archiving_proxies, headers=headers, stream=True) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'i!' + assert response.content == b'I am the warcprox test payload! jjjjjjjjjj!\n' + + # wait for writer thread to process + time.sleep(0.5) + while not warcprox_.warc_writer_thread.idle: + time.sleep(0.5) + time.sleep(0.5) + + for i in range(9): + response = requests.get(url, proxies=archiving_proxies, headers=headers, stream=True) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'i!' + assert response.content == b'I am the warcprox test payload! jjjjjjjjjj!\n' + + # wait for writer thread to process + time.sleep(0.5) + while not warcprox_.warc_writer_thread.idle: + time.sleep(0.5) + time.sleep(2.5) + + response = requests.get(url, proxies=archiving_proxies, headers=headers, stream=True) + assert response.status_code == 420 + assert response.reason == "Reached limit" + expected_response_meta = {'reached-limit': {'test_limits_bucket/total/urls': 10}, 'stats': {'test_limits_bucket': {'bucket': 'test_limits_bucket', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'total': {'wire_bytes': 1350, 'urls': 10}, 'new': {'wire_bytes': 135, 'urls': 1}}}} + assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta + assert response.headers["content-type"] == "text/plain;charset=utf-8" + assert response.raw.data == b"request rejected by warcprox: reached limit test_limits_bucket/total/urls=10\n" + +def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies, playback_proxies): + url1 = 
'http://localhost:{}/k/l'.format(http_daemon.server_port) + url2 = 'https://localhost:{}/k/l'.format(https_daemon.server_port) + + # archive url1 bucket_a + headers = {"Warcprox-Meta": json.dumps({"warc-prefix":"test_dedup_buckets","captures-bucket":"bucket_a"})} + response = requests.get(url1, proxies=archiving_proxies, verify=False, headers=headers) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'k!' + assert response.content == b'I am the warcprox test payload! llllllllll!\n' + + # wait for writer thread to process + time.sleep(0.5) + while not warcprox_.warc_writer_thread.idle: + time.sleep(0.5) + time.sleep(0.5) + + # check url1 in dedup db bucket_a + dedup_lookup = warcprox_.warc_writer_thread.dedup_db.lookup(b'sha1:bc3fac8847c9412f49d955e626fb58a76befbf81', bucket="bucket_a") + assert dedup_lookup['url'] == url1.encode('ascii') + assert re.match(br'^$', dedup_lookup['id']) + assert re.match(br'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$', dedup_lookup['date']) + record_id = dedup_lookup['id'] + dedup_date = dedup_lookup['date'] + + # check url1 not in dedup db bucket_b + dedup_lookup = warcprox_.warc_writer_thread.dedup_db.lookup(b'sha1:bc3fac8847c9412f49d955e626fb58a76befbf81', bucket="bucket_b") + assert dedup_lookup is None + + # archive url2 bucket_b + headers = {"Warcprox-Meta": json.dumps({"warc-prefix":"test_dedup_buckets","captures-bucket":"bucket_b"})} + response = requests.get(url2, proxies=archiving_proxies, verify=False, headers=headers) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'k!' + assert response.content == b'I am the warcprox test payload! 
llllllllll!\n' + + # wait for writer thread to process + time.sleep(0.5) + while not warcprox_.warc_writer_thread.idle: + time.sleep(0.5) + time.sleep(0.5) + + # check url2 in dedup db bucket_b + dedup_lookup = warcprox_.warc_writer_thread.dedup_db.lookup(b'sha1:bc3fac8847c9412f49d955e626fb58a76befbf81', bucket="bucket_b") + assert dedup_lookup['url'] == url2.encode('ascii') + assert re.match(br'^$', dedup_lookup['id']) + assert re.match(br'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$', dedup_lookup['date']) + record_id = dedup_lookup['id'] + dedup_date = dedup_lookup['date'] + + # archive url2 bucket_a + headers = {"Warcprox-Meta": json.dumps({"warc-prefix":"test_dedup_buckets","captures-bucket":"bucket_a"})} + response = requests.get(url2, proxies=archiving_proxies, verify=False, headers=headers) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'k!' + assert response.content == b'I am the warcprox test payload! llllllllll!\n' + + # archive url1 bucket_b + headers = {"Warcprox-Meta": json.dumps({"warc-prefix":"test_dedup_buckets","captures-bucket":"bucket_b"})} + response = requests.get(url1, proxies=archiving_proxies, verify=False, headers=headers) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'k!' + assert response.content == b'I am the warcprox test payload! 
llllllllll!\n' + + # wait for writer thread to process + time.sleep(0.5) + while not warcprox_.warc_writer_thread.idle: + time.sleep(0.5) + time.sleep(0.5) + + # close the warc + assert warcprox_.warc_writer_thread.writer_pool.warc_writers["test_dedup_buckets"] + writer = warcprox_.warc_writer_thread.writer_pool.warc_writers["test_dedup_buckets"] + warc_path = os.path.join(writer.directory, writer._f_finalname) + warcprox_.warc_writer_thread.writer_pool.warc_writers["test_dedup_buckets"].close_writer() + assert os.path.exists(warc_path) + + # read the warc + fh = warctools.ArchiveRecord.open_archive(warc_path) + record_iter = fh.read_records(limit=None, offsets=True) + try: + (offset, record, errors) = next(record_iter) + assert record.type == b'warcinfo' + + # url1 bucket_a + (offset, record, errors) = next(record_iter) + assert record.type == b'response' + assert record.url == url1.encode('ascii') + # check for duplicate warc record headers + assert Counter(h[0] for h in record.headers).most_common(1)[0][1] == 1 + assert record.content[1] == b'HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\nwarcprox-test-header: k!\r\nContent-Length: 44\r\n\r\nI am the warcprox test payload! llllllllll!\n' + (offset, record, errors) = next(record_iter) + assert record.type == b'request' + + # url2 bucket_b + (offset, record, errors) = next(record_iter) + assert record.type == b'response' + assert record.url == url2.encode('ascii') + # check for duplicate warc record headers + assert Counter(h[0] for h in record.headers).most_common(1)[0][1] == 1 + assert record.content[1] == b'HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\nwarcprox-test-header: k!\r\nContent-Length: 44\r\n\r\nI am the warcprox test payload! 
llllllllll!\n' + (offset, record, errors) = next(record_iter) + assert record.type == b'request' + + # url2 bucket_a (revisit) + (offset, record, errors) = next(record_iter) + assert record.type == b'revisit' + assert record.url == url2.encode('ascii') + # check for duplicate warc record headers + assert Counter(h[0] for h in record.headers).most_common(1)[0][1] == 1 + assert record.content[1] == b'HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\nwarcprox-test-header: k!\r\nContent-Length: 44\r\n\r\n' + (offset, record, errors) = next(record_iter) + assert record.type == b'request' + + # url1 bucket_b (revisit) + (offset, record, errors) = next(record_iter) + assert record.type == b'revisit' + assert record.url == url1.encode('ascii') + # check for duplicate warc record headers + assert Counter(h[0] for h in record.headers).most_common(1)[0][1] == 1 + assert record.content[1] == b'HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\nwarcprox-test-header: k!\r\nContent-Length: 44\r\n\r\n' + (offset, record, errors) = next(record_iter) + assert record.type == b'request' + + # that's all folks + assert next(record_iter)[1] == None + assert next(record_iter, None) == None + + finally: + fh.close() + +def test_block_rules(http_daemon, https_daemon, warcprox_, archiving_proxies): + rules = [ + { + "domain": "localhost", + "url_match": "STRING_MATCH", + "value": "bar", + }, + { + "url_match": "SURT_MATCH", + "value": "http://(localhost:%s,)/fuh/" % (http_daemon.server_port), + }, + { + "url_match": "SURT_MATCH", + # this rule won't match because of http scheme, https port + "value": "http://(localhost:%s,)/fuh/" % (https_daemon.server_port), + }, + { + "domain": "bad.domain.com", + }, + ] + request_meta = {"blocks":rules} + headers = {"Warcprox-Meta":json.dumps(request_meta)} + + # blocked by STRING_MATCH rule + url = 'http://localhost:{}/bar'.format(http_daemon.server_port) + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True) + assert 
response.status_code == 403 + assert response.content.startswith(b"request rejected by warcprox: blocked by rule found in Warcprox-Meta header:") + assert json.loads(response.headers['warcprox-meta']) == {"blocked-by-rule":rules[0]} + + # not blocked + url = 'http://localhost:{}/m/n'.format(http_daemon.server_port) + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True) + assert response.status_code == 200 + + # blocked by SURT_MATCH + url = 'http://localhost:{}/fuh/guh'.format(http_daemon.server_port) + # logging.info("%s => %s", repr(url), repr(warcprox.warcproxy.Url(url).surt)) + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True) + assert response.status_code == 403 + assert response.content.startswith(b"request rejected by warcprox: blocked by rule found in Warcprox-Meta header:") + assert json.loads(response.headers['warcprox-meta']) == {"blocked-by-rule":rules[1]} + + # not blocked (no trailing slash) + url = 'http://localhost:{}/fuh'.format(http_daemon.server_port) + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True) + # 404 because server set up at the top of this file doesn't handle this url + assert response.status_code == 404 + + # not blocked because surt scheme does not match (differs from heritrix + # behavior where https urls are coerced to http surt form) + url = 'https://localhost:{}/fuh/guh'.format(https_daemon.server_port) + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True, + verify=False) + assert response.status_code == 200 + + # blocked by blanket domain block + url = 'http://bad.domain.com/' + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True) + assert response.status_code == 403 + assert response.content.startswith(b"request rejected by warcprox: blocked by rule found in Warcprox-Meta header:") + assert json.loads(response.headers['warcprox-meta']) == 
{"blocked-by-rule":rules[3]} + + # blocked by blanket domain block + url = 'https://bad.domain.com/' + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True, + verify=False) + assert response.status_code == 403 + assert response.content.startswith(b"request rejected by warcprox: blocked by rule found in Warcprox-Meta header:") + assert json.loads(response.headers['warcprox-meta']) == {"blocked-by-rule":rules[3]} + + # blocked by blanket domain block + url = 'http://bad.domain.com:1234/' + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True) + assert response.status_code == 403 + assert response.content.startswith(b"request rejected by warcprox: blocked by rule found in Warcprox-Meta header:") + assert json.loads(response.headers['warcprox-meta']) == {"blocked-by-rule":rules[3]} + + # blocked by blanket domain block + url = 'http://foo.bar.bad.domain.com/' + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True) + assert response.status_code == 403 + assert response.content.startswith(b"request rejected by warcprox: blocked by rule found in Warcprox-Meta header:") + assert json.loads(response.headers['warcprox-meta']) == {"blocked-by-rule":rules[3]} + + # domain block also applies to subdomains + url = 'https://foo.bar.bad.domain.com/' + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True, + verify=False) + assert response.status_code == 403 + assert response.content.startswith(b"request rejected by warcprox: blocked by rule found in Warcprox-Meta header:") + assert json.loads(response.headers['warcprox-meta']) == {"blocked-by-rule":rules[3]} + + # blocked by blanket domain block + url = 'http://foo.bar.bad.domain.com:1234/' + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True) + assert response.status_code == 403 + assert response.content.startswith(b"request rejected by warcprox: blocked by 
rule found in Warcprox-Meta header:") + assert json.loads(response.headers['warcprox-meta']) == {"blocked-by-rule":rules[3]} + +def test_domain_doc_soft_limit( + http_daemon, https_daemon, warcprox_, archiving_proxies): + request_meta = { + "stats": {"buckets": [{"bucket":"test_domain_doc_limit_bucket","tally-domains":["foo.localhost"]}]}, + "soft-limits": {"test_domain_doc_limit_bucket:foo.localhost/total/urls":10}, + } + headers = {"Warcprox-Meta": json.dumps(request_meta)} + + # (1) + url = 'http://foo.localhost:{}/o/p'.format(http_daemon.server_port) + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'o!' + assert response.content == b'I am the warcprox test payload! pppppppppp!\n' + + # wait for writer thread to process + time.sleep(0.5) + while not warcprox_.warc_writer_thread.idle: + time.sleep(0.5) + time.sleep(0.5) + + # make sure stats from different domain don't count + url = 'http://bar.localhost:{}/o/p'.format(http_daemon.server_port) + for i in range(10): + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'o!' + assert response.content == b'I am the warcprox test payload! pppppppppp!\n' + + # wait for writer thread to process + time.sleep(0.5) + while not warcprox_.warc_writer_thread.idle: + time.sleep(0.5) + # rethinkdb stats db update cycle is 2 seconds (at the moment anyway) + time.sleep(2.0) + + # (2) same host but different scheme and port: domain limit applies + # + url = 'https://foo.localhost:{}/o/p'.format(https_daemon.server_port) + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True, + verify=False) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'o!' 
+ assert response.content == b'I am the warcprox test payload! pppppppppp!\n' + + # (3-9) different subdomain: host limit applies + url = 'https://baz.foo.localhost:{}/o/p'.format(https_daemon.server_port) + for i in range(7): + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True, + verify=False) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'o!' + assert response.content == b'I am the warcprox test payload! pppppppppp!\n' + + # wait for writer thread to process + time.sleep(0.5) + while not warcprox_.warc_writer_thread.idle: + time.sleep(0.5) + # rethinkdb stats db update cycle is 2 seconds (at the moment anyway) + time.sleep(2.0) + + # (10) + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True, + verify=False) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'o!' + assert response.content == b'I am the warcprox test payload! pppppppppp!\n' + + # wait for writer thread to process + time.sleep(0.5) + while not warcprox_.warc_writer_thread.idle: + time.sleep(0.5) + # rethinkdb stats db update cycle is 2 seconds (at the moment anyway) + time.sleep(2.0) + + # (11) back to http, and this is the 11th request + url = 'http://zuh.foo.localhost:{}/o/p'.format(http_daemon.server_port) + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True) + assert response.status_code == 430 + assert response.reason == "Reached soft limit" + expected_response_meta = {'reached-soft-limit': {'test_domain_doc_limit_bucket:foo.localhost/total/urls': 10}, 'stats': {'test_domain_doc_limit_bucket:foo.localhost': {'bucket': 'test_domain_doc_limit_bucket:foo.localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}} + assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta + assert 
response.headers["content-type"] == "text/plain;charset=utf-8" + assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_doc_limit_bucket:foo.localhost/total/urls=10\n" + + # make sure limit doesn't get applied to a different domain + url = 'https://localhost:{}/o/p'.format(https_daemon.server_port) + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True, + verify=False) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'o!' + assert response.content == b'I am the warcprox test payload! pppppppppp!\n' + + # https also blocked + url = 'https://zuh.foo.localhost:{}/o/p'.format(https_daemon.server_port) + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True, + verify=False) + assert response.status_code == 430 + assert response.reason == "Reached soft limit" + expected_response_meta = {'reached-soft-limit': {'test_domain_doc_limit_bucket:foo.localhost/total/urls': 10}, 'stats': {'test_domain_doc_limit_bucket:foo.localhost': {'bucket': 'test_domain_doc_limit_bucket:foo.localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}} + assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta + assert response.headers["content-type"] == "text/plain;charset=utf-8" + assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_doc_limit_bucket:foo.localhost/total/urls=10\n" + + # same host, different capitalization still blocked + url = 'https://HEHEHE.fOO.lOcALhoST:{}/o/p'.format(https_daemon.server_port) + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True, + verify=False) + assert response.status_code == 430 + assert response.reason == "Reached soft limit" + expected_response_meta = {'reached-soft-limit': {'test_domain_doc_limit_bucket:foo.localhost/total/urls': 10}, 
'stats': {'test_domain_doc_limit_bucket:foo.localhost': {'bucket': 'test_domain_doc_limit_bucket:foo.localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}} + assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta + assert response.headers["content-type"] == "text/plain;charset=utf-8" + assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_doc_limit_bucket:foo.localhost/total/urls=10\n" + +def test_domain_data_soft_limit( + http_daemon, https_daemon, warcprox_, archiving_proxies): + # using idn + request_meta = { + "stats": {"buckets": [{"bucket":"test_domain_data_limit_bucket","tally-domains":['♛zZ.LOCALhost']}]}, + # response is 135 bytes, so 3rd novel url should be disallowed + "soft-limits": {"test_domain_data_limit_bucket:♛ZZ.localhost/new/wire_bytes":200}, + } + headers = {"Warcprox-Meta": json.dumps(request_meta)} + + url = 'http://♛Zz.localhost:{}/y/z'.format(http_daemon.server_port) + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'y!' + assert response.content == b'I am the warcprox test payload! zzzzzzzzzz!\n' + + # wait for writer thread to process + time.sleep(0.5) + while not warcprox_.warc_writer_thread.idle: + time.sleep(0.5) + # rethinkdb stats db update cycle is 2 seconds (at the moment anyway) + time.sleep(2.0) + + # duplicate, does not count toward limit + url = 'https://baz.♛zz.localhost:{}/y/z'.format(https_daemon.server_port) + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True, + verify=False) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'y!' + assert response.content == b'I am the warcprox test payload! 
zzzzzzzzzz!\n' + + # wait for writer thread to process + time.sleep(0.5) + while not warcprox_.warc_writer_thread.idle: + time.sleep(0.5) + # rethinkdb stats db update cycle is 2 seconds (at the moment anyway) + time.sleep(2.0) + + # novel, pushes stats over the limit + url = 'https://muh.XN--Zz-xZX.locALHOst:{}/z/~'.format(https_daemon.server_port) + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True, + verify=False) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'z!' + assert response.content == b'I am the warcprox test payload! ~~~~~~~~~~!\n' + + # wait for writer thread to process + time.sleep(0.5) + while not warcprox_.warc_writer_thread.idle: + time.sleep(0.5) + # rethinkdb stats db update cycle is 2 seconds (at the moment anyway) + time.sleep(2.0) + + # make sure limit doesn't get applied to a different host + url = 'http://baz.localhost:{}/z/~'.format(http_daemon.server_port) + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'z!' + assert response.content == b'I am the warcprox test payload! 
~~~~~~~~~~!\n' + + # blocked because we're over the limit now + url = 'http://lOl.wHut.♛ZZ.lOcALHOst:{}/y/z'.format(http_daemon.server_port) + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True) + assert response.status_code == 430 + assert response.reason == "Reached soft limit" + expected_response_meta = {'reached-soft-limit': {'test_domain_data_limit_bucket:xn--zz-xzx.localhost/new/wire_bytes': 200}, 'stats': {'test_domain_data_limit_bucket:xn--zz-xzx.localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_domain_data_limit_bucket:xn--zz-xzx.localhost'}}} + assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta + assert response.headers["content-type"] == "text/plain;charset=utf-8" + assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_data_limit_bucket:xn--zz-xzx.localhost/new/wire_bytes=200\n" + + # XXX this check is resulting in a segfault on mac and linux, from ssl I + # think, probably because of the dns resolution monkey-patching + # https://travis-ci.org/internetarchive/warcprox/builds/141187342 + # + ### # https also blocked + ### url = 'https://xn--zz-xzxh.loCAlhost:{}/w/x'.format(https_daemon.server_port) + ### response = requests.get( + ### url, proxies=archiving_proxies, headers=headers, stream=True, + ### verify=False) + ### assert response.status_code == 430 + ### assert response.reason == "Reached soft limit" + ### expected_response_meta = {'reached-soft-limit': {'test_domain_data_limit_bucket:xn--zz-xzx.localhost/new/wire_bytes': 200}, 'stats': {'test_domain_data_limit_bucket:xn--zz-xzx.localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_domain_data_limit_bucket:xn--zz-xzx.localhost'}}} + ### assert json.loads(response.headers["warcprox-meta"]) == 
expected_response_meta + ### assert response.headers["content-type"] == "text/plain;charset=utf-8" + ### assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_data_limit_bucket:xn--zz-xzx.localhost/new/wire_bytes=200\n" + +# XXX this test relies on a tor proxy running at localhost:9050 with a working +# connection to the internet, and relies on a third party site (facebook) being +# up and behaving a certain way +def _test_tor_onion(archiving_proxies): + response = requests.get('http://www.facebookcorewwwi.onion/', + proxies=archiving_proxies, verify=False, allow_redirects=False) + assert response.status_code == 302 + + response = requests.get('https://www.facebookcorewwwi.onion/', + proxies=archiving_proxies, verify=False, allow_redirects=False) + assert response.status_code == 200 + +def test_missing_content_length(archiving_proxies, http_daemon, https_daemon): + # double-check that our test http server is responding as expected + url = 'http://localhost:%s/missing-content-length' % ( + http_daemon.server_port) + response = requests.get(url, verify=False, timeout=10) + assert response.content == ( + b'This response is missing a Content-Length http header.') + assert not 'content-length' in response.headers + + # double-check that our test https server is responding as expected + url = 'https://localhost:%s/missing-content-length' % ( + https_daemon.server_port) + response = requests.get(url, verify=False, timeout=10) + assert response.content == ( + b'This response is missing a Content-Length http header.') + assert not 'content-length' in response.headers + + # now check that the proxy doesn't hang (http) + url = 'http://localhost:%s/missing-content-length' % ( + http_daemon.server_port) + response = requests.get( + url, proxies=archiving_proxies, verify=False, timeout=10) + assert response.content == ( + b'This response is missing a Content-Length http header.') + assert not 'content-length' in response.headers + + # now 
check that the proxy doesn't hang (https) + url = 'https://localhost:%s/missing-content-length' % ( + https_daemon.server_port) + # before fixing the issue this tests for, this would fail by raising + # requests.exceptions.ConnectionError: ... Read timed out + response = requests.get( + url, proxies=archiving_proxies, verify=False, timeout=10) + assert response.content == ( + b'This response is missing a Content-Length http header.') + assert not 'content-length' in response.headers + +if __name__ == '__main__': + pytest.main() + diff --git a/tox.ini b/tox.ini deleted file mode 100644 index f5b0c23..0000000 --- a/tox.ini +++ /dev/null @@ -1,13 +0,0 @@ -# Tox (http://tox.testrun.org/) is a tool for running tests -# in multiple virtualenvs. This configuration file will run the -# test suite on all supported python versions. To use it, "pip install tox" -# and then run "tox" from this directory. - -[tox] -envlist = py27, py34 - -[testenv] -commands = py.test warcprox -deps = - pytest - requests diff --git a/warcprox/__init__.py b/warcprox/__init__.py index e061a70..45b38b2 100644 --- a/warcprox/__init__.py +++ b/warcprox/__init__.py @@ -1,8 +1,141 @@ -def _read_version_bytes(): - import os - version_txt = os.path.sep.join(__file__.split(os.path.sep)[:-1] + ['version.txt']) - with open(version_txt, 'rb') as fin: - return fin.read().strip() +""" +warcprox/__init__.py - warcprox package main file, contains some utility code -version_bytes = _read_version_bytes().strip() -version_str = version_bytes.decode('utf-8') +Copyright (C) 2013-2016 Internet Archive + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. 
+ +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +USA. +""" + +from argparse import Namespace as _Namespace +from pkg_resources import get_distribution as _get_distribution +__version__ = _get_distribution('warcprox').version + +def digest_str(hash_obj, base32): + import base64 + return hash_obj.name.encode('utf-8') + b':' + ( + base64.b32encode(hash_obj.digest()) if base32 + else hash_obj.hexdigest().encode('ascii')) + +class Options(_Namespace): + def __getattr__(self, name): + try: + return super(Options, self).__getattr__(self, name) + except AttributeError: + return None + +# XXX linux-specific +def gettid(): + try: + import ctypes + libc = ctypes.cdll.LoadLibrary('libc.so.6') + SYS_gettid = 186 + tid = libc.syscall(SYS_gettid) + return tid + except: + return "n/a" + +class RequestBlockedByRule(Exception): + """ + An exception raised when a request should be blocked to respect a + Warcprox-Meta rule. 
+ """ + def __init__(self, msg): + self.msg = msg + def __str__(self): + return "%s: %s" % (self.__class__.__name__, self.msg) + +class Url: + ''' + Utility class + ''' + def __init__(self, url): + self.url = url + self._surt = None + self._host = None + + @property + def surt(self): + if not self._surt: + import surt + hurl = surt.handyurl.parse(self.url) + surt.GoogleURLCanonicalizer.canonicalize(hurl) + hurl.query = None + hurl.hash = None + self._surt = hurl.getURLString(surt=True, trailing_comma=True) + return self._surt + + @property + def host(self): + if not self._host: + import surt + self._host = surt.handyurl.parse(self.url).host + return self._host + + def matches_ip_or_domain(self, ip_or_domain): + return host_matches_ip_or_domain(self.host, ip_or_domain) + +def normalize_host(host): + # normalize host (punycode and lowercase) + return host.encode('idna').decode('ascii').lower() + +def host_matches_ip_or_domain(host, ip_or_domain): + ''' + Returns true if + - ip_or_domain is an ip address and host is the same ip address + - ip_or_domain is a domain and host is the same domain + - ip_or_domain is a domain and host is a subdomain of it + ''' + _host = normalize_host(host) + _ip_or_domain = normalize_host(ip_or_domain) + + if _ip_or_domain == _host: + return True + + # if either _ip_or_domain or host are ip addresses, and they're not + # identical (previous check), not a match + try: + ipaddress.ip_address(_ip_or_domain) + return False + except: + pass + try: + ipaddress.ip_address(_host) + return False + except: + pass + + # if we get here, we're looking at two hostnames + domain_parts = _ip_or_domain.split(".") + host_parts = _host.split(".") + + result = host_parts[-len(domain_parts):] == domain_parts + return result + + +# logging level more fine-grained than logging.DEBUG==10 +TRACE = 5 + +import warcprox.controller as controller +import warcprox.playback as playback +import warcprox.dedup as dedup +import warcprox.warcproxy as warcproxy +import 
warcprox.mitmproxy as mitmproxy +import warcprox.writer as writer +import warcprox.warc as warc +import warcprox.writerthread as writerthread +import warcprox.stats as stats +import warcprox.bigtable as bigtable +import warcprox.kafkafeed as kafkafeed diff --git a/warcprox/bigtable.py b/warcprox/bigtable.py new file mode 100644 index 0000000..a1ac377 --- /dev/null +++ b/warcprox/bigtable.py @@ -0,0 +1,218 @@ +""" +warcprox/bigtable.py - module for "big" RethinkDB table for deduplication; +the table is "big" in the sense that it is designed to be usable as an index +for playback software outside of warcprox, and contains information not +needed merely for deduplication + +Copyright (C) 2015-2016 Internet Archive + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +USA. 
+""" + +from __future__ import absolute_import + +import logging +from hanzo import warctools +import random +import warcprox +import base64 +import surt +import os +import hashlib +import threading +import datetime +import rethinkstuff + +class RethinkCaptures: + """Inserts in batches every 0.5 seconds""" + logger = logging.getLogger("warcprox.bigtable.RethinkCaptures") + + def __init__(self, r, table="captures", shards=None, replicas=None, options=warcprox.Options()): + self.r = r + self.table = table + self.shards = shards or len(r.servers) + self.replicas = replicas or min(3, len(r.servers)) + self.options = options + self._ensure_db_table() + + self._stop = threading.Event() + self._batch_lock = threading.RLock() + with self._batch_lock: + self._batch = [] + self._timer = None + + def start(self): + """Starts batch insert repeating timer""" + self._insert_batch() + + def _insert_batch(self): + try: + with self._batch_lock: + if len(self._batch) > 0: + result = self.r.table(self.table).insert(self._batch).run() + if result["inserted"] != len(self._batch) or sorted( + result.values()) != [0,0,0,0,0,len(self._batch)]: + raise Exception( + "unexpected result %s saving batch of %s " + "entries", result, len(self._batch)) + self.logger.debug( + "saved %s entries to big capture table db", + len(self._batch)) + self._batch = [] + except BaseException as e: + self.logger.error( + "caught exception trying to save %s entries, they will " + "be included in the next batch", len(self._batch), + exc_info=True) + finally: + if not self._stop.is_set(): + t = threading.Timer(0.5, self._insert_batch) + t.name = "RethinkCaptures-batch-insert-timer-%s" % datetime.datetime.utcnow().isoformat() + t.start() + # ensure self._timer joinable (already started) whenever close() happens to be called + self._timer = t + else: + self.logger.info("finished") + + def _ensure_db_table(self): + dbs = self.r.db_list().run() + if not self.r.dbname in dbs: + self.logger.info("creating rethinkdb 
database %s", repr(self.r.dbname)) + self.r.db_create(self.r.dbname).run() + tables = self.r.table_list().run() + if not self.table in tables: + self.logger.info("creating rethinkdb table %s in database %s", repr(self.table), repr(self.r.dbname)) + self.r.table_create(self.table, shards=self.shards, replicas=self.replicas).run() + self.r.table(self.table).index_create("abbr_canon_surt_timestamp", [self.r.row["abbr_canon_surt"], self.r.row["timestamp"]]).run() + self.r.table(self.table).index_create("sha1_warc_type", [self.r.row["sha1base32"], self.r.row["warc_type"], self.r.row["bucket"]]).run() + + def find_response_by_digest(self, algo, raw_digest, bucket="__unspecified__"): + if algo != "sha1": + raise Exception("digest type is {} but big capture table is indexed by sha1".format(algo)) + sha1base32 = base64.b32encode(raw_digest).decode("utf-8") + results_iter = self.r.table(self.table).get_all([sha1base32, "response", bucket], index="sha1_warc_type").run() + results = list(results_iter) + if len(results) > 0: + if len(results) > 1: + self.logger.debug("expected 0 or 1 but found %s results for sha1base32=%s bucket=%s (will use first result)", len(results), sha1base32, bucket) + result = results[0] + else: + result = None + self.logger.debug("returning %s for sha1base32=%s bucket=%s", + result, sha1base32, bucket) + return result + + def _assemble_entry(self, recorded_url, records): + if recorded_url.response_recorder: + if recorded_url.response_recorder.payload_digest.name == "sha1": + sha1base32 = base64.b32encode( + recorded_url.response_recorder.payload_digest.digest() + ).decode("utf-8") + else: + self.logger.warn( + "digest type is %s but big capture table is indexed " + "by sha1", + recorded_url.response_recorder.payload_digest.name) + else: + digest = hashlib.new("sha1", records[0].content[1]) + sha1base32 = base64.b32encode(digest.digest()).decode("utf-8") + + if (recorded_url.warcprox_meta + and "captures-bucket" in recorded_url.warcprox_meta): + bucket 
= recorded_url.warcprox_meta["captures-bucket"] + else: + bucket = "__unspecified__" + + canon_surt = surt.surt(recorded_url.url.decode("utf-8"), + trailing_comma=True, host_massage=False, with_scheme=True) + + entry = { + # id only specified for rethinkdb partitioning + "id": "{} {}".format( + canon_surt[:20], records[0].id.decode("utf-8")[10:-1]), + "abbr_canon_surt": canon_surt[:150], + "canon_surt": canon_surt, + "timestamp": recorded_url.timestamp.replace( + tzinfo=rethinkstuff.UTC), + "url": recorded_url.url.decode("utf-8"), + "offset": records[0].offset, + "filename": os.path.basename(records[0].warc_filename), + "warc_type": records[0].type.decode("utf-8"), + "warc_id": records[0].id.decode("utf-8"), + "sha1base32": sha1base32, + "content_type": recorded_url.mimetype, + "response_code": recorded_url.status, + "http_method": recorded_url.method, + "bucket": bucket, + "length": records[0].length, + } + + if (recorded_url.warcprox_meta and + "captures-table-extra-fields" in recorded_url.warcprox_meta): + extras = recorded_url.warcprox_meta["captures-table-extra-fields"] + for extra_field in extras: + entry[extra_field] = extras[extra_field] + + return entry + + def notify(self, recorded_url, records): + entry = self._assemble_entry(recorded_url, records) + with self._batch_lock: + self._batch.append(entry) + + def close(self): + self.stop() + + def stop(self): + self.logger.info("closing rethinkdb captures table") + self._stop.set() + if self._timer: + self._timer.join() + +class RethinkCapturesDedup: + logger = logging.getLogger("warcprox.dedup.RethinkCapturesDedup") + + def __init__(self, captures_db, options=warcprox.Options()): + self.captures_db = captures_db + self.options = options + + def lookup(self, digest_key, bucket="__unspecified__"): + k = digest_key.decode("utf-8") if isinstance(digest_key, bytes) else digest_key + algo, value_str = k.split(":") + if self.options.base32: + raw_digest = base64.b32decode(value_str, casefold=True) + else: + 
raw_digest = base64.b16decode(value_str, casefold=True) + entry = self.captures_db.find_response_by_digest(algo, raw_digest, bucket) + if entry: + dedup_info = { + "url": entry["url"].encode("utf-8"), + "date": entry["timestamp"].strftime("%Y-%m-%dT%H:%M:%SZ").encode("utf-8"), + } + if "warc_id" in entry: + dedup_info["id"] = entry["warc_id"].encode("utf-8") + return dedup_info + else: + return None + + def start(self): + self.captures_db.start() + + def stop(self): + self.captures_db.stop() + + def close(self): + self.captures_db.close() diff --git a/warcprox/controller.py b/warcprox/controller.py index 185ce9f..9796e71 100644 --- a/warcprox/controller.py +++ b/warcprox/controller.py @@ -1,19 +1,45 @@ -# vim: set sw=4 et: +''' +warcprox/controller.py - contains WarcproxController class, responsible for +starting up and shutting down the various components of warcprox, and for +sending heartbeats to the service registry if configured to do so; also has +some memory profiling capabilities + +Copyright (C) 2013-2016 Internet Archive + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +USA. 
+''' from __future__ import absolute_import import logging import threading -import signal import time - -import warcprox.warcprox -import warcprox.warcwriter +import warcprox +import sys +import gc +import datetime class WarcproxController(object): logger = logging.getLogger("warcprox.controller.WarcproxController") - def __init__(self, proxy=None, warc_writer_thread=None, playback_proxy=None): + HEARTBEAT_INTERVAL = 20.0 + + def __init__(self, proxy=None, warc_writer_thread=None, + playback_proxy=None, service_registry=None, + options=warcprox.Options()): """ Create warcprox controller. @@ -34,44 +60,129 @@ class WarcproxController(object): else: self.warc_writer_thread = warcprox.warcwriter.WarcWriterThread(recorded_url_q=self.proxy.recorded_url_q) + self.proxy_thread = None + self.playback_proxy_thread = None self.playback_proxy = playback_proxy + self.service_registry = service_registry + self.options = options - - def run_until_shutdown(self): - """Start warcprox and run until shut down. - - If running in the main thread, SIGTERM initiates a graceful shutdown. - Otherwise, call warcprox_controller.stop.set(). 
- """ - proxy_thread = threading.Thread(target=self.proxy.serve_forever, name='ProxyThread') - proxy_thread.start() - self.warc_writer_thread.start() - - if self.playback_proxy is not None: - playback_proxy_thread = threading.Thread(target=self.playback_proxy.serve_forever, name='PlaybackProxyThread') - playback_proxy_thread.start() + self._last_rss = None self.stop = threading.Event() + self._start_stop_lock = threading.Lock() - try: - signal.signal(signal.SIGTERM, self.stop.set) - self.logger.info('SIGTERM will initiate graceful shutdown') - except ValueError: - pass + def debug_mem(self): + self.logger.info("self.proxy.recorded_url_q.qsize()=%s", self.proxy.recorded_url_q.qsize()) + with open("/proc/self/status") as f: + for line in f: + fields = line.split() + if len(fields) >= 2: + k, v = fields[0:2] + if k == "VmHWM:": + hwm = int(v) + elif k == "VmRSS:": + rss = int(v) + elif k == "VmData:": + data = int(v) + elif k == "VmStk:": + stk = int(v) + self.logger.info("rss=%s data=%s stack=%s hwm=%s", rss, data, stk, hwm) + self._last_rss = self._last_rss or rss # to set initial value + + if rss - self._last_rss > 1024: + num_unreachable = gc.collect() + all_objects = gc.get_objects() + total_size = 0 + summary = {} + biggest_objects = [None] * 10 + for obj in all_objects: + size = sys.getsizeof(obj) + total_size += size + if not type(obj) in summary: + summary[type(obj)] = {"count":0,"size":0} + summary[type(obj)]["count"] += 1 + summary[type(obj)]["size"] += size + if size > sys.getsizeof(biggest_objects[-1]): + for i in range(len(biggest_objects)): + if size > sys.getsizeof(biggest_objects[i]): + index = i + break + biggest_objects[index+1:] = biggest_objects[index:-1] + biggest_objects[index] = obj + + self.logger.info("%s objects totaling %s bytes", len(all_objects), total_size) + + self.logger.info("=== biggest types ===") + for item in sorted(summary.items(), key=lambda item: item[1]["size"], reverse=True)[:10]: + self.logger.info("%s bytes in %s instances 
of %s", item[1]["size"], item[1]["count"], item[0]) + + self.logger.info("=== warcprox types ===") + for t in (t for t in summary if str(t).find("warcprox") >= 0): + self.logger.info("%s bytes in %s instances of %s", summary[t]["size"], summary[t]["count"], t) + + for i in range(len(biggest_objects)): + obj = biggest_objects[i] + try: + value = repr(bytes(obj.getbuffer()[:100])) + except: + try: + value = repr(obj)[:100] + except BaseException as e: + value = "<{} getting value>".format(e) + self.logger.info("#%s (%s) (%s bytes) (%s refs) (id=%s): %s", i+1, type(obj), sys.getsizeof(obj), sys.getrefcount(obj), id(obj), value) + self.logger.info("%s unreachable objects totaling %s bytes", len(gc.garbage), sum(sys.getsizeof(x) for x in gc.garbage)) + + self._last_rss = rss + + def _service_heartbeat(self): + if hasattr(self, 'status_info'): + status_info = self.status_info + else: + status_info = { + 'role': 'warcprox', + 'heartbeat_interval': self.HEARTBEAT_INTERVAL, + 'port': self.options.port, + } + status_info['load'] = 1.0 * self.proxy.recorded_url_q.qsize() / (self.proxy.recorded_url_q.maxsize or 100) + status_info['queue_size'] = self.proxy.recorded_url_q.qsize() + + self.status_info = self.service_registry.heartbeat(status_info) + self.logger.log( + warcprox.TRACE, "status in service registry: %s", + self.status_info) + + def start(self): + with self._start_stop_lock: + if self.proxy_thread and self.proxy_thread.is_alive(): + self.logger.info('warcprox is already running') + return + + if self.proxy.stats_db: + self.proxy.stats_db.start() + self.proxy_thread = threading.Thread( + target=self.proxy.serve_forever, name='ProxyThread') + self.proxy_thread.start() + + if self.warc_writer_thread.dedup_db: + self.warc_writer_thread.dedup_db.start() + self.warc_writer_thread.start() + + if self.playback_proxy is not None: + self.playback_proxy_thread = threading.Thread( + target=self.playback_proxy.serve_forever, + name='PlaybackProxyThread') + 
self.playback_proxy_thread.start() + + def shutdown(self): + with self._start_stop_lock: + if not self.proxy_thread or not self.proxy_thread.is_alive(): + self.logger.info('warcprox is not running') + return - try: - while not self.stop.is_set(): - time.sleep(0.5) - except: - pass - finally: self.warc_writer_thread.stop.set() self.proxy.shutdown() self.proxy.server_close() - if self.warc_writer_thread.warc_writer.dedup_db is not None: - self.warc_writer_thread.warc_writer.dedup_db.close() - if self.playback_proxy is not None: self.playback_proxy.shutdown() self.playback_proxy.server_close() @@ -80,7 +191,59 @@ class WarcproxController(object): # wait for threads to finish self.warc_writer_thread.join() - proxy_thread.join() - if self.playback_proxy is not None: - playback_proxy_thread.join() + + if self.proxy.stats_db: + self.proxy.stats_db.stop() + if self.warc_writer_thread.dedup_db: + self.warc_writer_thread.dedup_db.close() + + self.proxy_thread.join() + if self.playback_proxy is not None: + self.playback_proxy_thread.join() + + if self.service_registry and hasattr(self, "status_info"): + self.service_registry.unregister(self.status_info["id"]) + + def run_until_shutdown(self): + """ + Start warcprox and run until shut down. Call + warcprox_controller.stop.set() to initiate graceful shutdown. 
+ """ + self.start() + + last_mem_dbg = datetime.datetime.utcfromtimestamp(0) + + try: + utc = datetime.timezone.utc + except AttributeError: + # python2 :-\ + class UTC(datetime.tzinfo): + def tzname(self, dt): return "UTC+00:00" + def dst(self, dt): return datetime.timedelta(0) + def utcoffset(self, dt): return datetime.timedelta(0) + utc = UTC() + + try: + while not self.stop.is_set(): + if self.service_registry and ( + not hasattr(self, "status_info") or ( + datetime.datetime.now(utc) + - self.status_info["last_heartbeat"] + ).total_seconds() > self.HEARTBEAT_INTERVAL): + self._service_heartbeat() + + if self.options.profile and ( + datetime.datetime.utcnow() - last_mem_dbg + ).total_seconds() > 60: + self.debug_mem() + last_mem_dbg = datetime.datetime.utcnow() + + time.sleep(0.5) + except: + self.logger.critical( + "shutting down in response to fatal exception", + exc_info=True) + pass + finally: + self.shutdown() diff --git a/warcprox/dedup.py b/warcprox/dedup.py index 99a8d55..c5080d3 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -1,30 +1,58 @@ -# vim:set sw=4 et: +# +# warcprox/dedup.py - identical payload digest deduplication +# +# Copyright (C) 2013-2016 Internet Archive +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +# USA. 
+# from __future__ import absolute_import -try: - import dbm.gnu as dbm_gnu -except ImportError: - try: - import gdbm as dbm_gnu - except ImportError: - import anydbm as dbm_gnu - import logging import os import json from hanzo import warctools +import warcprox +import random class DedupDb(object): logger = logging.getLogger("warcprox.dedup.DedupDb") - def __init__(self, dbm_file='./warcprox-dedup.db'): + def __init__(self, dbm_file='./warcprox-dedup.db', options=warcprox.Options()): + try: + import dbm.gnu as dbm_gnu + except ImportError: + try: + import gdbm as dbm_gnu + except ImportError: + import anydbm as dbm_gnu + if os.path.exists(dbm_file): self.logger.info('opening existing deduplication database {}'.format(dbm_file)) else: self.logger.info('creating new deduplication database {}'.format(dbm_file)) self.db = dbm_gnu.open(dbm_file, 'c') + self.options = options + + def start(self): + pass + + def stop(self): + self.close() def close(self): self.db.close() @@ -35,26 +63,115 @@ class DedupDb(object): except: pass - def save(self, key, response_record, offset): + def save(self, digest_key, response_record, bucket=""): record_id = response_record.get_header(warctools.WarcRecord.ID).decode('latin1') url = response_record.get_header(warctools.WarcRecord.URL).decode('latin1') date = response_record.get_header(warctools.WarcRecord.DATE).decode('latin1') - py_value = {'i':record_id, 'u':url, 'd':date} + key = digest_key + b"|" + bucket.encode("utf-8") + + py_value = {'id':record_id, 'url':url, 'date':date} json_value = json.dumps(py_value, separators=(',',':')) self.db[key] = json_value.encode('utf-8') - self.logger.debug('dedup db saved {}:{}'.format(key, json_value)) + self.logger.debug('dedup db saved %s:%s', key, json_value) - def lookup(self, key): + def lookup(self, digest_key, bucket=""): + result = None + key = digest_key + b"|" + bucket.encode("utf-8") if key in self.db: json_result = self.db[key] result = json.loads(json_result.decode('utf-8')) - 
result['i'] = result['i'].encode('latin1') - result['u'] = result['u'].encode('latin1') - result['d'] = result['d'].encode('latin1') - return result + result['id'] = result['id'].encode('latin1') + result['url'] = result['url'].encode('latin1') + result['date'] = result['date'].encode('latin1') + self.logger.debug('dedup db lookup of key=%s returning %s', key, result) + return result + + def notify(self, recorded_url, records): + if (records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.RESPONSE + and recorded_url.response_recorder.payload_size() > 0): + digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest, + self.options.base32) + if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta: + self.save(digest_key, records[0], bucket=recorded_url.warcprox_meta["captures-bucket"]) + else: + self.save(digest_key, records[0]) + + +def decorate_with_dedup_info(dedup_db, recorded_url, base32=False): + if (recorded_url.response_recorder + and recorded_url.response_recorder.payload_digest + and recorded_url.response_recorder.payload_size() > 0): + digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest, base32) + if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta: + recorded_url.dedup_info = dedup_db.lookup(digest_key, recorded_url.warcprox_meta["captures-bucket"]) else: - return None + recorded_url.dedup_info = dedup_db.lookup(digest_key) + +class RethinkDedupDb: + logger = logging.getLogger("warcprox.dedup.RethinkDedupDb") + + def __init__(self, r, table="dedup", shards=None, replicas=None, options=warcprox.Options()): + self.r = r + self.table = table + self.shards = shards or len(r.servers) + self.replicas = replicas or min(3, len(r.servers)) + self._ensure_db_table() + self.options = options + + def _ensure_db_table(self): + dbs = self.r.db_list().run() + if not self.r.dbname in dbs: + self.logger.info("creating rethinkdb database %s", 
repr(self.r.dbname)) + self.r.db_create(self.r.dbname).run() + tables = self.r.table_list().run() + if not self.table in tables: + self.logger.info("creating rethinkdb table %s in database %s shards=%s replicas=%s", + repr(self.table), repr(self.r.dbname), self.shards, self.replicas) + self.r.table_create(self.table, primary_key="key", shards=self.shards, replicas=self.replicas).run() + def start(self): + pass + + def stop(self): + pass + + def close(self): + pass + + def sync(self): + pass + + def save(self, digest_key, response_record, bucket=""): + k = digest_key.decode("utf-8") if isinstance(digest_key, bytes) else digest_key + k = "{}|{}".format(k, bucket) + record_id = response_record.get_header(warctools.WarcRecord.ID).decode('latin1') + url = response_record.get_header(warctools.WarcRecord.URL).decode('latin1') + date = response_record.get_header(warctools.WarcRecord.DATE).decode('latin1') + record = {'key':k,'url':url,'date':date,'id':record_id} + result = self.r.table(self.table).insert(record,conflict="replace").run() + if sorted(result.values()) != [0,0,0,0,0,1] and [result["deleted"],result["skipped"],result["errors"]] != [0,0,0]: + raise Exception("unexpected result %s saving %s", result, record) + self.logger.debug('dedup db saved %s:%s', k, record) + + def lookup(self, digest_key, bucket=""): + k = digest_key.decode("utf-8") if isinstance(digest_key, bytes) else digest_key + k = "{}|{}".format(k, bucket) + result = self.r.table(self.table).get(k).run() + if result: + for x in result: + result[x] = result[x].encode("utf-8") + self.logger.debug('dedup db lookup of key=%s returning %s', k, result) + return result + + def notify(self, recorded_url, records): + if (records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.RESPONSE + and recorded_url.response_recorder.payload_size() > 0): + digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest, + self.options.base32) + if recorded_url.warcprox_meta and 
"captures-bucket" in recorded_url.warcprox_meta: + self.save(digest_key, records[0], bucket=recorded_url.warcprox_meta["captures-bucket"]) + else: + self.save(digest_key, records[0]) diff --git a/bin/dump-anydbm b/warcprox/dump_anydbm.py similarity index 67% rename from bin/dump-anydbm rename to warcprox/dump_anydbm.py index 1d1ae4b..6de00c6 100755 --- a/bin/dump-anydbm +++ b/warcprox/dump_anydbm.py @@ -1,12 +1,28 @@ #!/usr/bin/env python -# vim:set sw=4 et: -# +''' +dump-anydbm - dumps contents of dbm file to stdout -""" Dump contents of database to stdout. Database can be any file that the anydbm module can read. Included with warcprox because it's useful for inspecting a deduplication database or a playback index database, but it is a generic tool. -""" + +Copyright (C) 2013-2016 Internet Archive + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +USA. 
+''' try: import dbm @@ -14,7 +30,7 @@ try: whichdb = dbm.whichdb except: - import anydbm + import anydbm dbm = anydbm from whichdb import whichdb @@ -22,6 +38,9 @@ import sys import os.path if __name__ == "__main__": + main() + +def main(): if len(sys.argv) != 2: sys.stderr.write("usage: {} DBM_FILE\n".format(sys.argv[0])) exit(1) diff --git a/warcprox/kafkafeed.py b/warcprox/kafkafeed.py new file mode 100644 index 0000000..e17d2c4 --- /dev/null +++ b/warcprox/kafkafeed.py @@ -0,0 +1,101 @@ +''' +warcprox/kafkafeed.py - support for publishing information about archived +urls to apache kafka + +Copyright (C) 2015-2016 Internet Archive + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +USA. 
+''' + +import kafka +import datetime +import json +import logging +from hanzo import warctools + +class CaptureFeed: + logger = logging.getLogger('warcprox.kafkafeed.CaptureFeed') + + def __init__(self, broker_list, topic=None): + self.broker_list = broker_list + self.topic = topic + self.__producer = None + self._connection_exception = None + + def _producer(self): + if not self.__producer: + try: + # acks=0 to avoid ever blocking + self.__producer = kafka.KafkaProducer( + bootstrap_servers=self.broker_list, acks=0) + if self._connection_exception: + logging.info('connected to kafka successfully!') + self._connection_exception = None + except Exception as e: + if not self._connection_exception: + self._connection_exception = e + logging.error('problem connecting to kafka', exc_info=True) + + return self.__producer + + def notify(self, recorded_url, records): + if records[0].type not in (b'revisit', b'response'): + return + + topic = recorded_url.warcprox_meta.get('capture-feed-topic', self.topic) + if not topic: + return + + try: + payload_digest = records[0].get_header(warctools.WarcRecord.PAYLOAD_DIGEST).decode('utf-8') + except: + payload_digest = '-' + + # {"status_code":200,"content_digest":"sha1:3VU56HI3BTMDZBL2TP7SQYXITT7VEAJQ","host":"www.kaosgl.com","via":"http://www.kaosgl.com/sayfa.php?id=4427","account_id":"877","seed":"http://www.kaosgl.com/","warc_filename":"ARCHIVEIT-6003-WEEKLY-JOB171310-20150903100014694-00002.warc.gz","url":"http://www.kaosgl.com/resim/HomofobiKarsitiBulusma/trabzon05.jpg","size":29700,"start_time_plus_duration":"20150903175709637+1049","timestamp":"2015-09-03T17:57:10.707Z","mimetype":"image/jpeg","collection_id":"6003","is_test_crawl":"false","job_name":"6003-20150902172136074","warc_offset":856320200,"thread":6,"hop_path":"RLLLLLE","extra_info":{},"annotations":"duplicate:digest","content_length":29432} + + now = datetime.datetime.utcnow() + d = { + 'timestamp': '{:%Y-%m-%dT%H:%M:%S}.{:03d}Z'.format(now, 
now.microsecond//1000), + 'size': recorded_url.size, + 'status_code': recorded_url.status, + 'url': recorded_url.url.decode('utf-8'), + 'mimetype': recorded_url.mimetype, + 'content_digest': payload_digest, + 'warc_filename': records[0].warc_filename, + 'warc_offset': records[0].offset, + 'host': recorded_url.host, + 'annotations': 'duplicate:digest' if records[0].type == 'revisit' else '', + 'content_length': recorded_url.response_recorder.len - recorded_url.response_recorder.payload_offset, + 'start_time_plus_duration': '{:%Y%m%d%H%M%S}{:03d}+{}'.format( + recorded_url.timestamp, recorded_url.timestamp.microsecond//1000, + int(recorded_url.duration.total_seconds() * 1000)), + # 'hop_path': ? # only used for seed redirects, which are n/a to brozzler (?) + # 'via': ? + # 'thread': ? # not needed + } + + # fields expected to be populated here are (for archive-it): + # account_id, collection_id, is_test_crawl, seed, job_name + if recorded_url.warcprox_meta and 'capture-feed-extra-fields' in recorded_url.warcprox_meta: + for (k,v) in recorded_url.warcprox_meta['capture-feed-extra-fields'].items(): + d[k] = v + + msg = json.dumps(d, separators=(',', ':')).encode('utf-8') + self.logger.debug('feeding kafka topic=%s msg=%s', repr(topic), msg) + p = self._producer() + if p: + p.send(topic, msg) + diff --git a/warcprox/main.py b/warcprox/main.py index 04156d3..a127016 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -1,5 +1,25 @@ #!/usr/bin/env python -# vim:set sw=4 et: +''' +warcprox/main.py - entrypoint for warcprox executable, parses command line +arguments, initializes components, starts controller, handles signals + +Copyright (C) 2013-2016 Internet Archive + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. 
+ +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +USA. +''' from __future__ import absolute_import @@ -14,21 +34,21 @@ import hashlib import argparse import os import socket - +import traceback +import signal +import threading import certauth.certauth - -import warcprox.playback -import warcprox.dedup -import warcprox.warcwriter -import warcprox.warcprox -import warcprox.controller +import warcprox +import re +import rethinkstuff +import cryptography.hazmat.backends.openssl def _build_arg_parser(prog=os.path.basename(sys.argv[0])): arg_parser = argparse.ArgumentParser(prog=prog, description='warcprox - WARC writing MITM HTTP/S proxy', formatter_class=argparse.ArgumentDefaultsHelpFormatter) arg_parser.add_argument('-p', '--port', dest='port', default='8000', - help='port to listen on') + type=int, help='port to listen on') arg_parser.add_argument('-b', '--address', dest='address', default='localhost', help='address to listen on') arg_parser.add_argument('-c', '--cacert', dest='cacert', @@ -44,10 +64,10 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])): arg_parser.add_argument('-n', '--prefix', dest='prefix', default='WARCPROX', help='WARC filename prefix') arg_parser.add_argument('-s', '--size', dest='size', - default=1000*1000*1000, + default=1000*1000*1000, type=int, help='WARC file rollover size threshold in bytes') arg_parser.add_argument('--rollover-idle-time', - dest='rollover_idle_time', default=None, + dest='rollover_idle_time', default=None, type=int, help="WARC file rollover idle time threshold in seconds (so that Friday's last open WARC doesn't sit there 
all weekend waiting for more data)") try: hash_algos = hashlib.algorithms_guaranteed @@ -57,30 +77,171 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])): default='sha1', help='digest algorithm, one of {}'.format(', '.join(hash_algos))) arg_parser.add_argument('--base32', dest='base32', action='store_true', default=False, help='write digests in Base32 instead of hex') - arg_parser.add_argument('-j', '--dedup-db-file', dest='dedup_db_file', - default='./warcprox-dedup.db', help='persistent deduplication database file; empty string or /dev/null disables deduplication') + arg_parser.add_argument('--stats-db-file', dest='stats_db_file', + default='./warcprox-stats.db', help='persistent statistics database file; empty string or /dev/null disables statistics tracking') arg_parser.add_argument('-P', '--playback-port', dest='playback_port', - default=None, help='port to listen on for instant playback') + type=int, default=None, help='port to listen on for instant playback') arg_parser.add_argument('--playback-index-db-file', dest='playback_index_db_file', default='./warcprox-playback-index.db', help='playback index database file (only used if --playback-port is specified)') + group = arg_parser.add_mutually_exclusive_group() + group.add_argument('-j', '--dedup-db-file', dest='dedup_db_file', + default='./warcprox-dedup.db', help='persistent deduplication database file; empty string or /dev/null disables deduplication') + group.add_argument('--rethinkdb-servers', dest='rethinkdb_servers', + help='rethinkdb servers, used for dedup and stats if specified; e.g. 
db0.foo.org,db0.foo.org:38015,db1.foo.org') + arg_parser.add_argument('--rethinkdb-db', dest='rethinkdb_db', default='warcprox', + help='rethinkdb database name (ignored unless --rethinkdb-servers is specified)') + arg_parser.add_argument('--rethinkdb-big-table', + dest='rethinkdb_big_table', action='store_true', default=False, + help='use a big rethinkdb table called "captures", instead of a small table called "dedup"; table is suitable for use as index for playback (ignored unless --rethinkdb-servers is specified)') + arg_parser.add_argument('--kafka-broker-list', dest='kafka_broker_list', + default=None, help='kafka broker list for capture feed') + arg_parser.add_argument('--kafka-capture-feed-topic', dest='kafka_capture_feed_topic', + default=None, help='kafka capture feed topic') + arg_parser.add_argument('--queue-size', dest='queue_size', default=500, + help=argparse.SUPPRESS) + arg_parser.add_argument('--max-threads', dest='max_threads', + help=argparse.SUPPRESS) + arg_parser.add_argument('--profile', action='store_true', default=False, + help=argparse.SUPPRESS) + arg_parser.add_argument('--onion-tor-socks-proxy', dest='onion_tor_socks_proxy', + default=None, help='host:port of tor socks proxy, used only to connect to .onion sites') arg_parser.add_argument('--version', action='version', - version="warcprox {}".format(warcprox.version_str)) + version="warcprox {}".format(warcprox.__version__)) arg_parser.add_argument('-v', '--verbose', dest='verbose', action='store_true') + arg_parser.add_argument('--trace', dest='trace', action='store_true') arg_parser.add_argument('-q', '--quiet', dest='quiet', action='store_true') - # [--ispartof=warcinfo ispartof] - # [--description=warcinfo description] - # [--operator=warcinfo operator] - # [--httpheader=warcinfo httpheader] return arg_parser +def dump_state(signum=None, frame=None): + ''' + Signal handler, logs stack traces of active threads. 
+ ''' + state_strs = [] -def main(argv=sys.argv): + for th in threading.enumerate(): + try: + state_strs.append(str(th)) + except AssertionError: + state_strs.append('') + stack = traceback.format_stack(sys._current_frames()[th.ident]) + state_strs.append(''.join(stack)) + + logging.warn( + 'dumping state (caught signal %s)\n%s', + signum, '\n'.join(state_strs)) + +def init_controller(args): + ''' + Creates a warcprox.controller.WarcproxController configured according to + the supplied arguments (normally the result of parse_args(sys.argv)). + ''' + options = warcprox.Options(**vars(args)) + + try: + hashlib.new(args.digest_algorithm) + except Exception as e: + logging.fatal(e) + exit(1) + + listeners = [] + if args.rethinkdb_servers: + r = rethinkstuff.Rethinker(args.rethinkdb_servers.split(","), args.rethinkdb_db) + if args.rethinkdb_big_table: + captures_db = warcprox.bigtable.RethinkCaptures(r, options=options) + dedup_db = warcprox.bigtable.RethinkCapturesDedup(captures_db, options=options) + listeners.append(captures_db) + else: + dedup_db = warcprox.dedup.RethinkDedupDb(r, options=options) + listeners.append(dedup_db) + elif args.dedup_db_file in (None, '', '/dev/null'): + logging.info('deduplication disabled') + dedup_db = None + else: + dedup_db = warcprox.dedup.DedupDb(args.dedup_db_file, options=options) + listeners.append(dedup_db) + + if args.rethinkdb_servers: + stats_db = warcprox.stats.RethinkStatsDb(r, options=options) + listeners.append(stats_db) + elif args.stats_db_file in (None, '', '/dev/null'): + logging.info('statistics tracking disabled') + stats_db = None + else: + stats_db = warcprox.stats.StatsDb(args.stats_db_file, options=options) + listeners.append(stats_db) + + if args.kafka_broker_list: + kafka_capture_feed = warcprox.kafkafeed.CaptureFeed( + args.kafka_broker_list, args.kafka_capture_feed_topic) + listeners.append(kafka_capture_feed) + + recorded_url_q = queue.Queue(maxsize=args.queue_size) + + ca_name = 'Warcprox CA on 
{}'.format(socket.gethostname())[:64] + ca = certauth.certauth.CertificateAuthority(args.cacert, args.certs_dir, + ca_name=ca_name) + + proxy = warcprox.warcproxy.WarcProxy(ca=ca, recorded_url_q=recorded_url_q, + stats_db=stats_db, options=options) + + if args.playback_port is not None: + playback_index_db = warcprox.playback.PlaybackIndexDb(args.playback_index_db_file, options=options) + playback_proxy = warcprox.playback.PlaybackProxy( + server_address=(args.address, args.playback_port), ca=ca, + playback_index_db=playback_index_db, warcs_dir=args.directory, + options=options) + listeners.append(playback_index_db) + else: + playback_index_db = None + playback_proxy = None + + writer_pool = warcprox.writer.WarcWriterPool(options=options) + warc_writer_thread = warcprox.writerthread.WarcWriterThread( + recorded_url_q=recorded_url_q, writer_pool=writer_pool, + dedup_db=dedup_db, listeners=listeners, options=options) + + if args.rethinkdb_servers: + svcreg = rethinkstuff.ServiceRegistry(r) + else: + svcreg = None + + controller = warcprox.controller.WarcproxController(proxy, + warc_writer_thread, playback_proxy, service_registry=svcreg, + options=options) + + return controller + +def real_main(args): + # see https://github.com/pyca/cryptography/issues/2911 + cryptography.hazmat.backends.openssl.backend.activate_builtin_random() + + controller = init_controller(args) + + signal.signal(signal.SIGTERM, lambda a,b: controller.stop.set()) + signal.signal(signal.SIGINT, lambda a,b: controller.stop.set()) + signal.signal(signal.SIGQUIT, dump_state) + + controller.run_until_shutdown() + +def parse_args(argv=sys.argv): + ''' + Parses command line arguments with argparse. + ''' arg_parser = _build_arg_parser(prog=os.path.basename(argv[0])) args = arg_parser.parse_args(args=argv[1:]) + return args - if args.verbose: +def main(argv=sys.argv): + ''' + Main method, entry point of warcprox command. 
+ ''' + args = parse_args(argv) + + if args.trace: + loglevel = warcprox.TRACE + elif args.verbose: loglevel = logging.DEBUG elif args.quiet: loglevel = logging.WARNING @@ -90,51 +251,50 @@ def main(argv=sys.argv): logging.basicConfig(stream=sys.stdout, level=loglevel, format='%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s') - try: - hashlib.new(args.digest_algorithm) - except Exception as e: - logging.fatal(e) - exit(1) + real_main(args) - if args.dedup_db_file in (None, '', '/dev/null'): - logging.info('deduplication disabled') - dedup_db = None - else: - dedup_db = warcprox.dedup.DedupDb(args.dedup_db_file) +def ensure_rethinkdb_tables(): + ''' + Creates rethinkdb tables if they don't already exist. Warcprox normally + creates the tables it needs on demand at startup, but if multiple instances + are starting up at the same time, you can end up with duplicate broken + tables. So it's a good idea to use this utility at an early step when + spinning up a cluster. + ''' + arg_parser = argparse.ArgumentParser( + prog=os.path.basename(sys.argv[0]), + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + arg_parser.add_argument( + '--rethinkdb-servers', dest='rethinkdb_servers', default='localhost', + help='rethinkdb servers e.g. 
db0.foo.org,db0.foo.org:38015,db1.foo.org') + arg_parser.add_argument( + '--rethinkdb-db', dest='rethinkdb_db', default='warcprox', + help='rethinkdb database name') + arg_parser.add_argument( + '-q', '--quiet', dest='log_level', + action='store_const', default=logging.INFO, const=logging.WARN) + arg_parser.add_argument( + '-v', '--verbose', dest='log_level', + action='store_const', default=logging.INFO, const=logging.DEBUG) + args = arg_parser.parse_args(args=sys.argv[1:]) - recorded_url_q = queue.Queue() + logging.basicConfig( + stream=sys.stdout, level=args.log_level, + format=( + '%(asctime)s %(levelname)s %(name)s.%(funcName)s' + '(%(filename)s:%(lineno)d) %(message)s')) - ca_name = 'Warcprox CA on {}'.format(socket.gethostname())[:64] - ca = certauth.certauth.CertificateAuthority(args.cacert, args.certs_dir, - ca_name=ca_name) + r = rethinkstuff.Rethinker( + args.rethinkdb_servers.split(','), args.rethinkdb_db) - proxy = warcprox.warcprox.WarcProxy( - server_address=(args.address, int(args.port)), ca=ca, - recorded_url_q=recorded_url_q, - digest_algorithm=args.digest_algorithm) + # services table + rethinkstuff.ServiceRegistry(r) - if args.playback_port is not None: - playback_index_db = warcprox.playback.PlaybackIndexDb(args.playback_index_db_file) - playback_server_address=(args.address, int(args.playback_port)) - playback_proxy = warcprox.playback.PlaybackProxy(server_address=playback_server_address, - ca=ca, playback_index_db=playback_index_db, - warcs_dir=args.directory) - else: - playback_index_db = None - playback_proxy = None - - warc_writer = warcprox.warcwriter.WarcWriter(directory=args.directory, - gzip=args.gzip, prefix=args.prefix, port=int(args.port), - rollover_size=int(args.size), base32=args.base32, - dedup_db=dedup_db, digest_algorithm=args.digest_algorithm, - playback_index_db=playback_index_db) - warc_writer_thread = warcprox.warcwriter.WarcWriterThread( - recorded_url_q=recorded_url_q, warc_writer=warc_writer, - 
rollover_idle_time=int(args.rollover_idle_time) if args.rollover_idle_time is not None else None) - - controller = warcprox.controller.WarcproxController(proxy, warc_writer_thread, playback_proxy) - controller.run_until_shutdown() + # stats table + warcprox.stats.RethinkStatsDb(r) + # captures table + warcprox.bigtable.RethinkCaptures(r) if __name__ == '__main__': main() diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index 0ed3211..6f48d30 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -1,4 +1,28 @@ -# vim:set sw=4 et: +''' +warcprox/mitmproxy.py - man-in-the-middle http/s proxy code, handles http +CONNECT method by creating a snakeoil certificate for the requested site, +calling ssl.wrap_socket() on the client connection; connects to remote +(proxied) host, possibly using tor if host tld is .onion and tor proxy is +configured + +Copyright (C) 2012 Cygnos Corporation +Copyright (C) 2013-2016 Internet Archive + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +USA. 
+''' from __future__ import absolute_import @@ -11,46 +35,194 @@ try: import urllib.parse as urllib_parse except ImportError: import urlparse as urllib_parse - +try: + import http.client as http_client +except ImportError: + import httplib as http_client import socket import logging import ssl +import warcprox +import threading +import datetime +import socks +import tempfile +import hashlib +try: + import socketserver +except ImportError: + import SocketServer as socketserver +import resource +import concurrent.futures + +class ProxyingRecorder(object): + """ + Wraps a socket._fileobject, recording the bytes as they are read, + calculating digests, and sending them on to the proxy client. + """ + + logger = logging.getLogger("warcprox.mitmproxy.ProxyingRecorder") + + def __init__(self, fp, proxy_client, digest_algorithm='sha1', url=None): + self.fp = fp + # "The file has no name, and will cease to exist when it is closed." + self.tempfile = tempfile.SpooledTemporaryFile(max_size=512*1024) + self.digest_algorithm = digest_algorithm + self.block_digest = hashlib.new(digest_algorithm) + self.payload_offset = None + self.payload_digest = None + self.proxy_client = proxy_client + self._proxy_client_conn_open = True + self.len = 0 + self.url = url + + def payload_starts_now(self): + self.payload_digest = hashlib.new(self.digest_algorithm) + self.payload_offset = self.len + + def _update_payload_digest(self, hunk): + if self.payload_digest: + self.payload_digest.update(hunk) + + def _update(self, hunk): + self._update_payload_digest(hunk) + self.block_digest.update(hunk) + + self.tempfile.write(hunk) + + if self.payload_digest and self._proxy_client_conn_open: + try: + self.proxy_client.sendall(hunk) + except BaseException as e: + self._proxy_client_conn_open = False + self.logger.warn( + '%s sending data to proxy client for url %s', + e, self.url) + self.logger.info( + 'will continue downloading from remote server without ' + 'sending to client %s', self.url) + + 
self.len += len(hunk) + + def read(self, size=-1): + hunk = self.fp.read(size) + self._update(hunk) + return hunk + + def readinto(self, b): + n = self.fp.readinto(b) + self._update(b[:n]) + return n + + def readline(self, size=-1): + # XXX depends on implementation details of self.fp.readline(), in + # particular that it doesn't call self.fp.read() + hunk = self.fp.readline(size) + self._update(hunk) + return hunk + + def flush(self): + return self.fp.flush() + + def close(self): + return self.fp.close() + + def __len__(self): + return self.len + + def payload_size(self): + if self.payload_offset is not None: + return self.len - self.payload_offset + else: + return 0 + +class ProxyingRecordingHTTPResponse(http_client.HTTPResponse): + ''' + Implementation of HTTPResponse that uses a ProxyingRecorder to read the + response from the remote web server and send it on to the proxy client, + while recording the bytes in transit. + ''' + def __init__( + self, sock, debuglevel=0, method=None, proxy_client=None, + digest_algorithm='sha1', url=None): + http_client.HTTPResponse.__init__( + self, sock, debuglevel=debuglevel, method=method) + self.proxy_client = proxy_client + self.url = url + + # Keep around extra reference to self.fp because HTTPResponse sets + # self.fp=None after it finishes reading, but we still need it + self.recorder = ProxyingRecorder( + self.fp, proxy_client, digest_algorithm, url=url) + self.fp = self.recorder + + def begin(self): + http_client.HTTPResponse.begin(self) # reads status line, headers + + status_and_headers = 'HTTP/1.1 {} {}\r\n'.format( + self.status, self.reason) + for k,v in self.msg.items(): + if k.lower() not in ( + 'connection', 'proxy-connection', 'keep-alive', + 'proxy-authenticate', 'proxy-authorization', 'upgrade', + 'strict-transport-security'): + status_and_headers += '{}: {}\r\n'.format(k, v) + status_and_headers += 'Connection: close\r\n\r\n' + self.proxy_client.sendall(status_and_headers.encode('latin1')) + + 
self.recorder.payload_starts_now() class MitmProxyHandler(http_server.BaseHTTPRequestHandler): + ''' + An http proxy implementation of BaseHTTPRequestHandler, that acts as a + man-in-the-middle in order to peek at the content of https transactions, + and records the bytes in transit as it proxies them. + ''' logger = logging.getLogger("warcprox.mitmproxy.MitmProxyHandler") def __init__(self, request, client_address, server): + threading.current_thread().name = 'MitmProxyHandler(tid={},started={},client={}:{})'.format(warcprox.gettid(), datetime.datetime.utcnow().isoformat(), client_address[0], client_address[1]) self.is_connect = False self._headers_buffer = [] + request.settimeout(60) # XXX what value should this have? http_server.BaseHTTPRequestHandler.__init__(self, request, client_address, server) def _determine_host_port(self): # Get hostname and port to connect to if self.is_connect: - self.hostname, self.port = self.path.split(':') + host, self.port = self.path.split(':') else: self.url = self.path u = urllib_parse.urlparse(self.url) if u.scheme != 'http': - raise Exception('Unknown scheme %s' % repr(u.scheme)) - self.hostname = u.hostname + raise Exception( + 'unable to parse request %s as a proxy request' % ( + repr(self.requestline))) + host = u.hostname self.port = u.port or 80 self.path = urllib_parse.urlunparse( urllib_parse.ParseResult( - scheme='', - netloc='', - params=u.params, - path=u.path or '/', - query=u.query, - fragment=u.fragment - ) - ) + scheme='', netloc='', params=u.params, path=u.path or '/', + query=u.query, fragment=u.fragment)) + self.hostname = warcprox.normalize_host(host) - def _connect_to_host(self): + def _connect_to_remote_server(self): # Connect to destination - self._proxy_sock = socket.socket() - self._proxy_sock.settimeout(60) - self._proxy_sock.connect((self.hostname, int(self.port))) + if self.onion_tor_socks_proxy_host and self.hostname.lower().endswith('.onion'): + self.logger.info("using tor socks proxy at %s:%s to 
connect to %s", + self.onion_tor_socks_proxy_host, + self.onion_tor_socks_proxy_port or 1080, + self.hostname) + self._remote_server_sock = socks.socksocket() + self._remote_server_sock.set_proxy( + socks.SOCKS5, addr=self.onion_tor_socks_proxy_host, + port=self.onion_tor_socks_proxy_port, rdns=True) + else: + self._remote_server_sock = socket.socket() + + # XXX what value should this timeout have? + self._remote_server_sock.settimeout(60) + self._remote_server_sock.connect((self.hostname, int(self.port))) # Wrap socket if SSL is required if self.is_connect: @@ -58,24 +230,44 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): context = ssl.create_default_context() context.check_hostname = False context.verify_mode = ssl.CERT_NONE - self._proxy_sock = context.wrap_socket(self._proxy_sock, server_hostname=self.hostname) + self._remote_server_sock = context.wrap_socket( + self._remote_server_sock, server_hostname=self.hostname) except AttributeError: try: - self._proxy_sock = ssl.wrap_socket(self._proxy_sock) + self._remote_server_sock = ssl.wrap_socket( + self._remote_server_sock) except ssl.SSLError: - self.logger.warn("failed to establish ssl connection to {}; python ssl library does not support SNI, considering upgrading to python >= 2.7.9 or python 3.4".format(self.hostname)) + self.logger.warn( + "failed to establish ssl connection to %s; python " + "ssl library does not support SNI, considering " + "upgrading to python >= 2.7.9 or python 3.4", + self.hostname) raise + return self._remote_server_sock + def _transition_to_ssl(self): self.request = self.connection = ssl.wrap_socket(self.connection, server_side=True, certfile=self.server.ca.cert_for_host(self.hostname)) def do_CONNECT(self): + ''' + Handles a http CONNECT request. + + The CONNECT method is meant to "convert the request connection to a + transparent TCP/IP tunnel, usually to facilitate SSL-encrypted + communication (HTTPS) through an unencrypted HTTP proxy" (Wikipedia). 
+ + do_CONNECT is where the man-in-the-middle logic happens. In do_CONNECT + the proxy transitions the proxy client connection to ssl while + masquerading as the remote web server using a generated certificate. + Meanwhile makes its own separate ssl connection to the remote web + server. Then it calls self.handle_one_request() again to handle the + request intended for the remote server. + ''' self.is_connect = True try: - # Connect to destination first self._determine_host_port() - self._connect_to_host() # If successful, let's do this! self.send_response(200, 'Connection established') @@ -83,6 +275,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): self._transition_to_ssl() except Exception as e: try: + self.logger.error("problem handling {}: {}".format(repr(self.requestline), e)) if type(e) is socket.timeout: self.send_error(504, str(e)) else: @@ -115,35 +308,162 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): return result def do_COMMAND(self): - if not self.is_connect: - try: - # Connect to destination - self._determine_host_port() - self._connect_to_host() - assert self.url - except Exception as e: - self.send_error(500, str(e)) - return - else: - # if self.is_connect we already connected in do_CONNECT + if self.is_connect: self.url = self._construct_tunneled_url() + else: + self._determine_host_port() + assert self.url - self._proxy_request() + try: + # Connect to destination + self._connect_to_remote_server() + except warcprox.RequestBlockedByRule as e: + # limit enforcers have already sent the appropriate response + self.logger.info("%s: %s", repr(self.requestline), e) + return + except Exception as e: + self.logger.error("problem processing request {}: {}".format(repr(self.requestline), e), exc_info=True) + self.send_error(500, str(e)) + return + try: + self._proxy_request() + except: + self.logger.error("exception proxying request", exc_info=True) + raise def _proxy_request(self): - raise Exception('_proxy_request() not 
implemented in MitmProxyHandler, must be implemented in subclass!') + ''' + Sends the request to the remote server, then uses a ProxyingRecorder to + read the response and send it to the proxy client, while recording the + bytes in transit. Returns a tuple (request, response) where request is + the raw request bytes, and response is a ProxyingRecorder. + ''' + # Build request + req_str = '{} {} {}\r\n'.format( + self.command, self.path, self.request_version) + + # Swallow headers that don't make sense to forward on, i.e. most + # hop-by-hop headers, see + # http://tools.ietf.org/html/rfc2616#section-13.5. + # self.headers is an email.message.Message, which is case-insensitive + # and doesn't throw KeyError in __delitem__ + for key in ( + 'Connection', 'Proxy-Connection', 'Keep-Alive', + 'Proxy-Authenticate', 'Proxy-Authorization', 'Upgrade'): + del self.headers[key] + + # Add headers to the request + # XXX in at least python3.3 str(self.headers) uses \n not \r\n :( + req_str += '\r\n'.join( + '{}: {}'.format(k,v) for (k,v) in self.headers.items()) + + req = req_str.encode('latin1') + b'\r\n\r\n' + + # Append message body if present to the request + if 'Content-Length' in self.headers: + req += self.rfile.read(int(self.headers['Content-Length'])) + + try: + self.logger.debug('sending to remote server req=%s', repr(req)) + + # Send it down the pipe! 
+ self._remote_server_sock.sendall(req) + + prox_rec_res = ProxyingRecordingHTTPResponse( + self._remote_server_sock, proxy_client=self.connection, + digest_algorithm=self.server.digest_algorithm, + url=self.url) + prox_rec_res.begin() + + buf = prox_rec_res.read(8192) + while buf != b'': + buf = prox_rec_res.read(8192) + + self.log_request(prox_rec_res.status, prox_rec_res.recorder.len) + except socket.timeout as e: + self.logger.warn( + "%s proxying %s %s", repr(e), self.command, self.url) + except BaseException as e: + self.logger.error( + "%s proxying %s %s", repr(e), self.command, self.url, + exc_info=True) + finally: + # Let's close off the remote end + if prox_rec_res: + prox_rec_res.close() + self._remote_server_sock.close() + + return req, prox_rec_res def __getattr__(self, item): if item.startswith('do_'): return self.do_COMMAND def log_error(self, fmt, *args): - self.logger.error("{0} - - [{1}] {2}".format(self.address_string(), - self.log_date_time_string(), fmt % args)) + self.logger.warn(fmt, *args) - def log_message(self, fmt, *args): - self.logger.info("{} {} - - [{}] {}".format(self.__class__.__name__, - self.address_string(), self.log_date_time_string(), fmt % args)) +class PooledMixIn(socketserver.ThreadingMixIn): + logger = logging.getLogger("warcprox.mitmproxy.PooledMixIn") + def __init__(self, max_threads=None): + ''' + If max_threads is not supplied, calculates a reasonable value based + on system resource limits. + ''' + if not max_threads: + # man getrlimit: "RLIMIT_NPROC The maximum number of processes (or, + # more precisely on Linux, threads) that can be created for the + # real user ID of the calling process." 
+ rlimit_nproc = resource.getrlimit(resource.RLIMIT_NPROC)[0] + rlimit_nofile = resource.getrlimit(resource.RLIMIT_NOFILE)[0] + max_threads = min(rlimit_nofile // 10, rlimit_nproc // 2) + self.logger.info( + "max_threads=%s (rlimit_nproc=%s, rlimit_nofile=%s)", + max_threads, rlimit_nproc, rlimit_nofile) + self.pool = concurrent.futures.ThreadPoolExecutor(max_threads) + def process_request(self, request, client_address): + self.pool.submit(self.process_request_thread, request, client_address) + +class MitmProxy(http_server.HTTPServer): + def finish_request(self, request, client_address): + ''' + We override socketserver.BaseServer.finish_request to get at + MitmProxyHandler's self.request. A normal socket server's self.request + is set to `request` and never changes, but in our case, it may be + replaced with an SSL socket. The caller of this method (e.g. + self.process_request or PooledMitmProxy.process_request_thread) needs + to get a hold of that socket so it can close it. + ''' + req_handler = self.RequestHandlerClass(request, client_address, self) + return req_handler.request + + def process_request(self, request, client_address): + ''' + This an almost verbatim copy/paste of + socketserver.BaseServer.process_request. + The only difference is that it expects self.finish_request to return + the request (i.e. the socket). This new value of request is passed on + to self.shutdown_request. See the comment on self.finish_request for + the rationale. + ''' + request = self.finish_request(request, client_address) + self.shutdown_request(request) + +class PooledMitmProxy(PooledMixIn, MitmProxy): + def process_request_thread(self, request, client_address): + ''' + This an almost verbatim copy/paste of + socketserver.ThreadingMixIn.process_request_thread. + The only difference is that it expects self.finish_request to return + the request (i.e. the socket). This new value of request is passed on + to self.shutdown_request. 
See the comment on MitmProxy.finish_request + for the rationale. + ''' + try: + request = self.finish_request(request, client_address) + self.shutdown_request(request) + except: + self.handle_error(request, client_address) + self.shutdown_request(request) diff --git a/warcprox/playback.py b/warcprox/playback.py index 9fae6e1..164ba48 100644 --- a/warcprox/playback.py +++ b/warcprox/playback.py @@ -1,4 +1,24 @@ -# vim:set sw=4 et: +''' +warcprox/playback.py - rudimentary support for playback of urls archived by +warcprox (not much used or maintained) + +Copyright (C) 2013-2016 Internet Archive + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +USA. +''' from __future__ import absolute_import @@ -12,14 +32,6 @@ try: except ImportError: import SocketServer as socketserver -try: - import dbm.gnu as dbm_gnu -except ImportError: - try: - import gdbm as dbm_gnu - except ImportError: - import anydbm as dbm_gnu - import logging import os from hanzo import warctools @@ -27,13 +39,14 @@ import json import traceback import re from warcprox.mitmproxy import MitmProxyHandler +import warcprox class PlaybackProxyHandler(MitmProxyHandler): logger = logging.getLogger("warcprox.playback.PlaybackProxyHandler") # @Override - def _connect_to_host(self): - # don't connect to host! + def _connect_to_remote_server(self): + # don't connect to any remote server! 
pass @@ -180,13 +193,14 @@ class PlaybackProxyHandler(MitmProxyHandler): class PlaybackProxy(socketserver.ThreadingMixIn, http_server.HTTPServer): logger = logging.getLogger("warcprox.playback.PlaybackProxy") - def __init__(self, server_address, req_handler_class=PlaybackProxyHandler, - bind_and_activate=True, ca=None, playback_index_db=None, - warcs_dir=None): - http_server.HTTPServer.__init__(self, server_address, req_handler_class, bind_and_activate) + + def __init__(self, ca=None, playback_index_db=None, options=warcprox.Options()): + server_address = (options.address or 'localhost', options.playback_port if options.playback_port is not None else 8001) + http_server.HTTPServer.__init__(self, server_address, PlaybackProxyHandler, bind_and_activate=True) self.ca = ca self.playback_index_db = playback_index_db - self.warcs_dir = warcs_dir + self.warcs_dir = options.directory + self.options = options def server_activate(self): http_server.HTTPServer.server_activate(self) @@ -201,6 +215,14 @@ class PlaybackIndexDb(object): logger = logging.getLogger("warcprox.playback.PlaybackIndexDb") def __init__(self, dbm_file='./warcprox-playback-index.db'): + try: + import dbm.gnu as dbm_gnu + except ImportError: + try: + import gdbm as dbm_gnu + except ImportError: + import anydbm as dbm_gnu + if os.path.exists(dbm_file): self.logger.info('opening existing playback index database {}'.format(dbm_file)) else: @@ -217,6 +239,9 @@ class PlaybackIndexDb(object): except: pass + def notify(self, recorded_url, records): + self.save(records[0].warc_filename, records, records[0].offset) + def save(self, warcfile, recordset, offset): response_record = recordset[0] # XXX canonicalize url? 
diff --git a/warcprox/stats.py b/warcprox/stats.py new file mode 100644 index 0000000..9fd892d --- /dev/null +++ b/warcprox/stats.py @@ -0,0 +1,303 @@ +''' +warcprox/stats.py - keeps statistics on what has been archived + +Copyright (C) 2013-2016 Internet Archive + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +USA. +''' + +from __future__ import absolute_import + +import logging +import os +import json +from hanzo import warctools +import random +import warcprox +import threading +import rethinkdb as r +import datetime +import surt + +def _empty_bucket(bucket): + return { + "bucket": bucket, + "total": { + "urls": 0, + "wire_bytes": 0, + }, + "new": { + "urls": 0, + "wire_bytes": 0, + }, + "revisit": { + "urls": 0, + "wire_bytes": 0, + }, + } + +class StatsDb: + logger = logging.getLogger("warcprox.stats.StatsDb") + + def __init__(self, dbm_file='./warcprox-stats.db', options=warcprox.Options()): + try: + import dbm.gnu as dbm_gnu + except ImportError: + try: + import gdbm as dbm_gnu + except ImportError: + import anydbm as dbm_gnu + + if os.path.exists(dbm_file): + self.logger.info('opening existing stats database {}'.format(dbm_file)) + else: + self.logger.info('creating new stats database {}'.format(dbm_file)) + + self.db = dbm_gnu.open(dbm_file, 'c') + self.options = options + + def start(self): + # method only exists to match 
RethinkStatsDb + pass + + def stop(self): + self.close() + + def close(self): + self.db.close() + + def sync(self): + try: + self.db.sync() + except: + pass + + def value(self, bucket0="__all__", bucket1=None, bucket2=None): + # Gdbm wants str/bytes keys in python2, str/unicode keys in python3. + # This ugliness deals with keys that arrive as unicode in py2. + b0 = bucket0.encode("utf-8") if bucket0 and not isinstance(bucket0, str) else bucket0 + b1 = bucket1.encode("utf-8") if bucket1 and not isinstance(bucket1, str) else bucket1 + b2 = bucket2.encode("utf-8") if bucket2 and not isinstance(bucket2, str) else bucket2 + + if b0 in self.db: + bucket0_stats = json.loads(self.db[b0].decode("utf-8")) + if b1: + if b2: + return bucket0_stats[b1][b2] + else: + return bucket0_stats[b1] + else: + return bucket0_stats + else: + return None + + def notify(self, recorded_url, records): + self.tally(recorded_url, records) + + def buckets(self, recorded_url): + ''' + Unravels bucket definitions in Warcprox-Meta header. Each bucket + definition can either be a string, which signifies the name of the + bucket, or a dict. If a dict it is expected to have at least an item + with key 'bucket' whose value is the name of the bucket. The other + currently recognized item is 'tally-domains', which if supplied should + be a list of domains. This instructs warcprox to additionally tally + substats of the given bucket by domain. Host stats are stored in the + stats table under the key '{parent-bucket}:{domain(normalized)}'. 
+ + Example Warcprox-Meta header (a real one will likely have other + sections besides 'stats'): + + Warcprox-Meta: {'stats':{'buckets':['bucket1',{'bucket':'bucket2','tally-domains':['foo.bar.com','192.168.10.20'}]}} + ''' + buckets = ["__all__"] + if (recorded_url.warcprox_meta + and "stats" in recorded_url.warcprox_meta + and "buckets" in recorded_url.warcprox_meta["stats"]): + for bucket in recorded_url.warcprox_meta["stats"]["buckets"]: + if isinstance(bucket, dict): + if not 'bucket' in bucket: + self.logger.warn( + 'ignoring invalid stats bucket in ' + 'warcprox-meta header %s', bucket) + continue + buckets.append(bucket['bucket']) + if bucket.get('tally-domains'): + url = warcprox.Url(recorded_url.url.decode('utf-8')) + for domain in bucket['tally-domains']: + if url.matches_ip_or_domain(domain): + buckets.append('%s:%s' % ( + bucket['bucket'], + warcprox.normalize_host(domain))) + else: + buckets.append(bucket) + else: + buckets.append("__unspecified__") + + return buckets + + def tally(self, recorded_url, records): + for bucket in self.buckets(recorded_url): + # Gdbm wants str/bytes keys in python2, str/unicode keys in python3. + # This ugliness deals with keys that arrive as unicode in py2. 
+ b = bucket.encode("utf-8") if bucket and not isinstance(bucket, str) else bucket + if b in self.db: + bucket_stats = json.loads(self.db[b].decode("utf-8")) + else: + bucket_stats = _empty_bucket(b) + + bucket_stats["total"]["urls"] += 1 + bucket_stats["total"]["wire_bytes"] += recorded_url.size + + if records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.REVISIT: + bucket_stats["revisit"]["urls"] += 1 + bucket_stats["revisit"]["wire_bytes"] += recorded_url.size + else: + bucket_stats["new"]["urls"] += 1 + bucket_stats["new"]["wire_bytes"] += recorded_url.size + + self.db[b] = json.dumps(bucket_stats, separators=(',',':')).encode("utf-8") + +class RethinkStatsDb(StatsDb): + """Updates database in batch every 2.0 seconds""" + logger = logging.getLogger("warcprox.stats.RethinkStatsDb") + + def __init__(self, rethinker, table="stats", shards=None, replicas=None, options=warcprox.Options()): + self.r = rethinker + self.table = table + self.shards = shards or 1 # 1 shard by default because it's probably a small table + self.replicas = replicas or min(3, len(self.r.servers)) + self._ensure_db_table() + self.options = options + + self._stop = threading.Event() + self._batch_lock = threading.RLock() + with self._batch_lock: + self._batch = {} + self._timer = None + + def start(self): + """Starts batch update repeating timer.""" + self._update_batch() # starts repeating timer + + def _bucket_batch_update_reql(self, bucket): + return self.r.table(self.table).get(bucket).replace( + lambda old: r.branch( + old.eq(None), self._batch[bucket], old.merge({ + "total": { + "urls": old["total"]["urls"].add( + self._batch[bucket]["total"]["urls"]), + "wire_bytes": old["total"]["wire_bytes"].add( + self._batch[bucket]["total"]["wire_bytes"]), + }, + "new": { + "urls": old["new"]["urls"].add( + self._batch[bucket]["new"]["urls"]), + "wire_bytes": old["new"]["wire_bytes"].add( + self._batch[bucket]["new"]["wire_bytes"]), + }, + "revisit": { + "urls": 
old["revisit"]["urls"].add( + self._batch[bucket]["revisit"]["urls"]), + "wire_bytes": old["revisit"]["wire_bytes"].add( + self._batch[bucket]["revisit"]["wire_bytes"]), + }, + }))) + + def _update_batch(self): + with self._batch_lock: + if len(self._batch) > 0: + # XXX can all the buckets be done in one query? + for bucket in self._batch: + result = self._bucket_batch_update_reql(bucket).run() + if (not result["inserted"] and not result["replaced"] + or sorted(result.values()) != [0,0,0,0,0,1]): + raise Exception( + "unexpected result %s updating stats %s" % ( + result, self._batch[bucket])) + self._batch = {} + + if not self._stop.is_set(): + self._timer = threading.Timer(2.0, self._update_batch) + self._timer.name = "RethinkStats-batch-update-timer-%s" % ( + datetime.datetime.utcnow().isoformat()) + self._timer.start() + else: + self.logger.info("finished") + + def _ensure_db_table(self): + dbs = self.r.db_list().run() + if not self.r.dbname in dbs: + self.logger.info( + "creating rethinkdb database %s", repr(self.r.dbname)) + self.r.db_create(self.r.dbname).run() + tables = self.r.table_list().run() + if not self.table in tables: + self.logger.info( + "creating rethinkdb table %s in database %s shards=%s " + "replicas=%s", repr(self.table), repr(self.r.dbname), + self.shards, self.replicas) + self.r.table_create( + self.table, primary_key="bucket", shards=self.shards, + replicas=self.replicas).run() + + def close(self): + self.stop() + + def stop(self): + self.logger.info("stopping rethinkdb stats table batch updates") + self._stop.set() + if self._timer: + self._timer.join() + + def sync(self): + pass + + def value(self, bucket0="__all__", bucket1=None, bucket2=None): + bucket0_stats = self.r.table(self.table).get(bucket0).run() + self.logger.debug( + 'stats db lookup of bucket=%s returned %s', + bucket0, bucket0_stats) + if bucket0_stats: + if bucket1: + if bucket2: + return bucket0_stats[bucket1][bucket2] + else: + return bucket0_stats[bucket1] + return 
bucket0_stats + + def tally(self, recorded_url, records): + buckets = self.buckets(recorded_url) + is_revisit = records[0].get_header( + warctools.WarcRecord.TYPE) == warctools.WarcRecord.REVISIT + with self._batch_lock: + for bucket in buckets: + bucket_stats = self._batch.setdefault( + bucket, _empty_bucket(bucket)) + + bucket_stats["total"]["urls"] += 1 + bucket_stats["total"]["wire_bytes"] += recorded_url.size + + if is_revisit: + bucket_stats["revisit"]["urls"] += 1 + bucket_stats["revisit"]["wire_bytes"] += recorded_url.size + else: + bucket_stats["new"]["urls"] += 1 + bucket_stats["new"]["wire_bytes"] += recorded_url.size + + def notify(self, recorded_url, records): + self.tally(recorded_url, records) + diff --git a/warcprox/tests/test_warcprox.py b/warcprox/tests/test_warcprox.py deleted file mode 100755 index f263bef..0000000 --- a/warcprox/tests/test_warcprox.py +++ /dev/null @@ -1,414 +0,0 @@ -#!/usr/bin/env python -# vim: set sw=4 et: - -import unittest -import threading -import time -import logging -import sys -import ssl -import re -import tempfile -import OpenSSL -import os -import shutil -import requests - -try: - import http.server as http_server -except ImportError: - import BaseHTTPServer as http_server - -try: - import queue -except ImportError: - import Queue as queue - -import certauth.certauth - -import warcprox.controller -import warcprox.warcprox -import warcprox.playback -import warcprox.warcwriter -import warcprox.dedup - -class TestHttpRequestHandler(http_server.BaseHTTPRequestHandler): - logger = logging.getLogger('TestHttpRequestHandler') - - def do_GET(self): - self.logger.info('GET {}'.format(self.path)) - - m = re.match(r'^/([^/]+)/([^/]+)$', self.path) - if m is not None: - special_header = 'warcprox-test-header: {}!'.format(m.group(1)).encode('utf-8') - payload = 'I am the warcprox test payload! 
{}!\n'.format(10*m.group(2)).encode('utf-8') - headers = (b'HTTP/1.1 200 OK\r\n' - + b'Content-Type: text/plain\r\n' - + special_header + b'\r\n' - + b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n' - + b'\r\n') - else: - payload = b'404 Not Found\n' - headers = (b'HTTP/1.1 404 Not Found\r\n' - + b'Content-Type: text/plain\r\n' - + b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n' - + b'\r\n') - - self.connection.sendall(headers) - self.connection.sendall(payload) - - -class WarcproxTest(unittest.TestCase): - logger = logging.getLogger('WarcproxTest') - - def __init__(self, methodName='runTest'): - self.__cert = None - unittest.TestCase.__init__(self, methodName) - - @property - def _cert(self): - if self.__cert is None: - f = tempfile.NamedTemporaryFile(prefix='warcprox-test-https-', suffix='.pem', delete=False) - try: - key = OpenSSL.crypto.PKey() - key.generate_key(OpenSSL.crypto.TYPE_RSA, 2048) - req = OpenSSL.crypto.X509Req() - req.get_subject().CN = 'localhost' - req.set_pubkey(key) - req.sign(key, 'sha1') - cert = OpenSSL.crypto.X509() - cert.set_subject(req.get_subject()) - cert.set_serial_number(0) - cert.gmtime_adj_notBefore(0) - cert.gmtime_adj_notAfter(2*60*60) # valid for 2hrs - cert.set_issuer(cert.get_subject()) - cert.set_pubkey(req.get_pubkey()) - cert.sign(key, 'sha1') - - f.write(OpenSSL.crypto.dump_privatekey(OpenSSL.SSL.FILETYPE_PEM, key)) - f.write(OpenSSL.crypto.dump_certificate(OpenSSL.SSL.FILETYPE_PEM, cert)) - - self.logger.info('generated self-signed certificate {}'.format(f.name)) - self.__cert = f.name - finally: - f.close() - - return self.__cert - - - def _start_http_servers(self): - self.http_daemon = http_server.HTTPServer(('localhost', 0), - RequestHandlerClass=TestHttpRequestHandler) - self.logger.info('starting http://{}:{}'.format(self.http_daemon.server_address[0], self.http_daemon.server_address[1])) - self.http_daemon_thread = threading.Thread(name='HttpdThread', - 
target=self.http_daemon.serve_forever) - self.http_daemon_thread.start() - - # http://www.piware.de/2011/01/creating-an-https-server-in-python/ - self.https_daemon = http_server.HTTPServer(('localhost', 0), - RequestHandlerClass=TestHttpRequestHandler) - # self.https_daemon.socket = ssl.wrap_socket(httpd.socket, certfile='path/to/localhost.pem', server_side=True) - self.https_daemon.socket = ssl.wrap_socket(self.https_daemon.socket, certfile=self._cert, server_side=True) - self.logger.info('starting https://{}:{}'.format(self.https_daemon.server_address[0], self.https_daemon.server_address[1])) - self.https_daemon_thread = threading.Thread(name='HttpdThread', - target=self.https_daemon.serve_forever) - self.https_daemon_thread.start() - - - def _start_warcprox(self): - f = tempfile.NamedTemporaryFile(prefix='warcprox-test-ca-', suffix='.pem', delete=True) - f.close() # delete it, or CertificateAuthority will try to read it - self._ca_file = f.name - self._ca_dir = tempfile.mkdtemp(prefix='warcprox-test-', suffix='-ca') - ca = certauth.certauth.CertificateAuthority(self._ca_file, self._ca_dir, 'warcprox-test') - - recorded_url_q = queue.Queue() - - proxy = warcprox.warcprox.WarcProxy(server_address=('localhost', 0), ca=ca, - recorded_url_q=recorded_url_q) - - self._warcs_dir = tempfile.mkdtemp(prefix='warcprox-test-warcs-') - - f = tempfile.NamedTemporaryFile(prefix='warcprox-test-playback-index-', suffix='.db', delete=False) - f.close() - self._playback_index_db_file = f.name - playback_index_db = warcprox.playback.PlaybackIndexDb(self._playback_index_db_file) - playback_proxy = warcprox.playback.PlaybackProxy(server_address=('localhost', 0), ca=ca, - playback_index_db=playback_index_db, warcs_dir=self._warcs_dir) - - f = tempfile.NamedTemporaryFile(prefix='warcprox-test-dedup-', suffix='.db', delete=False) - f.close() - self._dedup_db_file = f.name - dedup_db = warcprox.dedup.DedupDb(self._dedup_db_file) - - warc_writer = 
warcprox.warcwriter.WarcWriter(directory=self._warcs_dir, - port=proxy.server_port, dedup_db=dedup_db, - playback_index_db=playback_index_db) - warc_writer_thread = warcprox.warcwriter.WarcWriterThread(recorded_url_q=recorded_url_q, - warc_writer=warc_writer) - - self.warcprox = warcprox.controller.WarcproxController(proxy, warc_writer_thread, playback_proxy) - self.logger.info('starting warcprox') - self.warcprox_thread = threading.Thread(name='WarcproxThread', - target=self.warcprox.run_until_shutdown) - self.warcprox_thread.start() - - - def setUp(self): - logging.basicConfig(stream=sys.stdout, level=logging.DEBUG, - format='%(asctime)s %(levelname)s %(process)d %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s') - - self._start_http_servers() - self._start_warcprox() - - archiving_proxy = 'http://localhost:{}'.format(self.warcprox.proxy.server_port) - self.archiving_proxies = {'http':archiving_proxy, 'https':archiving_proxy} - - playback_proxy = 'http://localhost:{}'.format(self.warcprox.playback_proxy.server_port) - self.playback_proxies = {'http':playback_proxy, 'https':playback_proxy} - - - def tearDown(self): - self.logger.info('stopping warcprox') - self.warcprox.stop.set() - - self.logger.info('stopping http and https daemons') - self.http_daemon.shutdown() - self.https_daemon.shutdown() - self.http_daemon.server_close() - self.https_daemon.server_close() - - # Have to wait for threads to finish or the threads will try to use - # variables that no longer exist, resulting in errors like this: - # File "/usr/lib/python2.7/SocketServer.py", line 235, in serve_forever - # r, w, e = _eintr_retry(select.select, [self], [], [], - # AttributeError: 'NoneType' object has no attribute 'select' - self.http_daemon_thread.join() - self.https_daemon_thread.join() - self.warcprox_thread.join() - - for f in (self.__cert, self._ca_file, self._ca_dir, self._warcs_dir, self._playback_index_db_file, self._dedup_db_file): - if os.path.isdir(f): - 
self.logger.info('deleting directory {}'.format(f)) - shutil.rmtree(f) - else: - self.logger.info('deleting file {}'.format(f)) - os.unlink(f) - - - def _test_httpds_no_proxy(self): - url = 'http://localhost:{}/'.format(self.http_daemon.server_port) - response = requests.get(url) - self.assertEqual(response.status_code, 404) - self.assertEqual(response.content, b'404 Not Found\n') - - url = 'https://localhost:{}/'.format(self.https_daemon.server_port) - response = requests.get(url, verify=False) - self.assertEqual(response.status_code, 404) - self.assertEqual(response.content, b'404 Not Found\n') - - url = 'http://localhost:{}/a/b'.format(self.http_daemon.server_port) - response = requests.get(url) - self.assertEqual(response.status_code, 200) - self.assertEqual(response.headers['warcprox-test-header'], 'a!') - self.assertEqual(response.content, b'I am the warcprox test payload! bbbbbbbbbb!\n') - - url = 'https://localhost:{}/c/d'.format(self.https_daemon.server_port) - response = requests.get(url, verify=False) - self.assertEqual(response.status_code, 200) - self.assertEqual(response.headers['warcprox-test-header'], 'c!') - self.assertEqual(response.content, b'I am the warcprox test payload! 
dddddddddd!\n') - - - def poll_playback_until(self, url, status, timeout_sec): - start = time.time() - # check playback (warc writing is asynchronous, give it up to 10 sec) - while time.time() - start < timeout_sec: - response = requests.get(url, proxies=self.playback_proxies, verify=False) - if response.status_code == status: - break - time.sleep(0.5) - - return response - - - def _test_archive_and_playback_http_url(self): - url = 'http://localhost:{}/a/b'.format(self.http_daemon.server_port) - - # ensure playback fails before archiving - response = requests.get(url, proxies=self.playback_proxies) - self.assertEqual(response.status_code, 404) - self.assertEqual(response.content, b'404 Not in Archive\n') - - # archive - response = requests.get(url, proxies=self.archiving_proxies) - self.assertEqual(response.status_code, 200) - self.assertEqual(response.headers['warcprox-test-header'], 'a!') - self.assertEqual(response.content, b'I am the warcprox test payload! bbbbbbbbbb!\n') - - response = self.poll_playback_until(url, status=200, timeout_sec=10) - self.assertEqual(response.status_code, 200) - self.assertEqual(response.headers['warcprox-test-header'], 'a!') - self.assertEqual(response.content, b'I am the warcprox test payload! bbbbbbbbbb!\n') - - - def _test_archive_and_playback_https_url(self): - url = 'https://localhost:{}/c/d'.format(self.https_daemon.server_port) - - # ensure playback fails before archiving - response = requests.get(url, proxies=self.playback_proxies, verify=False) - self.assertEqual(response.status_code, 404) - self.assertEqual(response.content, b'404 Not in Archive\n') - - # fetch & archive response - response = requests.get(url, proxies=self.archiving_proxies, verify=False) - self.assertEqual(response.status_code, 200) - self.assertEqual(response.headers['warcprox-test-header'], 'c!') - self.assertEqual(response.content, b'I am the warcprox test payload! 
dddddddddd!\n') - - # test playback - response = self.poll_playback_until(url, status=200, timeout_sec=10) - self.assertEqual(response.status_code, 200) - self.assertEqual(response.headers['warcprox-test-header'], 'c!') - self.assertEqual(response.content, b'I am the warcprox test payload! dddddddddd!\n') - - - # test dedup of same http url with same payload - def _test_dedup_http(self): - url = 'http://localhost:{}/e/f'.format(self.http_daemon.server_port) - - # ensure playback fails before archiving - response = requests.get(url, proxies=self.playback_proxies, verify=False) - self.assertEqual(response.status_code, 404) - self.assertEqual(response.content, b'404 Not in Archive\n') - - # check not in dedup db - dedup_lookup = self.warcprox.warc_writer_thread.warc_writer.dedup_db.lookup(b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc') - self.assertIsNone(dedup_lookup) - - # archive - response = requests.get(url, proxies=self.archiving_proxies, verify=False) - self.assertEqual(response.status_code, 200) - self.assertEqual(response.headers['warcprox-test-header'], 'e!') - self.assertEqual(response.content, b'I am the warcprox test payload! ffffffffff!\n') - - # test playback - response = self.poll_playback_until(url, status=200, timeout_sec=10) - self.assertEqual(response.status_code, 200) - self.assertEqual(response.headers['warcprox-test-header'], 'e!') - self.assertEqual(response.content, b'I am the warcprox test payload! 
ffffffffff!\n') - - # check in dedup db - # {u'i': u'', u'u': u'https://localhost:62841/c/d', u'd': u'2013-11-22T00:14:37Z'} - dedup_lookup = self.warcprox.warc_writer_thread.warc_writer.dedup_db.lookup(b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc') - self.assertEqual(dedup_lookup['u'], url.encode('ascii')) - self.assertRegexpMatches(dedup_lookup['i'], br'^$') - self.assertRegexpMatches(dedup_lookup['d'], br'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$') - record_id = dedup_lookup['i'] - dedup_date = dedup_lookup['d'] - - # need revisit to have a later timestamp than original, else playing - # back the latest record might not hit the revisit - time.sleep(1.5) - - # fetch & archive revisit - response = requests.get(url, proxies=self.archiving_proxies, verify=False) - self.assertEqual(response.status_code, 200) - self.assertEqual(response.headers['warcprox-test-header'], 'e!') - self.assertEqual(response.content, b'I am the warcprox test payload! ffffffffff!\n') - - # XXX need to give warc writer thread a chance, and we don't have any change to poll for :-\ - time.sleep(2.0) - - # check in dedup db (no change from prev) - dedup_lookup = self.warcprox.warc_writer_thread.warc_writer.dedup_db.lookup(b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc') - self.assertEqual(dedup_lookup['u'], url.encode('ascii')) - self.assertEqual(dedup_lookup['i'], record_id) - self.assertEqual(dedup_lookup['d'], dedup_date) - - # test playback - self.logger.debug('testing playback of revisit of {}'.format(url)) - response = self.poll_playback_until(url, status=200, timeout_sec=10) - self.assertEqual(response.status_code, 200) - self.assertEqual(response.headers['warcprox-test-header'], 'e!') - self.assertEqual(response.content, b'I am the warcprox test payload! ffffffffff!\n') - # XXX how to check dedup was used? 
- - - # test dedup of same https url with same payload - def _test_dedup_https(self): - url = 'https://localhost:{}/g/h'.format(self.https_daemon.server_port) - - # ensure playback fails before archiving - response = requests.get(url, proxies=self.playback_proxies, verify=False) - self.assertEqual(response.status_code, 404) - self.assertEqual(response.content, b'404 Not in Archive\n') - - # check not in dedup db - dedup_lookup = self.warcprox.warc_writer_thread.warc_writer.dedup_db.lookup(b'sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89') - self.assertIsNone(dedup_lookup) - - # archive - response = requests.get(url, proxies=self.archiving_proxies, verify=False) - self.assertEqual(response.status_code, 200) - self.assertEqual(response.headers['warcprox-test-header'], 'g!') - self.assertEqual(response.content, b'I am the warcprox test payload! hhhhhhhhhh!\n') - - # test playback - response = self.poll_playback_until(url, status=200, timeout_sec=10) - self.assertEqual(response.status_code, 200) - self.assertEqual(response.headers['warcprox-test-header'], 'g!') - self.assertEqual(response.content, b'I am the warcprox test payload! 
hhhhhhhhhh!\n') - - # check in dedup db - # {u'i': u'', u'u': u'https://localhost:62841/c/d', u'd': u'2013-11-22T00:14:37Z'} - dedup_lookup = self.warcprox.warc_writer_thread.warc_writer.dedup_db.lookup(b'sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89') - self.assertEqual(dedup_lookup['u'], url.encode('ascii')) - self.assertRegexpMatches(dedup_lookup['i'], br'^$') - self.assertRegexpMatches(dedup_lookup['d'], br'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$') - record_id = dedup_lookup['i'] - dedup_date = dedup_lookup['d'] - - # need revisit to have a later timestamp than original, else playing - # back the latest record might not hit the revisit - time.sleep(1.5) - - # fetch & archive revisit - response = requests.get(url, proxies=self.archiving_proxies, verify=False) - self.assertEqual(response.status_code, 200) - self.assertEqual(response.headers['warcprox-test-header'], 'g!') - self.assertEqual(response.content, b'I am the warcprox test payload! hhhhhhhhhh!\n') - - # XXX need to give warc writer thread a chance, and we don't have any change to poll for :-\ - time.sleep(2.0) - - # check in dedup db (no change from prev) - dedup_lookup = self.warcprox.warc_writer_thread.warc_writer.dedup_db.lookup(b'sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89') - self.assertEqual(dedup_lookup['u'], url.encode('ascii')) - self.assertEqual(dedup_lookup['i'], record_id) - self.assertEqual(dedup_lookup['d'], dedup_date) - - # test playback - self.logger.debug('testing playback of revisit of {}'.format(url)) - response = self.poll_playback_until(url, status=200, timeout_sec=10) - self.assertEqual(response.status_code, 200) - self.assertEqual(response.headers['warcprox-test-header'], 'g!') - self.assertEqual(response.content, b'I am the warcprox test payload! hhhhhhhhhh!\n') - # XXX how to check dedup was used? 
- - - # run everything from here, otherwise it wants to setUp() and tearDown - # around each test - def runTest(self): - self._test_httpds_no_proxy() - self._test_archive_and_playback_http_url() - self._test_archive_and_playback_https_url() - self._test_dedup_http() - self._test_dedup_https() - # self._test_dedup_mixed_http() - # self._test_dedup_mixed_https() - - -if __name__ == '__main__': - unittest.main() - diff --git a/warcprox/warc.py b/warcprox/warc.py new file mode 100644 index 0000000..fbc2a33 --- /dev/null +++ b/warcprox/warc.py @@ -0,0 +1,171 @@ +# +# warcprox/warc.py - assembles warc records +# +# Copyright (C) 2013-2016 Internet Archive +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +# USA. 
class WarcRecordBuilder:
    """
    Assembles hanzo warctools WarcRecord objects for recorded urls.

    digest_algorithm -- algorithm name handed to hashlib.new() when a block
        digest has to be computed here (records built from in-memory data)
    base32 -- passed through to warcprox.digest_str() every time a digest is
        rendered into a header value
    """
    logger = logging.getLogger("warcprox.warc.WarcRecordBuilder")

    def __init__(self, digest_algorithm="sha1", base32=False):
        self.digest_algorithm = digest_algorithm
        self.base32 = base32

    def _build_response_principal_record(self, recorded_url, warc_date):
        """Builds response or revisit record, whichever is appropriate.

        A revisit record is built when recorded_url carries a truthy
        dedup_info attribute; otherwise a full response record is built from
        the response recorder.
        """
        recorder = recorded_url.response_recorder
        dedup_info = getattr(recorded_url, "dedup_info", None)

        if not dedup_info:
            # response record
            return self.build_warc_record(
                    url=recorded_url.url, warc_date=warc_date,
                    recorder=recorder,
                    warc_type=warctools.WarcRecord.RESPONSE,
                    content_type=hanzo.httptools.ResponseMessage.CONTENT_TYPE,
                    remote_ip=recorded_url.remote_ip)

        # revisit record: only the bytes that precede the payload are kept,
        # or everything that was recorded if no payload offset is known
        recorder.tempfile.seek(0)
        if recorder.payload_offset is not None:
            response_header_block = recorder.tempfile.read(
                    recorder.payload_offset)
        else:
            response_header_block = recorder.tempfile.read()

        return self.build_warc_record(
                url=recorded_url.url, warc_date=warc_date,
                data=response_header_block,
                warc_type=warctools.WarcRecord.REVISIT,
                refers_to=dedup_info['id'],
                refers_to_target_uri=dedup_info['url'],
                refers_to_date=dedup_info['date'],
                payload_digest=warcprox.digest_str(
                    recorder.payload_digest, self.base32),
                profile=warctools.WarcRecord.PROFILE_IDENTICAL_PAYLOAD_DIGEST,
                content_type=hanzo.httptools.ResponseMessage.CONTENT_TYPE,
                remote_ip=recorded_url.remote_ip)

    def build_warc_records(self, recorded_url):
        """Returns a tuple of hanzo.warctools.warc.WarcRecord (principal_record, ...)

        With a response recorder the result is (principal_record,
        request_record), the request record being marked concurrent to the
        principal one. Without a response recorder a single record of type
        recorded_url.custom_type is built from recorded_url.request_data.
        """
        warc_date = warctools.warc.warc_datetime_str(recorded_url.timestamp)

        if recorded_url.response_recorder:
            principal_record = self._build_response_principal_record(
                    recorded_url, warc_date)
            request_record = self.build_warc_record(
                    url=recorded_url.url, warc_date=warc_date,
                    data=recorded_url.request_data,
                    warc_type=warctools.WarcRecord.REQUEST,
                    content_type=hanzo.httptools.RequestMessage.CONTENT_TYPE,
                    concurrent_to=principal_record.id)
            return principal_record, request_record

        principal_record = self.build_warc_record(
                url=recorded_url.url, warc_date=warc_date,
                data=recorded_url.request_data,
                warc_type=recorded_url.custom_type,
                content_type=recorded_url.content_type.encode("latin1"))
        return (principal_record,)

    def build_warc_record(self, url, warc_date=None, recorder=None, data=None,
            concurrent_to=None, warc_type=None, content_type=None, remote_ip=None,
            profile=None, refers_to=None, refers_to_target_uri=None,
            refers_to_date=None, payload_digest=None):
        """Builds a single WarcRecord.

        The record content comes from `recorder` (its tempfile and the digests
        it already holds) when one is given, otherwise from the in-memory
        `data`. Every other keyword argument becomes a warc header when it is
        not None. `warc_date` defaults to the current utc time.
        """
        if warc_date is None:
            warc_date = warctools.warc.warc_datetime_str(
                    datetime.datetime.utcnow())

        record_id = warctools.WarcRecord.random_warc_uuid()

        headers = []
        if warc_type is not None:
            headers.append((warctools.WarcRecord.TYPE, warc_type))
        headers.append((warctools.WarcRecord.ID, record_id))
        headers.append((warctools.WarcRecord.DATE, warc_date))
        headers.append((warctools.WarcRecord.URL, url))

        # optional headers, written in this order, only when a value was given
        optional_headers = (
            (warctools.WarcRecord.IP_ADDRESS, remote_ip),
            (warctools.WarcRecord.PROFILE, profile),
            (warctools.WarcRecord.REFERS_TO, refers_to),
            (warctools.WarcRecord.REFERS_TO_TARGET_URI, refers_to_target_uri),
            (warctools.WarcRecord.REFERS_TO_DATE, refers_to_date),
            (warctools.WarcRecord.CONCURRENT_TO, concurrent_to),
            (warctools.WarcRecord.CONTENT_TYPE, content_type),
            (warctools.WarcRecord.PAYLOAD_DIGEST, payload_digest),
        )
        headers.extend(
                (name, value) for name, value in optional_headers
                if value is not None)

        if recorder is not None:
            headers.append((
                warctools.WarcRecord.CONTENT_LENGTH,
                str(len(recorder)).encode('latin1')))
            headers.append((
                warctools.WarcRecord.BLOCK_DIGEST,
                warcprox.digest_str(recorder.block_digest, self.base32)))
            if recorder.payload_digest is not None:
                headers.append((
                    warctools.WarcRecord.PAYLOAD_DIGEST,
                    warcprox.digest_str(recorder.payload_digest, self.base32)))

            # the record content is read from the start of the recorded bytes
            recorder.tempfile.seek(0)
            return warctools.WarcRecord(
                    headers=headers, content_file=recorder.tempfile)

        headers.append((
            warctools.WarcRecord.CONTENT_LENGTH,
            str(len(data)).encode('latin1')))
        digest = hashlib.new(self.digest_algorithm, data)
        headers.append((
            warctools.WarcRecord.BLOCK_DIGEST,
            warcprox.digest_str(digest, self.base32)))
        if not payload_digest:
            # no payload digest supplied by the caller: the digest of the
            # whole block is written as the payload digest too
            headers.append((
                warctools.WarcRecord.PAYLOAD_DIGEST,
                warcprox.digest_str(digest, self.base32)))

        return warctools.WarcRecord(
                headers=headers, content=(content_type, data))

    def build_warcinfo_record(self, filename):
        """Builds the warcinfo record describing the warc file `filename`.

        The record body lists the warcprox version, the local hostname, the
        ip address the hostname resolves to, and the warc format. If the
        hostname cannot be resolved the ip field is left out (and a warning
        is logged) instead of failing the whole record.
        """
        warc_record_date = warctools.warc.warc_datetime_str(
                datetime.datetime.utcnow())
        record_id = warctools.WarcRecord.random_warc_uuid()

        headers = [
            (warctools.WarcRecord.ID, record_id),
            (warctools.WarcRecord.TYPE, warctools.WarcRecord.WARCINFO),
            (warctools.WarcRecord.FILENAME, filename.encode('latin1')),
            (warctools.WarcRecord.DATE, warc_record_date),
        ]

        warcinfo_fields = []
        warcinfo_fields.append(
                b'software: warcprox ' + warcprox.__version__.encode('latin1'))
        hostname = socket.gethostname()
        warcinfo_fields.append(
                'hostname: {}'.format(hostname).encode('latin1'))
        try:
            host_ip = socket.gethostbyname(hostname)
        except socket.error as e:
            # socket.gaierror/herror are subclasses of socket.error on both
            # python 2 and 3
            self.logger.warning(
                    "unable to resolve ip address of %s, leaving ip field "
                    "out of warcinfo record: %s", hostname, e)
        else:
            warcinfo_fields.append('ip: {}'.format(host_ip).encode('latin1'))
        warcinfo_fields.append(b'format: WARC File Format 1.0')
        # fields not written (yet): robots, description, isPartOf
        data = b'\r\n'.join(warcinfo_fields) + b'\r\n'

        return warctools.WarcRecord(
                headers=headers, content=(b'application/warc-fields', data))
- self.tempfile = tempfile.SpooledTemporaryFile(max_size=512*1024) - self.digest_algorithm = digest_algorithm - self.block_digest = hashlib.new(digest_algorithm) - self.payload_offset = None - self.payload_digest = None - self.proxy_dest = proxy_dest - self._proxy_dest_conn_open = True - self._prev_hunk_last_two_bytes = b'' - self.len = 0 - - def _update_payload_digest(self, hunk): - if self.payload_digest is None: - # convoluted handling of two newlines crossing hunks - # XXX write tests for this - if self._prev_hunk_last_two_bytes.endswith(b'\n'): - if hunk.startswith(b'\n'): - self.payload_digest = hashlib.new(self.digest_algorithm) - self.payload_digest.update(hunk[1:]) - self.payload_offset = self.len + 1 - elif hunk.startswith(b'\r\n'): - self.payload_digest = hashlib.new(self.digest_algorithm) - self.payload_digest.update(hunk[2:]) - self.payload_offset = self.len + 2 - elif self._prev_hunk_last_two_bytes == b'\n\r': - if hunk.startswith(b'\n'): - self.payload_digest = hashlib.new(self.digest_algorithm) - self.payload_digest.update(hunk[1:]) - self.payload_offset = self.len + 1 - else: - m = re.search(br'\n\r?\n', hunk) - if m is not None: - self.payload_digest = hashlib.new(self.digest_algorithm) - self.payload_digest.update(hunk[m.end():]) - self.payload_offset = self.len + m.end() - - # if we still haven't found start of payload hold on to these bytes - if self.payload_digest is None: - self._prev_hunk_last_two_bytes = hunk[-2:] - else: - self.payload_digest.update(hunk) - - def _update(self, hunk): - self._update_payload_digest(hunk) - self.block_digest.update(hunk) - - self.tempfile.write(hunk) - - if self._proxy_dest_conn_open: - try: - self.proxy_dest.sendall(hunk) - except BaseException as e: - self._proxy_dest_conn_open = False - self.logger.warn('{} sending data to proxy client'.format(e)) - self.logger.info('will continue downloading from remote server without sending to client') - - self.len += len(hunk) - - def read(self, size=-1): - hunk = 
self.fp.read(size) - self._update(hunk) - return hunk - - def readinto(self, b): - n = self.fp.readinto(b) - self._update(b[:n]) - return n - - def readline(self, size=-1): - # XXX depends on implementation details of self.fp.readline(), in - # particular that it doesn't call self.fp.read() - hunk = self.fp.readline(size) - self._update(hunk) - return hunk - - def close(self): - return self.fp.close() - - def __len__(self): - return self.len - - def payload_size(self): - if self.payload_offset is not None: - return self.len - self.payload_offset - else: - return 0 - - -class ProxyingRecordingHTTPResponse(http_client.HTTPResponse): - - def __init__(self, sock, debuglevel=0, method=None, proxy_dest=None, digest_algorithm='sha1'): - http_client.HTTPResponse.__init__(self, sock, debuglevel=debuglevel, method=method) - - # Keep around extra reference to self.fp because HTTPResponse sets - # self.fp=None after it finishes reading, but we still need it - self.recorder = ProxyingRecorder(self.fp, proxy_dest, digest_algorithm) - self.fp = self.recorder - - -class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): - logger = logging.getLogger("warcprox.warcprox.WarcProxyHandler") - - def _proxy_request(self): - # Build request - req_str = '{} {} {}\r\n'.format(self.command, self.path, self.request_version) - - warcprox_meta = self.headers.get('Warcprox-Meta') - - # Swallow headers that don't make sense to forward on, i.e. 
most - # hop-by-hop headers, see http://tools.ietf.org/html/rfc2616#section-13.5 - # self.headers is an email.message.Message, which is case-insensitive - # and doesn't throw KeyError in __delitem__ - for h in ('Connection', 'Proxy-Connection', 'Keep-Alive', - 'Proxy-Authenticate', 'Proxy-Authorization', 'Upgrade', - 'Warcprox-Meta'): - del self.headers[h] - - # Add headers to the request - # XXX in at least python3.3 str(self.headers) uses \n not \r\n :( - req_str += '\r\n'.join('{}: {}'.format(k,v) for (k,v) in self.headers.items()) - - req = req_str.encode('utf-8') + b'\r\n\r\n' - - # Append message body if present to the request - if 'Content-Length' in self.headers: - req += self.rfile.read(int(self.headers['Content-Length'])) - - self.logger.debug('req={}'.format(repr(req))) - - # Send it down the pipe! - self._proxy_sock.sendall(req) - - # We want HTTPResponse's smarts about http and handling of - # non-compliant servers. But HTTPResponse.read() doesn't return the raw - # bytes read from the server, it unchunks them if they're chunked, and - # might do other stuff. We want to send the raw bytes back to the - # client. So we ignore the values returned by h.read() below. Instead - # the ProxyingRecordingHTTPResponse takes care of sending the raw bytes - # to the proxy client. 
- - # Proxy and record the response - h = ProxyingRecordingHTTPResponse(self._proxy_sock, - proxy_dest=self.connection, - digest_algorithm=self.server.digest_algorithm) - h.begin() - - buf = h.read(8192) - while buf != b'': - buf = h.read(8192) - - self.log_request(h.status, h.recorder.len) - - remote_ip = self._proxy_sock.getpeername()[0] - - # Let's close off the remote end - h.close() - self._proxy_sock.close() - - recorded_url = RecordedUrl(url=self.url, request_data=req, - response_recorder=h.recorder, remote_ip=remote_ip, - warcprox_meta=warcprox_meta) - self.server.recorded_url_q.put(recorded_url) - - return recorded_url - - -class RecordedUrl(object): - def __init__(self, url, request_data, response_recorder, remote_ip, warcprox_meta=None): - # XXX should test what happens with non-ascii url (when does - # url-encoding happen?) - if type(url) is not bytes: - self.url = url.encode('ascii') - else: - self.url = url - - if type(remote_ip) is not bytes: - self.remote_ip = remote_ip.encode('ascii') - else: - self.remote_ip = remote_ip - - self.request_data = request_data - self.response_recorder = response_recorder - - if warcprox_meta: - self.warcprox_meta = json.loads(warcprox_meta) - else: - self.warcprox_meta = {} - - -class WarcProxy(socketserver.ThreadingMixIn, http_server.HTTPServer): - logger = logging.getLogger("warcprox.warcprox.WarcProxy") - - def __init__(self, server_address=('localhost', 8000), - req_handler_class=WarcProxyHandler, bind_and_activate=True, - ca=None, recorded_url_q=None, digest_algorithm='sha1'): - http_server.HTTPServer.__init__(self, server_address, req_handler_class, bind_and_activate) - - self.digest_algorithm = digest_algorithm - - if ca is not None: - self.ca = ca - else: - ca_name = 'Warcprox CA on {}'.format(socket.gethostname())[:64] - self.ca = CertificateAuthority(ca_file='warcprox-ca.pem', - certs_dir='./warcprox-ca', - ca_name=ca_name) - - if recorded_url_q is not None: - self.recorded_url_q = recorded_url_q - else: - 
self.recorded_url_q = queue.Queue() - - def server_activate(self): - http_server.HTTPServer.server_activate(self) - self.logger.info('WarcProxy listening on {0}:{1}'.format(self.server_address[0], self.server_address[1])) - - def server_close(self): - self.logger.info('WarcProxy shutting down') - http_server.HTTPServer.server_close(self) - diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py new file mode 100644 index 0000000..f9e07c3 --- /dev/null +++ b/warcprox/warcproxy.py @@ -0,0 +1,415 @@ +''' +warcprox/warcproxy.py - recording proxy, extends mitmproxy to record traffic, +enqueue info on the recorded url queue + +Copyright (C) 2013-2016 Internet Archive + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +USA. +''' + +from __future__ import absolute_import + +try: + import http.server as http_server +except ImportError: + import BaseHTTPServer as http_server +try: + import socketserver +except ImportError: + import SocketServer as socketserver +try: + import queue +except ImportError: + import Queue as queue +import logging +import re +import traceback +import json +import socket +from hanzo import warctools +from certauth.certauth import CertificateAuthority +import warcprox +import datetime +import ipaddress +import surt + +class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): + ''' + XXX add more information. 
+ + Among other things, this class enforces limits specified in the + Warcprox-Meta request header. If a limit is deemed to have been reached, no + request will be made to the remote destination server. This implementation + detail has implications worth noting. For example, if a limit applies to + "new" (not deduplicated) bytes, and the limit has already been reached, no + request will be made, even if it would have resulted in duplicate content, + which would not count toward the limit. To reiterate, this is because the + limit enforcer does not know that the content would be deduplicated. + ''' + # self.server is WarcProxy + logger = logging.getLogger("warcprox.warcprox.WarcProxyHandler") + + # XXX nearly identical to brozzler.site.Site._scope_rule_applies() but + # there's no obvious common dependency where this code should go... TBD + def _scope_rule_applies(self, rule): + u = warcprox.Url(self.url) + + if "domain" in rule and not u.matches_ip_or_domain(rule["domain"]): + return False + if "url_match" in rule: + if rule["url_match"] == "STRING_MATCH": + return u.url.find(rule["value"]) >= 0 + elif rule["url_match"] == "REGEX_MATCH": + try: + return re.fullmatch(rule["value"], u.url) + except Exception as e: + self.logger.warn( + "caught exception matching against regex %s: %s", + rule["value"], e) + return False + elif rule["url_match"] == "SURT_MATCH": + return u.surt.startswith(rule["value"]) + else: + self.logger.warn("invalid rule.url_match=%s", rule.url_match) + return False + else: + if "domain" in rule: + # we already know that it matches from earlier check + return True + else: + self.logger.warn("unable to make sense of scope rule %s", rule) + return False + + def _enforce_blocks(self, warcprox_meta): + """ + Sends a 403 response and raises warcprox.RequestBlockedByRule if the + url is blocked by a rule in warcprox_meta. 
+ """ + if warcprox_meta and "blocks" in warcprox_meta: + for rule in warcprox_meta["blocks"]: + if self._scope_rule_applies(rule): + body = ("request rejected by warcprox: blocked by " + "rule found in Warcprox-Meta header: %s" + % rule).encode("utf-8") + self.send_response(403, "Forbidden") + self.send_header("Content-Type", "text/plain;charset=utf-8") + self.send_header("Connection", "close") + self.send_header("Content-Length", len(body)) + response_meta = {"blocked-by-rule":rule} + self.send_header( + "Warcprox-Meta", + json.dumps(response_meta, separators=(",",":"))) + self.end_headers() + if self.command != "HEAD": + self.wfile.write(body) + self.connection.close() + raise warcprox.RequestBlockedByRule( + "%s 403 %s %s -- blocked by rule in Warcprox-Meta " + "request header %s" % ( + self.client_address[0], self.command, + self.url, rule)) + + def _enforce_limit(self, limit_key, limit_value, soft=False): + bucket0, bucket1, bucket2 = limit_key.rsplit("/", 2) + _limit_key = limit_key + + # if limit_key looks like 'job1:foo.com/total/urls' then we only want + # to apply this rule if the requested url is within domain + bucket0_fields = bucket0.split(':') + if len(bucket0_fields) == 2: + if not warcprox.host_matches_ip_or_domain( + self.hostname, bucket0_fields[1]): + return # else host matches, go ahead and enforce the limit + bucket0 = '%s:%s' % ( + bucket0_fields[0], + warcprox.normalize_host(bucket0_fields[1])) + _limit_key = '%s/%s/%s' % (bucket0, bucket1, bucket2) + + value = self.server.stats_db.value(bucket0, bucket1, bucket2) + if value and value >= limit_value: + body = ("request rejected by warcprox: reached %s %s=%s\n" % ( + "soft limit" if soft else "limit", _limit_key, + limit_value)).encode("utf-8") + if soft: + self.send_response(430, "Reached soft limit") + else: + self.send_response(420, "Reached limit") + self.send_header("Content-Type", "text/plain;charset=utf-8") + self.send_header("Connection", "close") + self.send_header("Content-Length", 
len(body)) + response_meta = { + "stats": {bucket0:self.server.stats_db.value(bucket0)} + } + if soft: + response_meta["reached-soft-limit"] = {_limit_key:limit_value} + else: + response_meta["reached-limit"] = {_limit_key:limit_value} + self.send_header( + "Warcprox-Meta", + json.dumps(response_meta, separators=(",",":"))) + self.end_headers() + if self.command != "HEAD": + self.wfile.write(body) + self.connection.close() + raise warcprox.RequestBlockedByRule( + "%s %s %s %s -- reached %s %s=%s" % ( + self.client_address[0], 430 if soft else 420, + self.command, self.url, + "soft limit" if soft else "limit", + _limit_key, limit_value)) + + def _enforce_limits(self, warcprox_meta): + """ + Sends a 420 (hard limit) or 430 (soft limit) response and raises + warcprox.RequestBlockedByRule if a limit specified in warcprox_meta is + reached. + """ + if warcprox_meta and "limits" in warcprox_meta: + for item in warcprox_meta["limits"].items(): + limit_key, limit_value = item + self._enforce_limit(limit_key, limit_value, soft=False) + if warcprox_meta and "soft-limits" in warcprox_meta: + for item in warcprox_meta["soft-limits"].items(): + limit_key, limit_value = item + self._enforce_limit(limit_key, limit_value, soft=True) + + def _connect_to_remote_server(self): + ''' + Wraps MitmProxyHandler._connect_to_remote_server, first enforcing + limits and block rules in the Warcprox-Meta request header, if any. + Raises warcprox.RequestBlockedByRule if a rule has been enforced. + Otherwise calls MitmProxyHandler._connect_to_remote_server, which + initializes self._remote_server_sock. 
def _proxy_request(self):
    """
    Proxies the request via MitmProxyHandler._proxy_request, then queues a
    RecordedUrl describing the transaction on self.server.recorded_url_q.

    If the request has a Warcprox-Meta header, it is parsed as json,
    attached to the RecordedUrl, and deleted from self.headers before the
    request is proxied (presumably so that it is not forwarded to the
    remote server -- confirm against MitmProxyHandler).

    Returns the RecordedUrl that was queued.
    """
    meta_header = self.headers.get('Warcprox-Meta')
    self.logger.log(
            warcprox.TRACE, 'request for %s Warcprox-Meta header: %s',
            self.url, repr(meta_header))

    parsed_meta = None
    if meta_header:
        parsed_meta = json.loads(meta_header)
        del self.headers['Warcprox-Meta']

    peer_ip = self._remote_server_sock.getpeername()[0]
    started = datetime.datetime.utcnow()

    parent = warcprox.mitmproxy.MitmProxyHandler
    req, prox_rec_res = parent._proxy_request(self)

    # duration is measured from just before the request is proxied until
    # the RecordedUrl is constructed
    result = RecordedUrl(
            url=self.url, request_data=req,
            response_recorder=prox_rec_res.recorder, remote_ip=peer_ip,
            warcprox_meta=parsed_meta, status=prox_rec_res.status,
            size=prox_rec_res.recorder.len,
            client_ip=self.client_address[0],
            content_type=prox_rec_res.getheader("Content-Type"),
            method=self.command, timestamp=started, host=self.hostname,
            duration=datetime.datetime.utcnow() - started)
    self.server.recorded_url_q.put(result)

    return result
class RecordedUrl:
    """
    Plain record of one proxied (or custom WARCPROX_WRITE_RECORD)
    transaction, as queued for the warc writer.

    url and remote_ip are stored as bytes; str values are encoded as ascii.
    warcprox_meta is always a dict (empty if none was supplied).
    mimetype is content_type with any ';parameter' part removed.
    All other arguments are stored unchanged as attributes of the same name.
    """
    logger = logging.getLogger("warcprox.warcproxy.RecordedUrl")

    def __init__(self, url, request_data, response_recorder, remote_ip,
            warcprox_meta=None, content_type=None, custom_type=None,
            status=None, size=None, client_ip=None, method=None,
            timestamp=None, host=None, duration=None):
        # XXX should test what happens with non-ascii url (when does
        # url-encoding happen?)
        # isinstance (rather than an exact type check) so that bytes
        # subclasses are accepted as-is instead of failing on .encode()
        if isinstance(url, bytes):
            self.url = url
        else:
            self.url = url.encode('ascii')

        if isinstance(remote_ip, bytes):
            self.remote_ip = remote_ip
        else:
            self.remote_ip = remote_ip.encode('ascii')

        self.request_data = request_data
        self.response_recorder = response_recorder

        if warcprox_meta:
            self.warcprox_meta = warcprox_meta
        else:
            self.warcprox_meta = {}

        self.content_type = content_type

        # media type without parameters; whitespace is allowed around the
        # ';' separator, so strip it ("text/html ; charset=x" -> "text/html")
        self.mimetype = content_type
        if self.mimetype:
            self.mimetype = self.mimetype.split(";", 1)[0].strip()

        self.custom_type = custom_type
        self.status = status
        self.size = size
        self.client_ip = client_ip
        self.method = method
        self.timestamp = timestamp
        self.host = host
        self.duration = duration
class WarcProxy(SingleThreadedWarcProxy, warcprox.mitmproxy.PooledMitmProxy):
    """
    SingleThreadedWarcProxy combined with warcprox.mitmproxy.PooledMitmProxy
    (presumably a thread-pool mixin sized by options.max_threads -- see
    warcprox.mitmproxy), with logging around server lifecycle events.
    """
    logger = logging.getLogger("warcprox.warcproxy.WarcProxy")

    # NOTE: the default Options() instance is created once, at class
    # definition time, and shared by every call that omits `options`.
    def __init__(
            self, ca=None, recorded_url_q=None, stats_db=None,
            options=warcprox.Options()):
        """
        Initializes both base classes; ca, recorded_url_q, stats_db and
        options are passed through to SingleThreadedWarcProxy.
        """
        if options.max_threads:
            self.logger.info(
                    "max_threads=%s set by command line option",
                    options.max_threads)
        warcprox.mitmproxy.PooledMitmProxy.__init__(self, options.max_threads)
        SingleThreadedWarcProxy.__init__(
                self, ca, recorded_url_q, stats_db, options)

    def server_activate(self):
        """Activates the server and logs the address it is listening on."""
        http_server.HTTPServer.server_activate(self)
        self.logger.info(
                'listening on %s:%s', self.server_address[0],
                self.server_address[1])

    def server_close(self):
        """Logs the shutdown, then closes the server."""
        self.logger.info('shutting down')
        http_server.HTTPServer.server_close(self)

    def handle_error(self, request, client_address):
        """
        Logs the exception currently being handled (with traceback) at
        warning level instead of using the base class's error handling.
        """
        # Logger.warn is a deprecated alias; warning() is the supported name
        self.logger.warning(
                "exception processing request %s from %s", request,
                client_address, exc_info=True)
self.playback_index_db = playback_index_db - - # warc path and filename stuff - self.directory = directory - self.prefix = prefix - self.port = port - - self._f = None - self._fpath = None - self._serial = 0 - - if not os.path.exists(directory): - self.logger.info("warc destination directory {} doesn't exist, creating it".format(directory)) - os.mkdir(directory) - - - # returns a tuple (principal_record, request_record) where principal_record is either a response or revisit record - def build_warc_records(self, recorded_url): - warc_date = warctools.warc.warc_datetime_str(datetime.utcnow()) - - dedup_info = None - if self.dedup_db is not None and recorded_url.response_recorder.payload_digest is not None: - key = self.digest_str(recorded_url.response_recorder.payload_digest) - dedup_info = self.dedup_db.lookup(key) - - if dedup_info is not None: - # revisit record - recorded_url.response_recorder.tempfile.seek(0) - if recorded_url.response_recorder.payload_offset is not None: - response_header_block = recorded_url.response_recorder.tempfile.read(recorded_url.response_recorder.payload_offset) - else: - response_header_block = recorded_url.response_recorder.tempfile.read() - - principal_record = self.build_warc_record( - url=recorded_url.url, warc_date=warc_date, - data=response_header_block, - warc_type=warctools.WarcRecord.REVISIT, - refers_to=dedup_info['i'], - refers_to_target_uri=dedup_info['u'], - refers_to_date=dedup_info['d'], - payload_digest=self.digest_str(recorded_url.response_recorder.payload_digest), - profile=warctools.WarcRecord.PROFILE_IDENTICAL_PAYLOAD_DIGEST, - content_type=hanzo.httptools.ResponseMessage.CONTENT_TYPE, - remote_ip=recorded_url.remote_ip) - else: - # response record - principal_record = self.build_warc_record( - url=recorded_url.url, warc_date=warc_date, - recorder=recorded_url.response_recorder, - warc_type=warctools.WarcRecord.RESPONSE, - content_type=hanzo.httptools.ResponseMessage.CONTENT_TYPE, - remote_ip=recorded_url.remote_ip) 
- - request_record = self.build_warc_record( - url=recorded_url.url, warc_date=warc_date, - data=recorded_url.request_data, - warc_type=warctools.WarcRecord.REQUEST, - content_type=hanzo.httptools.RequestMessage.CONTENT_TYPE, - concurrent_to=principal_record.id) - - return principal_record, request_record - - - def digest_str(self, hash_obj): - return hash_obj.name.encode('utf-8') + b':' + (base64.b32encode(hash_obj.digest()) if self.base32 else hash_obj.hexdigest().encode('ascii')) - - - def build_warc_record(self, url, warc_date=None, recorder=None, data=None, - concurrent_to=None, warc_type=None, content_type=None, remote_ip=None, - profile=None, refers_to=None, refers_to_target_uri=None, - refers_to_date=None, payload_digest=None): - - if warc_date is None: - warc_date = warctools.warc.warc_datetime_str(datetime.utcnow()) - - record_id = warctools.WarcRecord.random_warc_uuid() - - headers = [] - if warc_type is not None: - headers.append((warctools.WarcRecord.TYPE, warc_type)) - headers.append((warctools.WarcRecord.ID, record_id)) - headers.append((warctools.WarcRecord.DATE, warc_date)) - headers.append((warctools.WarcRecord.URL, url)) - if remote_ip is not None: - headers.append((warctools.WarcRecord.IP_ADDRESS, remote_ip)) - if profile is not None: - headers.append((warctools.WarcRecord.PROFILE, profile)) - if refers_to is not None: - headers.append((warctools.WarcRecord.REFERS_TO, refers_to)) - if refers_to_target_uri is not None: - headers.append((warctools.WarcRecord.REFERS_TO_TARGET_URI, refers_to_target_uri)) - if refers_to_date is not None: - headers.append((warctools.WarcRecord.REFERS_TO_DATE, refers_to_date)) - if concurrent_to is not None: - headers.append((warctools.WarcRecord.CONCURRENT_TO, concurrent_to)) - if content_type is not None: - headers.append((warctools.WarcRecord.CONTENT_TYPE, content_type)) - if payload_digest is not None: - headers.append((warctools.WarcRecord.PAYLOAD_DIGEST, payload_digest)) - - if recorder is not None: - 
headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(recorder)).encode('latin1'))) - headers.append((warctools.WarcRecord.BLOCK_DIGEST, - self.digest_str(recorder.block_digest))) - if recorder.payload_digest is not None: - headers.append((warctools.WarcRecord.PAYLOAD_DIGEST, - self.digest_str(recorder.payload_digest))) - - recorder.tempfile.seek(0) - record = warctools.WarcRecord(headers=headers, content_file=recorder.tempfile) - - else: - headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(data)).encode('latin1'))) - block_digest = hashlib.new(self.digest_algorithm, data) - headers.append((warctools.WarcRecord.BLOCK_DIGEST, - self.digest_str(block_digest))) - - content_tuple = content_type, data - record = warctools.WarcRecord(headers=headers, content=content_tuple) - - return record - - - def timestamp17(self): - now = datetime.utcnow() - return '{:%Y%m%d%H%M%S}{:03d}'.format(now, now.microsecond//1000) - - def close_writer(self): - if self._fpath: - self.logger.info('closing {0}'.format(self._f_finalname)) - self._f.close() - finalpath = os.path.sep.join([self.directory, self._f_finalname]) - os.rename(self._fpath, finalpath) - - self._fpath = None - self._f = None - - def _build_warcinfo_record(self, filename): - warc_record_date = warctools.warc.warc_datetime_str(datetime.utcnow()) - record_id = warctools.WarcRecord.random_warc_uuid() - - headers = [] - headers.append((warctools.WarcRecord.ID, record_id)) - headers.append((warctools.WarcRecord.TYPE, warctools.WarcRecord.WARCINFO)) - headers.append((warctools.WarcRecord.FILENAME, filename.encode('latin1'))) - headers.append((warctools.WarcRecord.DATE, warc_record_date)) - - warcinfo_fields = [] - warcinfo_fields.append(b'software: warcprox ' + warcprox.version_bytes) - hostname = socket.gethostname() - warcinfo_fields.append('hostname: {}'.format(hostname).encode('latin1')) - warcinfo_fields.append('ip: {0}'.format(socket.gethostbyname(hostname)).encode('latin1')) - 
warcinfo_fields.append(b'format: WARC File Format 1.0') - # warcinfo_fields.append('robots: ignore') - # warcinfo_fields.append('description: {0}'.format(self.description)) - # warcinfo_fields.append('isPartOf: {0}'.format(self.is_part_of)) - data = b'\r\n'.join(warcinfo_fields) + b'\r\n' - - record = warctools.WarcRecord(headers=headers, content=(b'application/warc-fields', data)) - - return record - - - # - def _writer(self): - if self._fpath and os.path.getsize(self._fpath) > self.rollover_size: - self.close_writer() - - if self._f == None: - self._f_finalname = '{}-{}-{:05d}-{}-{}-{}.warc{}'.format( - self.prefix, self.timestamp17(), self._serial, os.getpid(), - socket.gethostname(), self.port, '.gz' if self.gzip else '') - self._fpath = os.path.sep.join([self.directory, self._f_finalname + '.open']) - - self._f = open(self._fpath, 'wb') - - warcinfo_record = self._build_warcinfo_record(self._f_finalname) - self.logger.debug('warcinfo_record.headers={}'.format(warcinfo_record.headers)) - warcinfo_record.write_to(self._f, gzip=self.gzip) - - self._serial += 1 - - return self._f - - - def _final_tasks(self, recorded_url, recordset, recordset_offset): - if (self.dedup_db is not None - and recordset[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.RESPONSE - and recorded_url.response_recorder.payload_size() > 0): - key = self.digest_str(recorded_url.response_recorder.payload_digest) - self.dedup_db.save(key, recordset[0], recordset_offset) - - if self.playback_index_db is not None: - self.playback_index_db.save(self._f_finalname, recordset, recordset_offset) - - recorded_url.response_recorder.tempfile.close() - - def write_records(self, recorded_url): - recordset = self.build_warc_records(recorded_url) - - writer = self._writer() - recordset_offset = writer.tell() - - for record in recordset: - offset = writer.tell() - record.write_to(writer, gzip=self.gzip) - self.logger.debug('wrote warc record: warc_type={} content_length={} url={} warc={} 
offset={}'.format( - record.get_header(warctools.WarcRecord.TYPE), - record.get_header(warctools.WarcRecord.CONTENT_LENGTH), - record.get_header(warctools.WarcRecord.URL), - self._fpath, offset)) - - self._f.flush() - - self._final_tasks(recorded_url, recordset, recordset_offset) - - - -class WarcWriterThread(threading.Thread): - logger = logging.getLogger("warcprox.warcwriter.WarcWriterThread") - - def __init__(self, recorded_url_q=None, warc_writer=None, rollover_idle_time=None): - """recorded_url_q is a queue.Queue of warcprox.warcprox.RecordedUrl.""" - threading.Thread.__init__(self, name='WarcWriterThread') - self.recorded_url_q = recorded_url_q - self.rollover_idle_time = rollover_idle_time - self.stop = threading.Event() - if warc_writer: - self.warc_writer = warc_writer - else: - self.warc_writer = WarcWriter() - - def run(self): - self.logger.info('WarcWriterThread starting, directory={} gzip={} rollover_size={} rollover_idle_time={} prefix={} port={}'.format( - os.path.abspath(self.warc_writer.directory), self.warc_writer.gzip, self.warc_writer.rollover_size, - self.rollover_idle_time, self.warc_writer.prefix, self.warc_writer.port)) - - self._last_sync = self._last_activity = time.time() - - while not self.stop.is_set(): - try: - recorded_url = self.recorded_url_q.get(block=True, timeout=0.5) - self.logger.info("recorded_url.warcprox_meta={} for {}".format(recorded_url.warcprox_meta, recorded_url.url)) - self.warc_writer.write_records(recorded_url) - self._last_activity = time.time() - except queue.Empty: - if (self.warc_writer._fpath is not None - and self.rollover_idle_time is not None - and self.rollover_idle_time > 0 - and time.time() - self._last_activity > self.rollover_idle_time): - self.logger.debug('rolling over warc file after {} seconds idle'.format(time.time() - self._last_activity)) - self.warc_writer.close_writer() - - if time.time() - self._last_sync > 60: - if self.warc_writer.dedup_db: - self.warc_writer.dedup_db.sync() - if 
self.warc_writer.playback_index_db: - self.warc_writer.playback_index_db.sync() - self._last_sync = time.time() - - self.logger.info('WarcWriterThread shutting down') - self.warc_writer.close_writer(); - - diff --git a/warcprox/writer.py b/warcprox/writer.py new file mode 100644 index 0000000..72c292f --- /dev/null +++ b/warcprox/writer.py @@ -0,0 +1,168 @@ +# +# warcprox/writer.py - warc writer, manages and writes records to warc files +# +# Copyright (C) 2013-2016 Internet Archive +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, +# USA. 
class WarcWriter:
    """
    Writes warc records to a series of warc files in one directory.

    A file is written as '<name>.open' and renamed to its final name when
    it is closed. A new file is started when the current one exceeds
    rollover_size bytes, or after close_writer() has been called (e.g. by
    maybe_idle_rollover()).
    """
    logger = logging.getLogger('warcprox.writer.WarcWriter')

    def __init__(self, options=warcprox.Options()):
        """
        Reads rollover_size, rollover_idle_time, gzip, digest_algorithm,
        base32, directory and prefix from options, falling back to defaults
        for unset values, and creates the destination directory if it does
        not exist.
        """
        self.rollover_size = options.rollover_size or 1000000000
        self.rollover_idle_time = options.rollover_idle_time or None
        self._last_activity = time.time()

        self.gzip = options.gzip or False
        digest_algorithm = options.digest_algorithm or 'sha1'
        base32 = options.base32
        self.record_builder = warcprox.warc.WarcRecordBuilder(digest_algorithm=digest_algorithm, base32=base32)

        # warc path and filename stuff
        self.directory = options.directory or './warcs'
        self.prefix = options.prefix or 'warcprox'

        self._f = None              # currently open warc file object
        self._fpath = None          # path of the open file ('*.open')
        self._f_finalname = None    # name the file gets when closed
        self._serial = 0            # incremented for every file opened

        # fixed for the lifetime of this writer, part of every filename
        self._randomtoken = "".join(random.Random().sample(string.digits + string.ascii_lowercase, 8))

        if not os.path.exists(self.directory):
            self.logger.info("warc destination directory {} doesn't exist, creating it".format(self.directory))
            # makedirs rather than mkdir, so that a configured directory
            # whose parent directories are missing can still be created
            os.makedirs(self.directory)

    def timestamp17(self):
        """Returns the current utc time as a 17-digit string
        (YYYYmmddHHMMSS plus milliseconds)."""
        now = datetime.utcnow()
        return '{:%Y%m%d%H%M%S}{:03d}'.format(now, now.microsecond//1000)

    def close_writer(self):
        """Closes the open warc file, if any, and renames it from its
        '.open' name to its final name."""
        if self._fpath:
            self.logger.info('closing {0}'.format(self._f_finalname))
            self._f.close()
            finalpath = os.path.sep.join([self.directory, self._f_finalname])
            os.rename(self._fpath, finalpath)

            self._fpath = None
            self._f = None

    # h3 default
    # ${prefix}-${timestamp17}-${randomtoken}-${serialno}.warc.gz"
    # NOTE: the name built below puts the serial number before the random
    # token: ${prefix}-${timestamp17}-${serialno}-${randomtoken}.warc[.gz]
    def _writer(self):
        """
        Returns the file object to write records to, first rolling over to
        a new file if the current one is larger than rollover_size, and
        opening a new file (starting with a warcinfo record) if none is
        open.
        """
        if self._fpath and os.path.getsize(self._fpath) > self.rollover_size:
            self.close_writer()

        if self._f is None:
            self._f_finalname = '{}-{}-{:05d}-{}.warc{}'.format(
                    self.prefix, self.timestamp17(), self._serial, self._randomtoken, '.gz' if self.gzip else '')
            self._fpath = os.path.sep.join([self.directory, self._f_finalname + '.open'])

            self._f = open(self._fpath, 'wb')

            warcinfo_record = self.record_builder.build_warcinfo_record(self._f_finalname)
            self.logger.debug('warcinfo_record.headers={}'.format(warcinfo_record.headers))
            warcinfo_record.write_to(self._f, gzip=self.gzip)

            self._serial += 1

        return self._f

    def write_records(self, recorded_url):
        """Returns tuple of records written, which are instances of
        hanzo.warctools.warc.WarcRecord, decorated with "warc_filename",
        "offset" and "length" attributes."""
        records = self.record_builder.build_warc_records(recorded_url)

        writer = self._writer()

        for record in records:
            offset = writer.tell()
            record.write_to(writer, gzip=self.gzip)
            record.offset = offset
            record.length = writer.tell() - offset
            record.warc_filename = self._f_finalname
            self.logger.debug('wrote warc record: warc_type=%s content_length=%s url=%s warc=%s offset=%d',
                    record.get_header(warctools.WarcRecord.TYPE),
                    record.get_header(warctools.WarcRecord.CONTENT_LENGTH),
                    record.get_header(warctools.WarcRecord.URL),
                    self._fpath, record.offset)

        self._f.flush()
        self._last_activity = time.time()

        return records

    def maybe_idle_rollover(self):
        """Closes the open warc file if rollover_idle_time is set (> 0) and
        nothing has been written for longer than that many seconds."""
        if (self._fpath is not None
                and self.rollover_idle_time is not None
                and self.rollover_idle_time > 0
                and time.time() - self._last_activity > self.rollover_idle_time):
            self.logger.debug('rolling over {} after {} seconds idle'.format(self._f_finalname, time.time() - self._last_activity))
            self.close_writer()
def maybe_idle_rollover(self):
    """Gives every writer in the pool -- the default writer first, then
    each per-prefix writer -- the chance to roll over its warc file if it
    has been idle."""
    every_writer = [self.default_warc_writer]
    every_writer.extend(self.warc_writers.values())
    for warc_writer in every_writer:
        warc_writer.maybe_idle_rollover()
class WarcWriterThread(threading.Thread):
    """
    Thread that takes RecordedUrl objects off recorded_url_q, writes them
    to warcs through writer_pool, then notifies listeners and logs a
    crawl-log style line for each url written.

    Set the `stop` event to shut the thread down; it drains the queue
    before closing the writers.
    """
    # NOTE(review): logger name says "warcproxwriter" although this module
    # is warcprox/writerthread.py; left unchanged in case logging
    # configuration refers to it.
    logger = logging.getLogger("warcprox.warcproxwriter.WarcWriterThread")

    def __init__(self, recorded_url_q=None, writer_pool=None, dedup_db=None, listeners=None, options=warcprox.Options()):
        """recorded_url_q is a queue.Queue of warcprox.warcprox.RecordedUrl."""
        threading.Thread.__init__(self, name='WarcWriterThread')
        self.recorded_url_q = recorded_url_q
        self.stop = threading.Event()
        if writer_pool:
            self.writer_pool = writer_pool
        else:
            # WarcWriterPool lives in warcprox/writer.py and is not imported
            # at module level here, so the bare name used to raise NameError
            from warcprox.writer import WarcWriterPool
            self.writer_pool = WarcWriterPool(options=options)
        self.dedup_db = dedup_db
        self.listeners = listeners
        self.options = options
        # time at which the queue was last found empty, None while busy
        self.idle = None

    def run(self):
        """Runs the writer loop, under cProfile if options.profile is set."""
        if self.options.profile:
            cProfile.runctx('self._run()', globals(), locals(), sort='cumulative')
        else:
            self._run()

    def _run(self):
        """
        Main loop: writes queued urls until `stop` is set and the queue is
        empty, then closes the writers. Any unexpected exception is logged
        and the loop is restarted after a short sleep.
        """
        while not self.stop.is_set():
            try:
                self.name = 'WarcWriterThread(tid={})'.format(warcprox.gettid())
                while True:
                    try:
                        if self.stop.is_set():
                            # shutting down: report progress draining the
                            # queue every 50 urls
                            qsize = self.recorded_url_q.qsize()
                            if qsize % 50 == 0:
                                self.logger.info("%s urls left to write", qsize)

                        recorded_url = self.recorded_url_q.get(block=True, timeout=0.5)
                        self.idle = None
                        if self.dedup_db:
                            warcprox.dedup.decorate_with_dedup_info(self.dedup_db,
                                    recorded_url, base32=self.options.base32)
                        records = self.writer_pool.write_records(recorded_url)
                        self._final_tasks(recorded_url, records)

                        # try to release resources in a timely fashion
                        if recorded_url.response_recorder and recorded_url.response_recorder.tempfile:
                            recorded_url.response_recorder.tempfile.close()
                    except queue.Empty:
                        if self.stop.is_set():
                            break
                        self.idle = time.time()
                        self.writer_pool.maybe_idle_rollover()

                self.logger.info('WarcWriterThread shutting down')
                self.writer_pool.close_writers()
            except:
                # deliberate catch-all: keep the writer thread alive
                self.logger.critical("WarcWriterThread will try to continue after unexpected error", exc_info=True)
                time.sleep(0.5)

    # closest thing we have to heritrix crawl log at the moment
    def _log(self, recorded_url, records):
        """Logs one line summarizing recorded_url and the first record
        written for it ("-" is logged if no payload digest is available)."""
        try:
            payload_digest = records[0].get_header(warctools.WarcRecord.PAYLOAD_DIGEST).decode("utf-8")
        except:
            payload_digest = "-"

        # 2015-07-17T22:32:23.672Z     1         58 dns:www.dhss.delaware.gov P http://www.dhss.delaware.gov/dhss/ text/dns #045 20150717223214881+316 sha1:63UTPB7GTWIHAGIK3WWL76E57BBTJGAK http://www.dhss.delaware.gov/dhss/ - {"warcFileOffset":2964,"warcFilename":"ARCHIVEIT-1303-WEEKLY-JOB165158-20150717223222113-00000.warc.gz"}
        self.logger.info("{} {} {} {} {} size={} {} {} {} offset={}".format(
            recorded_url.client_ip, recorded_url.status, recorded_url.method,
            recorded_url.url.decode("utf-8"), recorded_url.mimetype,
            recorded_url.size, payload_digest, records[0].type.decode("utf-8"),
            records[0].warc_filename, records[0].offset))

    def _final_tasks(self, recorded_url, records):
        """Notifies each listener of the records written (a failing
        listener is logged and does not stop the others), then logs the
        url."""
        if self.listeners:
            for listener in self.listeners:
                try:
                    listener.notify(recorded_url, records)
                except:
                    self.logger.error('%s raised exception',
                            listener.notify, exc_info=True)
        self._log(recorded_url, records)