Merge pull request #17 from internetarchive/2.x

2.x
Noah Levitt 2016-10-19 15:34:49 -07:00 committed by GitHub
commit de3c81fdc8
32 changed files with 4359 additions and 1306 deletions

1
.gitignore vendored

@ -11,3 +11,4 @@ warcs
build
dist
.tox
out.*


@ -1,21 +1,36 @@
# vim: set sw=4 et:
#
# tox approach stolen from
# https://github.com/pypa/pip/blob/abdb597dbfb51b21cc76c1cff068b72c80f3a77d/.travis.yml
#
language: python
python:
- 3.5
- 3.4
- 2.7
- nightly
- pypy
- pypy3
env:
- TOXENV=py27
- TOXENV=py34
matrix:
allow_failures:
- python: pypy
- python: pypy3
addons:
apt:
packages:
- python-gdbm
- python3-gdbm
- tor
services:
- docker
before_install:
- sudo apt-get update
- sudo apt-get -y install python-gdbm python3-gdbm
- sudo service docker restart ; sleep 10 # https://github.com/travis-ci/travis-ci/issues/4778
- docker run -d --publish=28015:28015 rethinkdb
before_script:
- pip install tox
- pip install . pytest requests
script: tox
script:
- py.test -v -s tests
- py.test -v -s --rethinkdb-servers=localhost tests
- py.test -v -s --rethinkdb-servers=localhost --rethinkdb-big-table tests


@ -1,15 +1,11 @@
warcprox - WARC writing MITM HTTP/S proxy
-----------------------------------------
.. image:: https://travis-ci.org/internetarchive/warcprox.png?branch=master
:target: https://travis-ci.org/internetarchive/warcprox
Based on the excellent and simple pymiproxy by Nadeem Douba.
https://github.com/allfro/pymiproxy
License: because pymiproxy is GPL and warcprox is a derivative work of
pymiproxy, warcprox is also GPL.
Install
~~~~~~~
@ -19,6 +15,7 @@ To install latest release run:
::
# apt-get install libffi-dev libssl-dev python3-gdbm
pip install warcprox
You can also install the latest bleeding edge code:
@ -45,10 +42,15 @@ Usage
usage: warcprox [-h] [-p PORT] [-b ADDRESS] [-c CACERT]
[--certs-dir CERTS_DIR] [-d DIRECTORY] [-z] [-n PREFIX]
[-s SIZE] [--rollover-idle-time ROLLOVER_IDLE_TIME]
[-g DIGEST_ALGORITHM] [--base32] [-j DEDUP_DB_FILE]
[-P PLAYBACK_PORT]
[--playback-index-db-file PLAYBACK_INDEX_DB_FILE] [--version]
[-v] [-q]
[-g DIGEST_ALGORITHM] [--base32]
[--stats-db-file STATS_DB_FILE] [-P PLAYBACK_PORT]
[--playback-index-db-file PLAYBACK_INDEX_DB_FILE]
[-j DEDUP_DB_FILE | --rethinkdb-servers RETHINKDB_SERVERS]
[--rethinkdb-db RETHINKDB_DB] [--rethinkdb-big-table]
[--kafka-broker-list KAFKA_BROKER_LIST]
[--kafka-capture-feed-topic KAFKA_CAPTURE_FEED_TOPIC]
[--onion-tor-socks-proxy ONION_TOR_SOCKS_PROXY]
[--version] [-v] [-q]
warcprox - WARC writing MITM HTTP/S proxy
@ -58,84 +60,91 @@ Usage
-b ADDRESS, --address ADDRESS
address to listen on (default: localhost)
-c CACERT, --cacert CACERT
CA certificate file; if file does not exist, it will
be created (default: ./desktop-nlevitt-warcprox-
ca.pem)
CA certificate file; if file does not exist, it
will be created (default: ./MacBook-Pro.local-
warcprox-ca.pem)
--certs-dir CERTS_DIR
where to store and load generated certificates
(default: ./desktop-nlevitt-warcprox-ca)
(default: ./MacBook-Pro.local-warcprox-ca)
-d DIRECTORY, --dir DIRECTORY
where to write warcs (default: ./warcs)
-z, --gzip write gzip-compressed warc records (default: False)
-z, --gzip write gzip-compressed warc records (default:
False)
-n PREFIX, --prefix PREFIX
WARC filename prefix (default: WARCPROX)
-s SIZE, --size SIZE WARC file rollover size threshold in bytes (default:
1000000000)
-s SIZE, --size SIZE WARC file rollover size threshold in bytes
(default: 1000000000)
--rollover-idle-time ROLLOVER_IDLE_TIME
WARC file rollover idle time threshold in seconds (so
that Friday's last open WARC doesn't sit there all
weekend waiting for more data) (default: None)
WARC file rollover idle time threshold in seconds
(so that Friday's last open WARC doesn't sit there
all weekend waiting for more data) (default: None)
-g DIGEST_ALGORITHM, --digest-algorithm DIGEST_ALGORITHM
digest algorithm, one of sha384, sha512, md5, sha224,
sha256, sha1 (default: sha1)
digest algorithm, one of sha1, sha256, md5,
sha224, sha512, sha384 (default: sha1)
--base32 write digests in Base32 instead of hex (default:
False)
-j DEDUP_DB_FILE, --dedup-db-file DEDUP_DB_FILE
persistent deduplication database file; empty string
or /dev/null disables deduplication (default:
./warcprox-dedup.db)
--stats-db-file STATS_DB_FILE
persistent statistics database file; empty string
or /dev/null disables statistics tracking
(default: ./warcprox-stats.db)
-P PLAYBACK_PORT, --playback-port PLAYBACK_PORT
port to listen on for instant playback (default: None)
port to listen on for instant playback (default:
None)
--playback-index-db-file PLAYBACK_INDEX_DB_FILE
playback index database file (only used if --playback-
port is specified) (default: ./warcprox-playback-
index.db)
playback index database file (only used if
--playback-port is specified) (default:
./warcprox-playback-index.db)
-j DEDUP_DB_FILE, --dedup-db-file DEDUP_DB_FILE
persistent deduplication database file; empty
string or /dev/null disables deduplication
(default: ./warcprox-dedup.db)
--rethinkdb-servers RETHINKDB_SERVERS
rethinkdb servers, used for dedup and stats if
specified; e.g.
db0.foo.org,db0.foo.org:38015,db1.foo.org
(default: None)
--rethinkdb-db RETHINKDB_DB
rethinkdb database name (ignored unless
--rethinkdb-servers is specified) (default:
warcprox)
--rethinkdb-big-table
use a big rethinkdb table called "captures",
instead of a small table called "dedup"; table is
suitable for use as index for playback (ignored
unless --rethinkdb-servers is specified) (default:
False)
--kafka-broker-list KAFKA_BROKER_LIST
kafka broker list for capture feed (default: None)
--kafka-capture-feed-topic KAFKA_CAPTURE_FEED_TOPIC
kafka capture feed topic (default: None)
--onion-tor-socks-proxy ONION_TOR_SOCKS_PROXY
host:port of tor socks proxy, used only to connect
to .onion sites (default: None)
--version show program's version number and exit
-v, --verbose
-q, --quiet
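Once warcprox is listening, any http client that supports proxies can be
pointed at it. A minimal sketch using the requests library (assumptions: a
warcprox instance on localhost:8000, the default port; verify=False stands in
for trusting the warcprox CA cert)::

    import requests

    proxies = {'http': 'http://localhost:8000', 'https': 'http://localhost:8000'}
    # warcprox man-in-the-middles https, so the client must either trust the
    # generated CA cert or skip verification
    response = requests.get('http://example.com/', proxies=proxies, verify=False)
    print(response.status_code, len(response.content))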
To do
~~~~~
* (partly done) integration tests, unit tests
* (done) url-agnostic deduplication
* unchunk and/or ungzip before storing payload, or alter request to
discourage server from chunking/gzipping
* check certs from proxied website, like browser does, and present
browser-like warning if appropriate
* keep statistics, produce reports
* write cdx while crawling?
* performance testing
* (done) base32 sha1 like heritrix?
* configurable timeouts and stuff
* evaluate ipv6 support
* (done) more explicit handling of connection closed exception
during transfer
* dns cache?? the system already does a fine job I'm thinking
* keepalive with remote servers?
* (done) python3
* special handling for 304 not-modified (write nothing or write revisit
record... and/or modify request so server never responds with 304)
* (done) instant playback on a second proxy port
* special url for downloading ca cert e.g. http(s)://warcprox./ca.pem
* special url for other stuff, some status info or something?
* browser plugin for warcprox mode
  - accept warcprox CA cert only when in warcprox mode
  - separate temporary cookie store, like incognito
  - "careful! your activity is being archived" banner
  - easy switch between archiving and instant playback proxy port

To not do
^^^^^^^^^
The features below could also be part of warcprox. But maybe they don't
belong here, since this is a proxy, not a crawler/robot. It can be used
by a human with a browser, or by something automated, i.e. a robot. My
feeling is that it's more appropriate to implement these in the robot.

* politeness, i.e. throttle requests per server
* fetch and obey robots.txt
* alter user-agent, maybe insert something like "warcprox mitm
  archiving proxy; +http://archive.org/details/archive.org\_bot"

License
~~~~~~~
Warcprox is a derivative work of pymiproxy, which is GPL. Thus warcprox is also
GPL.

Copyright (C) 2012 Cygnos Corporation
Copyright (C) 2013-2016 Internet Archive

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.


@ -0,0 +1 @@
aiohttp

172
benchmarks/run-benchmarks.py Executable file

@ -0,0 +1,172 @@
#!/usr/bin/env python
#
# run-benchmarks.py - some benchmarking code for warcprox
#
# Copyright (C) 2015-2016 Internet Archive
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
# USA.
#
import sys
import aiohttp
import aiohttp.server
import asyncio
import ssl
import tempfile
import OpenSSL.crypto
import OpenSSL.SSL
import random
import os
import threading
import time
import logging
import warcprox.main
logging.basicConfig(stream=sys.stdout, level=logging.INFO,
format='%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')
def self_signed_cert():
key = OpenSSL.crypto.PKey()
key.generate_key(OpenSSL.crypto.TYPE_RSA, 2048)
cert = OpenSSL.crypto.X509()
cert.set_serial_number(random.randint(0, 2 ** 64 - 1))
cert.get_subject().CN = 'localhost'
cert.set_version(2)
cert.gmtime_adj_notBefore(0)
cert.gmtime_adj_notAfter(10 * 365 * 24 * 60 * 60)
cert.set_issuer(cert.get_subject())
cert.set_pubkey(key)
cert.sign(key, "sha1")
return key, cert
class HttpRequestHandler(aiohttp.server.ServerHttpProtocol):
@asyncio.coroutine
def handle_request(self, message, payload):
response = aiohttp.Response(
self.writer, 200, http_version=message.version
)
n = int(message.path.partition('/')[2])
response.add_header('Content-Type', 'text/plain')
# response.add_header('Content-Length', '18')
response.send_headers()
for i in range(n):
response.write(b'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\n')
yield from response.write_eof()
def run_servers():
loop.run_forever()
def start_servers():
loop = asyncio.get_event_loop()
http = loop.create_server(lambda: HttpRequestHandler(debug=True, keep_alive=75), '127.0.0.1', '8080')
sslcontext = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
key, cert = self_signed_cert()
with tempfile.NamedTemporaryFile(delete=False) as certfile:
certfile.write(OpenSSL.crypto.dump_privatekey(OpenSSL.SSL.FILETYPE_PEM, key))
certfile.write(OpenSSL.crypto.dump_certificate(OpenSSL.SSL.FILETYPE_PEM, cert))
sslcontext.load_cert_chain(certfile.name)
os.remove(certfile.name)
https = loop.create_server(lambda: HttpRequestHandler(debug=True, keep_alive=75), '127.0.0.1', '8443', ssl=sslcontext)
srv = loop.run_until_complete(http)
srv = loop.run_until_complete(https)
logging.info('serving on http://127.0.0.1:8080 and https://127.0.0.1:8443')
class AsyncClient(object):
def __init__(self, proxy=None):
self.n_urls = 0
self.n_bytes = 0
self.proxy = proxy
if proxy:
self.connector = aiohttp.connector.ProxyConnector(proxy, verify_ssl=False)
else:
self.connector = aiohttp.connector.TCPConnector(verify_ssl=False)
@asyncio.coroutine
def read_response(self, r, url):
# time.sleep(random.random() * 10)
while True:
chunk = yield from r.content.read(2**16)
self.n_bytes += len(chunk)
if not chunk:
self.n_urls += 1
logging.debug("finished reading from %s", url)
r.close()
break
@asyncio.coroutine
def one_request(self, url):
logging.debug("issuing request to %s", url)
r = yield from aiohttp.get(url, connector=self.connector)
logging.debug("issued request to %s", url)
yield from self.read_response(r, url)
def benchmark(client):
try:
start = time.time()
tasks_https = [client.one_request('https://localhost:8443/%s' % int(1.1**i)) for i in range(80)]
asyncio.get_event_loop().run_until_complete(asyncio.wait(tasks_https))
tasks_http = [client.one_request('http://localhost:8080/%s' % int(1.1**i)) for i in range(80)]
asyncio.get_event_loop().run_until_complete(asyncio.wait(tasks_http))
finally:
finish = time.time()
logging.info("proxy=%s: %s urls totaling %s bytes in %s seconds", client.proxy, client.n_urls, client.n_bytes, (finish - start))
if __name__ == '__main__':
args = warcprox.main.parse_args()
start_servers()
baseline_client = AsyncClient()
logging.info("===== baseline benchmark starting (no proxy) =====")
benchmark(baseline_client)
logging.info("===== baseline benchmark finished =====")
# Queue size of 1 makes warcprox behave as though it were synchronous (each
# request blocks until the warc writer starts working on the last request).
# This gives us a better sense of sustained max throughput. The
# asynchronous nature of warcprox helps with bursty traffic, as long as the
# average throughput stays below the sustained max.
with tempfile.TemporaryDirectory() as tmpdir:
args.queue_size = 1
args.cacert = os.path.join(tmpdir, "benchmark-warcprox-ca.pem")
args.certs_dir = os.path.join(tmpdir, "benchmark-warcprox-ca")
args.directory = os.path.join(tmpdir, "warcs")
args.gzip = True
args.base32 = True
args.stats_db_file = os.path.join(tmpdir, "stats.db")
args.dedup_db_file = os.path.join(tmpdir, "dedup.db")
warcprox_controller = warcprox.main.init_controller(args)
warcprox_controller_thread = threading.Thread(target=warcprox_controller.run_until_shutdown)
warcprox_controller_thread.start()
proxy = "http://%s:%s" % (args.address, args.port)
proxied_client = AsyncClient(proxy=proxy)
logging.info("===== warcprox benchmark starting =====")
benchmark(proxied_client)
logging.info("===== warcprox benchmark finished =====")
warcprox_controller.stop.set()
warcprox_controller_thread.join()
asyncio.get_event_loop().stop()
logging.info("finished")


@ -1,8 +0,0 @@
#!/usr/bin/env python
# vim: set sw=4 et:
from __future__ import absolute_import
import warcprox.main
warcprox.main.main()


@ -1,44 +1,57 @@
#!/usr/bin/env python
# vim: set sw=4 et:
'''
setup.py - setuptools installation configuration for warcprox
Copyright (C) 2013-2016 Internet Archive
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
USA.
'''
from setuptools.command.test import test as TestCommand
import sys
import setuptools
VERSION_BYTES = b'1.4'
def full_version_bytes():
import subprocess, time
try:
commit_bytes = subprocess.check_output(['git', 'log', '-1', '--pretty=format:%h'])
t_bytes = subprocess.check_output(['git', 'log', '-1', '--pretty=format:%ct'])
t = int(t_bytes.strip().decode('utf-8'))
tm = time.gmtime(t)
timestamp_utc = time.strftime("%Y%m%d%H%M%S", time.gmtime(t))
return VERSION_BYTES + b'-' + timestamp_utc.encode('utf-8') + b'-' + commit_bytes.strip()
except subprocess.CalledProcessError:
return VERSION_BYTES
version_bytes = full_version_bytes()
with open('warcprox/version.txt', 'wb') as out:
out.write(version_bytes)
out.write(b'\n');
import setuptools
import setuptools.command.test
# special class needs to be added to support the pytest written dump-anydbm tests
class PyTest(TestCommand):
class PyTest(setuptools.command.test.test):
def finalize_options(self):
TestCommand.finalize_options(self)
setuptools.command.test.test.finalize_options(self)
self.test_args = []
self.test_suite = True
def run_tests(self):
#import here, cause outside the eggs aren't loaded
# import here, because outside the eggs aren't loaded
import pytest
errno = pytest.main(self.test_args)
sys.exit(errno)
setuptools.setup(name='warcprox',
version=version_bytes.decode('utf-8'),
deps = [
'certauth>=1.1.0',
'warctools',
'kafka-python>=1.0.1',
'surt>=0.3b4',
'rethinkstuff',
'PySocks',
]
try:
import concurrent.futures
except:
deps.append('futures')
setuptools.setup(
name='warcprox',
version='2.0b2.dev32',
description='WARC writing MITM HTTP/S proxy',
url='https://github.com/internetarchive/warcprox',
author='Noah Levitt',
@ -46,13 +59,18 @@ setuptools.setup(name='warcprox',
long_description=open('README.rst').read(),
license='GPL',
packages=['warcprox'],
package_data={'warcprox':['version.txt']},
install_requires=['certauth>=1.1.0', 'warctools>=4.8.3'], # gdbm not in pip :(
dependency_links=['git+https://github.com/internetarchive/warctools.git#egg=warctools-4.8.3'],
install_requires=deps,
tests_require=['requests>=2.0.1', 'pytest'], # >=2.0.1 for https://github.com/kennethreitz/requests/pull/1636
cmdclass = {'test': PyTest},
test_suite='warcprox.tests',
scripts=['bin/dump-anydbm', 'bin/warcprox'],
entry_points={
'console_scripts': [
'warcprox=warcprox.main:main',
('warcprox-ensure-rethinkdb-tables='
'warcprox.main:ensure_rethinkdb_tables'),
'dump-anydbm=warcprox.dump_anydbm:main',
],
},
zip_safe=False,
classifiers=[
'Development Status :: 5 - Production/Stable',
@ -60,6 +78,7 @@ setuptools.setup(name='warcprox',
'License :: OSI Approved :: GNU General Public License (GPL)',
'Programming Language :: Python :: 2.7',
'Programming Language :: Python :: 3.4',
'Programming Language :: Python :: 3.5',
'Topic :: Internet :: Proxy Servers',
'Topic :: Internet :: WWW/HTTP',
'Topic :: Software Development :: Libraries :: Python Modules',

49
tests/Dockerfile Normal file

@ -0,0 +1,49 @@
#
# Dockerfile for warcprox tests
#
# Copyright (C) 2015-2016 Internet Archive
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
# USA.
#
FROM phusion/baseimage
MAINTAINER Noah Levitt <nlevitt@archive.org>
# see https://github.com/stuartpb/rethinkdb-dockerfiles/blob/master/trusty/2.1.3/Dockerfile
ENV LANG=C.UTF-8
RUN apt-get update && apt-get --auto-remove -y dist-upgrade
# Add the RethinkDB repository and public key
# "RethinkDB Packaging <packaging@rethinkdb.com>" http://download.rethinkdb.com/apt/pubkey.gpg
RUN apt-key adv --keyserver pgp.mit.edu --recv-keys 1614552E5765227AEC39EFCFA7E00EF33A8F2399 \
&& echo "deb http://download.rethinkdb.com/apt trusty main" > /etc/apt/sources.list.d/rethinkdb.list \
&& apt-get update && apt-get -y install rethinkdb
RUN mkdir -vp /etc/service/rethinkdb \
&& echo "#!/bin/sh\nrethinkdb --bind 0.0.0.0 --directory /tmp/rethink-data --runuser rethinkdb --rungroup rethinkdb\n" > /etc/service/rethinkdb/run \
&& chmod a+x /etc/service/rethinkdb/run
RUN apt-get -y install python-virtualenv git
RUN apt-get -y install python-gdbm python3-gdbm libpython2.7-dev libpython3.4-dev libffi-dev libssl-dev
RUN pip install devpi-client
RUN apt-get -y install tor
RUN mkdir -vp /etc/service/tor \
&& echo "#!/bin/sh\ntor\n" > /etc/service/tor/run \
&& chmod a+x /etc/service/tor/run

39
tests/conftest.py Normal file

@ -0,0 +1,39 @@
#
# tests/conftest.py - command line options for warcprox tests
#
# Copyright (C) 2015-2016 Internet Archive
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
# USA.
#
import pytest
def pytest_addoption(parser):
parser.addoption('--rethinkdb-servers', dest='rethinkdb_servers',
help='rethink db servers for dedup, e.g. db0.foo.org,db0.foo.org:38015,db1.foo.org')
parser.addoption('--rethinkdb-big-table',
dest='rethinkdb_big_table', action='store_true', default=False,
help='use a big rethinkdb table called "captures", instead of a small table called "dedup"; table is suitable for use as index for playback (ignored unless --rethinkdb-servers is specified)')
@pytest.fixture(scope="module")
def rethinkdb_servers(request):
return request.config.getoption("--rethinkdb-servers")
@pytest.fixture(scope="module")
def rethinkdb_big_table(request):
return request.config.getoption("--rethinkdb-big-table")

48
tests/run-tests.sh Executable file

@ -0,0 +1,48 @@
#!/bin/bash
#
# tests/run-tests.sh - Runs tests in a docker container. Also runs a temporary
# instance of rethinkdb inside the container. The tests run with rethinkdb
# features enabled, against that instance of rethinkdb, and also run without
# rethinkdb features enabled. With python 2.7 and 3.4.
#
#
# Copyright (C) 2015-2016 Internet Archive
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
# USA.
#
# 😬
#
set -e
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
docker build -t internetarchive/warcprox-tests $script_dir
for python in python2.7 python3.4
do
docker run --rm --volume="$script_dir/..:/warcprox" internetarchive/warcprox-tests /sbin/my_init -- \
bash -x -c "cd /tmp && git clone /warcprox && cd /tmp/warcprox \
&& (cd /warcprox && git diff) | patch -p1 \
&& virtualenv -p $python /tmp/venv \
&& source /tmp/venv/bin/activate \
&& pip --log-file /tmp/pip.log install . pytest requests \
&& py.test -s tests \
&& py.test -s --rethinkdb-servers=localhost tests \
&& py.test -s --rethinkdb-servers=localhost --rethinkdb-big-table tests"
done

102
tests/single-threaded-proxy.py Executable file

@ -0,0 +1,102 @@
#!/usr/bin/env python
"""
tests/single-threaded-proxy.py - single-threaded MITM proxy, useful for
debugging, does not write warcs
Copyright (C) 2015-2016 Internet Archive
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
USA.
"""
from __future__ import absolute_import
import warcprox
import logging
import sys
import argparse
import certauth
import queue
import socket
import os
class FakeQueue(object):
logger = logging.getLogger("FakeQueue")
def __init__(self, maxsize=0): pass
def join(self): pass
def qsize(self): return 0
def empty(self): return True
def full(self): return False
def get(self, block=True, timeout=None): raise queue.Empty
def put_nowait(self, item): return self.put(item, block=False)
def get_nowait(self): return self.get(block=False)
def put(self, recorded_url, block=True, timeout=None):
logging.info("{} {} {} {} {} size={} {}".format(
recorded_url.client_ip, recorded_url.status, recorded_url.method,
recorded_url.url.decode("utf-8"), recorded_url.mimetype,
recorded_url.size, warcprox.digest_str(recorded_url.response_recorder.payload_digest, False).decode('utf-8')))
def parse_args():
prog = os.path.basename(sys.argv[0])
arg_parser = argparse.ArgumentParser(prog=prog,
description='%s - single threaded mitm http/s proxy, for debugging' % prog,
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
arg_parser.add_argument('-p', '--port', dest='port', default='8000',
type=int, help='port to listen on')
arg_parser.add_argument('-b', '--address', dest='address',
default='localhost', help='address to listen on')
arg_parser.add_argument('-c', '--cacert', dest='cacert',
default='./{0}-warcprox-ca.pem'.format(socket.gethostname()),
help='CA certificate file; if file does not exist, it will be created')
arg_parser.add_argument('--certs-dir', dest='certs_dir',
default='./{0}-warcprox-ca'.format(socket.gethostname()),
help='where to store and load generated certificates')
arg_parser.add_argument('--onion-tor-socks-proxy', dest='onion_tor_socks_proxy',
default=None, help='host:port of tor socks proxy, used only to connect to .onion sites')
arg_parser.add_argument('--version', action='version',
version="warcprox {}".format(warcprox.__version__))
arg_parser.add_argument('-v', '--verbose', dest='verbose', action='store_true')
arg_parser.add_argument('-q', '--quiet', dest='quiet', action='store_true')
return arg_parser.parse_args(args=sys.argv[1:])
def init_logging(verbose):
if args.verbose:
loglevel = logging.DEBUG
elif args.quiet:
loglevel = logging.WARNING
else:
loglevel = logging.INFO
logging.basicConfig(stream=sys.stdout, level=loglevel,
format='%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')
# format='%(asctime)s %(funcName) 21s() %(filename)15s:%(lineno)05d %(message)s')
def init_proxy(args):
ca_name = 'Warcprox CA on {}'.format(socket.gethostname())[:64]
ca = certauth.certauth.CertificateAuthority(args.cacert, args.certs_dir,
ca_name=ca_name)
options = warcprox.Options(**vars(args))
proxy = warcprox.warcproxy.SingleThreadedWarcProxy(ca,
recorded_url_q=FakeQueue(), options=options)
return proxy
if __name__ == "__main__":
args = parse_args()
init_logging(args.verbose)
proxy = init_proxy(args)
proxy.serve_forever()


@ -1,4 +1,24 @@
#!/usr/bin/env python
#
# tests/test_dump-anydbm.py - tests for dump-anydbm
#
# Copyright (C) 2013-2016 Internet Archive
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
# USA.
#
import pytest
import os
@ -6,6 +26,7 @@ import tempfile
import subprocess # to access the script from shell
import sys
import glob
import distutils
# will try as python 3 then default to python 2 modules
try:
@ -38,7 +59,7 @@ val1 = 'very first value'
val2 = 'second value'
py = sys.executable
dump_anydbm_loc = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "bin/dump-anydbm")
dump_anydbm_loc = distutils.spawn.find_executable("dump-anydbm")
@pytest.fixture(scope="function")
def gdbm_test_db(request):

1150
tests/test_warcprox.py Executable file

File diff suppressed because it is too large

13
tox.ini

@ -1,13 +0,0 @@
# Tox (http://tox.testrun.org/) is a tool for running tests
# in multiple virtualenvs. This configuration file will run the
# test suite on all supported python versions. To use it, "pip install tox"
# and then run "tox" from this directory.
[tox]
envlist = py27, py34
[testenv]
commands = py.test warcprox
deps =
pytest
requests


@ -1,8 +1,141 @@
def _read_version_bytes():
    import os
    version_txt = os.path.sep.join(__file__.split(os.path.sep)[:-1] + ['version.txt'])
    with open(version_txt, 'rb') as fin:
        return fin.read().strip()
version_bytes = _read_version_bytes().strip()
version_str = version_bytes.decode('utf-8')
"""
warcprox/__init__.py - warcprox package main file, contains some utility code

Copyright (C) 2013-2016 Internet Archive

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
USA.
"""
from argparse import Namespace as _Namespace
from pkg_resources import get_distribution as _get_distribution
__version__ = _get_distribution('warcprox').version
def digest_str(hash_obj, base32):
import base64
return hash_obj.name.encode('utf-8') + b':' + (
base64.b32encode(hash_obj.digest()) if base32
else hash_obj.hexdigest().encode('ascii'))
class Options(_Namespace):
def __getattr__(self, name):
try:
return super(Options, self).__getattr__(self, name)
except AttributeError:
return None
# XXX linux-specific
def gettid():
try:
import ctypes
libc = ctypes.cdll.LoadLibrary('libc.so.6')
SYS_gettid = 186
tid = libc.syscall(SYS_gettid)
return tid
except:
return "n/a"
class RequestBlockedByRule(Exception):
"""
An exception raised when a request should be blocked to respect a
Warcprox-Meta rule.
"""
def __init__(self, msg):
self.msg = msg
def __str__(self):
return "%s: %s" % (self.__class__.__name__, self.msg)
class Url:
'''
Utility class
'''
def __init__(self, url):
self.url = url
self._surt = None
self._host = None
@property
def surt(self):
if not self._surt:
import surt
hurl = surt.handyurl.parse(self.url)
surt.GoogleURLCanonicalizer.canonicalize(hurl)
hurl.query = None
hurl.hash = None
self._surt = hurl.getURLString(surt=True, trailing_comma=True)
return self._surt
@property
def host(self):
if not self._host:
import surt
self._host = surt.handyurl.parse(self.url).host
return self._host
def matches_ip_or_domain(self, ip_or_domain):
return host_matches_ip_or_domain(self.host, ip_or_domain)
def normalize_host(host):
# normalize host (punycode and lowercase)
return host.encode('idna').decode('ascii').lower()
def host_matches_ip_or_domain(host, ip_or_domain):
'''
Returns true if
- ip_or_domain is an ip address and host is the same ip address
- ip_or_domain is a domain and host is the same domain
- ip_or_domain is a domain and host is a subdomain of it
'''
_host = normalize_host(host)
_ip_or_domain = normalize_host(ip_or_domain)
if _ip_or_domain == _host:
return True
# if either _ip_or_domain or host are ip addresses, and they're not
# identical (previous check), not a match
try:
ipaddress.ip_address(_ip_or_domain)
return False
except:
pass
try:
ipaddress.ip_address(_host)
return False
except:
pass
# if we get here, we're looking at two hostnames
domain_parts = _ip_or_domain.split(".")
host_parts = _host.split(".")
result = host_parts[-len(domain_parts):] == domain_parts
return result
# logging level more fine-grained than logging.DEBUG==10
TRACE = 5
import warcprox.controller as controller
import warcprox.playback as playback
import warcprox.dedup as dedup
import warcprox.warcproxy as warcproxy
import warcprox.mitmproxy as mitmproxy
import warcprox.writer as writer
import warcprox.warc as warc
import warcprox.writerthread as writerthread
import warcprox.stats as stats
import warcprox.bigtable as bigtable
import warcprox.kafkafeed as kafkafeed
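The matching rules spelled out in host_matches_ip_or_domain's docstring can be
illustrated with a few assertions (a sketch; it assumes these helpers are
importable from the warcprox package, and the function body relies on the
ipaddress module, presumably imported in context lines this hunk doesn't show):

    import warcprox

    # same domain, case-folded and punycoded via normalize_host()
    assert warcprox.host_matches_ip_or_domain('Example.COM', 'example.com')
    # subdomain of the domain
    assert warcprox.host_matches_ip_or_domain('www.example.com', 'example.com')
    # similar-looking but not a subdomain
    assert not warcprox.host_matches_ip_or_domain('badexample.com', 'example.com')
    # ip addresses match only when identical
    assert warcprox.host_matches_ip_or_domain('192.168.1.1', '192.168.1.1')
    assert not warcprox.host_matches_ip_or_domain('192.168.1.1', '192.168.1.2')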

218
warcprox/bigtable.py Normal file

@ -0,0 +1,218 @@
"""
warcprox/bigtable.py - module for "big" RethinkDB table for deduplication;
the table is "big" in the sense that it is designed to be usable as an index
for playback software outside of warcprox, and contains information not
needed merely for deduplication
Copyright (C) 2015-2016 Internet Archive
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
USA.
"""
from __future__ import absolute_import
import logging
from hanzo import warctools
import random
import warcprox
import base64
import surt
import os
import hashlib
import threading
import datetime
import rethinkstuff
class RethinkCaptures:
"""Inserts in batches every 0.5 seconds"""
logger = logging.getLogger("warcprox.bigtable.RethinkCaptures")
def __init__(self, r, table="captures", shards=None, replicas=None, options=warcprox.Options()):
self.r = r
self.table = table
self.shards = shards or len(r.servers)
self.replicas = replicas or min(3, len(r.servers))
self.options = options
self._ensure_db_table()
self._stop = threading.Event()
self._batch_lock = threading.RLock()
with self._batch_lock:
self._batch = []
self._timer = None
def start(self):
"""Starts batch insert repeating timer"""
self._insert_batch()
def _insert_batch(self):
try:
with self._batch_lock:
if len(self._batch) > 0:
result = self.r.table(self.table).insert(self._batch).run()
if result["inserted"] != len(self._batch) or sorted(
result.values()) != [0,0,0,0,0,len(self._batch)]:
raise Exception(
"unexpected result %s saving batch of %s "
"entries", result, len(self._batch))
self.logger.debug(
"saved %s entries to big capture table db",
len(self._batch))
self._batch = []
except BaseException as e:
self.logger.error(
"caught exception trying to save %s entries, they will "
"be included in the next batch", len(self._batch),
exc_info=True)
finally:
if not self._stop.is_set():
t = threading.Timer(0.5, self._insert_batch)
t.name = "RethinkCaptures-batch-insert-timer-%s" % datetime.datetime.utcnow().isoformat()
t.start()
# ensure self._timer joinable (already started) whenever close() happens to be called
self._timer = t
else:
self.logger.info("finished")
def _ensure_db_table(self):
dbs = self.r.db_list().run()
if not self.r.dbname in dbs:
self.logger.info("creating rethinkdb database %s", repr(self.r.dbname))
self.r.db_create(self.r.dbname).run()
tables = self.r.table_list().run()
if not self.table in tables:
self.logger.info("creating rethinkdb table %s in database %s", repr(self.table), repr(self.r.dbname))
self.r.table_create(self.table, shards=self.shards, replicas=self.replicas).run()
self.r.table(self.table).index_create("abbr_canon_surt_timestamp", [self.r.row["abbr_canon_surt"], self.r.row["timestamp"]]).run()
self.r.table(self.table).index_create("sha1_warc_type", [self.r.row["sha1base32"], self.r.row["warc_type"], self.r.row["bucket"]]).run()
def find_response_by_digest(self, algo, raw_digest, bucket="__unspecified__"):
if algo != "sha1":
raise Exception("digest type is {} but big capture table is indexed by sha1".format(algo))
sha1base32 = base64.b32encode(raw_digest).decode("utf-8")
results_iter = self.r.table(self.table).get_all([sha1base32, "response", bucket], index="sha1_warc_type").run()
results = list(results_iter)
if len(results) > 0:
if len(results) > 1:
self.logger.debug("expected 0 or 1 but found %s results for sha1base32=%s bucket=%s (will use first result)", len(results), sha1base32, bucket)
result = results[0]
else:
result = None
self.logger.debug("returning %s for sha1base32=%s bucket=%s",
result, sha1base32, bucket)
return result
def _assemble_entry(self, recorded_url, records):
if recorded_url.response_recorder:
if recorded_url.response_recorder.payload_digest.name == "sha1":
sha1base32 = base64.b32encode(
recorded_url.response_recorder.payload_digest.digest()
).decode("utf-8")
else:
self.logger.warn(
"digest type is %s but big capture table is indexed "
"by sha1",
recorded_url.response_recorder.payload_digest.name)
else:
digest = hashlib.new("sha1", records[0].content[1])
sha1base32 = base64.b32encode(digest.digest()).decode("utf-8")
if (recorded_url.warcprox_meta
and "captures-bucket" in recorded_url.warcprox_meta):
bucket = recorded_url.warcprox_meta["captures-bucket"]
else:
bucket = "__unspecified__"
canon_surt = surt.surt(recorded_url.url.decode("utf-8"),
trailing_comma=True, host_massage=False, with_scheme=True)
entry = {
# id only specified for rethinkdb partitioning
"id": "{} {}".format(
canon_surt[:20], records[0].id.decode("utf-8")[10:-1]),
"abbr_canon_surt": canon_surt[:150],
"canon_surt": canon_surt,
"timestamp": recorded_url.timestamp.replace(
tzinfo=rethinkstuff.UTC),
"url": recorded_url.url.decode("utf-8"),
"offset": records[0].offset,
"filename": os.path.basename(records[0].warc_filename),
"warc_type": records[0].type.decode("utf-8"),
"warc_id": records[0].id.decode("utf-8"),
"sha1base32": sha1base32,
"content_type": recorded_url.mimetype,
"response_code": recorded_url.status,
"http_method": recorded_url.method,
"bucket": bucket,
"length": records[0].length,
}
if (recorded_url.warcprox_meta and
"captures-table-extra-fields" in recorded_url.warcprox_meta):
extras = recorded_url.warcprox_meta["captures-table-extra-fields"]
for extra_field in extras:
entry[extra_field] = extras[extra_field]
return entry
def notify(self, recorded_url, records):
entry = self._assemble_entry(recorded_url, records)
with self._batch_lock:
self._batch.append(entry)
def close(self):
self.stop()
def stop(self):
self.logger.info("closing rethinkdb captures table")
self._stop.set()
if self._timer:
self._timer.join()
class RethinkCapturesDedup:
logger = logging.getLogger("warcprox.dedup.RethinkCapturesDedup")
def __init__(self, captures_db, options=warcprox.Options()):
self.captures_db = captures_db
self.options = options
def lookup(self, digest_key, bucket="__unspecified__"):
k = digest_key.decode("utf-8") if isinstance(digest_key, bytes) else digest_key
algo, value_str = k.split(":")
if self.options.base32:
raw_digest = base64.b32decode(value_str, casefold=True)
else:
raw_digest = base64.b16decode(value_str, casefold=True)
entry = self.captures_db.find_response_by_digest(algo, raw_digest, bucket)
if entry:
dedup_info = {
"url": entry["url"].encode("utf-8"),
"date": entry["timestamp"].strftime("%Y-%m-%dT%H:%M:%SZ").encode("utf-8"),
}
if "warc_id" in entry:
dedup_info["id"] = entry["warc_id"].encode("utf-8")
return dedup_info
else:
return None
def start(self):
self.captures_db.start()
def stop(self):
self.captures_db.stop()
def close(self):
self.captures_db.close()
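RethinkCaptures' batching approach (accumulate entries under a lock, flush on
a repeating half-second threading.Timer, and re-arm the timer in a finally
block so a failed flush is retried with the next batch) can be reduced to a
minimal generic sketch, with print() standing in for the rethinkdb insert:

    import threading

    class BatchFlusher(object):
        def __init__(self, interval=0.5):
            self.interval = interval
            self._batch = []
            self._batch_lock = threading.RLock()
            self._stop = threading.Event()
            self._timer = None

        def start(self):
            self._flush()  # arms the repeating timer

        def _flush(self):
            try:
                with self._batch_lock:
                    if self._batch:
                        # stand-in for table.insert(batch).run()
                        print('flushing %s entries' % len(self._batch))
                        self._batch = []
            finally:
                # re-arm even if the flush failed, so pending entries ride
                # along with the next batch
                if not self._stop.is_set():
                    self._timer = threading.Timer(self.interval, self._flush)
                    self._timer.start()

        def notify(self, entry):
            with self._batch_lock:
                self._batch.append(entry)

        def stop(self):
            self._stop.set()
            if self._timer:
                self._timer.join()  # wait for the last armed timer to fire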


@ -1,19 +1,45 @@
# vim: set sw=4 et:
'''
warcprox/controller.py - contains WarcproxController class, responsible for
starting up and shutting down the various components of warcprox, and for
sending heartbeats to the service registry if configured to do so; also has
some memory profiling capabilities
Copyright (C) 2013-2016 Internet Archive
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
USA.
'''
from __future__ import absolute_import
import logging
import threading
import signal
import time
import warcprox.warcprox
import warcprox.warcwriter
import warcprox
import sys
import gc
import datetime
class WarcproxController(object):
logger = logging.getLogger("warcprox.controller.WarcproxController")
def __init__(self, proxy=None, warc_writer_thread=None, playback_proxy=None):
HEARTBEAT_INTERVAL = 20.0
def __init__(self, proxy=None, warc_writer_thread=None,
playback_proxy=None, service_registry=None,
options=warcprox.Options()):
"""
Create warcprox controller.
@ -34,44 +60,129 @@ class WarcproxController(object):
else:
self.warc_writer_thread = warcprox.warcwriter.WarcWriterThread(recorded_url_q=self.proxy.recorded_url_q)
self.proxy_thread = None
self.playback_proxy_thread = None
self.playback_proxy = playback_proxy
self.service_registry = service_registry
self.options = options
def run_until_shutdown(self):
"""Start warcprox and run until shut down.
If running in the main thread, SIGTERM initiates a graceful shutdown.
Otherwise, call warcprox_controller.stop.set().
"""
proxy_thread = threading.Thread(target=self.proxy.serve_forever, name='ProxyThread')
proxy_thread.start()
self.warc_writer_thread.start()
if self.playback_proxy is not None:
playback_proxy_thread = threading.Thread(target=self.playback_proxy.serve_forever, name='PlaybackProxyThread')
playback_proxy_thread.start()
self._last_rss = None
self.stop = threading.Event()
self._start_stop_lock = threading.Lock()
try:
signal.signal(signal.SIGTERM, self.stop.set)
self.logger.info('SIGTERM will initiate graceful shutdown')
except ValueError:
pass
def debug_mem(self):
self.logger.info("self.proxy.recorded_url_q.qsize()=%s", self.proxy.recorded_url_q.qsize())
with open("/proc/self/status") as f:
for line in f:
fields = line.split()
if len(fields) >= 2:
k, v = fields[0:2]
if k == "VmHWM:":
hwm = int(v)
elif k == "VmRSS:":
rss = int(v)
elif k == "VmData:":
data = int(v)
elif k == "VmStk:":
stk = int(v)
self.logger.info("rss=%s data=%s stack=%s hwm=%s", rss, data, stk, hwm)
self._last_rss = self._last_rss or rss # to set initial value
if rss - self._last_rss > 1024:
num_unreachable = gc.collect()
all_objects = gc.get_objects()
total_size = 0
summary = {}
biggest_objects = [None] * 10
for obj in all_objects:
size = sys.getsizeof(obj)
total_size += size
if not type(obj) in summary:
summary[type(obj)] = {"count":0,"size":0}
summary[type(obj)]["count"] += 1
summary[type(obj)]["size"] += size
if size > sys.getsizeof(biggest_objects[-1]):
for i in range(len(biggest_objects)):
if size > sys.getsizeof(biggest_objects[i]):
index = i
break
biggest_objects[index+1:] = biggest_objects[index:-1]
biggest_objects[index] = obj
self.logger.info("%s objects totaling %s bytes", len(all_objects), total_size)
self.logger.info("=== biggest types ===")
for item in sorted(summary.items(), key=lambda item: item[1]["size"], reverse=True)[:10]:
self.logger.info("%s bytes in %s instances of %s", item[1]["size"], item[1]["count"], item[0])
self.logger.info("=== warcprox types ===")
for t in (t for t in summary if str(t).find("warcprox") >= 0):
self.logger.info("%s bytes in %s instances of %s", summary[t]["size"], summary[t]["count"], t)
for i in range(len(biggest_objects)):
obj = biggest_objects[i]
try:
value = repr(bytes(obj.getbuffer()[:100]))
except:
try:
value = repr(obj)[:100]
except BaseException as e:
value = "<{} getting value>".format(e)
self.logger.info("#%s (%s) (%s bytes) (%s refs) (id=%s): %s", i+1, type(obj), sys.getsizeof(obj), sys.getrefcount(obj), id(obj), value)
self.logger.info("%s unreachable objects totaling %s bytes", len(gc.garbage), sum(sys.getsizeof(x) for x in gc.garbage))
self._last_rss = rss
def _service_heartbeat(self):
if hasattr(self, 'status_info'):
status_info = self.status_info
else:
status_info = {
'role': 'warcprox',
'heartbeat_interval': self.HEARTBEAT_INTERVAL,
'port': self.options.port,
}
status_info['load'] = 1.0 * self.proxy.recorded_url_q.qsize() / (self.proxy.recorded_url_q.maxsize or 100)
status_info['queue_size'] = self.proxy.recorded_url_q.qsize()
self.status_info = self.service_registry.heartbeat(status_info)
self.logger.log(
warcprox.TRACE, "status in service registry: %s",
self.status_info)
def start(self):
with self._start_stop_lock:
if self.proxy_thread and self.proxy_thread.is_alive():
self.logger.info('warcprox is already running')
return
if self.proxy.stats_db:
self.proxy.stats_db.start()
self.proxy_thread = threading.Thread(
target=self.proxy.serve_forever, name='ProxyThread')
self.proxy_thread.start()
if self.warc_writer_thread.dedup_db:
self.warc_writer_thread.dedup_db.start()
self.warc_writer_thread.start()
if self.playback_proxy is not None:
self.playback_proxy_thread = threading.Thread(
target=self.playback_proxy.serve_forever,
name='PlaybackProxyThread')
self.playback_proxy_thread.start()
def shutdown(self):
with self._start_stop_lock:
if not self.proxy_thread or not self.proxy_thread.is_alive():
self.logger.info('warcprox is not running')
return
try:
while not self.stop.is_set():
time.sleep(0.5)
except:
pass
finally:
self.warc_writer_thread.stop.set()
self.proxy.shutdown()
self.proxy.server_close()
if self.warc_writer_thread.warc_writer.dedup_db is not None:
self.warc_writer_thread.warc_writer.dedup_db.close()
if self.playback_proxy is not None:
self.playback_proxy.shutdown()
self.playback_proxy.server_close()
@ -80,7 +191,59 @@ class WarcproxController(object):
# wait for threads to finish
self.warc_writer_thread.join()
proxy_thread.join()
if self.playback_proxy is not None:
playback_proxy_thread.join()
if self.proxy.stats_db:
self.proxy.stats_db.stop()
if self.warc_writer_thread.dedup_db:
self.warc_writer_thread.dedup_db.close()
self.proxy_thread.join()
if self.playback_proxy is not None:
self.playback_proxy_thread.join()
if self.service_registry and hasattr(self, "status_info"):
self.service_registry.unregister(self.status_info["id"])
def run_until_shutdown(self):
"""
Start warcprox and run until shut down. Call
warcprox_controller.stop.set() to initiate graceful shutdown.
"""
self.start()
last_mem_dbg = datetime.datetime.utcfromtimestamp(0)
try:
utc = datetime.timezone.utc
except AttributeError:
# python2 :-\
class UTC(datetime.tzinfo):
def tzname(self, dt): return "UTC+00:00"
def dst(self, dt): return datetime.timedelta(0)
def utcoffset(self, dt): return datetime.timedelta(0)
utc = UTC()
try:
while not self.stop.is_set():
if self.service_registry and (
not hasattr(self, "status_info") or (
datetime.datetime.now(utc)
- self.status_info["last_heartbeat"]
).total_seconds() > self.HEARTBEAT_INTERVAL):
self._service_heartbeat()
if self.options.profile and (
datetime.datetime.utcnow() - last_mem_dbg
).total_seconds() > 60:
self.debug_mem()
last_mem_dbg = datetime.datetime.utcnow()
time.sleep(0.5)
except:
self.logger.critical(
"shutting down in response to fatal exception",
exc_info=True)
pass
finally:
self.shutdown()
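The load figure reported in each heartbeat is just queue fullness, with a
fallback divisor for unbounded queues; worked with illustrative numbers:

    # illustrative values only
    qsize, maxsize = 250, 500
    load = 1.0 * qsize / (maxsize or 100)  # 0.5; `or 100` guards against maxsize == 0 (unbounded queue)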


@ -1,30 +1,58 @@
# vim:set sw=4 et:
#
# warcprox/dedup.py - identical payload digest deduplication
#
# Copyright (C) 2013-2016 Internet Archive
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
# USA.
#
from __future__ import absolute_import
try:
import dbm.gnu as dbm_gnu
except ImportError:
try:
import gdbm as dbm_gnu
except ImportError:
import anydbm as dbm_gnu
import logging
import os
import json
from hanzo import warctools
import warcprox
import random
class DedupDb(object):
logger = logging.getLogger("warcprox.dedup.DedupDb")
def __init__(self, dbm_file='./warcprox-dedup.db'):
def __init__(self, dbm_file='./warcprox-dedup.db', options=warcprox.Options()):
try:
import dbm.gnu as dbm_gnu
except ImportError:
try:
import gdbm as dbm_gnu
except ImportError:
import anydbm as dbm_gnu
if os.path.exists(dbm_file):
self.logger.info('opening existing deduplication database {}'.format(dbm_file))
else:
self.logger.info('creating new deduplication database {}'.format(dbm_file))
self.db = dbm_gnu.open(dbm_file, 'c')
self.options = options
def start(self):
pass
def stop(self):
self.close()
def close(self):
self.db.close()
@ -35,26 +63,115 @@ class DedupDb(object):
except:
pass
def save(self, key, response_record, offset):
def save(self, digest_key, response_record, bucket=""):
record_id = response_record.get_header(warctools.WarcRecord.ID).decode('latin1')
url = response_record.get_header(warctools.WarcRecord.URL).decode('latin1')
date = response_record.get_header(warctools.WarcRecord.DATE).decode('latin1')
py_value = {'i':record_id, 'u':url, 'd':date}
key = digest_key + b"|" + bucket.encode("utf-8")
py_value = {'id':record_id, 'url':url, 'date':date}
json_value = json.dumps(py_value, separators=(',',':'))
self.db[key] = json_value.encode('utf-8')
self.logger.debug('dedup db saved {}:{}'.format(key, json_value))
self.logger.debug('dedup db saved %s:%s', key, json_value)
def lookup(self, key):
def lookup(self, digest_key, bucket=""):
result = None
key = digest_key + b"|" + bucket.encode("utf-8")
if key in self.db:
json_result = self.db[key]
result = json.loads(json_result.decode('utf-8'))
result['i'] = result['i'].encode('latin1')
result['u'] = result['u'].encode('latin1')
result['d'] = result['d'].encode('latin1')
return result
result['id'] = result['id'].encode('latin1')
result['url'] = result['url'].encode('latin1')
result['date'] = result['date'].encode('latin1')
self.logger.debug('dedup db lookup of key=%s returning %s', key, result)
return result
def notify(self, recorded_url, records):
if (records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.RESPONSE
and recorded_url.response_recorder.payload_size() > 0):
digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest,
self.options.base32)
if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta:
self.save(digest_key, records[0], bucket=recorded_url.warcprox_meta["captures-bucket"])
else:
self.save(digest_key, records[0])
def decorate_with_dedup_info(dedup_db, recorded_url, base32=False):
if (recorded_url.response_recorder
and recorded_url.response_recorder.payload_digest
and recorded_url.response_recorder.payload_size() > 0):
digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest, base32)
if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta:
recorded_url.dedup_info = dedup_db.lookup(digest_key, recorded_url.warcprox_meta["captures-bucket"])
else:
return None
recorded_url.dedup_info = dedup_db.lookup(digest_key)
class RethinkDedupDb:
logger = logging.getLogger("warcprox.dedup.RethinkDedupDb")
def __init__(self, r, table="dedup", shards=None, replicas=None, options=warcprox.Options()):
self.r = r
self.table = table
self.shards = shards or len(r.servers)
self.replicas = replicas or min(3, len(r.servers))
self._ensure_db_table()
self.options = options
def _ensure_db_table(self):
dbs = self.r.db_list().run()
if not self.r.dbname in dbs:
self.logger.info("creating rethinkdb database %s", repr(self.r.dbname))
self.r.db_create(self.r.dbname).run()
tables = self.r.table_list().run()
if not self.table in tables:
self.logger.info("creating rethinkdb table %s in database %s shards=%s replicas=%s",
repr(self.table), repr(self.r.dbname), self.shards, self.replicas)
self.r.table_create(self.table, primary_key="key", shards=self.shards, replicas=self.replicas).run()
def start(self):
pass
def stop(self):
pass
def close(self):
pass
def sync(self):
pass
def save(self, digest_key, response_record, bucket=""):
k = digest_key.decode("utf-8") if isinstance(digest_key, bytes) else digest_key
k = "{}|{}".format(k, bucket)
record_id = response_record.get_header(warctools.WarcRecord.ID).decode('latin1')
url = response_record.get_header(warctools.WarcRecord.URL).decode('latin1')
date = response_record.get_header(warctools.WarcRecord.DATE).decode('latin1')
record = {'key':k,'url':url,'date':date,'id':record_id}
result = self.r.table(self.table).insert(record,conflict="replace").run()
if sorted(result.values()) != [0,0,0,0,0,1] and [result["deleted"],result["skipped"],result["errors"]] != [0,0,0]:
raise Exception("unexpected result %s saving %s", result, record)
self.logger.debug('dedup db saved %s:%s', k, record)
def lookup(self, digest_key, bucket=""):
k = digest_key.decode("utf-8") if isinstance(digest_key, bytes) else digest_key
k = "{}|{}".format(k, bucket)
result = self.r.table(self.table).get(k).run()
if result:
for x in result:
result[x] = result[x].encode("utf-8")
self.logger.debug('dedup db lookup of key=%s returning %s', k, result)
return result
def notify(self, recorded_url, records):
if (records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.RESPONSE
and recorded_url.response_recorder.payload_size() > 0):
digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest,
self.options.base32)
if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta:
self.save(digest_key, records[0], bucket=recorded_url.warcprox_meta["captures-bucket"])
else:
self.save(digest_key, records[0])
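Putting warcprox.digest_str together with the bucketed key scheme used by
DedupDb.save() and lookup() above, a key is derived like this (a sketch; the
payload and bucket name are made up):

    import hashlib

    payload_digest = hashlib.sha1(b'hello')
    # hex flavor of warcprox.digest_str(payload_digest, base32=False)
    digest_key = payload_digest.name.encode('utf-8') + b':' + payload_digest.hexdigest().encode('ascii')
    bucket = 'example-bucket'
    key = digest_key + b'|' + bucket.encode('utf-8')
    print(key)  # b'sha1:aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d|example-bucket'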


@ -1,12 +1,28 @@
#!/usr/bin/env python
# vim:set sw=4 et:
#
'''
dump-anydbm - dumps contents of dbm file to stdout

Dump contents of database to stdout. Database can be any file that the anydbm
module can read. Included with warcprox because it's useful for inspecting a
deduplication database or a playback index database, but it is a generic tool.

Copyright (C) 2013-2016 Internet Archive

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
USA.
'''
try:
import dbm
@ -14,7 +30,7 @@ try:
whichdb = dbm.whichdb
except:
import anydbm
dbm = anydbm
from whichdb import whichdb
@ -22,6 +38,9 @@ import sys
import os.path
if __name__ == "__main__":
main()
def main():
if len(sys.argv) != 2:
sys.stderr.write("usage: {} DBM_FILE\n".format(sys.argv[0]))
exit(1)
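# Example (path hypothetical): `dump-anydbm ./warcprox-dedup.db` writes each
# key and value in the dbm file to stdout.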

101
warcprox/kafkafeed.py Normal file
View File

@ -0,0 +1,101 @@
'''
warcprox/kafkafeed.py - support for publishing information about archived
urls to apache kafka
Copyright (C) 2015-2016 Internet Archive
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
USA.
'''
import kafka
import datetime
import json
import logging
from hanzo import warctools
class CaptureFeed:
logger = logging.getLogger('warcprox.kafkafeed.CaptureFeed')
def __init__(self, broker_list, topic=None):
self.broker_list = broker_list
self.topic = topic
self.__producer = None
self._connection_exception = None
def _producer(self):
if not self.__producer:
try:
# acks=0 to avoid ever blocking
self.__producer = kafka.KafkaProducer(
bootstrap_servers=self.broker_list, acks=0)
if self._connection_exception:
logging.info('connected to kafka successfully!')
self._connection_exception = None
except Exception as e:
if not self._connection_exception:
self._connection_exception = e
logging.error('problem connecting to kafka', exc_info=True)
return self.__producer
def notify(self, recorded_url, records):
if records[0].type not in (b'revisit', b'response'):
return
topic = recorded_url.warcprox_meta.get('capture-feed-topic', self.topic)
if not topic:
return
try:
payload_digest = records[0].get_header(warctools.WarcRecord.PAYLOAD_DIGEST).decode('utf-8')
except:
payload_digest = '-'
# {"status_code":200,"content_digest":"sha1:3VU56HI3BTMDZBL2TP7SQYXITT7VEAJQ","host":"www.kaosgl.com","via":"http://www.kaosgl.com/sayfa.php?id=4427","account_id":"877","seed":"http://www.kaosgl.com/","warc_filename":"ARCHIVEIT-6003-WEEKLY-JOB171310-20150903100014694-00002.warc.gz","url":"http://www.kaosgl.com/resim/HomofobiKarsitiBulusma/trabzon05.jpg","size":29700,"start_time_plus_duration":"20150903175709637+1049","timestamp":"2015-09-03T17:57:10.707Z","mimetype":"image/jpeg","collection_id":"6003","is_test_crawl":"false","job_name":"6003-20150902172136074","warc_offset":856320200,"thread":6,"hop_path":"RLLLLLE","extra_info":{},"annotations":"duplicate:digest","content_length":29432}
now = datetime.datetime.utcnow()
d = {
'timestamp': '{:%Y-%m-%dT%H:%M:%S}.{:03d}Z'.format(now, now.microsecond//1000),
'size': recorded_url.size,
'status_code': recorded_url.status,
'url': recorded_url.url.decode('utf-8'),
'mimetype': recorded_url.mimetype,
'content_digest': payload_digest,
'warc_filename': records[0].warc_filename,
'warc_offset': records[0].offset,
'host': recorded_url.host,
'annotations': 'duplicate:digest' if records[0].type == b'revisit' else '',
'content_length': recorded_url.response_recorder.len - recorded_url.response_recorder.payload_offset,
'start_time_plus_duration': '{:%Y%m%d%H%M%S}{:03d}+{}'.format(
recorded_url.timestamp, recorded_url.timestamp.microsecond//1000,
int(recorded_url.duration.total_seconds() * 1000)),
# 'hop_path': ? # only used for seed redirects, which are n/a to brozzler (?)
# 'via': ?
# 'thread': ? # not needed
}
# fields expected to be populated here are (for archive-it):
# account_id, collection_id, is_test_crawl, seed, job_name
if recorded_url.warcprox_meta and 'capture-feed-extra-fields' in recorded_url.warcprox_meta:
for (k,v) in recorded_url.warcprox_meta['capture-feed-extra-fields'].items():
d[k] = v
msg = json.dumps(d, separators=(',', ':')).encode('utf-8')
self.logger.debug('feeding kafka topic=%s msg=%s', repr(topic), msg)
p = self._producer()
if p:
p.send(topic, msg)
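# A minimal usage sketch (broker/topic names hypothetical):
#
#   feed = CaptureFeed(['kafka0.foo.org:9092'], topic='captures')
#   # register `feed` as a listener so notify() runs for each archived url;
#   # a Warcprox-Meta 'capture-feed-topic' value overrides the default topic
#   # per request.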

View File

@ -1,5 +1,25 @@
#!/usr/bin/env python
# vim:set sw=4 et:
'''
warcprox/main.py - entrypoint for warcprox executable, parses command line
arguments, initializes components, starts controller, handles signals
Copyright (C) 2013-2016 Internet Archive
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
USA.
'''
from __future__ import absolute_import
@ -14,21 +34,21 @@ import hashlib
import argparse
import os
import socket
import traceback
import signal
import threading
import certauth.certauth
import warcprox
import re
import rethinkstuff
import cryptography.hazmat.backends.openssl
def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
arg_parser = argparse.ArgumentParser(prog=prog,
description='warcprox - WARC writing MITM HTTP/S proxy',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
arg_parser.add_argument('-p', '--port', dest='port', default='8000',
type=int, help='port to listen on')
arg_parser.add_argument('-b', '--address', dest='address',
default='localhost', help='address to listen on')
arg_parser.add_argument('-c', '--cacert', dest='cacert',
@ -44,10 +64,10 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
arg_parser.add_argument('-n', '--prefix', dest='prefix',
default='WARCPROX', help='WARC filename prefix')
arg_parser.add_argument('-s', '--size', dest='size',
default=1000*1000*1000, type=int,
help='WARC file rollover size threshold in bytes')
arg_parser.add_argument('--rollover-idle-time',
dest='rollover_idle_time', default=None, type=int,
help="WARC file rollover idle time threshold in seconds (so that Friday's last open WARC doesn't sit there all weekend waiting for more data)")
try:
hash_algos = hashlib.algorithms_guaranteed
@ -57,30 +77,171 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
default='sha1', help='digest algorithm, one of {}'.format(', '.join(hash_algos)))
arg_parser.add_argument('--base32', dest='base32', action='store_true',
default=False, help='write digests in Base32 instead of hex')
arg_parser.add_argument('--stats-db-file', dest='stats_db_file',
default='./warcprox-stats.db', help='persistent statistics database file; empty string or /dev/null disables statistics tracking')
arg_parser.add_argument('-P', '--playback-port', dest='playback_port',
type=int, default=None, help='port to listen on for instant playback')
arg_parser.add_argument('--playback-index-db-file', dest='playback_index_db_file',
default='./warcprox-playback-index.db',
help='playback index database file (only used if --playback-port is specified)')
group = arg_parser.add_mutually_exclusive_group()
group.add_argument('-j', '--dedup-db-file', dest='dedup_db_file',
default='./warcprox-dedup.db', help='persistent deduplication database file; empty string or /dev/null disables deduplication')
group.add_argument('--rethinkdb-servers', dest='rethinkdb_servers',
help='rethinkdb servers, used for dedup and stats if specified; e.g. db0.foo.org,db0.foo.org:38015,db1.foo.org')
arg_parser.add_argument('--rethinkdb-db', dest='rethinkdb_db', default='warcprox',
help='rethinkdb database name (ignored unless --rethinkdb-servers is specified)')
arg_parser.add_argument('--rethinkdb-big-table',
dest='rethinkdb_big_table', action='store_true', default=False,
help='use a big rethinkdb table called "captures", instead of a small table called "dedup"; table is suitable for use as index for playback (ignored unless --rethinkdb-servers is specified)')
arg_parser.add_argument('--kafka-broker-list', dest='kafka_broker_list',
default=None, help='kafka broker list for capture feed')
arg_parser.add_argument('--kafka-capture-feed-topic', dest='kafka_capture_feed_topic',
default=None, help='kafka capture feed topic')
arg_parser.add_argument('--queue-size', dest='queue_size', default=500,
help=argparse.SUPPRESS)
arg_parser.add_argument('--max-threads', dest='max_threads',
help=argparse.SUPPRESS)
arg_parser.add_argument('--profile', action='store_true', default=False,
help=argparse.SUPPRESS)
arg_parser.add_argument('--onion-tor-socks-proxy', dest='onion_tor_socks_proxy',
default=None, help='host:port of tor socks proxy, used only to connect to .onion sites')
arg_parser.add_argument('--version', action='version',
version="warcprox {}".format(warcprox.version_str))
version="warcprox {}".format(warcprox.__version__))
arg_parser.add_argument('-v', '--verbose', dest='verbose', action='store_true')
arg_parser.add_argument('--trace', dest='trace', action='store_true')
arg_parser.add_argument('-q', '--quiet', dest='quiet', action='store_true')
# [--ispartof=warcinfo ispartof]
# [--description=warcinfo description]
# [--operator=warcinfo operator]
# [--httpheader=warcinfo httpheader]
return arg_parser
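# Example invocation exercising the new options (hostnames hypothetical):
#
#   warcprox --rethinkdb-servers db0.foo.org,db1.foo.org --rethinkdb-big-table \
#       --kafka-broker-list kafka0.foo.org:9092 \
#       --kafka-capture-feed-topic captures \
#       --onion-tor-socks-proxy localhost:9050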
def dump_state(signum=None, frame=None):
'''
Signal handler, logs stack traces of active threads.
'''
state_strs = []
for th in threading.enumerate():
try:
state_strs.append(str(th))
except AssertionError:
state_strs.append('<n/a:AssertionError>')
stack = traceback.format_stack(sys._current_frames()[th.ident])
state_strs.append(''.join(stack))
logging.warn(
'dumping state (caught signal %s)\n%s',
signum, '\n'.join(state_strs))
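# dump_state is registered as the SIGQUIT handler in real_main() below, so
# e.g. `kill -QUIT <warcprox pid>` logs every thread's stack without stopping
# the process.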
def init_controller(args):
'''
Creates a warcprox.controller.WarcproxController configured according to
the supplied arguments (normally the result of parse_args(sys.argv)).
'''
options = warcprox.Options(**vars(args))
try:
hashlib.new(args.digest_algorithm)
except Exception as e:
logging.fatal(e)
exit(1)
listeners = []
if args.rethinkdb_servers:
r = rethinkstuff.Rethinker(args.rethinkdb_servers.split(","), args.rethinkdb_db)
if args.rethinkdb_big_table:
captures_db = warcprox.bigtable.RethinkCaptures(r, options=options)
dedup_db = warcprox.bigtable.RethinkCapturesDedup(captures_db, options=options)
listeners.append(captures_db)
else:
dedup_db = warcprox.dedup.RethinkDedupDb(r, options=options)
listeners.append(dedup_db)
elif args.dedup_db_file in (None, '', '/dev/null'):
logging.info('deduplication disabled')
dedup_db = None
else:
dedup_db = warcprox.dedup.DedupDb(args.dedup_db_file, options=options)
listeners.append(dedup_db)
if args.rethinkdb_servers:
stats_db = warcprox.stats.RethinkStatsDb(r, options=options)
listeners.append(stats_db)
elif args.stats_db_file in (None, '', '/dev/null'):
logging.info('statistics tracking disabled')
stats_db = None
else:
stats_db = warcprox.stats.StatsDb(args.stats_db_file, options=options)
listeners.append(stats_db)
if args.kafka_broker_list:
kafka_capture_feed = warcprox.kafkafeed.CaptureFeed(
args.kafka_broker_list, args.kafka_capture_feed_topic)
listeners.append(kafka_capture_feed)
recorded_url_q = queue.Queue(maxsize=args.queue_size)
ca_name = 'Warcprox CA on {}'.format(socket.gethostname())[:64]
ca = certauth.certauth.CertificateAuthority(args.cacert, args.certs_dir,
ca_name=ca_name)
proxy = warcprox.warcproxy.WarcProxy(ca=ca, recorded_url_q=recorded_url_q,
stats_db=stats_db, options=options)
if args.playback_port is not None:
playback_index_db = warcprox.playback.PlaybackIndexDb(args.playback_index_db_file, options=options)
playback_proxy = warcprox.playback.PlaybackProxy(
server_address=(args.address, args.playback_port), ca=ca,
playback_index_db=playback_index_db, warcs_dir=args.directory,
options=options)
listeners.append(playback_index_db)
else:
playback_index_db = None
playback_proxy = None
writer_pool = warcprox.writer.WarcWriterPool(options=options)
warc_writer_thread = warcprox.writerthread.WarcWriterThread(
recorded_url_q=recorded_url_q, writer_pool=writer_pool,
dedup_db=dedup_db, listeners=listeners, options=options)
if args.rethinkdb_servers:
svcreg = rethinkstuff.ServiceRegistry(r)
else:
svcreg = None
controller = warcprox.controller.WarcproxController(proxy,
warc_writer_thread, playback_proxy, service_registry=svcreg,
options=options)
return controller
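# A sketch of driving warcprox programmatically (argv values hypothetical):
#
#   args = parse_args(['warcprox', '-p', '8000', '-d', './warcs'])
#   controller = init_controller(args)
#   controller.run_until_shutdown()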
def real_main(args):
# see https://github.com/pyca/cryptography/issues/2911
cryptography.hazmat.backends.openssl.backend.activate_builtin_random()
controller = init_controller(args)
signal.signal(signal.SIGTERM, lambda a,b: controller.stop.set())
signal.signal(signal.SIGINT, lambda a,b: controller.stop.set())
signal.signal(signal.SIGQUIT, dump_state)
controller.run_until_shutdown()
def parse_args(argv=sys.argv):
'''
Parses command line arguments with argparse.
'''
arg_parser = _build_arg_parser(prog=os.path.basename(argv[0]))
args = arg_parser.parse_args(args=argv[1:])
return args
def main(argv=sys.argv):
'''
Main method, entry point of warcprox command.
'''
args = parse_args(argv)
if args.trace:
loglevel = warcprox.TRACE
elif args.verbose:
loglevel = logging.DEBUG
elif args.quiet:
loglevel = logging.WARNING
@ -90,51 +251,50 @@ def main(argv=sys.argv):
logging.basicConfig(stream=sys.stdout, level=loglevel,
format='%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')
real_main(args)
def ensure_rethinkdb_tables():
'''
Creates rethinkdb tables if they don't already exist. Warcprox normally
creates the tables it needs on demand at startup, but if multiple instances
are starting up at the same time, you can end up with duplicate broken
tables. So it's a good idea to use this utility at an early step when
spinning up a cluster.
'''
arg_parser = argparse.ArgumentParser(
prog=os.path.basename(sys.argv[0]),
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
arg_parser.add_argument(
'--rethinkdb-servers', dest='rethinkdb_servers', default='localhost',
help='rethinkdb servers e.g. db0.foo.org,db0.foo.org:38015,db1.foo.org')
arg_parser.add_argument(
'--rethinkdb-db', dest='rethinkdb_db', default='warcprox',
help='rethinkdb database name')
arg_parser.add_argument(
'-q', '--quiet', dest='log_level',
action='store_const', default=logging.INFO, const=logging.WARN)
arg_parser.add_argument(
'-v', '--verbose', dest='log_level',
action='store_const', default=logging.INFO, const=logging.DEBUG)
args = arg_parser.parse_args(args=sys.argv[1:])
logging.basicConfig(
stream=sys.stdout, level=args.log_level,
format=(
'%(asctime)s %(levelname)s %(name)s.%(funcName)s'
'(%(filename)s:%(lineno)d) %(message)s'))
r = rethinkstuff.Rethinker(
args.rethinkdb_servers.split(','), args.rethinkdb_db)
# services table
rethinkstuff.ServiceRegistry(r)
# stats table
warcprox.stats.RethinkStatsDb(r)
# captures table
warcprox.bigtable.RethinkCaptures(r)
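# Typical early deploy step when spinning up a cluster (hostnames
# hypothetical; assumes this function is wired up as a console script):
#
#   ensure-rethinkdb-tables --rethinkdb-servers db0.foo.org,db1.foo.org \
#       --rethinkdb-db warcprox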
if __name__ == '__main__':
main()

View File

@ -1,4 +1,28 @@
# vim:set sw=4 et:
'''
warcprox/mitmproxy.py - man-in-the-middle http/s proxy code, handles http
CONNECT method by creating a snakeoil certificate for the requested site,
calling ssl.wrap_socket() on the client connection; connects to remote
(proxied) host, possibly using tor if host tld is .onion and tor proxy is
configured
Copyright (C) 2012 Cygnos Corporation
Copyright (C) 2013-2016 Internet Archive
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
USA.
'''
from __future__ import absolute_import
@ -11,46 +35,194 @@ try:
import urllib.parse as urllib_parse
except ImportError:
import urlparse as urllib_parse
try:
import http.client as http_client
except ImportError:
import httplib as http_client
import socket
import logging
import ssl
import warcprox
import threading
import datetime
import socks
import tempfile
import hashlib
try:
import socketserver
except ImportError:
import SocketServer as socketserver
import resource
import concurrent.futures
class ProxyingRecorder(object):
"""
Wraps a socket._fileobject, recording the bytes as they are read,
calculating digests, and sending them on to the proxy client.
"""
logger = logging.getLogger("warcprox.mitmproxy.ProxyingRecorder")
def __init__(self, fp, proxy_client, digest_algorithm='sha1', url=None):
self.fp = fp
# "The file has no name, and will cease to exist when it is closed."
self.tempfile = tempfile.SpooledTemporaryFile(max_size=512*1024)
self.digest_algorithm = digest_algorithm
self.block_digest = hashlib.new(digest_algorithm)
self.payload_offset = None
self.payload_digest = None
self.proxy_client = proxy_client
self._proxy_client_conn_open = True
self.len = 0
self.url = url
def payload_starts_now(self):
self.payload_digest = hashlib.new(self.digest_algorithm)
self.payload_offset = self.len
def _update_payload_digest(self, hunk):
if self.payload_digest:
self.payload_digest.update(hunk)
def _update(self, hunk):
self._update_payload_digest(hunk)
self.block_digest.update(hunk)
self.tempfile.write(hunk)
if self.payload_digest and self._proxy_client_conn_open:
try:
self.proxy_client.sendall(hunk)
except BaseException as e:
self._proxy_client_conn_open = False
self.logger.warn(
'%s sending data to proxy client for url %s',
e, self.url)
self.logger.info(
'will continue downloading from remote server without '
'sending to client %s', self.url)
self.len += len(hunk)
def read(self, size=-1):
hunk = self.fp.read(size)
self._update(hunk)
return hunk
def readinto(self, b):
n = self.fp.readinto(b)
self._update(b[:n])
return n
def readline(self, size=-1):
# XXX depends on implementation details of self.fp.readline(), in
# particular that it doesn't call self.fp.read()
hunk = self.fp.readline(size)
self._update(hunk)
return hunk
def flush(self):
return self.fp.flush()
def close(self):
return self.fp.close()
def __len__(self):
return self.len
def payload_size(self):
if self.payload_offset is not None:
return self.len - self.payload_offset
else:
return 0
class ProxyingRecordingHTTPResponse(http_client.HTTPResponse):
'''
Implementation of HTTPResponse that uses a ProxyingRecorder to read the
response from the remote web server and send it on to the proxy client,
while recording the bytes in transit.
'''
def __init__(
self, sock, debuglevel=0, method=None, proxy_client=None,
digest_algorithm='sha1', url=None):
http_client.HTTPResponse.__init__(
self, sock, debuglevel=debuglevel, method=method)
self.proxy_client = proxy_client
self.url = url
# Keep around extra reference to self.fp because HTTPResponse sets
# self.fp=None after it finishes reading, but we still need it
self.recorder = ProxyingRecorder(
self.fp, proxy_client, digest_algorithm, url=url)
self.fp = self.recorder
def begin(self):
http_client.HTTPResponse.begin(self) # reads status line, headers
status_and_headers = 'HTTP/1.1 {} {}\r\n'.format(
self.status, self.reason)
for k,v in self.msg.items():
if k.lower() not in (
'connection', 'proxy-connection', 'keep-alive',
'proxy-authenticate', 'proxy-authorization', 'upgrade',
'strict-transport-security'):
status_and_headers += '{}: {}\r\n'.format(k, v)
status_and_headers += 'Connection: close\r\n\r\n'
self.proxy_client.sendall(status_and_headers.encode('latin1'))
self.recorder.payload_starts_now()
class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
'''
An http proxy implementation of BaseHTTPRequestHandler, that acts as a
man-in-the-middle in order to peek at the content of https transactions,
and records the bytes in transit as it proxies them.
'''
logger = logging.getLogger("warcprox.mitmproxy.MitmProxyHandler")
def __init__(self, request, client_address, server):
threading.current_thread().name = 'MitmProxyHandler(tid={},started={},client={}:{})'.format(warcprox.gettid(), datetime.datetime.utcnow().isoformat(), client_address[0], client_address[1])
self.is_connect = False
self._headers_buffer = []
request.settimeout(60) # XXX what value should this have?
http_server.BaseHTTPRequestHandler.__init__(self, request, client_address, server)
def _determine_host_port(self):
# Get hostname and port to connect to
if self.is_connect:
host, self.port = self.path.split(':')
else:
self.url = self.path
u = urllib_parse.urlparse(self.url)
if u.scheme != 'http':
raise Exception(
'unable to parse request %s as a proxy request' % (
repr(self.requestline)))
host = u.hostname
self.port = u.port or 80
self.path = urllib_parse.urlunparse(
urllib_parse.ParseResult(
scheme='', netloc='', params=u.params, path=u.path or '/',
query=u.query, fragment=u.fragment))
self.hostname = warcprox.normalize_host(host)
def _connect_to_remote_server(self):
# Connect to destination
if self.onion_tor_socks_proxy_host and self.hostname.lower().endswith('.onion'):
self.logger.info("using tor socks proxy at %s:%s to connect to %s",
self.onion_tor_socks_proxy_host,
self.onion_tor_socks_proxy_port or 1080,
self.hostname)
self._remote_server_sock = socks.socksocket()
self._remote_server_sock.set_proxy(
socks.SOCKS5, addr=self.onion_tor_socks_proxy_host,
port=self.onion_tor_socks_proxy_port, rdns=True)
else:
self._remote_server_sock = socket.socket()
# XXX what value should this timeout have?
self._remote_server_sock.settimeout(60)
self._remote_server_sock.connect((self.hostname, int(self.port)))
# Wrap socket if SSL is required
if self.is_connect:
@ -58,24 +230,44 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
context = ssl.create_default_context()
context.check_hostname = False
context.verify_mode = ssl.CERT_NONE
self._remote_server_sock = context.wrap_socket(
self._remote_server_sock, server_hostname=self.hostname)
except AttributeError:
try:
self._remote_server_sock = ssl.wrap_socket(
self._remote_server_sock)
except ssl.SSLError:
self.logger.warn("failed to establish ssl connection to {}; python ssl library does not support SNI, considering upgrading to python >= 2.7.9 or python 3.4".format(self.hostname))
self.logger.warn(
"failed to establish ssl connection to %s; python "
"ssl library does not support SNI, considering "
"upgrading to python >= 2.7.9 or python 3.4",
self.hostname)
raise
return self._remote_server_sock
def _transition_to_ssl(self):
self.request = self.connection = ssl.wrap_socket(self.connection,
server_side=True, certfile=self.server.ca.cert_for_host(self.hostname))
def do_CONNECT(self):
'''
Handles a http CONNECT request.
The CONNECT method is meant to "convert the request connection to a
transparent TCP/IP tunnel, usually to facilitate SSL-encrypted
communication (HTTPS) through an unencrypted HTTP proxy" (Wikipedia).
do_CONNECT is where the man-in-the-middle logic happens. In do_CONNECT
the proxy transitions the proxy client connection to ssl while
masquerading as the remote web server using a generated certificate.
Meanwhile it makes its own separate ssl connection to the remote web
server. Then it calls self.handle_one_request() again to handle the
request intended for the remote server.
'''
self.is_connect = True
try:
# Connect to destination first
self._determine_host_port()
self._connect_to_remote_server()
# If successful, let's do this!
self.send_response(200, 'Connection established')
@ -83,6 +275,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
self._transition_to_ssl()
except Exception as e:
try:
self.logger.error("problem handling {}: {}".format(repr(self.requestline), e))
if type(e) is socket.timeout:
self.send_error(504, str(e))
else:
@ -115,35 +308,162 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
return result
def do_COMMAND(self):
# if self.is_connect we already connected in do_CONNECT
if self.is_connect:
self.url = self._construct_tunneled_url()
else:
self._determine_host_port()
assert self.url
try:
# Connect to destination
self._connect_to_remote_server()
except warcprox.RequestBlockedByRule as e:
# limit enforcers have already sent the appropriate response
self.logger.info("%s: %s", repr(self.requestline), e)
return
except Exception as e:
self.logger.error("problem processing request {}: {}".format(repr(self.requestline), e), exc_info=True)
self.send_error(500, str(e))
return
try:
self._proxy_request()
except:
self.logger.error("exception proxying request", exc_info=True)
raise
def _proxy_request(self):
'''
Sends the request to the remote server, then uses a ProxyingRecorder to
read the response and send it to the proxy client, while recording the
bytes in transit. Returns a tuple (request, response) where request is
the raw request bytes, and response is a ProxyingRecorder.
'''
# Build request
req_str = '{} {} {}\r\n'.format(
self.command, self.path, self.request_version)
# Swallow headers that don't make sense to forward on, i.e. most
# hop-by-hop headers, see
# http://tools.ietf.org/html/rfc2616#section-13.5.
# self.headers is an email.message.Message, which is case-insensitive
# and doesn't throw KeyError in __delitem__
for key in (
'Connection', 'Proxy-Connection', 'Keep-Alive',
'Proxy-Authenticate', 'Proxy-Authorization', 'Upgrade'):
del self.headers[key]
# Add headers to the request
# XXX in at least python3.3 str(self.headers) uses \n not \r\n :(
req_str += '\r\n'.join(
'{}: {}'.format(k,v) for (k,v) in self.headers.items())
req = req_str.encode('latin1') + b'\r\n\r\n'
# Append message body if present to the request
if 'Content-Length' in self.headers:
req += self.rfile.read(int(self.headers['Content-Length']))
prox_rec_res = None  # so the finally clause below can't hit an unbound name
try:
self.logger.debug('sending to remote server req=%s', repr(req))
# Send it down the pipe!
self._remote_server_sock.sendall(req)
prox_rec_res = ProxyingRecordingHTTPResponse(
self._remote_server_sock, proxy_client=self.connection,
digest_algorithm=self.server.digest_algorithm,
url=self.url)
prox_rec_res.begin()
buf = prox_rec_res.read(8192)
while buf != b'':
buf = prox_rec_res.read(8192)
self.log_request(prox_rec_res.status, prox_rec_res.recorder.len)
except socket.timeout as e:
self.logger.warn(
"%s proxying %s %s", repr(e), self.command, self.url)
except BaseException as e:
self.logger.error(
"%s proxying %s %s", repr(e), self.command, self.url,
exc_info=True)
finally:
# Let's close off the remote end
if prox_rec_res:
prox_rec_res.close()
self._remote_server_sock.close()
return req, prox_rec_res
def __getattr__(self, item):
if item.startswith('do_'):
return self.do_COMMAND
def log_error(self, fmt, *args):
self.logger.error("{0} - - [{1}] {2}".format(self.address_string(),
self.log_date_time_string(), fmt % args))
self.logger.warn(fmt, *args)
def log_message(self, fmt, *args):
self.logger.info("{} {} - - [{}] {}".format(self.__class__.__name__,
self.address_string(), self.log_date_time_string(), fmt % args))
class PooledMixIn(socketserver.ThreadingMixIn):
logger = logging.getLogger("warcprox.mitmproxy.PooledMixIn")
def __init__(self, max_threads=None):
'''
If max_threads is not supplied, calculates a reasonable value based
on system resource limits.
'''
if not max_threads:
# man getrlimit: "RLIMIT_NPROC The maximum number of processes (or,
# more precisely on Linux, threads) that can be created for the
# real user ID of the calling process."
rlimit_nproc = resource.getrlimit(resource.RLIMIT_NPROC)[0]
rlimit_nofile = resource.getrlimit(resource.RLIMIT_NOFILE)[0]
max_threads = min(rlimit_nofile // 10, rlimit_nproc // 2)
self.logger.info(
"max_threads=%s (rlimit_nproc=%s, rlimit_nofile=%s)",
max_threads, rlimit_nproc, rlimit_nofile)
self.pool = concurrent.futures.ThreadPoolExecutor(max_threads)
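# For example, with rlimit_nofile=4096 and rlimit_nproc=15000, the default
# works out to max_threads = min(4096 // 10, 15000 // 2) = 409.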
def process_request(self, request, client_address):
self.pool.submit(self.process_request_thread, request, client_address)
class MitmProxy(http_server.HTTPServer):
def finish_request(self, request, client_address):
'''
We override socketserver.BaseServer.finish_request to get at
MitmProxyHandler's self.request. A normal socket server's self.request
is set to `request` and never changes, but in our case, it may be
replaced with an SSL socket. The caller of this method (e.g.
self.process_request or PooledMitmProxy.process_request_thread) needs
to get a hold of that socket so it can close it.
'''
req_handler = self.RequestHandlerClass(request, client_address, self)
return req_handler.request
def process_request(self, request, client_address):
'''
This is an almost verbatim copy/paste of
socketserver.BaseServer.process_request.
The only difference is that it expects self.finish_request to return
the request (i.e. the socket). This new value of request is passed on
to self.shutdown_request. See the comment on self.finish_request for
the rationale.
'''
request = self.finish_request(request, client_address)
self.shutdown_request(request)
class PooledMitmProxy(PooledMixIn, MitmProxy):
def process_request_thread(self, request, client_address):
'''
This is an almost verbatim copy/paste of
socketserver.ThreadingMixIn.process_request_thread.
The only difference is that it expects self.finish_request to return
the request (i.e. the socket). This new value of request is passed on
to self.shutdown_request. See the comment on MitmProxy.finish_request
for the rationale.
'''
try:
request = self.finish_request(request, client_address)
self.shutdown_request(request)
except:
self.handle_error(request, client_address)
self.shutdown_request(request)

View File

@ -1,4 +1,24 @@
# vim:set sw=4 et:
'''
warcprox/playback.py - rudimentary support for playback of urls archived by
warcprox (not much used or maintained)
Copyright (C) 2013-2016 Internet Archive
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
USA.
'''
from __future__ import absolute_import
@ -12,14 +32,6 @@ try:
except ImportError:
import SocketServer as socketserver
import logging
import os
from hanzo import warctools
@ -27,13 +39,14 @@ import json
import traceback
import re
from warcprox.mitmproxy import MitmProxyHandler
import warcprox
class PlaybackProxyHandler(MitmProxyHandler):
logger = logging.getLogger("warcprox.playback.PlaybackProxyHandler")
# @Override
def _connect_to_remote_server(self):
# don't connect to any remote server!
pass
@ -180,13 +193,14 @@ class PlaybackProxyHandler(MitmProxyHandler):
class PlaybackProxy(socketserver.ThreadingMixIn, http_server.HTTPServer):
logger = logging.getLogger("warcprox.playback.PlaybackProxy")
def __init__(self, ca=None, playback_index_db=None, options=warcprox.Options()):
server_address = (options.address or 'localhost', options.playback_port if options.playback_port is not None else 8001)
http_server.HTTPServer.__init__(self, server_address, PlaybackProxyHandler, bind_and_activate=True)
self.ca = ca
self.playback_index_db = playback_index_db
self.warcs_dir = warcs_dir
self.warcs_dir = options.directory
self.options = options
def server_activate(self):
http_server.HTTPServer.server_activate(self)
@ -201,6 +215,14 @@ class PlaybackIndexDb(object):
logger = logging.getLogger("warcprox.playback.PlaybackIndexDb")
def __init__(self, dbm_file='./warcprox-playback-index.db'):
try:
import dbm.gnu as dbm_gnu
except ImportError:
try:
import gdbm as dbm_gnu
except ImportError:
import anydbm as dbm_gnu
if os.path.exists(dbm_file):
self.logger.info('opening existing playback index database {}'.format(dbm_file))
else:
@ -217,6 +239,9 @@ class PlaybackIndexDb(object):
except:
pass
def notify(self, recorded_url, records):
self.save(records[0].warc_filename, records, records[0].offset)
def save(self, warcfile, recordset, offset):
response_record = recordset[0]
# XXX canonicalize url?

303
warcprox/stats.py Normal file
View File

@ -0,0 +1,303 @@
'''
warcprox/stats.py - keeps statistics on what has been archived
Copyright (C) 2013-2016 Internet Archive
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
USA.
'''
from __future__ import absolute_import
import logging
import os
import json
from hanzo import warctools
import random
import warcprox
import threading
import rethinkdb as r
import datetime
import surt
def _empty_bucket(bucket):
return {
"bucket": bucket,
"total": {
"urls": 0,
"wire_bytes": 0,
},
"new": {
"urls": 0,
"wire_bytes": 0,
},
"revisit": {
"urls": 0,
"wire_bytes": 0,
},
}
class StatsDb:
logger = logging.getLogger("warcprox.stats.StatsDb")
def __init__(self, dbm_file='./warcprox-stats.db', options=warcprox.Options()):
try:
import dbm.gnu as dbm_gnu
except ImportError:
try:
import gdbm as dbm_gnu
except ImportError:
import anydbm as dbm_gnu
if os.path.exists(dbm_file):
self.logger.info('opening existing stats database {}'.format(dbm_file))
else:
self.logger.info('creating new stats database {}'.format(dbm_file))
self.db = dbm_gnu.open(dbm_file, 'c')
self.options = options
def start(self):
# method only exists to match RethinkStatsDb
pass
def stop(self):
self.close()
def close(self):
self.db.close()
def sync(self):
try:
self.db.sync()
except:
pass
def value(self, bucket0="__all__", bucket1=None, bucket2=None):
# Gdbm wants str/bytes keys in python2, str/unicode keys in python3.
# This ugliness deals with keys that arrive as unicode in py2.
b0 = bucket0.encode("utf-8") if bucket0 and not isinstance(bucket0, str) else bucket0
b1 = bucket1.encode("utf-8") if bucket1 and not isinstance(bucket1, str) else bucket1
b2 = bucket2.encode("utf-8") if bucket2 and not isinstance(bucket2, str) else bucket2
if b0 in self.db:
bucket0_stats = json.loads(self.db[b0].decode("utf-8"))
if b1:
if b2:
return bucket0_stats[b1][b2]
else:
return bucket0_stats[b1]
else:
return bucket0_stats
else:
return None
def notify(self, recorded_url, records):
self.tally(recorded_url, records)
def buckets(self, recorded_url):
'''
Unravels bucket definitions in Warcprox-Meta header. Each bucket
definition can either be a string, which signifies the name of the
bucket, or a dict. If a dict it is expected to have at least an item
with key 'bucket' whose value is the name of the bucket. The other
currently recognized item is 'tally-domains', which if supplied should
be a list of domains. This instructs warcprox to additionally tally
substats of the given bucket by domain. Host stats are stored in the
stats table under the key '{parent-bucket}:{domain(normalized)}'.
Example Warcprox-Meta header (a real one will likely have other
sections besides 'stats'):
Warcprox-Meta: {'stats':{'buckets':['bucket1',{'bucket':'bucket2','tally-domains':['foo.bar.com','192.168.10.20']}]}}
'''
buckets = ["__all__"]
if (recorded_url.warcprox_meta
and "stats" in recorded_url.warcprox_meta
and "buckets" in recorded_url.warcprox_meta["stats"]):
for bucket in recorded_url.warcprox_meta["stats"]["buckets"]:
if isinstance(bucket, dict):
if not 'bucket' in bucket:
self.logger.warn(
'ignoring invalid stats bucket in '
'warcprox-meta header %s', bucket)
continue
buckets.append(bucket['bucket'])
if bucket.get('tally-domains'):
url = warcprox.Url(recorded_url.url.decode('utf-8'))
for domain in bucket['tally-domains']:
if url.matches_ip_or_domain(domain):
buckets.append('%s:%s' % (
bucket['bucket'],
warcprox.normalize_host(domain)))
else:
buckets.append(bucket)
else:
buckets.append("__unspecified__")
return buckets
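# For the example Warcprox-Meta header in the docstring above, a url on host
# foo.bar.com would be tallied in buckets
# ['__all__', 'bucket1', 'bucket2', 'bucket2:foo.bar.com'].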
def tally(self, recorded_url, records):
for bucket in self.buckets(recorded_url):
# Gdbm wants str/bytes keys in python2, str/unicode keys in python3.
# This ugliness deals with keys that arrive as unicode in py2.
b = bucket.encode("utf-8") if bucket and not isinstance(bucket, str) else bucket
if b in self.db:
bucket_stats = json.loads(self.db[b].decode("utf-8"))
else:
bucket_stats = _empty_bucket(b)
bucket_stats["total"]["urls"] += 1
bucket_stats["total"]["wire_bytes"] += recorded_url.size
if records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.REVISIT:
bucket_stats["revisit"]["urls"] += 1
bucket_stats["revisit"]["wire_bytes"] += recorded_url.size
else:
bucket_stats["new"]["urls"] += 1
bucket_stats["new"]["wire_bytes"] += recorded_url.size
self.db[b] = json.dumps(bucket_stats, separators=(',',':')).encode("utf-8")
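# Reading tallies back, a quick sketch using the value() method above:
#
#   stats_db.value('__all__', 'new', 'urls')          # count of novel urls
#   stats_db.value('bucket1', 'total', 'wire_bytes')  # total bytes for bucket1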
class RethinkStatsDb(StatsDb):
"""Updates database in batch every 2.0 seconds"""
logger = logging.getLogger("warcprox.stats.RethinkStatsDb")
def __init__(self, rethinker, table="stats", shards=None, replicas=None, options=warcprox.Options()):
self.r = rethinker
self.table = table
self.shards = shards or 1 # 1 shard by default because it's probably a small table
self.replicas = replicas or min(3, len(self.r.servers))
self._ensure_db_table()
self.options = options
self._stop = threading.Event()
self._batch_lock = threading.RLock()
with self._batch_lock:
self._batch = {}
self._timer = None
def start(self):
"""Starts batch update repeating timer."""
self._update_batch() # starts repeating timer
def _bucket_batch_update_reql(self, bucket):
return self.r.table(self.table).get(bucket).replace(
lambda old: r.branch(
old.eq(None), self._batch[bucket], old.merge({
"total": {
"urls": old["total"]["urls"].add(
self._batch[bucket]["total"]["urls"]),
"wire_bytes": old["total"]["wire_bytes"].add(
self._batch[bucket]["total"]["wire_bytes"]),
},
"new": {
"urls": old["new"]["urls"].add(
self._batch[bucket]["new"]["urls"]),
"wire_bytes": old["new"]["wire_bytes"].add(
self._batch[bucket]["new"]["wire_bytes"]),
},
"revisit": {
"urls": old["revisit"]["urls"].add(
self._batch[bucket]["revisit"]["urls"]),
"wire_bytes": old["revisit"]["wire_bytes"].add(
self._batch[bucket]["revisit"]["wire_bytes"]),
},
})))
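# In words: if no row exists yet for this bucket (old.eq(None)), insert the
# batch's tallies wholesale; otherwise have the server add the batched
# counters to the stored totals in one atomic replace.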
def _update_batch(self):
with self._batch_lock:
if len(self._batch) > 0:
# XXX can all the buckets be done in one query?
for bucket in self._batch:
result = self._bucket_batch_update_reql(bucket).run()
if (not result["inserted"] and not result["replaced"]
or sorted(result.values()) != [0,0,0,0,0,1]):
raise Exception(
"unexpected result %s updating stats %s" % (
result, self._batch[bucket]))
self._batch = {}
if not self._stop.is_set():
self._timer = threading.Timer(2.0, self._update_batch)
self._timer.name = "RethinkStats-batch-update-timer-%s" % (
datetime.datetime.utcnow().isoformat())
self._timer.start()
else:
self.logger.info("finished")
def _ensure_db_table(self):
dbs = self.r.db_list().run()
if not self.r.dbname in dbs:
self.logger.info(
"creating rethinkdb database %s", repr(self.r.dbname))
self.r.db_create(self.r.dbname).run()
tables = self.r.table_list().run()
if not self.table in tables:
self.logger.info(
"creating rethinkdb table %s in database %s shards=%s "
"replicas=%s", repr(self.table), repr(self.r.dbname),
self.shards, self.replicas)
self.r.table_create(
self.table, primary_key="bucket", shards=self.shards,
replicas=self.replicas).run()
def close(self):
self.stop()
def stop(self):
self.logger.info("stopping rethinkdb stats table batch updates")
self._stop.set()
if self._timer:
self._timer.join()
def sync(self):
pass
def value(self, bucket0="__all__", bucket1=None, bucket2=None):
bucket0_stats = self.r.table(self.table).get(bucket0).run()
self.logger.debug(
'stats db lookup of bucket=%s returned %s',
bucket0, bucket0_stats)
if bucket0_stats:
if bucket1:
if bucket2:
return bucket0_stats[bucket1][bucket2]
else:
return bucket0_stats[bucket1]
return bucket0_stats
def tally(self, recorded_url, records):
buckets = self.buckets(recorded_url)
is_revisit = records[0].get_header(
warctools.WarcRecord.TYPE) == warctools.WarcRecord.REVISIT
with self._batch_lock:
for bucket in buckets:
bucket_stats = self._batch.setdefault(
bucket, _empty_bucket(bucket))
bucket_stats["total"]["urls"] += 1
bucket_stats["total"]["wire_bytes"] += recorded_url.size
if is_revisit:
bucket_stats["revisit"]["urls"] += 1
bucket_stats["revisit"]["wire_bytes"] += recorded_url.size
else:
bucket_stats["new"]["urls"] += 1
bucket_stats["new"]["wire_bytes"] += recorded_url.size
def notify(self, recorded_url, records):
self.tally(recorded_url, records)

View File

@ -1,414 +0,0 @@
#!/usr/bin/env python
# vim: set sw=4 et:
import unittest
import threading
import time
import logging
import sys
import ssl
import re
import tempfile
import OpenSSL
import os
import shutil
import requests
try:
import http.server as http_server
except ImportError:
import BaseHTTPServer as http_server
try:
import queue
except ImportError:
import Queue as queue
import certauth.certauth
import warcprox.controller
import warcprox.warcprox
import warcprox.playback
import warcprox.warcwriter
import warcprox.dedup
class TestHttpRequestHandler(http_server.BaseHTTPRequestHandler):
logger = logging.getLogger('TestHttpRequestHandler')
def do_GET(self):
self.logger.info('GET {}'.format(self.path))
m = re.match(r'^/([^/]+)/([^/]+)$', self.path)
if m is not None:
special_header = 'warcprox-test-header: {}!'.format(m.group(1)).encode('utf-8')
payload = 'I am the warcprox test payload! {}!\n'.format(10*m.group(2)).encode('utf-8')
headers = (b'HTTP/1.1 200 OK\r\n'
+ b'Content-Type: text/plain\r\n'
+ special_header + b'\r\n'
+ b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n'
+ b'\r\n')
else:
payload = b'404 Not Found\n'
headers = (b'HTTP/1.1 404 Not Found\r\n'
+ b'Content-Type: text/plain\r\n'
+ b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n'
+ b'\r\n')
self.connection.sendall(headers)
self.connection.sendall(payload)
class WarcproxTest(unittest.TestCase):
logger = logging.getLogger('WarcproxTest')
def __init__(self, methodName='runTest'):
self.__cert = None
unittest.TestCase.__init__(self, methodName)
@property
def _cert(self):
if self.__cert is None:
f = tempfile.NamedTemporaryFile(prefix='warcprox-test-https-', suffix='.pem', delete=False)
try:
key = OpenSSL.crypto.PKey()
key.generate_key(OpenSSL.crypto.TYPE_RSA, 2048)
req = OpenSSL.crypto.X509Req()
req.get_subject().CN = 'localhost'
req.set_pubkey(key)
req.sign(key, 'sha1')
cert = OpenSSL.crypto.X509()
cert.set_subject(req.get_subject())
cert.set_serial_number(0)
cert.gmtime_adj_notBefore(0)
cert.gmtime_adj_notAfter(2*60*60) # valid for 2hrs
cert.set_issuer(cert.get_subject())
cert.set_pubkey(req.get_pubkey())
cert.sign(key, 'sha1')
f.write(OpenSSL.crypto.dump_privatekey(OpenSSL.SSL.FILETYPE_PEM, key))
f.write(OpenSSL.crypto.dump_certificate(OpenSSL.SSL.FILETYPE_PEM, cert))
self.logger.info('generated self-signed certificate {}'.format(f.name))
self.__cert = f.name
finally:
f.close()
return self.__cert
def _start_http_servers(self):
self.http_daemon = http_server.HTTPServer(('localhost', 0),
RequestHandlerClass=TestHttpRequestHandler)
self.logger.info('starting http://{}:{}'.format(self.http_daemon.server_address[0], self.http_daemon.server_address[1]))
self.http_daemon_thread = threading.Thread(name='HttpdThread',
target=self.http_daemon.serve_forever)
self.http_daemon_thread.start()
# http://www.piware.de/2011/01/creating-an-https-server-in-python/
self.https_daemon = http_server.HTTPServer(('localhost', 0),
RequestHandlerClass=TestHttpRequestHandler)
# self.https_daemon.socket = ssl.wrap_socket(httpd.socket, certfile='path/to/localhost.pem', server_side=True)
self.https_daemon.socket = ssl.wrap_socket(self.https_daemon.socket, certfile=self._cert, server_side=True)
self.logger.info('starting https://{}:{}'.format(self.https_daemon.server_address[0], self.https_daemon.server_address[1]))
self.https_daemon_thread = threading.Thread(name='HttpdThread',
target=self.https_daemon.serve_forever)
self.https_daemon_thread.start()
def _start_warcprox(self):
f = tempfile.NamedTemporaryFile(prefix='warcprox-test-ca-', suffix='.pem', delete=True)
f.close() # delete it, or CertificateAuthority will try to read it
self._ca_file = f.name
self._ca_dir = tempfile.mkdtemp(prefix='warcprox-test-', suffix='-ca')
ca = certauth.certauth.CertificateAuthority(self._ca_file, self._ca_dir, 'warcprox-test')
recorded_url_q = queue.Queue()
proxy = warcprox.warcprox.WarcProxy(server_address=('localhost', 0), ca=ca,
recorded_url_q=recorded_url_q)
self._warcs_dir = tempfile.mkdtemp(prefix='warcprox-test-warcs-')
f = tempfile.NamedTemporaryFile(prefix='warcprox-test-playback-index-', suffix='.db', delete=False)
f.close()
self._playback_index_db_file = f.name
playback_index_db = warcprox.playback.PlaybackIndexDb(self._playback_index_db_file)
playback_proxy = warcprox.playback.PlaybackProxy(server_address=('localhost', 0), ca=ca,
playback_index_db=playback_index_db, warcs_dir=self._warcs_dir)
f = tempfile.NamedTemporaryFile(prefix='warcprox-test-dedup-', suffix='.db', delete=False)
f.close()
self._dedup_db_file = f.name
dedup_db = warcprox.dedup.DedupDb(self._dedup_db_file)
warc_writer = warcprox.warcwriter.WarcWriter(directory=self._warcs_dir,
port=proxy.server_port, dedup_db=dedup_db,
playback_index_db=playback_index_db)
warc_writer_thread = warcprox.warcwriter.WarcWriterThread(recorded_url_q=recorded_url_q,
warc_writer=warc_writer)
self.warcprox = warcprox.controller.WarcproxController(proxy, warc_writer_thread, playback_proxy)
self.logger.info('starting warcprox')
self.warcprox_thread = threading.Thread(name='WarcproxThread',
target=self.warcprox.run_until_shutdown)
self.warcprox_thread.start()
def setUp(self):
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG,
format='%(asctime)s %(levelname)s %(process)d %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')
self._start_http_servers()
self._start_warcprox()
archiving_proxy = 'http://localhost:{}'.format(self.warcprox.proxy.server_port)
self.archiving_proxies = {'http':archiving_proxy, 'https':archiving_proxy}
playback_proxy = 'http://localhost:{}'.format(self.warcprox.playback_proxy.server_port)
self.playback_proxies = {'http':playback_proxy, 'https':playback_proxy}
def tearDown(self):
self.logger.info('stopping warcprox')
self.warcprox.stop.set()
self.logger.info('stopping http and https daemons')
self.http_daemon.shutdown()
self.https_daemon.shutdown()
self.http_daemon.server_close()
self.https_daemon.server_close()
# Have to wait for threads to finish or the threads will try to use
# variables that no longer exist, resulting in errors like this:
# File "/usr/lib/python2.7/SocketServer.py", line 235, in serve_forever
# r, w, e = _eintr_retry(select.select, [self], [], [],
# AttributeError: 'NoneType' object has no attribute 'select'
self.http_daemon_thread.join()
self.https_daemon_thread.join()
self.warcprox_thread.join()
for f in (self.__cert, self._ca_file, self._ca_dir, self._warcs_dir, self._playback_index_db_file, self._dedup_db_file):
if os.path.isdir(f):
self.logger.info('deleting directory {}'.format(f))
shutil.rmtree(f)
else:
self.logger.info('deleting file {}'.format(f))
os.unlink(f)
def _test_httpds_no_proxy(self):
url = 'http://localhost:{}/'.format(self.http_daemon.server_port)
response = requests.get(url)
self.assertEqual(response.status_code, 404)
self.assertEqual(response.content, b'404 Not Found\n')
url = 'https://localhost:{}/'.format(self.https_daemon.server_port)
response = requests.get(url, verify=False)
self.assertEqual(response.status_code, 404)
self.assertEqual(response.content, b'404 Not Found\n')
url = 'http://localhost:{}/a/b'.format(self.http_daemon.server_port)
response = requests.get(url)
self.assertEqual(response.status_code, 200)
self.assertEqual(response.headers['warcprox-test-header'], 'a!')
self.assertEqual(response.content, b'I am the warcprox test payload! bbbbbbbbbb!\n')
url = 'https://localhost:{}/c/d'.format(self.https_daemon.server_port)
response = requests.get(url, verify=False)
self.assertEqual(response.status_code, 200)
self.assertEqual(response.headers['warcprox-test-header'], 'c!')
self.assertEqual(response.content, b'I am the warcprox test payload! dddddddddd!\n')
def poll_playback_until(self, url, status, timeout_sec):
start = time.time()
# check playback (warc writing is asynchronous, give it up to 10 sec)
while time.time() - start < timeout_sec:
response = requests.get(url, proxies=self.playback_proxies, verify=False)
if response.status_code == status:
break
time.sleep(0.5)
return response
def _test_archive_and_playback_http_url(self):
url = 'http://localhost:{}/a/b'.format(self.http_daemon.server_port)
# ensure playback fails before archiving
response = requests.get(url, proxies=self.playback_proxies)
self.assertEqual(response.status_code, 404)
self.assertEqual(response.content, b'404 Not in Archive\n')
# archive
response = requests.get(url, proxies=self.archiving_proxies)
self.assertEqual(response.status_code, 200)
self.assertEqual(response.headers['warcprox-test-header'], 'a!')
self.assertEqual(response.content, b'I am the warcprox test payload! bbbbbbbbbb!\n')
response = self.poll_playback_until(url, status=200, timeout_sec=10)
self.assertEqual(response.status_code, 200)
self.assertEqual(response.headers['warcprox-test-header'], 'a!')
self.assertEqual(response.content, b'I am the warcprox test payload! bbbbbbbbbb!\n')
def _test_archive_and_playback_https_url(self):
url = 'https://localhost:{}/c/d'.format(self.https_daemon.server_port)
# ensure playback fails before archiving
response = requests.get(url, proxies=self.playback_proxies, verify=False)
self.assertEqual(response.status_code, 404)
self.assertEqual(response.content, b'404 Not in Archive\n')
# fetch & archive response
response = requests.get(url, proxies=self.archiving_proxies, verify=False)
self.assertEqual(response.status_code, 200)
self.assertEqual(response.headers['warcprox-test-header'], 'c!')
self.assertEqual(response.content, b'I am the warcprox test payload! dddddddddd!\n')
# test playback
response = self.poll_playback_until(url, status=200, timeout_sec=10)
self.assertEqual(response.status_code, 200)
self.assertEqual(response.headers['warcprox-test-header'], 'c!')
self.assertEqual(response.content, b'I am the warcprox test payload! dddddddddd!\n')
# test dedup of same http url with same payload
def _test_dedup_http(self):
url = 'http://localhost:{}/e/f'.format(self.http_daemon.server_port)
# ensure playback fails before archiving
response = requests.get(url, proxies=self.playback_proxies, verify=False)
self.assertEqual(response.status_code, 404)
self.assertEqual(response.content, b'404 Not in Archive\n')
# check not in dedup db
dedup_lookup = self.warcprox.warc_writer_thread.warc_writer.dedup_db.lookup(b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc')
self.assertIsNone(dedup_lookup)
# archive
response = requests.get(url, proxies=self.archiving_proxies, verify=False)
self.assertEqual(response.status_code, 200)
self.assertEqual(response.headers['warcprox-test-header'], 'e!')
self.assertEqual(response.content, b'I am the warcprox test payload! ffffffffff!\n')
# test playback
response = self.poll_playback_until(url, status=200, timeout_sec=10)
self.assertEqual(response.status_code, 200)
self.assertEqual(response.headers['warcprox-test-header'], 'e!')
self.assertEqual(response.content, b'I am the warcprox test payload! ffffffffff!\n')
# check in dedup db
# {u'i': u'<urn:uuid:e691dc0f-4bb9-4ad8-9afb-2af836aa05e4>', u'u': u'https://localhost:62841/c/d', u'd': u'2013-11-22T00:14:37Z'}
dedup_lookup = self.warcprox.warc_writer_thread.warc_writer.dedup_db.lookup(b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc')
self.assertEqual(dedup_lookup['u'], url.encode('ascii'))
self.assertRegexpMatches(dedup_lookup['i'], br'^<urn:uuid:[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}>$')
self.assertRegexpMatches(dedup_lookup['d'], br'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$')
record_id = dedup_lookup['i']
dedup_date = dedup_lookup['d']
# need revisit to have a later timestamp than original, else playing
# back the latest record might not hit the revisit
time.sleep(1.5)
# fetch & archive revisit
response = requests.get(url, proxies=self.archiving_proxies, verify=False)
self.assertEqual(response.status_code, 200)
self.assertEqual(response.headers['warcprox-test-header'], 'e!')
self.assertEqual(response.content, b'I am the warcprox test payload! ffffffffff!\n')
# XXX need to give warc writer thread a chance, and we don't have any change to poll for :-\
time.sleep(2.0)
# check in dedup db (no change from prev)
dedup_lookup = self.warcprox.warc_writer_thread.warc_writer.dedup_db.lookup(b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc')
self.assertEqual(dedup_lookup['u'], url.encode('ascii'))
self.assertEqual(dedup_lookup['i'], record_id)
self.assertEqual(dedup_lookup['d'], dedup_date)
# test playback
self.logger.debug('testing playback of revisit of {}'.format(url))
response = self.poll_playback_until(url, status=200, timeout_sec=10)
self.assertEqual(response.status_code, 200)
self.assertEqual(response.headers['warcprox-test-header'], 'e!')
self.assertEqual(response.content, b'I am the warcprox test payload! ffffffffff!\n')
# XXX how to check dedup was used?
# test dedup of same https url with same payload
def _test_dedup_https(self):
url = 'https://localhost:{}/g/h'.format(self.https_daemon.server_port)
# ensure playback fails before archiving
response = requests.get(url, proxies=self.playback_proxies, verify=False)
self.assertEqual(response.status_code, 404)
self.assertEqual(response.content, b'404 Not in Archive\n')
# check not in dedup db
dedup_lookup = self.warcprox.warc_writer_thread.warc_writer.dedup_db.lookup(b'sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89')
self.assertIsNone(dedup_lookup)
# archive
response = requests.get(url, proxies=self.archiving_proxies, verify=False)
self.assertEqual(response.status_code, 200)
self.assertEqual(response.headers['warcprox-test-header'], 'g!')
self.assertEqual(response.content, b'I am the warcprox test payload! hhhhhhhhhh!\n')
# test playback
response = self.poll_playback_until(url, status=200, timeout_sec=10)
self.assertEqual(response.status_code, 200)
self.assertEqual(response.headers['warcprox-test-header'], 'g!')
self.assertEqual(response.content, b'I am the warcprox test payload! hhhhhhhhhh!\n')
# check in dedup db
# {u'i': u'<urn:uuid:e691dc0f-4bb9-4ad8-9afb-2af836aa05e4>', u'u': u'https://localhost:62841/c/d', u'd': u'2013-11-22T00:14:37Z'}
dedup_lookup = self.warcprox.warc_writer_thread.warc_writer.dedup_db.lookup(b'sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89')
self.assertEqual(dedup_lookup['u'], url.encode('ascii'))
self.assertRegexpMatches(dedup_lookup['i'], br'^<urn:uuid:[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}>$')
self.assertRegexpMatches(dedup_lookup['d'], br'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$')
record_id = dedup_lookup['i']
dedup_date = dedup_lookup['d']
# need revisit to have a later timestamp than original, else playing
# back the latest record might not hit the revisit
time.sleep(1.5)
# fetch & archive revisit
response = requests.get(url, proxies=self.archiving_proxies, verify=False)
self.assertEqual(response.status_code, 200)
self.assertEqual(response.headers['warcprox-test-header'], 'g!')
self.assertEqual(response.content, b'I am the warcprox test payload! hhhhhhhhhh!\n')
# XXX need to give warc writer thread a chance, and we don't have any change to poll for :-\
time.sleep(2.0)
# check in dedup db (no change from prev)
dedup_lookup = self.warcprox.warc_writer_thread.warc_writer.dedup_db.lookup(b'sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89')
self.assertEqual(dedup_lookup['u'], url.encode('ascii'))
self.assertEqual(dedup_lookup['i'], record_id)
self.assertEqual(dedup_lookup['d'], dedup_date)
# test playback
self.logger.debug('testing playback of revisit of {}'.format(url))
response = self.poll_playback_until(url, status=200, timeout_sec=10)
self.assertEqual(response.status_code, 200)
self.assertEqual(response.headers['warcprox-test-header'], 'g!')
self.assertEqual(response.content, b'I am the warcprox test payload! hhhhhhhhhh!\n')
# XXX how to check dedup was used?
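# one possibility (a sketch, not wired in; the warc filename is a
# hypothetical placeholder): scan the warc with warctools and assert that
# the second capture produced a revisit record, e.g.
#   from hanzo import warctools
#   fh = warctools.WarcRecord.open_archive('warcs/WARCPROX-XXX.warc.gz.open')
#   types = [record.type for (offset, record, errors) in fh.read_records(limit=None)]
#   assert warctools.WarcRecord.REVISIT in types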
# run everything from here, otherwise it wants to setUp() and tearDown()
# around each test
def runTest(self):
self._test_httpds_no_proxy()
self._test_archive_and_playback_http_url()
self._test_archive_and_playback_https_url()
self._test_dedup_http()
self._test_dedup_https()
# self._test_dedup_mixed_http()
# self._test_dedup_mixed_https()
if __name__ == '__main__':
unittest.main()
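
The hardcoded dedup keys in the tests above are just warcprox's digest_str
form of the payload's sha1. A sketch for regenerating one, assuming the
hex (non-base32) form the tests run with:

    import hashlib
    import warcprox

    payload = b'I am the warcprox test payload! ffffffffff!\n'
    key = warcprox.digest_str(hashlib.sha1(payload), False)
    # expected to reproduce the b'sha1:65e1216...' literal used above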

171
warcprox/warc.py Normal file
View File

@ -0,0 +1,171 @@
#
# warcprox/warc.py - assembles warc records
#
# Copyright (C) 2013-2016 Internet Archive
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
# USA.
#
from __future__ import absolute_import
import logging
import warcprox
import hashlib
import socket
import hanzo.httptools
from hanzo import warctools
import datetime
class WarcRecordBuilder:
logger = logging.getLogger("warcprox.warc.WarcRecordBuilder")
def __init__(self, digest_algorithm="sha1", base32=False):
self.digest_algorithm = digest_algorithm
self.base32 = base32
def _build_response_principal_record(self, recorded_url, warc_date):
"""Builds response or revisit record, whichever is appropriate."""
if hasattr(recorded_url, "dedup_info") and recorded_url.dedup_info:
# revisit record
recorded_url.response_recorder.tempfile.seek(0)
if recorded_url.response_recorder.payload_offset is not None:
response_header_block = recorded_url.response_recorder.tempfile.read(recorded_url.response_recorder.payload_offset)
else:
response_header_block = recorded_url.response_recorder.tempfile.read()
return self.build_warc_record(
url=recorded_url.url, warc_date=warc_date,
data=response_header_block,
warc_type=warctools.WarcRecord.REVISIT,
refers_to=recorded_url.dedup_info['id'],
refers_to_target_uri=recorded_url.dedup_info['url'],
refers_to_date=recorded_url.dedup_info['date'],
payload_digest=warcprox.digest_str(recorded_url.response_recorder.payload_digest, self.base32),
profile=warctools.WarcRecord.PROFILE_IDENTICAL_PAYLOAD_DIGEST,
content_type=hanzo.httptools.ResponseMessage.CONTENT_TYPE,
remote_ip=recorded_url.remote_ip)
else:
# response record
return self.build_warc_record(
url=recorded_url.url, warc_date=warc_date,
recorder=recorded_url.response_recorder,
warc_type=warctools.WarcRecord.RESPONSE,
content_type=hanzo.httptools.ResponseMessage.CONTENT_TYPE,
remote_ip=recorded_url.remote_ip)
def build_warc_records(self, recorded_url):
"""Returns a tuple of hanzo.warctools.warc.WarcRecord (principal_record, ...)"""
warc_date = warctools.warc.warc_datetime_str(recorded_url.timestamp)
if recorded_url.response_recorder:
principal_record = self._build_response_principal_record(recorded_url, warc_date)
request_record = self.build_warc_record(url=recorded_url.url,
warc_date=warc_date, data=recorded_url.request_data,
warc_type=warctools.WarcRecord.REQUEST,
content_type=hanzo.httptools.RequestMessage.CONTENT_TYPE,
concurrent_to=principal_record.id)
return principal_record, request_record
else:
principal_record = self.build_warc_record(url=recorded_url.url,
warc_date=warc_date, data=recorded_url.request_data,
warc_type=recorded_url.custom_type,
content_type=recorded_url.content_type.encode("latin1"))
return (principal_record,)
def build_warc_record(self, url, warc_date=None, recorder=None, data=None,
concurrent_to=None, warc_type=None, content_type=None, remote_ip=None,
profile=None, refers_to=None, refers_to_target_uri=None,
refers_to_date=None, payload_digest=None):
if warc_date is None:
warc_date = warctools.warc.warc_datetime_str(datetime.datetime.utcnow())
record_id = warctools.WarcRecord.random_warc_uuid()
headers = []
if warc_type is not None:
headers.append((warctools.WarcRecord.TYPE, warc_type))
headers.append((warctools.WarcRecord.ID, record_id))
headers.append((warctools.WarcRecord.DATE, warc_date))
headers.append((warctools.WarcRecord.URL, url))
if remote_ip is not None:
headers.append((warctools.WarcRecord.IP_ADDRESS, remote_ip))
if profile is not None:
headers.append((warctools.WarcRecord.PROFILE, profile))
if refers_to is not None:
headers.append((warctools.WarcRecord.REFERS_TO, refers_to))
if refers_to_target_uri is not None:
headers.append((warctools.WarcRecord.REFERS_TO_TARGET_URI, refers_to_target_uri))
if refers_to_date is not None:
headers.append((warctools.WarcRecord.REFERS_TO_DATE, refers_to_date))
if concurrent_to is not None:
headers.append((warctools.WarcRecord.CONCURRENT_TO, concurrent_to))
if content_type is not None:
headers.append((warctools.WarcRecord.CONTENT_TYPE, content_type))
if payload_digest is not None:
headers.append((warctools.WarcRecord.PAYLOAD_DIGEST, payload_digest))
if recorder is not None:
headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(recorder)).encode('latin1')))
headers.append((warctools.WarcRecord.BLOCK_DIGEST,
warcprox.digest_str(recorder.block_digest, self.base32)))
if recorder.payload_digest is not None:
headers.append((warctools.WarcRecord.PAYLOAD_DIGEST,
warcprox.digest_str(recorder.payload_digest, self.base32)))
recorder.tempfile.seek(0)
record = warctools.WarcRecord(headers=headers, content_file=recorder.tempfile)
else:
headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(data)).encode('latin1')))
digest = hashlib.new(self.digest_algorithm, data)
headers.append((warctools.WarcRecord.BLOCK_DIGEST,
warcprox.digest_str(digest, self.base32)))
if not payload_digest:
headers.append((warctools.WarcRecord.PAYLOAD_DIGEST,
warcprox.digest_str(digest, self.base32)))
content_tuple = content_type, data
record = warctools.WarcRecord(headers=headers, content=content_tuple)
return record
def build_warcinfo_record(self, filename):
warc_record_date = warctools.warc.warc_datetime_str(datetime.datetime.utcnow())
record_id = warctools.WarcRecord.random_warc_uuid()
headers = []
headers.append((warctools.WarcRecord.ID, record_id))
headers.append((warctools.WarcRecord.TYPE, warctools.WarcRecord.WARCINFO))
headers.append((warctools.WarcRecord.FILENAME, filename.encode('latin1')))
headers.append((warctools.WarcRecord.DATE, warc_record_date))
warcinfo_fields = []
warcinfo_fields.append(b'software: warcprox ' + warcprox.__version__.encode('latin1'))
hostname = socket.gethostname()
warcinfo_fields.append('hostname: {}'.format(hostname).encode('latin1'))
warcinfo_fields.append('ip: {}'.format(socket.gethostbyname(hostname)).encode('latin1'))
warcinfo_fields.append(b'format: WARC File Format 1.0')
# warcinfo_fields.append('robots: ignore')
# warcinfo_fields.append('description: {0}'.format(self.description))
# warcinfo_fields.append('isPartOf: {0}'.format(self.is_part_of))
data = b'\r\n'.join(warcinfo_fields) + b'\r\n'
record = warctools.WarcRecord(headers=headers, content=(b'application/warc-fields', data))
return record
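
A usage sketch for the builder above (url, payload, and output filename are
illustrative):

    from hanzo import warctools
    import warcprox.warc

    builder = warcprox.warc.WarcRecordBuilder(digest_algorithm='sha1', base32=False)
    record = builder.build_warc_record(
            url=b'http://example.com/', data=b'hello warc',
            warc_type=warctools.WarcRecord.METADATA,
            content_type=b'application/octet-stream')
    with open('out.warc', 'wb') as f:
        record.write_to(f, gzip=False)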

272
warcprox/warcprox.py
View File

@ -1,272 +0,0 @@
#!/usr/bin/env python
# vim:set sw=4 et:
#
"""
WARC writing MITM HTTP/S proxy
See README.rst or https://github.com/internetarchive/warcprox
"""
from __future__ import absolute_import
try:
import http.server as http_server
except ImportError:
import BaseHTTPServer as http_server
try:
import socketserver
except ImportError:
import SocketServer as socketserver
try:
import queue
except ImportError:
import Queue as queue
try:
import http.client as http_client
except ImportError:
import httplib as http_client
import logging
import re
import tempfile
import traceback
import hashlib
import json
import socket
from certauth.certauth import CertificateAuthority
import warcprox.mitmproxy
class ProxyingRecorder(object):
"""
Wraps a socket._fileobject, recording the bytes as they are read,
calculating digests, and sending them on to the proxy client.
"""
logger = logging.getLogger("warcprox.warcprox.ProxyingRecorder")
def __init__(self, fp, proxy_dest, digest_algorithm='sha1'):
self.fp = fp
# "The file has no name, and will cease to exist when it is closed."
self.tempfile = tempfile.SpooledTemporaryFile(max_size=512*1024)
self.digest_algorithm = digest_algorithm
self.block_digest = hashlib.new(digest_algorithm)
self.payload_offset = None
self.payload_digest = None
self.proxy_dest = proxy_dest
self._proxy_dest_conn_open = True
self._prev_hunk_last_two_bytes = b''
self.len = 0
def _update_payload_digest(self, hunk):
if self.payload_digest is None:
# convoluted handling of two newlines crossing hunks
# XXX write tests for this
if self._prev_hunk_last_two_bytes.endswith(b'\n'):
if hunk.startswith(b'\n'):
self.payload_digest = hashlib.new(self.digest_algorithm)
self.payload_digest.update(hunk[1:])
self.payload_offset = self.len + 1
elif hunk.startswith(b'\r\n'):
self.payload_digest = hashlib.new(self.digest_algorithm)
self.payload_digest.update(hunk[2:])
self.payload_offset = self.len + 2
elif self._prev_hunk_last_two_bytes == b'\n\r':
if hunk.startswith(b'\n'):
self.payload_digest = hashlib.new(self.digest_algorithm)
self.payload_digest.update(hunk[1:])
self.payload_offset = self.len + 1
else:
m = re.search(br'\n\r?\n', hunk)
if m is not None:
self.payload_digest = hashlib.new(self.digest_algorithm)
self.payload_digest.update(hunk[m.end():])
self.payload_offset = self.len + m.end()
# if we still haven't found start of payload hold on to these bytes
if self.payload_digest is None:
self._prev_hunk_last_two_bytes = hunk[-2:]
else:
self.payload_digest.update(hunk)
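# a quick sketch of the common case above, where the header/payload
# boundary arrives within a single hunk (ad hoc values):
#   import hashlib, re
#   hunk = b'HTTP/1.1 200 OK\r\nContent-Length: 5\r\n\r\nhello'
#   m = re.search(br'\n\r?\n', hunk)
#   payload_digest = hashlib.new('sha1', hunk[m.end():])  # digests b'hello'
#   payload_offset = m.end()  # 38, the length of the header block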
def _update(self, hunk):
self._update_payload_digest(hunk)
self.block_digest.update(hunk)
self.tempfile.write(hunk)
if self._proxy_dest_conn_open:
try:
self.proxy_dest.sendall(hunk)
except BaseException as e:
self._proxy_dest_conn_open = False
self.logger.warn('{} sending data to proxy client'.format(e))
self.logger.info('will continue downloading from remote server without sending to client')
self.len += len(hunk)
def read(self, size=-1):
hunk = self.fp.read(size)
self._update(hunk)
return hunk
def readinto(self, b):
n = self.fp.readinto(b)
self._update(b[:n])
return n
def readline(self, size=-1):
# XXX depends on implementation details of self.fp.readline(), in
# particular that it doesn't call self.fp.read()
hunk = self.fp.readline(size)
self._update(hunk)
return hunk
def close(self):
return self.fp.close()
def __len__(self):
return self.len
def payload_size(self):
if self.payload_offset is not None:
return self.len - self.payload_offset
else:
return 0
class ProxyingRecordingHTTPResponse(http_client.HTTPResponse):
def __init__(self, sock, debuglevel=0, method=None, proxy_dest=None, digest_algorithm='sha1'):
http_client.HTTPResponse.__init__(self, sock, debuglevel=debuglevel, method=method)
# Keep around extra reference to self.fp because HTTPResponse sets
# self.fp=None after it finishes reading, but we still need it
self.recorder = ProxyingRecorder(self.fp, proxy_dest, digest_algorithm)
self.fp = self.recorder
class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
logger = logging.getLogger("warcprox.warcprox.WarcProxyHandler")
def _proxy_request(self):
# Build request
req_str = '{} {} {}\r\n'.format(self.command, self.path, self.request_version)
warcprox_meta = self.headers.get('Warcprox-Meta')
# Swallow headers that don't make sense to forward on, i.e. most
# hop-by-hop headers, see http://tools.ietf.org/html/rfc2616#section-13.5
# self.headers is an email.message.Message, which is case-insensitive
# and doesn't throw KeyError in __delitem__
for h in ('Connection', 'Proxy-Connection', 'Keep-Alive',
'Proxy-Authenticate', 'Proxy-Authorization', 'Upgrade',
'Warcprox-Meta'):
del self.headers[h]
# Add headers to the request
# XXX in at least python3.3 str(self.headers) uses \n not \r\n :(
req_str += '\r\n'.join('{}: {}'.format(k,v) for (k,v) in self.headers.items())
req = req_str.encode('utf-8') + b'\r\n\r\n'
# Append message body if present to the request
if 'Content-Length' in self.headers:
req += self.rfile.read(int(self.headers['Content-Length']))
self.logger.debug('req={}'.format(repr(req)))
# Send it down the pipe!
self._proxy_sock.sendall(req)
# We want HTTPResponse's smarts about http and handling of
# non-compliant servers. But HTTPResponse.read() doesn't return the raw
# bytes read from the server, it unchunks them if they're chunked, and
# might do other stuff. We want to send the raw bytes back to the
# client. So we ignore the values returned by h.read() below. Instead
# the ProxyingRecordingHTTPResponse takes care of sending the raw bytes
# to the proxy client.
# Proxy and record the response
h = ProxyingRecordingHTTPResponse(self._proxy_sock,
proxy_dest=self.connection,
digest_algorithm=self.server.digest_algorithm)
h.begin()
buf = h.read(8192)
while buf != b'':
buf = h.read(8192)
self.log_request(h.status, h.recorder.len)
remote_ip = self._proxy_sock.getpeername()[0]
# Let's close off the remote end
h.close()
self._proxy_sock.close()
recorded_url = RecordedUrl(url=self.url, request_data=req,
response_recorder=h.recorder, remote_ip=remote_ip,
warcprox_meta=warcprox_meta)
self.server.recorded_url_q.put(recorded_url)
return recorded_url
class RecordedUrl(object):
def __init__(self, url, request_data, response_recorder, remote_ip, warcprox_meta=None):
# XXX should test what happens with non-ascii url (when does
# url-encoding happen?)
if type(url) is not bytes:
self.url = url.encode('ascii')
else:
self.url = url
if type(remote_ip) is not bytes:
self.remote_ip = remote_ip.encode('ascii')
else:
self.remote_ip = remote_ip
self.request_data = request_data
self.response_recorder = response_recorder
if warcprox_meta:
self.warcprox_meta = json.loads(warcprox_meta)
else:
self.warcprox_meta = {}
class WarcProxy(socketserver.ThreadingMixIn, http_server.HTTPServer):
logger = logging.getLogger("warcprox.warcprox.WarcProxy")
def __init__(self, server_address=('localhost', 8000),
req_handler_class=WarcProxyHandler, bind_and_activate=True,
ca=None, recorded_url_q=None, digest_algorithm='sha1'):
http_server.HTTPServer.__init__(self, server_address, req_handler_class, bind_and_activate)
self.digest_algorithm = digest_algorithm
if ca is not None:
self.ca = ca
else:
ca_name = 'Warcprox CA on {}'.format(socket.gethostname())[:64]
self.ca = CertificateAuthority(ca_file='warcprox-ca.pem',
certs_dir='./warcprox-ca',
ca_name=ca_name)
if recorded_url_q is not None:
self.recorded_url_q = recorded_url_q
else:
self.recorded_url_q = queue.Queue()
def server_activate(self):
http_server.HTTPServer.server_activate(self)
self.logger.info('WarcProxy listening on {0}:{1}'.format(self.server_address[0], self.server_address[1]))
def server_close(self):
self.logger.info('WarcProxy shutting down')
http_server.HTTPServer.server_close(self)

415
warcprox/warcproxy.py Normal file
View File

@ -0,0 +1,415 @@
'''
warcprox/warcproxy.py - recording proxy, extends mitmproxy to record traffic,
enqueue info on the recorded url queue
Copyright (C) 2013-2016 Internet Archive
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
USA.
'''
from __future__ import absolute_import
try:
import http.server as http_server
except ImportError:
import BaseHTTPServer as http_server
try:
import socketserver
except ImportError:
import SocketServer as socketserver
try:
import queue
except ImportError:
import Queue as queue
import logging
import re
import traceback
import json
import socket
from hanzo import warctools
from certauth.certauth import CertificateAuthority
import warcprox
import datetime
import ipaddress
import surt
class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
'''
XXX add more information.
Among other things, this class enforces limits specified in the
Warcprox-Meta request header. If a limit is deemed to have been reached, no
request will be made to the remote destination server. This implementation
detail has implications worth noting. For example, if a limit applies to
"new" (not deduplicated) bytes, and the limit has already been reached, no
request will be made, even if it would have resulted in duplicate content,
which would not count toward the limit. To reiterate, this is because the
limit enforcer does not know that the content would be deduplicated.
'''
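# for reference, an illustrative Warcprox-Meta header exercising the
# enforcement in this class (bucket names and values are hypothetical):
#   Warcprox-Meta: {"limits": {"job1:foo.com/total/urls": 1000},
#                   "soft-limits": {"job1/new/wire_bytes": 500000000},
#                   "blocks": [{"domain": "example.com"}]}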
# self.server is WarcProxy
logger = logging.getLogger("warcprox.warcprox.WarcProxyHandler")
# XXX nearly identical to brozzler.site.Site._scope_rule_applies() but
# there's no obvious common dependency where this code should go... TBD
def _scope_rule_applies(self, rule):
u = warcprox.Url(self.url)
if "domain" in rule and not u.matches_ip_or_domain(rule["domain"]):
return False
if "url_match" in rule:
if rule["url_match"] == "STRING_MATCH":
return u.url.find(rule["value"]) >= 0
elif rule["url_match"] == "REGEX_MATCH":
try:
return re.fullmatch(rule["value"], u.url)
except Exception as e:
self.logger.warn(
"caught exception matching against regex %s: %s",
rule["value"], e)
return False
elif rule["url_match"] == "SURT_MATCH":
return u.surt.startswith(rule["value"])
else:
self.logger.warn("invalid rule.url_match=%s", rule.url_match)
return False
else:
if "domain" in rule:
# we already know that it matches from earlier check
return True
else:
self.logger.warn("unable to make sense of scope rule %s", rule)
return False
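# illustrative rules this method understands (all values hypothetical):
#   {"domain": "example.com"}
#   {"url_match": "STRING_MATCH", "value": "/calendar"}
#   {"url_match": "REGEX_MATCH", "value": "^https?://example\\.com/.*$"}
#   {"url_match": "SURT_MATCH", "value": "http://(com,example,)/"}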
def _enforce_blocks(self, warcprox_meta):
"""
Sends a 403 response and raises warcprox.RequestBlockedByRule if the
url is blocked by a rule in warcprox_meta.
"""
if warcprox_meta and "blocks" in warcprox_meta:
for rule in warcprox_meta["blocks"]:
if self._scope_rule_applies(rule):
body = ("request rejected by warcprox: blocked by "
"rule found in Warcprox-Meta header: %s"
% rule).encode("utf-8")
self.send_response(403, "Forbidden")
self.send_header("Content-Type", "text/plain;charset=utf-8")
self.send_header("Connection", "close")
self.send_header("Content-Length", len(body))
response_meta = {"blocked-by-rule":rule}
self.send_header(
"Warcprox-Meta",
json.dumps(response_meta, separators=(",",":")))
self.end_headers()
if self.command != "HEAD":
self.wfile.write(body)
self.connection.close()
raise warcprox.RequestBlockedByRule(
"%s 403 %s %s -- blocked by rule in Warcprox-Meta "
"request header %s" % (
self.client_address[0], self.command,
self.url, rule))
def _enforce_limit(self, limit_key, limit_value, soft=False):
bucket0, bucket1, bucket2 = limit_key.rsplit("/", 2)
_limit_key = limit_key
# if limit_key looks like 'job1:foo.com/total/urls' then we only want
# to apply this rule if the requested url is within domain
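# (in that example bucket0='job1:foo.com', bucket1='total', bucket2='urls')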
bucket0_fields = bucket0.split(':')
if len(bucket0_fields) == 2:
if not warcprox.host_matches_ip_or_domain(
self.hostname, bucket0_fields[1]):
return # else host matches, go ahead and enforce the limit
bucket0 = '%s:%s' % (
bucket0_fields[0],
warcprox.normalize_host(bucket0_fields[1]))
_limit_key = '%s/%s/%s' % (bucket0, bucket1, bucket2)
value = self.server.stats_db.value(bucket0, bucket1, bucket2)
if value and value >= limit_value:
body = ("request rejected by warcprox: reached %s %s=%s\n" % (
"soft limit" if soft else "limit", _limit_key,
limit_value)).encode("utf-8")
if soft:
self.send_response(430, "Reached soft limit")
else:
self.send_response(420, "Reached limit")
self.send_header("Content-Type", "text/plain;charset=utf-8")
self.send_header("Connection", "close")
self.send_header("Content-Length", len(body))
response_meta = {
"stats": {bucket0:self.server.stats_db.value(bucket0)}
}
if soft:
response_meta["reached-soft-limit"] = {_limit_key:limit_value}
else:
response_meta["reached-limit"] = {_limit_key:limit_value}
self.send_header(
"Warcprox-Meta",
json.dumps(response_meta, separators=(",",":")))
self.end_headers()
if self.command != "HEAD":
self.wfile.write(body)
self.connection.close()
raise warcprox.RequestBlockedByRule(
"%s %s %s %s -- reached %s %s=%s" % (
self.client_address[0], 430 if soft else 420,
self.command, self.url,
"soft limit" if soft else "limit",
_limit_key, limit_value))
def _enforce_limits(self, warcprox_meta):
"""
Sends a 420 (hard limit) or 430 (soft limit) response and raises
warcprox.RequestBlockedByRule if a limit specified in warcprox_meta is
reached.
"""
if warcprox_meta and "limits" in warcprox_meta:
for item in warcprox_meta["limits"].items():
limit_key, limit_value = item
self._enforce_limit(limit_key, limit_value, soft=False)
if warcprox_meta and "soft-limits" in warcprox_meta:
for item in warcprox_meta["soft-limits"].items():
limit_key, limit_value = item
self._enforce_limit(limit_key, limit_value, soft=True)
def _connect_to_remote_server(self):
'''
Wraps MitmProxyHandler._connect_to_remote_server, first enforcing
limits and block rules in the Warcprox-Meta request header, if any.
Raises warcprox.RequestBlockedByRule if a rule has been enforced.
Otherwise calls MitmProxyHandler._connect_to_remote_server, which
initializes self._remote_server_sock.
'''
if 'Warcprox-Meta' in self.headers:
warcprox_meta = json.loads(self.headers['Warcprox-Meta'])
self._enforce_limits(warcprox_meta)
self._enforce_blocks(warcprox_meta)
return warcprox.mitmproxy.MitmProxyHandler._connect_to_remote_server(self)
def _proxy_request(self):
warcprox_meta = None
raw_warcprox_meta = self.headers.get('Warcprox-Meta')
self.logger.log(
warcprox.TRACE, 'request for %s Warcprox-Meta header: %s',
self.url, repr(raw_warcprox_meta))
if raw_warcprox_meta:
warcprox_meta = json.loads(raw_warcprox_meta)
del self.headers['Warcprox-Meta']
remote_ip = self._remote_server_sock.getpeername()[0]
timestamp = datetime.datetime.utcnow()
req, prox_rec_res = warcprox.mitmproxy.MitmProxyHandler._proxy_request(
self)
recorded_url = RecordedUrl(
url=self.url, request_data=req,
response_recorder=prox_rec_res.recorder, remote_ip=remote_ip,
warcprox_meta=warcprox_meta, status=prox_rec_res.status,
size=prox_rec_res.recorder.len,
client_ip=self.client_address[0],
content_type=prox_rec_res.getheader("Content-Type"),
method=self.command, timestamp=timestamp, host=self.hostname,
duration=datetime.datetime.utcnow()-timestamp)
self.server.recorded_url_q.put(recorded_url)
return recorded_url
# deprecated
def do_PUTMETA(self):
'''
Handles a special warcprox PUTMETA request (deprecated). A PUTMETA
request is equivalent to a WARCPROX_WRITE_RECORD request with
WARC-Type: metadata.
'''
self.do_WARCPROX_WRITE_RECORD(warc_type=warctools.WarcRecord.METADATA)
def do_WARCPROX_WRITE_RECORD(self, warc_type=None):
'''
Handles a request with http method WARCPROX_WRITE_RECORD, a special
type of request which tells warcprox to construct a warc record from
the request more or less verbatim, and write it to a warc.
To honor the request, this method creates a RecordedUrl and queues it for
the WarcWriterThread to process. The warc record headers Content-Type
and WARC-Type are taken from the request headers, as is the payload.
Example request:
WARCPROX_WRITE_RECORD screenshot:https://example.com/ HTTP/1.1
WARC-Type: metadata
Content-Type: image/png
Content-Length: 12345
Connection: close
<png image data>
'''
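# a client-side sketch of issuing such a request with the stdlib (host,
# port, and payload are hypothetical):
#   import http.client
#   payload = b'<png image data>'
#   conn = http.client.HTTPConnection('localhost', 8000)
#   conn.putrequest('WARCPROX_WRITE_RECORD', 'screenshot:https://example.com/')
#   conn.putheader('WARC-Type', 'metadata')
#   conn.putheader('Content-Type', 'image/png')
#   conn.putheader('Content-Length', str(len(payload)))
#   conn.endheaders(payload)
#   assert conn.getresponse().status == 204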
try:
self.url = self.path
if ('Content-Length' in self.headers and 'Content-Type' in self.headers
and (warc_type or 'WARC-Type' in self.headers)):
timestamp = datetime.datetime.utcnow()
# stream this?
request_data = self.rfile.read(int(self.headers['Content-Length']))
warcprox_meta = None
raw_warcprox_meta = self.headers.get('Warcprox-Meta')
if raw_warcprox_meta:
warcprox_meta = json.loads(raw_warcprox_meta)
rec_custom = RecordedUrl(url=self.url,
request_data=request_data,
response_recorder=None,
remote_ip=b'',
warcprox_meta=warcprox_meta,
content_type=self.headers['Content-Type'],
custom_type=warc_type or self.headers['WARC-Type'].encode('utf-8'),
status=204, size=len(request_data),
client_ip=self.client_address[0],
method=self.command, timestamp=timestamp)
self.server.recorded_url_q.put(rec_custom)
self.send_response(204, 'OK')
else:
self.send_error(400, 'Bad request')
self.end_headers()
except:
self.logger.error("uncaught exception in do_WARCPROX_WRITE_RECORD", exc_info=True)
raise
def log_message(self, fmt, *args):
# logging better handled elsewhere?
pass
class RecordedUrl:
logger = logging.getLogger("warcprox.warcproxy.RecordedUrl")
def __init__(self, url, request_data, response_recorder, remote_ip,
warcprox_meta=None, content_type=None, custom_type=None,
status=None, size=None, client_ip=None, method=None,
timestamp=None, host=None, duration=None):
# XXX should test what happens with non-ascii url (when does
# url-encoding happen?)
if type(url) is not bytes:
self.url = url.encode('ascii')
else:
self.url = url
if type(remote_ip) is not bytes:
self.remote_ip = remote_ip.encode('ascii')
else:
self.remote_ip = remote_ip
self.request_data = request_data
self.response_recorder = response_recorder
if warcprox_meta:
self.warcprox_meta = warcprox_meta
else:
self.warcprox_meta = {}
self.content_type = content_type
self.mimetype = content_type
if self.mimetype:
n = self.mimetype.find(";")
if n >= 0:
self.mimetype = self.mimetype[:n]
self.custom_type = custom_type
self.status = status
self.size = size
self.client_ip = client_ip
self.method = method
self.timestamp = timestamp
self.host = host
self.duration = duration
class SingleThreadedWarcProxy(http_server.HTTPServer):
logger = logging.getLogger("warcprox.warcproxy.WarcProxy")
def __init__(
self, ca=None, recorded_url_q=None, stats_db=None,
options=warcprox.Options()):
server_address = (
options.address or 'localhost',
options.port if options.port is not None else 8000)
if options.onion_tor_socks_proxy:
try:
host, port = options.onion_tor_socks_proxy.split(':')
WarcProxyHandler.onion_tor_socks_proxy_host = host
WarcProxyHandler.onion_tor_socks_proxy_port = int(port)
except ValueError:
WarcProxyHandler.onion_tor_socks_proxy_host = options.onion_tor_socks_proxy
WarcProxyHandler.onion_tor_socks_proxy_port = None
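# accepts e.g. --onion-tor-socks-proxy=localhost:9050, or a bare hostname
# (illustrative values)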
http_server.HTTPServer.__init__(
self, server_address, WarcProxyHandler, bind_and_activate=True)
self.digest_algorithm = options.digest_algorithm or 'sha1'
if ca is not None:
self.ca = ca
else:
ca_name = 'Warcprox CA on {}'.format(socket.gethostname())[:64]
self.ca = CertificateAuthority(ca_file='warcprox-ca.pem',
certs_dir='./warcprox-ca',
ca_name=ca_name)
if recorded_url_q is not None:
self.recorded_url_q = recorded_url_q
else:
self.recorded_url_q = queue.Queue(maxsize=options.queue_size or 1000)
self.stats_db = stats_db
self.options = options
class WarcProxy(SingleThreadedWarcProxy, warcprox.mitmproxy.PooledMitmProxy):
logger = logging.getLogger("warcprox.warcproxy.WarcProxy")
def __init__(
self, ca=None, recorded_url_q=None, stats_db=None,
options=warcprox.Options()):
if options.max_threads:
self.logger.info(
"max_threads=%s set by command line option",
options.max_threads)
warcprox.mitmproxy.PooledMitmProxy.__init__(self, options.max_threads)
SingleThreadedWarcProxy.__init__(
self, ca, recorded_url_q, stats_db, options)
def server_activate(self):
http_server.HTTPServer.server_activate(self)
self.logger.info(
'listening on %s:%s', self.server_address[0],
self.server_address[1])
def server_close(self):
self.logger.info('shutting down')
http_server.HTTPServer.server_close(self)
def handle_error(self, request, client_address):
self.logger.warn(
"exception processing request %s from %s", request,
client_address, exc_info=True)

301
warcprox/warcwriter.py
View File

@ -1,301 +0,0 @@
# vim:set sw=4 et:
from __future__ import absolute_import
try:
import queue
except ImportError:
import Queue as queue
import logging
import threading
import os
import hashlib
import time
import socket
import base64
from datetime import datetime
import hanzo.httptools
from hanzo import warctools
import warcprox
class WarcWriter:
logger = logging.getLogger("warcprox.warcwriter.WarcWriter")
# port is only used for warc filename
def __init__(self, directory='./warcs', rollover_size=1000000000,
gzip=False, prefix='WARCPROX', port=0,
digest_algorithm='sha1', base32=False, dedup_db=None,
playback_index_db=None):
self.rollover_size = rollover_size
self.gzip = gzip
self.digest_algorithm = digest_algorithm
self.base32 = base32
self.dedup_db = dedup_db
self.playback_index_db = playback_index_db
# warc path and filename stuff
self.directory = directory
self.prefix = prefix
self.port = port
self._f = None
self._fpath = None
self._serial = 0
if not os.path.exists(directory):
self.logger.info("warc destination directory {} doesn't exist, creating it".format(directory))
os.mkdir(directory)
# returns a tuple (principal_record, request_record) where principal_record is either a response or revisit record
def build_warc_records(self, recorded_url):
warc_date = warctools.warc.warc_datetime_str(datetime.utcnow())
dedup_info = None
if self.dedup_db is not None and recorded_url.response_recorder.payload_digest is not None:
key = self.digest_str(recorded_url.response_recorder.payload_digest)
dedup_info = self.dedup_db.lookup(key)
if dedup_info is not None:
# revisit record
recorded_url.response_recorder.tempfile.seek(0)
if recorded_url.response_recorder.payload_offset is not None:
response_header_block = recorded_url.response_recorder.tempfile.read(recorded_url.response_recorder.payload_offset)
else:
response_header_block = recorded_url.response_recorder.tempfile.read()
principal_record = self.build_warc_record(
url=recorded_url.url, warc_date=warc_date,
data=response_header_block,
warc_type=warctools.WarcRecord.REVISIT,
refers_to=dedup_info['i'],
refers_to_target_uri=dedup_info['u'],
refers_to_date=dedup_info['d'],
payload_digest=self.digest_str(recorded_url.response_recorder.payload_digest),
profile=warctools.WarcRecord.PROFILE_IDENTICAL_PAYLOAD_DIGEST,
content_type=hanzo.httptools.ResponseMessage.CONTENT_TYPE,
remote_ip=recorded_url.remote_ip)
else:
# response record
principal_record = self.build_warc_record(
url=recorded_url.url, warc_date=warc_date,
recorder=recorded_url.response_recorder,
warc_type=warctools.WarcRecord.RESPONSE,
content_type=hanzo.httptools.ResponseMessage.CONTENT_TYPE,
remote_ip=recorded_url.remote_ip)
request_record = self.build_warc_record(
url=recorded_url.url, warc_date=warc_date,
data=recorded_url.request_data,
warc_type=warctools.WarcRecord.REQUEST,
content_type=hanzo.httptools.RequestMessage.CONTENT_TYPE,
concurrent_to=principal_record.id)
return principal_record, request_record
def digest_str(self, hash_obj):
return hash_obj.name.encode('utf-8') + b':' + (base64.b32encode(hash_obj.digest()) if self.base32 else hash_obj.hexdigest().encode('ascii'))
def build_warc_record(self, url, warc_date=None, recorder=None, data=None,
concurrent_to=None, warc_type=None, content_type=None, remote_ip=None,
profile=None, refers_to=None, refers_to_target_uri=None,
refers_to_date=None, payload_digest=None):
if warc_date is None:
warc_date = warctools.warc.warc_datetime_str(datetime.utcnow())
record_id = warctools.WarcRecord.random_warc_uuid()
headers = []
if warc_type is not None:
headers.append((warctools.WarcRecord.TYPE, warc_type))
headers.append((warctools.WarcRecord.ID, record_id))
headers.append((warctools.WarcRecord.DATE, warc_date))
headers.append((warctools.WarcRecord.URL, url))
if remote_ip is not None:
headers.append((warctools.WarcRecord.IP_ADDRESS, remote_ip))
if profile is not None:
headers.append((warctools.WarcRecord.PROFILE, profile))
if refers_to is not None:
headers.append((warctools.WarcRecord.REFERS_TO, refers_to))
if refers_to_target_uri is not None:
headers.append((warctools.WarcRecord.REFERS_TO_TARGET_URI, refers_to_target_uri))
if refers_to_date is not None:
headers.append((warctools.WarcRecord.REFERS_TO_DATE, refers_to_date))
if concurrent_to is not None:
headers.append((warctools.WarcRecord.CONCURRENT_TO, concurrent_to))
if content_type is not None:
headers.append((warctools.WarcRecord.CONTENT_TYPE, content_type))
if payload_digest is not None:
headers.append((warctools.WarcRecord.PAYLOAD_DIGEST, payload_digest))
if recorder is not None:
headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(recorder)).encode('latin1')))
headers.append((warctools.WarcRecord.BLOCK_DIGEST,
self.digest_str(recorder.block_digest)))
if recorder.payload_digest is not None:
headers.append((warctools.WarcRecord.PAYLOAD_DIGEST,
self.digest_str(recorder.payload_digest)))
recorder.tempfile.seek(0)
record = warctools.WarcRecord(headers=headers, content_file=recorder.tempfile)
else:
headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(data)).encode('latin1')))
block_digest = hashlib.new(self.digest_algorithm, data)
headers.append((warctools.WarcRecord.BLOCK_DIGEST,
self.digest_str(block_digest)))
content_tuple = content_type, data
record = warctools.WarcRecord(headers=headers, content=content_tuple)
return record
def timestamp17(self):
now = datetime.utcnow()
return '{:%Y%m%d%H%M%S}{:03d}'.format(now, now.microsecond//1000)
def close_writer(self):
if self._fpath:
self.logger.info('closing {0}'.format(self._f_finalname))
self._f.close()
finalpath = os.path.sep.join([self.directory, self._f_finalname])
os.rename(self._fpath, finalpath)
self._fpath = None
self._f = None
def _build_warcinfo_record(self, filename):
warc_record_date = warctools.warc.warc_datetime_str(datetime.utcnow())
record_id = warctools.WarcRecord.random_warc_uuid()
headers = []
headers.append((warctools.WarcRecord.ID, record_id))
headers.append((warctools.WarcRecord.TYPE, warctools.WarcRecord.WARCINFO))
headers.append((warctools.WarcRecord.FILENAME, filename.encode('latin1')))
headers.append((warctools.WarcRecord.DATE, warc_record_date))
warcinfo_fields = []
warcinfo_fields.append(b'software: warcprox ' + warcprox.version_bytes)
hostname = socket.gethostname()
warcinfo_fields.append('hostname: {}'.format(hostname).encode('latin1'))
warcinfo_fields.append('ip: {0}'.format(socket.gethostbyname(hostname)).encode('latin1'))
warcinfo_fields.append(b'format: WARC File Format 1.0')
# warcinfo_fields.append('robots: ignore')
# warcinfo_fields.append('description: {0}'.format(self.description))
# warcinfo_fields.append('isPartOf: {0}'.format(self.is_part_of))
data = b'\r\n'.join(warcinfo_fields) + b'\r\n'
record = warctools.WarcRecord(headers=headers, content=(b'application/warc-fields', data))
return record
# <!-- <property name="template" value="${prefix}-${timestamp17}-${serialno}-${heritrix.pid}~${heritrix.hostname}~${heritrix.port}" /> -->
def _writer(self):
if self._fpath and os.path.getsize(self._fpath) > self.rollover_size:
self.close_writer()
if self._f == None:
self._f_finalname = '{}-{}-{:05d}-{}-{}-{}.warc{}'.format(
self.prefix, self.timestamp17(), self._serial, os.getpid(),
socket.gethostname(), self.port, '.gz' if self.gzip else '')
self._fpath = os.path.sep.join([self.directory, self._f_finalname + '.open'])
self._f = open(self._fpath, 'wb')
warcinfo_record = self._build_warcinfo_record(self._f_finalname)
self.logger.debug('warcinfo_record.headers={}'.format(warcinfo_record.headers))
warcinfo_record.write_to(self._f, gzip=self.gzip)
self._serial += 1
return self._f
def _final_tasks(self, recorded_url, recordset, recordset_offset):
if (self.dedup_db is not None
and recordset[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.RESPONSE
and recorded_url.response_recorder.payload_size() > 0):
key = self.digest_str(recorded_url.response_recorder.payload_digest)
self.dedup_db.save(key, recordset[0], recordset_offset)
if self.playback_index_db is not None:
self.playback_index_db.save(self._f_finalname, recordset, recordset_offset)
recorded_url.response_recorder.tempfile.close()
def write_records(self, recorded_url):
recordset = self.build_warc_records(recorded_url)
writer = self._writer()
recordset_offset = writer.tell()
for record in recordset:
offset = writer.tell()
record.write_to(writer, gzip=self.gzip)
self.logger.debug('wrote warc record: warc_type={} content_length={} url={} warc={} offset={}'.format(
record.get_header(warctools.WarcRecord.TYPE),
record.get_header(warctools.WarcRecord.CONTENT_LENGTH),
record.get_header(warctools.WarcRecord.URL),
self._fpath, offset))
self._f.flush()
self._final_tasks(recorded_url, recordset, recordset_offset)
class WarcWriterThread(threading.Thread):
logger = logging.getLogger("warcprox.warcwriter.WarcWriterThread")
def __init__(self, recorded_url_q=None, warc_writer=None, rollover_idle_time=None):
"""recorded_url_q is a queue.Queue of warcprox.warcprox.RecordedUrl."""
threading.Thread.__init__(self, name='WarcWriterThread')
self.recorded_url_q = recorded_url_q
self.rollover_idle_time = rollover_idle_time
self.stop = threading.Event()
if warc_writer:
self.warc_writer = warc_writer
else:
self.warc_writer = WarcWriter()
def run(self):
self.logger.info('WarcWriterThread starting, directory={} gzip={} rollover_size={} rollover_idle_time={} prefix={} port={}'.format(
os.path.abspath(self.warc_writer.directory), self.warc_writer.gzip, self.warc_writer.rollover_size,
self.rollover_idle_time, self.warc_writer.prefix, self.warc_writer.port))
self._last_sync = self._last_activity = time.time()
while not self.stop.is_set():
try:
recorded_url = self.recorded_url_q.get(block=True, timeout=0.5)
self.logger.info("recorded_url.warcprox_meta={} for {}".format(recorded_url.warcprox_meta, recorded_url.url))
self.warc_writer.write_records(recorded_url)
self._last_activity = time.time()
except queue.Empty:
if (self.warc_writer._fpath is not None
and self.rollover_idle_time is not None
and self.rollover_idle_time > 0
and time.time() - self._last_activity > self.rollover_idle_time):
self.logger.debug('rolling over warc file after {} seconds idle'.format(time.time() - self._last_activity))
self.warc_writer.close_writer()
if time.time() - self._last_sync > 60:
if self.warc_writer.dedup_db:
self.warc_writer.dedup_db.sync()
if self.warc_writer.playback_index_db:
self.warc_writer.playback_index_db.sync()
self._last_sync = time.time()
self.logger.info('WarcWriterThread shutting down')
self.warc_writer.close_writer();

168
warcprox/writer.py Normal file
View File

@ -0,0 +1,168 @@
#
# warcprox/writer.py - warc writer, manages and writes records to warc files
#
# Copyright (C) 2013-2016 Internet Archive
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
# USA.
#
from __future__ import absolute_import
import logging
from datetime import datetime
from hanzo import warctools
import time
import warcprox
import os
import socket
import string
import random
class WarcWriter:
logger = logging.getLogger('warcprox.writer.WarcWriter')
def __init__(self, options=warcprox.Options()):
self.rollover_size = options.rollover_size or 1000000000
self.rollover_idle_time = options.rollover_idle_time or None
self._last_activity = time.time()
self.gzip = options.gzip or False
digest_algorithm = options.digest_algorithm or 'sha1'
base32 = options.base32
self.record_builder = warcprox.warc.WarcRecordBuilder(digest_algorithm=digest_algorithm, base32=base32)
# warc path and filename stuff
self.directory = options.directory or './warcs'
self.prefix = options.prefix or 'warcprox'
self._f = None
self._fpath = None
self._f_finalname = None
self._serial = 0
self._randomtoken = "".join(random.Random().sample(string.digits + string.ascii_lowercase, 8))
if not os.path.exists(self.directory):
self.logger.info("warc destination directory {} doesn't exist, creating it".format(self.directory))
os.mkdir(self.directory)
def timestamp17(self):
now = datetime.utcnow()
return '{:%Y%m%d%H%M%S}{:03d}'.format(now, now.microsecond//1000)
def close_writer(self):
if self._fpath:
self.logger.info('closing {0}'.format(self._f_finalname))
self._f.close()
finalpath = os.path.sep.join([self.directory, self._f_finalname])
os.rename(self._fpath, finalpath)
self._fpath = None
self._f = None
# h3 default <!-- <property name="template" value="${prefix}-${timestamp17}-${serialno}-${heritrix.pid}~${heritrix.hostname}~${heritrix.port}" /> -->
# ${prefix}-${timestamp17}-${serialno}-${randomtoken}.warc.gz
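# e.g. warcprox-20161019143059123-00000-3f2a9bcd.warc.gz (illustrative)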
def _writer(self):
if self._fpath and os.path.getsize(self._fpath) > self.rollover_size:
self.close_writer()
if self._f is None:
self._f_finalname = '{}-{}-{:05d}-{}.warc{}'.format(
self.prefix, self.timestamp17(), self._serial, self._randomtoken, '.gz' if self.gzip else '')
self._fpath = os.path.sep.join([self.directory, self._f_finalname + '.open'])
self._f = open(self._fpath, 'wb')
warcinfo_record = self.record_builder.build_warcinfo_record(self._f_finalname)
self.logger.debug('warcinfo_record.headers={}'.format(warcinfo_record.headers))
warcinfo_record.write_to(self._f, gzip=self.gzip)
self._serial += 1
return self._f
def write_records(self, recorded_url):
"""Returns tuple of records written, which are instances of
hanzo.warctools.warc.WarcRecord, decorated with "warc_filename" and
"offset" attributes."""
records = self.record_builder.build_warc_records(recorded_url)
writer = self._writer()
recordset_offset = writer.tell()
for record in records:
offset = writer.tell()
record.write_to(writer, gzip=self.gzip)
record.offset = offset
record.length = writer.tell() - offset
record.warc_filename = self._f_finalname
self.logger.debug('wrote warc record: warc_type=%s content_length=%s url=%s warc=%s offset=%d',
record.get_header(warctools.WarcRecord.TYPE),
record.get_header(warctools.WarcRecord.CONTENT_LENGTH),
record.get_header(warctools.WarcRecord.URL),
self._fpath, record.offset)
self._f.flush()
self._last_activity = time.time()
return records
def maybe_idle_rollover(self):
if (self._fpath is not None
and self.rollover_idle_time is not None
and self.rollover_idle_time > 0
and time.time() - self._last_activity > self.rollover_idle_time):
self.logger.debug('rolling over {} after {} seconds idle'.format(self._f_finalname, time.time() - self._last_activity))
self.close_writer()
class WarcWriterPool:
logger = logging.getLogger("warcprox.writer.WarcWriterPool")
def __init__(self, options=warcprox.Options()):
self.default_warc_writer = WarcWriter(options=options)
self.warc_writers = {} # {prefix:WarcWriter}
self._last_sync = time.time()
self.options = options
# chooses writer for filename specified by warcprox_meta["warc-prefix"] if set
def _writer(self, recorded_url):
w = self.default_warc_writer
if recorded_url.warcprox_meta and "warc-prefix" in recorded_url.warcprox_meta:
# self.logger.info("recorded_url.warcprox_meta={} for {}".format(recorded_url.warcprox_meta, recorded_url.url))
options = warcprox.Options(**vars(self.options))
options.prefix = recorded_url.warcprox_meta["warc-prefix"]
if not options.prefix in self.warc_writers:
self.warc_writers[options.prefix] = WarcWriter(options=options)
w = self.warc_writers[options.prefix]
return w
def write_records(self, recorded_url):
"""Returns tuple of records written, which are instances of
hanzo.warctools.warc.WarcRecord, decorated with "warc_filename" and
"offset" attributes."""
return self._writer(recorded_url).write_records(recorded_url)
def maybe_idle_rollover(self):
self.default_warc_writer.maybe_idle_rollover()
for w in self.warc_writers.values():
w.maybe_idle_rollover()
def close_writers(self):
self.default_warc_writer.close_writer()
for w in self.warc_writers.values():
w.close_writer()
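
A minimal sketch of driving the pool directly (the Options fields and the
recorded_url stand-in are assumptions for illustration):

    import warcprox
    import warcprox.writer

    options = warcprox.Options(directory='./warcs', prefix='WARCPROX', gzip=True)
    pool = warcprox.writer.WarcWriterPool(options=options)
    # given some warcprox.warcproxy.RecordedUrl instance `recorded_url`:
    # a Warcprox-Meta {"warc-prefix": "job1"} routes it to job1-*.warc.gz,
    # everything else to WARCPROX-*.warc.gz
    records = pool.write_records(recorded_url)
    pool.close_writers()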

122
warcprox/writerthread.py Normal file
View File

@ -0,0 +1,122 @@
#
# warcprox/writerthread.py - warc writer thread, reads from the recorded url
# queue, writes warc records, runs final tasks after warc records are written
#
# Copyright (C) 2013-2016 Internet Archive
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
# USA.
#
from __future__ import absolute_import
try:
import queue
except ImportError:
import Queue as queue
import logging
import threading
import os
import hashlib
import time
import socket
import base64
from datetime import datetime
import hanzo.httptools
from hanzo import warctools
import warcprox
import cProfile
class WarcWriterThread(threading.Thread):
logger = logging.getLogger("warcprox.warcproxwriter.WarcWriterThread")
def __init__(self, recorded_url_q=None, writer_pool=None, dedup_db=None, listeners=None, options=warcprox.Options()):
"""recorded_url_q is a queue.Queue of warcprox.warcprox.RecordedUrl."""
threading.Thread.__init__(self, name='WarcWriterThread')
self.recorded_url_q = recorded_url_q
self.stop = threading.Event()
if writer_pool:
self.writer_pool = writer_pool
else:
self.writer_pool = WarcWriterPool()
self.dedup_db = dedup_db
self.listeners = listeners
self.options = options
self.idle = None
def run(self):
if self.options.profile:
cProfile.runctx('self._run()', globals(), locals(), sort='cumulative')
else:
self._run()
def _run(self):
while not self.stop.is_set():
try:
self.name = 'WarcWriterThread(tid={})'.format(warcprox.gettid())
while True:
try:
if self.stop.is_set():
qsize = self.recorded_url_q.qsize()
if qsize % 50 == 0:
self.logger.info("%s urls left to write", qsize)
recorded_url = self.recorded_url_q.get(block=True, timeout=0.5)
self.idle = None
if self.dedup_db:
warcprox.dedup.decorate_with_dedup_info(self.dedup_db,
recorded_url, base32=self.options.base32)
records = self.writer_pool.write_records(recorded_url)
self._final_tasks(recorded_url, records)
# try to release resources in a timely fashion
if recorded_url.response_recorder and recorded_url.response_recorder.tempfile:
recorded_url.response_recorder.tempfile.close()
except queue.Empty:
if self.stop.is_set():
break
self.idle = time.time()
self.writer_pool.maybe_idle_rollover()
self.logger.info('WarcWriterThread shutting down')
self.writer_pool.close_writers()
except:
self.logger.critical("WarcWriterThread will try to continue after unexpected error", exc_info=True)
time.sleep(0.5)
# closest thing we have to heritrix crawl log at the moment
def _log(self, recorded_url, records):
try:
payload_digest = records[0].get_header(warctools.WarcRecord.PAYLOAD_DIGEST).decode("utf-8")
except:
payload_digest = "-"
# 2015-07-17T22:32:23.672Z 1 58 dns:www.dhss.delaware.gov P http://www.dhss.delaware.gov/dhss/ text/dns #045 20150717223214881+316 sha1:63UTPB7GTWIHAGIK3WWL76E57BBTJGAK http://www.dhss.delaware.gov/dhss/ - {"warcFileOffset":2964,"warcFilename":"ARCHIVEIT-1303-WEEKLY-JOB165158-20150717223222113-00000.warc.gz"}
self.logger.info("{} {} {} {} {} size={} {} {} {} offset={}".format(
recorded_url.client_ip, recorded_url.status, recorded_url.method,
recorded_url.url.decode("utf-8"), recorded_url.mimetype,
recorded_url.size, payload_digest, records[0].type.decode("utf-8"),
records[0].warc_filename, records[0].offset))
def _final_tasks(self, recorded_url, records):
if self.listeners:
for listener in self.listeners:
try:
listener.notify(recorded_url, records)
except:
self.logger.error('%s raised exception',
listener.notify, exc_info=True)
self._log(recorded_url, records)
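
For reference, a minimal listener compatible with _final_tasks above (the
class name is hypothetical; the warc_filename/offset attributes come from
the decoration done in warcprox/writer.py):

    class WarcWrittenListener:
        def notify(self, recorded_url, records):
            # records are hanzo warctools WarcRecords, decorated with
            # warc_filename and offset by WarcWriter.write_records()
            print('%s written to %s at offset %s' % (
                recorded_url.url, records[0].warc_filename, records[0].offset))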