Mirror of https://github.com/internetarchive/warcprox.git (synced 2025-01-18 13:22:09 +01:00)

commit de3c81fdc8

.gitignore (vendored): 1 line changed
@@ -11,3 +11,4 @@ warcs
 build
 dist
 .tox
+out.*
.travis.yml: 41 lines changed
@@ -1,21 +1,36 @@
 # vim: set sw=4 et:
 #
 # tox approach stolen from
 # https://github.com/pypa/pip/blob/abdb597dbfb51b21cc76c1cff068b72c80f3a77d/.travis.yml
 #
 
 language: python
 python:
+- 3.5
 - 3.4
 - 2.7
+- nightly
+- pypy
+- pypy3
 
-env:
-- TOXENV=py27
-- TOXENV=py34
+matrix:
+  allow_failures:
+  - python: pypy
+  - python: pypy3
+
+addons:
+  apt:
+    packages:
+    - python-gdbm
+    - python3-gdbm
+    - tor
+
+services:
+- docker
+
+before_install:
+- sudo apt-get update
+- sudo apt-get -y install python-gdbm python3-gdbm
+- sudo service docker restart ; sleep 10  # https://github.com/travis-ci/travis-ci/issues/4778
+- docker run -d --publish=28015:28015 rethinkdb
 
 before_script:
-- pip install tox
+- pip install . pytest requests
 
-script: tox
+script:
+- py.test -v -s tests
+- py.test -v -s --rethinkdb-servers=localhost tests
+- py.test -v -s --rethinkdb-servers=localhost --rethinkdb-big-table tests
README.rst: 145 lines changed
@@ -1,15 +1,11 @@
 warcprox - WARC writing MITM HTTP/S proxy
 -----------------------------------------
-.. image:: https://travis-ci.org/internetarchive/warcprox.png?branch=master
+.. image:: https://travis-ci.org/internetarchive/warcprox.png?branch=master
+   :target: https://travis-ci.org/internetarchive/warcprox
 
 Based on the excellent and simple pymiproxy by Nadeem Douba.
 https://github.com/allfro/pymiproxy
 
-License: because pymiproxy is GPL and warcprox is a derivative work of
-pymiproxy, warcprox is also GPL.
-
 
 Install
 ~~~~~~~
@@ -19,6 +15,7 @@ To install latest release run:
 
 ::
 
+    # apt-get install libffi-dev libssl-dev python3-gdbm
     pip install warcprox
 
 You can also install the latest bleeding edge code:
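The command for that falls outside this hunk. A plausible version of it, assuming the usual pip-from-git workflow (the exact invocation is not shown in this diff), is:

::

    pip install git+https://github.com/internetarchive/warcprox.git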
@@ -45,10 +42,15 @@ Usage
 
     usage: warcprox [-h] [-p PORT] [-b ADDRESS] [-c CACERT]
                     [--certs-dir CERTS_DIR] [-d DIRECTORY] [-z] [-n PREFIX]
                     [-s SIZE] [--rollover-idle-time ROLLOVER_IDLE_TIME]
-                    [-g DIGEST_ALGORITHM] [--base32] [-j DEDUP_DB_FILE]
-                    [-P PLAYBACK_PORT]
-                    [--playback-index-db-file PLAYBACK_INDEX_DB_FILE] [--version]
-                    [-v] [-q]
+                    [-g DIGEST_ALGORITHM] [--base32]
+                    [--stats-db-file STATS_DB_FILE] [-P PLAYBACK_PORT]
+                    [--playback-index-db-file PLAYBACK_INDEX_DB_FILE]
+                    [-j DEDUP_DB_FILE | --rethinkdb-servers RETHINKDB_SERVERS]
+                    [--rethinkdb-db RETHINKDB_DB] [--rethinkdb-big-table]
+                    [--kafka-broker-list KAFKA_BROKER_LIST]
+                    [--kafka-capture-feed-topic KAFKA_CAPTURE_FEED_TOPIC]
+                    [--onion-tor-socks-proxy ONION_TOR_SOCKS_PROXY]
+                    [--version] [-v] [-q]
 
     warcprox - WARC writing MITM HTTP/S proxy
@@ -58,84 +60,91 @@ Usage
   -b ADDRESS, --address ADDRESS
                         address to listen on (default: localhost)
   -c CACERT, --cacert CACERT
-                        CA certificate file; if file does not exist, it will
-                        be created (default: ./desktop-nlevitt-warcprox-
-                        ca.pem)
+                        CA certificate file; if file does not exist, it
+                        will be created (default: ./MacBook-Pro.local-
+                        warcprox-ca.pem)
   --certs-dir CERTS_DIR
                         where to store and load generated certificates
-                        (default: ./desktop-nlevitt-warcprox-ca)
+                        (default: ./MacBook-Pro.local-warcprox-ca)
   -d DIRECTORY, --dir DIRECTORY
                         where to write warcs (default: ./warcs)
-  -z, --gzip            write gzip-compressed warc records (default: False)
+  -z, --gzip            write gzip-compressed warc records (default:
+                        False)
   -n PREFIX, --prefix PREFIX
                         WARC filename prefix (default: WARCPROX)
-  -s SIZE, --size SIZE  WARC file rollover size threshold in bytes (default:
-                        1000000000)
+  -s SIZE, --size SIZE  WARC file rollover size threshold in bytes
+                        (default: 1000000000)
   --rollover-idle-time ROLLOVER_IDLE_TIME
-                        WARC file rollover idle time threshold in seconds (so
-                        that Friday's last open WARC doesn't sit there all
-                        weekend waiting for more data) (default: None)
+                        WARC file rollover idle time threshold in seconds
+                        (so that Friday's last open WARC doesn't sit there
+                        all weekend waiting for more data) (default: None)
   -g DIGEST_ALGORITHM, --digest-algorithm DIGEST_ALGORITHM
-                        digest algorithm, one of sha384, sha512, md5, sha224,
-                        sha256, sha1 (default: sha1)
+                        digest algorithm, one of sha1, sha256, md5,
+                        sha224, sha512, sha384 (default: sha1)
   --base32              write digests in Base32 instead of hex (default:
                         False)
-  -j DEDUP_DB_FILE, --dedup-db-file DEDUP_DB_FILE
-                        persistent deduplication database file; empty string
-                        or /dev/null disables deduplication (default:
-                        ./warcprox-dedup.db)
+  --stats-db-file STATS_DB_FILE
+                        persistent statistics database file; empty string
+                        or /dev/null disables statistics tracking
+                        (default: ./warcprox-stats.db)
   -P PLAYBACK_PORT, --playback-port PLAYBACK_PORT
-                        port to listen on for instant playback (default: None)
+                        port to listen on for instant playback (default:
+                        None)
   --playback-index-db-file PLAYBACK_INDEX_DB_FILE
-                        playback index database file (only used if --playback-
-                        port is specified) (default: ./warcprox-playback-
-                        index.db)
+                        playback index database file (only used if
+                        --playback-port is specified) (default:
+                        ./warcprox-playback-index.db)
+  -j DEDUP_DB_FILE, --dedup-db-file DEDUP_DB_FILE
+                        persistent deduplication database file; empty
+                        string or /dev/null disables deduplication
+                        (default: ./warcprox-dedup.db)
+  --rethinkdb-servers RETHINKDB_SERVERS
+                        rethinkdb servers, used for dedup and stats if
+                        specified; e.g.
+                        db0.foo.org,db0.foo.org:38015,db1.foo.org
+                        (default: None)
+  --rethinkdb-db RETHINKDB_DB
+                        rethinkdb database name (ignored unless
+                        --rethinkdb-servers is specified) (default:
+                        warcprox)
+  --rethinkdb-big-table
+                        use a big rethinkdb table called "captures",
+                        instead of a small table called "dedup"; table is
+                        suitable for use as index for playback (ignored
+                        unless --rethinkdb-servers is specified) (default:
+                        False)
+  --kafka-broker-list KAFKA_BROKER_LIST
+                        kafka broker list for capture feed (default: None)
+  --kafka-capture-feed-topic KAFKA_CAPTURE_FEED_TOPIC
+                        kafka capture feed topic (default: None)
+  --onion-tor-socks-proxy ONION_TOR_SOCKS_PROXY
+                        host:port of tor socks proxy, used only to connect
+                        to .onion sites (default: None)
   --version             show program's version number and exit
   -v, --verbose
   -q, --quiet
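For example, the new rethinkdb options documented above might be combined like this (a hypothetical invocation; the hostnames are placeholders, as in the help text):

::

    warcprox --dir ./warcs --gzip --base32 \
        --rethinkdb-servers db0.foo.org,db1.foo.org \
        --rethinkdb-db warcprox --rethinkdb-big-table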
 To do
 ~~~~~
 
 * (partly done) integration tests, unit tests
 * (done) url-agnostic deduplication
 * unchunk and/or ungzip before storing payload, or alter request to
   discourage server from chunking/gzipping
 * check certs from proxied website, like browser does, and present
   browser-like warning if appropriate
 * keep statistics, produce reports
 * write cdx while crawling?
 * performance testing
 * (done) base32 sha1 like heritrix?
 * configurable timeouts and stuff
 * evaluate ipv6 support
 * (done) more explicit handling of connection closed exception
   during transfer
 * dns cache?? the system already does a fine job I'm thinking
 * keepalive with remote servers?
 * (done) python3
 * special handling for 304 not-modified (write nothing or write revisit
   record... and/or modify request so server never responds with 304)
 * (done) instant playback on a second proxy port
 * special url for downloading ca cert e.g. http(s)://warcprox./ca.pem
 * special url for other stuff, some status info or something?
-* browser plugin for warcprox mode
-
-  - accept warcprox CA cert only when in warcprox mode
-  - separate temporary cookie store, like incognito
-  - "careful! your activity is being archived" banner
-  - easy switch between archiving and instant playback proxy port
-
-To not do
-^^^^^^^^^
-
-The features below could also be part of warcprox. But maybe they don't
-belong here, since this is a proxy, not a crawler/robot. It can be used
-by a human with a browser, or by something automated, i.e. a robot. My
-feeling is that it's more appropriate to implement these in the robot.
-
-* politeness, i.e. throttle requests per server
-* fetch and obey robots.txt
-* alter user-agent, maybe insert something like "warcprox mitm
-  archiving proxy; +http://archive.org/details/archive.org\_bot"
+
+License
+~~~~~~~
+
+Warcprox is a derivative work of pymiproxy, which is GPL. Thus warcprox is also
+GPL.
+
+Copyright (C) 2012 Cygnos Corporation
+Copyright (C) 2013-2016 Internet Archive
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
benchmarks/requirements.txt: 1 line (new file)

@@ -0,0 +1 @@
aiohttp
benchmarks/run-benchmarks.py: 172 lines (new executable file)
@@ -0,0 +1,172 @@
#!/usr/bin/env python
#
# run-benchmarks.py - some benchmarking code for warcprox
#
# Copyright (C) 2015-2016 Internet Archive
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
# USA.
#

import sys
import aiohttp
import aiohttp.server
import asyncio
import ssl
import tempfile
import OpenSSL.crypto
import OpenSSL.SSL
import random
import os
import threading
import time
import logging
import warcprox.main

logging.basicConfig(stream=sys.stdout, level=logging.INFO,
        format='%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')

def self_signed_cert():
    key = OpenSSL.crypto.PKey()
    key.generate_key(OpenSSL.crypto.TYPE_RSA, 2048)

    cert = OpenSSL.crypto.X509()
    cert.set_serial_number(random.randint(0, 2 ** 64 - 1))
    cert.get_subject().CN = 'localhost'

    cert.set_version(2)
    cert.gmtime_adj_notBefore(0)
    cert.gmtime_adj_notAfter(10 * 365 * 24 * 60 * 60)

    cert.set_issuer(cert.get_subject())
    cert.set_pubkey(key)
    cert.sign(key, "sha1")

    return key, cert

class HttpRequestHandler(aiohttp.server.ServerHttpProtocol):
    @asyncio.coroutine
    def handle_request(self, message, payload):
        response = aiohttp.Response(
            self.writer, 200, http_version=message.version
        )
        n = int(message.path.partition('/')[2])
        response.add_header('Content-Type', 'text/plain')
        # response.add_header('Content-Length', '18')
        response.send_headers()
        for i in range(n):
            response.write(b'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\n')
        yield from response.write_eof()

def run_servers():
    loop.run_forever()

def start_servers():
    loop = asyncio.get_event_loop()
    http = loop.create_server(lambda: HttpRequestHandler(debug=True, keep_alive=75), '127.0.0.1', '8080')
    sslcontext = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
    key, cert = self_signed_cert()
    with tempfile.NamedTemporaryFile(delete=False) as certfile:
        certfile.write(OpenSSL.crypto.dump_privatekey(OpenSSL.SSL.FILETYPE_PEM, key))
        certfile.write(OpenSSL.crypto.dump_certificate(OpenSSL.SSL.FILETYPE_PEM, cert))
    sslcontext.load_cert_chain(certfile.name)
    os.remove(certfile.name)
    https = loop.create_server(lambda: HttpRequestHandler(debug=True, keep_alive=75), '127.0.0.1', '8443', ssl=sslcontext)
    srv = loop.run_until_complete(http)
    srv = loop.run_until_complete(https)
    logging.info('serving on http://127.0.0.1:8080 and https://127.0.0.1:8443')

class AsyncClient(object):
    def __init__(self, proxy=None):
        self.n_urls = 0
        self.n_bytes = 0
        self.proxy = proxy
        if proxy:
            self.connector = aiohttp.connector.ProxyConnector(proxy, verify_ssl=False)
        else:
            self.connector = aiohttp.connector.TCPConnector(verify_ssl=False)

    @asyncio.coroutine
    def read_response(self, r, url):
        # time.sleep(random.random() * 10)
        while True:
            chunk = yield from r.content.read(2**16)
            self.n_bytes += len(chunk)
            if not chunk:
                self.n_urls += 1
                logging.debug("finished reading from %s", url)
                r.close()
                break

    @asyncio.coroutine
    def one_request(self, url):
        logging.debug("issuing request to %s", url)
        r = yield from aiohttp.get(url, connector=self.connector)
        logging.debug("issued request to %s", url)
        yield from self.read_response(r, url)

def benchmark(client):
    try:
        start = time.time()
        tasks_https = [client.one_request('https://localhost:8443/%s' % int(1.1**i)) for i in range(80)]
        asyncio.get_event_loop().run_until_complete(asyncio.wait(tasks_https))
        tasks_http = [client.one_request('http://localhost:8080/%s' % int(1.1**i)) for i in range(80)]
        asyncio.get_event_loop().run_until_complete(asyncio.wait(tasks_http))
    finally:
        finish = time.time()
        logging.info("proxy=%s: %s urls totaling %s bytes in %s seconds", client.proxy, client.n_urls, client.n_bytes, (finish - start))

if __name__ == '__main__':
    args = warcprox.main.parse_args()

    start_servers()

    baseline_client = AsyncClient()
    logging.info("===== baseline benchmark starting (no proxy) =====")
    benchmark(baseline_client)
    logging.info("===== baseline benchmark finished =====")

    # Queue size of 1 makes warcprox behave as though it were synchronous (each
    # request blocks until the warc writer starts working on the last request).
    # This gives us a better sense of sustained max throughput. The
    # asynchronous nature of warcprox helps with bursty traffic, as long as the
    # average throughput stays below the sustained max.
    with tempfile.TemporaryDirectory() as tmpdir:
        args.queue_size = 1
        args.cacert = os.path.join(tmpdir, "benchmark-warcprox-ca.pem")
        args.certs_dir = os.path.join(tmpdir, "benchmark-warcprox-ca")
        args.directory = os.path.join(tmpdir, "warcs")
        args.gzip = True
        args.base32 = True
        args.stats_db_file = os.path.join(tmpdir, "stats.db")
        args.dedup_db_file = os.path.join(tmpdir, "dedup.db")

        warcprox_controller = warcprox.main.init_controller(args)
        warcprox_controller_thread = threading.Thread(target=warcprox_controller.run_until_shutdown)
        warcprox_controller_thread.start()
        proxy = "http://%s:%s" % (args.address, args.port)
        proxied_client = AsyncClient(proxy=proxy)

        logging.info("===== warcprox benchmark starting =====")
        benchmark(proxied_client)
        logging.info("===== warcprox benchmark finished =====")

        warcprox_controller.stop.set()
        warcprox_controller_thread.join()

    asyncio.get_event_loop().stop()
    logging.info("finished")
@@ -1,8 +0,0 @@
-#!/usr/bin/env python
-# vim: set sw=4 et:
-
-from __future__ import absolute_import
-
-import warcprox.main
-
-warcprox.main.main()
setup.py: 83 lines changed
@@ -1,44 +1,57 @@
 #!/usr/bin/env python
-# vim: set sw=4 et:
+'''
+setup.py - setuptools installation configuration for warcprox
+
+Copyright (C) 2013-2016 Internet Archive
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
+USA.
+'''
 
-from setuptools.command.test import test as TestCommand
 import sys
 import setuptools
-
-VERSION_BYTES = b'1.4'
-
-def full_version_bytes():
-    import subprocess, time
-    try:
-        commit_bytes = subprocess.check_output(['git', 'log', '-1', '--pretty=format:%h'])
-
-        t_bytes = subprocess.check_output(['git', 'log', '-1', '--pretty=format:%ct'])
-        t = int(t_bytes.strip().decode('utf-8'))
-        tm = time.gmtime(t)
-        timestamp_utc = time.strftime("%Y%m%d%H%M%S", time.gmtime(t))
-        return VERSION_BYTES + b'-' + timestamp_utc.encode('utf-8') + b'-' + commit_bytes.strip()
-    except subprocess.CalledProcessError:
-        return VERSION_BYTES
-
-version_bytes = full_version_bytes()
-with open('warcprox/version.txt', 'wb') as out:
-    out.write(version_bytes)
-    out.write(b'\n');
+import setuptools.command.test
 
-# special class needs to be added to support the pytest written dump-anydbm tests
-class PyTest(TestCommand):
+class PyTest(setuptools.command.test.test):
     def finalize_options(self):
-        TestCommand.finalize_options(self)
+        setuptools.command.test.test.finalize_options(self)
         self.test_args = []
         self.test_suite = True
     def run_tests(self):
-        #import here, cause outside the eggs aren't loaded
+        # import here, because outside the eggs aren't loaded
         import pytest
         errno = pytest.main(self.test_args)
         sys.exit(errno)
 
-setuptools.setup(name='warcprox',
-        version=version_bytes.decode('utf-8'),
+deps = [
+    'certauth>=1.1.0',
+    'warctools',
+    'kafka-python>=1.0.1',
+    'surt>=0.3b4',
+    'rethinkstuff',
+    'PySocks',
+]
+try:
+    import concurrent.futures
+except:
+    deps.append('futures')
+
+setuptools.setup(
+        name='warcprox',
+        version='2.0b2.dev32',
         description='WARC writing MITM HTTP/S proxy',
        url='https://github.com/internetarchive/warcprox',
        author='Noah Levitt',
@@ -46,13 +59,18 @@ setuptools.setup(name='warcprox',
         long_description=open('README.rst').read(),
         license='GPL',
         packages=['warcprox'],
         package_data={'warcprox':['version.txt']},
-        install_requires=['certauth>=1.1.0', 'warctools>=4.8.3'], # gdbm not in pip :(
-        dependency_links=['git+https://github.com/internetarchive/warctools.git#egg=warctools-4.8.3'],
+        install_requires=deps,
         tests_require=['requests>=2.0.1', 'pytest'], # >=2.0.1 for https://github.com/kennethreitz/requests/pull/1636
         cmdclass = {'test': PyTest},
         test_suite='warcprox.tests',
-        scripts=['bin/dump-anydbm', 'bin/warcprox'],
+        entry_points={
+            'console_scripts': [
+                'warcprox=warcprox.main:main',
+                ('warcprox-ensure-rethinkdb-tables='
+                    'warcprox.main:ensure_rethinkdb_tables'),
+                'dump-anydbm=warcprox.dump_anydbm:main',
+            ],
+        },
         zip_safe=False,
         classifiers=[
             'Development Status :: 5 - Production/Stable',
@@ -60,6 +78,7 @@ setuptools.setup(name='warcprox',
             'License :: OSI Approved :: GNU General Public License (GPL)',
             'Programming Language :: Python :: 2.7',
             'Programming Language :: Python :: 3.4',
+            'Programming Language :: Python :: 3.5',
             'Topic :: Internet :: Proxy Servers',
             'Topic :: Internet :: WWW/HTTP',
             'Topic :: Software Development :: Libraries :: Python Modules',
tests/Dockerfile: 49 lines (new file)
@@ -0,0 +1,49 @@
#
# Dockerfile for warcprox tests
#
# Copyright (C) 2015-2016 Internet Archive
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
# USA.
#

FROM phusion/baseimage
MAINTAINER Noah Levitt <nlevitt@archive.org>

# see https://github.com/stuartpb/rethinkdb-dockerfiles/blob/master/trusty/2.1.3/Dockerfile

ENV LANG=C.UTF-8

RUN apt-get update && apt-get --auto-remove -y dist-upgrade

# Add the RethinkDB repository and public key
# "RethinkDB Packaging <packaging@rethinkdb.com>" http://download.rethinkdb.com/apt/pubkey.gpg
RUN apt-key adv --keyserver pgp.mit.edu --recv-keys 1614552E5765227AEC39EFCFA7E00EF33A8F2399 \
    && echo "deb http://download.rethinkdb.com/apt trusty main" > /etc/apt/sources.list.d/rethinkdb.list \
    && apt-get update && apt-get -y install rethinkdb

RUN mkdir -vp /etc/service/rethinkdb \
    && echo "#!/bin/sh\nrethinkdb --bind 0.0.0.0 --directory /tmp/rethink-data --runuser rethinkdb --rungroup rethinkdb\n" > /etc/service/rethinkdb/run \
    && chmod a+x /etc/service/rethinkdb/run

RUN apt-get -y install python-virtualenv git
RUN apt-get -y install python-gdbm python3-gdbm libpython2.7-dev libpython3.4-dev libffi-dev libssl-dev
RUN pip install devpi-client

RUN apt-get -y install tor
RUN mkdir -vp /etc/service/tor \
    && echo "#!/bin/sh\ntor\n" > /etc/service/tor/run \
    && chmod a+x /etc/service/tor/run
tests/conftest.py: 39 lines (new file)
@@ -0,0 +1,39 @@
#
# tests/conftest.py - command line options for warcprox tests
#
# Copyright (C) 2015-2016 Internet Archive
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
# USA.
#

import pytest

def pytest_addoption(parser):
    parser.addoption('--rethinkdb-servers', dest='rethinkdb_servers',
            help='rethink db servers for dedup, e.g. db0.foo.org,db0.foo.org:38015,db1.foo.org')
    parser.addoption('--rethinkdb-big-table',
            dest='rethinkdb_big_table', action='store_true', default=False,
            help='use a big rethinkdb table called "captures", instead of a small table called "dedup"; table is suitable for use as index for playback (ignored unless --rethinkdb-servers is specified)')

@pytest.fixture(scope="module")
def rethinkdb_servers(request):
    return request.config.getoption("--rethinkdb-servers")

@pytest.fixture(scope="module")
def rethinkdb_big_table(request):
    return request.config.getoption("--rethinkdb-big-table")
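A sketch of how a test module might consume these fixtures (hypothetical code, not part of this commit; pytest resolves the fixture arguments from the options registered above):

    import pytest

    def test_dedup_against_rethinkdb(rethinkdb_servers, rethinkdb_big_table):
        # skip unless the operator supplied --rethinkdb-servers on the command line
        if not rethinkdb_servers:
            pytest.skip('--rethinkdb-servers not supplied')
        servers = rethinkdb_servers.split(',')
        assert len(servers) >= 1
        # ... exercise dedup against `servers`, honoring rethinkdb_big_table ...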
tests/run-tests.sh: 48 lines (new executable file)
@@ -0,0 +1,48 @@
#!/bin/bash
#
# tests/run-tests.sh - Runs tests in a docker container. Also runs a temporary
# instance of rethinkdb inside the container. The tests run with rethinkdb
# features enabled, against that instance of rethinkdb, and also run without
# rethinkdb features enabled. With python 2.7 and 3.4.
#
# Copyright (C) 2015-2016 Internet Archive
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
# USA.
#
# 😬
#

set -e

script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

docker build -t internetarchive/warcprox-tests $script_dir

for python in python2.7 python3.4
do
    docker run --rm --volume="$script_dir/..:/warcprox" internetarchive/warcprox-tests /sbin/my_init -- \
        bash -x -c "cd /tmp && git clone /warcprox && cd /tmp/warcprox \
            && (cd /warcprox && git diff) | patch -p1 \
            && virtualenv -p $python /tmp/venv \
            && source /tmp/venv/bin/activate \
            && pip --log-file /tmp/pip.log install . pytest requests \
            && py.test -s tests \
            && py.test -s --rethinkdb-servers=localhost tests \
            && py.test -s --rethinkdb-servers=localhost --rethinkdb-big-table tests"
done
tests/single-threaded-proxy.py: 102 lines (new executable file)
@@ -0,0 +1,102 @@
#!/usr/bin/env python
"""
tests/single-threaded-proxy.py - single-threaded MITM proxy, useful for
debugging, does not write warcs

Copyright (C) 2015-2016 Internet Archive

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
USA.
"""

from __future__ import absolute_import

import warcprox
import logging
import sys
import argparse
import certauth
import queue
import socket
import os

class FakeQueue(object):
    logger = logging.getLogger("FakeQueue")
    def __init__(self, maxsize=0): pass
    def join(self): pass
    def qsize(self): return 0
    def empty(self): return True
    def full(self): return False
    def get(self, block=True, timeout=None): raise queue.Empty
    def put_nowait(self, item): return self.put(item, block=False)
    def get_nowait(self): return self.get(block=False)
    def put(self, recorded_url, block=True, timeout=None):
        logging.info("{} {} {} {} {} size={} {}".format(
            recorded_url.client_ip, recorded_url.status, recorded_url.method,
            recorded_url.url.decode("utf-8"), recorded_url.mimetype,
            recorded_url.size, warcprox.digest_str(recorded_url.response_recorder.payload_digest, False).decode('utf-8')))

def parse_args():
    prog = os.path.basename(sys.argv[0])
    arg_parser = argparse.ArgumentParser(prog=prog,
            description='%s - single threaded mitm http/s proxy, for debugging' % prog,
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    arg_parser.add_argument('-p', '--port', dest='port', default='8000',
            type=int, help='port to listen on')
    arg_parser.add_argument('-b', '--address', dest='address',
            default='localhost', help='address to listen on')
    arg_parser.add_argument('-c', '--cacert', dest='cacert',
            default='./{0}-warcprox-ca.pem'.format(socket.gethostname()),
            help='CA certificate file; if file does not exist, it will be created')
    arg_parser.add_argument('--certs-dir', dest='certs_dir',
            default='./{0}-warcprox-ca'.format(socket.gethostname()),
            help='where to store and load generated certificates')
    arg_parser.add_argument('--onion-tor-socks-proxy', dest='onion_tor_socks_proxy',
            default=None, help='host:port of tor socks proxy, used only to connect to .onion sites')
    arg_parser.add_argument('--version', action='version',
            version="warcprox {}".format(warcprox.__version__))
    arg_parser.add_argument('-v', '--verbose', dest='verbose', action='store_true')
    arg_parser.add_argument('-q', '--quiet', dest='quiet', action='store_true')

    return arg_parser.parse_args(args=sys.argv[1:])

def init_logging(verbose):
    if args.verbose:
        loglevel = logging.DEBUG
    elif args.quiet:
        loglevel = logging.WARNING
    else:
        loglevel = logging.INFO

    logging.basicConfig(stream=sys.stdout, level=loglevel,
            format='%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')
    # format='%(asctime)s %(funcName) 21s() %(filename)15s:%(lineno)05d %(message)s')

def init_proxy(args):
    ca_name = 'Warcprox CA on {}'.format(socket.gethostname())[:64]
    ca = certauth.certauth.CertificateAuthority(args.cacert, args.certs_dir,
            ca_name=ca_name)
    options = warcprox.Options(**vars(args))
    proxy = warcprox.warcproxy.SingleThreadedWarcProxy(ca,
            recorded_url_q=FakeQueue(), options=options)
    return proxy

if __name__ == "__main__":
    args = parse_args()
    init_logging(args.verbose)
    proxy = init_proxy(args)

    proxy.serve_forever()
tests/test_dump-anydbm.py

@@ -1,4 +1,24 @@
 #!/usr/bin/env python
+#
+# tests/test_dump-anydbm.py - tests for dump-anydbm
+#
+# Copyright (C) 2013-2016 Internet Archive
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
+# USA.
+#
 
 import pytest
 import os
@@ -6,6 +26,7 @@ import tempfile
 import subprocess # to access the script from shell
 import sys
 import glob
+import distutils
 
 # will try as python 3 then default to python 2 modules
 try:
@@ -38,7 +59,7 @@ val1 = 'very first value'
 val2 = 'second value'
 
 py = sys.executable
-dump_anydbm_loc = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "bin/dump-anydbm")
+dump_anydbm_loc = distutils.spawn.find_executable("dump-anydbm")
 
 @pytest.fixture(scope="function")
 def gdbm_test_db(request):
tests/test_warcprox.py: 1150 lines (new executable file)

File diff suppressed because it is too large.
tox.ini: 13 lines (file deleted)
@@ -1,13 +0,0 @@
-# Tox (http://tox.testrun.org/) is a tool for running tests
-# in multiple virtualenvs. This configuration file will run the
-# test suite on all supported python versions. To use it, "pip install tox"
-# and then run "tox" from this directory.
-
-[tox]
-envlist = py27, py34
-
-[testenv]
-commands = py.test warcprox
-deps =
-    pytest
-    requests
warcprox/__init__.py

@@ -1,8 +1,141 @@
-def _read_version_bytes():
-    import os
-    version_txt = os.path.sep.join(__file__.split(os.path.sep)[:-1] + ['version.txt'])
-    with open(version_txt, 'rb') as fin:
-        return fin.read().strip()
-
-version_bytes = _read_version_bytes().strip()
-version_str = version_bytes.decode('utf-8')
+"""
+warcprox/__init__.py - warcprox package main file, contains some utility code
+
+Copyright (C) 2013-2016 Internet Archive
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
+USA.
+"""
+
+from argparse import Namespace as _Namespace
+from pkg_resources import get_distribution as _get_distribution
+import ipaddress  # editor's note: used by host_matches_ip_or_domain below but missing from the original hunk
+__version__ = _get_distribution('warcprox').version
+
+def digest_str(hash_obj, base32):
+    import base64
+    return hash_obj.name.encode('utf-8') + b':' + (
+            base64.b32encode(hash_obj.digest()) if base32
+            else hash_obj.hexdigest().encode('ascii'))
+
+class Options(_Namespace):
+    def __getattr__(self, name):
+        try:
+            return super(Options, self).__getattr__(name)
+        except AttributeError:
+            return None
+
+# XXX linux-specific
+def gettid():
+    try:
+        import ctypes
+        libc = ctypes.cdll.LoadLibrary('libc.so.6')
+        SYS_gettid = 186
+        tid = libc.syscall(SYS_gettid)
+        return tid
+    except:
+        return "n/a"
+
+class RequestBlockedByRule(Exception):
+    """
+    An exception raised when a request should be blocked to respect a
+    Warcprox-Meta rule.
+    """
+    def __init__(self, msg):
+        self.msg = msg
+    def __str__(self):
+        return "%s: %s" % (self.__class__.__name__, self.msg)
+
+class Url:
+    '''
+    Utility class
+    '''
+    def __init__(self, url):
+        self.url = url
+        self._surt = None
+        self._host = None
+
+    @property
+    def surt(self):
+        if not self._surt:
+            import surt
+            hurl = surt.handyurl.parse(self.url)
+            surt.GoogleURLCanonicalizer.canonicalize(hurl)
+            hurl.query = None
+            hurl.hash = None
+            self._surt = hurl.getURLString(surt=True, trailing_comma=True)
+        return self._surt
+
+    @property
+    def host(self):
+        if not self._host:
+            import surt
+            self._host = surt.handyurl.parse(self.url).host
+        return self._host
+
+    def matches_ip_or_domain(self, ip_or_domain):
+        return host_matches_ip_or_domain(self.host, ip_or_domain)
+
+def normalize_host(host):
+    # normalize host (punycode and lowercase)
+    return host.encode('idna').decode('ascii').lower()
+
+def host_matches_ip_or_domain(host, ip_or_domain):
+    '''
+    Returns true if
+    - ip_or_domain is an ip address and host is the same ip address
+    - ip_or_domain is a domain and host is the same domain
+    - ip_or_domain is a domain and host is a subdomain of it
+    '''
+    _host = normalize_host(host)
+    _ip_or_domain = normalize_host(ip_or_domain)
+
+    if _ip_or_domain == _host:
+        return True
+
+    # if either _ip_or_domain or host are ip addresses, and they're not
+    # identical (previous check), not a match
+    try:
+        ipaddress.ip_address(_ip_or_domain)
+        return False
+    except:
+        pass
+    try:
+        ipaddress.ip_address(_host)
+        return False
+    except:
+        pass
+
+    # if we get here, we're looking at two hostnames
+    domain_parts = _ip_or_domain.split(".")
+    host_parts = _host.split(".")
+
+    result = host_parts[-len(domain_parts):] == domain_parts
+    return result
+
+
+# logging level more fine-grained than logging.DEBUG==10
+TRACE = 5
+
+import warcprox.controller as controller
+import warcprox.playback as playback
+import warcprox.dedup as dedup
+import warcprox.warcproxy as warcproxy
+import warcprox.mitmproxy as mitmproxy
+import warcprox.writer as writer
+import warcprox.warc as warc
+import warcprox.writerthread as writerthread
+import warcprox.stats as stats
+import warcprox.bigtable as bigtable
+import warcprox.kafkafeed as kafkafeed
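Illustrative usage of the utilities above (a sketch constructed from the definitions in this hunk, assuming an installed warcprox package; not part of the commit):

    import hashlib
    import warcprox

    h = hashlib.sha1(b'payload')
    # digest_str renders "<algo>:<digest>" in hex or base32
    warcprox.digest_str(h, False)  # b'sha1:...' (hex)
    warcprox.digest_str(h, True)   # b'sha1:...' (base32)

    # exact match and subdomain match succeed; suffix trickery does not
    warcprox.host_matches_ip_or_domain('example.com', 'example.com')      # True
    warcprox.host_matches_ip_or_domain('www.example.com', 'example.com')  # True
    warcprox.host_matches_ip_or_domain('kexample.com', 'example.com')     # False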
warcprox/bigtable.py: 218 lines (new file)
@@ -0,0 +1,218 @@
"""
warcprox/bigtable.py - module for "big" RethinkDB table for deduplication;
the table is "big" in the sense that it is designed to be usable as an index
for playback software outside of warcprox, and contains information not
needed merely for deduplication

Copyright (C) 2015-2016 Internet Archive

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
USA.
"""

from __future__ import absolute_import

import logging
from hanzo import warctools
import random
import warcprox
import base64
import surt
import os
import hashlib
import threading
import datetime
import rethinkstuff

class RethinkCaptures:
    """Inserts in batches every 0.5 seconds"""
    logger = logging.getLogger("warcprox.bigtable.RethinkCaptures")

    def __init__(self, r, table="captures", shards=None, replicas=None, options=warcprox.Options()):
        self.r = r
        self.table = table
        self.shards = shards or len(r.servers)
        self.replicas = replicas or min(3, len(r.servers))
        self.options = options
        self._ensure_db_table()

        self._stop = threading.Event()
        self._batch_lock = threading.RLock()
        with self._batch_lock:
            self._batch = []
        self._timer = None

    def start(self):
        """Starts batch insert repeating timer"""
        self._insert_batch()

    def _insert_batch(self):
        try:
            with self._batch_lock:
                if len(self._batch) > 0:
                    result = self.r.table(self.table).insert(self._batch).run()
                    if result["inserted"] != len(self._batch) or sorted(
                            result.values()) != [0,0,0,0,0,len(self._batch)]:
                        raise Exception(
                                "unexpected result %s saving batch of %s "
                                "entries", result, len(self._batch))
                    self.logger.debug(
                            "saved %s entries to big capture table db",
                            len(self._batch))
                    self._batch = []
        except BaseException as e:
            self.logger.error(
                    "caught exception trying to save %s entries, they will "
                    "be included in the next batch", len(self._batch),
                    exc_info=True)
        finally:
            if not self._stop.is_set():
                t = threading.Timer(0.5, self._insert_batch)
                t.name = "RethinkCaptures-batch-insert-timer-%s" % datetime.datetime.utcnow().isoformat()
                t.start()
                # ensure self._timer joinable (already started) whenever close() happens to be called
                self._timer = t
            else:
                self.logger.info("finished")

    def _ensure_db_table(self):
        dbs = self.r.db_list().run()
        if not self.r.dbname in dbs:
            self.logger.info("creating rethinkdb database %s", repr(self.r.dbname))
            self.r.db_create(self.r.dbname).run()
        tables = self.r.table_list().run()
        if not self.table in tables:
            self.logger.info("creating rethinkdb table %s in database %s", repr(self.table), repr(self.r.dbname))
            self.r.table_create(self.table, shards=self.shards, replicas=self.replicas).run()
            self.r.table(self.table).index_create("abbr_canon_surt_timestamp", [self.r.row["abbr_canon_surt"], self.r.row["timestamp"]]).run()
            self.r.table(self.table).index_create("sha1_warc_type", [self.r.row["sha1base32"], self.r.row["warc_type"], self.r.row["bucket"]]).run()

    def find_response_by_digest(self, algo, raw_digest, bucket="__unspecified__"):
        if algo != "sha1":
            raise Exception("digest type is {} but big capture table is indexed by sha1".format(algo))
        sha1base32 = base64.b32encode(raw_digest).decode("utf-8")
        results_iter = self.r.table(self.table).get_all([sha1base32, "response", bucket], index="sha1_warc_type").run()
        results = list(results_iter)
        if len(results) > 0:
            if len(results) > 1:
                self.logger.debug("expected 0 or 1 but found %s results for sha1base32=%s bucket=%s (will use first result)", len(results), sha1base32, bucket)
            result = results[0]
        else:
            result = None
        self.logger.debug("returning %s for sha1base32=%s bucket=%s",
                result, sha1base32, bucket)
        return result

    def _assemble_entry(self, recorded_url, records):
        if recorded_url.response_recorder:
            if recorded_url.response_recorder.payload_digest.name == "sha1":
                sha1base32 = base64.b32encode(
                        recorded_url.response_recorder.payload_digest.digest()
                        ).decode("utf-8")
            else:
                self.logger.warn(
                        "digest type is %s but big capture table is indexed "
                        "by sha1",
                        recorded_url.response_recorder.payload_digest.name)
        else:
            digest = hashlib.new("sha1", records[0].content[1])
            sha1base32 = base64.b32encode(digest.digest()).decode("utf-8")

        if (recorded_url.warcprox_meta
                and "captures-bucket" in recorded_url.warcprox_meta):
            bucket = recorded_url.warcprox_meta["captures-bucket"]
        else:
            bucket = "__unspecified__"

        canon_surt = surt.surt(recorded_url.url.decode("utf-8"),
                trailing_comma=True, host_massage=False, with_scheme=True)

        entry = {
            # id only specified for rethinkdb partitioning
            "id": "{} {}".format(
                canon_surt[:20], records[0].id.decode("utf-8")[10:-1]),
            "abbr_canon_surt": canon_surt[:150],
            "canon_surt": canon_surt,
            "timestamp": recorded_url.timestamp.replace(
                tzinfo=rethinkstuff.UTC),
            "url": recorded_url.url.decode("utf-8"),
            "offset": records[0].offset,
            "filename": os.path.basename(records[0].warc_filename),
            "warc_type": records[0].type.decode("utf-8"),
            "warc_id": records[0].id.decode("utf-8"),
            "sha1base32": sha1base32,
            "content_type": recorded_url.mimetype,
            "response_code": recorded_url.status,
            "http_method": recorded_url.method,
            "bucket": bucket,
            "length": records[0].length,
        }

        if (recorded_url.warcprox_meta and
                "captures-table-extra-fields" in recorded_url.warcprox_meta):
            extras = recorded_url.warcprox_meta["captures-table-extra-fields"]
            for extra_field in extras:
                entry[extra_field] = extras[extra_field]

        return entry

    def notify(self, recorded_url, records):
        entry = self._assemble_entry(recorded_url, records)
        with self._batch_lock:
            self._batch.append(entry)

    def close(self):
        self.stop()

    def stop(self):
        self.logger.info("closing rethinkdb captures table")
        self._stop.set()
        if self._timer:
            self._timer.join()

class RethinkCapturesDedup:
    logger = logging.getLogger("warcprox.dedup.RethinkCapturesDedup")

    def __init__(self, captures_db, options=warcprox.Options()):
        self.captures_db = captures_db
        self.options = options

    def lookup(self, digest_key, bucket="__unspecified__"):
        k = digest_key.decode("utf-8") if isinstance(digest_key, bytes) else digest_key
        algo, value_str = k.split(":")
        if self.options.base32:
            raw_digest = base64.b32decode(value_str, casefold=True)
        else:
            raw_digest = base64.b16decode(value_str, casefold=True)
        entry = self.captures_db.find_response_by_digest(algo, raw_digest, bucket)
        if entry:
            dedup_info = {
                "url": entry["url"].encode("utf-8"),
                "date": entry["timestamp"].strftime("%Y-%m-%dT%H:%M:%SZ").encode("utf-8"),
            }
            if "warc_id" in entry:
                dedup_info["id"] = entry["warc_id"].encode("utf-8")
            return dedup_info
        else:
            return None

    def start(self):
        self.captures_db.start()

    def stop(self):
        self.captures_db.stop()

    def close(self):
        self.captures_db.close()
@ -1,19 +1,45 @@
|
||||
# vim: set sw=4 et:
|
||||
'''
|
||||
warcprox/controller.py - contains WarcproxController class, responsible for
|
||||
starting up and shutting down the various components of warcprox, and for
|
||||
sending heartbeats to the service registry if configured to do so; also has
|
||||
some memory profiling capabilities
|
||||
|
||||
Copyright (C) 2013-2016 Internet Archive
|
||||
|
||||
This program is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU General Public License
|
||||
as published by the Free Software Foundation; either version 2
|
||||
of the License, or (at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
|
||||
USA.
|
||||
'''
|
||||
|
||||
from __future__ import absolute_import
|
||||
|
||||
import logging
|
||||
import threading
|
||||
import signal
|
||||
import time
|
||||
|
||||
import warcprox.warcprox
|
||||
import warcprox.warcwriter
|
||||
import warcprox
|
||||
import sys
|
||||
import gc
|
||||
import datetime
|
||||
|
||||
class WarcproxController(object):
|
||||
logger = logging.getLogger("warcprox.controller.WarcproxController")
|
||||
|
||||
def __init__(self, proxy=None, warc_writer_thread=None, playback_proxy=None):
|
||||
HEARTBEAT_INTERVAL = 20.0
|
||||
|
||||
def __init__(self, proxy=None, warc_writer_thread=None,
|
||||
playback_proxy=None, service_registry=None,
|
||||
options=warcprox.Options()):
|
||||
"""
|
||||
Create warcprox controller.
|
||||
|
||||
@ -34,44 +60,129 @@ class WarcproxController(object):
|
||||
else:
|
||||
self.warc_writer_thread = warcprox.warcwriter.WarcWriterThread(recorded_url_q=self.proxy.recorded_url_q)
|
||||
|
||||
self.proxy_thread = None
|
||||
self.playback_proxy_thread = None
|
||||
self.playback_proxy = playback_proxy
|
||||
self.service_registry = service_registry
|
||||
self.options = options
|
||||
|
||||
|
||||
def run_until_shutdown(self):
|
||||
"""Start warcprox and run until shut down.
|
||||
|
||||
If running in the main thread, SIGTERM initiates a graceful shutdown.
|
||||
Otherwise, call warcprox_controller.stop.set().
|
||||
"""
|
||||
proxy_thread = threading.Thread(target=self.proxy.serve_forever, name='ProxyThread')
|
||||
proxy_thread.start()
|
||||
self.warc_writer_thread.start()
|
||||
|
||||
if self.playback_proxy is not None:
|
||||
playback_proxy_thread = threading.Thread(target=self.playback_proxy.serve_forever, name='PlaybackProxyThread')
|
||||
playback_proxy_thread.start()
|
||||
self._last_rss = None
|
||||
|
||||
self.stop = threading.Event()
|
||||
self._start_stop_lock = threading.Lock()
|
||||
|
||||
try:
|
||||
signal.signal(signal.SIGTERM, self.stop.set)
|
||||
self.logger.info('SIGTERM will initiate graceful shutdown')
|
||||
except ValueError:
|
||||
pass
|
||||
def debug_mem(self):
|
||||
self.logger.info("self.proxy.recorded_url_q.qsize()=%s", self.proxy.recorded_url_q.qsize())
|
||||
with open("/proc/self/status") as f:
|
||||
for line in f:
|
||||
fields = line.split()
|
||||
if len(fields) >= 2:
|
||||
k, v = fields[0:2]
|
||||
if k == "VmHWM:":
|
||||
hwm = int(v)
|
||||
elif k == "VmRSS:":
|
||||
rss = int(v)
|
||||
elif k == "VmData:":
|
||||
data = int(v)
|
||||
elif k == "VmStk:":
|
||||
stk = int(v)
|
||||
self.logger.info("rss=%s data=%s stack=%s hwm=%s", rss, data, stk, hwm)
|
||||
self._last_rss = self._last_rss or rss # to set initial value
|
||||
|
||||
if rss - self._last_rss > 1024:
|
||||
num_unreachable = gc.collect()
|
||||
all_objects = gc.get_objects()
|
||||
total_size = 0
|
||||
summary = {}
|
||||
biggest_objects = [None] * 10
|
||||
for obj in all_objects:
|
||||
size = sys.getsizeof(obj)
|
||||
total_size += size
|
||||
if not type(obj) in summary:
|
||||
summary[type(obj)] = {"count":0,"size":0}
|
||||
summary[type(obj)]["count"] += 1
|
||||
summary[type(obj)]["size"] += size
|
||||
if size > sys.getsizeof(biggest_objects[-1]):
|
||||
for i in range(len(biggest_objects)):
|
||||
if size > sys.getsizeof(biggest_objects[i]):
|
||||
index = i
|
||||
break
|
||||
biggest_objects[index+1:] = biggest_objects[index:-1]
|
||||
biggest_objects[index] = obj
|
||||
|
||||
self.logger.info("%s objects totaling %s bytes", len(all_objects), total_size)
|
||||
|
||||
self.logger.info("=== biggest types ===")
|
||||
for item in sorted(summary.items(), key=lambda item: item[1]["size"], reverse=True)[:10]:
|
||||
self.logger.info("%s bytes in %s instances of %s", item[1]["size"], item[1]["count"], item[0])
|
||||
|
||||
self.logger.info("=== warcprox types ===")
|
||||
for t in (t for t in summary if str(t).find("warcprox") >= 0):
|
||||
self.logger.info("%s bytes in %s instances of %s", summary[t]["size"], summary[t]["count"], t)
|
||||
|
||||
for i in range(len(biggest_objects)):
|
||||
obj = biggest_objects[i]
|
||||
try:
|
||||
value = repr(bytes(obj.getbuffer()[:100]))
|
||||
except:
|
||||
try:
|
||||
value = repr(obj)[:100]
|
||||
except BaseException as e:
|
||||
value = "<{} getting value>".format(e)
|
||||
self.logger.info("#%s (%s) (%s bytes) (%s refs) (id=%s): %s", i+1, type(obj), sys.getsizeof(obj), sys.getrefcount(obj), id(obj), value)
|
||||
self.logger.info("%s unreachable objects totaling %s bytes", len(gc.garbage), sum(sys.getsizeof(x) for x in gc.garbage))
|
||||
|
||||
self._last_rss = rss
|
||||
|
||||
def _service_heartbeat(self):
|
||||
if hasattr(self, 'status_info'):
|
||||
status_info = self.status_info
|
||||
else:
|
||||
status_info = {
|
||||
'role': 'warcprox',
|
||||
'heartbeat_interval': self.HEARTBEAT_INTERVAL,
|
||||
'port': self.options.port,
|
||||
}
|
||||
status_info['load'] = 1.0 * self.proxy.recorded_url_q.qsize() / (self.proxy.recorded_url_q.maxsize or 100)
|
||||
status_info['queue_size'] = self.proxy.recorded_url_q.qsize()
|
||||
|
||||
self.status_info = self.service_registry.heartbeat(status_info)
|
||||
self.logger.log(
|
||||
warcprox.TRACE, "status in service registry: %s",
|
||||
self.status_info)
|
||||
|
||||
def start(self):
|
||||
with self._start_stop_lock:
|
||||
if self.proxy_thread and self.proxy_thread.is_alive():
|
||||
self.logger.info('warcprox is already running')
|
||||
return
|
||||
|
||||
if self.proxy.stats_db:
|
||||
self.proxy.stats_db.start()
|
||||
self.proxy_thread = threading.Thread(
|
||||
target=self.proxy.serve_forever, name='ProxyThread')
|
||||
self.proxy_thread.start()
|
||||
|
||||
if self.warc_writer_thread.dedup_db:
|
||||
self.warc_writer_thread.dedup_db.start()
|
||||
self.warc_writer_thread.start()
|
||||
|
||||
if self.playback_proxy is not None:
|
||||
self.playback_proxy_thread = threading.Thread(
|
||||
target=self.playback_proxy.serve_forever,
|
||||
name='PlaybackProxyThread')
|
||||
self.playback_proxy_thread.start()
|
||||
|
||||
def shutdown(self):
|
||||
with self._start_stop_lock:
|
||||
if not self.proxy_thread or not self.proxy_thread.is_alive():
|
||||
self.logger.info('warcprox is not running')
|
||||
return
|
||||
|
||||
try:
|
||||
while not self.stop.is_set():
|
||||
time.sleep(0.5)
|
||||
except:
|
||||
pass
|
||||
finally:
|
||||
self.warc_writer_thread.stop.set()
|
||||
self.proxy.shutdown()
|
||||
self.proxy.server_close()
|
||||
|
||||
if self.warc_writer_thread.warc_writer.dedup_db is not None:
|
||||
self.warc_writer_thread.warc_writer.dedup_db.close()
|
||||
|
||||
if self.playback_proxy is not None:
|
||||
self.playback_proxy.shutdown()
|
||||
self.playback_proxy.server_close()
|
||||
@ -80,7 +191,59 @@ class WarcproxController(object):
|
||||
|
||||
# wait for threads to finish
|
||||
self.warc_writer_thread.join()
|
||||
proxy_thread.join()
|
||||
if self.playback_proxy is not None:
|
||||
playback_proxy_thread.join()
|
||||
|
||||
if self.proxy.stats_db:
|
||||
self.proxy.stats_db.stop()
|
||||
if self.warc_writer_thread.dedup_db:
|
||||
self.warc_writer_thread.dedup_db.close()
|
||||
|
||||
self.proxy_thread.join()
|
||||
if self.playback_proxy is not None:
|
||||
self.playback_proxy_thread.join()
|
||||
|
||||
if self.service_registry and hasattr(self, "status_info"):
|
||||
self.service_registry.unregister(self.status_info["id"])
|
||||
|
||||
def run_until_shutdown(self):
|
||||
"""
|
||||
Start warcprox and run until shut down. Call
|
||||
warcprox_controller.stop.set() to initiate graceful shutdown.
|
||||
"""
|
||||
self.start()
|
||||
|
||||
last_mem_dbg = datetime.datetime.utcfromtimestamp(0)
|
||||
|
||||
try:
|
||||
utc = datetime.timezone.utc
|
||||
except AttributeError:
|
||||
# python2 :-\
|
||||
class UTC(datetime.tzinfo):
|
||||
def tzname(self, dt): return "UTC+00:00"
|
||||
def dst(self, dt): return datetime.timedelta(0)
|
||||
def utcoffset(self, dt): return datetime.timedelta(0)
|
||||
utc = UTC()
|
||||
|
||||
try:
|
||||
while not self.stop.is_set():
|
||||
if self.service_registry and (
|
||||
not hasattr(self, "status_info") or (
|
||||
datetime.datetime.now(utc)
|
||||
- self.status_info["last_heartbeat"]
|
||||
).total_seconds() > self.HEARTBEAT_INTERVAL):
|
||||
self._service_heartbeat()
|
||||
|
||||
if self.options.profile and (
|
||||
datetime.datetime.utcnow() - last_mem_dbg
|
||||
).total_seconds() > 60:
|
||||
self.debug_mem()
|
||||
last_mem_dbg = datetime.datetime.utcnow()
|
||||
|
||||
time.sleep(0.5)
|
||||
except:
|
||||
self.logger.critical(
|
||||
"shutting down in response to fatal exception",
|
||||
exc_info=True)
|
||||
pass
|
||||
finally:
|
||||
self.shutdown()
|
||||
|
||||
|
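The run_until_shutdown() docstring above describes the intended lifecycle: start everything, heartbeat (and optionally profile memory) in a loop, and shut down when the stop event is set. A minimal sketch of driving the controller programmatically, assuming the warcprox package from this commit is installed (parse_args and init_controller are defined in warcprox/main.py later in this diff)::

    import threading
    import warcprox.main

    args = warcprox.main.parse_args(['warcprox', '--port', '8000'])
    controller = warcprox.main.init_controller(args)

    # run_until_shutdown() blocks, so run it in a thread...
    th = threading.Thread(target=controller.run_until_shutdown)
    th.start()

    # ...and initiate graceful shutdown from elsewhere by setting the event
    controller.stop.set()
    th.join()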
@ -1,30 +1,58 @@
# vim:set sw=4 et:
#
# warcprox/dedup.py - identical payload digest deduplication
#
# Copyright (C) 2013-2016 Internet Archive
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
# USA.
#

from __future__ import absolute_import

try:
    import dbm.gnu as dbm_gnu
except ImportError:
    try:
        import gdbm as dbm_gnu
    except ImportError:
        import anydbm as dbm_gnu

import logging
import os
import json
from hanzo import warctools
import warcprox
import random

class DedupDb(object):
    logger = logging.getLogger("warcprox.dedup.DedupDb")

    def __init__(self, dbm_file='./warcprox-dedup.db'):
    def __init__(self, dbm_file='./warcprox-dedup.db', options=warcprox.Options()):
        try:
            import dbm.gnu as dbm_gnu
        except ImportError:
            try:
                import gdbm as dbm_gnu
            except ImportError:
                import anydbm as dbm_gnu

        if os.path.exists(dbm_file):
            self.logger.info('opening existing deduplication database {}'.format(dbm_file))
        else:
            self.logger.info('creating new deduplication database {}'.format(dbm_file))

        self.db = dbm_gnu.open(dbm_file, 'c')
        self.options = options

    def start(self):
        pass

    def stop(self):
        self.close()

    def close(self):
        self.db.close()
@ -35,26 +63,115 @@ class DedupDb(object):
        except:
            pass

    def save(self, key, response_record, offset):
    def save(self, digest_key, response_record, bucket=""):
        record_id = response_record.get_header(warctools.WarcRecord.ID).decode('latin1')
        url = response_record.get_header(warctools.WarcRecord.URL).decode('latin1')
        date = response_record.get_header(warctools.WarcRecord.DATE).decode('latin1')

        py_value = {'i':record_id, 'u':url, 'd':date}
        key = digest_key + b"|" + bucket.encode("utf-8")

        py_value = {'id':record_id, 'url':url, 'date':date}
        json_value = json.dumps(py_value, separators=(',',':'))

        self.db[key] = json_value.encode('utf-8')
        self.logger.debug('dedup db saved {}:{}'.format(key, json_value))
        self.logger.debug('dedup db saved %s:%s', key, json_value)

    def lookup(self, key):
    def lookup(self, digest_key, bucket=""):
        result = None
        key = digest_key + b"|" + bucket.encode("utf-8")
        if key in self.db:
            json_result = self.db[key]
            result = json.loads(json_result.decode('utf-8'))
            result['i'] = result['i'].encode('latin1')
            result['u'] = result['u'].encode('latin1')
            result['d'] = result['d'].encode('latin1')
        return result
            result['id'] = result['id'].encode('latin1')
            result['url'] = result['url'].encode('latin1')
            result['date'] = result['date'].encode('latin1')
        self.logger.debug('dedup db lookup of key=%s returning %s', key, result)
        return result

    def notify(self, recorded_url, records):
        if (records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.RESPONSE
                and recorded_url.response_recorder.payload_size() > 0):
            digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest,
                    self.options.base32)
            if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta:
                self.save(digest_key, records[0], bucket=recorded_url.warcprox_meta["captures-bucket"])
            else:
                self.save(digest_key, records[0])


def decorate_with_dedup_info(dedup_db, recorded_url, base32=False):
    if (recorded_url.response_recorder
            and recorded_url.response_recorder.payload_digest
            and recorded_url.response_recorder.payload_size() > 0):
        digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest, base32)
        if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta:
            recorded_url.dedup_info = dedup_db.lookup(digest_key, recorded_url.warcprox_meta["captures-bucket"])
        else:
    return None
            recorded_url.dedup_info = dedup_db.lookup(digest_key)

class RethinkDedupDb:
    logger = logging.getLogger("warcprox.dedup.RethinkDedupDb")

    def __init__(self, r, table="dedup", shards=None, replicas=None, options=warcprox.Options()):
        self.r = r
        self.table = table
        self.shards = shards or len(r.servers)
        self.replicas = replicas or min(3, len(r.servers))
        self._ensure_db_table()
        self.options = options

    def _ensure_db_table(self):
        dbs = self.r.db_list().run()
        if not self.r.dbname in dbs:
            self.logger.info("creating rethinkdb database %s", repr(self.r.dbname))
            self.r.db_create(self.r.dbname).run()
        tables = self.r.table_list().run()
        if not self.table in tables:
            self.logger.info("creating rethinkdb table %s in database %s shards=%s replicas=%s",
                    repr(self.table), repr(self.r.dbname), self.shards, self.replicas)
            self.r.table_create(self.table, primary_key="key", shards=self.shards, replicas=self.replicas).run()


    def start(self):
        pass

    def stop(self):
        pass

    def close(self):
        pass

    def sync(self):
        pass

    def save(self, digest_key, response_record, bucket=""):
        k = digest_key.decode("utf-8") if isinstance(digest_key, bytes) else digest_key
        k = "{}|{}".format(k, bucket)
        record_id = response_record.get_header(warctools.WarcRecord.ID).decode('latin1')
        url = response_record.get_header(warctools.WarcRecord.URL).decode('latin1')
        date = response_record.get_header(warctools.WarcRecord.DATE).decode('latin1')
        record = {'key':k,'url':url,'date':date,'id':record_id}
        result = self.r.table(self.table).insert(record,conflict="replace").run()
        if sorted(result.values()) != [0,0,0,0,0,1] and [result["deleted"],result["skipped"],result["errors"]] != [0,0,0]:
            raise Exception("unexpected result %s saving %s", result, record)
        self.logger.debug('dedup db saved %s:%s', k, record)

    def lookup(self, digest_key, bucket=""):
        k = digest_key.decode("utf-8") if isinstance(digest_key, bytes) else digest_key
        k = "{}|{}".format(k, bucket)
        result = self.r.table(self.table).get(k).run()
        if result:
            for x in result:
                result[x] = result[x].encode("utf-8")
        self.logger.debug('dedup db lookup of key=%s returning %s', k, result)
        return result

    def notify(self, recorded_url, records):
        if (records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.RESPONSE
                and recorded_url.response_recorder.payload_size() > 0):
            digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest,
                    self.options.base32)
            if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta:
                self.save(digest_key, records[0], bucket=recorded_url.warcprox_meta["captures-bucket"])
            else:
                self.save(digest_key, records[0])
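Both DedupDb and RethinkDedupDb key their records on the payload digest plus the optional "captures-bucket" from the Warcprox-Meta request header, so deduplication only happens within a bucket. A rough illustration of how such a key is derived (warcprox.digest_str itself is not shown in this diff; this sketch mimics its base32 form)::

    import base64
    import hashlib

    payload = b'<html>hello</html>'
    digest = hashlib.sha1(payload).digest()
    digest_key = b'sha1:' + base64.b32encode(digest)  # --base32 style
    key = digest_key + b'|' + 'my-bucket'.encode('utf-8')
    # two captures with the same payload digest and the same bucket
    # dedup against each other; different buckets never match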
@ -1,12 +1,28 @@
#!/usr/bin/env python
# vim:set sw=4 et:
#
'''
dump-anydbm - dumps contents of dbm file to stdout

"""
Dump contents of database to stdout. Database can be any file that the anydbm
module can read. Included with warcprox because it's useful for inspecting a
deduplication database or a playback index database, but it is a generic tool.
"""

Copyright (C) 2013-2016 Internet Archive

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
USA.
'''

try:
    import dbm
@ -14,7 +30,7 @@ try:
    whichdb = dbm.whichdb

except:
    import anydbm
    import anydbm
    dbm = anydbm
    from whichdb import whichdb

@ -22,6 +38,9 @@ import sys
import os.path

if __name__ == "__main__":
    main()

def main():
    if len(sys.argv) != 2:
        sys.stderr.write("usage: {} DBM_FILE\n".format(sys.argv[0]))
        exit(1)
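For reference, the core of dump-anydbm amounts to walking the dbm key chain and printing each entry, roughly like this sketch (python 3, gdbm flavor; the filename is illustrative)::

    import dbm.gnu

    db = dbm.gnu.open('./warcprox-dedup.db', 'r')
    key = db.firstkey()
    while key is not None:
        print('{}:{}'.format(key.decode('latin1'), db[key].decode('latin1')))
        key = db.nextkey(key)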
101
warcprox/kafkafeed.py
Normal file
@ -0,0 +1,101 @@
'''
warcprox/kafkafeed.py - support for publishing information about archived
urls to apache kafka

Copyright (C) 2015-2016 Internet Archive

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
USA.
'''

import kafka
import datetime
import json
import logging
from hanzo import warctools

class CaptureFeed:
    logger = logging.getLogger('warcprox.kafkafeed.CaptureFeed')

    def __init__(self, broker_list, topic=None):
        self.broker_list = broker_list
        self.topic = topic
        self.__producer = None
        self._connection_exception = None

    def _producer(self):
        if not self.__producer:
            try:
                # acks=0 to avoid ever blocking
                self.__producer = kafka.KafkaProducer(
                        bootstrap_servers=self.broker_list, acks=0)
                if self._connection_exception:
                    logging.info('connected to kafka successfully!')
                    self._connection_exception = None
            except Exception as e:
                if not self._connection_exception:
                    self._connection_exception = e
                    logging.error('problem connecting to kafka', exc_info=True)

        return self.__producer

    def notify(self, recorded_url, records):
        if records[0].type not in (b'revisit', b'response'):
            return

        topic = recorded_url.warcprox_meta.get('capture-feed-topic', self.topic)
        if not topic:
            return

        try:
            payload_digest = records[0].get_header(warctools.WarcRecord.PAYLOAD_DIGEST).decode('utf-8')
        except:
            payload_digest = '-'

        # {"status_code":200,"content_digest":"sha1:3VU56HI3BTMDZBL2TP7SQYXITT7VEAJQ","host":"www.kaosgl.com","via":"http://www.kaosgl.com/sayfa.php?id=4427","account_id":"877","seed":"http://www.kaosgl.com/","warc_filename":"ARCHIVEIT-6003-WEEKLY-JOB171310-20150903100014694-00002.warc.gz","url":"http://www.kaosgl.com/resim/HomofobiKarsitiBulusma/trabzon05.jpg","size":29700,"start_time_plus_duration":"20150903175709637+1049","timestamp":"2015-09-03T17:57:10.707Z","mimetype":"image/jpeg","collection_id":"6003","is_test_crawl":"false","job_name":"6003-20150902172136074","warc_offset":856320200,"thread":6,"hop_path":"RLLLLLE","extra_info":{},"annotations":"duplicate:digest","content_length":29432}

        now = datetime.datetime.utcnow()
        d = {
            'timestamp': '{:%Y-%m-%dT%H:%M:%S}.{:03d}Z'.format(now, now.microsecond//1000),
            'size': recorded_url.size,
            'status_code': recorded_url.status,
            'url': recorded_url.url.decode('utf-8'),
            'mimetype': recorded_url.mimetype,
            'content_digest': payload_digest,
            'warc_filename': records[0].warc_filename,
            'warc_offset': records[0].offset,
            'host': recorded_url.host,
            'annotations': 'duplicate:digest' if records[0].type == 'revisit' else '',
            'content_length': recorded_url.response_recorder.len - recorded_url.response_recorder.payload_offset,
            'start_time_plus_duration': '{:%Y%m%d%H%M%S}{:03d}+{}'.format(
                recorded_url.timestamp, recorded_url.timestamp.microsecond//1000,
                int(recorded_url.duration.total_seconds() * 1000)),
            # 'hop_path': ?  # only used for seed redirects, which are n/a to brozzler (?)
            # 'via': ?
            # 'thread': ?  # not needed
        }

        # fields expected to be populated here are (for archive-it):
        # account_id, collection_id, is_test_crawl, seed, job_name
        if recorded_url.warcprox_meta and 'capture-feed-extra-fields' in recorded_url.warcprox_meta:
            for (k,v) in recorded_url.warcprox_meta['capture-feed-extra-fields'].items():
                d[k] = v

        msg = json.dumps(d, separators=(',', ':')).encode('utf-8')
        self.logger.debug('feeding kafka topic=%s msg=%s', repr(topic), msg)
        p = self._producer()
        if p:
            p.send(topic, msg)
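On the other side of the feed, a consumer can treat each message as a standalone JSON document. A sketch using the same kafka-python library (the broker address and topic name here are hypothetical)::

    import json
    import kafka

    consumer = kafka.KafkaConsumer(
            'warcprox-captures', bootstrap_servers='localhost:9092')
    for message in consumer:
        capture = json.loads(message.value.decode('utf-8'))
        print(capture['timestamp'], capture['status_code'], capture['url'])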
280
warcprox/main.py
@ -1,5 +1,25 @@
#!/usr/bin/env python
# vim:set sw=4 et:
'''
warcprox/main.py - entrypoint for warcprox executable, parses command line
arguments, initializes components, starts controller, handles signals

Copyright (C) 2013-2016 Internet Archive

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
USA.
'''

from __future__ import absolute_import

@ -14,21 +34,21 @@ import hashlib
import argparse
import os
import socket

import traceback
import signal
import threading
import certauth.certauth

import warcprox.playback
import warcprox.dedup
import warcprox.warcwriter
import warcprox.warcprox
import warcprox.controller
import warcprox
import re
import rethinkstuff
import cryptography.hazmat.backends.openssl

def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
    arg_parser = argparse.ArgumentParser(prog=prog,
            description='warcprox - WARC writing MITM HTTP/S proxy',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    arg_parser.add_argument('-p', '--port', dest='port', default='8000',
            help='port to listen on')
            type=int, help='port to listen on')
    arg_parser.add_argument('-b', '--address', dest='address',
            default='localhost', help='address to listen on')
    arg_parser.add_argument('-c', '--cacert', dest='cacert',
@ -44,10 +64,10 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
    arg_parser.add_argument('-n', '--prefix', dest='prefix',
            default='WARCPROX', help='WARC filename prefix')
    arg_parser.add_argument('-s', '--size', dest='size',
            default=1000*1000*1000,
            default=1000*1000*1000, type=int,
            help='WARC file rollover size threshold in bytes')
    arg_parser.add_argument('--rollover-idle-time',
            dest='rollover_idle_time', default=None,
            dest='rollover_idle_time', default=None, type=int,
            help="WARC file rollover idle time threshold in seconds (so that Friday's last open WARC doesn't sit there all weekend waiting for more data)")
    try:
        hash_algos = hashlib.algorithms_guaranteed
@ -57,30 +77,171 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
            default='sha1', help='digest algorithm, one of {}'.format(', '.join(hash_algos)))
    arg_parser.add_argument('--base32', dest='base32', action='store_true',
            default=False, help='write digests in Base32 instead of hex')
    arg_parser.add_argument('-j', '--dedup-db-file', dest='dedup_db_file',
            default='./warcprox-dedup.db', help='persistent deduplication database file; empty string or /dev/null disables deduplication')
    arg_parser.add_argument('--stats-db-file', dest='stats_db_file',
            default='./warcprox-stats.db', help='persistent statistics database file; empty string or /dev/null disables statistics tracking')
    arg_parser.add_argument('-P', '--playback-port', dest='playback_port',
            default=None, help='port to listen on for instant playback')
            type=int, default=None, help='port to listen on for instant playback')
    arg_parser.add_argument('--playback-index-db-file', dest='playback_index_db_file',
            default='./warcprox-playback-index.db',
            help='playback index database file (only used if --playback-port is specified)')
    group = arg_parser.add_mutually_exclusive_group()
    group.add_argument('-j', '--dedup-db-file', dest='dedup_db_file',
            default='./warcprox-dedup.db', help='persistent deduplication database file; empty string or /dev/null disables deduplication')
    group.add_argument('--rethinkdb-servers', dest='rethinkdb_servers',
            help='rethinkdb servers, used for dedup and stats if specified; e.g. db0.foo.org,db0.foo.org:38015,db1.foo.org')
    arg_parser.add_argument('--rethinkdb-db', dest='rethinkdb_db', default='warcprox',
            help='rethinkdb database name (ignored unless --rethinkdb-servers is specified)')
    arg_parser.add_argument('--rethinkdb-big-table',
            dest='rethinkdb_big_table', action='store_true', default=False,
            help='use a big rethinkdb table called "captures", instead of a small table called "dedup"; table is suitable for use as index for playback (ignored unless --rethinkdb-servers is specified)')
    arg_parser.add_argument('--kafka-broker-list', dest='kafka_broker_list',
            default=None, help='kafka broker list for capture feed')
    arg_parser.add_argument('--kafka-capture-feed-topic', dest='kafka_capture_feed_topic',
            default=None, help='kafka capture feed topic')
    arg_parser.add_argument('--queue-size', dest='queue_size', default=500,
            help=argparse.SUPPRESS)
    arg_parser.add_argument('--max-threads', dest='max_threads',
            help=argparse.SUPPRESS)
    arg_parser.add_argument('--profile', action='store_true', default=False,
            help=argparse.SUPPRESS)
    arg_parser.add_argument('--onion-tor-socks-proxy', dest='onion_tor_socks_proxy',
            default=None, help='host:port of tor socks proxy, used only to connect to .onion sites')
    arg_parser.add_argument('--version', action='version',
            version="warcprox {}".format(warcprox.version_str))
            version="warcprox {}".format(warcprox.__version__))
    arg_parser.add_argument('-v', '--verbose', dest='verbose', action='store_true')
    arg_parser.add_argument('--trace', dest='trace', action='store_true')
    arg_parser.add_argument('-q', '--quiet', dest='quiet', action='store_true')
    # [--ispartof=warcinfo ispartof]
    # [--description=warcinfo description]
    # [--operator=warcinfo operator]
    # [--httpheader=warcinfo httpheader]

    return arg_parser

def dump_state(signum=None, frame=None):
    '''
    Signal handler, logs stack traces of active threads.
    '''
    state_strs = []

def main(argv=sys.argv):
    for th in threading.enumerate():
        try:
            state_strs.append(str(th))
        except AssertionError:
            state_strs.append('<n/a:AssertionError>')
        stack = traceback.format_stack(sys._current_frames()[th.ident])
        state_strs.append(''.join(stack))

    logging.warn(
            'dumping state (caught signal %s)\n%s',
            signum, '\n'.join(state_strs))

def init_controller(args):
    '''
    Creates a warcprox.controller.WarcproxController configured according to
    the supplied arguments (normally the result of parse_args(sys.argv)).
    '''
    options = warcprox.Options(**vars(args))

    try:
        hashlib.new(args.digest_algorithm)
    except Exception as e:
        logging.fatal(e)
        exit(1)

    listeners = []
    if args.rethinkdb_servers:
        r = rethinkstuff.Rethinker(args.rethinkdb_servers.split(","), args.rethinkdb_db)
        if args.rethinkdb_big_table:
            captures_db = warcprox.bigtable.RethinkCaptures(r, options=options)
            dedup_db = warcprox.bigtable.RethinkCapturesDedup(captures_db, options=options)
            listeners.append(captures_db)
        else:
            dedup_db = warcprox.dedup.RethinkDedupDb(r, options=options)
            listeners.append(dedup_db)
    elif args.dedup_db_file in (None, '', '/dev/null'):
        logging.info('deduplication disabled')
        dedup_db = None
    else:
        dedup_db = warcprox.dedup.DedupDb(args.dedup_db_file, options=options)
        listeners.append(dedup_db)

    if args.rethinkdb_servers:
        stats_db = warcprox.stats.RethinkStatsDb(r, options=options)
        listeners.append(stats_db)
    elif args.stats_db_file in (None, '', '/dev/null'):
        logging.info('statistics tracking disabled')
        stats_db = None
    else:
        stats_db = warcprox.stats.StatsDb(args.stats_db_file, options=options)
        listeners.append(stats_db)

    if args.kafka_broker_list:
        kafka_capture_feed = warcprox.kafkafeed.CaptureFeed(
                args.kafka_broker_list, args.kafka_capture_feed_topic)
        listeners.append(kafka_capture_feed)

    recorded_url_q = queue.Queue(maxsize=args.queue_size)

    ca_name = 'Warcprox CA on {}'.format(socket.gethostname())[:64]
    ca = certauth.certauth.CertificateAuthority(args.cacert, args.certs_dir,
            ca_name=ca_name)

    proxy = warcprox.warcproxy.WarcProxy(ca=ca, recorded_url_q=recorded_url_q,
            stats_db=stats_db, options=options)

    if args.playback_port is not None:
        playback_index_db = warcprox.playback.PlaybackIndexDb(args.playback_index_db_file, options=options)
        playback_proxy = warcprox.playback.PlaybackProxy(
                server_address=(args.address, args.playback_port), ca=ca,
                playback_index_db=playback_index_db, warcs_dir=args.directory,
                options=options)
        listeners.append(playback_index_db)
    else:
        playback_index_db = None
        playback_proxy = None

    writer_pool = warcprox.writer.WarcWriterPool(options=options)
    warc_writer_thread = warcprox.writerthread.WarcWriterThread(
            recorded_url_q=recorded_url_q, writer_pool=writer_pool,
            dedup_db=dedup_db, listeners=listeners, options=options)

    if args.rethinkdb_servers:
        svcreg = rethinkstuff.ServiceRegistry(r)
    else:
        svcreg = None

    controller = warcprox.controller.WarcproxController(proxy,
            warc_writer_thread, playback_proxy, service_registry=svcreg,
            options=options)

    return controller

def real_main(args):
    # see https://github.com/pyca/cryptography/issues/2911
    cryptography.hazmat.backends.openssl.backend.activate_builtin_random()

    controller = init_controller(args)

    signal.signal(signal.SIGTERM, lambda a,b: controller.stop.set())
    signal.signal(signal.SIGINT, lambda a,b: controller.stop.set())
    signal.signal(signal.SIGQUIT, dump_state)

    controller.run_until_shutdown()

def parse_args(argv=sys.argv):
    '''
    Parses command line arguments with argparse.
    '''
    arg_parser = _build_arg_parser(prog=os.path.basename(argv[0]))
    args = arg_parser.parse_args(args=argv[1:])
    return args

    if args.verbose:
def main(argv=sys.argv):
    '''
    Main method, entry point of warcprox command.
    '''
    args = parse_args(argv)

    if args.trace:
        loglevel = warcprox.TRACE
    elif args.verbose:
        loglevel = logging.DEBUG
    elif args.quiet:
        loglevel = logging.WARNING
@ -90,51 +251,50 @@ def main(argv=sys.argv):
    logging.basicConfig(stream=sys.stdout, level=loglevel,
            format='%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')

    try:
        hashlib.new(args.digest_algorithm)
    except Exception as e:
        logging.fatal(e)
        exit(1)
    real_main(args)

    if args.dedup_db_file in (None, '', '/dev/null'):
        logging.info('deduplication disabled')
        dedup_db = None
    else:
        dedup_db = warcprox.dedup.DedupDb(args.dedup_db_file)
def ensure_rethinkdb_tables():
    '''
    Creates rethinkdb tables if they don't already exist. Warcprox normally
    creates the tables it needs on demand at startup, but if multiple instances
    are starting up at the same time, you can end up with duplicate broken
    tables. So it's a good idea to use this utility at an early step when
    spinning up a cluster.
    '''
    arg_parser = argparse.ArgumentParser(
            prog=os.path.basename(sys.argv[0]),
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    arg_parser.add_argument(
            '--rethinkdb-servers', dest='rethinkdb_servers', default='localhost',
            help='rethinkdb servers e.g. db0.foo.org,db0.foo.org:38015,db1.foo.org')
    arg_parser.add_argument(
            '--rethinkdb-db', dest='rethinkdb_db', default='warcprox',
            help='rethinkdb database name')
    arg_parser.add_argument(
            '-q', '--quiet', dest='log_level',
            action='store_const', default=logging.INFO, const=logging.WARN)
    arg_parser.add_argument(
            '-v', '--verbose', dest='log_level',
            action='store_const', default=logging.INFO, const=logging.DEBUG)
    args = arg_parser.parse_args(args=sys.argv[1:])

    recorded_url_q = queue.Queue()
    logging.basicConfig(
            stream=sys.stdout, level=args.log_level,
            format=(
                '%(asctime)s %(levelname)s %(name)s.%(funcName)s'
                '(%(filename)s:%(lineno)d) %(message)s'))

    ca_name = 'Warcprox CA on {}'.format(socket.gethostname())[:64]
    ca = certauth.certauth.CertificateAuthority(args.cacert, args.certs_dir,
            ca_name=ca_name)
    r = rethinkstuff.Rethinker(
            args.rethinkdb_servers.split(','), args.rethinkdb_db)

    proxy = warcprox.warcprox.WarcProxy(
            server_address=(args.address, int(args.port)), ca=ca,
            recorded_url_q=recorded_url_q,
            digest_algorithm=args.digest_algorithm)
    # services table
    rethinkstuff.ServiceRegistry(r)

    if args.playback_port is not None:
        playback_index_db = warcprox.playback.PlaybackIndexDb(args.playback_index_db_file)
        playback_server_address=(args.address, int(args.playback_port))
        playback_proxy = warcprox.playback.PlaybackProxy(server_address=playback_server_address,
                ca=ca, playback_index_db=playback_index_db,
                warcs_dir=args.directory)
    else:
        playback_index_db = None
        playback_proxy = None

    warc_writer = warcprox.warcwriter.WarcWriter(directory=args.directory,
            gzip=args.gzip, prefix=args.prefix, port=int(args.port),
            rollover_size=int(args.size), base32=args.base32,
            dedup_db=dedup_db, digest_algorithm=args.digest_algorithm,
            playback_index_db=playback_index_db)
    warc_writer_thread = warcprox.warcwriter.WarcWriterThread(
            recorded_url_q=recorded_url_q, warc_writer=warc_writer,
            rollover_idle_time=int(args.rollover_idle_time) if args.rollover_idle_time is not None else None)

    controller = warcprox.controller.WarcproxController(proxy, warc_writer_thread, playback_proxy)
    controller.run_until_shutdown()
    # stats table
    warcprox.stats.RethinkStatsDb(r)

    # captures table
    warcprox.bigtable.RethinkCaptures(r)

if __name__ == '__main__':
    main()
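ensure_rethinkdb_tables() parses its own sys.argv, so a deploy script that wants to invoke it from python rather than via a console entry point can do something like this sketch (the hostnames are hypothetical)::

    import sys
    import warcprox.main

    sys.argv = ['ensure-rethinkdb-tables',
                '--rethinkdb-servers', 'db0.example.org,db1.example.org',
                '--rethinkdb-db', 'warcprox']
    warcprox.main.ensure_rethinkdb_tables()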
@ -1,4 +1,28 @@
# vim:set sw=4 et:
'''
warcprox/mitmproxy.py - man-in-the-middle http/s proxy code, handles http
CONNECT method by creating a snakeoil certificate for the requested site,
calling ssl.wrap_socket() on the client connection; connects to remote
(proxied) host, possibly using tor if host tld is .onion and tor proxy is
configured

Copyright (C) 2012 Cygnos Corporation
Copyright (C) 2013-2016 Internet Archive

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
USA.
'''

from __future__ import absolute_import

@ -11,46 +35,194 @@ try:
    import urllib.parse as urllib_parse
except ImportError:
    import urlparse as urllib_parse

try:
    import http.client as http_client
except ImportError:
    import httplib as http_client
import socket
import logging
import ssl
import warcprox
import threading
import datetime
import socks
import tempfile
import hashlib
try:
    import socketserver
except ImportError:
    import SocketServer as socketserver
import resource
import concurrent.futures

class ProxyingRecorder(object):
    """
    Wraps a socket._fileobject, recording the bytes as they are read,
    calculating digests, and sending them on to the proxy client.
    """

    logger = logging.getLogger("warcprox.mitmproxy.ProxyingRecorder")

    def __init__(self, fp, proxy_client, digest_algorithm='sha1', url=None):
        self.fp = fp
        # "The file has no name, and will cease to exist when it is closed."
        self.tempfile = tempfile.SpooledTemporaryFile(max_size=512*1024)
        self.digest_algorithm = digest_algorithm
        self.block_digest = hashlib.new(digest_algorithm)
        self.payload_offset = None
        self.payload_digest = None
        self.proxy_client = proxy_client
        self._proxy_client_conn_open = True
        self.len = 0
        self.url = url

    def payload_starts_now(self):
        self.payload_digest = hashlib.new(self.digest_algorithm)
        self.payload_offset = self.len

    def _update_payload_digest(self, hunk):
        if self.payload_digest:
            self.payload_digest.update(hunk)

    def _update(self, hunk):
        self._update_payload_digest(hunk)
        self.block_digest.update(hunk)

        self.tempfile.write(hunk)

        if self.payload_digest and self._proxy_client_conn_open:
            try:
                self.proxy_client.sendall(hunk)
            except BaseException as e:
                self._proxy_client_conn_open = False
                self.logger.warn(
                        '%s sending data to proxy client for url %s',
                        e, self.url)
                self.logger.info(
                        'will continue downloading from remote server without '
                        'sending to client %s', self.url)

        self.len += len(hunk)

    def read(self, size=-1):
        hunk = self.fp.read(size)
        self._update(hunk)
        return hunk

    def readinto(self, b):
        n = self.fp.readinto(b)
        self._update(b[:n])
        return n

    def readline(self, size=-1):
        # XXX depends on implementation details of self.fp.readline(), in
        # particular that it doesn't call self.fp.read()
        hunk = self.fp.readline(size)
        self._update(hunk)
        return hunk

    def flush(self):
        return self.fp.flush()

    def close(self):
        return self.fp.close()

    def __len__(self):
        return self.len

    def payload_size(self):
        if self.payload_offset is not None:
            return self.len - self.payload_offset
        else:
            return 0

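The recorder pattern above in miniature: wrap a file-like object so bytes are digested as they stream past, without buffering the whole response in memory. This sketch substitutes io.BytesIO for the remote server socket::

    import hashlib
    import io

    class DigestingReader:
        def __init__(self, fp, algorithm='sha1'):
            self.fp = fp
            self.digest = hashlib.new(algorithm)

        def read(self, size=-1):
            # digest each hunk as it is read, then pass it along
            hunk = self.fp.read(size)
            self.digest.update(hunk)
            return hunk

    reader = DigestingReader(io.BytesIO(b'HTTP payload bytes'))
    while reader.read(8192):
        pass
    print(reader.digest.hexdigest())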
class ProxyingRecordingHTTPResponse(http_client.HTTPResponse):
    '''
    Implementation of HTTPResponse that uses a ProxyingRecorder to read the
    response from the remote web server and send it on to the proxy client,
    while recording the bytes in transit.
    '''
    def __init__(
            self, sock, debuglevel=0, method=None, proxy_client=None,
            digest_algorithm='sha1', url=None):
        http_client.HTTPResponse.__init__(
                self, sock, debuglevel=debuglevel, method=method)
        self.proxy_client = proxy_client
        self.url = url

        # Keep around extra reference to self.fp because HTTPResponse sets
        # self.fp=None after it finishes reading, but we still need it
        self.recorder = ProxyingRecorder(
                self.fp, proxy_client, digest_algorithm, url=url)
        self.fp = self.recorder

    def begin(self):
        http_client.HTTPResponse.begin(self)  # reads status line, headers

        status_and_headers = 'HTTP/1.1 {} {}\r\n'.format(
                self.status, self.reason)
        for k,v in self.msg.items():
            if k.lower() not in (
                    'connection', 'proxy-connection', 'keep-alive',
                    'proxy-authenticate', 'proxy-authorization', 'upgrade',
                    'strict-transport-security'):
                status_and_headers += '{}: {}\r\n'.format(k, v)
        status_and_headers += 'Connection: close\r\n\r\n'
        self.proxy_client.sendall(status_and_headers.encode('latin1'))

        self.recorder.payload_starts_now()

class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
    '''
    An http proxy implementation of BaseHTTPRequestHandler, that acts as a
    man-in-the-middle in order to peek at the content of https transactions,
    and records the bytes in transit as it proxies them.
    '''
    logger = logging.getLogger("warcprox.mitmproxy.MitmProxyHandler")

    def __init__(self, request, client_address, server):
        threading.current_thread().name = 'MitmProxyHandler(tid={},started={},client={}:{})'.format(warcprox.gettid(), datetime.datetime.utcnow().isoformat(), client_address[0], client_address[1])
        self.is_connect = False
        self._headers_buffer = []
        request.settimeout(60)  # XXX what value should this have?
        http_server.BaseHTTPRequestHandler.__init__(self, request, client_address, server)

    def _determine_host_port(self):
        # Get hostname and port to connect to
        if self.is_connect:
            self.hostname, self.port = self.path.split(':')
            host, self.port = self.path.split(':')
        else:
            self.url = self.path
            u = urllib_parse.urlparse(self.url)
            if u.scheme != 'http':
                raise Exception('Unknown scheme %s' % repr(u.scheme))
            self.hostname = u.hostname
                raise Exception(
                        'unable to parse request %s as a proxy request' % (
                            repr(self.requestline)))
            host = u.hostname
            self.port = u.port or 80
            self.path = urllib_parse.urlunparse(
                urllib_parse.ParseResult(
                    scheme='',
                    netloc='',
                    params=u.params,
                    path=u.path or '/',
                    query=u.query,
                    fragment=u.fragment
                )
            )
                    scheme='', netloc='', params=u.params, path=u.path or '/',
                    query=u.query, fragment=u.fragment))
        self.hostname = warcprox.normalize_host(host)

    def _connect_to_host(self):
    def _connect_to_remote_server(self):
        # Connect to destination
        self._proxy_sock = socket.socket()
        self._proxy_sock.settimeout(60)
        self._proxy_sock.connect((self.hostname, int(self.port)))
        if self.onion_tor_socks_proxy_host and self.hostname.lower().endswith('.onion'):
            self.logger.info("using tor socks proxy at %s:%s to connect to %s",
                    self.onion_tor_socks_proxy_host,
                    self.onion_tor_socks_proxy_port or 1080,
                    self.hostname)
            self._remote_server_sock = socks.socksocket()
            self._remote_server_sock.set_proxy(
                    socks.SOCKS5, addr=self.onion_tor_socks_proxy_host,
                    port=self.onion_tor_socks_proxy_port, rdns=True)
        else:
            self._remote_server_sock = socket.socket()

        # XXX what value should this timeout have?
        self._remote_server_sock.settimeout(60)
        self._remote_server_sock.connect((self.hostname, int(self.port)))

        # Wrap socket if SSL is required
        if self.is_connect:
@ -58,24 +230,44 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
            context = ssl.create_default_context()
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
            self._proxy_sock = context.wrap_socket(self._proxy_sock, server_hostname=self.hostname)
            self._remote_server_sock = context.wrap_socket(
                    self._remote_server_sock, server_hostname=self.hostname)
        except AttributeError:
            try:
                self._proxy_sock = ssl.wrap_socket(self._proxy_sock)
                self._remote_server_sock = ssl.wrap_socket(
                        self._remote_server_sock)
            except ssl.SSLError:
                self.logger.warn("failed to establish ssl connection to {}; python ssl library does not support SNI, considering upgrading to python >= 2.7.9 or python 3.4".format(self.hostname))
                self.logger.warn(
                        "failed to establish ssl connection to %s; python "
                        "ssl library does not support SNI, considering "
                        "upgrading to python >= 2.7.9 or python 3.4",
                        self.hostname)
                raise

        return self._remote_server_sock

    def _transition_to_ssl(self):
        self.request = self.connection = ssl.wrap_socket(self.connection,
                server_side=True, certfile=self.server.ca.cert_for_host(self.hostname))

    def do_CONNECT(self):
        '''
        Handles a http CONNECT request.

        The CONNECT method is meant to "convert the request connection to a
        transparent TCP/IP tunnel, usually to facilitate SSL-encrypted
        communication (HTTPS) through an unencrypted HTTP proxy" (Wikipedia).

        do_CONNECT is where the man-in-the-middle logic happens. In do_CONNECT
        the proxy transitions the proxy client connection to ssl while
        masquerading as the remote web server using a generated certificate.
        Meanwhile it makes its own separate ssl connection to the remote web
        server. Then it calls self.handle_one_request() again to handle the
        request intended for the remote server.
        '''
        self.is_connect = True
        try:
            # Connect to destination first
            self._determine_host_port()
            self._connect_to_host()

            # If successful, let's do this!
            self.send_response(200, 'Connection established')
@ -83,6 +275,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
            self._transition_to_ssl()
        except Exception as e:
            try:
                self.logger.error("problem handling {}: {}".format(repr(self.requestline), e))
                if type(e) is socket.timeout:
                    self.send_error(504, str(e))
                else:
@ -115,35 +308,162 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
        return result

    def do_COMMAND(self):
        if not self.is_connect:
            try:
                # Connect to destination
                self._determine_host_port()
                self._connect_to_host()
                assert self.url
            except Exception as e:
                self.send_error(500, str(e))
                return
        else:
        # if self.is_connect we already connected in do_CONNECT
        if self.is_connect:
            self.url = self._construct_tunneled_url()
        else:
            self._determine_host_port()
            assert self.url

        self._proxy_request()
        try:
            # Connect to destination
            self._connect_to_remote_server()
        except warcprox.RequestBlockedByRule as e:
            # limit enforcers have already sent the appropriate response
            self.logger.info("%s: %s", repr(self.requestline), e)
            return
        except Exception as e:
            self.logger.error("problem processing request {}: {}".format(repr(self.requestline), e), exc_info=True)
            self.send_error(500, str(e))
            return

        try:
            self._proxy_request()
        except:
            self.logger.error("exception proxying request", exc_info=True)
            raise

    def _proxy_request(self):
        raise Exception('_proxy_request() not implemented in MitmProxyHandler, must be implemented in subclass!')
        '''
        Sends the request to the remote server, then uses a ProxyingRecorder to
        read the response and send it to the proxy client, while recording the
        bytes in transit. Returns a tuple (request, response) where request is
        the raw request bytes, and response is a ProxyingRecorder.
        '''
        # Build request
        req_str = '{} {} {}\r\n'.format(
                self.command, self.path, self.request_version)

        # Swallow headers that don't make sense to forward on, i.e. most
        # hop-by-hop headers, see
        # http://tools.ietf.org/html/rfc2616#section-13.5.
        # self.headers is an email.message.Message, which is case-insensitive
        # and doesn't throw KeyError in __delitem__
        for key in (
                'Connection', 'Proxy-Connection', 'Keep-Alive',
                'Proxy-Authenticate', 'Proxy-Authorization', 'Upgrade'):
            del self.headers[key]

        # Add headers to the request
        # XXX in at least python3.3 str(self.headers) uses \n not \r\n :(
        req_str += '\r\n'.join(
                '{}: {}'.format(k,v) for (k,v) in self.headers.items())

        req = req_str.encode('latin1') + b'\r\n\r\n'

        # Append message body if present to the request
        if 'Content-Length' in self.headers:
            req += self.rfile.read(int(self.headers['Content-Length']))

        try:
            self.logger.debug('sending to remote server req=%s', repr(req))

            # Send it down the pipe!
            self._remote_server_sock.sendall(req)

            prox_rec_res = ProxyingRecordingHTTPResponse(
                    self._remote_server_sock, proxy_client=self.connection,
                    digest_algorithm=self.server.digest_algorithm,
                    url=self.url)
            prox_rec_res.begin()

            buf = prox_rec_res.read(8192)
            while buf != b'':
                buf = prox_rec_res.read(8192)

            self.log_request(prox_rec_res.status, prox_rec_res.recorder.len)
        except socket.timeout as e:
            self.logger.warn(
                    "%s proxying %s %s", repr(e), self.command, self.url)
        except BaseException as e:
            self.logger.error(
                    "%s proxying %s %s", repr(e), self.command, self.url,
                    exc_info=True)
        finally:
            # Let's close off the remote end
            if prox_rec_res:
                prox_rec_res.close()
            self._remote_server_sock.close()

        return req, prox_rec_res

    def __getattr__(self, item):
        if item.startswith('do_'):
            return self.do_COMMAND

    def log_error(self, fmt, *args):
        self.logger.error("{0} - - [{1}] {2}".format(self.address_string(),
                self.log_date_time_string(), fmt % args))
        self.logger.warn(fmt, *args)

    def log_message(self, fmt, *args):
        self.logger.info("{} {} - - [{}] {}".format(self.__class__.__name__,
                self.address_string(), self.log_date_time_string(), fmt % args))

class PooledMixIn(socketserver.ThreadingMixIn):
    logger = logging.getLogger("warcprox.mitmproxy.PooledMixIn")
    def __init__(self, max_threads=None):
        '''
        If max_threads is not supplied, calculates a reasonable value based
        on system resource limits.
        '''
        if not max_threads:
            # man getrlimit: "RLIMIT_NPROC The maximum number of processes (or,
            # more precisely on Linux, threads) that can be created for the
            # real user ID of the calling process."
            rlimit_nproc = resource.getrlimit(resource.RLIMIT_NPROC)[0]
            rlimit_nofile = resource.getrlimit(resource.RLIMIT_NOFILE)[0]
            max_threads = min(rlimit_nofile // 10, rlimit_nproc // 2)
            self.logger.info(
                    "max_threads=%s (rlimit_nproc=%s, rlimit_nofile=%s)",
                    max_threads, rlimit_nproc, rlimit_nofile)
        self.pool = concurrent.futures.ThreadPoolExecutor(max_threads)

    def process_request(self, request, client_address):
        self.pool.submit(self.process_request_thread, request, client_address)

class MitmProxy(http_server.HTTPServer):
    def finish_request(self, request, client_address):
        '''
        We override socketserver.BaseServer.finish_request to get at
        MitmProxyHandler's self.request. A normal socket server's self.request
        is set to `request` and never changes, but in our case, it may be
        replaced with an SSL socket. The caller of this method (e.g.
        self.process_request or PooledMitmProxy.process_request_thread) needs
        to get a hold of that socket so it can close it.
        '''
        req_handler = self.RequestHandlerClass(request, client_address, self)
        return req_handler.request

    def process_request(self, request, client_address):
        '''
        This is an almost verbatim copy/paste of
        socketserver.BaseServer.process_request.
        The only difference is that it expects self.finish_request to return
        the request (i.e. the socket). This new value of request is passed on
        to self.shutdown_request. See the comment on self.finish_request for
        the rationale.
        '''
        request = self.finish_request(request, client_address)
        self.shutdown_request(request)

class PooledMitmProxy(PooledMixIn, MitmProxy):
    def process_request_thread(self, request, client_address):
        '''
        This is an almost verbatim copy/paste of
        socketserver.ThreadingMixIn.process_request_thread.
        The only difference is that it expects self.finish_request to return
        the request (i.e. the socket). This new value of request is passed on
        to self.shutdown_request. See the comment on MitmProxy.finish_request
        for the rationale.
        '''
        try:
            request = self.finish_request(request, client_address)
            self.shutdown_request(request)
        except:
            self.handle_error(request, client_address)
            self.shutdown_request(request)

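To make the PooledMixIn sizing concrete: under hypothetical limits RLIMIT_NOFILE=4096 and RLIMIT_NPROC=15000, the pool gets min(4096 // 10, 15000 // 2) = min(409, 7500) = 409 threads. The same computation, standalone::

    import resource

    # soft limits for open files and processes/threads, as in PooledMixIn
    rlimit_nproc = resource.getrlimit(resource.RLIMIT_NPROC)[0]
    rlimit_nofile = resource.getrlimit(resource.RLIMIT_NOFILE)[0]
    print(min(rlimit_nofile // 10, rlimit_nproc // 2))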
@ -1,4 +1,24 @@
# vim:set sw=4 et:
'''
warcprox/playback.py - rudimentary support for playback of urls archived by
warcprox (not much used or maintained)

Copyright (C) 2013-2016 Internet Archive

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
USA.
'''

from __future__ import absolute_import

@ -12,14 +32,6 @@ try:
except ImportError:
    import SocketServer as socketserver

try:
    import dbm.gnu as dbm_gnu
except ImportError:
    try:
        import gdbm as dbm_gnu
    except ImportError:
        import anydbm as dbm_gnu

import logging
import os
from hanzo import warctools
@ -27,13 +39,14 @@ import json
import traceback
import re
from warcprox.mitmproxy import MitmProxyHandler
import warcprox

class PlaybackProxyHandler(MitmProxyHandler):
    logger = logging.getLogger("warcprox.playback.PlaybackProxyHandler")

    # @Override
    def _connect_to_host(self):
        # don't connect to host!
    def _connect_to_remote_server(self):
        # don't connect to any remote server!
        pass

@ -180,13 +193,14 @@ class PlaybackProxyHandler(MitmProxyHandler):
class PlaybackProxy(socketserver.ThreadingMixIn, http_server.HTTPServer):
    logger = logging.getLogger("warcprox.playback.PlaybackProxy")

    def __init__(self, server_address, req_handler_class=PlaybackProxyHandler,
            bind_and_activate=True, ca=None, playback_index_db=None,
            warcs_dir=None):
        http_server.HTTPServer.__init__(self, server_address, req_handler_class, bind_and_activate)

    def __init__(self, ca=None, playback_index_db=None, options=warcprox.Options()):
        server_address = (options.address or 'localhost', options.playback_port if options.playback_port is not None else 8001)
        http_server.HTTPServer.__init__(self, server_address, PlaybackProxyHandler, bind_and_activate=True)
        self.ca = ca
        self.playback_index_db = playback_index_db
        self.warcs_dir = warcs_dir
        self.warcs_dir = options.directory
        self.options = options

    def server_activate(self):
        http_server.HTTPServer.server_activate(self)
@ -201,6 +215,14 @@ class PlaybackIndexDb(object):
    logger = logging.getLogger("warcprox.playback.PlaybackIndexDb")

    def __init__(self, dbm_file='./warcprox-playback-index.db'):
        try:
            import dbm.gnu as dbm_gnu
        except ImportError:
            try:
                import gdbm as dbm_gnu
            except ImportError:
                import anydbm as dbm_gnu

        if os.path.exists(dbm_file):
            self.logger.info('opening existing playback index database {}'.format(dbm_file))
        else:
@ -217,6 +239,9 @@ class PlaybackIndexDb(object):
        except:
            pass

    def notify(self, recorded_url, records):
        self.save(records[0].warc_filename, records, records[0].offset)

    def save(self, warcfile, recordset, offset):
        response_record = recordset[0]
        # XXX canonicalize url?
303
warcprox/stats.py
Normal file
303
warcprox/stats.py
Normal file
@ -0,0 +1,303 @@
|
||||
'''
warcprox/stats.py - keeps statistics on what has been archived

Copyright (C) 2013-2016 Internet Archive

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
USA.
'''

from __future__ import absolute_import

import logging
import os
import json
from hanzo import warctools
import random
import warcprox
import threading
import rethinkdb as r
import datetime
import surt

def _empty_bucket(bucket):
    return {
        "bucket": bucket,
        "total": {
            "urls": 0,
            "wire_bytes": 0,
        },
        "new": {
            "urls": 0,
            "wire_bytes": 0,
        },
        "revisit": {
            "urls": 0,
            "wire_bytes": 0,
        },
    }

class StatsDb:
    logger = logging.getLogger("warcprox.stats.StatsDb")

    def __init__(self, dbm_file='./warcprox-stats.db', options=warcprox.Options()):
        try:
            import dbm.gnu as dbm_gnu
        except ImportError:
            try:
                import gdbm as dbm_gnu
            except ImportError:
                import anydbm as dbm_gnu

        if os.path.exists(dbm_file):
            self.logger.info('opening existing stats database {}'.format(dbm_file))
        else:
            self.logger.info('creating new stats database {}'.format(dbm_file))

        self.db = dbm_gnu.open(dbm_file, 'c')
        self.options = options

    def start(self):
        # method only exists to match RethinkStatsDb
        pass

    def stop(self):
        self.close()

    def close(self):
        self.db.close()

    def sync(self):
        try:
            self.db.sync()
        except:
            pass

    def value(self, bucket0="__all__", bucket1=None, bucket2=None):
        # Gdbm wants str/bytes keys in python2, str/unicode keys in python3.
        # This ugliness deals with keys that arrive as unicode in py2.
        b0 = bucket0.encode("utf-8") if bucket0 and not isinstance(bucket0, str) else bucket0
        b1 = bucket1.encode("utf-8") if bucket1 and not isinstance(bucket1, str) else bucket1
        b2 = bucket2.encode("utf-8") if bucket2 and not isinstance(bucket2, str) else bucket2

        if b0 in self.db:
            bucket0_stats = json.loads(self.db[b0].decode("utf-8"))
            if b1:
                if b2:
                    return bucket0_stats[b1][b2]
                else:
                    return bucket0_stats[b1]
            else:
                return bucket0_stats
        else:
            return None
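
A quick illustration (not part of the commit) of the three-level lookup
value() provides, with hypothetical counts::

    stats_db = StatsDb('./warcprox-stats.db')
    stats_db.value()                          # whole dict for bucket '__all__'
    stats_db.value('__all__', 'total')        # e.g. {'urls': 42, 'wire_bytes': 123456}
    stats_db.value('__all__', 'new', 'urls')  # e.g. 37
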
    def notify(self, recorded_url, records):
        self.tally(recorded_url, records)

    def buckets(self, recorded_url):
        '''
        Unravels bucket definitions in Warcprox-Meta header. Each bucket
        definition can either be a string, which signifies the name of the
        bucket, or a dict. If a dict it is expected to have at least an item
        with key 'bucket' whose value is the name of the bucket. The other
        currently recognized item is 'tally-domains', which if supplied should
        be a list of domains. This instructs warcprox to additionally tally
        substats of the given bucket by domain. Host stats are stored in the
        stats table under the key '{parent-bucket}:{domain(normalized)}'.

        Example Warcprox-Meta header (a real one will likely have other
        sections besides 'stats'):

        Warcprox-Meta: {'stats':{'buckets':['bucket1',{'bucket':'bucket2','tally-domains':['foo.bar.com','192.168.10.20']}]}}
        '''
        buckets = ["__all__"]
        if (recorded_url.warcprox_meta
                and "stats" in recorded_url.warcprox_meta
                and "buckets" in recorded_url.warcprox_meta["stats"]):
            for bucket in recorded_url.warcprox_meta["stats"]["buckets"]:
                if isinstance(bucket, dict):
                    if not 'bucket' in bucket:
                        self.logger.warn(
                                'ignoring invalid stats bucket in '
                                'warcprox-meta header %s', bucket)
                        continue
                    buckets.append(bucket['bucket'])
                    if bucket.get('tally-domains'):
                        url = warcprox.Url(recorded_url.url.decode('utf-8'))
                        for domain in bucket['tally-domains']:
                            if url.matches_ip_or_domain(domain):
                                buckets.append('%s:%s' % (
                                    bucket['bucket'],
                                    warcprox.normalize_host(domain)))
                else:
                    buckets.append(bucket)
        else:
            buckets.append("__unspecified__")

        return buckets
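
To make the docstring above concrete, here is a sketch (not part of the
commit) of what buckets() yields for the example header, assuming the
fetched url is http://foo.bar.com/ so the 'tally-domains' entry matches::

    # Warcprox-Meta: {"stats":{"buckets":["bucket1",
    #         {"bucket":"bucket2","tally-domains":["foo.bar.com","192.168.10.20"]}]}}
    buckets = ["__all__", "bucket1", "bucket2", "bucket2:foo.bar.com"]
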
    def tally(self, recorded_url, records):
        for bucket in self.buckets(recorded_url):
            # Gdbm wants str/bytes keys in python2, str/unicode keys in python3.
            # This ugliness deals with keys that arrive as unicode in py2.
            b = bucket.encode("utf-8") if bucket and not isinstance(bucket, str) else bucket
            if b in self.db:
                bucket_stats = json.loads(self.db[b].decode("utf-8"))
            else:
                bucket_stats = _empty_bucket(b)

            bucket_stats["total"]["urls"] += 1
            bucket_stats["total"]["wire_bytes"] += recorded_url.size

            if records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.REVISIT:
                bucket_stats["revisit"]["urls"] += 1
                bucket_stats["revisit"]["wire_bytes"] += recorded_url.size
            else:
                bucket_stats["new"]["urls"] += 1
                bucket_stats["new"]["wire_bytes"] += recorded_url.size

            self.db[b] = json.dumps(bucket_stats, separators=(',',':')).encode("utf-8")

class RethinkStatsDb(StatsDb):
    """Updates database in batch every 2.0 seconds"""
    logger = logging.getLogger("warcprox.stats.RethinkStatsDb")

    def __init__(self, rethinker, table="stats", shards=None, replicas=None, options=warcprox.Options()):
        self.r = rethinker
        self.table = table
        self.shards = shards or 1  # 1 shard by default because it's probably a small table
        self.replicas = replicas or min(3, len(self.r.servers))
        self._ensure_db_table()
        self.options = options

        self._stop = threading.Event()
        self._batch_lock = threading.RLock()
        with self._batch_lock:
            self._batch = {}
        self._timer = None

    def start(self):
        """Starts batch update repeating timer."""
        self._update_batch()  # starts repeating timer

    def _bucket_batch_update_reql(self, bucket):
        return self.r.table(self.table).get(bucket).replace(
                lambda old: r.branch(
                    old.eq(None), self._batch[bucket], old.merge({
                        "total": {
                            "urls": old["total"]["urls"].add(
                                self._batch[bucket]["total"]["urls"]),
                            "wire_bytes": old["total"]["wire_bytes"].add(
                                self._batch[bucket]["total"]["wire_bytes"]),
                        },
                        "new": {
                            "urls": old["new"]["urls"].add(
                                self._batch[bucket]["new"]["urls"]),
                            "wire_bytes": old["new"]["wire_bytes"].add(
                                self._batch[bucket]["new"]["wire_bytes"]),
                        },
                        "revisit": {
                            "urls": old["revisit"]["urls"].add(
                                self._batch[bucket]["revisit"]["urls"]),
                            "wire_bytes": old["revisit"]["wire_bytes"].add(
                                self._batch[bucket]["revisit"]["wire_bytes"]),
                        },
                    })))

    def _update_batch(self):
        with self._batch_lock:
            if len(self._batch) > 0:
                # XXX can all the buckets be done in one query?
                for bucket in self._batch:
                    result = self._bucket_batch_update_reql(bucket).run()
                    if (not result["inserted"] and not result["replaced"]
                            or sorted(result.values()) != [0,0,0,0,0,1]):
                        raise Exception(
                                "unexpected result %s updating stats %s" % (
                                    result, self._batch[bucket]))
                self._batch = {}

        if not self._stop.is_set():
            self._timer = threading.Timer(2.0, self._update_batch)
            self._timer.name = "RethinkStats-batch-update-timer-%s" % (
                    datetime.datetime.utcnow().isoformat())
            self._timer.start()
        else:
            self.logger.info("finished")
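
The "batch every 2.0 seconds" behavior in the class docstring comes from the
self-rescheduling threading.Timer above. A stripped-down sketch of the same
pattern (hypothetical class, not part of the commit)::

    import threading

    class BatchFlusher:
        def __init__(self):
            self._stop = threading.Event()
            self._timer = None

        def _flush(self):
            # ... apply accumulated updates here ...
            if not self._stop.is_set():
                # each run schedules the next one
                self._timer = threading.Timer(2.0, self._flush)
                self._timer.start()

        def start(self):
            self._flush()

        def stop(self):
            self._stop.set()
            if self._timer:
                self._timer.join()  # wait out the last scheduled run
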
    def _ensure_db_table(self):
        dbs = self.r.db_list().run()
        if not self.r.dbname in dbs:
            self.logger.info(
                    "creating rethinkdb database %s", repr(self.r.dbname))
            self.r.db_create(self.r.dbname).run()
        tables = self.r.table_list().run()
        if not self.table in tables:
            self.logger.info(
                    "creating rethinkdb table %s in database %s shards=%s "
                    "replicas=%s", repr(self.table), repr(self.r.dbname),
                    self.shards, self.replicas)
            self.r.table_create(
                    self.table, primary_key="bucket", shards=self.shards,
                    replicas=self.replicas).run()

    def close(self):
        self.stop()

    def stop(self):
        self.logger.info("stopping rethinkdb stats table batch updates")
        self._stop.set()
        if self._timer:
            self._timer.join()

    def sync(self):
        pass

    def value(self, bucket0="__all__", bucket1=None, bucket2=None):
        bucket0_stats = self.r.table(self.table).get(bucket0).run()
        self.logger.debug(
                'stats db lookup of bucket=%s returned %s',
                bucket0, bucket0_stats)
        if bucket0_stats:
            if bucket1:
                if bucket2:
                    return bucket0_stats[bucket1][bucket2]
                else:
                    return bucket0_stats[bucket1]
        return bucket0_stats

    def tally(self, recorded_url, records):
        buckets = self.buckets(recorded_url)
        is_revisit = records[0].get_header(
                warctools.WarcRecord.TYPE) == warctools.WarcRecord.REVISIT
        with self._batch_lock:
            for bucket in buckets:
                bucket_stats = self._batch.setdefault(
                        bucket, _empty_bucket(bucket))

                bucket_stats["total"]["urls"] += 1
                bucket_stats["total"]["wire_bytes"] += recorded_url.size

                if is_revisit:
                    bucket_stats["revisit"]["urls"] += 1
                    bucket_stats["revisit"]["wire_bytes"] += recorded_url.size
                else:
                    bucket_stats["new"]["urls"] += 1
                    bucket_stats["new"]["wire_bytes"] += recorded_url.size

    def notify(self, recorded_url, records):
        self.tally(recorded_url, records)
@ -1,414 +0,0 @@
#!/usr/bin/env python
# vim: set sw=4 et:

import unittest
import threading
import time
import logging
import sys
import ssl
import re
import tempfile
import OpenSSL
import os
import shutil
import requests

try:
    import http.server as http_server
except ImportError:
    import BaseHTTPServer as http_server

try:
    import queue
except ImportError:
    import Queue as queue

import certauth.certauth

import warcprox.controller
import warcprox.warcprox
import warcprox.playback
import warcprox.warcwriter
import warcprox.dedup

class TestHttpRequestHandler(http_server.BaseHTTPRequestHandler):
    logger = logging.getLogger('TestHttpRequestHandler')

    def do_GET(self):
        self.logger.info('GET {}'.format(self.path))

        m = re.match(r'^/([^/]+)/([^/]+)$', self.path)
        if m is not None:
            special_header = 'warcprox-test-header: {}!'.format(m.group(1)).encode('utf-8')
            payload = 'I am the warcprox test payload! {}!\n'.format(10*m.group(2)).encode('utf-8')
            headers = (b'HTTP/1.1 200 OK\r\n'
                    + b'Content-Type: text/plain\r\n'
                    + special_header + b'\r\n'
                    + b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n'
                    + b'\r\n')
        else:
            payload = b'404 Not Found\n'
            headers = (b'HTTP/1.1 404 Not Found\r\n'
                    + b'Content-Type: text/plain\r\n'
                    + b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n'
                    + b'\r\n')

        self.connection.sendall(headers)
        self.connection.sendall(payload)


class WarcproxTest(unittest.TestCase):
    logger = logging.getLogger('WarcproxTest')

    def __init__(self, methodName='runTest'):
        self.__cert = None
        unittest.TestCase.__init__(self, methodName)

    @property
    def _cert(self):
        if self.__cert is None:
            f = tempfile.NamedTemporaryFile(prefix='warcprox-test-https-', suffix='.pem', delete=False)
            try:
                key = OpenSSL.crypto.PKey()
                key.generate_key(OpenSSL.crypto.TYPE_RSA, 2048)
                req = OpenSSL.crypto.X509Req()
                req.get_subject().CN = 'localhost'
                req.set_pubkey(key)
                req.sign(key, 'sha1')
                cert = OpenSSL.crypto.X509()
                cert.set_subject(req.get_subject())
                cert.set_serial_number(0)
                cert.gmtime_adj_notBefore(0)
                cert.gmtime_adj_notAfter(2*60*60)  # valid for 2hrs
                cert.set_issuer(cert.get_subject())
                cert.set_pubkey(req.get_pubkey())
                cert.sign(key, 'sha1')

                f.write(OpenSSL.crypto.dump_privatekey(OpenSSL.SSL.FILETYPE_PEM, key))
                f.write(OpenSSL.crypto.dump_certificate(OpenSSL.SSL.FILETYPE_PEM, cert))

                self.logger.info('generated self-signed certificate {}'.format(f.name))
                self.__cert = f.name
            finally:
                f.close()

        return self.__cert

    def _start_http_servers(self):
        self.http_daemon = http_server.HTTPServer(('localhost', 0),
                RequestHandlerClass=TestHttpRequestHandler)
        self.logger.info('starting http://{}:{}'.format(self.http_daemon.server_address[0], self.http_daemon.server_address[1]))
        self.http_daemon_thread = threading.Thread(name='HttpdThread',
                target=self.http_daemon.serve_forever)
        self.http_daemon_thread.start()

        # http://www.piware.de/2011/01/creating-an-https-server-in-python/
        self.https_daemon = http_server.HTTPServer(('localhost', 0),
                RequestHandlerClass=TestHttpRequestHandler)
        # self.https_daemon.socket = ssl.wrap_socket(httpd.socket, certfile='path/to/localhost.pem', server_side=True)
        self.https_daemon.socket = ssl.wrap_socket(self.https_daemon.socket, certfile=self._cert, server_side=True)
        self.logger.info('starting https://{}:{}'.format(self.https_daemon.server_address[0], self.https_daemon.server_address[1]))
        self.https_daemon_thread = threading.Thread(name='HttpdThread',
                target=self.https_daemon.serve_forever)
        self.https_daemon_thread.start()

    def _start_warcprox(self):
        f = tempfile.NamedTemporaryFile(prefix='warcprox-test-ca-', suffix='.pem', delete=True)
        f.close()  # delete it, or CertificateAuthority will try to read it
        self._ca_file = f.name
        self._ca_dir = tempfile.mkdtemp(prefix='warcprox-test-', suffix='-ca')
        ca = certauth.certauth.CertificateAuthority(self._ca_file, self._ca_dir, 'warcprox-test')

        recorded_url_q = queue.Queue()

        proxy = warcprox.warcprox.WarcProxy(server_address=('localhost', 0), ca=ca,
                recorded_url_q=recorded_url_q)

        self._warcs_dir = tempfile.mkdtemp(prefix='warcprox-test-warcs-')

        f = tempfile.NamedTemporaryFile(prefix='warcprox-test-playback-index-', suffix='.db', delete=False)
        f.close()
        self._playback_index_db_file = f.name
        playback_index_db = warcprox.playback.PlaybackIndexDb(self._playback_index_db_file)
        playback_proxy = warcprox.playback.PlaybackProxy(server_address=('localhost', 0), ca=ca,
                playback_index_db=playback_index_db, warcs_dir=self._warcs_dir)

        f = tempfile.NamedTemporaryFile(prefix='warcprox-test-dedup-', suffix='.db', delete=False)
        f.close()
        self._dedup_db_file = f.name
        dedup_db = warcprox.dedup.DedupDb(self._dedup_db_file)

        warc_writer = warcprox.warcwriter.WarcWriter(directory=self._warcs_dir,
                port=proxy.server_port, dedup_db=dedup_db,
                playback_index_db=playback_index_db)
        warc_writer_thread = warcprox.warcwriter.WarcWriterThread(recorded_url_q=recorded_url_q,
                warc_writer=warc_writer)

        self.warcprox = warcprox.controller.WarcproxController(proxy, warc_writer_thread, playback_proxy)
        self.logger.info('starting warcprox')
        self.warcprox_thread = threading.Thread(name='WarcproxThread',
                target=self.warcprox.run_until_shutdown)
        self.warcprox_thread.start()

    def setUp(self):
        logging.basicConfig(stream=sys.stdout, level=logging.DEBUG,
                format='%(asctime)s %(levelname)s %(process)d %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')

        self._start_http_servers()
        self._start_warcprox()

        archiving_proxy = 'http://localhost:{}'.format(self.warcprox.proxy.server_port)
        self.archiving_proxies = {'http':archiving_proxy, 'https':archiving_proxy}

        playback_proxy = 'http://localhost:{}'.format(self.warcprox.playback_proxy.server_port)
        self.playback_proxies = {'http':playback_proxy, 'https':playback_proxy}

    def tearDown(self):
        self.logger.info('stopping warcprox')
        self.warcprox.stop.set()

        self.logger.info('stopping http and https daemons')
        self.http_daemon.shutdown()
        self.https_daemon.shutdown()
        self.http_daemon.server_close()
        self.https_daemon.server_close()

        # Have to wait for threads to finish or the threads will try to use
        # variables that no longer exist, resulting in errors like this:
        # File "/usr/lib/python2.7/SocketServer.py", line 235, in serve_forever
        #     r, w, e = _eintr_retry(select.select, [self], [], [],
        # AttributeError: 'NoneType' object has no attribute 'select'
        self.http_daemon_thread.join()
        self.https_daemon_thread.join()
        self.warcprox_thread.join()

        for f in (self.__cert, self._ca_file, self._ca_dir, self._warcs_dir, self._playback_index_db_file, self._dedup_db_file):
            if os.path.isdir(f):
                self.logger.info('deleting directory {}'.format(f))
                shutil.rmtree(f)
            else:
                self.logger.info('deleting file {}'.format(f))
                os.unlink(f)

    def _test_httpds_no_proxy(self):
        url = 'http://localhost:{}/'.format(self.http_daemon.server_port)
        response = requests.get(url)
        self.assertEqual(response.status_code, 404)
        self.assertEqual(response.content, b'404 Not Found\n')

        url = 'https://localhost:{}/'.format(self.https_daemon.server_port)
        response = requests.get(url, verify=False)
        self.assertEqual(response.status_code, 404)
        self.assertEqual(response.content, b'404 Not Found\n')

        url = 'http://localhost:{}/a/b'.format(self.http_daemon.server_port)
        response = requests.get(url)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.headers['warcprox-test-header'], 'a!')
        self.assertEqual(response.content, b'I am the warcprox test payload! bbbbbbbbbb!\n')

        url = 'https://localhost:{}/c/d'.format(self.https_daemon.server_port)
        response = requests.get(url, verify=False)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.headers['warcprox-test-header'], 'c!')
        self.assertEqual(response.content, b'I am the warcprox test payload! dddddddddd!\n')

    def poll_playback_until(self, url, status, timeout_sec):
        start = time.time()
        # check playback (warc writing is asynchronous, give it up to 10 sec)
        while time.time() - start < timeout_sec:
            response = requests.get(url, proxies=self.playback_proxies, verify=False)
            if response.status_code == status:
                break
            time.sleep(0.5)

        return response

    def _test_archive_and_playback_http_url(self):
        url = 'http://localhost:{}/a/b'.format(self.http_daemon.server_port)

        # ensure playback fails before archiving
        response = requests.get(url, proxies=self.playback_proxies)
        self.assertEqual(response.status_code, 404)
        self.assertEqual(response.content, b'404 Not in Archive\n')

        # archive
        response = requests.get(url, proxies=self.archiving_proxies)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.headers['warcprox-test-header'], 'a!')
        self.assertEqual(response.content, b'I am the warcprox test payload! bbbbbbbbbb!\n')

        response = self.poll_playback_until(url, status=200, timeout_sec=10)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.headers['warcprox-test-header'], 'a!')
        self.assertEqual(response.content, b'I am the warcprox test payload! bbbbbbbbbb!\n')

    def _test_archive_and_playback_https_url(self):
        url = 'https://localhost:{}/c/d'.format(self.https_daemon.server_port)

        # ensure playback fails before archiving
        response = requests.get(url, proxies=self.playback_proxies, verify=False)
        self.assertEqual(response.status_code, 404)
        self.assertEqual(response.content, b'404 Not in Archive\n')

        # fetch & archive response
        response = requests.get(url, proxies=self.archiving_proxies, verify=False)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.headers['warcprox-test-header'], 'c!')
        self.assertEqual(response.content, b'I am the warcprox test payload! dddddddddd!\n')

        # test playback
        response = self.poll_playback_until(url, status=200, timeout_sec=10)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.headers['warcprox-test-header'], 'c!')
        self.assertEqual(response.content, b'I am the warcprox test payload! dddddddddd!\n')

    # test dedup of same http url with same payload
    def _test_dedup_http(self):
        url = 'http://localhost:{}/e/f'.format(self.http_daemon.server_port)

        # ensure playback fails before archiving
        response = requests.get(url, proxies=self.playback_proxies, verify=False)
        self.assertEqual(response.status_code, 404)
        self.assertEqual(response.content, b'404 Not in Archive\n')

        # check not in dedup db
        dedup_lookup = self.warcprox.warc_writer_thread.warc_writer.dedup_db.lookup(b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc')
        self.assertIsNone(dedup_lookup)

        # archive
        response = requests.get(url, proxies=self.archiving_proxies, verify=False)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.headers['warcprox-test-header'], 'e!')
        self.assertEqual(response.content, b'I am the warcprox test payload! ffffffffff!\n')

        # test playback
        response = self.poll_playback_until(url, status=200, timeout_sec=10)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.headers['warcprox-test-header'], 'e!')
        self.assertEqual(response.content, b'I am the warcprox test payload! ffffffffff!\n')

        # check in dedup db
        # {u'i': u'<urn:uuid:e691dc0f-4bb9-4ad8-9afb-2af836aa05e4>', u'u': u'https://localhost:62841/c/d', u'd': u'2013-11-22T00:14:37Z'}
        dedup_lookup = self.warcprox.warc_writer_thread.warc_writer.dedup_db.lookup(b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc')
        self.assertEqual(dedup_lookup['u'], url.encode('ascii'))
        self.assertRegexpMatches(dedup_lookup['i'], br'^<urn:uuid:[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}>$')
        self.assertRegexpMatches(dedup_lookup['d'], br'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$')
        record_id = dedup_lookup['i']
        dedup_date = dedup_lookup['d']

        # need revisit to have a later timestamp than original, else playing
        # back the latest record might not hit the revisit
        time.sleep(1.5)

        # fetch & archive revisit
        response = requests.get(url, proxies=self.archiving_proxies, verify=False)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.headers['warcprox-test-header'], 'e!')
        self.assertEqual(response.content, b'I am the warcprox test payload! ffffffffff!\n')

        # XXX need to give warc writer thread a chance, and we don't have any change to poll for :-\
        time.sleep(2.0)

        # check in dedup db (no change from prev)
        dedup_lookup = self.warcprox.warc_writer_thread.warc_writer.dedup_db.lookup(b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc')
        self.assertEqual(dedup_lookup['u'], url.encode('ascii'))
        self.assertEqual(dedup_lookup['i'], record_id)
        self.assertEqual(dedup_lookup['d'], dedup_date)

        # test playback
        self.logger.debug('testing playback of revisit of {}'.format(url))
        response = self.poll_playback_until(url, status=200, timeout_sec=10)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.headers['warcprox-test-header'], 'e!')
        self.assertEqual(response.content, b'I am the warcprox test payload! ffffffffff!\n')
        # XXX how to check dedup was used?

    # test dedup of same https url with same payload
    def _test_dedup_https(self):
        url = 'https://localhost:{}/g/h'.format(self.https_daemon.server_port)

        # ensure playback fails before archiving
        response = requests.get(url, proxies=self.playback_proxies, verify=False)
        self.assertEqual(response.status_code, 404)
        self.assertEqual(response.content, b'404 Not in Archive\n')

        # check not in dedup db
        dedup_lookup = self.warcprox.warc_writer_thread.warc_writer.dedup_db.lookup(b'sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89')
        self.assertIsNone(dedup_lookup)

        # archive
        response = requests.get(url, proxies=self.archiving_proxies, verify=False)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.headers['warcprox-test-header'], 'g!')
        self.assertEqual(response.content, b'I am the warcprox test payload! hhhhhhhhhh!\n')

        # test playback
        response = self.poll_playback_until(url, status=200, timeout_sec=10)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.headers['warcprox-test-header'], 'g!')
        self.assertEqual(response.content, b'I am the warcprox test payload! hhhhhhhhhh!\n')

        # check in dedup db
        # {u'i': u'<urn:uuid:e691dc0f-4bb9-4ad8-9afb-2af836aa05e4>', u'u': u'https://localhost:62841/c/d', u'd': u'2013-11-22T00:14:37Z'}
        dedup_lookup = self.warcprox.warc_writer_thread.warc_writer.dedup_db.lookup(b'sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89')
        self.assertEqual(dedup_lookup['u'], url.encode('ascii'))
        self.assertRegexpMatches(dedup_lookup['i'], br'^<urn:uuid:[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}>$')
        self.assertRegexpMatches(dedup_lookup['d'], br'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$')
        record_id = dedup_lookup['i']
        dedup_date = dedup_lookup['d']

        # need revisit to have a later timestamp than original, else playing
        # back the latest record might not hit the revisit
        time.sleep(1.5)

        # fetch & archive revisit
        response = requests.get(url, proxies=self.archiving_proxies, verify=False)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.headers['warcprox-test-header'], 'g!')
        self.assertEqual(response.content, b'I am the warcprox test payload! hhhhhhhhhh!\n')

        # XXX need to give warc writer thread a chance, and we don't have any change to poll for :-\
        time.sleep(2.0)

        # check in dedup db (no change from prev)
        dedup_lookup = self.warcprox.warc_writer_thread.warc_writer.dedup_db.lookup(b'sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89')
        self.assertEqual(dedup_lookup['u'], url.encode('ascii'))
        self.assertEqual(dedup_lookup['i'], record_id)
        self.assertEqual(dedup_lookup['d'], dedup_date)

        # test playback
        self.logger.debug('testing playback of revisit of {}'.format(url))
        response = self.poll_playback_until(url, status=200, timeout_sec=10)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.headers['warcprox-test-header'], 'g!')
        self.assertEqual(response.content, b'I am the warcprox test payload! hhhhhhhhhh!\n')
        # XXX how to check dedup was used?

    # run everything from here, otherwise it wants to setUp() and tearDown
    # around each test
    def runTest(self):
        self._test_httpds_no_proxy()
        self._test_archive_and_playback_http_url()
        self._test_archive_and_playback_https_url()
        self._test_dedup_http()
        self._test_dedup_https()
        # self._test_dedup_mixed_http()
        # self._test_dedup_mixed_https()


if __name__ == '__main__':
    unittest.main()
171
warcprox/warc.py
Normal file
@ -0,0 +1,171 @@
#
# warcprox/warc.py - assembles warc records
#
# Copyright (C) 2013-2016 Internet Archive
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
# USA.
#

from __future__ import absolute_import

import logging
import warcprox
import hashlib
import socket
import hanzo.httptools
from hanzo import warctools
import warcprox
import datetime

class WarcRecordBuilder:
    logger = logging.getLogger("warcprox.warc.WarcRecordBuilder")

    def __init__(self, digest_algorithm="sha1", base32=False):
        self.digest_algorithm = digest_algorithm
        self.base32 = base32

    def _build_response_principal_record(self, recorded_url, warc_date):
        """Builds response or revisit record, whichever is appropriate."""
        if hasattr(recorded_url, "dedup_info") and recorded_url.dedup_info:
            # revisit record
            recorded_url.response_recorder.tempfile.seek(0)
            if recorded_url.response_recorder.payload_offset is not None:
                response_header_block = recorded_url.response_recorder.tempfile.read(recorded_url.response_recorder.payload_offset)
            else:
                response_header_block = recorded_url.response_recorder.tempfile.read()

            return self.build_warc_record(
                    url=recorded_url.url, warc_date=warc_date,
                    data=response_header_block,
                    warc_type=warctools.WarcRecord.REVISIT,
                    refers_to=recorded_url.dedup_info['id'],
                    refers_to_target_uri=recorded_url.dedup_info['url'],
                    refers_to_date=recorded_url.dedup_info['date'],
                    payload_digest=warcprox.digest_str(recorded_url.response_recorder.payload_digest, self.base32),
                    profile=warctools.WarcRecord.PROFILE_IDENTICAL_PAYLOAD_DIGEST,
                    content_type=hanzo.httptools.ResponseMessage.CONTENT_TYPE,
                    remote_ip=recorded_url.remote_ip)
        else:
            # response record
            return self.build_warc_record(
                    url=recorded_url.url, warc_date=warc_date,
                    recorder=recorded_url.response_recorder,
                    warc_type=warctools.WarcRecord.RESPONSE,
                    content_type=hanzo.httptools.ResponseMessage.CONTENT_TYPE,
                    remote_ip=recorded_url.remote_ip)

    def build_warc_records(self, recorded_url):
        """Returns a tuple of hanzo.warctools.warc.WarcRecord (principal_record, ...)"""
        warc_date = warctools.warc.warc_datetime_str(recorded_url.timestamp)

        if recorded_url.response_recorder:
            principal_record = self._build_response_principal_record(recorded_url, warc_date)
            request_record = self.build_warc_record(url=recorded_url.url,
                    warc_date=warc_date, data=recorded_url.request_data,
                    warc_type=warctools.WarcRecord.REQUEST,
                    content_type=hanzo.httptools.RequestMessage.CONTENT_TYPE,
                    concurrent_to=principal_record.id)
            return principal_record, request_record
        else:
            principal_record = self.build_warc_record(url=recorded_url.url,
                    warc_date=warc_date, data=recorded_url.request_data,
                    warc_type=recorded_url.custom_type,
                    content_type=recorded_url.content_type.encode("latin1"))
            return (principal_record,)

    def build_warc_record(self, url, warc_date=None, recorder=None, data=None,
            concurrent_to=None, warc_type=None, content_type=None, remote_ip=None,
            profile=None, refers_to=None, refers_to_target_uri=None,
            refers_to_date=None, payload_digest=None):

        if warc_date is None:
            warc_date = warctools.warc.warc_datetime_str(datetime.datetime.utcnow())

        record_id = warctools.WarcRecord.random_warc_uuid()

        headers = []
        if warc_type is not None:
            headers.append((warctools.WarcRecord.TYPE, warc_type))
        headers.append((warctools.WarcRecord.ID, record_id))
        headers.append((warctools.WarcRecord.DATE, warc_date))
        headers.append((warctools.WarcRecord.URL, url))
        if remote_ip is not None:
            headers.append((warctools.WarcRecord.IP_ADDRESS, remote_ip))
        if profile is not None:
            headers.append((warctools.WarcRecord.PROFILE, profile))
        if refers_to is not None:
            headers.append((warctools.WarcRecord.REFERS_TO, refers_to))
        if refers_to_target_uri is not None:
            headers.append((warctools.WarcRecord.REFERS_TO_TARGET_URI, refers_to_target_uri))
        if refers_to_date is not None:
            headers.append((warctools.WarcRecord.REFERS_TO_DATE, refers_to_date))
        if concurrent_to is not None:
            headers.append((warctools.WarcRecord.CONCURRENT_TO, concurrent_to))
        if content_type is not None:
            headers.append((warctools.WarcRecord.CONTENT_TYPE, content_type))
        if payload_digest is not None:
            headers.append((warctools.WarcRecord.PAYLOAD_DIGEST, payload_digest))

        if recorder is not None:
            headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(recorder)).encode('latin1')))
            headers.append((warctools.WarcRecord.BLOCK_DIGEST,
                warcprox.digest_str(recorder.block_digest, self.base32)))
            if recorder.payload_digest is not None:
                headers.append((warctools.WarcRecord.PAYLOAD_DIGEST,
                    warcprox.digest_str(recorder.payload_digest, self.base32)))

            recorder.tempfile.seek(0)
            record = warctools.WarcRecord(headers=headers, content_file=recorder.tempfile)

        else:
            headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(data)).encode('latin1')))
            digest = hashlib.new(self.digest_algorithm, data)
            headers.append((warctools.WarcRecord.BLOCK_DIGEST,
                warcprox.digest_str(digest, self.base32)))
            if not payload_digest:
                headers.append((warctools.WarcRecord.PAYLOAD_DIGEST,
                    warcprox.digest_str(digest, self.base32)))

            content_tuple = content_type, data
            record = warctools.WarcRecord(headers=headers, content=content_tuple)

        return record

    def build_warcinfo_record(self, filename):
        warc_record_date = warctools.warc.warc_datetime_str(datetime.datetime.utcnow())
        record_id = warctools.WarcRecord.random_warc_uuid()

        headers = []
        headers.append((warctools.WarcRecord.ID, record_id))
        headers.append((warctools.WarcRecord.TYPE, warctools.WarcRecord.WARCINFO))
        headers.append((warctools.WarcRecord.FILENAME, filename.encode('latin1')))
        headers.append((warctools.WarcRecord.DATE, warc_record_date))

        warcinfo_fields = []
        warcinfo_fields.append(b'software: warcprox ' + warcprox.__version__.encode('latin1'))
        hostname = socket.gethostname()
        warcinfo_fields.append('hostname: {}'.format(hostname).encode('latin1'))
        warcinfo_fields.append('ip: {}'.format(socket.gethostbyname(hostname)).encode('latin1'))
        warcinfo_fields.append(b'format: WARC File Format 1.0')
        # warcinfo_fields.append('robots: ignore')
        # warcinfo_fields.append('description: {0}'.format(self.description))
        # warcinfo_fields.append('isPartOf: {0}'.format(self.is_part_of))
        data = b'\r\n'.join(warcinfo_fields) + b'\r\n'

        record = warctools.WarcRecord(headers=headers, content=(b'application/warc-fields', data))

        return record
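
A brief sketch (not from the commit) of driving WarcRecordBuilder directly
to assemble a standalone metadata record, assuming warcprox and warctools
are importable::

    from hanzo import warctools
    import warcprox.warc

    builder = warcprox.warc.WarcRecordBuilder(digest_algorithm='sha1', base32=True)
    record = builder.build_warc_record(
            url=b'metadata://example.com/',
            data=b'some notes about the crawl',
            warc_type=warctools.WarcRecord.METADATA,
            content_type=b'text/plain')
    # record is a hanzo.warctools.warc.WarcRecord, ready to be written to a warc
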
@ -1,272 +0,0 @@
#!/usr/bin/env python
# vim:set sw=4 et:
#
"""
WARC writing MITM HTTP/S proxy

See README.rst or https://github.com/internetarchive/warcprox
"""

from __future__ import absolute_import

try:
    import http.server as http_server
except ImportError:
    import BaseHTTPServer as http_server

try:
    import socketserver
except ImportError:
    import SocketServer as socketserver

try:
    import queue
except ImportError:
    import Queue as queue

try:
    import http.client as http_client
except ImportError:
    import httplib as http_client

import logging
import re
import tempfile
import traceback
import hashlib
import json
import socket

from certauth.certauth import CertificateAuthority
import warcprox.mitmproxy

class ProxyingRecorder(object):
    """
    Wraps a socket._fileobject, recording the bytes as they are read,
    calculating digests, and sending them on to the proxy client.
    """

    logger = logging.getLogger("warcprox.warcprox.ProxyingRecorder")

    def __init__(self, fp, proxy_dest, digest_algorithm='sha1'):
        self.fp = fp
        # "The file has no name, and will cease to exist when it is closed."
        self.tempfile = tempfile.SpooledTemporaryFile(max_size=512*1024)
        self.digest_algorithm = digest_algorithm
        self.block_digest = hashlib.new(digest_algorithm)
        self.payload_offset = None
        self.payload_digest = None
        self.proxy_dest = proxy_dest
        self._proxy_dest_conn_open = True
        self._prev_hunk_last_two_bytes = b''
        self.len = 0

    def _update_payload_digest(self, hunk):
        if self.payload_digest is None:
            # convoluted handling of two newlines crossing hunks
            # XXX write tests for this
            if self._prev_hunk_last_two_bytes.endswith(b'\n'):
                if hunk.startswith(b'\n'):
                    self.payload_digest = hashlib.new(self.digest_algorithm)
                    self.payload_digest.update(hunk[1:])
                    self.payload_offset = self.len + 1
                elif hunk.startswith(b'\r\n'):
                    self.payload_digest = hashlib.new(self.digest_algorithm)
                    self.payload_digest.update(hunk[2:])
                    self.payload_offset = self.len + 2
            elif self._prev_hunk_last_two_bytes == b'\n\r':
                if hunk.startswith(b'\n'):
                    self.payload_digest = hashlib.new(self.digest_algorithm)
                    self.payload_digest.update(hunk[1:])
                    self.payload_offset = self.len + 1
            else:
                m = re.search(br'\n\r?\n', hunk)
                if m is not None:
                    self.payload_digest = hashlib.new(self.digest_algorithm)
                    self.payload_digest.update(hunk[m.end():])
                    self.payload_offset = self.len + m.end()

            # if we still haven't found start of payload hold on to these bytes
            if self.payload_digest is None:
                self._prev_hunk_last_two_bytes = hunk[-2:]
        else:
            self.payload_digest.update(hunk)
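
As the "XXX write tests for this" comment hints, the subtle case is the
blank line between HTTP headers and body arriving split across two read
hunks. A worked example (hypothetical byte strings, not part of the
commit)::

    # single hunk: the regex branch finds the boundary
    #   b'HTTP/1.1 200 OK\r\n\r\nhello' -> m.end() == 19, payload_offset == 19
    # split hunks: first hunk ends b'...\r\n', next begins b'\r\n'; the
    # _prev_hunk_last_two_bytes bookkeeping spots the boundary and sets
    # payload_offset = len(first_hunk) + 2
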
    def _update(self, hunk):
        self._update_payload_digest(hunk)
        self.block_digest.update(hunk)

        self.tempfile.write(hunk)

        if self._proxy_dest_conn_open:
            try:
                self.proxy_dest.sendall(hunk)
            except BaseException as e:
                self._proxy_dest_conn_open = False
                self.logger.warn('{} sending data to proxy client'.format(e))
                self.logger.info('will continue downloading from remote server without sending to client')

        self.len += len(hunk)

    def read(self, size=-1):
        hunk = self.fp.read(size)
        self._update(hunk)
        return hunk

    def readinto(self, b):
        n = self.fp.readinto(b)
        self._update(b[:n])
        return n

    def readline(self, size=-1):
        # XXX depends on implementation details of self.fp.readline(), in
        # particular that it doesn't call self.fp.read()
        hunk = self.fp.readline(size)
        self._update(hunk)
        return hunk

    def close(self):
        return self.fp.close()

    def __len__(self):
        return self.len

    def payload_size(self):
        if self.payload_offset is not None:
            return self.len - self.payload_offset
        else:
            return 0


class ProxyingRecordingHTTPResponse(http_client.HTTPResponse):

    def __init__(self, sock, debuglevel=0, method=None, proxy_dest=None, digest_algorithm='sha1'):
        http_client.HTTPResponse.__init__(self, sock, debuglevel=debuglevel, method=method)

        # Keep around extra reference to self.fp because HTTPResponse sets
        # self.fp=None after it finishes reading, but we still need it
        self.recorder = ProxyingRecorder(self.fp, proxy_dest, digest_algorithm)
        self.fp = self.recorder


class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
    logger = logging.getLogger("warcprox.warcprox.WarcProxyHandler")

    def _proxy_request(self):
        # Build request
        req_str = '{} {} {}\r\n'.format(self.command, self.path, self.request_version)

        warcprox_meta = self.headers.get('Warcprox-Meta')

        # Swallow headers that don't make sense to forward on, i.e. most
        # hop-by-hop headers, see http://tools.ietf.org/html/rfc2616#section-13.5
        # self.headers is an email.message.Message, which is case-insensitive
        # and doesn't throw KeyError in __delitem__
        for h in ('Connection', 'Proxy-Connection', 'Keep-Alive',
                'Proxy-Authenticate', 'Proxy-Authorization', 'Upgrade',
                'Warcprox-Meta'):
            del self.headers[h]

        # Add headers to the request
        # XXX in at least python3.3 str(self.headers) uses \n not \r\n :(
        req_str += '\r\n'.join('{}: {}'.format(k,v) for (k,v) in self.headers.items())

        req = req_str.encode('utf-8') + b'\r\n\r\n'

        # Append message body if present to the request
        if 'Content-Length' in self.headers:
            req += self.rfile.read(int(self.headers['Content-Length']))

        self.logger.debug('req={}'.format(repr(req)))

        # Send it down the pipe!
        self._proxy_sock.sendall(req)

        # We want HTTPResponse's smarts about http and handling of
        # non-compliant servers. But HTTPResponse.read() doesn't return the raw
        # bytes read from the server, it unchunks them if they're chunked, and
        # might do other stuff. We want to send the raw bytes back to the
        # client. So we ignore the values returned by h.read() below. Instead
        # the ProxyingRecordingHTTPResponse takes care of sending the raw bytes
        # to the proxy client.

        # Proxy and record the response
        h = ProxyingRecordingHTTPResponse(self._proxy_sock,
                proxy_dest=self.connection,
                digest_algorithm=self.server.digest_algorithm)
        h.begin()

        buf = h.read(8192)
        while buf != b'':
            buf = h.read(8192)

        self.log_request(h.status, h.recorder.len)

        remote_ip = self._proxy_sock.getpeername()[0]

        # Let's close off the remote end
        h.close()
        self._proxy_sock.close()

        recorded_url = RecordedUrl(url=self.url, request_data=req,
                response_recorder=h.recorder, remote_ip=remote_ip,
                warcprox_meta=warcprox_meta)
        self.server.recorded_url_q.put(recorded_url)

        return recorded_url


class RecordedUrl(object):
    def __init__(self, url, request_data, response_recorder, remote_ip, warcprox_meta=None):
        # XXX should test what happens with non-ascii url (when does
        # url-encoding happen?)
        if type(url) is not bytes:
            self.url = url.encode('ascii')
        else:
            self.url = url

        if type(remote_ip) is not bytes:
            self.remote_ip = remote_ip.encode('ascii')
        else:
            self.remote_ip = remote_ip

        self.request_data = request_data
        self.response_recorder = response_recorder

        if warcprox_meta:
            self.warcprox_meta = json.loads(warcprox_meta)
        else:
            self.warcprox_meta = {}


class WarcProxy(socketserver.ThreadingMixIn, http_server.HTTPServer):
    logger = logging.getLogger("warcprox.warcprox.WarcProxy")

    def __init__(self, server_address=('localhost', 8000),
            req_handler_class=WarcProxyHandler, bind_and_activate=True,
            ca=None, recorded_url_q=None, digest_algorithm='sha1'):
        http_server.HTTPServer.__init__(self, server_address, req_handler_class, bind_and_activate)

        self.digest_algorithm = digest_algorithm

        if ca is not None:
            self.ca = ca
        else:
            ca_name = 'Warcprox CA on {}'.format(socket.gethostname())[:64]
            self.ca = CertificateAuthority(ca_file='warcprox-ca.pem',
                                           certs_dir='./warcprox-ca',
                                           ca_name=ca_name)

        if recorded_url_q is not None:
            self.recorded_url_q = recorded_url_q
        else:
            self.recorded_url_q = queue.Queue()

    def server_activate(self):
        http_server.HTTPServer.server_activate(self)
        self.logger.info('WarcProxy listening on {0}:{1}'.format(self.server_address[0], self.server_address[1]))

    def server_close(self):
        self.logger.info('WarcProxy shutting down')
        http_server.HTTPServer.server_close(self)
415
warcprox/warcproxy.py
Normal file
@ -0,0 +1,415 @@
'''
warcprox/warcproxy.py - recording proxy, extends mitmproxy to record traffic,
enqueue info on the recorded url queue

Copyright (C) 2013-2016 Internet Archive

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
USA.
'''

from __future__ import absolute_import

try:
    import http.server as http_server
except ImportError:
    import BaseHTTPServer as http_server
try:
    import socketserver
except ImportError:
    import SocketServer as socketserver
try:
    import queue
except ImportError:
    import Queue as queue
import logging
import re
import traceback
import json
import socket
from hanzo import warctools
from certauth.certauth import CertificateAuthority
import warcprox
import datetime
import ipaddress
import surt

class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
    '''
    XXX add more information.

    Among other things, this class enforces limits specified in the
    Warcprox-Meta request header. If a limit is deemed to have been reached, no
    request will be made to the remote destination server. This implementation
    detail has implications worth noting. For example, if a limit applies to
    "new" (not deduplicated) bytes, and the limit has already been reached, no
    request will be made, even if it would have resulted in duplicate content,
    which would not count toward the limit. To reiterate, this is because the
    limit enforcer does not know that the content would be deduplicated.
    '''
    # self.server is WarcProxy
    logger = logging.getLogger("warcprox.warcprox.WarcProxyHandler")

    # XXX nearly identical to brozzler.site.Site._scope_rule_applies() but
    # there's no obvious common dependency where this code should go... TBD
    def _scope_rule_applies(self, rule):
        u = warcprox.Url(self.url)

        if "domain" in rule and not u.matches_ip_or_domain(rule["domain"]):
            return False
        if "url_match" in rule:
            if rule["url_match"] == "STRING_MATCH":
                return u.url.find(rule["value"]) >= 0
            elif rule["url_match"] == "REGEX_MATCH":
                try:
                    return re.fullmatch(rule["value"], u.url)
                except Exception as e:
                    self.logger.warn(
                            "caught exception matching against regex %s: %s",
                            rule["value"], e)
                    return False
            elif rule["url_match"] == "SURT_MATCH":
                return u.surt.startswith(rule["value"])
            else:
                self.logger.warn("invalid rule.url_match=%s", rule.url_match)
                return False
        else:
            if "domain" in rule:
                # we already know that it matches from earlier check
                return True
            else:
                self.logger.warn("unable to make sense of scope rule %s", rule)
                return False
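
For reference, a sketch (hypothetical values, not part of the commit) of the
kinds of rules _scope_rule_applies evaluates, as they appear in a
Warcprox-Meta "blocks" list::

    {"domain": "bad.example.com"}                       # whole domain or ip
    {"url_match": "STRING_MATCH", "value": "/ads/"}     # substring anywhere in url
    {"domain": "example.com", "url_match": "SURT_MATCH",
     "value": "com,example)/private"}                   # surt prefix within domain
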
    def _enforce_blocks(self, warcprox_meta):
        """
        Sends a 403 response and raises warcprox.RequestBlockedByRule if the
        url is blocked by a rule in warcprox_meta.
        """
        if warcprox_meta and "blocks" in warcprox_meta:
            for rule in warcprox_meta["blocks"]:
                if self._scope_rule_applies(rule):
                    body = ("request rejected by warcprox: blocked by "
                            "rule found in Warcprox-Meta header: %s"
                            % rule).encode("utf-8")
                    self.send_response(403, "Forbidden")
                    self.send_header("Content-Type", "text/plain;charset=utf-8")
                    self.send_header("Connection", "close")
                    self.send_header("Content-Length", len(body))
                    response_meta = {"blocked-by-rule":rule}
                    self.send_header(
                            "Warcprox-Meta",
                            json.dumps(response_meta, separators=(",",":")))
                    self.end_headers()
                    if self.command != "HEAD":
                        self.wfile.write(body)
                    self.connection.close()
                    raise warcprox.RequestBlockedByRule(
                            "%s 403 %s %s -- blocked by rule in Warcprox-Meta "
                            "request header %s" % (
                                self.client_address[0], self.command,
                                self.url, rule))

    def _enforce_limit(self, limit_key, limit_value, soft=False):
        bucket0, bucket1, bucket2 = limit_key.rsplit("/", 2)
        _limit_key = limit_key

        # if limit_key looks like 'job1:foo.com/total/urls' then we only want
        # to apply this rule if the requested url is within domain
        bucket0_fields = bucket0.split(':')
        if len(bucket0_fields) == 2:
            if not warcprox.host_matches_ip_or_domain(
                    self.hostname, bucket0_fields[1]):
                return  # else host matches, go ahead and enforce the limit
            bucket0 = '%s:%s' % (
                    bucket0_fields[0],
                    warcprox.normalize_host(bucket0_fields[1]))
            _limit_key = '%s/%s/%s' % (bucket0, bucket1, bucket2)

        value = self.server.stats_db.value(bucket0, bucket1, bucket2)
        if value and value >= limit_value:
            body = ("request rejected by warcprox: reached %s %s=%s\n" % (
                "soft limit" if soft else "limit", _limit_key,
                limit_value)).encode("utf-8")
            if soft:
                self.send_response(430, "Reached soft limit")
            else:
                self.send_response(420, "Reached limit")
            self.send_header("Content-Type", "text/plain;charset=utf-8")
            self.send_header("Connection", "close")
            self.send_header("Content-Length", len(body))
            response_meta = {
                "stats": {bucket0:self.server.stats_db.value(bucket0)}
            }
            if soft:
                response_meta["reached-soft-limit"] = {_limit_key:limit_value}
            else:
                response_meta["reached-limit"] = {_limit_key:limit_value}
            self.send_header(
                    "Warcprox-Meta",
                    json.dumps(response_meta, separators=(",",":")))
            self.end_headers()
            if self.command != "HEAD":
                self.wfile.write(body)
            self.connection.close()
            raise warcprox.RequestBlockedByRule(
                    "%s %s %s %s -- reached %s %s=%s" % (
                        self.client_address[0], 430 if soft else 420,
                        self.command, self.url,
                        "soft limit" if soft else "limit",
                        _limit_key, limit_value))
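
A worked example (hypothetical key, not part of the commit) of how a limit
key decomposes into the stats buckets checked above::

    limit_key = 'job1:foo.com/new/wire_bytes'
    bucket0, bucket1, bucket2 = limit_key.rsplit('/', 2)
    # bucket0 == 'job1:foo.com' -> rule only enforced when the requested
    #                              host matches foo.com
    # bucket1 == 'new', bucket2 == 'wire_bytes'
    # the comparison is stats_db.value(bucket0, bucket1, bucket2) >= limit_value
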
|
||||
def _enforce_limits(self, warcprox_meta):
|
||||
"""
|
||||
Sends a 420 (hard limit) or 430 (soft limit) response and raises
|
||||
warcprox.RequestBlockedByRule if a limit specified in warcprox_meta is
|
||||
reached.
|
||||
"""
|
||||
if warcprox_meta and "limits" in warcprox_meta:
|
||||
for item in warcprox_meta["limits"].items():
|
||||
limit_key, limit_value = item
|
||||
self._enforce_limit(limit_key, limit_value, soft=False)
|
||||
if warcprox_meta and "soft-limits" in warcprox_meta:
|
||||
for item in warcprox_meta["soft-limits"].items():
|
||||
limit_key, limit_value = item
|
||||
self._enforce_limit(limit_key, limit_value, soft=True)
|
||||
|

    def _connect_to_remote_server(self):
        '''
        Wraps MitmProxyHandler._connect_to_remote_server, first enforcing
        limits and block rules in the Warcprox-Meta request header, if any.
        Raises warcprox.RequestBlockedByRule if a rule has been enforced.
        Otherwise calls MitmProxyHandler._connect_to_remote_server, which
        initializes self._remote_server_sock.
        '''
        if 'Warcprox-Meta' in self.headers:
            warcprox_meta = json.loads(self.headers['Warcprox-Meta'])
            self._enforce_limits(warcprox_meta)
            self._enforce_blocks(warcprox_meta)
        return warcprox.mitmproxy.MitmProxyHandler._connect_to_remote_server(self)

    def _proxy_request(self):
        warcprox_meta = None
        raw_warcprox_meta = self.headers.get('Warcprox-Meta')
        self.logger.log(
                warcprox.TRACE, 'request for %s Warcprox-Meta header: %s',
                self.url, repr(raw_warcprox_meta))
        if raw_warcprox_meta:
            warcprox_meta = json.loads(raw_warcprox_meta)
            del self.headers['Warcprox-Meta']

        remote_ip = self._remote_server_sock.getpeername()[0]
        timestamp = datetime.datetime.utcnow()

        req, prox_rec_res = warcprox.mitmproxy.MitmProxyHandler._proxy_request(
                self)

        recorded_url = RecordedUrl(
                url=self.url, request_data=req,
                response_recorder=prox_rec_res.recorder, remote_ip=remote_ip,
                warcprox_meta=warcprox_meta, status=prox_rec_res.status,
                size=prox_rec_res.recorder.len,
                client_ip=self.client_address[0],
                content_type=prox_rec_res.getheader("Content-Type"),
                method=self.command, timestamp=timestamp, host=self.hostname,
                duration=datetime.datetime.utcnow()-timestamp)
        self.server.recorded_url_q.put(recorded_url)

        return recorded_url

    # deprecated
    def do_PUTMETA(self):
        '''
        Handles a special warcprox PUTMETA request (deprecated). A PUTMETA
        request is equivalent to a WARCPROX_WRITE_RECORD request with
        WARC-Type: metadata.
        '''
        self.do_WARCPROX_WRITE_RECORD(warc_type=warctools.WarcRecord.METADATA)

    def do_WARCPROX_WRITE_RECORD(self, warc_type=None):
        '''
        Handles a request with http method WARCPROX_WRITE_RECORD, a special
        type of request which tells warcprox to construct a warc record from
        the request more or less verbatim, and write it to a warc.

        To honor the request, this method creates a RecordedUrl and queues it
        for the WarcWriterThread to process. The warc record headers
        Content-Type and WARC-Type are taken from the request headers, as is
        the payload.

        Example request:

            WARCPROX_WRITE_RECORD screenshot:https://example.com/ HTTP/1.1
            WARC-Type: metadata
            Content-Type: image/png
            Content-Length: 12345
            Connection: close

            <png image data>
        '''
        try:
            self.url = self.path

            if ('Content-Length' in self.headers and 'Content-Type' in self.headers
                    and (warc_type or 'WARC-Type' in self.headers)):
                timestamp = datetime.datetime.utcnow()

                # stream this?
                request_data = self.rfile.read(int(self.headers['Content-Length']))

                warcprox_meta = None
                raw_warcprox_meta = self.headers.get('Warcprox-Meta')
                if raw_warcprox_meta:
                    warcprox_meta = json.loads(raw_warcprox_meta)

                rec_custom = RecordedUrl(url=self.url,
                                         request_data=request_data,
                                         response_recorder=None,
                                         remote_ip=b'',
                                         warcprox_meta=warcprox_meta,
                                         content_type=self.headers['Content-Type'],
                                         custom_type=warc_type or self.headers['WARC-Type'].encode('utf-8'),
                                         status=204, size=len(request_data),
                                         client_ip=self.client_address[0],
                                         method=self.command, timestamp=timestamp)

                self.server.recorded_url_q.put(rec_custom)
                self.send_response(204, 'OK')
            else:
                self.send_error(400, 'Bad request')

            self.end_headers()
        except:
            self.logger.error("uncaught exception in do_WARCPROX_WRITE_RECORD", exc_info=True)
            raise
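
    # A self-contained sketch of a client issuing the custom method through
    # warcprox (assumes warcprox listening on localhost:8000; png_bytes is a
    # hypothetical payload):
    #
    #   import http.client
    #   conn = http.client.HTTPConnection('localhost', 8000)
    #   conn.request('WARCPROX_WRITE_RECORD', 'screenshot:https://example.com/',
    #                body=png_bytes,
    #                headers={'Content-Type': 'image/png',
    #                         'WARC-Type': 'metadata'})
    #   assert conn.getresponse().status == 204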

    def log_message(self, fmt, *args):
        # logging better handled elsewhere?
        pass


class RecordedUrl:
    logger = logging.getLogger("warcprox.warcproxy.RecordedUrl")

    def __init__(self, url, request_data, response_recorder, remote_ip,
            warcprox_meta=None, content_type=None, custom_type=None,
            status=None, size=None, client_ip=None, method=None,
            timestamp=None, host=None, duration=None):
        # XXX should test what happens with non-ascii url (when does
        # url-encoding happen?)
        if type(url) is not bytes:
            self.url = url.encode('ascii')
        else:
            self.url = url

        if type(remote_ip) is not bytes:
            self.remote_ip = remote_ip.encode('ascii')
        else:
            self.remote_ip = remote_ip

        self.request_data = request_data
        self.response_recorder = response_recorder

        if warcprox_meta:
            self.warcprox_meta = warcprox_meta
        else:
            self.warcprox_meta = {}

        self.content_type = content_type

        self.mimetype = content_type
        if self.mimetype:
            n = self.mimetype.find(";")
            if n >= 0:
                self.mimetype = self.mimetype[:n]

        self.custom_type = custom_type
        self.status = status
        self.size = size
        self.client_ip = client_ip
        self.method = method
        self.timestamp = timestamp
        self.host = host
        self.duration = duration


class SingleThreadedWarcProxy(http_server.HTTPServer):
    logger = logging.getLogger("warcprox.warcproxy.WarcProxy")

    def __init__(
            self, ca=None, recorded_url_q=None, stats_db=None,
            options=warcprox.Options()):
        server_address = (
                options.address or 'localhost',
                options.port if options.port is not None else 8000)

        if options.onion_tor_socks_proxy:
            try:
                host, port = options.onion_tor_socks_proxy.split(':')
                WarcProxyHandler.onion_tor_socks_proxy_host = host
                WarcProxyHandler.onion_tor_socks_proxy_port = int(port)
            except ValueError:
                # bare hostname with no ":port" suffix
                WarcProxyHandler.onion_tor_socks_proxy_host = options.onion_tor_socks_proxy
                WarcProxyHandler.onion_tor_socks_proxy_port = None

        http_server.HTTPServer.__init__(
                self, server_address, WarcProxyHandler, bind_and_activate=True)

        self.digest_algorithm = options.digest_algorithm or 'sha1'

        if ca is not None:
            self.ca = ca
        else:
            ca_name = 'Warcprox CA on {}'.format(socket.gethostname())[:64]
            self.ca = CertificateAuthority(ca_file='warcprox-ca.pem',
                                           certs_dir='./warcprox-ca',
                                           ca_name=ca_name)

        if recorded_url_q is not None:
            self.recorded_url_q = recorded_url_q
        else:
            self.recorded_url_q = queue.Queue(maxsize=options.queue_size or 1000)

        self.stats_db = stats_db

        self.options = options

class WarcProxy(SingleThreadedWarcProxy, warcprox.mitmproxy.PooledMitmProxy):
    logger = logging.getLogger("warcprox.warcproxy.WarcProxy")

    def __init__(
            self, ca=None, recorded_url_q=None, stats_db=None,
            options=warcprox.Options()):
        if options.max_threads:
            self.logger.info(
                    "max_threads=%s set by command line option",
                    options.max_threads)
        warcprox.mitmproxy.PooledMitmProxy.__init__(self, options.max_threads)
        SingleThreadedWarcProxy.__init__(
                self, ca, recorded_url_q, stats_db, options)

    def server_activate(self):
        http_server.HTTPServer.server_activate(self)
        self.logger.info(
                'listening on %s:%s', self.server_address[0],
                self.server_address[1])

    def server_close(self):
        self.logger.info('shutting down')
        http_server.HTTPServer.server_close(self)

    def handle_error(self, request, client_address):
        self.logger.warn(
                "exception processing request %s from %s", request,
                client_address, exc_info=True)
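
# A rough sketch of wiring this up by hand. This is hedged: normal startup
# lives in warcprox's main entry point, and Options() accepting keyword
# arguments is an assumption here, not something this diff shows:
#
#   import threading, warcprox
#   from warcprox.warcproxy import WarcProxy
#   proxy = WarcProxy(options=warcprox.Options(address='localhost', port=8000))
#   threading.Thread(target=proxy.serve_forever).start()
#   # ... later: proxy.shutdown(); proxy.server_close()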
@ -1,301 +0,0 @@
# vim:set sw=4 et:

from __future__ import absolute_import

try:
    import queue
except ImportError:
    import Queue as queue

import logging
import threading
import os
import hashlib
import time
import socket
import base64
from datetime import datetime
import hanzo.httptools
from hanzo import warctools
import warcprox

class WarcWriter:
    logger = logging.getLogger("warcprox.warcwriter.WarcWriter")

    # port is only used for warc filename
    def __init__(self, directory='./warcs', rollover_size=1000000000,
            gzip=False, prefix='WARCPROX', port=0,
            digest_algorithm='sha1', base32=False, dedup_db=None,
            playback_index_db=None):

        self.rollover_size = rollover_size

        self.gzip = gzip
        self.digest_algorithm = digest_algorithm
        self.base32 = base32
        self.dedup_db = dedup_db

        self.playback_index_db = playback_index_db

        # warc path and filename stuff
        self.directory = directory
        self.prefix = prefix
        self.port = port

        self._f = None
        self._fpath = None
        self._serial = 0

        if not os.path.exists(directory):
            self.logger.info("warc destination directory {} doesn't exist, creating it".format(directory))
            os.mkdir(directory)

    # returns a tuple (principal_record, request_record) where principal_record is either a response or revisit record
    def build_warc_records(self, recorded_url):
        warc_date = warctools.warc.warc_datetime_str(datetime.utcnow())

        dedup_info = None
        if self.dedup_db is not None and recorded_url.response_recorder.payload_digest is not None:
            key = self.digest_str(recorded_url.response_recorder.payload_digest)
            dedup_info = self.dedup_db.lookup(key)

        if dedup_info is not None:
            # revisit record
            recorded_url.response_recorder.tempfile.seek(0)
            if recorded_url.response_recorder.payload_offset is not None:
                response_header_block = recorded_url.response_recorder.tempfile.read(recorded_url.response_recorder.payload_offset)
            else:
                response_header_block = recorded_url.response_recorder.tempfile.read()

            principal_record = self.build_warc_record(
                    url=recorded_url.url, warc_date=warc_date,
                    data=response_header_block,
                    warc_type=warctools.WarcRecord.REVISIT,
                    refers_to=dedup_info['i'],
                    refers_to_target_uri=dedup_info['u'],
                    refers_to_date=dedup_info['d'],
                    payload_digest=self.digest_str(recorded_url.response_recorder.payload_digest),
                    profile=warctools.WarcRecord.PROFILE_IDENTICAL_PAYLOAD_DIGEST,
                    content_type=hanzo.httptools.ResponseMessage.CONTENT_TYPE,
                    remote_ip=recorded_url.remote_ip)
        else:
            # response record
            principal_record = self.build_warc_record(
                    url=recorded_url.url, warc_date=warc_date,
                    recorder=recorded_url.response_recorder,
                    warc_type=warctools.WarcRecord.RESPONSE,
                    content_type=hanzo.httptools.ResponseMessage.CONTENT_TYPE,
                    remote_ip=recorded_url.remote_ip)

        request_record = self.build_warc_record(
                url=recorded_url.url, warc_date=warc_date,
                data=recorded_url.request_data,
                warc_type=warctools.WarcRecord.REQUEST,
                content_type=hanzo.httptools.RequestMessage.CONTENT_TYPE,
                concurrent_to=principal_record.id)

        return principal_record, request_record
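
    # For reference, the shape of a dedup_db.lookup() hit as consumed above
    # (values are illustrative, not taken from this diff):
    #
    #   {'i': b'<urn:uuid:...>',        # record id -> WARC-Refers-To
    #    'u': b'https://example.com/',  # url -> WARC-Refers-To-Target-URI
    #    'd': b'2016-01-01T00:00:00Z'}  # date -> WARC-Refers-To-Date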

    def digest_str(self, hash_obj):
        return hash_obj.name.encode('utf-8') + b':' + (
                base64.b32encode(hash_obj.digest()) if self.base32
                else hash_obj.hexdigest().encode('ascii'))
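
    # A runnable illustration of the two digest_str output formats:
    #
    #   import base64, hashlib
    #   h = hashlib.sha1(b'example payload')
    #   h.name.encode('utf-8') + b':' + h.hexdigest().encode('ascii')  # default hex
    #   h.name.encode('utf-8') + b':' + base64.b32encode(h.digest())   # with --base32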

    def build_warc_record(self, url, warc_date=None, recorder=None, data=None,
            concurrent_to=None, warc_type=None, content_type=None, remote_ip=None,
            profile=None, refers_to=None, refers_to_target_uri=None,
            refers_to_date=None, payload_digest=None):

        if warc_date is None:
            warc_date = warctools.warc.warc_datetime_str(datetime.utcnow())

        record_id = warctools.WarcRecord.random_warc_uuid()

        headers = []
        if warc_type is not None:
            headers.append((warctools.WarcRecord.TYPE, warc_type))
        headers.append((warctools.WarcRecord.ID, record_id))
        headers.append((warctools.WarcRecord.DATE, warc_date))
        headers.append((warctools.WarcRecord.URL, url))
        if remote_ip is not None:
            headers.append((warctools.WarcRecord.IP_ADDRESS, remote_ip))
        if profile is not None:
            headers.append((warctools.WarcRecord.PROFILE, profile))
        if refers_to is not None:
            headers.append((warctools.WarcRecord.REFERS_TO, refers_to))
        if refers_to_target_uri is not None:
            headers.append((warctools.WarcRecord.REFERS_TO_TARGET_URI, refers_to_target_uri))
        if refers_to_date is not None:
            headers.append((warctools.WarcRecord.REFERS_TO_DATE, refers_to_date))
        if concurrent_to is not None:
            headers.append((warctools.WarcRecord.CONCURRENT_TO, concurrent_to))
        if content_type is not None:
            headers.append((warctools.WarcRecord.CONTENT_TYPE, content_type))
        if payload_digest is not None:
            headers.append((warctools.WarcRecord.PAYLOAD_DIGEST, payload_digest))

        if recorder is not None:
            headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(recorder)).encode('latin1')))
            headers.append((warctools.WarcRecord.BLOCK_DIGEST,
                self.digest_str(recorder.block_digest)))
            if recorder.payload_digest is not None:
                headers.append((warctools.WarcRecord.PAYLOAD_DIGEST,
                    self.digest_str(recorder.payload_digest)))

            recorder.tempfile.seek(0)
            record = warctools.WarcRecord(headers=headers, content_file=recorder.tempfile)

        else:
            headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(data)).encode('latin1')))
            block_digest = hashlib.new(self.digest_algorithm, data)
            headers.append((warctools.WarcRecord.BLOCK_DIGEST,
                self.digest_str(block_digest)))

            content_tuple = content_type, data
            record = warctools.WarcRecord(headers=headers, content=content_tuple)

        return record

    def timestamp17(self):
        now = datetime.utcnow()
        return '{:%Y%m%d%H%M%S}{:03d}'.format(now, now.microsecond//1000)

    def close_writer(self):
        if self._fpath:
            self.logger.info('closing {0}'.format(self._f_finalname))
            self._f.close()
            finalpath = os.path.sep.join([self.directory, self._f_finalname])
            os.rename(self._fpath, finalpath)

            self._fpath = None
            self._f = None

    def _build_warcinfo_record(self, filename):
        warc_record_date = warctools.warc.warc_datetime_str(datetime.utcnow())
        record_id = warctools.WarcRecord.random_warc_uuid()

        headers = []
        headers.append((warctools.WarcRecord.ID, record_id))
        headers.append((warctools.WarcRecord.TYPE, warctools.WarcRecord.WARCINFO))
        headers.append((warctools.WarcRecord.FILENAME, filename.encode('latin1')))
        headers.append((warctools.WarcRecord.DATE, warc_record_date))

        warcinfo_fields = []
        warcinfo_fields.append(b'software: warcprox ' + warcprox.version_bytes)
        hostname = socket.gethostname()
        warcinfo_fields.append('hostname: {}'.format(hostname).encode('latin1'))
        warcinfo_fields.append('ip: {0}'.format(socket.gethostbyname(hostname)).encode('latin1'))
        warcinfo_fields.append(b'format: WARC File Format 1.0')
        # warcinfo_fields.append('robots: ignore')
        # warcinfo_fields.append('description: {0}'.format(self.description))
        # warcinfo_fields.append('isPartOf: {0}'.format(self.is_part_of))
        data = b'\r\n'.join(warcinfo_fields) + b'\r\n'

        record = warctools.WarcRecord(headers=headers, content=(b'application/warc-fields', data))

        return record

    # <!-- <property name="template" value="${prefix}-${timestamp17}-${serialno}-${heritrix.pid}~${heritrix.hostname}~${heritrix.port}" /> -->
    def _writer(self):
        if self._fpath and os.path.getsize(self._fpath) > self.rollover_size:
            self.close_writer()

        if self._f is None:
            self._f_finalname = '{}-{}-{:05d}-{}-{}-{}.warc{}'.format(
                    self.prefix, self.timestamp17(), self._serial, os.getpid(),
                    socket.gethostname(), self.port, '.gz' if self.gzip else '')
            self._fpath = os.path.sep.join([self.directory, self._f_finalname + '.open'])

            self._f = open(self._fpath, 'wb')

            warcinfo_record = self._build_warcinfo_record(self._f_finalname)
            self.logger.debug('warcinfo_record.headers={}'.format(warcinfo_record.headers))
            warcinfo_record.write_to(self._f, gzip=self.gzip)

            self._serial += 1

        return self._f

    def _final_tasks(self, recorded_url, recordset, recordset_offset):
        if (self.dedup_db is not None
                and recordset[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.RESPONSE
                and recorded_url.response_recorder.payload_size() > 0):
            key = self.digest_str(recorded_url.response_recorder.payload_digest)
            self.dedup_db.save(key, recordset[0], recordset_offset)

        if self.playback_index_db is not None:
            self.playback_index_db.save(self._f_finalname, recordset, recordset_offset)

        recorded_url.response_recorder.tempfile.close()

    def write_records(self, recorded_url):
        recordset = self.build_warc_records(recorded_url)

        writer = self._writer()
        recordset_offset = writer.tell()

        for record in recordset:
            offset = writer.tell()
            record.write_to(writer, gzip=self.gzip)
            self.logger.debug('wrote warc record: warc_type={} content_length={} url={} warc={} offset={}'.format(
                    record.get_header(warctools.WarcRecord.TYPE),
                    record.get_header(warctools.WarcRecord.CONTENT_LENGTH),
                    record.get_header(warctools.WarcRecord.URL),
                    self._fpath, offset))

        self._f.flush()

        self._final_tasks(recorded_url, recordset, recordset_offset)


class WarcWriterThread(threading.Thread):
    logger = logging.getLogger("warcprox.warcwriter.WarcWriterThread")

    def __init__(self, recorded_url_q=None, warc_writer=None, rollover_idle_time=None):
        """recorded_url_q is a queue.Queue of warcprox.warcprox.RecordedUrl."""
        threading.Thread.__init__(self, name='WarcWriterThread')
        self.recorded_url_q = recorded_url_q
        self.rollover_idle_time = rollover_idle_time
        self.stop = threading.Event()
        if warc_writer:
            self.warc_writer = warc_writer
        else:
            self.warc_writer = WarcWriter()

    def run(self):
        self.logger.info('WarcWriterThread starting, directory={} gzip={} rollover_size={} rollover_idle_time={} prefix={} port={}'.format(
                os.path.abspath(self.warc_writer.directory), self.warc_writer.gzip, self.warc_writer.rollover_size,
                self.rollover_idle_time, self.warc_writer.prefix, self.warc_writer.port))

        self._last_sync = self._last_activity = time.time()

        while not self.stop.is_set():
            try:
                recorded_url = self.recorded_url_q.get(block=True, timeout=0.5)
                self.logger.info("recorded_url.warcprox_meta={} for {}".format(recorded_url.warcprox_meta, recorded_url.url))
                self.warc_writer.write_records(recorded_url)
                self._last_activity = time.time()
            except queue.Empty:
                if (self.warc_writer._fpath is not None
                        and self.rollover_idle_time is not None
                        and self.rollover_idle_time > 0
                        and time.time() - self._last_activity > self.rollover_idle_time):
                    self.logger.debug('rolling over warc file after {} seconds idle'.format(time.time() - self._last_activity))
                    self.warc_writer.close_writer()

                if time.time() - self._last_sync > 60:
                    if self.warc_writer.dedup_db:
                        self.warc_writer.dedup_db.sync()
                    if self.warc_writer.playback_index_db:
                        self.warc_writer.playback_index_db.sync()
                    self._last_sync = time.time()

        self.logger.info('WarcWriterThread shutting down')
        self.warc_writer.close_writer()
168
warcprox/writer.py
Normal file
@ -0,0 +1,168 @@
#
# warcprox/writer.py - warc writer, manages and writes records to warc files
#
# Copyright (C) 2013-2016 Internet Archive
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
# USA.
#

from __future__ import absolute_import

import logging
from datetime import datetime
from hanzo import warctools
import time
import warcprox
import os
import socket
import string
import random

class WarcWriter:
    logger = logging.getLogger('warcprox.writer.WarcWriter')

    def __init__(self, options=warcprox.Options()):

        self.rollover_size = options.rollover_size or 1000000000
        self.rollover_idle_time = options.rollover_idle_time or None
        self._last_activity = time.time()

        self.gzip = options.gzip or False
        digest_algorithm = options.digest_algorithm or 'sha1'
        base32 = options.base32
        self.record_builder = warcprox.warc.WarcRecordBuilder(digest_algorithm=digest_algorithm, base32=base32)

        # warc path and filename stuff
        self.directory = options.directory or './warcs'
        self.prefix = options.prefix or 'warcprox'

        self._f = None
        self._fpath = None
        self._f_finalname = None
        self._serial = 0

        self._randomtoken = "".join(random.Random().sample(string.digits + string.ascii_lowercase, 8))

        if not os.path.exists(self.directory):
            self.logger.info("warc destination directory {} doesn't exist, creating it".format(self.directory))
            os.mkdir(self.directory)

    def timestamp17(self):
        now = datetime.utcnow()
        return '{:%Y%m%d%H%M%S}{:03d}'.format(now, now.microsecond//1000)
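
    # For example (illustrative value): a call at 2016-03-08 12:34:56.789 UTC
    # returns the 17-digit string '20160308123456789'.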

    def close_writer(self):
        if self._fpath:
            self.logger.info('closing {0}'.format(self._f_finalname))
            self._f.close()
            finalpath = os.path.sep.join([self.directory, self._f_finalname])
            os.rename(self._fpath, finalpath)

            self._fpath = None
            self._f = None

    # h3 default <!-- <property name="template" value="${prefix}-${timestamp17}-${serialno}-${heritrix.pid}~${heritrix.hostname}~${heritrix.port}" /> -->
    # here: ${prefix}-${timestamp17}-${serialno}-${randomtoken}.warc.gz
    def _writer(self):
        if self._fpath and os.path.getsize(self._fpath) > self.rollover_size:
            self.close_writer()

        if self._f is None:
            self._f_finalname = '{}-{}-{:05d}-{}.warc{}'.format(
                    self.prefix, self.timestamp17(), self._serial, self._randomtoken, '.gz' if self.gzip else '')
            self._fpath = os.path.sep.join([self.directory, self._f_finalname + '.open'])

            self._f = open(self._fpath, 'wb')

            warcinfo_record = self.record_builder.build_warcinfo_record(self._f_finalname)
            self.logger.debug('warcinfo_record.headers={}'.format(warcinfo_record.headers))
            warcinfo_record.write_to(self._f, gzip=self.gzip)

            self._serial += 1

        return self._f
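
    # So a newly opened file might be named (illustrative values):
    #   warcprox-20160308123456789-00000-a1b2c3d4.warc.gz.open
    # and is renamed without the '.open' suffix by close_writer().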

    def write_records(self, recorded_url):
        """Returns tuple of records written, which are instances of
        hanzo.warctools.warc.WarcRecord, decorated with "warc_filename" and
        "offset" attributes."""
        records = self.record_builder.build_warc_records(recorded_url)

        writer = self._writer()
        recordset_offset = writer.tell()

        for record in records:
            offset = writer.tell()
            record.write_to(writer, gzip=self.gzip)
            record.offset = offset
            record.length = writer.tell() - offset
            record.warc_filename = self._f_finalname
            self.logger.debug('wrote warc record: warc_type=%s content_length=%s url=%s warc=%s offset=%d',
                    record.get_header(warctools.WarcRecord.TYPE),
                    record.get_header(warctools.WarcRecord.CONTENT_LENGTH),
                    record.get_header(warctools.WarcRecord.URL),
                    self._fpath, record.offset)

        self._f.flush()
        self._last_activity = time.time()

        return records

    def maybe_idle_rollover(self):
        if (self._fpath is not None
                and self.rollover_idle_time is not None
                and self.rollover_idle_time > 0
                and time.time() - self._last_activity > self.rollover_idle_time):
            self.logger.debug('rolling over {} after {} seconds idle'.format(self._f_finalname, time.time() - self._last_activity))
            self.close_writer()


class WarcWriterPool:
    logger = logging.getLogger("warcprox.writer.WarcWriterPool")

    def __init__(self, options=warcprox.Options()):
        self.default_warc_writer = WarcWriter(options=options)
        self.warc_writers = {}  # {prefix: WarcWriter}
        self._last_sync = time.time()
        self.options = options

    # chooses writer for filename specified by warcprox_meta["warc-prefix"] if set
    def _writer(self, recorded_url):
        w = self.default_warc_writer
        if recorded_url.warcprox_meta and "warc-prefix" in recorded_url.warcprox_meta:
            # self.logger.info("recorded_url.warcprox_meta={} for {}".format(recorded_url.warcprox_meta, recorded_url.url))
            options = warcprox.Options(**vars(self.options))
            options.prefix = recorded_url.warcprox_meta["warc-prefix"]
            if options.prefix not in self.warc_writers:
                self.warc_writers[options.prefix] = WarcWriter(options=options)
            w = self.warc_writers[options.prefix]
        return w
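
    # A hedged client-side sketch of steering a capture into its own warc
    # series via the "warc-prefix" setting consumed above (assumes warcprox
    # listening on its default localhost:8000):
    #
    #   import json, requests
    #   requests.get('http://example.com/',
    #                proxies={'http': 'http://localhost:8000'},
    #                headers={'Warcprox-Meta': json.dumps({'warc-prefix': 'job1'})})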

    def write_records(self, recorded_url):
        """Returns tuple of records written, which are instances of
        hanzo.warctools.warc.WarcRecord, decorated with "warc_filename" and
        "offset" attributes."""
        return self._writer(recorded_url).write_records(recorded_url)

    def maybe_idle_rollover(self):
        self.default_warc_writer.maybe_idle_rollover()
        for w in self.warc_writers.values():
            w.maybe_idle_rollover()

    def close_writers(self):
        self.default_warc_writer.close_writer()
        for w in self.warc_writers.values():
            w.close_writer()
122
warcprox/writerthread.py
Normal file
@ -0,0 +1,122 @@
#
# warcprox/writerthread.py - warc writer thread, reads from the recorded url
# queue, writes warc records, runs final tasks after warc records are written
#
# Copyright (C) 2013-2016 Internet Archive
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
# USA.
#

from __future__ import absolute_import

try:
    import queue
except ImportError:
    import Queue as queue

import logging
import threading
import os
import hashlib
import time
import socket
import base64
from datetime import datetime
import hanzo.httptools
from hanzo import warctools
import warcprox
import warcprox.writer  # for the default WarcWriterPool below
import cProfile

class WarcWriterThread(threading.Thread):
    logger = logging.getLogger("warcprox.warcproxwriter.WarcWriterThread")

    def __init__(self, recorded_url_q=None, writer_pool=None, dedup_db=None, listeners=None, options=warcprox.Options()):
        """recorded_url_q is a queue.Queue of warcprox.warcprox.RecordedUrl."""
        threading.Thread.__init__(self, name='WarcWriterThread')
        self.recorded_url_q = recorded_url_q
        self.stop = threading.Event()
        if writer_pool:
            self.writer_pool = writer_pool
        else:
            self.writer_pool = warcprox.writer.WarcWriterPool()
        self.dedup_db = dedup_db
        self.listeners = listeners
        self.options = options
        self.idle = None

    def run(self):
        if self.options.profile:
            cProfile.runctx('self._run()', globals(), locals(), sort='cumulative')
        else:
            self._run()

    def _run(self):
        while not self.stop.is_set():
            try:
                self.name = 'WarcWriterThread(tid={})'.format(warcprox.gettid())
                while True:
                    try:
                        if self.stop.is_set():
                            qsize = self.recorded_url_q.qsize()
                            if qsize % 50 == 0:
                                self.logger.info("%s urls left to write", qsize)

                        recorded_url = self.recorded_url_q.get(block=True, timeout=0.5)
                        self.idle = None
                        if self.dedup_db:
                            warcprox.dedup.decorate_with_dedup_info(self.dedup_db,
                                    recorded_url, base32=self.options.base32)
                        records = self.writer_pool.write_records(recorded_url)
                        self._final_tasks(recorded_url, records)

                        # try to release resources in a timely fashion
                        if recorded_url.response_recorder and recorded_url.response_recorder.tempfile:
                            recorded_url.response_recorder.tempfile.close()
                    except queue.Empty:
                        if self.stop.is_set():
                            break
                        self.idle = time.time()
                        self.writer_pool.maybe_idle_rollover()

                self.logger.info('WarcWriterThread shutting down')
                self.writer_pool.close_writers()
            except:
                self.logger.critical("WarcWriterThread will try to continue after unexpected error", exc_info=True)
                time.sleep(0.5)

    # closest thing we have to heritrix crawl log at the moment
    def _log(self, recorded_url, records):
        try:
            payload_digest = records[0].get_header(warctools.WarcRecord.PAYLOAD_DIGEST).decode("utf-8")
        except:
            payload_digest = "-"

        # 2015-07-17T22:32:23.672Z 1 58 dns:www.dhss.delaware.gov P http://www.dhss.delaware.gov/dhss/ text/dns #045 20150717223214881+316 sha1:63UTPB7GTWIHAGIK3WWL76E57BBTJGAK http://www.dhss.delaware.gov/dhss/ - {"warcFileOffset":2964,"warcFilename":"ARCHIVEIT-1303-WEEKLY-JOB165158-20150717223222113-00000.warc.gz"}
        self.logger.info("{} {} {} {} {} size={} {} {} {} offset={}".format(
                recorded_url.client_ip, recorded_url.status, recorded_url.method,
                recorded_url.url.decode("utf-8"), recorded_url.mimetype,
                recorded_url.size, payload_digest, records[0].type.decode("utf-8"),
                records[0].warc_filename, records[0].offset))

    def _final_tasks(self, recorded_url, records):
        if self.listeners:
            for listener in self.listeners:
                try:
                    listener.notify(recorded_url, records)
                except:
                    self.logger.error('%s raised exception',
                            listener.notify, exc_info=True)
        self._log(recorded_url, records)
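
    # A minimal sketch of the listener interface consumed by _final_tasks
    # (hypothetical class; any object with a notify() method works):
    #
    #   class CrawlLogListener:
    #       def notify(self, recorded_url, records):
    #           print(recorded_url.url, records[0].warc_filename, records[0].offset)
    #
    #   writer_thread = WarcWriterThread(
    #           recorded_url_q=q, listeners=[CrawlLogListener()])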