tests pass with python2.7 and 3.2! (tox fails though oddly)

This commit is contained in:
Noah Levitt 2013-12-04 17:25:45 -08:00
parent 8ae164f8ca
commit dc9fdc3412
5 changed files with 242 additions and 152 deletions

1
.gitignore vendored
View File

@ -4,6 +4,7 @@
*.pem
*.db
*.diff
*.egg
*.egg-info
*.swp
warcs

View File

@ -12,8 +12,8 @@ setuptools.setup(name='warcprox',
long_description=open('README.rst').read(),
license='GPL',
packages=['warcprox'],
install_requires=['pyopenssl', 'warctools>=4.8.2'], # gdbm/dbhash?
dependency_links=['git+https://github.com/nlevitt/warctools.git@tweaks#egg=warctools-4.8.2'],
install_requires=['pyopenssl', 'warctools>=4.8.3'], # gdbm/dbhash?
dependency_links=['git+https://github.com/nlevitt/warctools.git@python3#egg=warctools-4.8.3'],
tests_require=['requests>=2.0.1'], # >=2.0.1 for https://github.com/kennethreitz/requests/pull/1636
scripts=['bin/dump-anydbm', 'bin/warcprox'],
zip_safe=False,

11
tox.ini Normal file
View File

@ -0,0 +1,11 @@
# Tox (http://tox.testrun.org/) is a tool for running tests
# in multiple virtualenvs. This configuration file will run the
# test suite on all supported python versions. To use it, "pip install tox"
# and then run "tox" from this directory.
[tox]
envlist = py27, py32, py33
[testenv]
commands = {envpython} setup.py test

View File

@ -3,7 +3,6 @@
from warcprox import warcprox
import unittest
import BaseHTTPServer
import threading
import time
import logging
@ -14,10 +13,23 @@ import tempfile
import OpenSSL
import os
import shutil
import Queue
import requests
class TestHttpRequestHandler(BaseHTTPServer.BaseHTTPRequestHandler):
try:
import http.server
http_server = http.server
except ImportError:
import BaseHTTPServer
http_server = BaseHTTPServer
try:
import queue
except ImportError:
import Queue
queue = Queue
class TestHttpRequestHandler(http_server.BaseHTTPRequestHandler):
logger = logging.getLogger('TestHttpRequestHandler')
def do_GET(self):
@ -25,19 +37,19 @@ class TestHttpRequestHandler(BaseHTTPServer.BaseHTTPRequestHandler):
m = re.match(r'^/([^/]+)/([^/]+)$', self.path)
if m is not None:
special_header = 'warcprox-test-header: {}!'.format(m.group(1))
payload = 'I am the warcprox test payload! {}!\n'.format(10*m.group(2))
headers = ('HTTP/1.1 200 OK\r\n'
+ 'Content-Type: text/plain\r\n'
+ '{}\r\n'
+ 'Content-Length: {}\r\n'
+ '\r\n').format(special_header, len(payload))
special_header = 'warcprox-test-header: {}!'.format(m.group(1)).encode('utf-8')
payload = 'I am the warcprox test payload! {}!\n'.format(10*m.group(2)).encode('utf-8')
headers = (b'HTTP/1.1 200 OK\r\n'
+ b'Content-Type: text/plain\r\n'
+ special_header + b'\r\n'
+ b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n'
+ b'\r\n')
else:
payload = '404 Not Found\n'
headers = ('HTTP/1.1 404 Not Found\r\n'
+ 'Content-Type: text/plain\r\n'
+ 'Content-Length: {}\r\n'
+ '\r\n').format(len(payload))
payload = b'404 Not Found\n'
headers = (b'HTTP/1.1 404 Not Found\r\n'
+ b'Content-Type: text/plain\r\n'
+ b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n'
+ b'\r\n')
self.connection.sendall(headers)
self.connection.sendall(payload)
@ -82,7 +94,7 @@ class WarcproxTest(unittest.TestCase):
def _start_http_servers(self):
self.http_daemon = BaseHTTPServer.HTTPServer(('localhost', 0),
self.http_daemon = http_server.HTTPServer(('localhost', 0),
RequestHandlerClass=TestHttpRequestHandler)
self.logger.info('starting http://{}:{}'.format(self.http_daemon.server_address[0], self.http_daemon.server_address[1]))
self.http_daemon_thread = threading.Thread(name='HttpdThread',
@ -90,7 +102,7 @@ class WarcproxTest(unittest.TestCase):
self.http_daemon_thread.start()
# http://www.piware.de/2011/01/creating-an-https-server-in-python/
self.https_daemon = BaseHTTPServer.HTTPServer(('localhost', 0),
self.https_daemon = http_server.HTTPServer(('localhost', 0),
RequestHandlerClass=TestHttpRequestHandler)
# self.https_daemon.socket = ssl.wrap_socket(httpd.socket, certfile='path/to/localhost.pem', server_side=True)
self.https_daemon.socket = ssl.wrap_socket(self.https_daemon.socket, certfile=self._cert, server_side=True)
@ -107,7 +119,7 @@ class WarcproxTest(unittest.TestCase):
self._ca_dir = tempfile.mkdtemp(prefix='warcprox-test-', suffix='-ca')
ca = warcprox.CertificateAuthority(self._ca_file, self._ca_dir)
recorded_url_q = Queue.Queue()
recorded_url_q = queue.Queue()
proxy = warcprox.WarcProxy(server_address=('localhost', 0), ca=ca,
recorded_url_q=recorded_url_q)
@ -183,24 +195,24 @@ class WarcproxTest(unittest.TestCase):
url = 'http://localhost:{}/'.format(self.http_daemon.server_port)
response = requests.get(url)
self.assertEqual(response.status_code, 404)
self.assertEqual(response.content, '404 Not Found\n')
self.assertEqual(response.content, b'404 Not Found\n')
url = 'https://localhost:{}/'.format(self.https_daemon.server_port)
response = requests.get(url, verify=False)
self.assertEqual(response.status_code, 404)
self.assertEqual(response.content, '404 Not Found\n')
self.assertEqual(response.content, b'404 Not Found\n')
url = 'http://localhost:{}/a/b'.format(self.http_daemon.server_port)
response = requests.get(url)
self.assertEqual(response.status_code, 200)
self.assertEqual(response.headers['warcprox-test-header'], 'a!')
self.assertEqual(response.content, 'I am the warcprox test payload! bbbbbbbbbb!\n')
self.assertEqual(response.content, b'I am the warcprox test payload! bbbbbbbbbb!\n')
url = 'https://localhost:{}/c/d'.format(self.https_daemon.server_port)
response = requests.get(url, verify=False)
self.assertEqual(response.status_code, 200)
self.assertEqual(response.headers['warcprox-test-header'], 'c!')
self.assertEqual(response.content, 'I am the warcprox test payload! dddddddddd!\n')
self.assertEqual(response.content, b'I am the warcprox test payload! dddddddddd!\n')
def poll_playback_until(self, url, status, timeout_sec):
@ -221,18 +233,18 @@ class WarcproxTest(unittest.TestCase):
# ensure playback fails before archiving
response = requests.get(url, proxies=self.playback_proxies)
self.assertEqual(response.status_code, 404)
self.assertEqual(response.content, '404 Not in Archive\n')
self.assertEqual(response.content, b'404 Not in Archive\n')
# archive
response = requests.get(url, proxies=self.archiving_proxies)
self.assertEqual(response.status_code, 200)
self.assertEqual(response.headers['warcprox-test-header'], 'a!')
self.assertEqual(response.content, 'I am the warcprox test payload! bbbbbbbbbb!\n')
self.assertEqual(response.content, b'I am the warcprox test payload! bbbbbbbbbb!\n')
response = self.poll_playback_until(url, status=200, timeout_sec=10)
self.assertEqual(response.status_code, 200)
self.assertEqual(response.headers['warcprox-test-header'], 'a!')
self.assertEqual(response.content, 'I am the warcprox test payload! bbbbbbbbbb!\n')
self.assertEqual(response.content, b'I am the warcprox test payload! bbbbbbbbbb!\n')
def _test_archive_and_playback_https_url(self):
@ -241,19 +253,19 @@ class WarcproxTest(unittest.TestCase):
# ensure playback fails before archiving
response = requests.get(url, proxies=self.playback_proxies, verify=False)
self.assertEqual(response.status_code, 404)
self.assertEqual(response.content, '404 Not in Archive\n')
self.assertEqual(response.content, b'404 Not in Archive\n')
# fetch & archive response
response = requests.get(url, proxies=self.archiving_proxies, verify=False)
self.assertEqual(response.status_code, 200)
self.assertEqual(response.headers['warcprox-test-header'], 'c!')
self.assertEqual(response.content, 'I am the warcprox test payload! dddddddddd!\n')
self.assertEqual(response.content, b'I am the warcprox test payload! dddddddddd!\n')
# test playback
response = self.poll_playback_until(url, status=200, timeout_sec=10)
self.assertEqual(response.status_code, 200)
self.assertEqual(response.headers['warcprox-test-header'], 'c!')
self.assertEqual(response.content, 'I am the warcprox test payload! dddddddddd!\n')
self.assertEqual(response.content, b'I am the warcprox test payload! dddddddddd!\n')
# test dedup of same http url with same payload
@ -263,30 +275,30 @@ class WarcproxTest(unittest.TestCase):
# ensure playback fails before archiving
response = requests.get(url, proxies=self.playback_proxies, verify=False)
self.assertEqual(response.status_code, 404)
self.assertEqual(response.content, '404 Not in Archive\n')
self.assertEqual(response.content, b'404 Not in Archive\n')
# check not in dedup db
dedup_lookup = self.warcprox.warc_writer.dedup_db.lookup('sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc')
dedup_lookup = self.warcprox.warc_writer.dedup_db.lookup(b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc')
self.assertIsNone(dedup_lookup)
# archive
response = requests.get(url, proxies=self.archiving_proxies, verify=False)
self.assertEqual(response.status_code, 200)
self.assertEqual(response.headers['warcprox-test-header'], 'e!')
self.assertEqual(response.content, 'I am the warcprox test payload! ffffffffff!\n')
self.assertEqual(response.content, b'I am the warcprox test payload! ffffffffff!\n')
# test playback
response = self.poll_playback_until(url, status=200, timeout_sec=10)
self.assertEqual(response.status_code, 200)
self.assertEqual(response.headers['warcprox-test-header'], 'e!')
self.assertEqual(response.content, 'I am the warcprox test payload! ffffffffff!\n')
self.assertEqual(response.content, b'I am the warcprox test payload! ffffffffff!\n')
# check in dedup db
# {u'i': u'<urn:uuid:e691dc0f-4bb9-4ad8-9afb-2af836aa05e4>', u'u': u'https://localhost:62841/c/d', u'd': u'2013-11-22T00:14:37Z'}
dedup_lookup = self.warcprox.warc_writer.dedup_db.lookup('sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc')
self.assertEquals(dedup_lookup['u'], url)
self.assertRegexpMatches(dedup_lookup['i'], r'^<urn:uuid:[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}>$')
self.assertRegexpMatches(dedup_lookup['d'], r'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$')
dedup_lookup = self.warcprox.warc_writer.dedup_db.lookup(b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc')
self.assertEqual(dedup_lookup['u'], url.encode('ascii'))
self.assertRegexpMatches(dedup_lookup['i'], br'^<urn:uuid:[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}>$')
self.assertRegexpMatches(dedup_lookup['d'], br'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$')
record_id = dedup_lookup['i']
dedup_date = dedup_lookup['d']
@ -298,23 +310,23 @@ class WarcproxTest(unittest.TestCase):
response = requests.get(url, proxies=self.archiving_proxies, verify=False)
self.assertEqual(response.status_code, 200)
self.assertEqual(response.headers['warcprox-test-header'], 'e!')
self.assertEqual(response.content, 'I am the warcprox test payload! ffffffffff!\n')
self.assertEqual(response.content, b'I am the warcprox test payload! ffffffffff!\n')
# XXX need to give warc writer thread a chance, and we don't have any change to poll for :-\
time.sleep(2.0)
# check in dedup db (no change from prev)
dedup_lookup = self.warcprox.warc_writer.dedup_db.lookup('sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc')
self.assertEquals(dedup_lookup['u'], url)
self.assertEquals(dedup_lookup['i'], record_id)
self.assertEquals(dedup_lookup['d'], dedup_date)
dedup_lookup = self.warcprox.warc_writer.dedup_db.lookup(b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc')
self.assertEqual(dedup_lookup['u'], url.encode('ascii'))
self.assertEqual(dedup_lookup['i'], record_id)
self.assertEqual(dedup_lookup['d'], dedup_date)
# test playback
self.logger.debug('testing playback of revisit of {}'.format(url))
response = self.poll_playback_until(url, status=200, timeout_sec=10)
self.assertEqual(response.status_code, 200)
self.assertEqual(response.headers['warcprox-test-header'], 'e!')
self.assertEqual(response.content, 'I am the warcprox test payload! ffffffffff!\n')
self.assertEqual(response.content, b'I am the warcprox test payload! ffffffffff!\n')
# XXX how to check dedup was used?
@ -325,30 +337,30 @@ class WarcproxTest(unittest.TestCase):
# ensure playback fails before archiving
response = requests.get(url, proxies=self.playback_proxies, verify=False)
self.assertEqual(response.status_code, 404)
self.assertEqual(response.content, '404 Not in Archive\n')
self.assertEqual(response.content, b'404 Not in Archive\n')
# check not in dedup db
dedup_lookup = self.warcprox.warc_writer.dedup_db.lookup('sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89')
dedup_lookup = self.warcprox.warc_writer.dedup_db.lookup(b'sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89')
self.assertIsNone(dedup_lookup)
# archive
response = requests.get(url, proxies=self.archiving_proxies, verify=False)
self.assertEqual(response.status_code, 200)
self.assertEqual(response.headers['warcprox-test-header'], 'g!')
self.assertEqual(response.content, 'I am the warcprox test payload! hhhhhhhhhh!\n')
self.assertEqual(response.content, b'I am the warcprox test payload! hhhhhhhhhh!\n')
# test playback
response = self.poll_playback_until(url, status=200, timeout_sec=10)
self.assertEqual(response.status_code, 200)
self.assertEqual(response.headers['warcprox-test-header'], 'g!')
self.assertEqual(response.content, 'I am the warcprox test payload! hhhhhhhhhh!\n')
self.assertEqual(response.content, b'I am the warcprox test payload! hhhhhhhhhh!\n')
# check in dedup db
# {u'i': u'<urn:uuid:e691dc0f-4bb9-4ad8-9afb-2af836aa05e4>', u'u': u'https://localhost:62841/c/d', u'd': u'2013-11-22T00:14:37Z'}
dedup_lookup = self.warcprox.warc_writer.dedup_db.lookup('sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89')
self.assertEquals(dedup_lookup['u'], url)
self.assertRegexpMatches(dedup_lookup['i'], r'^<urn:uuid:[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}>$')
self.assertRegexpMatches(dedup_lookup['d'], r'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$')
dedup_lookup = self.warcprox.warc_writer.dedup_db.lookup(b'sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89')
self.assertEqual(dedup_lookup['u'], url.encode('ascii'))
self.assertRegexpMatches(dedup_lookup['i'], br'^<urn:uuid:[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}>$')
self.assertRegexpMatches(dedup_lookup['d'], br'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$')
record_id = dedup_lookup['i']
dedup_date = dedup_lookup['d']
@ -360,23 +372,23 @@ class WarcproxTest(unittest.TestCase):
response = requests.get(url, proxies=self.archiving_proxies, verify=False)
self.assertEqual(response.status_code, 200)
self.assertEqual(response.headers['warcprox-test-header'], 'g!')
self.assertEqual(response.content, 'I am the warcprox test payload! hhhhhhhhhh!\n')
self.assertEqual(response.content, b'I am the warcprox test payload! hhhhhhhhhh!\n')
# XXX need to give warc writer thread a chance, and we don't have any change to poll for :-\
time.sleep(2.0)
# check in dedup db (no change from prev)
dedup_lookup = self.warcprox.warc_writer.dedup_db.lookup('sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89')
self.assertEquals(dedup_lookup['u'], url)
self.assertEquals(dedup_lookup['i'], record_id)
self.assertEquals(dedup_lookup['d'], dedup_date)
dedup_lookup = self.warcprox.warc_writer.dedup_db.lookup(b'sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89')
self.assertEqual(dedup_lookup['u'], url.encode('ascii'))
self.assertEqual(dedup_lookup['i'], record_id)
self.assertEqual(dedup_lookup['d'], dedup_date)
# test playback
self.logger.debug('testing playback of revisit of {}'.format(url))
response = self.poll_playback_until(url, status=200, timeout_sec=10)
self.assertEqual(response.status_code, 200)
self.assertEqual(response.headers['warcprox-test-header'], 'g!')
self.assertEqual(response.content, 'I am the warcprox test payload! hhhhhhhhhh!\n')
self.assertEqual(response.content, b'I am the warcprox test payload! hhhhhhhhhh!\n')
# XXX how to check dedup was used?

View File

@ -4,12 +4,55 @@
"""
WARC writing MITM HTTP/S proxy
See README.md or https://github.com/internetarchive/warcprox
See README.rst or https://github.com/internetarchive/warcprox
"""
import BaseHTTPServer, SocketServer
try:
import http.server
http_server = http.server
except ImportError:
import BaseHTTPServer
http_server = BaseHTTPServer
try:
import socketserver
except ImportError:
import SocketServer
socketserver = SocketServer
try:
import urllib.parse
urllib_parse = urllib.parse
except ImportError:
import urlparse
urllib_parse = urlparse
try:
import queue
except ImportError:
import Queue
queue = Queue
try:
import http.client
http_client = http.client
except ImportError:
import httplib
http_client = httplib
try:
import dbm.gnu
dbm_gnu = dbm.gnu
except ImportError:
import gdbm
dbm_gnu = gdbm
try:
from io import StringIO
except ImportError:
from StringIO import StringIO
import socket
import urlparse
import OpenSSL
import ssl
import logging
@ -17,12 +60,10 @@ import sys
from hanzo import warctools, httptools
import hashlib
from datetime import datetime
import Queue
import threading
import os
import argparse
import random
import httplib
import re
import signal
import time
@ -30,8 +71,6 @@ import tempfile
import base64
import json
import traceback
import gdbm
from StringIO import StringIO
class CertificateAuthority(object):
logger = logging.getLogger('warcprox.CertificateAuthority')
@ -66,9 +105,9 @@ class CertificateAuthority(object):
self.cert.set_issuer(self.cert.get_subject())
self.cert.set_pubkey(self.key)
self.cert.add_extensions([
OpenSSL.crypto.X509Extension("basicConstraints", True, "CA:TRUE, pathlen:0"),
OpenSSL.crypto.X509Extension("keyUsage", True, "keyCertSign, cRLSign"),
OpenSSL.crypto.X509Extension("subjectKeyIdentifier", False, "hash", subject=self.cert),
OpenSSL.crypto.X509Extension(b"basicConstraints", True, b"CA:TRUE, pathlen:0"),
OpenSSL.crypto.X509Extension(b"keyUsage", True, b"keyCertSign, cRLSign"),
OpenSSL.crypto.X509Extension(b"subjectKeyIdentifier", False, b"hash", subject=self.cert),
])
self.cert.sign(self.key, "sha1")
@ -134,29 +173,29 @@ class ProxyingRecorder(object):
self.payload_digest = None
self.proxy_dest = proxy_dest
self._proxy_dest_conn_open = True
self._prev_hunk_last_two_bytes = ''
self._prev_hunk_last_two_bytes = b''
self.len = 0
def _update(self, hunk):
if self.payload_digest is None:
# convoluted handling of two newlines crossing hunks
# XXX write tests for this
if self._prev_hunk_last_two_bytes.endswith('\n'):
if hunk.startswith('\n'):
if self._prev_hunk_last_two_bytes.endswith(b'\n'):
if hunk.startswith(b'\n'):
self.payload_digest = hashlib.new(self.digest_algorithm)
self.payload_digest.update(hunk[1:])
self.payload_offset = self.len + 1
elif hunk.startswith('\r\n'):
elif hunk.startswith(b'\r\n'):
self.payload_digest = hashlib.new(self.digest_algorithm)
self.payload_digest.update(hunk[2:])
self.payload_offset = self.len + 2
elif self._prev_hunk_last_two_bytes == '\n\r':
if hunk.startswith('\n'):
elif self._prev_hunk_last_two_bytes == b'\n\r':
if hunk.startswith(b'\n'):
self.payload_digest = hashlib.new(self.digest_algorithm)
self.payload_digest.update(hunk[1:])
self.payload_offset = self.len + 1
else:
m = re.search(r'\n\r?\n', hunk)
m = re.search(br'\n\r?\n', hunk)
if m is not None:
self.payload_digest = hashlib.new(self.digest_algorithm)
self.payload_digest.update(hunk[m.end():])
@ -184,14 +223,14 @@ class ProxyingRecorder(object):
def read(self, size=-1):
hunk = self.fp.read(size=size)
hunk = self.fp.read(size)
self._update(hunk)
return hunk
def readline(self, size=-1):
# XXX depends on implementation details of self.fp.readline(), in
# particular that it doesn't call self.fp.read()
hunk = self.fp.readline(size=size)
hunk = self.fp.readline(size)
self._update(hunk)
return hunk
@ -208,10 +247,10 @@ class ProxyingRecorder(object):
return 0
class ProxyingRecordingHTTPResponse(httplib.HTTPResponse):
class ProxyingRecordingHTTPResponse(http_client.HTTPResponse):
def __init__(self, sock, debuglevel=0, strict=0, method=None, buffering=False, proxy_dest=None, digest_algorithm='sha1'):
httplib.HTTPResponse.__init__(self, sock, debuglevel=debuglevel, strict=strict, method=method, buffering=buffering)
def __init__(self, sock, debuglevel=0, method=None, proxy_dest=None, digest_algorithm='sha1'):
http_client.HTTPResponse.__init__(self, sock, debuglevel=debuglevel, method=method)
# Keep around extra reference to self.fp because HTTPResponse sets
# self.fp=None after it finishes reading, but we still need it
@ -219,12 +258,22 @@ class ProxyingRecordingHTTPResponse(httplib.HTTPResponse):
self.fp = self.recorder
class MitmProxyHandler(BaseHTTPServer.BaseHTTPRequestHandler):
class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
logger = logging.getLogger('warcprox.MitmProxyHandler')
def __init__(self, request, client_address, server):
self.is_connect = False
BaseHTTPServer.BaseHTTPRequestHandler.__init__(self, request, client_address, server)
## XXX hack around bizarre bug on my mac python 3.2 in http.server
## where hasattr returns true in the code snippet below, but
## self._headers_buffer is None
#
# if not hasattr(self, '_headers_buffer'):
# self._headers_buffer = []
# self._headers_buffer.append(
self._headers_buffer = []
http_server.BaseHTTPRequestHandler.__init__(self, request, client_address, server)
def _determine_host_port(self):
# Get hostname and port to connect to
@ -232,13 +281,13 @@ class MitmProxyHandler(BaseHTTPServer.BaseHTTPRequestHandler):
self.hostname, self.port = self.path.split(':')
else:
self.url = self.path
u = urlparse.urlparse(self.url)
u = urllib_parse.urlparse(self.url)
if u.scheme != 'http':
raise Exception('Unknown scheme %s' % repr(u.scheme))
self.hostname = u.hostname
self.port = u.port or 80
self.path = urlparse.urlunparse(
urlparse.ParseResult(
self.path = urllib_parse.urlunparse(
urllib_parse.ParseResult(
scheme='',
netloc='',
params=u.params,
@ -291,8 +340,8 @@ class MitmProxyHandler(BaseHTTPServer.BaseHTTPRequestHandler):
else:
netloc = '{}:{}'.format(self.hostname, self.port)
result = urlparse.urlunparse(
urlparse.ParseResult(
result = urllib_parse.urlunparse(
urllib_parse.ParseResult(
scheme='https',
netloc=netloc,
params='',
@ -344,10 +393,10 @@ class WarcProxyHandler(MitmProxyHandler):
def _proxy_request(self):
# Build request
req = '%s %s %s\r\n' % (self.command, self.path, self.request_version)
req = '{} {} {}\r\n'.format(self.command, self.path, self.request_version).encode('utf-8')
# Add headers to the request
req += '%s\r\n' % self.headers
req += str(self.headers).encode('utf-8') + b'\r\n'
# Append message body if present to the request
if 'Content-Length' in self.headers:
@ -371,7 +420,7 @@ class WarcProxyHandler(MitmProxyHandler):
h.begin()
buf = h.read(8192)
while buf != '':
while buf != b'':
buf = h.read(8192)
self.log_request(h.status, h.recorder.len)
@ -389,19 +438,29 @@ class WarcProxyHandler(MitmProxyHandler):
class RecordedUrl(object):
def __init__(self, url, request_data, response_recorder, remote_ip):
self.url = url
# XXX should test what happens with non-ascii url (when does
# url-encoding happen?)
if type(url) is not bytes:
self.url = url.encode('ascii')
else:
self.url = url
if type(remote_ip) is not bytes:
self.remote_ip = remote_ip.encode('ascii')
else:
self.remote_ip = remote_ip
self.request_data = request_data
self.response_recorder = response_recorder
self.remote_ip = remote_ip
class WarcProxy(SocketServer.ThreadingMixIn, BaseHTTPServer.HTTPServer):
class WarcProxy(socketserver.ThreadingMixIn, http_server.HTTPServer):
logger = logging.getLogger('warcprox.WarcProxy')
def __init__(self, server_address=('localhost', 8000),
req_handler_class=WarcProxyHandler, bind_and_activate=True,
ca=None, recorded_url_q=None, digest_algorithm='sha1'):
BaseHTTPServer.HTTPServer.__init__(self, server_address, req_handler_class, bind_and_activate)
http_server.HTTPServer.__init__(self, server_address, req_handler_class, bind_and_activate)
self.digest_algorithm = digest_algorithm
@ -413,15 +472,15 @@ class WarcProxy(SocketServer.ThreadingMixIn, BaseHTTPServer.HTTPServer):
if recorded_url_q is not None:
self.recorded_url_q = recorded_url_q
else:
self.recorded_url_q = Queue.Queue()
self.recorded_url_q = queue.Queue()
def server_activate(self):
BaseHTTPServer.HTTPServer.server_activate(self)
http_server.HTTPServer.server_activate(self)
self.logger.info('WarcProxy listening on {0}:{1}'.format(self.server_address[0], self.server_address[1]))
def server_close(self):
self.logger.info('WarcProxy shutting down')
BaseHTTPServer.HTTPServer.server_close(self)
http_server.HTTPServer.server_close(self)
class PlaybackProxyHandler(MitmProxyHandler):
@ -435,31 +494,31 @@ class PlaybackProxyHandler(MitmProxyHandler):
# @Override
def _proxy_request(self):
date, location = self.server.playback_index_db.lookup_latest(self.url)
date, location = self.server.playback_index_db.lookup_latest(self.url.encode('utf-8'))
self.logger.debug('lookup_latest returned {}:{}'.format(date, location))
status = None
if location is not None:
try:
status, sz = self._send_response_from_warc(location[b'f'], location[b'o'])
status, sz = self._send_response_from_warc(location['f'], location['o'])
except:
status = 500
self.logger.error('PlaybackProxyHandler problem playing back {}'.format(self.url), exc_info=1)
payload = '500 Warcprox Error\n\n{}\n'.format(traceback.format_exc())
headers = ('HTTP/1.1 500 Internal Server Error\r\n'
+ 'Content-Type: text/plain\r\n'
+ 'Content-Length: {}\r\n'
+ '\r\n').format(len(payload))
payload = b'500 Warcprox Error\n\n{}\n'.format(traceback.format_exc()).encode('utf-8')
headers = (b'HTTP/1.1 500 Internal Server Error\r\n'
+ b'Content-Type: text/plain;charset=utf-8\r\n'
+ b'Content-Length: ' + len(payload) + b'\r\n'
+ b'\r\n')
self.connection.sendall(headers)
self.connection.sendall(payload)
sz = len(headers) + len(payload)
else:
status = 404
payload = '404 Not in Archive\n'
headers = ('HTTP/1.1 404 Not Found\r\n'
+ 'Content-Type: text/plain\r\n'
+ 'Content-Length: {}\r\n'
+ '\r\n').format(len(payload))
payload = b'404 Not in Archive\n'
headers = (b'HTTP/1.1 404 Not Found\r\n'
+ b'Content-Type: text/plain;charset=utf-8\r\n'
+ b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n'
+ b'\r\n')
self.connection.sendall(headers)
self.connection.sendall(payload)
sz = len(headers) + len(payload)
@ -494,7 +553,7 @@ class PlaybackProxyHandler(MitmProxyHandler):
while True:
buf = payload_fh.read(8192)
if buf == '': break
if buf == b'': break
self.connection.sendall(buf)
sz += len(buf)
@ -520,7 +579,7 @@ class PlaybackProxyHandler(MitmProxyHandler):
# find end of headers
while True:
line = record.content_file.readline()
if line == '' or re.match('^\r?\n$', line):
if line == b'' or re.match(br'^\r?\n$', line):
break
return self._send_response(headers, record.content_file)
@ -545,7 +604,7 @@ class PlaybackProxyHandler(MitmProxyHandler):
while True:
line = record.content_file.readline()
headers_buf.extend(line)
if line == '' or re.match('^\r?\n$', line):
if line == b'' or re.match(b'^\r?\n$', line):
break
return self._send_response(headers_buf, record.content_file)
@ -573,24 +632,24 @@ class PlaybackProxyHandler(MitmProxyHandler):
raise Exception('should not reach this point')
class PlaybackProxy(SocketServer.ThreadingMixIn, BaseHTTPServer.HTTPServer):
class PlaybackProxy(socketserver.ThreadingMixIn, http_server.HTTPServer):
logger = logging.getLogger('warcprox.PlaybackProxy')
def __init__(self, server_address, req_handler_class=PlaybackProxyHandler,
bind_and_activate=True, ca=None, playback_index_db=None,
warcs_dir=None):
BaseHTTPServer.HTTPServer.__init__(self, server_address, req_handler_class, bind_and_activate)
http_server.HTTPServer.__init__(self, server_address, req_handler_class, bind_and_activate)
self.ca = ca
self.playback_index_db = playback_index_db
self.warcs_dir = warcs_dir
def server_activate(self):
BaseHTTPServer.HTTPServer.server_activate(self)
http_server.HTTPServer.server_activate(self)
self.logger.info('PlaybackProxy listening on {0}:{1}'.format(self.server_address[0], self.server_address[1]))
def server_close(self):
self.logger.info('PlaybackProxy shutting down')
BaseHTTPServer.HTTPServer.server_close(self)
http_server.HTTPServer.server_close(self)
class DedupDb(object):
@ -602,7 +661,7 @@ class DedupDb(object):
else:
self.logger.info('creating new deduplication database {}'.format(dbm_file))
self.db = gdbm.open(dbm_file, 'c')
self.db = dbm_gnu.open(dbm_file, 'c')
def close(self):
self.db.close()
@ -611,21 +670,24 @@ class DedupDb(object):
self.db.sync()
def save(self, key, response_record, offset):
record_id = response_record.get_header(warctools.WarcRecord.ID)
url = response_record.get_header(warctools.WarcRecord.URL)
date = response_record.get_header(warctools.WarcRecord.DATE)
record_id = response_record.get_header(warctools.WarcRecord.ID).decode('latin1')
url = response_record.get_header(warctools.WarcRecord.URL).decode('latin1')
date = response_record.get_header(warctools.WarcRecord.DATE).decode('latin1')
py_value = {'i':record_id, 'u':url, 'd':date}
json_value = json.dumps(py_value, separators=(',',':'))
self.db[key] = json_value
self.db[key] = json_value.encode('utf-8')
self.logger.debug('dedup db saved {}:{}'.format(key, json_value))
def lookup(self, key):
if key in self.db:
json_result = self.db[key]
result = json.loads(json_result)
result = json.loads(json_result.decode('utf-8'))
result['i'] = result['i'].encode('latin1')
result['u'] = result['u'].encode('latin1')
result['d'] = result['d'].encode('latin1')
return result
else:
return None
@ -717,8 +779,7 @@ class WarcWriterThread(threading.Thread):
def digest_str(self, hash_obj):
return '{}:{}'.format(hash_obj.name,
base64.b32encode(hash_obj.digest()) if self.base32 else hash_obj.hexdigest())
return hash_obj.name.encode('utf-8') + b':' + (base64.b32encode(hash_obj.digest()) if self.base32 else hash_obj.hexdigest().encode('ascii'))
def build_warc_record(self, url, warc_date=None, recorder=None, data=None,
@ -753,7 +814,7 @@ class WarcWriterThread(threading.Thread):
headers.append((warctools.WarcRecord.CONTENT_TYPE, content_type))
if recorder is not None:
headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(recorder))))
headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(recorder)).encode('latin1')))
headers.append((warctools.WarcRecord.BLOCK_DIGEST,
self.digest_str(recorder.block_digest)))
if recorder.payload_digest is not None:
@ -764,7 +825,7 @@ class WarcWriterThread(threading.Thread):
record = warctools.WarcRecord(headers=headers, content_file=recorder.tempfile)
else:
headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(data))))
headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(data)).encode('latin1')))
block_digest = hashlib.new(self.digest_algorithm, data)
headers.append((warctools.WarcRecord.BLOCK_DIGEST,
self.digest_str(block_digest)))
@ -796,21 +857,21 @@ class WarcWriterThread(threading.Thread):
headers = []
headers.append((warctools.WarcRecord.ID, record_id))
headers.append((warctools.WarcRecord.TYPE, warctools.WarcRecord.WARCINFO))
headers.append((warctools.WarcRecord.FILENAME, filename))
headers.append((warctools.WarcRecord.FILENAME, filename.encode('latin1')))
headers.append((warctools.WarcRecord.DATE, warc_record_date))
warcinfo_fields = []
warcinfo_fields.append('software: warcprox.py https://github.com/internetarchive/warcprox')
warcinfo_fields.append(b'software: warcprox.py https://github.com/internetarchive/warcprox')
hostname = socket.gethostname()
warcinfo_fields.append('hostname: {0}'.format(hostname))
warcinfo_fields.append('ip: {0}'.format(socket.gethostbyname(hostname)))
warcinfo_fields.append('format: WARC File Format 1.0')
warcinfo_fields.append('hostname: {}'.format(hostname).encode('latin1'))
warcinfo_fields.append('ip: {0}'.format(socket.gethostbyname(hostname)).encode('latin1'))
warcinfo_fields.append(b'format: WARC File Format 1.0')
# warcinfo_fields.append('robots: ignore')
# warcinfo_fields.append('description: {0}'.format(self.description))
# warcinfo_fields.append('isPartOf: {0}'.format(self.is_part_of))
data = '\r\n'.join(warcinfo_fields) + '\r\n'
data = b'\r\n'.join(warcinfo_fields) + b'\r\n'
record = warctools.WarcRecord(headers=headers, content=('application/warc-fields', data))
record = warctools.WarcRecord(headers=headers, content=(b'application/warc-fields', data))
return record
@ -879,7 +940,7 @@ class WarcWriterThread(threading.Thread):
self._final_tasks(recorded_url, recordset, recordset_offset)
except Queue.Empty:
except queue.Empty:
if (self._fpath is not None
and self.rollover_idle_time is not None
and self.rollover_idle_time > 0
@ -907,7 +968,7 @@ class PlaybackIndexDb(object):
else:
self.logger.info('creating new playback index database {}'.format(dbm_file))
self.db = gdbm.open(dbm_file, 'c')
self.db = dbm_gnu.open(dbm_file, 'c')
def close(self):
@ -922,27 +983,27 @@ class PlaybackIndexDb(object):
response_record = recordset[0]
# XXX canonicalize url?
url = response_record.get_header(warctools.WarcRecord.URL)
date = response_record.get_header(warctools.WarcRecord.DATE)
record_id = response_record.get_header(warctools.WarcRecord.ID)
date_str = response_record.get_header(warctools.WarcRecord.DATE).decode('latin1')
record_id_str = response_record.get_header(warctools.WarcRecord.ID).decode('latin1')
# there could be two visits of same url in the same second, and WARC-Date is
# prescribed as YYYY-MM-DDThh:mm:ssZ, so we have to handle it :-\
# url:{date1:[record1={'f':warcfile,'o':response_offset,'q':request_offset,'i':record_id},record2,...],date2:[{...}],...}
if url in self.db:
existing_json_value = self.db[url]
existing_json_value = self.db[url].decode('utf-8')
py_value = json.loads(existing_json_value)
else:
py_value = {}
if date in py_value:
py_value[date].append({'f':warcfile, 'o':offset, 'i':record_id})
if date_str in py_value:
py_value[date_str].append({'f':warcfile, 'o':offset, 'i':record_id_str})
else:
py_value[date] = [{'f':warcfile, 'o':offset, 'i':record_id}]
py_value[date_str] = [{'f':warcfile, 'o':offset, 'i':record_id_str}]
json_value = json.dumps(py_value, separators=(',',':'))
self.db[url] = json_value
self.db[url] = json_value.encode('utf-8')
self.logger.debug('playback index saved: {}:{}'.format(url, json_value))
@ -951,33 +1012,38 @@ class PlaybackIndexDb(object):
if url not in self.db:
return None, None
json_value = self.db[url]
self.logger.debug("'{}':{}".format(url, json_value))
json_value = self.db[url].decode('utf-8')
self.logger.debug("{}:{}".format(repr(url), repr(json_value)))
py_value = json.loads(json_value)
latest_date = max(py_value)
return latest_date, py_value[latest_date][0]
result = py_value[latest_date][0]
result['i'] = result['i'].encode('ascii')
return latest_date, result
# in python3 params are bytes
def lookup_exact(self, url, warc_date, record_id):
if url not in self.db:
return None
json_value = self.db[url]
self.logger.debug("'{}':{}".format(url, json_value))
json_value = self.db[url].decode('utf-8')
self.logger.debug("{}:{}".format(repr(url), repr(json_value)))
py_value = json.loads(json_value)
if warc_date in py_value:
for record in py_value[warc_date]:
if record['i'] == record_id:
warc_date_str = warc_date.decode('ascii')
if warc_date_str in py_value:
for record in py_value[warc_date_str]:
if record['i'].encode('ascii') == record_id:
self.logger.debug("found exact match for ({},{},{})".format(repr(warc_date), repr(record_id), repr(url)))
record['i'] = record['i'].encode('ascii')
return record
else:
self.logger.info("match not found for ({},{},{})".format(repr(warc_date), repr(record_id), repr(url)))
return None
class WarcproxController(object):
logger = logging.getLogger('warcprox.WarcproxController')
@ -1126,7 +1192,7 @@ def main(argv=sys.argv):
else:
dedup_db = DedupDb(args.dedup_db_file)
recorded_url_q = Queue.Queue()
recorded_url_q = queue.Queue()
ca = CertificateAuthority(args.cacert, args.certs_dir)