tests pass with python 2.7 and 3.2! (oddly, tox still fails)

Noah Levitt 2013-12-04 17:25:45 -08:00
parent 8ae164f8ca
commit dc9fdc3412
5 changed files with 242 additions and 152 deletions
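Nearly every hunk below applies the same two-interpreter import idiom: try the Python 3 module name first, fall back to the Python 2 name, and bind whichever one loaded to a single alias so the rest of the code never has to care which interpreter is running. A minimal standalone sketch of that pattern, using two of the module pairs that appear in this commit:

    # Python 3 names first, Python 2 fallbacks second; the alias is what the
    # rest of the code refers to.
    try:
        import queue                    # Python 3
    except ImportError:
        import Queue                    # Python 2
        queue = Queue

    try:
        import http.server
        http_server = http.server      # Python 3
    except ImportError:
        import BaseHTTPServer
        http_server = BaseHTTPServer   # Python 2

    q = queue.Queue()                  # works unchanged on 2.7 and 3.x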

.gitignore vendored

@@ -4,6 +4,7 @@
 *.pem
 *.db
 *.diff
+*.egg
 *.egg-info
 *.swp
 warcs


@@ -12,8 +12,8 @@ setuptools.setup(name='warcprox',
         long_description=open('README.rst').read(),
         license='GPL',
         packages=['warcprox'],
-        install_requires=['pyopenssl', 'warctools>=4.8.2'], # gdbm/dbhash?
-        dependency_links=['git+https://github.com/nlevitt/warctools.git@tweaks#egg=warctools-4.8.2'],
+        install_requires=['pyopenssl', 'warctools>=4.8.3'], # gdbm/dbhash?
+        dependency_links=['git+https://github.com/nlevitt/warctools.git@python3#egg=warctools-4.8.3'],
         tests_require=['requests>=2.0.1'], # >=2.0.1 for https://github.com/kennethreitz/requests/pull/1636
         scripts=['bin/dump-anydbm', 'bin/warcprox'],
         zip_safe=False,

tox.ini Normal file

@@ -0,0 +1,11 @@
+# Tox (http://tox.testrun.org/) is a tool for running tests
+# in multiple virtualenvs. This configuration file will run the
+# test suite on all supported python versions. To use it, "pip install tox"
+# and then run "tox" from this directory.
+[tox]
+envlist = py27, py32, py33
+[testenv]
+commands = {envpython} setup.py test
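The comments in the new tox.ini describe the intent: tox builds an isolated virtualenv for each interpreter in the envlist and runs the test command in each one. A hedged sketch of the manual equivalent, assuming python2.7, python3.2, and python3.3 are installed and on PATH (it skips the virtualenv isolation that tox provides):

    # manual stand-in for "tox": run the test suite under each interpreter
    import subprocess

    for interpreter in ("python2.7", "python3.2", "python3.3"):
        subprocess.check_call([interpreter, "setup.py", "test"])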


@@ -3,7 +3,6 @@
 from warcprox import warcprox
 import unittest
-import BaseHTTPServer
 import threading
 import time
 import logging
@@ -14,10 +13,23 @@ import tempfile
 import OpenSSL
 import os
 import shutil
-import Queue
 import requests
-class TestHttpRequestHandler(BaseHTTPServer.BaseHTTPRequestHandler):
+try:
+    import http.server
+    http_server = http.server
+except ImportError:
+    import BaseHTTPServer
+    http_server = BaseHTTPServer
+try:
+    import queue
+except ImportError:
+    import Queue
+    queue = Queue
+class TestHttpRequestHandler(http_server.BaseHTTPRequestHandler):
     logger = logging.getLogger('TestHttpRequestHandler')
     def do_GET(self):
@@ -25,19 +37,19 @@ class TestHttpRequestHandler(BaseHTTPServer.BaseHTTPRequestHandler):
         m = re.match(r'^/([^/]+)/([^/]+)$', self.path)
         if m is not None:
-            special_header = 'warcprox-test-header: {}!'.format(m.group(1))
-            payload = 'I am the warcprox test payload! {}!\n'.format(10*m.group(2))
-            headers = ('HTTP/1.1 200 OK\r\n'
-                    + 'Content-Type: text/plain\r\n'
-                    + '{}\r\n'
-                    + 'Content-Length: {}\r\n'
-                    + '\r\n').format(special_header, len(payload))
+            special_header = 'warcprox-test-header: {}!'.format(m.group(1)).encode('utf-8')
+            payload = 'I am the warcprox test payload! {}!\n'.format(10*m.group(2)).encode('utf-8')
+            headers = (b'HTTP/1.1 200 OK\r\n'
+                    + b'Content-Type: text/plain\r\n'
+                    + special_header + b'\r\n'
+                    + b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n'
+                    + b'\r\n')
         else:
-            payload = '404 Not Found\n'
-            headers = ('HTTP/1.1 404 Not Found\r\n'
-                    + 'Content-Type: text/plain\r\n'
-                    + 'Content-Length: {}\r\n'
-                    + '\r\n').format(len(payload))
+            payload = b'404 Not Found\n'
+            headers = (b'HTTP/1.1 404 Not Found\r\n'
+                    + b'Content-Type: text/plain\r\n'
+                    + b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n'
+                    + b'\r\n')
         self.connection.sendall(headers)
         self.connection.sendall(payload)
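The header-building change above is the core Python 3 issue in this test handler: socket.sendall() wants bytes on Python 3, and bytes objects have no .format(), so the string templating is replaced with bytes concatenation. A tiny illustration of the resulting construction (not part of the test suite):

    payload = b'hello\n'
    headers = (b'HTTP/1.1 200 OK\r\n'
            + b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n'
            + b'\r\n')
    assert headers == b'HTTP/1.1 200 OK\r\nContent-Length: 6\r\n\r\n'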
@@ -82,7 +94,7 @@ class WarcproxTest(unittest.TestCase):
     def _start_http_servers(self):
-        self.http_daemon = BaseHTTPServer.HTTPServer(('localhost', 0),
+        self.http_daemon = http_server.HTTPServer(('localhost', 0),
                 RequestHandlerClass=TestHttpRequestHandler)
         self.logger.info('starting http://{}:{}'.format(self.http_daemon.server_address[0], self.http_daemon.server_address[1]))
         self.http_daemon_thread = threading.Thread(name='HttpdThread',
@@ -90,7 +102,7 @@
         self.http_daemon_thread.start()
         # http://www.piware.de/2011/01/creating-an-https-server-in-python/
-        self.https_daemon = BaseHTTPServer.HTTPServer(('localhost', 0),
+        self.https_daemon = http_server.HTTPServer(('localhost', 0),
                 RequestHandlerClass=TestHttpRequestHandler)
         # self.https_daemon.socket = ssl.wrap_socket(httpd.socket, certfile='path/to/localhost.pem', server_side=True)
         self.https_daemon.socket = ssl.wrap_socket(self.https_daemon.socket, certfile=self._cert, server_side=True)
@@ -107,7 +119,7 @@
         self._ca_dir = tempfile.mkdtemp(prefix='warcprox-test-', suffix='-ca')
         ca = warcprox.CertificateAuthority(self._ca_file, self._ca_dir)
-        recorded_url_q = Queue.Queue()
+        recorded_url_q = queue.Queue()
         proxy = warcprox.WarcProxy(server_address=('localhost', 0), ca=ca,
                 recorded_url_q=recorded_url_q)
@@ -183,24 +195,24 @@
         url = 'http://localhost:{}/'.format(self.http_daemon.server_port)
         response = requests.get(url)
         self.assertEqual(response.status_code, 404)
-        self.assertEqual(response.content, '404 Not Found\n')
+        self.assertEqual(response.content, b'404 Not Found\n')
         url = 'https://localhost:{}/'.format(self.https_daemon.server_port)
         response = requests.get(url, verify=False)
         self.assertEqual(response.status_code, 404)
-        self.assertEqual(response.content, '404 Not Found\n')
+        self.assertEqual(response.content, b'404 Not Found\n')
         url = 'http://localhost:{}/a/b'.format(self.http_daemon.server_port)
         response = requests.get(url)
         self.assertEqual(response.status_code, 200)
         self.assertEqual(response.headers['warcprox-test-header'], 'a!')
-        self.assertEqual(response.content, 'I am the warcprox test payload! bbbbbbbbbb!\n')
+        self.assertEqual(response.content, b'I am the warcprox test payload! bbbbbbbbbb!\n')
         url = 'https://localhost:{}/c/d'.format(self.https_daemon.server_port)
         response = requests.get(url, verify=False)
         self.assertEqual(response.status_code, 200)
         self.assertEqual(response.headers['warcprox-test-header'], 'c!')
-        self.assertEqual(response.content, 'I am the warcprox test payload! dddddddddd!\n')
+        self.assertEqual(response.content, b'I am the warcprox test payload! dddddddddd!\n')
     def poll_playback_until(self, url, status, timeout_sec):
@@ -221,18 +233,18 @@
         # ensure playback fails before archiving
         response = requests.get(url, proxies=self.playback_proxies)
         self.assertEqual(response.status_code, 404)
-        self.assertEqual(response.content, '404 Not in Archive\n')
+        self.assertEqual(response.content, b'404 Not in Archive\n')
         # archive
         response = requests.get(url, proxies=self.archiving_proxies)
         self.assertEqual(response.status_code, 200)
         self.assertEqual(response.headers['warcprox-test-header'], 'a!')
-        self.assertEqual(response.content, 'I am the warcprox test payload! bbbbbbbbbb!\n')
+        self.assertEqual(response.content, b'I am the warcprox test payload! bbbbbbbbbb!\n')
         response = self.poll_playback_until(url, status=200, timeout_sec=10)
         self.assertEqual(response.status_code, 200)
         self.assertEqual(response.headers['warcprox-test-header'], 'a!')
-        self.assertEqual(response.content, 'I am the warcprox test payload! bbbbbbbbbb!\n')
+        self.assertEqual(response.content, b'I am the warcprox test payload! bbbbbbbbbb!\n')
     def _test_archive_and_playback_https_url(self):
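These tests drive everything through the requests library's proxies= parameter: once against the archiving proxy, once against the playback proxy. A hedged sketch of how such proxy dicts are typically built for requests (the port numbers are whatever the test servers happened to bind; the variable names are illustrative, not lifted from the test setup):

    import requests

    archiving_proxies = {'http': 'http://localhost:8000', 'https': 'http://localhost:8000'}
    playback_proxies = {'http': 'http://localhost:8001', 'https': 'http://localhost:8001'}

    # verify=False because warcprox mints its own certificates for https MITM
    response = requests.get('https://example.com/', proxies=archiving_proxies, verify=False)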
@@ -241,19 +253,19 @@
         # ensure playback fails before archiving
         response = requests.get(url, proxies=self.playback_proxies, verify=False)
         self.assertEqual(response.status_code, 404)
-        self.assertEqual(response.content, '404 Not in Archive\n')
+        self.assertEqual(response.content, b'404 Not in Archive\n')
         # fetch & archive response
         response = requests.get(url, proxies=self.archiving_proxies, verify=False)
         self.assertEqual(response.status_code, 200)
         self.assertEqual(response.headers['warcprox-test-header'], 'c!')
-        self.assertEqual(response.content, 'I am the warcprox test payload! dddddddddd!\n')
+        self.assertEqual(response.content, b'I am the warcprox test payload! dddddddddd!\n')
         # test playback
         response = self.poll_playback_until(url, status=200, timeout_sec=10)
         self.assertEqual(response.status_code, 200)
         self.assertEqual(response.headers['warcprox-test-header'], 'c!')
-        self.assertEqual(response.content, 'I am the warcprox test payload! dddddddddd!\n')
+        self.assertEqual(response.content, b'I am the warcprox test payload! dddddddddd!\n')
     # test dedup of same http url with same payload
@@ -263,30 +275,30 @@
         # ensure playback fails before archiving
         response = requests.get(url, proxies=self.playback_proxies, verify=False)
         self.assertEqual(response.status_code, 404)
-        self.assertEqual(response.content, '404 Not in Archive\n')
+        self.assertEqual(response.content, b'404 Not in Archive\n')
         # check not in dedup db
-        dedup_lookup = self.warcprox.warc_writer.dedup_db.lookup('sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc')
+        dedup_lookup = self.warcprox.warc_writer.dedup_db.lookup(b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc')
         self.assertIsNone(dedup_lookup)
         # archive
         response = requests.get(url, proxies=self.archiving_proxies, verify=False)
         self.assertEqual(response.status_code, 200)
         self.assertEqual(response.headers['warcprox-test-header'], 'e!')
-        self.assertEqual(response.content, 'I am the warcprox test payload! ffffffffff!\n')
+        self.assertEqual(response.content, b'I am the warcprox test payload! ffffffffff!\n')
         # test playback
         response = self.poll_playback_until(url, status=200, timeout_sec=10)
         self.assertEqual(response.status_code, 200)
         self.assertEqual(response.headers['warcprox-test-header'], 'e!')
-        self.assertEqual(response.content, 'I am the warcprox test payload! ffffffffff!\n')
+        self.assertEqual(response.content, b'I am the warcprox test payload! ffffffffff!\n')
         # check in dedup db
         # {u'i': u'<urn:uuid:e691dc0f-4bb9-4ad8-9afb-2af836aa05e4>', u'u': u'https://localhost:62841/c/d', u'd': u'2013-11-22T00:14:37Z'}
-        dedup_lookup = self.warcprox.warc_writer.dedup_db.lookup('sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc')
-        self.assertEquals(dedup_lookup['u'], url)
-        self.assertRegexpMatches(dedup_lookup['i'], r'^<urn:uuid:[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}>$')
-        self.assertRegexpMatches(dedup_lookup['d'], r'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$')
+        dedup_lookup = self.warcprox.warc_writer.dedup_db.lookup(b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc')
+        self.assertEqual(dedup_lookup['u'], url.encode('ascii'))
+        self.assertRegexpMatches(dedup_lookup['i'], br'^<urn:uuid:[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}>$')
+        self.assertRegexpMatches(dedup_lookup['d'], br'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$')
         record_id = dedup_lookup['i']
         dedup_date = dedup_lookup['d']
@@ -298,23 +310,23 @@
         response = requests.get(url, proxies=self.archiving_proxies, verify=False)
         self.assertEqual(response.status_code, 200)
         self.assertEqual(response.headers['warcprox-test-header'], 'e!')
-        self.assertEqual(response.content, 'I am the warcprox test payload! ffffffffff!\n')
+        self.assertEqual(response.content, b'I am the warcprox test payload! ffffffffff!\n')
         # XXX need to give warc writer thread a chance, and we don't have any change to poll for :-\
         time.sleep(2.0)
         # check in dedup db (no change from prev)
-        dedup_lookup = self.warcprox.warc_writer.dedup_db.lookup('sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc')
-        self.assertEquals(dedup_lookup['u'], url)
-        self.assertEquals(dedup_lookup['i'], record_id)
-        self.assertEquals(dedup_lookup['d'], dedup_date)
+        dedup_lookup = self.warcprox.warc_writer.dedup_db.lookup(b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc')
+        self.assertEqual(dedup_lookup['u'], url.encode('ascii'))
+        self.assertEqual(dedup_lookup['i'], record_id)
+        self.assertEqual(dedup_lookup['d'], dedup_date)
         # test playback
         self.logger.debug('testing playback of revisit of {}'.format(url))
         response = self.poll_playback_until(url, status=200, timeout_sec=10)
         self.assertEqual(response.status_code, 200)
         self.assertEqual(response.headers['warcprox-test-header'], 'e!')
-        self.assertEqual(response.content, 'I am the warcprox test payload! ffffffffff!\n')
+        self.assertEqual(response.content, b'I am the warcprox test payload! ffffffffff!\n')
         # XXX how to check dedup was used?
@@ -325,30 +337,30 @@
         # ensure playback fails before archiving
         response = requests.get(url, proxies=self.playback_proxies, verify=False)
         self.assertEqual(response.status_code, 404)
-        self.assertEqual(response.content, '404 Not in Archive\n')
+        self.assertEqual(response.content, b'404 Not in Archive\n')
         # check not in dedup db
-        dedup_lookup = self.warcprox.warc_writer.dedup_db.lookup('sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89')
+        dedup_lookup = self.warcprox.warc_writer.dedup_db.lookup(b'sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89')
         self.assertIsNone(dedup_lookup)
         # archive
         response = requests.get(url, proxies=self.archiving_proxies, verify=False)
         self.assertEqual(response.status_code, 200)
         self.assertEqual(response.headers['warcprox-test-header'], 'g!')
-        self.assertEqual(response.content, 'I am the warcprox test payload! hhhhhhhhhh!\n')
+        self.assertEqual(response.content, b'I am the warcprox test payload! hhhhhhhhhh!\n')
         # test playback
         response = self.poll_playback_until(url, status=200, timeout_sec=10)
         self.assertEqual(response.status_code, 200)
         self.assertEqual(response.headers['warcprox-test-header'], 'g!')
-        self.assertEqual(response.content, 'I am the warcprox test payload! hhhhhhhhhh!\n')
+        self.assertEqual(response.content, b'I am the warcprox test payload! hhhhhhhhhh!\n')
         # check in dedup db
         # {u'i': u'<urn:uuid:e691dc0f-4bb9-4ad8-9afb-2af836aa05e4>', u'u': u'https://localhost:62841/c/d', u'd': u'2013-11-22T00:14:37Z'}
-        dedup_lookup = self.warcprox.warc_writer.dedup_db.lookup('sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89')
-        self.assertEquals(dedup_lookup['u'], url)
-        self.assertRegexpMatches(dedup_lookup['i'], r'^<urn:uuid:[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}>$')
-        self.assertRegexpMatches(dedup_lookup['d'], r'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$')
+        dedup_lookup = self.warcprox.warc_writer.dedup_db.lookup(b'sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89')
+        self.assertEqual(dedup_lookup['u'], url.encode('ascii'))
+        self.assertRegexpMatches(dedup_lookup['i'], br'^<urn:uuid:[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}>$')
+        self.assertRegexpMatches(dedup_lookup['d'], br'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$')
         record_id = dedup_lookup['i']
         dedup_date = dedup_lookup['d']
@@ -360,23 +372,23 @@
         response = requests.get(url, proxies=self.archiving_proxies, verify=False)
         self.assertEqual(response.status_code, 200)
         self.assertEqual(response.headers['warcprox-test-header'], 'g!')
-        self.assertEqual(response.content, 'I am the warcprox test payload! hhhhhhhhhh!\n')
+        self.assertEqual(response.content, b'I am the warcprox test payload! hhhhhhhhhh!\n')
         # XXX need to give warc writer thread a chance, and we don't have any change to poll for :-\
         time.sleep(2.0)
         # check in dedup db (no change from prev)
-        dedup_lookup = self.warcprox.warc_writer.dedup_db.lookup('sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89')
-        self.assertEquals(dedup_lookup['u'], url)
-        self.assertEquals(dedup_lookup['i'], record_id)
-        self.assertEquals(dedup_lookup['d'], dedup_date)
+        dedup_lookup = self.warcprox.warc_writer.dedup_db.lookup(b'sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89')
+        self.assertEqual(dedup_lookup['u'], url.encode('ascii'))
+        self.assertEqual(dedup_lookup['i'], record_id)
+        self.assertEqual(dedup_lookup['d'], dedup_date)
         # test playback
         self.logger.debug('testing playback of revisit of {}'.format(url))
         response = self.poll_playback_until(url, status=200, timeout_sec=10)
         self.assertEqual(response.status_code, 200)
         self.assertEqual(response.headers['warcprox-test-header'], 'g!')
-        self.assertEqual(response.content, 'I am the warcprox test payload! hhhhhhhhhh!\n')
+        self.assertEqual(response.content, b'I am the warcprox test payload! hhhhhhhhhh!\n')
         # XXX how to check dedup was used?


@@ -4,12 +4,55 @@
 """
 WARC writing MITM HTTP/S proxy
-See README.md or https://github.com/internetarchive/warcprox
+See README.rst or https://github.com/internetarchive/warcprox
 """
-import BaseHTTPServer, SocketServer
+try:
+    import http.server
+    http_server = http.server
+except ImportError:
+    import BaseHTTPServer
+    http_server = BaseHTTPServer
+try:
+    import socketserver
+except ImportError:
+    import SocketServer
+    socketserver = SocketServer
+try:
+    import urllib.parse
+    urllib_parse = urllib.parse
+except ImportError:
+    import urlparse
+    urllib_parse = urlparse
+try:
+    import queue
+except ImportError:
+    import Queue
+    queue = Queue
+try:
+    import http.client
+    http_client = http.client
+except ImportError:
+    import httplib
+    http_client = httplib
+try:
+    import dbm.gnu
+    dbm_gnu = dbm.gnu
+except ImportError:
+    import gdbm
+    dbm_gnu = gdbm
+try:
+    from io import StringIO
+except ImportError:
+    from StringIO import StringIO
 import socket
-import urlparse
 import OpenSSL
 import ssl
 import logging
@@ -17,12 +60,10 @@ import sys
 from hanzo import warctools, httptools
 import hashlib
 from datetime import datetime
-import Queue
 import threading
 import os
 import argparse
 import random
-import httplib
 import re
 import signal
 import time
@@ -30,8 +71,6 @@ import tempfile
 import base64
 import json
 import traceback
-import gdbm
-from StringIO import StringIO
 class CertificateAuthority(object):
     logger = logging.getLogger('warcprox.CertificateAuthority')
@@ -66,9 +105,9 @@ class CertificateAuthority(object):
         self.cert.set_issuer(self.cert.get_subject())
         self.cert.set_pubkey(self.key)
         self.cert.add_extensions([
-            OpenSSL.crypto.X509Extension("basicConstraints", True, "CA:TRUE, pathlen:0"),
-            OpenSSL.crypto.X509Extension("keyUsage", True, "keyCertSign, cRLSign"),
-            OpenSSL.crypto.X509Extension("subjectKeyIdentifier", False, "hash", subject=self.cert),
+            OpenSSL.crypto.X509Extension(b"basicConstraints", True, b"CA:TRUE, pathlen:0"),
+            OpenSSL.crypto.X509Extension(b"keyUsage", True, b"keyCertSign, cRLSign"),
+            OpenSSL.crypto.X509Extension(b"subjectKeyIdentifier", False, b"hash", subject=self.cert),
             ])
         self.cert.sign(self.key, "sha1")
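The only change in this hunk is that the X509Extension arguments become bytes; pyOpenSSL takes the extension type name and value as bytes, which Python 2's str satisfied implicitly but Python 3's str does not. A small hedged check of the same call in isolation:

    import OpenSSL

    ext = OpenSSL.crypto.X509Extension(b"basicConstraints", True, b"CA:TRUE, pathlen:0")
    print(ext.get_short_name())   # b'basicConstraints'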
@@ -134,29 +173,29 @@ class ProxyingRecorder(object):
         self.payload_digest = None
         self.proxy_dest = proxy_dest
         self._proxy_dest_conn_open = True
-        self._prev_hunk_last_two_bytes = ''
+        self._prev_hunk_last_two_bytes = b''
         self.len = 0
     def _update(self, hunk):
         if self.payload_digest is None:
             # convoluted handling of two newlines crossing hunks
             # XXX write tests for this
-            if self._prev_hunk_last_two_bytes.endswith('\n'):
-                if hunk.startswith('\n'):
+            if self._prev_hunk_last_two_bytes.endswith(b'\n'):
+                if hunk.startswith(b'\n'):
                     self.payload_digest = hashlib.new(self.digest_algorithm)
                     self.payload_digest.update(hunk[1:])
                     self.payload_offset = self.len + 1
-                elif hunk.startswith('\r\n'):
+                elif hunk.startswith(b'\r\n'):
                     self.payload_digest = hashlib.new(self.digest_algorithm)
                     self.payload_digest.update(hunk[2:])
                     self.payload_offset = self.len + 2
-            elif self._prev_hunk_last_two_bytes == '\n\r':
-                if hunk.startswith('\n'):
+            elif self._prev_hunk_last_two_bytes == b'\n\r':
+                if hunk.startswith(b'\n'):
                     self.payload_digest = hashlib.new(self.digest_algorithm)
                     self.payload_digest.update(hunk[1:])
                     self.payload_offset = self.len + 1
             else:
-                m = re.search(r'\n\r?\n', hunk)
+                m = re.search(br'\n\r?\n', hunk)
                 if m is not None:
                     self.payload_digest = hashlib.new(self.digest_algorithm)
                     self.payload_digest.update(hunk[m.end():])
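The block above looks for the blank line that separates the HTTP headers from the body so the payload digest can start at the right offset, including the awkward case where the \r\n\r\n boundary is split across two read chunks (hence the _prev_hunk_last_two_bytes bookkeeping). A simplified, hedged sketch of just that idea, outside the ProxyingRecorder class:

    import re

    chunks = [b'HTTP/1.1 200 OK\r\nContent-Length: 4\r\n', b'\r\nbody']
    prev_tail = b''
    payload = None
    for chunk in chunks:
        if payload is None:
            if prev_tail.endswith(b'\n') and chunk.startswith(b'\r\n'):
                payload = chunk[2:]            # boundary straddled two chunks
            else:
                m = re.search(br'\n\r?\n', chunk)
                if m is not None:
                    payload = chunk[m.end():]  # boundary inside one chunk
            prev_tail = chunk[-2:]
        else:
            payload += chunk
    print(payload)  # b'body'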
@@ -184,14 +223,14 @@
     def read(self, size=-1):
-        hunk = self.fp.read(size=size)
+        hunk = self.fp.read(size)
         self._update(hunk)
         return hunk
     def readline(self, size=-1):
         # XXX depends on implementation details of self.fp.readline(), in
         # particular that it doesn't call self.fp.read()
-        hunk = self.fp.readline(size=size)
+        hunk = self.fp.readline(size)
         self._update(hunk)
         return hunk
@@ -208,10 +247,10 @@
         return 0
-class ProxyingRecordingHTTPResponse(httplib.HTTPResponse):
-    def __init__(self, sock, debuglevel=0, strict=0, method=None, buffering=False, proxy_dest=None, digest_algorithm='sha1'):
-        httplib.HTTPResponse.__init__(self, sock, debuglevel=debuglevel, strict=strict, method=method, buffering=buffering)
+class ProxyingRecordingHTTPResponse(http_client.HTTPResponse):
+    def __init__(self, sock, debuglevel=0, method=None, proxy_dest=None, digest_algorithm='sha1'):
+        http_client.HTTPResponse.__init__(self, sock, debuglevel=debuglevel, method=method)
         # Keep around extra reference to self.fp because HTTPResponse sets
         # self.fp=None after it finishes reading, but we still need it
@@ -219,12 +258,22 @@ class ProxyingRecordingHTTPResponse(httplib.HTTPResponse):
         self.fp = self.recorder
-class MitmProxyHandler(BaseHTTPServer.BaseHTTPRequestHandler):
+class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
     logger = logging.getLogger('warcprox.MitmProxyHandler')
     def __init__(self, request, client_address, server):
         self.is_connect = False
-        BaseHTTPServer.BaseHTTPRequestHandler.__init__(self, request, client_address, server)
+        ## XXX hack around bizarre bug on my mac python 3.2 in http.server
+        ## where hasattr returns true in the code snippet below, but
+        ## self._headers_buffer is None
+        #
+        # if not hasattr(self, '_headers_buffer'):
+        #     self._headers_buffer = []
+        # self._headers_buffer.append(
+        self._headers_buffer = []
+        http_server.BaseHTTPRequestHandler.__init__(self, request, client_address, server)
     def _determine_host_port(self):
         # Get hostname and port to connect to
@@ -232,13 +281,13 @@ class MitmProxyHandler(BaseHTTPServer.BaseHTTPRequestHandler):
             self.hostname, self.port = self.path.split(':')
         else:
             self.url = self.path
-            u = urlparse.urlparse(self.url)
+            u = urllib_parse.urlparse(self.url)
             if u.scheme != 'http':
                 raise Exception('Unknown scheme %s' % repr(u.scheme))
             self.hostname = u.hostname
             self.port = u.port or 80
-            self.path = urlparse.urlunparse(
-                    urlparse.ParseResult(
+            self.path = urllib_parse.urlunparse(
+                    urllib_parse.ParseResult(
                         scheme='',
                         netloc='',
                         params=u.params,
@@ -291,8 +340,8 @@ class MitmProxyHandler(BaseHTTPServer.BaseHTTPRequestHandler):
         else:
             netloc = '{}:{}'.format(self.hostname, self.port)
-        result = urlparse.urlunparse(
-                urlparse.ParseResult(
+        result = urllib_parse.urlunparse(
+                urllib_parse.ParseResult(
                     scheme='https',
                     netloc=netloc,
                     params='',
@@ -344,10 +393,10 @@ class WarcProxyHandler(MitmProxyHandler):
     def _proxy_request(self):
         # Build request
-        req = '%s %s %s\r\n' % (self.command, self.path, self.request_version)
+        req = '{} {} {}\r\n'.format(self.command, self.path, self.request_version).encode('utf-8')
         # Add headers to the request
-        req += '%s\r\n' % self.headers
+        req += str(self.headers).encode('utf-8') + b'\r\n'
         # Append message body if present to the request
         if 'Content-Length' in self.headers:
@@ -371,7 +420,7 @@ class WarcProxyHandler(MitmProxyHandler):
         h.begin()
         buf = h.read(8192)
-        while buf != '':
+        while buf != b'':
             buf = h.read(8192)
         self.log_request(h.status, h.recorder.len)
@@ -389,19 +438,29 @@
 class RecordedUrl(object):
     def __init__(self, url, request_data, response_recorder, remote_ip):
-        self.url = url
+        # XXX should test what happens with non-ascii url (when does
+        # url-encoding happen?)
+        if type(url) is not bytes:
+            self.url = url.encode('ascii')
+        else:
+            self.url = url
+        if type(remote_ip) is not bytes:
+            self.remote_ip = remote_ip.encode('ascii')
+        else:
+            self.remote_ip = remote_ip
         self.request_data = request_data
         self.response_recorder = response_recorder
-        self.remote_ip = remote_ip
-class WarcProxy(SocketServer.ThreadingMixIn, BaseHTTPServer.HTTPServer):
+class WarcProxy(socketserver.ThreadingMixIn, http_server.HTTPServer):
     logger = logging.getLogger('warcprox.WarcProxy')
     def __init__(self, server_address=('localhost', 8000),
             req_handler_class=WarcProxyHandler, bind_and_activate=True,
             ca=None, recorded_url_q=None, digest_algorithm='sha1'):
-        BaseHTTPServer.HTTPServer.__init__(self, server_address, req_handler_class, bind_and_activate)
+        http_server.HTTPServer.__init__(self, server_address, req_handler_class, bind_and_activate)
         self.digest_algorithm = digest_algorithm
@@ -413,15 +472,15 @@ class WarcProxy(SocketServer.ThreadingMixIn, BaseHTTPServer.HTTPServer):
         if recorded_url_q is not None:
             self.recorded_url_q = recorded_url_q
         else:
-            self.recorded_url_q = Queue.Queue()
+            self.recorded_url_q = queue.Queue()
     def server_activate(self):
-        BaseHTTPServer.HTTPServer.server_activate(self)
+        http_server.HTTPServer.server_activate(self)
         self.logger.info('WarcProxy listening on {0}:{1}'.format(self.server_address[0], self.server_address[1]))
     def server_close(self):
         self.logger.info('WarcProxy shutting down')
-        BaseHTTPServer.HTTPServer.server_close(self)
+        http_server.HTTPServer.server_close(self)
 class PlaybackProxyHandler(MitmProxyHandler):
@@ -435,31 +494,31 @@
     # @Override
     def _proxy_request(self):
-        date, location = self.server.playback_index_db.lookup_latest(self.url)
+        date, location = self.server.playback_index_db.lookup_latest(self.url.encode('utf-8'))
         self.logger.debug('lookup_latest returned {}:{}'.format(date, location))
         status = None
         if location is not None:
             try:
-                status, sz = self._send_response_from_warc(location[b'f'], location[b'o'])
+                status, sz = self._send_response_from_warc(location['f'], location['o'])
             except:
                 status = 500
                 self.logger.error('PlaybackProxyHandler problem playing back {}'.format(self.url), exc_info=1)
-                payload = '500 Warcprox Error\n\n{}\n'.format(traceback.format_exc())
-                headers = ('HTTP/1.1 500 Internal Server Error\r\n'
-                        + 'Content-Type: text/plain\r\n'
-                        + 'Content-Length: {}\r\n'
-                        + '\r\n').format(len(payload))
+                payload = b'500 Warcprox Error\n\n{}\n'.format(traceback.format_exc()).encode('utf-8')
+                headers = (b'HTTP/1.1 500 Internal Server Error\r\n'
+                        + b'Content-Type: text/plain;charset=utf-8\r\n'
+                        + b'Content-Length: ' + len(payload) + b'\r\n'
+                        + b'\r\n')
                 self.connection.sendall(headers)
                 self.connection.sendall(payload)
                 sz = len(headers) + len(payload)
         else:
             status = 404
-            payload = '404 Not in Archive\n'
-            headers = ('HTTP/1.1 404 Not Found\r\n'
-                    + 'Content-Type: text/plain\r\n'
-                    + 'Content-Length: {}\r\n'
-                    + '\r\n').format(len(payload))
+            payload = b'404 Not in Archive\n'
+            headers = (b'HTTP/1.1 404 Not Found\r\n'
+                    + b'Content-Type: text/plain;charset=utf-8\r\n'
+                    + b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n'
+                    + b'\r\n')
             self.connection.sendall(headers)
             self.connection.sendall(payload)
             sz = len(headers) + len(payload)
@@ -494,7 +553,7 @@
         while True:
             buf = payload_fh.read(8192)
-            if buf == '': break
+            if buf == b'': break
            self.connection.sendall(buf)
             sz += len(buf)
@@ -520,7 +579,7 @@
             # find end of headers
             while True:
                 line = record.content_file.readline()
-                if line == '' or re.match('^\r?\n$', line):
+                if line == b'' or re.match(br'^\r?\n$', line):
                     break
             return self._send_response(headers, record.content_file)
@@ -545,7 +604,7 @@
             while True:
                 line = record.content_file.readline()
                 headers_buf.extend(line)
-                if line == '' or re.match('^\r?\n$', line):
+                if line == b'' or re.match(b'^\r?\n$', line):
                     break
             return self._send_response(headers_buf, record.content_file)
@@ -573,24 +632,24 @@
         raise Exception('should not reach this point')
-class PlaybackProxy(SocketServer.ThreadingMixIn, BaseHTTPServer.HTTPServer):
+class PlaybackProxy(socketserver.ThreadingMixIn, http_server.HTTPServer):
     logger = logging.getLogger('warcprox.PlaybackProxy')
     def __init__(self, server_address, req_handler_class=PlaybackProxyHandler,
             bind_and_activate=True, ca=None, playback_index_db=None,
             warcs_dir=None):
-        BaseHTTPServer.HTTPServer.__init__(self, server_address, req_handler_class, bind_and_activate)
+        http_server.HTTPServer.__init__(self, server_address, req_handler_class, bind_and_activate)
         self.ca = ca
         self.playback_index_db = playback_index_db
         self.warcs_dir = warcs_dir
     def server_activate(self):
-        BaseHTTPServer.HTTPServer.server_activate(self)
+        http_server.HTTPServer.server_activate(self)
         self.logger.info('PlaybackProxy listening on {0}:{1}'.format(self.server_address[0], self.server_address[1]))
     def server_close(self):
         self.logger.info('PlaybackProxy shutting down')
-        BaseHTTPServer.HTTPServer.server_close(self)
+        http_server.HTTPServer.server_close(self)
 class DedupDb(object):
@@ -602,7 +661,7 @@ class DedupDb(object):
         else:
             self.logger.info('creating new deduplication database {}'.format(dbm_file))
-        self.db = gdbm.open(dbm_file, 'c')
+        self.db = dbm_gnu.open(dbm_file, 'c')
     def close(self):
         self.db.close()
@@ -611,21 +670,24 @@
         self.db.sync()
     def save(self, key, response_record, offset):
-        record_id = response_record.get_header(warctools.WarcRecord.ID)
-        url = response_record.get_header(warctools.WarcRecord.URL)
-        date = response_record.get_header(warctools.WarcRecord.DATE)
+        record_id = response_record.get_header(warctools.WarcRecord.ID).decode('latin1')
+        url = response_record.get_header(warctools.WarcRecord.URL).decode('latin1')
+        date = response_record.get_header(warctools.WarcRecord.DATE).decode('latin1')
         py_value = {'i':record_id, 'u':url, 'd':date}
         json_value = json.dumps(py_value, separators=(',',':'))
-        self.db[key] = json_value
+        self.db[key] = json_value.encode('utf-8')
         self.logger.debug('dedup db saved {}:{}'.format(key, json_value))
     def lookup(self, key):
         if key in self.db:
             json_result = self.db[key]
-            result = json.loads(json_result)
+            result = json.loads(json_result.decode('utf-8'))
+            result['i'] = result['i'].encode('latin1')
+            result['u'] = result['u'].encode('latin1')
+            result['d'] = result['d'].encode('latin1')
             return result
         else:
             return None
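The save()/lookup() changes above exist because gdbm stores bytes under Python 3: the value is compact JSON encoded to utf-8 on the way in, and lookup() decodes it and re-encodes each field so callers see bytes on both interpreters. A hedged sketch of that round trip with an illustrative record (the uuid, url, and date are just the example values quoted in the tests):

    import json

    key = b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc'
    py_value = {'i': '<urn:uuid:e691dc0f-4bb9-4ad8-9afb-2af836aa05e4>',
                'u': 'https://localhost:62841/c/d',
                'd': '2013-11-22T00:14:37Z'}
    stored = json.dumps(py_value, separators=(',', ':')).encode('utf-8')   # what goes into gdbm

    result = json.loads(stored.decode('utf-8'))
    result = {k: v.encode('latin1') for k, v in result.items()}            # what lookup() returns
    print(result['u'])  # b'https://localhost:62841/c/d'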
@@ -717,8 +779,7 @@ class WarcWriterThread(threading.Thread):
     def digest_str(self, hash_obj):
-        return '{}:{}'.format(hash_obj.name,
-                base64.b32encode(hash_obj.digest()) if self.base32 else hash_obj.hexdigest())
+        return hash_obj.name.encode('utf-8') + b':' + (base64.b32encode(hash_obj.digest()) if self.base32 else hash_obj.hexdigest().encode('ascii'))
     def build_warc_record(self, url, warc_date=None, recorder=None, data=None,
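digest_str() now returns bytes, joining the hash algorithm name to either a hex or a base32 digest; this is the value that goes into the WARC digest headers and the dedup db key. A hedged demo of the two forms for a throwaway input:

    import base64
    import hashlib

    h = hashlib.sha1(b'test')
    print(h.name.encode('utf-8') + b':' + h.hexdigest().encode('ascii'))
    # b'sha1:a94a8fe5ccb19ba61c4c0873d391e987982fbbd3'
    print(h.name.encode('utf-8') + b':' + base64.b32encode(h.digest()))
    # same digest, base32-encoded (the form used when self.base32 is set)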
@@ -753,7 +814,7 @@
             headers.append((warctools.WarcRecord.CONTENT_TYPE, content_type))
         if recorder is not None:
-            headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(recorder))))
+            headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(recorder)).encode('latin1')))
             headers.append((warctools.WarcRecord.BLOCK_DIGEST,
                 self.digest_str(recorder.block_digest)))
             if recorder.payload_digest is not None:
@@ -764,7 +825,7 @@
             record = warctools.WarcRecord(headers=headers, content_file=recorder.tempfile)
         else:
-            headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(data))))
+            headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(data)).encode('latin1')))
             block_digest = hashlib.new(self.digest_algorithm, data)
             headers.append((warctools.WarcRecord.BLOCK_DIGEST,
                 self.digest_str(block_digest)))
@@ -796,21 +857,21 @@
         headers = []
         headers.append((warctools.WarcRecord.ID, record_id))
         headers.append((warctools.WarcRecord.TYPE, warctools.WarcRecord.WARCINFO))
-        headers.append((warctools.WarcRecord.FILENAME, filename))
+        headers.append((warctools.WarcRecord.FILENAME, filename.encode('latin1')))
         headers.append((warctools.WarcRecord.DATE, warc_record_date))
         warcinfo_fields = []
-        warcinfo_fields.append('software: warcprox.py https://github.com/internetarchive/warcprox')
+        warcinfo_fields.append(b'software: warcprox.py https://github.com/internetarchive/warcprox')
         hostname = socket.gethostname()
-        warcinfo_fields.append('hostname: {0}'.format(hostname))
-        warcinfo_fields.append('ip: {0}'.format(socket.gethostbyname(hostname)))
-        warcinfo_fields.append('format: WARC File Format 1.0')
+        warcinfo_fields.append('hostname: {}'.format(hostname).encode('latin1'))
+        warcinfo_fields.append('ip: {0}'.format(socket.gethostbyname(hostname)).encode('latin1'))
+        warcinfo_fields.append(b'format: WARC File Format 1.0')
         # warcinfo_fields.append('robots: ignore')
         # warcinfo_fields.append('description: {0}'.format(self.description))
         # warcinfo_fields.append('isPartOf: {0}'.format(self.is_part_of))
-        data = '\r\n'.join(warcinfo_fields) + '\r\n'
-        record = warctools.WarcRecord(headers=headers, content=('application/warc-fields', data))
+        data = b'\r\n'.join(warcinfo_fields) + b'\r\n'
+        record = warctools.WarcRecord(headers=headers, content=(b'application/warc-fields', data))
         return record
@@ -879,7 +940,7 @@
                 self._final_tasks(recorded_url, recordset, recordset_offset)
-            except Queue.Empty:
+            except queue.Empty:
                 if (self._fpath is not None
                         and self.rollover_idle_time is not None
                         and self.rollover_idle_time > 0
@@ -907,7 +968,7 @@ class PlaybackIndexDb(object):
         else:
             self.logger.info('creating new playback index database {}'.format(dbm_file))
-        self.db = gdbm.open(dbm_file, 'c')
+        self.db = dbm_gnu.open(dbm_file, 'c')
     def close(self):
@@ -922,27 +983,27 @@
         response_record = recordset[0]
         # XXX canonicalize url?
         url = response_record.get_header(warctools.WarcRecord.URL)
-        date = response_record.get_header(warctools.WarcRecord.DATE)
-        record_id = response_record.get_header(warctools.WarcRecord.ID)
+        date_str = response_record.get_header(warctools.WarcRecord.DATE).decode('latin1')
+        record_id_str = response_record.get_header(warctools.WarcRecord.ID).decode('latin1')
         # there could be two visits of same url in the same second, and WARC-Date is
         # prescribed as YYYY-MM-DDThh:mm:ssZ, so we have to handle it :-\
         # url:{date1:[record1={'f':warcfile,'o':response_offset,'q':request_offset,'i':record_id},record2,...],date2:[{...}],...}
         if url in self.db:
-            existing_json_value = self.db[url]
+            existing_json_value = self.db[url].decode('utf-8')
             py_value = json.loads(existing_json_value)
         else:
             py_value = {}
-        if date in py_value:
-            py_value[date].append({'f':warcfile, 'o':offset, 'i':record_id})
+        if date_str in py_value:
+            py_value[date_str].append({'f':warcfile, 'o':offset, 'i':record_id_str})
         else:
-            py_value[date] = [{'f':warcfile, 'o':offset, 'i':record_id}]
+            py_value[date_str] = [{'f':warcfile, 'o':offset, 'i':record_id_str}]
         json_value = json.dumps(py_value, separators=(',',':'))
-        self.db[url] = json_value
+        self.db[url] = json_value.encode('utf-8')
         self.logger.debug('playback index saved: {}:{}'.format(url, json_value))
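The structure comment above spells out what gets stored: each url key maps to JSON of the form {date: [ {f, o, i}, ... ]}, with a list per date because two captures of the same url can land in the same second. An illustrative sketch of one such value (the warc filename, offsets, and second record id are made up for the example):

    import json

    entry = {
        "2013-11-22T00:14:37Z": [
            {"f": "warcprox-20131122001437-00000.warc.gz", "o": 1234,
             "i": "<urn:uuid:e691dc0f-4bb9-4ad8-9afb-2af836aa05e4>"},
            {"f": "warcprox-20131122001437-00000.warc.gz", "o": 56789,
             "i": "<urn:uuid:00000000-0000-0000-0000-000000000000>"},
        ]
    }
    db_value = json.dumps(entry, separators=(',', ':')).encode('utf-8')  # bytes, as stored in gdbm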
@@ -951,33 +1012,38 @@
         if url not in self.db:
             return None, None
-        json_value = self.db[url]
-        self.logger.debug("'{}':{}".format(url, json_value))
+        json_value = self.db[url].decode('utf-8')
+        self.logger.debug("{}:{}".format(repr(url), repr(json_value)))
         py_value = json.loads(json_value)
         latest_date = max(py_value)
-        return latest_date, py_value[latest_date][0]
+        result = py_value[latest_date][0]
+        result['i'] = result['i'].encode('ascii')
+        return latest_date, result
+    # in python3 params are bytes
     def lookup_exact(self, url, warc_date, record_id):
         if url not in self.db:
             return None
-        json_value = self.db[url]
-        self.logger.debug("'{}':{}".format(url, json_value))
+        json_value = self.db[url].decode('utf-8')
+        self.logger.debug("{}:{}".format(repr(url), repr(json_value)))
         py_value = json.loads(json_value)
-        if warc_date in py_value:
-            for record in py_value[warc_date]:
-                if record['i'] == record_id:
+        warc_date_str = warc_date.decode('ascii')
+        if warc_date_str in py_value:
+            for record in py_value[warc_date_str]:
+                if record['i'].encode('ascii') == record_id:
                     self.logger.debug("found exact match for ({},{},{})".format(repr(warc_date), repr(record_id), repr(url)))
+                    record['i'] = record['i'].encode('ascii')
                     return record
         else:
             self.logger.info("match not found for ({},{},{})".format(repr(warc_date), repr(record_id), repr(url)))
             return None
 class WarcproxController(object):
     logger = logging.getLogger('warcprox.WarcproxController')
@@ -1126,7 +1192,7 @@ def main(argv=sys.argv):
     else:
         dedup_db = DedupDb(args.dedup_db_file)
-    recorded_url_q = Queue.Queue()
+    recorded_url_q = queue.Queue()
     ca = CertificateAuthority(args.cacert, args.certs_dir)