tests pass with python2.7 and 3.2! (tox fails though oddly)
parent 8ae164f8ca
commit dc9fdc3412
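
Most of the diff below is mechanical Python 2/3 compatibility work: standard-library imports are wrapped in try/except so the same names work on either interpreter, and anything that touches sockets, gdbm, or WARC headers is handled as explicit bytes. A minimal sketch of the import-shim idiom the commit applies throughout (the EchoHandler class and the queue at the end are illustrative only, not part of the diff):

    # Try the Python 3 module first, fall back to the Python 2 name,
    # and expose a single py3-style alias for the rest of the module.
    try:
        import http.server
        http_server = http.server
    except ImportError:
        import BaseHTTPServer
        http_server = BaseHTTPServer

    try:
        import queue
    except ImportError:
        import Queue
        queue = Queue

    # downstream code only ever refers to the aliases
    class EchoHandler(http_server.BaseHTTPRequestHandler):
        pass

    recorded_url_q = queue.Queue()
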
.gitignore (vendored): 1 change

@@ -4,6 +4,7 @@
 *.pem
 *.db
 *.diff
+*.egg
 *.egg-info
 *.swp
 warcs
setup.py: 4 changes

@@ -12,8 +12,8 @@ setuptools.setup(name='warcprox',
         long_description=open('README.rst').read(),
         license='GPL',
         packages=['warcprox'],
-        install_requires=['pyopenssl', 'warctools>=4.8.2'], # gdbm/dbhash?
-        dependency_links=['git+https://github.com/nlevitt/warctools.git@tweaks#egg=warctools-4.8.2'],
+        install_requires=['pyopenssl', 'warctools>=4.8.3'], # gdbm/dbhash?
+        dependency_links=['git+https://github.com/nlevitt/warctools.git@python3#egg=warctools-4.8.3'],
         tests_require=['requests>=2.0.1'], # >=2.0.1 for https://github.com/kennethreitz/requests/pull/1636
         scripts=['bin/dump-anydbm', 'bin/warcprox'],
         zip_safe=False,
tox.ini (new file): 11 lines

@@ -0,0 +1,11 @@
+# Tox (http://tox.testrun.org/) is a tool for running tests
+# in multiple virtualenvs. This configuration file will run the
+# test suite on all supported python versions. To use it, "pip install tox"
+# and then run "tox" from this directory.
+
+[tox]
+envlist = py27, py32, py33
+
+[testenv]
+commands = {envpython} setup.py test
+
@@ -3,7 +3,6 @@

 from warcprox import warcprox
 import unittest
-import BaseHTTPServer
 import threading
 import time
 import logging
@@ -14,10 +13,23 @@ import tempfile
 import OpenSSL
 import os
 import shutil
-import Queue
 import requests

-class TestHttpRequestHandler(BaseHTTPServer.BaseHTTPRequestHandler):
+try:
+    import http.server
+    http_server = http.server
+except ImportError:
+    import BaseHTTPServer
+    http_server = BaseHTTPServer
+
+try:
+    import queue
+except ImportError:
+    import Queue
+    queue = Queue
+
+
+class TestHttpRequestHandler(http_server.BaseHTTPRequestHandler):
     logger = logging.getLogger('TestHttpRequestHandler')

     def do_GET(self):
@@ -25,19 +37,19 @@ class TestHttpRequestHandler(BaseHTTPServer.BaseHTTPRequestHandler):

         m = re.match(r'^/([^/]+)/([^/]+)$', self.path)
         if m is not None:
-            special_header = 'warcprox-test-header: {}!'.format(m.group(1))
-            payload = 'I am the warcprox test payload! {}!\n'.format(10*m.group(2))
-            headers = ('HTTP/1.1 200 OK\r\n'
-                    + 'Content-Type: text/plain\r\n'
-                    + '{}\r\n'
-                    + 'Content-Length: {}\r\n'
-                    + '\r\n').format(special_header, len(payload))
+            special_header = 'warcprox-test-header: {}!'.format(m.group(1)).encode('utf-8')
+            payload = 'I am the warcprox test payload! {}!\n'.format(10*m.group(2)).encode('utf-8')
+            headers = (b'HTTP/1.1 200 OK\r\n'
+                    + b'Content-Type: text/plain\r\n'
+                    + special_header + b'\r\n'
+                    + b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n'
+                    + b'\r\n')
         else:
-            payload = '404 Not Found\n'
-            headers = ('HTTP/1.1 404 Not Found\r\n'
-                    + 'Content-Type: text/plain\r\n'
-                    + 'Content-Length: {}\r\n'
-                    + '\r\n').format(len(payload))
+            payload = b'404 Not Found\n'
+            headers = (b'HTTP/1.1 404 Not Found\r\n'
+                    + b'Content-Type: text/plain\r\n'
+                    + b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n'
+                    + b'\r\n')

         self.connection.sendall(headers)
         self.connection.sendall(payload)
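
The test handler above now builds its response entirely as bytes, because Python 3's socket.sendall() rejects str and bytes objects have no .format(); the Content-Length value is therefore rendered with str() and encoded separately. A standalone sketch of the same assembly (make_response is a hypothetical helper, not something the diff adds):

    def make_response(status_line, content_type, payload):
        # status_line, content_type and payload are bytes; the numeric
        # length is formatted as text and then encoded, since b''.format()
        # does not exist on Python 3
        headers = (status_line + b'\r\n'
                + b'Content-Type: ' + content_type + b'\r\n'
                + b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n'
                + b'\r\n')
        return headers, payload

    headers, payload = make_response(b'HTTP/1.1 200 OK', b'text/plain',
            b'I am the warcprox test payload! bbbbbbbbbb!\n')
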
@@ -82,7 +94,7 @@ class WarcproxTest(unittest.TestCase):


     def _start_http_servers(self):
-        self.http_daemon = BaseHTTPServer.HTTPServer(('localhost', 0),
+        self.http_daemon = http_server.HTTPServer(('localhost', 0),
                 RequestHandlerClass=TestHttpRequestHandler)
         self.logger.info('starting http://{}:{}'.format(self.http_daemon.server_address[0], self.http_daemon.server_address[1]))
         self.http_daemon_thread = threading.Thread(name='HttpdThread',
@@ -90,7 +102,7 @@ class WarcproxTest(unittest.TestCase):
         self.http_daemon_thread.start()

         # http://www.piware.de/2011/01/creating-an-https-server-in-python/
-        self.https_daemon = BaseHTTPServer.HTTPServer(('localhost', 0),
+        self.https_daemon = http_server.HTTPServer(('localhost', 0),
                 RequestHandlerClass=TestHttpRequestHandler)
         # self.https_daemon.socket = ssl.wrap_socket(httpd.socket, certfile='path/to/localhost.pem', server_side=True)
         self.https_daemon.socket = ssl.wrap_socket(self.https_daemon.socket, certfile=self._cert, server_side=True)
@@ -107,7 +119,7 @@ class WarcproxTest(unittest.TestCase):
         self._ca_dir = tempfile.mkdtemp(prefix='warcprox-test-', suffix='-ca')
         ca = warcprox.CertificateAuthority(self._ca_file, self._ca_dir)

-        recorded_url_q = Queue.Queue()
+        recorded_url_q = queue.Queue()

         proxy = warcprox.WarcProxy(server_address=('localhost', 0), ca=ca,
                 recorded_url_q=recorded_url_q)
@@ -183,24 +195,24 @@ class WarcproxTest(unittest.TestCase):
         url = 'http://localhost:{}/'.format(self.http_daemon.server_port)
         response = requests.get(url)
         self.assertEqual(response.status_code, 404)
-        self.assertEqual(response.content, '404 Not Found\n')
+        self.assertEqual(response.content, b'404 Not Found\n')

         url = 'https://localhost:{}/'.format(self.https_daemon.server_port)
         response = requests.get(url, verify=False)
         self.assertEqual(response.status_code, 404)
-        self.assertEqual(response.content, '404 Not Found\n')
+        self.assertEqual(response.content, b'404 Not Found\n')

         url = 'http://localhost:{}/a/b'.format(self.http_daemon.server_port)
         response = requests.get(url)
         self.assertEqual(response.status_code, 200)
         self.assertEqual(response.headers['warcprox-test-header'], 'a!')
-        self.assertEqual(response.content, 'I am the warcprox test payload! bbbbbbbbbb!\n')
+        self.assertEqual(response.content, b'I am the warcprox test payload! bbbbbbbbbb!\n')

         url = 'https://localhost:{}/c/d'.format(self.https_daemon.server_port)
         response = requests.get(url, verify=False)
         self.assertEqual(response.status_code, 200)
         self.assertEqual(response.headers['warcprox-test-header'], 'c!')
-        self.assertEqual(response.content, 'I am the warcprox test payload! dddddddddd!\n')
+        self.assertEqual(response.content, b'I am the warcprox test payload! dddddddddd!\n')


     def poll_playback_until(self, url, status, timeout_sec):
@@ -221,18 +233,18 @@ class WarcproxTest(unittest.TestCase):
         # ensure playback fails before archiving
         response = requests.get(url, proxies=self.playback_proxies)
         self.assertEqual(response.status_code, 404)
-        self.assertEqual(response.content, '404 Not in Archive\n')
+        self.assertEqual(response.content, b'404 Not in Archive\n')

         # archive
         response = requests.get(url, proxies=self.archiving_proxies)
         self.assertEqual(response.status_code, 200)
         self.assertEqual(response.headers['warcprox-test-header'], 'a!')
-        self.assertEqual(response.content, 'I am the warcprox test payload! bbbbbbbbbb!\n')
+        self.assertEqual(response.content, b'I am the warcprox test payload! bbbbbbbbbb!\n')

         response = self.poll_playback_until(url, status=200, timeout_sec=10)
         self.assertEqual(response.status_code, 200)
         self.assertEqual(response.headers['warcprox-test-header'], 'a!')
-        self.assertEqual(response.content, 'I am the warcprox test payload! bbbbbbbbbb!\n')
+        self.assertEqual(response.content, b'I am the warcprox test payload! bbbbbbbbbb!\n')


     def _test_archive_and_playback_https_url(self):
@@ -241,19 +253,19 @@ class WarcproxTest(unittest.TestCase):
         # ensure playback fails before archiving
         response = requests.get(url, proxies=self.playback_proxies, verify=False)
         self.assertEqual(response.status_code, 404)
-        self.assertEqual(response.content, '404 Not in Archive\n')
+        self.assertEqual(response.content, b'404 Not in Archive\n')

         # fetch & archive response
         response = requests.get(url, proxies=self.archiving_proxies, verify=False)
         self.assertEqual(response.status_code, 200)
         self.assertEqual(response.headers['warcprox-test-header'], 'c!')
-        self.assertEqual(response.content, 'I am the warcprox test payload! dddddddddd!\n')
+        self.assertEqual(response.content, b'I am the warcprox test payload! dddddddddd!\n')

         # test playback
         response = self.poll_playback_until(url, status=200, timeout_sec=10)
         self.assertEqual(response.status_code, 200)
         self.assertEqual(response.headers['warcprox-test-header'], 'c!')
-        self.assertEqual(response.content, 'I am the warcprox test payload! dddddddddd!\n')
+        self.assertEqual(response.content, b'I am the warcprox test payload! dddddddddd!\n')


     # test dedup of same http url with same payload
@@ -263,30 +275,30 @@ class WarcproxTest(unittest.TestCase):
         # ensure playback fails before archiving
         response = requests.get(url, proxies=self.playback_proxies, verify=False)
         self.assertEqual(response.status_code, 404)
-        self.assertEqual(response.content, '404 Not in Archive\n')
+        self.assertEqual(response.content, b'404 Not in Archive\n')

         # check not in dedup db
-        dedup_lookup = self.warcprox.warc_writer.dedup_db.lookup('sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc')
+        dedup_lookup = self.warcprox.warc_writer.dedup_db.lookup(b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc')
         self.assertIsNone(dedup_lookup)

         # archive
         response = requests.get(url, proxies=self.archiving_proxies, verify=False)
         self.assertEqual(response.status_code, 200)
         self.assertEqual(response.headers['warcprox-test-header'], 'e!')
-        self.assertEqual(response.content, 'I am the warcprox test payload! ffffffffff!\n')
+        self.assertEqual(response.content, b'I am the warcprox test payload! ffffffffff!\n')

         # test playback
         response = self.poll_playback_until(url, status=200, timeout_sec=10)
         self.assertEqual(response.status_code, 200)
         self.assertEqual(response.headers['warcprox-test-header'], 'e!')
-        self.assertEqual(response.content, 'I am the warcprox test payload! ffffffffff!\n')
+        self.assertEqual(response.content, b'I am the warcprox test payload! ffffffffff!\n')

         # check in dedup db
         # {u'i': u'<urn:uuid:e691dc0f-4bb9-4ad8-9afb-2af836aa05e4>', u'u': u'https://localhost:62841/c/d', u'd': u'2013-11-22T00:14:37Z'}
-        dedup_lookup = self.warcprox.warc_writer.dedup_db.lookup('sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc')
-        self.assertEquals(dedup_lookup['u'], url)
-        self.assertRegexpMatches(dedup_lookup['i'], r'^<urn:uuid:[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}>$')
-        self.assertRegexpMatches(dedup_lookup['d'], r'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$')
+        dedup_lookup = self.warcprox.warc_writer.dedup_db.lookup(b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc')
+        self.assertEqual(dedup_lookup['u'], url.encode('ascii'))
+        self.assertRegexpMatches(dedup_lookup['i'], br'^<urn:uuid:[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}>$')
+        self.assertRegexpMatches(dedup_lookup['d'], br'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$')
         record_id = dedup_lookup['i']
         dedup_date = dedup_lookup['d']

@@ -298,23 +310,23 @@ class WarcproxTest(unittest.TestCase):
         response = requests.get(url, proxies=self.archiving_proxies, verify=False)
         self.assertEqual(response.status_code, 200)
         self.assertEqual(response.headers['warcprox-test-header'], 'e!')
-        self.assertEqual(response.content, 'I am the warcprox test payload! ffffffffff!\n')
+        self.assertEqual(response.content, b'I am the warcprox test payload! ffffffffff!\n')

         # XXX need to give warc writer thread a chance, and we don't have any change to poll for :-\
         time.sleep(2.0)

         # check in dedup db (no change from prev)
-        dedup_lookup = self.warcprox.warc_writer.dedup_db.lookup('sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc')
-        self.assertEquals(dedup_lookup['u'], url)
-        self.assertEquals(dedup_lookup['i'], record_id)
-        self.assertEquals(dedup_lookup['d'], dedup_date)
+        dedup_lookup = self.warcprox.warc_writer.dedup_db.lookup(b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc')
+        self.assertEqual(dedup_lookup['u'], url.encode('ascii'))
+        self.assertEqual(dedup_lookup['i'], record_id)
+        self.assertEqual(dedup_lookup['d'], dedup_date)

         # test playback
         self.logger.debug('testing playback of revisit of {}'.format(url))
         response = self.poll_playback_until(url, status=200, timeout_sec=10)
         self.assertEqual(response.status_code, 200)
         self.assertEqual(response.headers['warcprox-test-header'], 'e!')
-        self.assertEqual(response.content, 'I am the warcprox test payload! ffffffffff!\n')
+        self.assertEqual(response.content, b'I am the warcprox test payload! ffffffffff!\n')
         # XXX how to check dedup was used?


@@ -325,30 +337,30 @@ class WarcproxTest(unittest.TestCase):
         # ensure playback fails before archiving
         response = requests.get(url, proxies=self.playback_proxies, verify=False)
         self.assertEqual(response.status_code, 404)
-        self.assertEqual(response.content, '404 Not in Archive\n')
+        self.assertEqual(response.content, b'404 Not in Archive\n')

         # check not in dedup db
-        dedup_lookup = self.warcprox.warc_writer.dedup_db.lookup('sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89')
+        dedup_lookup = self.warcprox.warc_writer.dedup_db.lookup(b'sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89')
         self.assertIsNone(dedup_lookup)

         # archive
         response = requests.get(url, proxies=self.archiving_proxies, verify=False)
         self.assertEqual(response.status_code, 200)
         self.assertEqual(response.headers['warcprox-test-header'], 'g!')
-        self.assertEqual(response.content, 'I am the warcprox test payload! hhhhhhhhhh!\n')
+        self.assertEqual(response.content, b'I am the warcprox test payload! hhhhhhhhhh!\n')

         # test playback
         response = self.poll_playback_until(url, status=200, timeout_sec=10)
         self.assertEqual(response.status_code, 200)
         self.assertEqual(response.headers['warcprox-test-header'], 'g!')
-        self.assertEqual(response.content, 'I am the warcprox test payload! hhhhhhhhhh!\n')
+        self.assertEqual(response.content, b'I am the warcprox test payload! hhhhhhhhhh!\n')

         # check in dedup db
         # {u'i': u'<urn:uuid:e691dc0f-4bb9-4ad8-9afb-2af836aa05e4>', u'u': u'https://localhost:62841/c/d', u'd': u'2013-11-22T00:14:37Z'}
-        dedup_lookup = self.warcprox.warc_writer.dedup_db.lookup('sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89')
-        self.assertEquals(dedup_lookup['u'], url)
-        self.assertRegexpMatches(dedup_lookup['i'], r'^<urn:uuid:[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}>$')
-        self.assertRegexpMatches(dedup_lookup['d'], r'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$')
+        dedup_lookup = self.warcprox.warc_writer.dedup_db.lookup(b'sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89')
+        self.assertEqual(dedup_lookup['u'], url.encode('ascii'))
+        self.assertRegexpMatches(dedup_lookup['i'], br'^<urn:uuid:[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}>$')
+        self.assertRegexpMatches(dedup_lookup['d'], br'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$')
         record_id = dedup_lookup['i']
         dedup_date = dedup_lookup['d']

@@ -360,23 +372,23 @@ class WarcproxTest(unittest.TestCase):
         response = requests.get(url, proxies=self.archiving_proxies, verify=False)
         self.assertEqual(response.status_code, 200)
         self.assertEqual(response.headers['warcprox-test-header'], 'g!')
-        self.assertEqual(response.content, 'I am the warcprox test payload! hhhhhhhhhh!\n')
+        self.assertEqual(response.content, b'I am the warcprox test payload! hhhhhhhhhh!\n')

         # XXX need to give warc writer thread a chance, and we don't have any change to poll for :-\
         time.sleep(2.0)

         # check in dedup db (no change from prev)
-        dedup_lookup = self.warcprox.warc_writer.dedup_db.lookup('sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89')
-        self.assertEquals(dedup_lookup['u'], url)
-        self.assertEquals(dedup_lookup['i'], record_id)
-        self.assertEquals(dedup_lookup['d'], dedup_date)
+        dedup_lookup = self.warcprox.warc_writer.dedup_db.lookup(b'sha1:5b4efa64fdb308ec06ae56a9beba155a6f734b89')
+        self.assertEqual(dedup_lookup['u'], url.encode('ascii'))
+        self.assertEqual(dedup_lookup['i'], record_id)
+        self.assertEqual(dedup_lookup['d'], dedup_date)

         # test playback
         self.logger.debug('testing playback of revisit of {}'.format(url))
         response = self.poll_playback_until(url, status=200, timeout_sec=10)
         self.assertEqual(response.status_code, 200)
         self.assertEqual(response.headers['warcprox-test-header'], 'g!')
-        self.assertEqual(response.content, 'I am the warcprox test payload! hhhhhhhhhh!\n')
+        self.assertEqual(response.content, b'I am the warcprox test payload! hhhhhhhhhh!\n')
         # XXX how to check dedup was used?


@@ -4,12 +4,55 @@
 """
 WARC writing MITM HTTP/S proxy

-See README.md or https://github.com/internetarchive/warcprox
+See README.rst or https://github.com/internetarchive/warcprox
 """

-import BaseHTTPServer, SocketServer
+try:
+    import http.server
+    http_server = http.server
+except ImportError:
+    import BaseHTTPServer
+    http_server = BaseHTTPServer
+
+try:
+    import socketserver
+except ImportError:
+    import SocketServer
+    socketserver = SocketServer
+
+try:
+    import urllib.parse
+    urllib_parse = urllib.parse
+except ImportError:
+    import urlparse
+    urllib_parse = urlparse
+
+try:
+    import queue
+except ImportError:
+    import Queue
+    queue = Queue
+
+try:
+    import http.client
+    http_client = http.client
+except ImportError:
+    import httplib
+    http_client = httplib
+
+try:
+    import dbm.gnu
+    dbm_gnu = dbm.gnu
+except ImportError:
+    import gdbm
+    dbm_gnu = gdbm
+
+try:
+    from io import StringIO
+except ImportError:
+    from StringIO import StringIO
+
 import socket
-import urlparse
 import OpenSSL
 import ssl
 import logging
@@ -17,12 +60,10 @@ import sys
 from hanzo import warctools, httptools
 import hashlib
 from datetime import datetime
-import Queue
 import threading
 import os
 import argparse
 import random
-import httplib
 import re
 import signal
 import time
@@ -30,8 +71,6 @@ import tempfile
 import base64
 import json
 import traceback
-import gdbm
-from StringIO import StringIO

 class CertificateAuthority(object):
     logger = logging.getLogger('warcprox.CertificateAuthority')
@@ -66,9 +105,9 @@ class CertificateAuthority(object):
         self.cert.set_issuer(self.cert.get_subject())
         self.cert.set_pubkey(self.key)
         self.cert.add_extensions([
-            OpenSSL.crypto.X509Extension("basicConstraints", True, "CA:TRUE, pathlen:0"),
-            OpenSSL.crypto.X509Extension("keyUsage", True, "keyCertSign, cRLSign"),
-            OpenSSL.crypto.X509Extension("subjectKeyIdentifier", False, "hash", subject=self.cert),
+            OpenSSL.crypto.X509Extension(b"basicConstraints", True, b"CA:TRUE, pathlen:0"),
+            OpenSSL.crypto.X509Extension(b"keyUsage", True, b"keyCertSign, cRLSign"),
+            OpenSSL.crypto.X509Extension(b"subjectKeyIdentifier", False, b"hash", subject=self.cert),
             ])
         self.cert.sign(self.key, "sha1")

@@ -134,29 +173,29 @@ class ProxyingRecorder(object):
         self.payload_digest = None
         self.proxy_dest = proxy_dest
         self._proxy_dest_conn_open = True
-        self._prev_hunk_last_two_bytes = ''
+        self._prev_hunk_last_two_bytes = b''
         self.len = 0

     def _update(self, hunk):
         if self.payload_digest is None:
             # convoluted handling of two newlines crossing hunks
             # XXX write tests for this
-            if self._prev_hunk_last_two_bytes.endswith('\n'):
-                if hunk.startswith('\n'):
+            if self._prev_hunk_last_two_bytes.endswith(b'\n'):
+                if hunk.startswith(b'\n'):
                     self.payload_digest = hashlib.new(self.digest_algorithm)
                     self.payload_digest.update(hunk[1:])
                     self.payload_offset = self.len + 1
-                elif hunk.startswith('\r\n'):
+                elif hunk.startswith(b'\r\n'):
                     self.payload_digest = hashlib.new(self.digest_algorithm)
                     self.payload_digest.update(hunk[2:])
                     self.payload_offset = self.len + 2
-            elif self._prev_hunk_last_two_bytes == '\n\r':
-                if hunk.startswith('\n'):
+            elif self._prev_hunk_last_two_bytes == b'\n\r':
+                if hunk.startswith(b'\n'):
                     self.payload_digest = hashlib.new(self.digest_algorithm)
                     self.payload_digest.update(hunk[1:])
                     self.payload_offset = self.len + 1
             else:
-                m = re.search(r'\n\r?\n', hunk)
+                m = re.search(br'\n\r?\n', hunk)
                 if m is not None:
                     self.payload_digest = hashlib.new(self.digest_algorithm)
                     self.payload_digest.update(hunk[m.end():])
@@ -184,14 +223,14 @@ class ProxyingRecorder(object):


     def read(self, size=-1):
-        hunk = self.fp.read(size=size)
+        hunk = self.fp.read(size)
         self._update(hunk)
         return hunk

     def readline(self, size=-1):
         # XXX depends on implementation details of self.fp.readline(), in
         # particular that it doesn't call self.fp.read()
-        hunk = self.fp.readline(size=size)
+        hunk = self.fp.readline(size)
         self._update(hunk)
         return hunk

@@ -208,10 +247,10 @@ class ProxyingRecorder(object):
         return 0


-class ProxyingRecordingHTTPResponse(httplib.HTTPResponse):
+class ProxyingRecordingHTTPResponse(http_client.HTTPResponse):

-    def __init__(self, sock, debuglevel=0, strict=0, method=None, buffering=False, proxy_dest=None, digest_algorithm='sha1'):
-        httplib.HTTPResponse.__init__(self, sock, debuglevel=debuglevel, strict=strict, method=method, buffering=buffering)
+    def __init__(self, sock, debuglevel=0, method=None, proxy_dest=None, digest_algorithm='sha1'):
+        http_client.HTTPResponse.__init__(self, sock, debuglevel=debuglevel, method=method)

         # Keep around extra reference to self.fp because HTTPResponse sets
         # self.fp=None after it finishes reading, but we still need it
@@ -219,12 +258,22 @@ class ProxyingRecordingHTTPResponse(httplib.HTTPResponse):
         self.fp = self.recorder


-class MitmProxyHandler(BaseHTTPServer.BaseHTTPRequestHandler):
+class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
     logger = logging.getLogger('warcprox.MitmProxyHandler')

     def __init__(self, request, client_address, server):
         self.is_connect = False
-        BaseHTTPServer.BaseHTTPRequestHandler.__init__(self, request, client_address, server)
+        ## XXX hack around bizarre bug on my mac python 3.2 in http.server
+        ## where hasattr returns true in the code snippet below, but
+        ## self._headers_buffer is None
+        #
+        # if not hasattr(self, '_headers_buffer'):
+        #     self._headers_buffer = []
+        # self._headers_buffer.append(
+        self._headers_buffer = []
+
+        http_server.BaseHTTPRequestHandler.__init__(self, request, client_address, server)

     def _determine_host_port(self):
         # Get hostname and port to connect to
@@ -232,13 +281,13 @@ class MitmProxyHandler(BaseHTTPServer.BaseHTTPRequestHandler):
             self.hostname, self.port = self.path.split(':')
         else:
             self.url = self.path
-            u = urlparse.urlparse(self.url)
+            u = urllib_parse.urlparse(self.url)
             if u.scheme != 'http':
                 raise Exception('Unknown scheme %s' % repr(u.scheme))
             self.hostname = u.hostname
             self.port = u.port or 80
-            self.path = urlparse.urlunparse(
-                urlparse.ParseResult(
+            self.path = urllib_parse.urlunparse(
+                urllib_parse.ParseResult(
                     scheme='',
                     netloc='',
                     params=u.params,
@@ -291,8 +340,8 @@ class MitmProxyHandler(BaseHTTPServer.BaseHTTPRequestHandler):
         else:
             netloc = '{}:{}'.format(self.hostname, self.port)

-        result = urlparse.urlunparse(
-            urlparse.ParseResult(
+        result = urllib_parse.urlunparse(
+            urllib_parse.ParseResult(
                 scheme='https',
                 netloc=netloc,
                 params='',
@@ -344,10 +393,10 @@ class WarcProxyHandler(MitmProxyHandler):

     def _proxy_request(self):
         # Build request
-        req = '%s %s %s\r\n' % (self.command, self.path, self.request_version)
+        req = '{} {} {}\r\n'.format(self.command, self.path, self.request_version).encode('utf-8')

         # Add headers to the request
-        req += '%s\r\n' % self.headers
+        req += str(self.headers).encode('utf-8') + b'\r\n'

         # Append message body if present to the request
         if 'Content-Length' in self.headers:
@@ -371,7 +420,7 @@ class WarcProxyHandler(MitmProxyHandler):
         h.begin()

         buf = h.read(8192)
-        while buf != '':
+        while buf != b'':
             buf = h.read(8192)

         self.log_request(h.status, h.recorder.len)
@@ -389,19 +438,29 @@ class WarcProxyHandler(MitmProxyHandler):

 class RecordedUrl(object):
     def __init__(self, url, request_data, response_recorder, remote_ip):
-        self.url = url
+        # XXX should test what happens with non-ascii url (when does
+        # url-encoding happen?)
+        if type(url) is not bytes:
+            self.url = url.encode('ascii')
+        else:
+            self.url = url
+
+        if type(remote_ip) is not bytes:
+            self.remote_ip = remote_ip.encode('ascii')
+        else:
+            self.remote_ip = remote_ip
+
         self.request_data = request_data
         self.response_recorder = response_recorder
-        self.remote_ip = remote_ip


-class WarcProxy(SocketServer.ThreadingMixIn, BaseHTTPServer.HTTPServer):
+class WarcProxy(socketserver.ThreadingMixIn, http_server.HTTPServer):
     logger = logging.getLogger('warcprox.WarcProxy')

     def __init__(self, server_address=('localhost', 8000),
             req_handler_class=WarcProxyHandler, bind_and_activate=True,
             ca=None, recorded_url_q=None, digest_algorithm='sha1'):
-        BaseHTTPServer.HTTPServer.__init__(self, server_address, req_handler_class, bind_and_activate)
+        http_server.HTTPServer.__init__(self, server_address, req_handler_class, bind_and_activate)

         self.digest_algorithm = digest_algorithm

@@ -413,15 +472,15 @@ class WarcProxy(SocketServer.ThreadingMixIn, BaseHTTPServer.HTTPServer):
         if recorded_url_q is not None:
             self.recorded_url_q = recorded_url_q
         else:
-            self.recorded_url_q = Queue.Queue()
+            self.recorded_url_q = queue.Queue()

     def server_activate(self):
-        BaseHTTPServer.HTTPServer.server_activate(self)
+        http_server.HTTPServer.server_activate(self)
         self.logger.info('WarcProxy listening on {0}:{1}'.format(self.server_address[0], self.server_address[1]))

     def server_close(self):
         self.logger.info('WarcProxy shutting down')
-        BaseHTTPServer.HTTPServer.server_close(self)
+        http_server.HTTPServer.server_close(self)


 class PlaybackProxyHandler(MitmProxyHandler):
@@ -435,31 +494,31 @@ class PlaybackProxyHandler(MitmProxyHandler):

     # @Override
     def _proxy_request(self):
-        date, location = self.server.playback_index_db.lookup_latest(self.url)
+        date, location = self.server.playback_index_db.lookup_latest(self.url.encode('utf-8'))
         self.logger.debug('lookup_latest returned {}:{}'.format(date, location))

         status = None
         if location is not None:
             try:
-                status, sz = self._send_response_from_warc(location[b'f'], location[b'o'])
+                status, sz = self._send_response_from_warc(location['f'], location['o'])
             except:
                 status = 500
                 self.logger.error('PlaybackProxyHandler problem playing back {}'.format(self.url), exc_info=1)
-                payload = '500 Warcprox Error\n\n{}\n'.format(traceback.format_exc())
-                headers = ('HTTP/1.1 500 Internal Server Error\r\n'
-                        + 'Content-Type: text/plain\r\n'
-                        + 'Content-Length: {}\r\n'
-                        + '\r\n').format(len(payload))
+                payload = b'500 Warcprox Error\n\n{}\n'.format(traceback.format_exc()).encode('utf-8')
+                headers = (b'HTTP/1.1 500 Internal Server Error\r\n'
+                        + b'Content-Type: text/plain;charset=utf-8\r\n'
+                        + b'Content-Length: ' + len(payload) + b'\r\n'
+                        + b'\r\n')
                 self.connection.sendall(headers)
                 self.connection.sendall(payload)
                 sz = len(headers) + len(payload)
         else:
             status = 404
-            payload = '404 Not in Archive\n'
-            headers = ('HTTP/1.1 404 Not Found\r\n'
-                    + 'Content-Type: text/plain\r\n'
-                    + 'Content-Length: {}\r\n'
-                    + '\r\n').format(len(payload))
+            payload = b'404 Not in Archive\n'
+            headers = (b'HTTP/1.1 404 Not Found\r\n'
+                    + b'Content-Type: text/plain;charset=utf-8\r\n'
+                    + b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n'
+                    + b'\r\n')
             self.connection.sendall(headers)
             self.connection.sendall(payload)
             sz = len(headers) + len(payload)
@@ -494,7 +553,7 @@ class PlaybackProxyHandler(MitmProxyHandler):

         while True:
             buf = payload_fh.read(8192)
-            if buf == '': break
+            if buf == b'': break
             self.connection.sendall(buf)
             sz += len(buf)

@@ -520,7 +579,7 @@ class PlaybackProxyHandler(MitmProxyHandler):
         # find end of headers
         while True:
             line = record.content_file.readline()
-            if line == '' or re.match('^\r?\n$', line):
+            if line == b'' or re.match(br'^\r?\n$', line):
                 break

         return self._send_response(headers, record.content_file)
@@ -545,7 +604,7 @@ class PlaybackProxyHandler(MitmProxyHandler):
         while True:
             line = record.content_file.readline()
             headers_buf.extend(line)
-            if line == '' or re.match('^\r?\n$', line):
+            if line == b'' or re.match(b'^\r?\n$', line):
                 break

         return self._send_response(headers_buf, record.content_file)
@@ -573,24 +632,24 @@ class PlaybackProxyHandler(MitmProxyHandler):
         raise Exception('should not reach this point')


-class PlaybackProxy(SocketServer.ThreadingMixIn, BaseHTTPServer.HTTPServer):
+class PlaybackProxy(socketserver.ThreadingMixIn, http_server.HTTPServer):
     logger = logging.getLogger('warcprox.PlaybackProxy')

     def __init__(self, server_address, req_handler_class=PlaybackProxyHandler,
             bind_and_activate=True, ca=None, playback_index_db=None,
             warcs_dir=None):
-        BaseHTTPServer.HTTPServer.__init__(self, server_address, req_handler_class, bind_and_activate)
+        http_server.HTTPServer.__init__(self, server_address, req_handler_class, bind_and_activate)
         self.ca = ca
         self.playback_index_db = playback_index_db
         self.warcs_dir = warcs_dir

     def server_activate(self):
-        BaseHTTPServer.HTTPServer.server_activate(self)
+        http_server.HTTPServer.server_activate(self)
         self.logger.info('PlaybackProxy listening on {0}:{1}'.format(self.server_address[0], self.server_address[1]))

     def server_close(self):
         self.logger.info('PlaybackProxy shutting down')
-        BaseHTTPServer.HTTPServer.server_close(self)
+        http_server.HTTPServer.server_close(self)


 class DedupDb(object):
@@ -602,7 +661,7 @@ class DedupDb(object):
         else:
             self.logger.info('creating new deduplication database {}'.format(dbm_file))

-        self.db = gdbm.open(dbm_file, 'c')
+        self.db = dbm_gnu.open(dbm_file, 'c')

     def close(self):
         self.db.close()
@@ -611,21 +670,24 @@ class DedupDb(object):
         self.db.sync()

     def save(self, key, response_record, offset):
-        record_id = response_record.get_header(warctools.WarcRecord.ID)
-        url = response_record.get_header(warctools.WarcRecord.URL)
-        date = response_record.get_header(warctools.WarcRecord.DATE)
+        record_id = response_record.get_header(warctools.WarcRecord.ID).decode('latin1')
+        url = response_record.get_header(warctools.WarcRecord.URL).decode('latin1')
+        date = response_record.get_header(warctools.WarcRecord.DATE).decode('latin1')

         py_value = {'i':record_id, 'u':url, 'd':date}
         json_value = json.dumps(py_value, separators=(',',':'))

-        self.db[key] = json_value
+        self.db[key] = json_value.encode('utf-8')
         self.logger.debug('dedup db saved {}:{}'.format(key, json_value))


     def lookup(self, key):
         if key in self.db:
             json_result = self.db[key]
-            result = json.loads(json_result)
+            result = json.loads(json_result.decode('utf-8'))
+            result['i'] = result['i'].encode('latin1')
+            result['u'] = result['u'].encode('latin1')
+            result['d'] = result['d'].encode('latin1')
             return result
         else:
             return None
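
With the changes above, DedupDb keeps its external contract in bytes on both interpreters: keys are bytes digests, the stored value is UTF-8 encoded JSON, and lookup() re-encodes the decoded fields before returning them. A rough sketch of that round trip, using a plain dict in place of the dbm_gnu database (illustration only; the example values come from the comment in the test suite above):

    import json

    db = {}  # stands in for dbm_gnu.open(dbm_file, 'c')
    key = b'sha1:65e1216acfd220f0292715e74bd7a1ec35c99dfc'

    # save(): WARC header values are decoded to str for json.dumps,
    # then the JSON text is stored as utf-8 bytes
    py_value = {'i': '<urn:uuid:e691dc0f-4bb9-4ad8-9afb-2af836aa05e4>',
                'u': 'https://localhost:62841/c/d',
                'd': '2013-11-22T00:14:37Z'}
    db[key] = json.dumps(py_value, separators=(',', ':')).encode('utf-8')

    # lookup(): decode, parse, then re-encode each field so callers see
    # bytes values on python 3, just as they did on python 2
    result = json.loads(db[key].decode('utf-8'))
    for field in ('i', 'u', 'd'):
        result[field] = result[field].encode('latin1')
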
@@ -717,8 +779,7 @@ class WarcWriterThread(threading.Thread):


     def digest_str(self, hash_obj):
-        return '{}:{}'.format(hash_obj.name,
-                base64.b32encode(hash_obj.digest()) if self.base32 else hash_obj.hexdigest())
+        return hash_obj.name.encode('utf-8') + b':' + (base64.b32encode(hash_obj.digest()) if self.base32 else hash_obj.hexdigest().encode('ascii'))


     def build_warc_record(self, url, warc_date=None, recorder=None, data=None,
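
digest_str() now returns a bytes value of the form sha1:<hex digest>, or an algorithm-prefixed base32 digest when base32 mode is on, which matches the key format the dedup database lookups in the test suite use. A small standalone illustration of both forms (assumes the same sha1/base32 choices as the diff; not part of the commit):

    import base64
    import hashlib

    h = hashlib.new('sha1', b'I am the warcprox test payload! ffffffffff!\n')

    hex_key = h.name.encode('utf-8') + b':' + h.hexdigest().encode('ascii')
    b32_key = h.name.encode('utf-8') + b':' + base64.b32encode(h.digest())

    # hex_key is a bytes key of the same shape as the
    # b'sha1:65e1216a...' keys passed to DedupDb.lookup() in the tests
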
@@ -753,7 +814,7 @@ class WarcWriterThread(threading.Thread):
             headers.append((warctools.WarcRecord.CONTENT_TYPE, content_type))

         if recorder is not None:
-            headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(recorder))))
+            headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(recorder)).encode('latin1')))
             headers.append((warctools.WarcRecord.BLOCK_DIGEST,
                 self.digest_str(recorder.block_digest)))
             if recorder.payload_digest is not None:
@@ -764,7 +825,7 @@ class WarcWriterThread(threading.Thread):
             record = warctools.WarcRecord(headers=headers, content_file=recorder.tempfile)

         else:
-            headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(data))))
+            headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(data)).encode('latin1')))
             block_digest = hashlib.new(self.digest_algorithm, data)
             headers.append((warctools.WarcRecord.BLOCK_DIGEST,
                 self.digest_str(block_digest)))
@@ -796,21 +857,21 @@ class WarcWriterThread(threading.Thread):
         headers = []
         headers.append((warctools.WarcRecord.ID, record_id))
         headers.append((warctools.WarcRecord.TYPE, warctools.WarcRecord.WARCINFO))
-        headers.append((warctools.WarcRecord.FILENAME, filename))
+        headers.append((warctools.WarcRecord.FILENAME, filename.encode('latin1')))
         headers.append((warctools.WarcRecord.DATE, warc_record_date))

         warcinfo_fields = []
-        warcinfo_fields.append('software: warcprox.py https://github.com/internetarchive/warcprox')
+        warcinfo_fields.append(b'software: warcprox.py https://github.com/internetarchive/warcprox')
         hostname = socket.gethostname()
-        warcinfo_fields.append('hostname: {0}'.format(hostname))
-        warcinfo_fields.append('ip: {0}'.format(socket.gethostbyname(hostname)))
-        warcinfo_fields.append('format: WARC File Format 1.0')
+        warcinfo_fields.append('hostname: {}'.format(hostname).encode('latin1'))
+        warcinfo_fields.append('ip: {0}'.format(socket.gethostbyname(hostname)).encode('latin1'))
+        warcinfo_fields.append(b'format: WARC File Format 1.0')
         # warcinfo_fields.append('robots: ignore')
         # warcinfo_fields.append('description: {0}'.format(self.description))
         # warcinfo_fields.append('isPartOf: {0}'.format(self.is_part_of))
-        data = '\r\n'.join(warcinfo_fields) + '\r\n'
+        data = b'\r\n'.join(warcinfo_fields) + b'\r\n'

-        record = warctools.WarcRecord(headers=headers, content=('application/warc-fields', data))
+        record = warctools.WarcRecord(headers=headers, content=(b'application/warc-fields', data))

         return record

@@ -879,7 +940,7 @@ class WarcWriterThread(threading.Thread):

                 self._final_tasks(recorded_url, recordset, recordset_offset)

-            except Queue.Empty:
+            except queue.Empty:
                 if (self._fpath is not None
                         and self.rollover_idle_time is not None
                         and self.rollover_idle_time > 0
@@ -907,7 +968,7 @@ class PlaybackIndexDb(object):
         else:
             self.logger.info('creating new playback index database {}'.format(dbm_file))

-        self.db = gdbm.open(dbm_file, 'c')
+        self.db = dbm_gnu.open(dbm_file, 'c')


     def close(self):
@@ -922,27 +983,27 @@ class PlaybackIndexDb(object):
         response_record = recordset[0]
         # XXX canonicalize url?
         url = response_record.get_header(warctools.WarcRecord.URL)
-        date = response_record.get_header(warctools.WarcRecord.DATE)
-        record_id = response_record.get_header(warctools.WarcRecord.ID)
+        date_str = response_record.get_header(warctools.WarcRecord.DATE).decode('latin1')
+        record_id_str = response_record.get_header(warctools.WarcRecord.ID).decode('latin1')

         # there could be two visits of same url in the same second, and WARC-Date is
         # prescribed as YYYY-MM-DDThh:mm:ssZ, so we have to handle it :-\

         # url:{date1:[record1={'f':warcfile,'o':response_offset,'q':request_offset,'i':record_id},record2,...],date2:[{...}],...}
         if url in self.db:
-            existing_json_value = self.db[url]
+            existing_json_value = self.db[url].decode('utf-8')
             py_value = json.loads(existing_json_value)
         else:
             py_value = {}

-        if date in py_value:
-            py_value[date].append({'f':warcfile, 'o':offset, 'i':record_id})
+        if date_str in py_value:
+            py_value[date_str].append({'f':warcfile, 'o':offset, 'i':record_id_str})
         else:
-            py_value[date] = [{'f':warcfile, 'o':offset, 'i':record_id}]
+            py_value[date_str] = [{'f':warcfile, 'o':offset, 'i':record_id_str}]

         json_value = json.dumps(py_value, separators=(',',':'))

-        self.db[url] = json_value
+        self.db[url] = json_value.encode('utf-8')

         self.logger.debug('playback index saved: {}:{}'.format(url, json_value))

@@ -951,33 +1012,38 @@ class PlaybackIndexDb(object):
         if url not in self.db:
             return None, None

-        json_value = self.db[url]
-        self.logger.debug("'{}':{}".format(url, json_value))
+        json_value = self.db[url].decode('utf-8')
+        self.logger.debug("{}:{}".format(repr(url), repr(json_value)))
         py_value = json.loads(json_value)

         latest_date = max(py_value)
-        return latest_date, py_value[latest_date][0]
+        result = py_value[latest_date][0]
+        result['i'] = result['i'].encode('ascii')
+        return latest_date, result


+    # in python3 params are bytes
     def lookup_exact(self, url, warc_date, record_id):
         if url not in self.db:
             return None

-        json_value = self.db[url]
-        self.logger.debug("'{}':{}".format(url, json_value))
+        json_value = self.db[url].decode('utf-8')
+        self.logger.debug("{}:{}".format(repr(url), repr(json_value)))
         py_value = json.loads(json_value)

-        if warc_date in py_value:
-            for record in py_value[warc_date]:
-                if record['i'] == record_id:
+        warc_date_str = warc_date.decode('ascii')
+
+        if warc_date_str in py_value:
+            for record in py_value[warc_date_str]:
+                if record['i'].encode('ascii') == record_id:
                     self.logger.debug("found exact match for ({},{},{})".format(repr(warc_date), repr(record_id), repr(url)))
+                    record['i'] = record['i'].encode('ascii')
                     return record
         else:
             self.logger.info("match not found for ({},{},{})".format(repr(warc_date), repr(record_id), repr(url)))
             return None



 class WarcproxController(object):
     logger = logging.getLogger('warcprox.WarcproxController')

@@ -1126,7 +1192,7 @@ def main(argv=sys.argv):
     else:
         dedup_db = DedupDb(args.dedup_db_file)

-    recorded_url_q = Queue.Queue()
+    recorded_url_q = queue.Queue()

    ca = CertificateAuthority(args.cacert, args.certs_dir)
