mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
progress towards warc writing
This commit is contained in:
parent
255ab4a350
commit
a950d199d5
14
setup.py
14
setup.py
@ -5,18 +5,18 @@ def read(fname):
|
|||||||
return open(path.join(path.dirname(__file__), fname)).read()
|
return open(path.join(path.dirname(__file__), fname)).read()
|
||||||
|
|
||||||
setup(
|
setup(
|
||||||
name='pymiproxy',
|
name='warcprox',
|
||||||
author='Nadeem Douba',
|
author='Noah Levitt',
|
||||||
version='1.0',
|
version='1.0',
|
||||||
author_email='ndouba@gmail.com',
|
author_email='nlevitt@archive.org',
|
||||||
description='Micro Interceptor Proxy - a simple MITM HTTP/S proxy',
|
description='warcprox - WARC writing MITM HTTP/S proxy',
|
||||||
license='GPL',
|
license='GPL',
|
||||||
url='https://github.com/allfro/pymiproxy',
|
url='https://github.com/nlevitt/warcprox',
|
||||||
download_url='https://github.com/allfro/pymiproxy/zipball/master',
|
|
||||||
long_description=read('README.md'),
|
long_description=read('README.md'),
|
||||||
packages=find_packages('src'),
|
packages=find_packages('src'),
|
||||||
package_dir={ '' : 'src' },
|
package_dir={ '' : 'src' },
|
||||||
install_requires = [
|
install_requires = [
|
||||||
'pyopenssl'
|
'pyopenssl',
|
||||||
|
'warctools'
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
@ -1,4 +1,12 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/python
|
||||||
|
# vim:set sw=4 et:
|
||||||
|
#
|
||||||
|
|
||||||
|
# python3 imports
|
||||||
|
# from http.server import HTTPServer, BaseHTTPRequestHandler
|
||||||
|
# from urllib.parse import urlparse, urlunparse, ParseResult
|
||||||
|
# from socketserver import ThreadingMixIn
|
||||||
|
# from http.client import HTTPResponse
|
||||||
|
|
||||||
from BaseHTTPServer import HTTPServer, BaseHTTPRequestHandler
|
from BaseHTTPServer import HTTPServer, BaseHTTPRequestHandler
|
||||||
from urlparse import urlparse, urlunparse, ParseResult
|
from urlparse import urlparse, urlunparse, ParseResult
|
||||||
@ -8,12 +16,22 @@ from tempfile import gettempdir
|
|||||||
from os import path, listdir
|
from os import path, listdir
|
||||||
from ssl import wrap_socket
|
from ssl import wrap_socket
|
||||||
from socket import socket
|
from socket import socket
|
||||||
from re import compile
|
|
||||||
from sys import argv
|
from sys import argv
|
||||||
|
|
||||||
from OpenSSL.crypto import (X509Extension, X509, dump_privatekey, dump_certificate, load_certificate, load_privatekey,
|
from OpenSSL.crypto import (X509Extension, X509, dump_privatekey, dump_certificate, load_certificate, load_privatekey,
|
||||||
PKey, TYPE_RSA, X509Req)
|
PKey, TYPE_RSA, X509Req)
|
||||||
from OpenSSL.SSL import FILETYPE_PEM
|
from OpenSSL.SSL import FILETYPE_PEM
|
||||||
|
import logging
|
||||||
|
import sys
|
||||||
|
import ssl
|
||||||
|
from hanzo.warctools import WarcRecord
|
||||||
|
from hanzo.warctools.warc import warc_datetime_str
|
||||||
|
import uuid
|
||||||
|
import hashlib
|
||||||
|
from datetime import datetime
|
||||||
|
import time
|
||||||
|
import Queue
|
||||||
|
import threading
|
||||||
|
|
||||||
__author__ = 'Nadeem Douba'
|
__author__ = 'Nadeem Douba'
|
||||||
__copyright__ = 'Copyright 2012, PyMiProxy Project'
|
__copyright__ = 'Copyright 2012, PyMiProxy Project'
|
||||||
@ -127,8 +145,6 @@ class UnsupportedSchemeException(Exception):
|
|||||||
|
|
||||||
class ProxyHandler(BaseHTTPRequestHandler):
|
class ProxyHandler(BaseHTTPRequestHandler):
|
||||||
|
|
||||||
r = compile(r'http://[^/]+(/?.*)(?i)')
|
|
||||||
|
|
||||||
def __init__(self, request, client_address, server):
|
def __init__(self, request, client_address, server):
|
||||||
self.is_connect = False
|
self.is_connect = False
|
||||||
BaseHTTPRequestHandler.__init__(self, request, client_address, server)
|
BaseHTTPRequestHandler.__init__(self, request, client_address, server)
|
||||||
@ -138,7 +154,8 @@ class ProxyHandler(BaseHTTPRequestHandler):
|
|||||||
if self.is_connect:
|
if self.is_connect:
|
||||||
self.hostname, self.port = self.path.split(':')
|
self.hostname, self.port = self.path.split(':')
|
||||||
else:
|
else:
|
||||||
u = urlparse(self.path)
|
self.url = self.path
|
||||||
|
u = urlparse(self.url)
|
||||||
if u.scheme != 'http':
|
if u.scheme != 'http':
|
||||||
raise UnsupportedSchemeException('Unknown scheme %s' % repr(u.scheme))
|
raise UnsupportedSchemeException('Unknown scheme %s' % repr(u.scheme))
|
||||||
self.hostname = u.hostname
|
self.hostname = u.hostname
|
||||||
@ -179,14 +196,17 @@ class ProxyHandler(BaseHTTPRequestHandler):
|
|||||||
self.end_headers()
|
self.end_headers()
|
||||||
#self.request.sendall('%s 200 Connection established\r\n\r\n' % self.request_version)
|
#self.request.sendall('%s 200 Connection established\r\n\r\n' % self.request_version)
|
||||||
self._transition_to_ssl()
|
self._transition_to_ssl()
|
||||||
except Exception, e:
|
except Exception as e:
|
||||||
self.send_error(500, str(e))
|
self.send_error(500, str(e))
|
||||||
return
|
return
|
||||||
|
|
||||||
# Reload!
|
# Reload!
|
||||||
self.setup()
|
self.setup()
|
||||||
self.ssl_host = 'https://%s' % self.path
|
|
||||||
self.handle_one_request()
|
self.handle_one_request()
|
||||||
|
# try:
|
||||||
|
# except ssl.SSLError, e:
|
||||||
|
# logging.warn("caught SSLError {0}".format(e))
|
||||||
|
# pass
|
||||||
|
|
||||||
|
|
||||||
def do_COMMAND(self):
|
def do_COMMAND(self):
|
||||||
@ -196,14 +216,14 @@ class ProxyHandler(BaseHTTPRequestHandler):
|
|||||||
try:
|
try:
|
||||||
# Connect to destination
|
# Connect to destination
|
||||||
self._connect_to_host()
|
self._connect_to_host()
|
||||||
except Exception, e:
|
except Exception as e:
|
||||||
self.send_error(500, str(e))
|
self.send_error(500, str(e))
|
||||||
return
|
return
|
||||||
# Extract path
|
# Extract path
|
||||||
|
|
||||||
# Build request
|
# Build request
|
||||||
req = '%s %s %s\r\n' % (self.command, self.path, self.request_version)
|
req = '%s %s %s\r\n' % (self.command, self.path, self.request_version)
|
||||||
|
|
||||||
# Add headers to the request
|
# Add headers to the request
|
||||||
req += '%s\r\n' % self.headers
|
req += '%s\r\n' % self.headers
|
||||||
|
|
||||||
@ -288,41 +308,123 @@ class MitmProxy(HTTPServer):
|
|||||||
self._res_plugins.append(interceptor_class)
|
self._res_plugins.append(interceptor_class)
|
||||||
|
|
||||||
|
|
||||||
|
def server_activate(self):
    """Activate the server via HTTPServer, then log the listening address."""
    HTTPServer.server_activate(self)
    # Index explicitly rather than unpack: only the first two fields of
    # server_address are reported.
    host = self.server_address[0]
    port = self.server_address[1]
    logging.info('listening on {0}:{1}'.format(host, port))
|
||||||
|
|
||||||
|
|
||||||
|
def server_close(self):
    """Close the server via HTTPServer, then log that it has shut down."""
    HTTPServer.server_close(self)
    shutdown_message = 'shut down'
    logging.info(shutdown_message)
|
||||||
|
|
||||||
|
|
||||||
class AsyncMitmProxy(ThreadingMixIn, MitmProxy):
|
class AsyncMitmProxy(ThreadingMixIn, MitmProxy):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class MitmProxyHandler(ProxyHandler):
|
class WarcRecordQueuer(RequestInterceptorPlugin, ResponseInterceptorPlugin):
|
||||||
|
|
||||||
def mitm_request(self, data):
|
warc_record_out_queue = Queue.Queue()
|
||||||
print '>> %s' % repr(data[:100])
|
|
||||||
return data
|
|
||||||
|
|
||||||
def mitm_response(self, data):
|
def __init__(self, server, msg):
|
||||||
print '<< %s' % repr(data[:100])
|
InterceptorPlugin.__init__(self, server, msg)
|
||||||
|
|
||||||
|
if msg.is_connect:
|
||||||
|
assert not msg.url
|
||||||
|
|
||||||
|
if int(msg.port) == 443:
|
||||||
|
netloc = msg.hostname
|
||||||
|
else:
|
||||||
|
netloc = '{0}:{1}'.format(msg.hostname, msg.port)
|
||||||
|
|
||||||
|
self.url = urlunparse(
|
||||||
|
ParseResult(
|
||||||
|
scheme='https',
|
||||||
|
netloc=netloc,
|
||||||
|
params='',
|
||||||
|
path=msg.path,
|
||||||
|
query='',
|
||||||
|
fragment=''
|
||||||
|
)
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
assert msg.url
|
||||||
|
self.url = msg.url
|
||||||
|
|
||||||
|
|
||||||
|
def do_request(self, data):
|
||||||
|
logging.info('{0} >> {1}'.format(self.url, repr(data[:100])))
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
|
||||||
class DebugInterceptor(RequestInterceptorPlugin, ResponseInterceptorPlugin):
|
def make_warc_uuid(self, text):
    """Return a ``<urn:uuid:...>`` identifier derived from *text*.

    The id is deterministic: the first 32 hex digits of the SHA-1 digest
    of *text* are interpreted as a UUID and wrapped in the urn form.
    """
    digest_hex = hashlib.sha1(text).hexdigest()
    record_uuid = uuid.UUID(digest_hex[:32])
    return "<urn:uuid:{0}>".format(record_uuid)
|
||||||
|
|
||||||
def do_request(self, data):
|
|
||||||
print '>> %s' % repr(data[:100])
|
|
||||||
return data
|
|
||||||
|
|
||||||
def do_response(self, data):
|
def do_response(self, data):
|
||||||
print '<< %s' % repr(data[:100])
|
logging.info('{0} << {1}'.format(self.url, repr(data[:100])))
|
||||||
return data
|
|
||||||
|
warc_record_id = self.make_warc_uuid("{0} {1}".format(self.url, time.time()))
|
||||||
|
logging.info('{0}: {1}'.format(WarcRecord.ID, warc_record_id))
|
||||||
|
|
||||||
|
headers = []
|
||||||
|
headers.append((WarcRecord.ID, warc_record_id))
|
||||||
|
headers.append((WarcRecord.URL, self.url))
|
||||||
|
headers.append((WarcRecord.DATE, warc_datetime_str(datetime.now())))
|
||||||
|
# headers.append((WarcRecord.IP_ADDRESS, ip))
|
||||||
|
headers.append((WarcRecord.TYPE, WarcRecord.RESPONSE))
|
||||||
|
|
||||||
|
warcrecord = WarcRecord(headers=headers, content=("application/http;msgtype=response", data))
|
||||||
|
|
||||||
|
# warcrecord.write_to(sys.stdout, gzip=False)
|
||||||
|
WarcRecordQueuer.warc_record_out_queue.put(warcrecord)
|
||||||
|
|
||||||
|
return data
|
||||||
|
|
||||||
|
|
||||||
|
class WarcWriterThread(threading.Thread):
    """Background thread that drains WARC records from a queue.

    Records are currently only logged as they arrive; nothing is written
    out yet.  Set the ``stop`` event to ask the thread to exit; ``run()``
    re-checks it after every queue wait, i.e. at least every
    ``POLL_INTERVAL`` seconds while the queue is idle.
    """

    # Longest time (seconds) a single queue wait may block before the
    # ``stop`` event is checked again.
    POLL_INTERVAL = 0.5

    def __init__(self, warc_record_in_queue):
        """Create the (not yet started) writer thread.

        warc_record_in_queue -- queue-like object whose
            ``get(block=..., timeout=...)`` returns the next record and
            raises ``Queue.Empty`` when none arrives in time.
        """
        threading.Thread.__init__(self, name='WarcWriterThread')
        self.warc_record_in_queue = warc_record_in_queue
        self.stop = threading.Event()

    def run(self):
        """Consume records from the queue until ``stop`` is set."""
        logging.info('WarcWriterThread starting')

        while not self.stop.is_set():
            try:
                # The get must block: with block=False the timeout is
                # ignored and Queue.Empty is raised immediately, which
                # turns this loop into a busy spin while the queue is empty.
                warc_record = self.warc_record_in_queue.get(
                    block=True, timeout=self.POLL_INTERVAL)
            except Queue.Empty:
                # Nothing arrived in time; loop round to re-check ``stop``.
                continue

            logging.info('got warc record to write from the queue: {0}'.format(warc_record))
            # TODO: write the record out; for now it is only logged.

        logging.info('WarcWriterThread shutting down')
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG, format='%(asctime)s %(process)d %(levelname)s %(funcName)s(%(filename)s:%(lineno)d) %(message)s')
|
||||||
proxy = None
|
proxy = None
|
||||||
if not argv[1:]:
|
if not argv[1:]:
|
||||||
proxy = AsyncMitmProxy()
|
proxy = AsyncMitmProxy()
|
||||||
else:
|
else:
|
||||||
proxy = AsyncMitmProxy(ca_file=argv[1])
|
proxy = AsyncMitmProxy(ca_file=argv[1])
|
||||||
proxy.register_interceptor(DebugInterceptor)
|
|
||||||
|
proxy.register_interceptor(WarcRecordQueuer)
|
||||||
|
|
||||||
|
warc_writer = WarcWriterThread(WarcRecordQueuer.warc_record_out_queue)
|
||||||
|
warc_writer.start()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
proxy.serve_forever()
|
proxy.serve_forever()
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
|
pass
|
||||||
|
finally:
|
||||||
|
warc_writer.stop.set()
|
||||||
proxy.server_close()
|
proxy.server_close()
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user