diff --git a/setup.py b/setup.py index 96708d0..ae6d0b2 100644 --- a/setup.py +++ b/setup.py @@ -5,18 +5,18 @@ def read(fname): return open(path.join(path.dirname(__file__), fname)).read() setup( - name='pymiproxy', - author='Nadeem Douba', + name='warcprox', + author='Noah Levitt', version='1.0', - author_email='ndouba@gmail.com', - description='Micro Interceptor Proxy - a simple MITM HTTP/S proxy', + author_email='nlevitt@archive.org', + description='warcprox - WARC writing MITM HTTP/S proxy', license='GPL', - url='https://github.com/allfro/pymiproxy', - download_url='https://github.com/allfro/pymiproxy/zipball/master', + url='https://github.com/nlevitt/warcprox', long_description=read('README.md'), packages=find_packages('src'), package_dir={ '' : 'src' }, install_requires = [ - 'pyopenssl' + 'pyopenssl', + 'warctools' ] ) diff --git a/src/miproxy/proxy.py b/src/miproxy/proxy.py index c2caee1..54f465c 100644 --- a/src/miproxy/proxy.py +++ b/src/miproxy/proxy.py @@ -1,4 +1,12 @@ -#!/usr/bin/env python +#!/usr/bin/python +# vim:set sw=4 et: +# + +# python3 imports +# from http.server import HTTPServer, BaseHTTPRequestHandler +# from urllib.parse import urlparse, urlunparse, ParseResult +# from socketserver import ThreadingMixIn +# from http.client import HTTPResponse from BaseHTTPServer import HTTPServer, BaseHTTPRequestHandler from urlparse import urlparse, urlunparse, ParseResult @@ -8,12 +16,22 @@ from tempfile import gettempdir from os import path, listdir from ssl import wrap_socket from socket import socket -from re import compile from sys import argv from OpenSSL.crypto import (X509Extension, X509, dump_privatekey, dump_certificate, load_certificate, load_privatekey, PKey, TYPE_RSA, X509Req) from OpenSSL.SSL import FILETYPE_PEM +import logging +import sys +import ssl +from hanzo.warctools import WarcRecord +from hanzo.warctools.warc import warc_datetime_str +import uuid +import hashlib +from datetime import datetime +import time +import Queue +import threading __author__ = 'Nadeem Douba' __copyright__ = 'Copyright 2012, PyMiProxy Project' @@ -127,8 +145,6 @@ class UnsupportedSchemeException(Exception): class ProxyHandler(BaseHTTPRequestHandler): - r = compile(r'http://[^/]+(/?.*)(?i)') - def __init__(self, request, client_address, server): self.is_connect = False BaseHTTPRequestHandler.__init__(self, request, client_address, server) @@ -138,7 +154,8 @@ class ProxyHandler(BaseHTTPRequestHandler): if self.is_connect: self.hostname, self.port = self.path.split(':') else: - u = urlparse(self.path) + self.url = self.path + u = urlparse(self.url) if u.scheme != 'http': raise UnsupportedSchemeException('Unknown scheme %s' % repr(u.scheme)) self.hostname = u.hostname @@ -179,14 +196,17 @@ class ProxyHandler(BaseHTTPRequestHandler): self.end_headers() #self.request.sendall('%s 200 Connection established\r\n\r\n' % self.request_version) self._transition_to_ssl() - except Exception, e: + except Exception as e: self.send_error(500, str(e)) return # Reload! self.setup() - self.ssl_host = 'https://%s' % self.path self.handle_one_request() +# try: +# except ssl.SSLError, e: +# logging.warn("caught SSLError {0}".format(e)) +# pass def do_COMMAND(self): @@ -196,14 +216,14 @@ class ProxyHandler(BaseHTTPRequestHandler): try: # Connect to destination self._connect_to_host() - except Exception, e: + except Exception as e: self.send_error(500, str(e)) return # Extract path # Build request req = '%s %s %s\r\n' % (self.command, self.path, self.request_version) - + # Add headers to the request req += '%s\r\n' % self.headers @@ -288,41 +308,123 @@ class MitmProxy(HTTPServer): self._res_plugins.append(interceptor_class) + def server_activate(self): + HTTPServer.server_activate(self) + logging.info('listening on {0}:{1}'.format(self.server_address[0], self.server_address[1])) + + + def server_close(self): + HTTPServer.server_close(self) + logging.info('shut down') + + class AsyncMitmProxy(ThreadingMixIn, MitmProxy): pass -class MitmProxyHandler(ProxyHandler): +class WarcRecordQueuer(RequestInterceptorPlugin, ResponseInterceptorPlugin): - def mitm_request(self, data): - print '>> %s' % repr(data[:100]) - return data + warc_record_out_queue = Queue.Queue() - def mitm_response(self, data): - print '<< %s' % repr(data[:100]) + def __init__(self, server, msg): + InterceptorPlugin.__init__(self, server, msg) + + if msg.is_connect: + assert not msg.url + + if int(msg.port) == 443: + netloc = msg.hostname + else: + netloc = '{0}:{1}'.format(msg.hostname, msg.port) + + self.url = urlunparse( + ParseResult( + scheme='https', + netloc=netloc, + params='', + path=msg.path, + query='', + fragment='' + ) + ) + else: + assert msg.url + self.url = msg.url + + + def do_request(self, data): + logging.info('{0} >> {1}'.format(self.url, repr(data[:100]))) return data -class DebugInterceptor(RequestInterceptorPlugin, ResponseInterceptorPlugin): + def make_warc_uuid(self, text): + return "".format(uuid.UUID(hashlib.sha1(text).hexdigest()[0:32])) - def do_request(self, data): - print '>> %s' % repr(data[:100]) - return data - def do_response(self, data): - print '<< %s' % repr(data[:100]) - return data + def do_response(self, data): + logging.info('{0} << {1}'.format(self.url, repr(data[:100]))) + + warc_record_id = self.make_warc_uuid("{0} {1}".format(self.url, time.time())) + logging.info('{0}: {1}'.format(WarcRecord.ID, warc_record_id)) + + headers = [] + headers.append((WarcRecord.ID, warc_record_id)) + headers.append((WarcRecord.URL, self.url)) + headers.append((WarcRecord.DATE, warc_datetime_str(datetime.now()))) + # headers.append((WarcRecord.IP_ADDRESS, ip)) + headers.append((WarcRecord.TYPE, WarcRecord.RESPONSE)) + + warcrecord = WarcRecord(headers=headers, content=("application/http;msgtype=response", data)) + + # warcrecord.write_to(sys.stdout, gzip=False) + WarcRecordQueuer.warc_record_out_queue.put(warcrecord) + + return data + + +class WarcWriterThread(threading.Thread): + + # def __init__(self, group=None, target=None, name=None, args=(), kwargs=None, verbose=None): + # Thread.__init__(self, group=group, target=target, name=name, args=args, kwargs=args + + def __init__(self, warc_record_in_queue): + threading.Thread.__init__(self, name='WarcWriterThread') + self.warc_record_in_queue = warc_record_in_queue + self.stop = threading.Event() + + + def run(self): + logging.info('WarcWriterThread starting') + + while not self.stop.is_set(): + try: + warc_record = self.warc_record_in_queue.get(block=False, timeout=0.5) + logging.info('got warc record to write from the queue: {0}'.format(warc_record)) + # warc_record.write_to(sys.stdout, gzip=False) + except Queue.Empty: + pass + + logging.info('WarcWriterThread shutting down') if __name__ == '__main__': + logging.basicConfig(stream=sys.stdout, level=logging.DEBUG, format='%(asctime)s %(process)d %(levelname)s %(funcName)s(%(filename)s:%(lineno)d) %(message)s') proxy = None if not argv[1:]: proxy = AsyncMitmProxy() else: proxy = AsyncMitmProxy(ca_file=argv[1]) - proxy.register_interceptor(DebugInterceptor) + + proxy.register_interceptor(WarcRecordQueuer) + + warc_writer = WarcWriterThread(WarcRecordQueuer.warc_record_out_queue) + warc_writer.start() + try: proxy.serve_forever() except KeyboardInterrupt: + pass + finally: + warc_writer.stop.set() proxy.server_close()