From bbf3fad1dcd9910ba30d025fc063c17fbf3a8c4d Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 15 May 2019 15:58:47 -0700 Subject: [PATCH] avoid using warcproxy.py stuff in mitmproxy.py --- setup.py | 2 +- warcprox/mitmproxy.py | 56 +++++++++++++++++++++++++++++++++++++++++++ warcprox/warcproxy.py | 50 ++++---------------------------------- 3 files changed, 62 insertions(+), 46 deletions(-) diff --git a/setup.py b/setup.py index 56e8213..48192ba 100755 --- a/setup.py +++ b/setup.py @@ -43,7 +43,7 @@ except: setuptools.setup( name='warcprox', - version='2.4.12', + version='2.4.13', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index c1f01bd..6ae52f5 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -76,9 +76,13 @@ import urlcanon import time import collections import cProfile +from urllib3 import PoolManager from urllib3.util import is_connection_dropped from urllib3.exceptions import TimeoutError, HTTPError import doublethink +from cachetools import TTLCache +from threading import RLock +from certauth.certauth import CertificateAuthority class ProxyingRecorder(object): """ @@ -223,9 +227,12 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): and records the bytes in transit as it proxies them. ''' logger = logging.getLogger("warcprox.mitmproxy.MitmProxyHandler") + _socket_timeout = 60 _max_resource_size = None _tmp_file_max_memory_size = 512 * 1024 + onion_tor_socks_proxy_host = None + onion_tor_socks_proxy_port = None def __init__(self, request, client_address, server): threading.current_thread().name = 'MitmProxyHandler(tid={},started={},client={}:{})'.format(warcprox.gettid(), datetime.datetime.utcnow().isoformat(), client_address[0], client_address[1]) @@ -737,3 +744,52 @@ class PooledMitmProxy(PooledMixIn, MitmProxy): for sock in self.remote_server_socks: self.shutdown_request(sock) +class SingleThreadedMitmProxy(http_server.HTTPServer): + logger = logging.getLogger('warcprox.warcproxy.SingleThreadedMitmProxy') + + def __init__( + self, MitmProxyHandlerClass=MitmProxyHandler, + options=warcprox.Options()): + self.options = options + + # TTLCache is not thread-safe. Access to the shared cache from multiple + # threads must be properly synchronized with an RLock according to ref: + # https://cachetools.readthedocs.io/en/latest/ + self.bad_hostnames_ports = TTLCache(maxsize=1024, ttl=60) + self.bad_hostnames_ports_lock = RLock() + + self.remote_connection_pool = PoolManager( + num_pools=max(round(options.max_threads / 6), 200) if options.max_threads else 200) + + if options.onion_tor_socks_proxy: + try: + host, port = options.onion_tor_socks_proxy.split(':') + MitmProxyHandlerClass.onion_tor_socks_proxy_host = host + MitmProxyHandlerClass.onion_tor_socks_proxy_port = int(port) + except ValueError: + MitmProxyHandlerClass.onion_tor_socks_proxy_host = options.onion_tor_socks_proxy + MitmProxyHandlerClass.onion_tor_socks_proxy_port = None + + if options.socket_timeout: + MitmProxyHandlerClass._socket_timeout = options.socket_timeout + if options.max_resource_size: + MitmProxyHandlerClass._max_resource_size = options.max_resource_size + if options.tmp_file_max_memory_size: + MitmProxyHandlerClass._tmp_file_max_memory_size = options.tmp_file_max_memory_size + + self.digest_algorithm = options.digest_algorithm or 'sha1' + + ca_name = ('Warcprox CA on %s' % socket.gethostname())[:64] + self.ca = CertificateAuthority( + ca_file=options.cacert or 'warcprox-ca.pem', + certs_dir=options.certs_dir or './warcprox-ca', + ca_name=ca_name) + + server_address = ( + options.address or 'localhost', + options.port if options.port is not None else 8000) + + http_server.HTTPServer.__init__( + self, server_address, MitmProxyHandlerClass, + bind_and_activate=True) + diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 9b8545d..e5b35d2 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -38,18 +38,14 @@ import logging import json import socket from hanzo import warctools -from certauth.certauth import CertificateAuthority import warcprox import datetime import urlcanon import os -from urllib3 import PoolManager import tempfile import hashlib import doublethink import re -from threading import RLock -from cachetools import TTLCache class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): ''' @@ -423,56 +419,20 @@ class RecordedUrl: # inherit from object so that multiple inheritance from this class works # properly in python 2 # http://stackoverflow.com/questions/1713038/super-fails-with-error-typeerror-argument-1-must-be-type-not-classobj#18392639 -class SingleThreadedWarcProxy(http_server.HTTPServer, object): +class SingleThreadedWarcProxy(warcprox.mitmproxy.SingleThreadedMitmProxy): logger = logging.getLogger("warcprox.warcproxy.WarcProxy") def __init__( self, stats_db=None, status_callback=None, options=warcprox.Options()): self.start_time = doublethink.utcnow() + + warcprox.mitmproxy.SingleThreadedMitmProxy.__init__( + self, WarcProxyHandler, options) + self.status_callback = status_callback self.stats_db = stats_db - self.options = options - # TTLCache is not thread-safe. Access to the shared cache from multiple - # threads must be properly synchronized with an RLock according to ref: - # https://cachetools.readthedocs.io/en/latest/ - self.bad_hostnames_ports = TTLCache(maxsize=1024, ttl=60) - self.bad_hostnames_ports_lock = RLock() - self.remote_connection_pool = PoolManager( - num_pools=max(round(options.max_threads / 6), 200) if options.max_threads else 200) - server_address = ( - options.address or 'localhost', - options.port if options.port is not None else 8000) - - if options.onion_tor_socks_proxy: - try: - host, port = options.onion_tor_socks_proxy.split(':') - WarcProxyHandler.onion_tor_socks_proxy_host = host - WarcProxyHandler.onion_tor_socks_proxy_port = int(port) - except ValueError: - WarcProxyHandler.onion_tor_socks_proxy_host = options.onion_tor_socks_proxy - WarcProxyHandler.onion_tor_socks_proxy_port = None - - if options.socket_timeout: - WarcProxyHandler._socket_timeout = options.socket_timeout - if options.max_resource_size: - WarcProxyHandler._max_resource_size = options.max_resource_size - if options.tmp_file_max_memory_size: - WarcProxyHandler._tmp_file_max_memory_size = options.tmp_file_max_memory_size - - http_server.HTTPServer.__init__( - self, server_address, WarcProxyHandler, bind_and_activate=True) - - self.digest_algorithm = options.digest_algorithm or 'sha1' - - ca_name = ('Warcprox CA on %s' % socket.gethostname())[:64] - self.ca = CertificateAuthority( - ca_file=options.cacert or 'warcprox-ca.pem', - certs_dir=options.certs_dir or './warcprox-ca', - ca_name=ca_name) - self.recorded_url_q = queue.Queue(maxsize=options.queue_size or 1000) - self.running_stats = warcprox.stats.RunningStats() def status(self):