avoid using warcproxy.py stuff in mitmproxy.py

This commit is contained in:
Noah Levitt 2019-05-15 15:58:47 -07:00
parent f51f2ec225
commit bbf3fad1dc
3 changed files with 62 additions and 46 deletions

View File

@ -43,7 +43,7 @@ except:
setuptools.setup( setuptools.setup(
name='warcprox', name='warcprox',
version='2.4.12', version='2.4.13',
description='WARC writing MITM HTTP/S proxy', description='WARC writing MITM HTTP/S proxy',
url='https://github.com/internetarchive/warcprox', url='https://github.com/internetarchive/warcprox',
author='Noah Levitt', author='Noah Levitt',

View File

@ -76,9 +76,13 @@ import urlcanon
import time import time
import collections import collections
import cProfile import cProfile
from urllib3 import PoolManager
from urllib3.util import is_connection_dropped from urllib3.util import is_connection_dropped
from urllib3.exceptions import TimeoutError, HTTPError from urllib3.exceptions import TimeoutError, HTTPError
import doublethink import doublethink
from cachetools import TTLCache
from threading import RLock
from certauth.certauth import CertificateAuthority
class ProxyingRecorder(object): class ProxyingRecorder(object):
""" """
@ -223,9 +227,12 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
and records the bytes in transit as it proxies them. and records the bytes in transit as it proxies them.
''' '''
logger = logging.getLogger("warcprox.mitmproxy.MitmProxyHandler") logger = logging.getLogger("warcprox.mitmproxy.MitmProxyHandler")
_socket_timeout = 60 _socket_timeout = 60
_max_resource_size = None _max_resource_size = None
_tmp_file_max_memory_size = 512 * 1024 _tmp_file_max_memory_size = 512 * 1024
onion_tor_socks_proxy_host = None
onion_tor_socks_proxy_port = None
def __init__(self, request, client_address, server): def __init__(self, request, client_address, server):
threading.current_thread().name = 'MitmProxyHandler(tid={},started={},client={}:{})'.format(warcprox.gettid(), datetime.datetime.utcnow().isoformat(), client_address[0], client_address[1]) threading.current_thread().name = 'MitmProxyHandler(tid={},started={},client={}:{})'.format(warcprox.gettid(), datetime.datetime.utcnow().isoformat(), client_address[0], client_address[1])
@ -737,3 +744,52 @@ class PooledMitmProxy(PooledMixIn, MitmProxy):
for sock in self.remote_server_socks: for sock in self.remote_server_socks:
self.shutdown_request(sock) self.shutdown_request(sock)
class SingleThreadedMitmProxy(http_server.HTTPServer):
    '''
    HTTP server that prepares the state shared by its request handlers.

    Construction does the following, in order:

    - stores `options` and creates the `bad_hostnames_ports` TTL cache
      together with the lock that guards it;
    - creates the urllib3 connection pool used for remote connections;
    - copies the relevant settings from `options` onto
      `MitmProxyHandlerClass` as class attributes;
    - creates the certificate authority;
    - binds and activates the listening socket.
    '''
    # NOTE(review): the logger name says 'warcprox.warcproxy' although this
    # class is referenced as warcprox.mitmproxy.SingleThreadedMitmProxy --
    # confirm whether that is intentional.
    logger = logging.getLogger('warcprox.warcproxy.SingleThreadedMitmProxy')

    def __init__(
            self, MitmProxyHandlerClass=MitmProxyHandler,
            options=warcprox.Options()):
        '''
        Args:
            MitmProxyHandlerClass: request handler class; it is configured
                in place (class attributes are overwritten) and then
                handed to `HTTPServer`.
            options: settings object; the attributes read here are
                `max_threads`, `onion_tor_socks_proxy`, `socket_timeout`,
                `max_resource_size`, `tmp_file_max_memory_size`,
                `digest_algorithm`, `cacert`, `certs_dir`, `address` and
                `port`.
        '''
        self.options = options

        # cachetools documents TTLCache as not thread-safe, so every access
        # from multiple threads has to happen while holding the RLock below
        # (see https://cachetools.readthedocs.io/en/latest/).
        self.bad_hostnames_ports = TTLCache(maxsize=1024, ttl=60)
        self.bad_hostnames_ports_lock = RLock()

        # Never fewer than 200 pools; scale with the thread count when one
        # is configured.
        if options.max_threads:
            pool_count = max(round(options.max_threads / 6), 200)
        else:
            pool_count = 200
        self.remote_connection_pool = PoolManager(num_pools=pool_count)

        tor_proxy = options.onion_tor_socks_proxy
        if tor_proxy:
            try:
                tor_host, tor_port = tor_proxy.split(':')
                tor_port = int(tor_port)
            except ValueError:
                # Not exactly "host:port" with a numeric port: use the whole
                # string as the host and leave the port unset.
                tor_host, tor_port = tor_proxy, None
            MitmProxyHandlerClass.onion_tor_socks_proxy_host = tor_host
            MitmProxyHandlerClass.onion_tor_socks_proxy_port = tor_port

        # Only truthy option values replace the handler class defaults.
        handler_overrides = (
            ('_socket_timeout', options.socket_timeout),
            ('_max_resource_size', options.max_resource_size),
            ('_tmp_file_max_memory_size', options.tmp_file_max_memory_size),
        )
        for attr_name, configured in handler_overrides:
            if configured:
                setattr(MitmProxyHandlerClass, attr_name, configured)

        self.digest_algorithm = options.digest_algorithm or 'sha1'

        # The CA name is cut to at most 64 characters.
        hostname = socket.gethostname()
        self.ca = CertificateAuthority(
            ca_file=options.cacert or 'warcprox-ca.pem',
            certs_dir=options.certs_dir or './warcprox-ca',
            ca_name=('Warcprox CA on %s' % hostname)[:64])

        # Port 0 is a legitimate value, so only None falls back to 8000.
        listen_host = options.address or 'localhost'
        listen_port = 8000 if options.port is None else options.port

        http_server.HTTPServer.__init__(
            self, (listen_host, listen_port), MitmProxyHandlerClass,
            bind_and_activate=True)

View File

@ -38,18 +38,14 @@ import logging
import json import json
import socket import socket
from hanzo import warctools from hanzo import warctools
from certauth.certauth import CertificateAuthority
import warcprox import warcprox
import datetime import datetime
import urlcanon import urlcanon
import os import os
from urllib3 import PoolManager
import tempfile import tempfile
import hashlib import hashlib
import doublethink import doublethink
import re import re
from threading import RLock
from cachetools import TTLCache
class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
''' '''
@ -423,56 +419,20 @@ class RecordedUrl:
# inherit from object so that multiple inheritance from this class works # inherit from object so that multiple inheritance from this class works
# properly in python 2 # properly in python 2
# http://stackoverflow.com/questions/1713038/super-fails-with-error-typeerror-argument-1-must-be-type-not-classobj#18392639 # http://stackoverflow.com/questions/1713038/super-fails-with-error-typeerror-argument-1-must-be-type-not-classobj#18392639
class SingleThreadedWarcProxy(http_server.HTTPServer, object): class SingleThreadedWarcProxy(warcprox.mitmproxy.SingleThreadedMitmProxy):
logger = logging.getLogger("warcprox.warcproxy.WarcProxy") logger = logging.getLogger("warcprox.warcproxy.WarcProxy")
def __init__( def __init__(
self, stats_db=None, status_callback=None, self, stats_db=None, status_callback=None,
options=warcprox.Options()): options=warcprox.Options()):
self.start_time = doublethink.utcnow() self.start_time = doublethink.utcnow()
warcprox.mitmproxy.SingleThreadedMitmProxy.__init__(
self, WarcProxyHandler, options)
self.status_callback = status_callback self.status_callback = status_callback
self.stats_db = stats_db self.stats_db = stats_db
self.options = options
# TTLCache is not thread-safe. Access to the shared cache from multiple
# threads must be properly synchronized with an RLock according to ref:
# https://cachetools.readthedocs.io/en/latest/
self.bad_hostnames_ports = TTLCache(maxsize=1024, ttl=60)
self.bad_hostnames_ports_lock = RLock()
self.remote_connection_pool = PoolManager(
num_pools=max(round(options.max_threads / 6), 200) if options.max_threads else 200)
server_address = (
options.address or 'localhost',
options.port if options.port is not None else 8000)
if options.onion_tor_socks_proxy:
try:
host, port = options.onion_tor_socks_proxy.split(':')
WarcProxyHandler.onion_tor_socks_proxy_host = host
WarcProxyHandler.onion_tor_socks_proxy_port = int(port)
except ValueError:
WarcProxyHandler.onion_tor_socks_proxy_host = options.onion_tor_socks_proxy
WarcProxyHandler.onion_tor_socks_proxy_port = None
if options.socket_timeout:
WarcProxyHandler._socket_timeout = options.socket_timeout
if options.max_resource_size:
WarcProxyHandler._max_resource_size = options.max_resource_size
if options.tmp_file_max_memory_size:
WarcProxyHandler._tmp_file_max_memory_size = options.tmp_file_max_memory_size
http_server.HTTPServer.__init__(
self, server_address, WarcProxyHandler, bind_and_activate=True)
self.digest_algorithm = options.digest_algorithm or 'sha1'
ca_name = ('Warcprox CA on %s' % socket.gethostname())[:64]
self.ca = CertificateAuthority(
ca_file=options.cacert or 'warcprox-ca.pem',
certs_dir=options.certs_dir or './warcprox-ca',
ca_name=ca_name)
self.recorded_url_q = queue.Queue(maxsize=options.queue_size or 1000) self.recorded_url_q = queue.Queue(maxsize=options.queue_size or 1000)
self.running_stats = warcprox.stats.RunningStats() self.running_stats = warcprox.stats.RunningStats()
def status(self): def status(self):