avoid using warcproxy.py stuff in mitmproxy.py

This commit is contained in:
Noah Levitt 2019-05-15 15:58:47 -07:00
parent f51f2ec225
commit bbf3fad1dc
3 changed files with 62 additions and 46 deletions

View File

@ -43,7 +43,7 @@ except:
setuptools.setup(
name='warcprox',
version='2.4.12',
version='2.4.13',
description='WARC writing MITM HTTP/S proxy',
url='https://github.com/internetarchive/warcprox',
author='Noah Levitt',

View File

@ -76,9 +76,13 @@ import urlcanon
import time
import collections
import cProfile
from urllib3 import PoolManager
from urllib3.util import is_connection_dropped
from urllib3.exceptions import TimeoutError, HTTPError
import doublethink
from cachetools import TTLCache
from threading import RLock
from certauth.certauth import CertificateAuthority
class ProxyingRecorder(object):
"""
@ -223,9 +227,12 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
and records the bytes in transit as it proxies them.
'''
logger = logging.getLogger("warcprox.mitmproxy.MitmProxyHandler")
_socket_timeout = 60
_max_resource_size = None
_tmp_file_max_memory_size = 512 * 1024
onion_tor_socks_proxy_host = None
onion_tor_socks_proxy_port = None
def __init__(self, request, client_address, server):
threading.current_thread().name = 'MitmProxyHandler(tid={},started={},client={}:{})'.format(warcprox.gettid(), datetime.datetime.utcnow().isoformat(), client_address[0], client_address[1])
@ -737,3 +744,52 @@ class PooledMitmProxy(PooledMixIn, MitmProxy):
for sock in self.remote_server_socks:
self.shutdown_request(sock)
class SingleThreadedMitmProxy(http_server.HTTPServer):
logger = logging.getLogger('warcprox.warcproxy.SingleThreadedMitmProxy')
def __init__(
self, MitmProxyHandlerClass=MitmProxyHandler,
options=warcprox.Options()):
self.options = options
# TTLCache is not thread-safe. Access to the shared cache from multiple
# threads must be properly synchronized with an RLock according to ref:
# https://cachetools.readthedocs.io/en/latest/
self.bad_hostnames_ports = TTLCache(maxsize=1024, ttl=60)
self.bad_hostnames_ports_lock = RLock()
self.remote_connection_pool = PoolManager(
num_pools=max(round(options.max_threads / 6), 200) if options.max_threads else 200)
if options.onion_tor_socks_proxy:
try:
host, port = options.onion_tor_socks_proxy.split(':')
MitmProxyHandlerClass.onion_tor_socks_proxy_host = host
MitmProxyHandlerClass.onion_tor_socks_proxy_port = int(port)
except ValueError:
MitmProxyHandlerClass.onion_tor_socks_proxy_host = options.onion_tor_socks_proxy
MitmProxyHandlerClass.onion_tor_socks_proxy_port = None
if options.socket_timeout:
MitmProxyHandlerClass._socket_timeout = options.socket_timeout
if options.max_resource_size:
MitmProxyHandlerClass._max_resource_size = options.max_resource_size
if options.tmp_file_max_memory_size:
MitmProxyHandlerClass._tmp_file_max_memory_size = options.tmp_file_max_memory_size
self.digest_algorithm = options.digest_algorithm or 'sha1'
ca_name = ('Warcprox CA on %s' % socket.gethostname())[:64]
self.ca = CertificateAuthority(
ca_file=options.cacert or 'warcprox-ca.pem',
certs_dir=options.certs_dir or './warcprox-ca',
ca_name=ca_name)
server_address = (
options.address or 'localhost',
options.port if options.port is not None else 8000)
http_server.HTTPServer.__init__(
self, server_address, MitmProxyHandlerClass,
bind_and_activate=True)

View File

@ -38,18 +38,14 @@ import logging
import json
import socket
from hanzo import warctools
from certauth.certauth import CertificateAuthority
import warcprox
import datetime
import urlcanon
import os
from urllib3 import PoolManager
import tempfile
import hashlib
import doublethink
import re
from threading import RLock
from cachetools import TTLCache
class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
'''
@ -423,56 +419,20 @@ class RecordedUrl:
# inherit from object so that multiple inheritance from this class works
# properly in python 2
# http://stackoverflow.com/questions/1713038/super-fails-with-error-typeerror-argument-1-must-be-type-not-classobj#18392639
class SingleThreadedWarcProxy(http_server.HTTPServer, object):
class SingleThreadedWarcProxy(warcprox.mitmproxy.SingleThreadedMitmProxy):
logger = logging.getLogger("warcprox.warcproxy.WarcProxy")
def __init__(
self, stats_db=None, status_callback=None,
options=warcprox.Options()):
self.start_time = doublethink.utcnow()
warcprox.mitmproxy.SingleThreadedMitmProxy.__init__(
self, WarcProxyHandler, options)
self.status_callback = status_callback
self.stats_db = stats_db
self.options = options
# TTLCache is not thread-safe. Access to the shared cache from multiple
# threads must be properly synchronized with an RLock according to ref:
# https://cachetools.readthedocs.io/en/latest/
self.bad_hostnames_ports = TTLCache(maxsize=1024, ttl=60)
self.bad_hostnames_ports_lock = RLock()
self.remote_connection_pool = PoolManager(
num_pools=max(round(options.max_threads / 6), 200) if options.max_threads else 200)
server_address = (
options.address or 'localhost',
options.port if options.port is not None else 8000)
if options.onion_tor_socks_proxy:
try:
host, port = options.onion_tor_socks_proxy.split(':')
WarcProxyHandler.onion_tor_socks_proxy_host = host
WarcProxyHandler.onion_tor_socks_proxy_port = int(port)
except ValueError:
WarcProxyHandler.onion_tor_socks_proxy_host = options.onion_tor_socks_proxy
WarcProxyHandler.onion_tor_socks_proxy_port = None
if options.socket_timeout:
WarcProxyHandler._socket_timeout = options.socket_timeout
if options.max_resource_size:
WarcProxyHandler._max_resource_size = options.max_resource_size
if options.tmp_file_max_memory_size:
WarcProxyHandler._tmp_file_max_memory_size = options.tmp_file_max_memory_size
http_server.HTTPServer.__init__(
self, server_address, WarcProxyHandler, bind_and_activate=True)
self.digest_algorithm = options.digest_algorithm or 'sha1'
ca_name = ('Warcprox CA on %s' % socket.gethostname())[:64]
self.ca = CertificateAuthority(
ca_file=options.cacert or 'warcprox-ca.pem',
certs_dir=options.certs_dir or './warcprox-ca',
ca_name=ca_name)
self.recorded_url_q = queue.Queue(maxsize=options.queue_size or 1000)
self.running_stats = warcprox.stats.RunningStats()
def status(self):