mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
avoid using warcproxy.py stuff in mitmproxy.py
This commit is contained in:
parent
f51f2ec225
commit
bbf3fad1dc
2
setup.py
2
setup.py
@ -43,7 +43,7 @@ except:
|
|||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='warcprox',
|
name='warcprox',
|
||||||
version='2.4.12',
|
version='2.4.13',
|
||||||
description='WARC writing MITM HTTP/S proxy',
|
description='WARC writing MITM HTTP/S proxy',
|
||||||
url='https://github.com/internetarchive/warcprox',
|
url='https://github.com/internetarchive/warcprox',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
@ -76,9 +76,13 @@ import urlcanon
|
|||||||
import time
|
import time
|
||||||
import collections
|
import collections
|
||||||
import cProfile
|
import cProfile
|
||||||
|
from urllib3 import PoolManager
|
||||||
from urllib3.util import is_connection_dropped
|
from urllib3.util import is_connection_dropped
|
||||||
from urllib3.exceptions import TimeoutError, HTTPError
|
from urllib3.exceptions import TimeoutError, HTTPError
|
||||||
import doublethink
|
import doublethink
|
||||||
|
from cachetools import TTLCache
|
||||||
|
from threading import RLock
|
||||||
|
from certauth.certauth import CertificateAuthority
|
||||||
|
|
||||||
class ProxyingRecorder(object):
|
class ProxyingRecorder(object):
|
||||||
"""
|
"""
|
||||||
@ -223,9 +227,12 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
|||||||
and records the bytes in transit as it proxies them.
|
and records the bytes in transit as it proxies them.
|
||||||
'''
|
'''
|
||||||
logger = logging.getLogger("warcprox.mitmproxy.MitmProxyHandler")
|
logger = logging.getLogger("warcprox.mitmproxy.MitmProxyHandler")
|
||||||
|
|
||||||
_socket_timeout = 60
|
_socket_timeout = 60
|
||||||
_max_resource_size = None
|
_max_resource_size = None
|
||||||
_tmp_file_max_memory_size = 512 * 1024
|
_tmp_file_max_memory_size = 512 * 1024
|
||||||
|
onion_tor_socks_proxy_host = None
|
||||||
|
onion_tor_socks_proxy_port = None
|
||||||
|
|
||||||
def __init__(self, request, client_address, server):
|
def __init__(self, request, client_address, server):
|
||||||
threading.current_thread().name = 'MitmProxyHandler(tid={},started={},client={}:{})'.format(warcprox.gettid(), datetime.datetime.utcnow().isoformat(), client_address[0], client_address[1])
|
threading.current_thread().name = 'MitmProxyHandler(tid={},started={},client={}:{})'.format(warcprox.gettid(), datetime.datetime.utcnow().isoformat(), client_address[0], client_address[1])
|
||||||
@ -737,3 +744,52 @@ class PooledMitmProxy(PooledMixIn, MitmProxy):
|
|||||||
for sock in self.remote_server_socks:
|
for sock in self.remote_server_socks:
|
||||||
self.shutdown_request(sock)
|
self.shutdown_request(sock)
|
||||||
|
|
||||||
|
class SingleThreadedMitmProxy(http_server.HTTPServer):
    '''
    HTTP server that holds the state shared by all MITM proxy handlers.

    On construction it sets up, in this order:

    - ``self.options``: the options object used for configuration
    - ``self.bad_hostnames_ports`` / ``self.bad_hostnames_ports_lock``: a
      TTL cache (1024 entries, 60 second ttl) and the lock guarding it
    - ``self.remote_connection_pool``: a urllib3 ``PoolManager``
    - class-level settings on ``MitmProxyHandlerClass`` (tor socks proxy,
      socket timeout, max resource size, tmp file max memory size)
    - ``self.digest_algorithm`` and ``self.ca``
    - the listening socket, bound and activated immediately
    '''
    # NOTE(review): the logger name says "warcprox.warcproxy" although this
    # class was moved into mitmproxy.py; left unchanged because logging
    # configuration may refer to the existing name -- confirm before renaming
    logger = logging.getLogger('warcprox.warcproxy.SingleThreadedMitmProxy')

    def __init__(
            self, MitmProxyHandlerClass=MitmProxyHandler,
            options=None):
        '''
        Args:
            MitmProxyHandlerClass: request handler class; its class
                attributes are overwritten here from `options`, so the
                settings apply to every server using the same class
            options: configuration object (attribute access, falsy when a
                setting is absent); a fresh `warcprox.Options()` is created
                when omitted or None
        '''
        # Build the default per call: a default of `warcprox.Options()` in
        # the signature is evaluated once at definition time and would be
        # shared (and mutated) across every instance created without options.
        if options is None:
            options = warcprox.Options()
        self.options = options

        # TTLCache is not thread-safe. Access to the shared cache from multiple
        # threads must be properly synchronized with an RLock according to ref:
        # https://cachetools.readthedocs.io/en/latest/
        self.bad_hostnames_ports = TTLCache(maxsize=1024, ttl=60)
        self.bad_hostnames_ports_lock = RLock()

        # never fewer than 200 pools; grows with max_threads above 1200
        self.remote_connection_pool = PoolManager(
            num_pools=max(round(options.max_threads / 6), 200) if options.max_threads else 200)

        if options.onion_tor_socks_proxy:
            try:
                host, port = options.onion_tor_socks_proxy.split(':')
                MitmProxyHandlerClass.onion_tor_socks_proxy_host = host
                MitmProxyHandlerClass.onion_tor_socks_proxy_port = int(port)
            except ValueError:
                # not exactly "host:port" or port is not an integer: treat the
                # whole value as the host and leave the port unset
                MitmProxyHandlerClass.onion_tor_socks_proxy_host = options.onion_tor_socks_proxy
                MitmProxyHandlerClass.onion_tor_socks_proxy_port = None

        # only override the handler class defaults for settings that are set
        if options.socket_timeout:
            MitmProxyHandlerClass._socket_timeout = options.socket_timeout
        if options.max_resource_size:
            MitmProxyHandlerClass._max_resource_size = options.max_resource_size
        if options.tmp_file_max_memory_size:
            MitmProxyHandlerClass._tmp_file_max_memory_size = options.tmp_file_max_memory_size

        self.digest_algorithm = options.digest_algorithm or 'sha1'

        # name is cut to 64 characters
        # NOTE(review): presumably the X.509 common name length limit -- confirm
        ca_name = ('Warcprox CA on %s' % socket.gethostname())[:64]
        self.ca = CertificateAuthority(
            ca_file=options.cacert or 'warcprox-ca.pem',
            certs_dir=options.certs_dir or './warcprox-ca',
            ca_name=ca_name)

        # `is not None` rather than truthiness so that an explicit port 0 is
        # passed through instead of being replaced by 8000
        server_address = (
            options.address or 'localhost',
            options.port if options.port is not None else 8000)

        # explicit base-class call (not super()) as in the rest of this file,
        # which still carries python 2 compatibility concerns
        http_server.HTTPServer.__init__(
            self, server_address, MitmProxyHandlerClass,
            bind_and_activate=True)
|
||||||
|
|
||||||
|
@ -38,18 +38,14 @@ import logging
|
|||||||
import json
|
import json
|
||||||
import socket
|
import socket
|
||||||
from hanzo import warctools
|
from hanzo import warctools
|
||||||
from certauth.certauth import CertificateAuthority
|
|
||||||
import warcprox
|
import warcprox
|
||||||
import datetime
|
import datetime
|
||||||
import urlcanon
|
import urlcanon
|
||||||
import os
|
import os
|
||||||
from urllib3 import PoolManager
|
|
||||||
import tempfile
|
import tempfile
|
||||||
import hashlib
|
import hashlib
|
||||||
import doublethink
|
import doublethink
|
||||||
import re
|
import re
|
||||||
from threading import RLock
|
|
||||||
from cachetools import TTLCache
|
|
||||||
|
|
||||||
class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
|
class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
|
||||||
'''
|
'''
|
||||||
@ -423,56 +419,20 @@ class RecordedUrl:
|
|||||||
# inherit from object so that multiple inheritance from this class works
|
# inherit from object so that multiple inheritance from this class works
|
||||||
# properly in python 2
|
# properly in python 2
|
||||||
# http://stackoverflow.com/questions/1713038/super-fails-with-error-typeerror-argument-1-must-be-type-not-classobj#18392639
|
# http://stackoverflow.com/questions/1713038/super-fails-with-error-typeerror-argument-1-must-be-type-not-classobj#18392639
|
||||||
class SingleThreadedWarcProxy(http_server.HTTPServer, object):
|
class SingleThreadedWarcProxy(warcprox.mitmproxy.SingleThreadedMitmProxy):
|
||||||
logger = logging.getLogger("warcprox.warcproxy.WarcProxy")
|
logger = logging.getLogger("warcprox.warcproxy.WarcProxy")
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self, stats_db=None, status_callback=None,
|
self, stats_db=None, status_callback=None,
|
||||||
options=warcprox.Options()):
|
options=warcprox.Options()):
|
||||||
self.start_time = doublethink.utcnow()
|
self.start_time = doublethink.utcnow()
|
||||||
|
|
||||||
|
warcprox.mitmproxy.SingleThreadedMitmProxy.__init__(
|
||||||
|
self, WarcProxyHandler, options)
|
||||||
|
|
||||||
self.status_callback = status_callback
|
self.status_callback = status_callback
|
||||||
self.stats_db = stats_db
|
self.stats_db = stats_db
|
||||||
self.options = options
|
|
||||||
# TTLCache is not thread-safe. Access to the shared cache from multiple
|
|
||||||
# threads must be properly synchronized with an RLock according to ref:
|
|
||||||
# https://cachetools.readthedocs.io/en/latest/
|
|
||||||
self.bad_hostnames_ports = TTLCache(maxsize=1024, ttl=60)
|
|
||||||
self.bad_hostnames_ports_lock = RLock()
|
|
||||||
self.remote_connection_pool = PoolManager(
|
|
||||||
num_pools=max(round(options.max_threads / 6), 200) if options.max_threads else 200)
|
|
||||||
server_address = (
|
|
||||||
options.address or 'localhost',
|
|
||||||
options.port if options.port is not None else 8000)
|
|
||||||
|
|
||||||
if options.onion_tor_socks_proxy:
|
|
||||||
try:
|
|
||||||
host, port = options.onion_tor_socks_proxy.split(':')
|
|
||||||
WarcProxyHandler.onion_tor_socks_proxy_host = host
|
|
||||||
WarcProxyHandler.onion_tor_socks_proxy_port = int(port)
|
|
||||||
except ValueError:
|
|
||||||
WarcProxyHandler.onion_tor_socks_proxy_host = options.onion_tor_socks_proxy
|
|
||||||
WarcProxyHandler.onion_tor_socks_proxy_port = None
|
|
||||||
|
|
||||||
if options.socket_timeout:
|
|
||||||
WarcProxyHandler._socket_timeout = options.socket_timeout
|
|
||||||
if options.max_resource_size:
|
|
||||||
WarcProxyHandler._max_resource_size = options.max_resource_size
|
|
||||||
if options.tmp_file_max_memory_size:
|
|
||||||
WarcProxyHandler._tmp_file_max_memory_size = options.tmp_file_max_memory_size
|
|
||||||
|
|
||||||
http_server.HTTPServer.__init__(
|
|
||||||
self, server_address, WarcProxyHandler, bind_and_activate=True)
|
|
||||||
|
|
||||||
self.digest_algorithm = options.digest_algorithm or 'sha1'
|
|
||||||
|
|
||||||
ca_name = ('Warcprox CA on %s' % socket.gethostname())[:64]
|
|
||||||
self.ca = CertificateAuthority(
|
|
||||||
ca_file=options.cacert or 'warcprox-ca.pem',
|
|
||||||
certs_dir=options.certs_dir or './warcprox-ca',
|
|
||||||
ca_name=ca_name)
|
|
||||||
|
|
||||||
self.recorded_url_q = queue.Queue(maxsize=options.queue_size or 1000)
|
self.recorded_url_q = queue.Queue(maxsize=options.queue_size or 1000)
|
||||||
|
|
||||||
self.running_stats = warcprox.stats.RunningStats()
|
self.running_stats = warcprox.stats.RunningStats()
|
||||||
|
|
||||||
def status(self):
|
def status(self):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user