From 9cce03dc16e326ac633839e41a84a931fd7e8a37 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 14 Nov 2017 14:40:50 -0800 Subject: [PATCH 1/9] hacky way to fix problem of benchmarks arguments getting stale --- benchmarks/run-benchmarks.py | 94 +++++------------------------------- setup.py | 2 +- 2 files changed, 12 insertions(+), 84 deletions(-) diff --git a/benchmarks/run-benchmarks.py b/benchmarks/run-benchmarks.py index a05db59..8267806 100755 --- a/benchmarks/run-benchmarks.py +++ b/benchmarks/run-benchmarks.py @@ -160,90 +160,9 @@ with different options. Benchmarking code uses asyncio/aiohttp and requires python 3.5 or later. ''' - arg_parser = argparse.ArgumentParser( - prog=prog, description=desc, - formatter_class=warcprox.main.BetterArgumentDefaultsHelpFormatter) + arg_parser = warcprox.main._build_arg_parser() + arg_parser.description = desc - ### these warcprox options are not configurable for the benchmarks - # arg_parser.add_argument('-p', '--port', dest='port', default='8000', - # type=int, help='port to listen on') - # arg_parser.add_argument('-b', '--address', dest='address', - # default='localhost', help='address to listen on') - # arg_parser.add_argument('-c', '--cacert', dest='cacert', - # default='./{0}-warcprox-ca.pem'.format(socket.gethostname()), - # help='CA certificate file; if file does not exist, it will be created') - # arg_parser.add_argument('--certs-dir', dest='certs_dir', - # default='./{0}-warcprox-ca'.format(socket.gethostname()), - # help='where to store and load generated certificates') - # arg_parser.add_argument('-d', '--dir', dest='directory', - # default='./warcs', help='where to write warcs') - - arg_parser.add_argument('-z', '--gzip', dest='gzip', action='store_true', - help='write gzip-compressed warc records') - arg_parser.add_argument('-n', '--prefix', dest='prefix', - default='WARCPROX', help='WARC filename prefix') - arg_parser.add_argument( - '-s', '--size', dest='rollover_size', default=1000*1000*1000, - type=int, help='WARC file rollover size threshold in bytes') - arg_parser.add_argument('--rollover-idle-time', - dest='rollover_idle_time', default=None, type=int, - help="WARC file rollover idle time threshold in seconds (so that Friday's last open WARC doesn't sit there all weekend waiting for more data)") - try: - hash_algos = hashlib.algorithms_guaranteed - except AttributeError: - hash_algos = hashlib.algorithms - arg_parser.add_argument('-g', '--digest-algorithm', dest='digest_algorithm', - default='sha1', help='digest algorithm, one of {}'.format(', '.join(hash_algos))) - arg_parser.add_argument('--base32', dest='base32', action='store_true', - default=False, help='write digests in Base32 instead of hex') - arg_parser.add_argument('--method-filter', metavar='HTTP_METHOD', - action='append', help='only record requests with the given http method(s) (can be used more than once)') - arg_parser.add_argument('--stats-db-file', dest='stats_db_file', - default='./warcprox.sqlite', help='persistent statistics database file; empty string or /dev/null disables statistics tracking') - arg_parser.add_argument('-P', '--playback-port', dest='playback_port', - type=int, default=None, help='port to listen on for instant playback') - arg_parser.add_argument('--playback-index-db-file', dest='playback_index_db_file', - default='./warcprox-playback-index.db', - help='playback index database file (only used if --playback-port is specified)') - group = arg_parser.add_mutually_exclusive_group() - group.add_argument('-j', '--dedup-db-file', dest='dedup_db_file', - default='./warcprox.sqlite', help='persistent deduplication database file; empty string or /dev/null disables deduplication') - group.add_argument('--rethinkdb-servers', dest='rethinkdb_servers', - help='rethinkdb servers, used for dedup and stats if specified; e.g. db0.foo.org,db0.foo.org:38015,db1.foo.org') - arg_parser.add_argument('--rethinkdb-db', dest='rethinkdb_db', default='warcprox', - help='rethinkdb database name (ignored unless --rethinkdb-servers is specified)') - arg_parser.add_argument('--rethinkdb-big-table', - dest='rethinkdb_big_table', action='store_true', default=False, - help='use a big rethinkdb table called "captures", instead of a small table called "dedup"; table is suitable for use as index for playback (ignored unless --rethinkdb-servers is specified)') - arg_parser.add_argument( - '--rethinkdb-big-table-name', dest='rethinkdb_big_table_name', - default='captures', help=argparse.SUPPRESS) - arg_parser.add_argument('--queue-size', dest='queue_size', type=int, - default=500, help=argparse.SUPPRESS) - arg_parser.add_argument('--max-threads', dest='max_threads', type=int, - help=argparse.SUPPRESS) - arg_parser.add_argument('--profile', action='store_true', default=False, - help=argparse.SUPPRESS) - arg_parser.add_argument( - '--onion-tor-socks-proxy', dest='onion_tor_socks_proxy', - default=None, help=( - 'host:port of tor socks proxy, used only to connect to ' - '.onion sites')) - arg_parser.add_argument( - '--plugin', metavar='PLUGIN_CLASS', dest='plugins', - action='append', help=( - 'Qualified name of plugin class, e.g. "mypkg.mymod.MyClass". ' - 'May be used multiple times to register multiple plugins. ' - 'Plugin classes are loaded from the regular python module ' - 'search path. They will be instantiated with no arguments and ' - 'must have a method `notify(self, recorded_url, records)` ' - 'which will be called for each url, after warc records have ' - 'been written.')) - arg_parser.add_argument('--version', action='version', - version="warcprox {}".format(warcprox.__version__)) - arg_parser.add_argument('-v', '--verbose', dest='verbose', action='store_true') - arg_parser.add_argument('--trace', dest='trace', action='store_true') - arg_parser.add_argument('-q', '--quiet', dest='quiet', action='store_true') arg_parser.add_argument( '--requests', dest='requests', type=int, default=200, help='number of urls to fetch') @@ -253,6 +172,15 @@ Benchmarking code uses asyncio/aiohttp and requires python 3.5 or later. arg_parser.add_argument( '--skip-baseline', dest='skip_baseline', action='store_true', help='skip the baseline bechmarks') + + # filter out options that are not configurable for the benchmarks + filtered = [] + for action in arg_parser._action_groups[1]._group_actions: + if action.dest not in ( + 'port', 'address', 'cacert', 'certs_dir', 'directory'): + filtered.append(action) + arg_parser._action_groups[1]._group_actions = filtered + return arg_parser if __name__ == '__main__': diff --git a/setup.py b/setup.py index ebeb213..9e5d580 100755 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ except: setuptools.setup( name='warcprox', - version='2.2.1b2.dev115', + version='2.2.1b2.dev116', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', From c13fd9a40ea2423d66ad8d0f8e4263cd593d24eb Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 14 Nov 2017 16:35:25 -0800 Subject: [PATCH 2/9] have --profile profile proxy threads as well as warc writer threads --- setup.py | 2 +- warcprox/__init__.py | 13 ++++++++++++- warcprox/controller.py | 23 +++++++++++++++++++++++ warcprox/main.py | 7 ++++++- warcprox/mitmproxy.py | 25 ++++++++++++++++++++++--- warcprox/warcproxy.py | 3 ++- warcprox/writerthread.py | 17 +++++++++++++++-- 7 files changed, 81 insertions(+), 9 deletions(-) diff --git a/setup.py b/setup.py index 9e5d580..90a7719 100755 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ except: setuptools.setup( name='warcprox', - version='2.2.1b2.dev116', + version='2.2.1b2.dev117', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/warcprox/__init__.py b/warcprox/__init__.py index e50a415..6ac4fff 100644 --- a/warcprox/__init__.py +++ b/warcprox/__init__.py @@ -92,7 +92,7 @@ class RequestBlockedByRule(Exception): def __str__(self): return "%s: %s" % (self.__class__.__name__, self.msg) -# monkey-patch log level TRACE +# monkey-patch log levels TRACE and NOTICE TRACE = 5 import logging def _logging_trace(msg, *args, **kwargs): @@ -104,6 +104,17 @@ logging.trace = _logging_trace logging.Logger.trace = _logger_trace logging.addLevelName(TRACE, 'TRACE') +NOTICE = (logging.INFO + logging.WARN) // 2 +import logging +def _logging_notice(msg, *args, **kwargs): + logging.root.notice(msg, *args, **kwargs) +def _logger_notice(self, msg, *args, **kwargs): + if self.isEnabledFor(NOTICE): + self._log(NOTICE, msg, args, **kwargs) +logging.notice = _logging_notice +logging.Logger.notice = _logger_notice +logging.addLevelName(NOTICE, 'NOTICE') + import warcprox.controller as controller import warcprox.playback as playback import warcprox.dedup as dedup diff --git a/warcprox/controller.py b/warcprox/controller.py index 42f71de..217f519 100644 --- a/warcprox/controller.py +++ b/warcprox/controller.py @@ -254,6 +254,9 @@ class WarcproxController(object): # last_mem_dbg = datetime.datetime.utcnow() time.sleep(0.5) + + if self.options.profile: + self._dump_profiling() except: self.logger.critical( "shutting down in response to fatal exception", @@ -262,3 +265,23 @@ class WarcproxController(object): finally: self.shutdown() + def _dump_profiling(self): + import pstats + import tempfile + import os + import io + with tempfile.TemporaryDirectory() as tmpdir: + files = [] + for th_id, profiler in self.proxy.profilers.items(): + file = os.path.join(tmpdir, '%s.dat' % th_id) + profiler.dump_stats(file) + files.append(file) + + buf = io.StringIO() + stats = pstats.Stats(*files, stream=buf) + stats.sort_stats('cumulative') + stats.print_stats(0.1) + self.logger.notice( + 'aggregate performance profile of %s proxy threads:\n%s', + len(files), buf.getvalue()) + diff --git a/warcprox/main.py b/warcprox/main.py index 1e6aaf8..462da8e 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -126,6 +126,9 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])): help=argparse.SUPPRESS) arg_parser.add_argument('--profile', action='store_true', default=False, help=argparse.SUPPRESS) + arg_parser.add_argument( + '--writer-threads', dest='writer_threads', type=int, default=None, + help=argparse.SUPPRESS) arg_parser.add_argument( '--onion-tor-socks-proxy', dest='onion_tor_socks_proxy', default=None, help=( @@ -257,12 +260,14 @@ def init_controller(args): # number of warc writer threads = sqrt(proxy.max_threads) # I came up with this out of thin air because it strikes me as reasonable # 1=>1 2=>1 5=>2 10=>3 50=>7 100=>10 200=>14 500=>22 1000=>32 2000=>45 + num_writer_threads = args.writer_threads or int(proxy.max_threads ** 0.5) + logging.debug('initializing %d warc writer threads', num_writer_threads) warc_writer_threads = [ warcprox.writerthread.WarcWriterThread( name='WarcWriterThread%03d' % i, recorded_url_q=recorded_url_q, writer_pool=writer_pool, dedup_db=dedup_db, listeners=listeners, options=options) - for i in range(int(proxy.max_threads ** 0.5))] + for i in range(num_writer_threads)] if args.rethinkdb_servers: svcreg = doublethink.ServiceRegistry(rr) diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index 2c34bcd..d1ac12e 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -541,14 +541,33 @@ class PooledMitmProxy(PooledMixIn, MitmProxy): # This value is passed as the "backlog" argument to listen(2). The default # value from socketserver.TCPServer is 5. Increasing this value is part of # the solution to client connections being closed suddenly and this message - # appearing in kernel log on linux: "TCP: request_sock_TCP: # Possible SYN - # flooding on port 8000. Sending cookies. Check SNMP # counters." I think + # appearing in kernel log on linux: "TCP: request_sock_TCP: Possible SYN + # flooding on port 8000. Sending cookies. Check SNMP counters." I think # this comes into play because we don't always accept(2) immediately (see # PooledMixIn.get_request()). # See also https://blog.dubbelboer.com/2012/04/09/syn-cookies.html request_queue_size = 4096 - def process_request_thread(self, request, client_address): + def __init__(self, max_threads, options=warcprox.Options()): + PooledMixIn.__init__(self, max_threads) + + if options.profile: + self.profilers = {} + self.process_request_thread = self._profile_process_request_thread + else: + self.profilers + self.process_request_thread = self._process_request_thread + + def _profile_process_request_thread(self, request, client_address): + if not threading.current_thread().ident in self.profilers: + import cProfile + self.profilers[threading.current_thread().ident] = cProfile.Profile() + profiler = self.profilers[threading.current_thread().ident] + profiler.enable() + self._process_request_thread(request, client_address) + profiler.disable() + + def _process_request_thread(self, request, client_address): ''' This an almost verbatim copy/paste of socketserver.ThreadingMixIn.process_request_thread. diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index f1de01e..3331221 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -425,7 +425,8 @@ class WarcProxy(SingleThreadedWarcProxy, warcprox.mitmproxy.PooledMitmProxy): self.logger.info( "max_threads=%s set by command line option", options.max_threads) - warcprox.mitmproxy.PooledMitmProxy.__init__(self, options.max_threads) + warcprox.mitmproxy.PooledMitmProxy.__init__( + self, options.max_threads, options) SingleThreadedWarcProxy.__init__( self, ca, recorded_url_q, stats_db, options) diff --git a/warcprox/writerthread.py b/warcprox/writerthread.py index a8a6ef7..1041a30 100644 --- a/warcprox/writerthread.py +++ b/warcprox/writerthread.py @@ -33,7 +33,6 @@ import time from datetime import datetime from hanzo import warctools import warcprox -import cProfile import sys class WarcWriterThread(threading.Thread): @@ -59,7 +58,21 @@ class WarcWriterThread(threading.Thread): def run(self): if self.options.profile: - cProfile.runctx('self._run()', globals(), locals(), sort='cumulative') + import cProfile + import pstats + import io + profiler = cProfile.Profile() + + profiler.enable() + self._run() + profiler.disable() + + buf = io.StringIO() + stats = pstats.Stats(profiler, stream=buf) + stats.sort_stats('cumulative') + stats.print_stats(0.1) + self.logger.notice( + '%s performance profile:\n%s', self, buf.getvalue()) else: self._run() From 5c2c21de0732a9a6ef16064fe6409d4c4e81e8e8 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 14 Nov 2017 16:44:31 -0800 Subject: [PATCH 3/9] aggregate warc writer thread profiles much like we do for proxy threads --- setup.py | 2 +- warcprox/controller.py | 24 +++++++++++++++++++----- warcprox/writerthread.py | 18 +++++------------- 3 files changed, 25 insertions(+), 19 deletions(-) diff --git a/setup.py b/setup.py index 90a7719..912d7ed 100755 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ except: setuptools.setup( name='warcprox', - version='2.2.1b2.dev117', + version='2.2.1b2.dev118', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/warcprox/controller.py b/warcprox/controller.py index 217f519..fbf9d8a 100644 --- a/warcprox/controller.py +++ b/warcprox/controller.py @@ -44,7 +44,7 @@ class WarcproxController(object): Create warcprox controller. If supplied, `proxy` should be an instance of WarcProxy, and - `warc_writer_threads` should be an list of WarcWriterThread instances. + `warc_writer_threads` should be a list of WarcWriterThread instances. If not supplied, they are created with default values. If supplied, playback_proxy should be an instance of PlaybackProxy. If @@ -266,11 +266,9 @@ class WarcproxController(object): self.shutdown() def _dump_profiling(self): - import pstats - import tempfile - import os - import io + import pstats, tempfile, os, io with tempfile.TemporaryDirectory() as tmpdir: + # proxy threads files = [] for th_id, profiler in self.proxy.profilers.items(): file = os.path.join(tmpdir, '%s.dat' % th_id) @@ -285,3 +283,19 @@ class WarcproxController(object): 'aggregate performance profile of %s proxy threads:\n%s', len(files), buf.getvalue()) + + # warc writer threads + files = [] + for wwt in self.warc_writer_threads: + file = os.path.join(tmpdir, '%s.dat' % th_id) + profiler.dump_stats(file) + files.append(file) + + buf = io.StringIO() + stats = pstats.Stats(*files, stream=buf) + stats.sort_stats('cumulative') + stats.print_stats(0.1) + self.logger.notice( + 'aggregate performance profile of %s warc writer threads:\n%s', + len(self.warc_writer_threads), buf.getvalue()) + diff --git a/warcprox/writerthread.py b/warcprox/writerthread.py index 1041a30..8138eec 100644 --- a/warcprox/writerthread.py +++ b/warcprox/writerthread.py @@ -56,23 +56,15 @@ class WarcWriterThread(threading.Thread): self.idle = None self.method_filter = set(method.upper() for method in self.options.method_filter or []) - def run(self): if self.options.profile: import cProfile - import pstats - import io - profiler = cProfile.Profile() + self.profiler = cProfile.Profile() - profiler.enable() + def run(self): + if self.options.profile: + self.profiler.enable() self._run() - profiler.disable() - - buf = io.StringIO() - stats = pstats.Stats(profiler, stream=buf) - stats.sort_stats('cumulative') - stats.print_stats(0.1) - self.logger.notice( - '%s performance profile:\n%s', self, buf.getvalue()) + self.profiler.disable() else: self._run() From fdfc84cea02716a03afb8463e759d45ac6eb2fab Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 14 Nov 2017 17:14:21 -0800 Subject: [PATCH 4/9] fix mistakes in warc write thread profile aggregation --- warcprox/controller.py | 5 ++--- warcprox/writerthread.py | 4 +--- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/warcprox/controller.py b/warcprox/controller.py index fbf9d8a..11291c4 100644 --- a/warcprox/controller.py +++ b/warcprox/controller.py @@ -283,12 +283,11 @@ class WarcproxController(object): 'aggregate performance profile of %s proxy threads:\n%s', len(files), buf.getvalue()) - # warc writer threads files = [] for wwt in self.warc_writer_threads: - file = os.path.join(tmpdir, '%s.dat' % th_id) - profiler.dump_stats(file) + file = os.path.join(tmpdir, '%s.dat' % wwt.ident) + wwt.profiler.dump_stats(file) files.append(file) buf = io.StringIO() diff --git a/warcprox/writerthread.py b/warcprox/writerthread.py index 8138eec..3f42fc1 100644 --- a/warcprox/writerthread.py +++ b/warcprox/writerthread.py @@ -56,12 +56,10 @@ class WarcWriterThread(threading.Thread): self.idle = None self.method_filter = set(method.upper() for method in self.options.method_filter or []) + def run(self): if self.options.profile: import cProfile self.profiler = cProfile.Profile() - - def run(self): - if self.options.profile: self.profiler.enable() self._run() self.profiler.disable() From 95b2b86487afa0674ba23690d5f0efe56c417eb7 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 15 Nov 2017 23:41:44 +0000 Subject: [PATCH 5/9] better error message for bad WARCPROX_WRITE_RECORD request --- setup.py | 2 +- warcprox/warcproxy.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 912d7ed..bbaa5d6 100755 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ except: setuptools.setup( name='warcprox', - version='2.2.1b2.dev118', + version='2.2.1b2.dev119', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 3331221..12aac74 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -311,7 +311,10 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): self.server.recorded_url_q.put(rec_custom) self.send_response(204, 'OK') else: - self.send_error(400, 'Bad request') + self.send_error(400, message='Bad request', explain=( + 'Bad request. WARC-Type, Content-Length, and Content-Type ' + 'request headers required for WARCPROX_WRITE_RECORD ' + 'request.')) self.end_headers() except: From b28f9b9fb783223f2f1d618679b21f16703d2107 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 22 Nov 2017 11:08:34 -0800 Subject: [PATCH 6/9] fix oops --- setup.py | 2 +- warcprox/mitmproxy.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index bbaa5d6..12ea7f6 100755 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ except: setuptools.setup( name='warcprox', - version='2.2.1b2.dev119', + version='2.2.1b2.dev120', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index d1ac12e..93a80a2 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -550,12 +550,11 @@ class PooledMitmProxy(PooledMixIn, MitmProxy): def __init__(self, max_threads, options=warcprox.Options()): PooledMixIn.__init__(self, max_threads) + self.profilers = {} if options.profile: - self.profilers = {} self.process_request_thread = self._profile_process_request_thread else: - self.profilers self.process_request_thread = self._process_request_thread def _profile_process_request_thread(self, request, client_address): From 627ef5667b2e29c103f4092bebd877cac6ba33e6 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 22 Nov 2017 12:49:46 -0800 Subject: [PATCH 7/9] failing test for correct handling of "http.client.RemoteDisconnected: Remote end closed connection without response" from remote server --- setup.py | 2 +- tests/test_warcprox.py | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 12ea7f6..0327f2c 100755 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ except: setuptools.setup( name='warcprox', - version='2.2.1b2.dev120', + version='2.2.1b2.dev121', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 97e4351..9e46209 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -237,6 +237,9 @@ class _TestHttpRequestHandler(http_server.BaseHTTPRequestHandler): raise Exception('bad path') headers = b'HTTP/1.1 200 OK\r\n' + actual_headers + b'\r\n' logging.info('headers=%r payload=%r', headers, payload) + elif self.path == '/empty-response': + headers = b'' + payload = b'' else: payload = b'404 Not Found\n' headers = (b'HTTP/1.1 404 Not Found\r\n' @@ -1643,6 +1646,18 @@ def test_long_warcprox_meta( with pytest.raises(StopIteration): next(rec_iter) +def test_empty_response( + warcprox_, http_daemon, https_daemon, archiving_proxies, + playback_proxies): + + url = 'http://localhost:%s/empty-response' % http_daemon.server_port + response = requests.get(url, proxies=archiving_proxies, verify=False) + assert response.status_code == 500 + + url = 'https://localhost:%s/empty-response' % http_daemon.server_port + response = requests.get(url, proxies=archiving_proxies, verify=False) + assert response.status_code == 500 + def test_payload_digest(warcprox_, http_daemon): ''' Tests that digest is of RFC2616 "entity body" From 5be289730fc280c9827dab185931d6848f718ce8 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 22 Nov 2017 13:11:26 -0800 Subject: [PATCH 8/9] fix failing test, and change response code from 500 to more appropriate 502 --- setup.py | 2 +- tests/test_warcprox.py | 8 +++++--- warcprox/mitmproxy.py | 13 ++++++------- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/setup.py b/setup.py index 0327f2c..49003f7 100755 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ except: setuptools.setup( name='warcprox', - version='2.2.1b2.dev121', + version='2.2.1b2.dev122', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 9e46209..5f426fc 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1652,11 +1652,13 @@ def test_empty_response( url = 'http://localhost:%s/empty-response' % http_daemon.server_port response = requests.get(url, proxies=archiving_proxies, verify=False) - assert response.status_code == 500 + assert response.status_code == 502 + assert response.reason == 'Remote end closed connection without response' - url = 'https://localhost:%s/empty-response' % http_daemon.server_port + url = 'https://localhost:%s/empty-response' % https_daemon.server_port response = requests.get(url, proxies=archiving_proxies, verify=False) - assert response.status_code == 500 + assert response.status_code == 502 + assert response.reason == 'Remote end closed connection without response' def test_payload_digest(warcprox_, http_daemon): ''' diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index 93a80a2..8d950fa 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -363,9 +363,12 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): try: return self._proxy_request() - except: - self.logger.error("exception proxying request", exc_info=True) - raise + except Exception as e: + self.logger.error( + 'error from remote server(?) %r: %r', + self.requestline, e, exc_info=True) + self.send_error(502, str(e)) + return def _proxy_request(self, extra_response_headers={}): ''' @@ -425,10 +428,6 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): buf = prox_rec_res.read(65536) self.log_request(prox_rec_res.status, prox_rec_res.recorder.len) - except Exception as e: - self.logger.error( - "%r proxying %s %s", e, self.command, self.url, - exc_info=True) finally: # Let's close off the remote end if prox_rec_res: From 330635c0a82e93a026ea24957fe7b766614f5bda Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 22 Nov 2017 13:55:44 -0800 Subject: [PATCH 9/9] fix test in py<=3.4 --- setup.py | 2 +- tests/test_warcprox.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 49003f7..d81ac6a 100755 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ except: setuptools.setup( name='warcprox', - version='2.2.1b2.dev122', + version='2.2.1b2.dev123', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 5f426fc..5afd906 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1653,12 +1653,12 @@ def test_empty_response( url = 'http://localhost:%s/empty-response' % http_daemon.server_port response = requests.get(url, proxies=archiving_proxies, verify=False) assert response.status_code == 502 - assert response.reason == 'Remote end closed connection without response' + # this is the reason in python >= 3.5 but not in 3.4 and 2.7 + # assert response.reason == 'Remote end closed connection without response' url = 'https://localhost:%s/empty-response' % https_daemon.server_port response = requests.get(url, proxies=archiving_proxies, verify=False) assert response.status_code == 502 - assert response.reason == 'Remote end closed connection without response' def test_payload_digest(warcprox_, http_daemon): '''