diff --git a/setup.py b/setup.py index 40ebddb..52af206 100755 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ ''' setup.py - setuptools installation configuration for warcprox -Copyright (C) 2013-2022 Internet Archive +Copyright (C) 2013-2021 Internet Archive This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License @@ -44,7 +44,7 @@ except: setuptools.setup( name='warcprox', - version='2.4.31-qa-2', + version='2.4.29', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 3ca74f2..c115b4f 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1730,7 +1730,6 @@ def test_load_plugin(): assert isinstance( controller._postfetch_chain[-4].listener, warcprox.stats.RunningStats) - # MyEarlyPlugin assert isinstance( controller._postfetch_chain[0], EarlyPlugin) diff --git a/warcprox/controller.py b/warcprox/controller.py index 954cbc1..8d670cb 100644 --- a/warcprox/controller.py +++ b/warcprox/controller.py @@ -166,9 +166,8 @@ class WarcproxController(object): with processor.inq.mutex: l = list(processor.inq.queue) for recorded_url in l: - if recorded_url.timestamp: - if not earliest or (recorded_url.timestamp < earliest): - earliest = recorded_url.timestamp + if earliest is None or recorded_url.timestamp < earliest: + earliest = recorded_url.timestamp return earliest def postfetch_status(self): diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index a55b860..a423a22 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -596,18 +596,15 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): 'bytes exceeded for URL %s', self._max_resource_size, self.url) break - elif time.time() - start > 3 * 60 * 60: - if not 'content-length' in self.headers: - prox_rec_res.truncated = b'time' - self._remote_server_conn.sock.shutdown(socket.SHUT_RDWR) - self._remote_server_conn.sock.close() - self.logger.info( - 'reached hard timeout of 3 hours fetching url ' - 'without content-length: %s', self.url) - break - else: - self.logger.info( - 'long-running fetch for URL %s', self.url) + elif (not 'content-length' in self.headers + and time.time() - start > 3 * 60 * 60): + prox_rec_res.truncated = b'time' + self._remote_server_conn.sock.shutdown(socket.SHUT_RDWR) + self._remote_server_conn.sock.close() + self.logger.info( + 'reached hard timeout of 3 hours fetching url ' + 'without content-length: %s', self.url) + break self.log_request(prox_rec_res.status, prox_rec_res.recorder.len) # Let's close off the remote end. If remote connection is fine, @@ -675,17 +672,13 @@ class PooledMixIn(socketserver.ThreadingMixIn): def process_request(self, request, client_address): self.active_requests[request] = doublethink.utcnow() - try: - future = self.pool.submit( - self.process_request_thread, request, client_address) - future.add_done_callback( - lambda f: self.active_requests.pop(request, None)) - if future.done(): - # avoid theoretical timing issue, in case process_request_thread - # managed to finish before future.add_done_callback() ran - self.active_requests.pop(request, None) - except RuntimeError as exc: - self.logger.error("Error processing request %s", str(exc)) + future = self.pool.submit( + self.process_request_thread, request, client_address) + future.add_done_callback( + lambda f: self.active_requests.pop(request, None)) + if future.done(): + # avoid theoretical timing issue, in case process_request_thread + # managed to finish before future.add_done_callback() ran self.active_requests.pop(request, None) def get_request(self): diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 42996e5..05eb8b7 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -46,7 +46,6 @@ import tempfile import hashlib import doublethink import re -import zlib class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): ''' @@ -176,9 +175,6 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): warcprox_meta = json.loads(self.headers['Warcprox-Meta']) self._security_check(warcprox_meta) self._enforce_limits(warcprox_meta) - if 'compressed_blocks' in warcprox_meta: - warcprox_meta['blocks'] = json.loads(zlib.decompress(warcprox_meta['compressed_blocks']).decode()) - del warcprox_meta['compressed_blocks'] self._enforce_blocks(warcprox_meta) def _connect_to_remote_server(self):