Reset to the Jul 1 updates (reverts later changes, rolling the version back to 2.4.29)

This commit is contained in:
Barbara Miller 2022-08-03 19:58:54 -07:00
parent 20789e4edb
commit 8f10fce93a
5 changed files with 20 additions and 33 deletions

View File

@ -2,7 +2,7 @@
'''
setup.py - setuptools installation configuration for warcprox
Copyright (C) 2013-2022 Internet Archive
Copyright (C) 2013-2021 Internet Archive
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
@ -44,7 +44,7 @@ except:
setuptools.setup(
name='warcprox',
version='2.4.31-qa-2',
version='2.4.29',
description='WARC writing MITM HTTP/S proxy',
url='https://github.com/internetarchive/warcprox',
author='Noah Levitt',

View File

@ -1730,7 +1730,6 @@ def test_load_plugin():
assert isinstance(
controller._postfetch_chain[-4].listener,
warcprox.stats.RunningStats)
# MyEarlyPlugin
assert isinstance(
controller._postfetch_chain[0],
EarlyPlugin)

View File

@ -166,9 +166,8 @@ class WarcproxController(object):
with processor.inq.mutex:
l = list(processor.inq.queue)
for recorded_url in l:
if recorded_url.timestamp:
if not earliest or (recorded_url.timestamp < earliest):
earliest = recorded_url.timestamp
if earliest is None or recorded_url.timestamp < earliest:
earliest = recorded_url.timestamp
return earliest
def postfetch_status(self):

View File

@ -596,18 +596,15 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
'bytes exceeded for URL %s',
self._max_resource_size, self.url)
break
elif time.time() - start > 3 * 60 * 60:
if not 'content-length' in self.headers:
prox_rec_res.truncated = b'time'
self._remote_server_conn.sock.shutdown(socket.SHUT_RDWR)
self._remote_server_conn.sock.close()
self.logger.info(
'reached hard timeout of 3 hours fetching url '
'without content-length: %s', self.url)
break
else:
self.logger.info(
'long-running fetch for URL %s', self.url)
elif (not 'content-length' in self.headers
and time.time() - start > 3 * 60 * 60):
prox_rec_res.truncated = b'time'
self._remote_server_conn.sock.shutdown(socket.SHUT_RDWR)
self._remote_server_conn.sock.close()
self.logger.info(
'reached hard timeout of 3 hours fetching url '
'without content-length: %s', self.url)
break
self.log_request(prox_rec_res.status, prox_rec_res.recorder.len)
# Let's close off the remote end. If remote connection is fine,
@ -675,17 +672,13 @@ class PooledMixIn(socketserver.ThreadingMixIn):
def process_request(self, request, client_address):
self.active_requests[request] = doublethink.utcnow()
try:
future = self.pool.submit(
self.process_request_thread, request, client_address)
future.add_done_callback(
lambda f: self.active_requests.pop(request, None))
if future.done():
# avoid theoretical timing issue, in case process_request_thread
# managed to finish before future.add_done_callback() ran
self.active_requests.pop(request, None)
except RuntimeError as exc:
self.logger.error("Error processing request %s", str(exc))
future = self.pool.submit(
self.process_request_thread, request, client_address)
future.add_done_callback(
lambda f: self.active_requests.pop(request, None))
if future.done():
# avoid theoretical timing issue, in case process_request_thread
# managed to finish before future.add_done_callback() ran
self.active_requests.pop(request, None)
def get_request(self):

View File

@ -46,7 +46,6 @@ import tempfile
import hashlib
import doublethink
import re
import zlib
class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
'''
@ -176,9 +175,6 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
warcprox_meta = json.loads(self.headers['Warcprox-Meta'])
self._security_check(warcprox_meta)
self._enforce_limits(warcprox_meta)
if 'compressed_blocks' in warcprox_meta:
warcprox_meta['blocks'] = json.loads(zlib.decompress(warcprox_meta['compressed_blocks']).decode())
del warcprox_meta['compressed_blocks']
self._enforce_blocks(warcprox_meta)
def _connect_to_remote_server(self):