resetting to Jul 1 updates

This commit is contained in:
Barbara Miller 2022-08-03 19:58:54 -07:00
parent 20789e4edb
commit 8f10fce93a
5 changed files with 20 additions and 33 deletions

View File

@@ -2,7 +2,7 @@
''' '''
setup.py - setuptools installation configuration for warcprox setup.py - setuptools installation configuration for warcprox
Copyright (C) 2013-2022 Internet Archive Copyright (C) 2013-2021 Internet Archive
This program is free software; you can redistribute it and/or This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License modify it under the terms of the GNU General Public License
@@ -44,7 +44,7 @@ except:
setuptools.setup( setuptools.setup(
name='warcprox', name='warcprox',
version='2.4.31-qa-2', version='2.4.29',
description='WARC writing MITM HTTP/S proxy', description='WARC writing MITM HTTP/S proxy',
url='https://github.com/internetarchive/warcprox', url='https://github.com/internetarchive/warcprox',
author='Noah Levitt', author='Noah Levitt',

View File

@@ -1730,7 +1730,6 @@ def test_load_plugin():
assert isinstance( assert isinstance(
controller._postfetch_chain[-4].listener, controller._postfetch_chain[-4].listener,
warcprox.stats.RunningStats) warcprox.stats.RunningStats)
# MyEarlyPlugin
assert isinstance( assert isinstance(
controller._postfetch_chain[0], controller._postfetch_chain[0],
EarlyPlugin) EarlyPlugin)

View File

@@ -166,9 +166,8 @@ class WarcproxController(object):
with processor.inq.mutex: with processor.inq.mutex:
l = list(processor.inq.queue) l = list(processor.inq.queue)
for recorded_url in l: for recorded_url in l:
if recorded_url.timestamp: if earliest is None or recorded_url.timestamp < earliest:
if not earliest or (recorded_url.timestamp < earliest): earliest = recorded_url.timestamp
earliest = recorded_url.timestamp
return earliest return earliest
def postfetch_status(self): def postfetch_status(self):

View File

@@ -596,18 +596,15 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
'bytes exceeded for URL %s', 'bytes exceeded for URL %s',
self._max_resource_size, self.url) self._max_resource_size, self.url)
break break
elif time.time() - start > 3 * 60 * 60: elif (not 'content-length' in self.headers
if not 'content-length' in self.headers: and time.time() - start > 3 * 60 * 60):
prox_rec_res.truncated = b'time' prox_rec_res.truncated = b'time'
self._remote_server_conn.sock.shutdown(socket.SHUT_RDWR) self._remote_server_conn.sock.shutdown(socket.SHUT_RDWR)
self._remote_server_conn.sock.close() self._remote_server_conn.sock.close()
self.logger.info( self.logger.info(
'reached hard timeout of 3 hours fetching url ' 'reached hard timeout of 3 hours fetching url '
'without content-length: %s', self.url) 'without content-length: %s', self.url)
break break
else:
self.logger.info(
'long-running fetch for URL %s', self.url)
self.log_request(prox_rec_res.status, prox_rec_res.recorder.len) self.log_request(prox_rec_res.status, prox_rec_res.recorder.len)
# Let's close off the remote end. If remote connection is fine, # Let's close off the remote end. If remote connection is fine,
@@ -675,17 +672,13 @@ class PooledMixIn(socketserver.ThreadingMixIn):
def process_request(self, request, client_address): def process_request(self, request, client_address):
self.active_requests[request] = doublethink.utcnow() self.active_requests[request] = doublethink.utcnow()
try: future = self.pool.submit(
future = self.pool.submit( self.process_request_thread, request, client_address)
self.process_request_thread, request, client_address) future.add_done_callback(
future.add_done_callback( lambda f: self.active_requests.pop(request, None))
lambda f: self.active_requests.pop(request, None)) if future.done():
if future.done(): # avoid theoretical timing issue, in case process_request_thread
# avoid theoretical timing issue, in case process_request_thread # managed to finish before future.add_done_callback() ran
# managed to finish before future.add_done_callback() ran
self.active_requests.pop(request, None)
except RuntimeError as exc:
self.logger.error("Error processing request %s", str(exc))
self.active_requests.pop(request, None) self.active_requests.pop(request, None)
def get_request(self): def get_request(self):

View File

@@ -46,7 +46,6 @@ import tempfile
import hashlib import hashlib
import doublethink import doublethink
import re import re
import zlib
class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
''' '''
@@ -176,9 +175,6 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
warcprox_meta = json.loads(self.headers['Warcprox-Meta']) warcprox_meta = json.loads(self.headers['Warcprox-Meta'])
self._security_check(warcprox_meta) self._security_check(warcprox_meta)
self._enforce_limits(warcprox_meta) self._enforce_limits(warcprox_meta)
if 'compressed_blocks' in warcprox_meta:
warcprox_meta['blocks'] = json.loads(zlib.decompress(warcprox_meta['compressed_blocks']).decode())
del warcprox_meta['compressed_blocks']
self._enforce_blocks(warcprox_meta) self._enforce_blocks(warcprox_meta)
def _connect_to_remote_server(self): def _connect_to_remote_server(self):