diff --git a/setup.py b/setup.py index 8c0415e..63d8488 100755 --- a/setup.py +++ b/setup.py @@ -43,7 +43,7 @@ except: setuptools.setup( name='warcprox', - version='2.4.18', + version='2.4.21', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index b06043c..a420740 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -52,6 +52,7 @@ import mock import email.message import socketserver from concurrent import futures +import urllib.parse try: import http.server as http_server @@ -175,8 +176,10 @@ class _TestHttpRequestHandler(http_server.BaseHTTPRequestHandler): def build_response(self): m = re.match(r'^/([^/]+)/([^/]+)$', self.path) if m is not None: - special_header = 'warcprox-test-header: {}!'.format(m.group(1)).encode('utf-8') - payload = 'I am the warcprox test payload! {}!\n'.format(10*m.group(2)).encode('utf-8') + seg1 = urllib.parse.unquote(m.group(1)) + seg2 = urllib.parse.unquote(m.group(2)) + special_header = 'warcprox-test-header: {}!'.format(seg1).encode('utf-8') + payload = 'I am the warcprox test payload! {}!\n'.format(10*seg2).encode('utf-8') headers = (b'HTTP/1.1 200 OK\r\n' + b'Content-Type: text/plain\r\n' + special_header + b'\r\n' @@ -1351,7 +1354,7 @@ def test_domain_data_soft_limit( warcprox_.proxy.remote_connection_pool.clear() # novel, pushes stats over the limit - url = 'https://muh.XN--Zz-2Ka.locALHOst:{}/z/~'.format(https_daemon.server_port) + url = 'https://muh.XN--Zz-2Ka.locALHOst:{}/z/%7E'.format(https_daemon.server_port) response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True, verify=False) diff --git a/warcprox/controller.py b/warcprox/controller.py index 9a2880e..84c3b93 100644 --- a/warcprox/controller.py +++ b/warcprox/controller.py @@ -37,6 +37,7 @@ import doublethink import importlib import queue import socket +import os class Factory: @staticmethod diff --git a/warcprox/dedup.py b/warcprox/dedup.py index 9562fa5..0e09239 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -266,6 +266,9 @@ class CdxServerDedup(DedupDb): performance optimisation to handle that. limit < 0 is very inefficient in general. Maybe it could be configurable in the future. + Skip dedup for URLs with session params. These URLs are certainly + unique and highly volatile, we cannot dedup them. + :param digest_key: b'sha1:' (prefix is optional). Example: b'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A' :param url: Target URL string @@ -274,6 +277,8 @@ class CdxServerDedup(DedupDb): """ u = url.decode("utf-8") if isinstance(url, bytes) else url try: + if any(s in u for s in ('JSESSIONID=', 'session=', 'sess=')): + return None result = self.http_pool.request('GET', self.cdx_url, fields=dict( url=u, fl="timestamp,digest", filter="!mimetype:warc/revisit", limit=-1)) diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index 9bac478..8f86cd0 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -762,7 +762,7 @@ class PooledMitmProxy(PooledMixIn, MitmProxy): Abort active connections to remote servers to achieve prompt shutdown. ''' self.shutting_down = True - for sock in self.remote_server_socks: + for sock in list(self.remote_server_socks): self.shutdown_request(sock) class SingleThreadedMitmProxy(http_server.HTTPServer): @@ -780,7 +780,7 @@ class SingleThreadedMitmProxy(http_server.HTTPServer): self.bad_hostnames_ports_lock = RLock() self.remote_connection_pool = PoolManager( - num_pools=max((options.max_threads or 0) // 6, 400)) + num_pools=max((options.max_threads or 0) // 6, 400), maxsize=6) if options.onion_tor_socks_proxy: try: diff --git a/warcprox/writer.py b/warcprox/writer.py index 730d606..cc44be2 100644 --- a/warcprox/writer.py +++ b/warcprox/writer.py @@ -167,14 +167,17 @@ class WarcWriter: if self.open_suffix == '': try: fcntl.lockf(self.f, fcntl.LOCK_UN) - except IOError as exc: + except Exception as exc: self.logger.error( 'could not unlock file %s (%s)', self.path, exc) - self.f.close() - finalpath = os.path.sep.join( - [self.directory, self.finalname]) - os.rename(self.path, finalpath) - + try: + self.f.close() + finalpath = os.path.sep.join( + [self.directory, self.finalname]) + os.rename(self.path, finalpath) + except Exception as exc: + self.logger.error( + 'could not close and rename file %s (%s)', self.path, exc) self.path = None self.f = None