Merge remote-tracking branch 'origin/master' into qa

* origin/master:
  bump version after merge
  Another exception when trying to close a WARC file
  bump version after merges
  try to fix test failing due to url-encoding
  Use "except Exception" to catch all exception types
  Set connection pool maxsize=6
  Handle ValueError when trying to close WARC file
  Skip cdx dedup for volatile URLs with session params
  Increase remote_connection_pool maxsize
  bump version
  add missing import
  avoid this problem
This commit is contained in:
Noah Levitt 2019-11-19 13:31:34 -08:00
commit c7ddeea2f0
6 changed files with 24 additions and 12 deletions

View File

@@ -43,7 +43,7 @@ except:
setuptools.setup(
name='warcprox',
version='2.4.18',
version='2.4.21',
description='WARC writing MITM HTTP/S proxy',
url='https://github.com/internetarchive/warcprox',
author='Noah Levitt',

View File

@@ -52,6 +52,7 @@ import mock
import email.message
import socketserver
from concurrent import futures
import urllib.parse
try:
import http.server as http_server
@@ -175,8 +176,10 @@ class _TestHttpRequestHandler(http_server.BaseHTTPRequestHandler):
def build_response(self):
m = re.match(r'^/([^/]+)/([^/]+)$', self.path)
if m is not None:
special_header = 'warcprox-test-header: {}!'.format(m.group(1)).encode('utf-8')
payload = 'I am the warcprox test payload! {}!\n'.format(10*m.group(2)).encode('utf-8')
seg1 = urllib.parse.unquote(m.group(1))
seg2 = urllib.parse.unquote(m.group(2))
special_header = 'warcprox-test-header: {}!'.format(seg1).encode('utf-8')
payload = 'I am the warcprox test payload! {}!\n'.format(10*seg2).encode('utf-8')
headers = (b'HTTP/1.1 200 OK\r\n'
+ b'Content-Type: text/plain\r\n'
+ special_header + b'\r\n'
@@ -1351,7 +1354,7 @@ def test_domain_data_soft_limit(
warcprox_.proxy.remote_connection_pool.clear()
# novel, pushes stats over the limit
url = 'https://muh.XN--Zz-2Ka.locALHOst:{}/z/~'.format(https_daemon.server_port)
url = 'https://muh.XN--Zz-2Ka.locALHOst:{}/z/%7E'.format(https_daemon.server_port)
response = requests.get(
url, proxies=archiving_proxies, headers=headers, stream=True,
verify=False)

View File

@@ -37,6 +37,7 @@ import doublethink
import importlib
import queue
import socket
import os
class Factory:
@staticmethod

View File

@@ -266,6 +266,9 @@ class CdxServerDedup(DedupDb):
performance optimisation to handle that. limit < 0 is very inefficient
in general. Maybe it could be configurable in the future.
Skip dedup for URLs with session params. These URLs are certainly
unique and highly volatile, we cannot dedup them.
:param digest_key: b'sha1:<KEY-VALUE>' (prefix is optional).
Example: b'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'
:param url: Target URL string
@@ -274,6 +277,8 @@ class CdxServerDedup(DedupDb):
"""
u = url.decode("utf-8") if isinstance(url, bytes) else url
try:
if any(s in u for s in ('JSESSIONID=', 'session=', 'sess=')):
return None
result = self.http_pool.request('GET', self.cdx_url, fields=dict(
url=u, fl="timestamp,digest", filter="!mimetype:warc/revisit",
limit=-1))

View File

@@ -762,7 +762,7 @@ class PooledMitmProxy(PooledMixIn, MitmProxy):
Abort active connections to remote servers to achieve prompt shutdown.
'''
self.shutting_down = True
for sock in self.remote_server_socks:
for sock in list(self.remote_server_socks):
self.shutdown_request(sock)
class SingleThreadedMitmProxy(http_server.HTTPServer):
@@ -780,7 +780,7 @@ class SingleThreadedMitmProxy(http_server.HTTPServer):
self.bad_hostnames_ports_lock = RLock()
self.remote_connection_pool = PoolManager(
num_pools=max((options.max_threads or 0) // 6, 400))
num_pools=max((options.max_threads or 0) // 6, 400), maxsize=6)
if options.onion_tor_socks_proxy:
try:

View File

@@ -167,14 +167,17 @@ class WarcWriter:
if self.open_suffix == '':
try:
fcntl.lockf(self.f, fcntl.LOCK_UN)
except IOError as exc:
except Exception as exc:
self.logger.error(
'could not unlock file %s (%s)', self.path, exc)
self.f.close()
finalpath = os.path.sep.join(
[self.directory, self.finalname])
os.rename(self.path, finalpath)
try:
self.f.close()
finalpath = os.path.sep.join(
[self.directory, self.finalname])
os.rename(self.path, finalpath)
except Exception as exc:
self.logger.error(
'could not close and rename file %s (%s)', self.path, exc)
self.path = None
self.f = None