Merge remote-tracking branch 'origin/master' into qa

* origin/master:
  bump version after merge
  Another exception when trying to close a WARC file
  bump version after merges
  try to fix test failing due to url-encoding
  Use "except Exception" to catch all exception types
  Set connection pool maxsize=6
  Handle ValueError when trying to close WARC file
  Skip cdx dedup for volatile URLs with session params
  Increase remote_connection_pool maxsize
  bump version
  add missing import
  avoid this problem
Noah Levitt 2019-11-19 13:31:34 -08:00
commit c7ddeea2f0
6 changed files with 24 additions and 12 deletions


@@ -43,7 +43,7 @@ except:
 setuptools.setup(
         name='warcprox',
-        version='2.4.18',
+        version='2.4.21',
         description='WARC writing MITM HTTP/S proxy',
         url='https://github.com/internetarchive/warcprox',
         author='Noah Levitt',


@@ -52,6 +52,7 @@ import mock
 import email.message
 import socketserver
 from concurrent import futures
+import urllib.parse
 
 try:
     import http.server as http_server
@@ -175,8 +176,10 @@ class _TestHttpRequestHandler(http_server.BaseHTTPRequestHandler):
     def build_response(self):
         m = re.match(r'^/([^/]+)/([^/]+)$', self.path)
         if m is not None:
-            special_header = 'warcprox-test-header: {}!'.format(m.group(1)).encode('utf-8')
-            payload = 'I am the warcprox test payload! {}!\n'.format(10*m.group(2)).encode('utf-8')
+            seg1 = urllib.parse.unquote(m.group(1))
+            seg2 = urllib.parse.unquote(m.group(2))
+            special_header = 'warcprox-test-header: {}!'.format(seg1).encode('utf-8')
+            payload = 'I am the warcprox test payload! {}!\n'.format(10*seg2).encode('utf-8')
             headers = (b'HTTP/1.1 200 OK\r\n'
                        + b'Content-Type: text/plain\r\n'
                        + special_header + b'\r\n'
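The handler change above decodes each percent-escaped path segment before building the response, so a request for /z/%7E yields the same test payload as /z/~. A minimal sketch of the standard-library call it relies on:

    import urllib.parse

    # '%7E' is the percent-encoding of '~', so both spellings of the path
    # collapse to the same string once unquoted
    assert urllib.parse.unquote('/z/%7E') == '/z/~'
    assert urllib.parse.unquote('plain-segment') == 'plain-segment'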
@@ -1351,7 +1354,7 @@ def test_domain_data_soft_limit(
     warcprox_.proxy.remote_connection_pool.clear()
 
     # novel, pushes stats over the limit
-    url = 'https://muh.XN--Zz-2Ka.locALHOst:{}/z/~'.format(https_daemon.server_port)
+    url = 'https://muh.XN--Zz-2Ka.locALHOst:{}/z/%7E'.format(https_daemon.server_port)
     response = requests.get(
             url, proxies=archiving_proxies, headers=headers, stream=True,
             verify=False)


@@ -37,6 +37,7 @@ import doublethink
 import importlib
 import queue
 import socket
+import os
 
 class Factory:
     @staticmethod


@@ -266,6 +266,9 @@ class CdxServerDedup(DedupDb):
         performance optimisation to handle that. limit < 0 is very inefficient
         in general. Maybe it could be configurable in the future.
 
+        Skip dedup for URLs with session params. These URLs are certainly
+        unique and highly volatile, we cannot dedup them.
+
         :param digest_key: b'sha1:<KEY-VALUE>' (prefix is optional).
             Example: b'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'
         :param url: Target URL string
@@ -274,6 +277,8 @@
         """
         u = url.decode("utf-8") if isinstance(url, bytes) else url
         try:
+            if any(s in u for s in ('JSESSIONID=', 'session=', 'sess=')):
+                return None
             result = self.http_pool.request('GET', self.cdx_url, fields=dict(
                 url=u, fl="timestamp,digest", filter="!mimetype:warc/revisit",
                 limit=-1))
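The guard added above short-circuits the CDX lookup whenever the URL carries one of a few session-parameter markers, per the new docstring text: such URLs are effectively unique per visitor, so asking the dedup server about them is wasted work. A standalone sketch of the same substring test (the helper name looks_volatile is invented for illustration):

    def looks_volatile(url):
        # same check as the lookup() change: any of these markers means the
        # URL is session-specific and not worth deduplicating
        return any(s in url for s in ('JSESSIONID=', 'session=', 'sess='))

    assert looks_volatile('http://example.com/cart;JSESSIONID=abc123')
    assert looks_volatile('http://example.com/page?session=xyz')
    assert not looks_volatile('http://example.com/static/logo.png')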


@@ -762,7 +762,7 @@ class PooledMitmProxy(PooledMixIn, MitmProxy):
         Abort active connections to remote servers to achieve prompt shutdown.
         '''
         self.shutting_down = True
-        for sock in self.remote_server_socks:
+        for sock in list(self.remote_server_socks):
             self.shutdown_request(sock)
 
 class SingleThreadedMitmProxy(http_server.HTTPServer):
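Wrapping the collection in list() makes the shutdown loop iterate over a snapshot, so it keeps working even if other threads (presumably the connection handlers discarding their own sockets) mutate remote_server_socks while it runs; iterating the live set directly can raise "RuntimeError: Set changed size during iteration". A toy illustration with made-up socket names:

    socks = {'sock-a', 'sock-b', 'sock-c'}

    # removing entries while looping over the live set would raise
    # RuntimeError; looping over a snapshot is safe
    for s in list(socks):
        socks.discard(s)
    assert not socks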
@@ -780,7 +780,7 @@
         self.bad_hostnames_ports_lock = RLock()
 
         self.remote_connection_pool = PoolManager(
-            num_pools=max((options.max_threads or 0) // 6, 400))
+            num_pools=max((options.max_threads or 0) // 6, 400), maxsize=6)
 
         if options.onion_tor_socks_proxy:
             try:
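For context on the maxsize=6 change: in urllib3, num_pools caps how many per-host connection pools the PoolManager keeps, while maxsize caps how many keep-alive connections each of those pools will retain for reuse (the per-pool default is 1). Raising it lets several proxy threads hitting the same origin reuse pooled connections instead of discarding them. A minimal sketch of the same construction (the URL is illustrative):

    from urllib3 import PoolManager

    pool = PoolManager(num_pools=400, maxsize=6)
    # up to six idle keep-alive connections per host are kept around
    # for reuse by concurrent worker threads
    resp = pool.request('GET', 'http://example.com/')
    print(resp.status)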


@@ -167,14 +167,17 @@ class WarcWriter:
         if self.open_suffix == '':
             try:
                 fcntl.lockf(self.f, fcntl.LOCK_UN)
-            except IOError as exc:
+            except Exception as exc:
                 self.logger.error(
                         'could not unlock file %s (%s)', self.path, exc)
-        self.f.close()
-        finalpath = os.path.sep.join(
-                [self.directory, self.finalname])
-        os.rename(self.path, finalpath)
+        try:
+            self.f.close()
+            finalpath = os.path.sep.join(
+                    [self.directory, self.finalname])
+            os.rename(self.path, finalpath)
+        except Exception as exc:
+            self.logger.error(
+                    'could not close and rename file %s (%s)', self.path, exc)
         self.path = None
         self.f = None
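The new outer try/except mirrors the unlock handling just above it: if closing or renaming the finished WARC fails for any reason, the error is logged and the writer still falls through to clear self.path and self.f, so it can carry on with a fresh file instead of wedging. A minimal sketch of that log-and-continue cleanup pattern (the function and names are illustrative, not warcprox's own):

    import logging
    import os

    log = logging.getLogger('cleanup-sketch')

    def finalize(f, path, finalpath):
        try:
            f.close()
            os.rename(path, finalpath)
        except Exception as exc:
            # e.g. the file was already moved or the filesystem is read-only:
            # report it, but never let cleanup itself raise
            log.error('could not close and rename %s (%s)', path, exc)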