Mirror of https://github.com/internetarchive/warcprox.git (synced 2025-01-18 13:22:09 +01:00)
Merge remote-tracking branch 'origin/master' into qa

* origin/master:
  bump version after merge
  Another exception when trying to close a WARC file
  bump version after merges
  try to fix test failing due to url-encoding
  Use "except Exception" to catch all exception types
  Set connection pool maxsize=6
  Handle ValueError when trying to close WARC file
  Skip cdx dedup for volatile URLs with session params
  Increase remote_connection_pool maxsize
  bump version
  add missing import
  avoid this problem
commit c7ddeea2f0
setup.py
@@ -43,7 +43,7 @@ except:

 setuptools.setup(
         name='warcprox',
-        version='2.4.18',
+        version='2.4.21',
         description='WARC writing MITM HTTP/S proxy',
         url='https://github.com/internetarchive/warcprox',
         author='Noah Levitt',
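The setup.py hunk just bumps the release number. As a quick sanity check after upgrading, a minimal sketch (assuming warcprox is installed in the current Python environment, e.g. via pip) that prints the installed version:

# Minimal sketch: print the installed warcprox version to confirm the bump.
# Assumes warcprox is installed in the current Python environment.
from importlib.metadata import version

print(version('warcprox'))  # '2.4.21' for this release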
@@ -52,6 +52,7 @@ import mock
 import email.message
 import socketserver
 from concurrent import futures
+import urllib.parse

 try:
     import http.server as http_server
@@ -175,8 +176,10 @@ class _TestHttpRequestHandler(http_server.BaseHTTPRequestHandler):
     def build_response(self):
         m = re.match(r'^/([^/]+)/([^/]+)$', self.path)
         if m is not None:
-            special_header = 'warcprox-test-header: {}!'.format(m.group(1)).encode('utf-8')
-            payload = 'I am the warcprox test payload! {}!\n'.format(10*m.group(2)).encode('utf-8')
+            seg1 = urllib.parse.unquote(m.group(1))
+            seg2 = urllib.parse.unquote(m.group(2))
+            special_header = 'warcprox-test-header: {}!'.format(seg1).encode('utf-8')
+            payload = 'I am the warcprox test payload! {}!\n'.format(10*seg2).encode('utf-8')
             headers = (b'HTTP/1.1 200 OK\r\n'
                        + b'Content-Type: text/plain\r\n'
                        + special_header + b'\r\n'
@@ -1351,7 +1354,7 @@ def test_domain_data_soft_limit(
     warcprox_.proxy.remote_connection_pool.clear()

     # novel, pushes stats over the limit
-    url = 'https://muh.XN--Zz-2Ka.locALHOst:{}/z/~'.format(https_daemon.server_port)
+    url = 'https://muh.XN--Zz-2Ka.locALHOst:{}/z/%7E'.format(https_daemon.server_port)
     response = requests.get(
             url, proxies=archiving_proxies, headers=headers, stream=True,
             verify=False)
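The two test changes above make up the url-encoding fix: the test handler now percent-decodes both path segments, and the soft-limit test requests '/z/%7E' instead of '/z/~', so either spelling of the path produces the same payload. A standalone sketch (not part of the warcprox test suite) showing that equivalence:

# Standalone sketch: percent-decoding the matched path segments makes the
# handler insensitive to how the client encodes '~'.
import re
import urllib.parse

def decode_segments(path):
    # same shape as the test handler's regex: /<segment1>/<segment2>
    m = re.match(r'^/([^/]+)/([^/]+)$', path)
    return tuple(urllib.parse.unquote(g) for g in m.groups())

assert decode_segments('/z/~') == decode_segments('/z/%7E') == ('z', '~')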
@@ -37,6 +37,7 @@ import doublethink
import importlib
import queue
import socket
import os

class Factory:
    @staticmethod
@@ -266,6 +266,9 @@ class CdxServerDedup(DedupDb):
         performance optimisation to handle that. limit < 0 is very inefficient
         in general. Maybe it could be configurable in the future.

+        Skip dedup for URLs with session params. These URLs are certainly
+        unique and highly volatile, we cannot dedup them.
+
         :param digest_key: b'sha1:<KEY-VALUE>' (prefix is optional).
             Example: b'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'
         :param url: Target URL string
@@ -274,6 +277,8 @@ class CdxServerDedup(DedupDb):
         """
         u = url.decode("utf-8") if isinstance(url, bytes) else url
         try:
+            if any(s in u for s in ('JSESSIONID=', 'session=', 'sess=')):
+                return None
             result = self.http_pool.request('GET', self.cdx_url, fields=dict(
                 url=u, fl="timestamp,digest", filter="!mimetype:warc/revisit",
                 limit=-1))
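The dedup change adds an early return before the CDX lookup when the URL carries a session parameter, since such captures are treated as unique and not worth deduplicating. A standalone sketch of the same substring test (the names VOLATILE_MARKERS and is_volatile are illustrative, not warcprox API):

# Standalone sketch: the substring test the diff adds before the CDX lookup.
VOLATILE_MARKERS = ('JSESSIONID=', 'session=', 'sess=')

def is_volatile(url):
    # True when the URL carries a session parameter and dedup should be skipped
    return any(marker in url for marker in VOLATILE_MARKERS)

print(is_volatile('https://example.com/page;JSESSIONID=ABC123'))  # True
print(is_volatile('https://example.com/page?session=xyz'))        # True
print(is_volatile('https://example.com/static/logo.png'))         # False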
@@ -762,7 +762,7 @@ class PooledMitmProxy(PooledMixIn, MitmProxy):
         Abort active connections to remote servers to achieve prompt shutdown.
         '''
         self.shutting_down = True
-        for sock in self.remote_server_socks:
+        for sock in list(self.remote_server_socks):
             self.shutdown_request(sock)

 class SingleThreadedMitmProxy(http_server.HTTPServer):
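Iterating over list(self.remote_server_socks) takes a snapshot of the set, which presumably avoids a "RuntimeError: Set changed size during iteration" when sockets are removed from the set while the shutdown loop runs. A minimal standalone demonstration of that failure mode and the fix:

# Minimal standalone demo: mutating a set while iterating over it raises
# RuntimeError; iterating over a snapshot copy does not.
socks = {'sock1', 'sock2', 'sock3'}

try:
    for s in socks:
        socks.discard(s)        # simulates a sock being removed mid-loop
except RuntimeError as exc:
    print('direct iteration failed:', exc)

socks = {'sock1', 'sock2', 'sock3'}
for s in list(socks):           # snapshot, as in the patched code
    socks.discard(s)
print('snapshot iteration left:', socks)   # set()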
@@ -780,7 +780,7 @@ class SingleThreadedMitmProxy(http_server.HTTPServer):
         self.bad_hostnames_ports_lock = RLock()

         self.remote_connection_pool = PoolManager(
-            num_pools=max((options.max_threads or 0) // 6, 400))
+            num_pools=max((options.max_threads or 0) // 6, 400), maxsize=6)

         if options.onion_tor_socks_proxy:
             try:
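In urllib3, num_pools caps how many per-host connection pools the PoolManager keeps cached, while maxsize controls how many connections each per-host pool retains for reuse; the patch leaves the former alone and pins the latter to 6. A rough sketch of those two settings, with a made-up stand-in for options.max_threads:

# Sketch of the urllib3 PoolManager settings touched by the diff.
from urllib3 import PoolManager

max_threads = 100  # hypothetical stand-in for warcprox's options.max_threads

pool = PoolManager(
    # num_pools: how many per-host connection pools the manager caches
    num_pools=max((max_threads or 0) // 6, 400),
    # maxsize: how many connections each per-host pool keeps open for reuse
    maxsize=6)

resp = pool.request('GET', 'https://example.com/')
print(resp.status)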
@@ -167,14 +167,17 @@ class WarcWriter:
         if self.open_suffix == '':
             try:
                 fcntl.lockf(self.f, fcntl.LOCK_UN)
-            except IOError as exc:
+            except Exception as exc:
                 self.logger.error(
                         'could not unlock file %s (%s)', self.path, exc)
-        self.f.close()
-        finalpath = os.path.sep.join(
-                [self.directory, self.finalname])
-        os.rename(self.path, finalpath)
-
+        try:
+            self.f.close()
+            finalpath = os.path.sep.join(
+                    [self.directory, self.finalname])
+            os.rename(self.path, finalpath)
+        except Exception as exc:
+            self.logger.error(
+                    'could not close and rename file %s (%s)', self.path, exc)
         self.path = None
         self.f = None
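The writer change broadens the unlock handler to "except Exception" and wraps the close-and-rename step in its own try/except, so a ValueError or OSError from a broken WARC file is logged instead of propagating, and self.path and self.f still get reset. A standalone sketch of that defensive pattern (plain logger and local names, not warcprox's WarcWriter):

# Standalone sketch of the defensive close-and-rename pattern from the diff.
import logging
import os

logger = logging.getLogger('warc_close_sketch')

def close_and_rename(f, path, finalpath):
    '''Close an open file and move it to its final name, logging any failure.'''
    try:
        f.close()
        os.rename(path, finalpath)
    except Exception as exc:
        # e.g. a ValueError from a broken stream or an OSError from the rename;
        # log it so the caller can still reset its own state and carry on
        logger.error('could not close and rename file %s (%s)', path, exc)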