Mirror of https://github.com/internetarchive/warcprox.git (synced 2025-01-18 13:22:09 +01:00)
Merge remote-tracking branch 'origin/master' into qa
* origin/master:
  bump version after merge
  Another exception when trying to close a WARC file
  bump version after merges
  try to fix test failing due to url-encoding
  Use "except Exception" to catch all exception types
  Set connection pool maxsize=6
  Handle ValueError when trying to close WARC file
  Skip cdx dedup for volatile URLs with session params
  Increase remote_connection_pool maxsize
  bump version
  add missing import
  avoid this problem
Commit c7ddeea2f0
setup.py
@@ -43,7 +43,7 @@ except:
 setuptools.setup(
     name='warcprox',
-    version='2.4.18',
+    version='2.4.21',
     description='WARC writing MITM HTTP/S proxy',
     url='https://github.com/internetarchive/warcprox',
     author='Noah Levitt',

@@ -52,6 +52,7 @@ import mock
 import email.message
 import socketserver
 from concurrent import futures
+import urllib.parse

 try:
     import http.server as http_server

@@ -175,8 +176,10 @@ class _TestHttpRequestHandler(http_server.BaseHTTPRequestHandler):
     def build_response(self):
         m = re.match(r'^/([^/]+)/([^/]+)$', self.path)
         if m is not None:
-            special_header = 'warcprox-test-header: {}!'.format(m.group(1)).encode('utf-8')
-            payload = 'I am the warcprox test payload! {}!\n'.format(10*m.group(2)).encode('utf-8')
+            seg1 = urllib.parse.unquote(m.group(1))
+            seg2 = urllib.parse.unquote(m.group(2))
+            special_header = 'warcprox-test-header: {}!'.format(seg1).encode('utf-8')
+            payload = 'I am the warcprox test payload! {}!\n'.format(10*seg2).encode('utf-8')
             headers = (b'HTTP/1.1 200 OK\r\n'
                        + b'Content-Type: text/plain\r\n'
                        + special_header + b'\r\n'

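For context, the handler change above makes the test server decode percent-encoded path segments before echoing them back. A minimal sketch of the same idea outside warcprox (the '/z/%7E' path is made up for illustration):

import re
import urllib.parse

# Same pattern the test handler uses: two path segments.
m = re.match(r'^/([^/]+)/([^/]+)$', '/z/%7E')
seg1 = urllib.parse.unquote(m.group(1))
seg2 = urllib.parse.unquote(m.group(2))

# '%7E' decodes to '~', so a client that percent-encodes the path and one that
# does not now get the same header and payload back from the test server.
assert (seg1, seg2) == ('z', '~')
special_header = 'warcprox-test-header: {}!'.format(seg1).encode('utf-8')
payload = 'I am the warcprox test payload! {}!\n'.format(10 * seg2).encode('utf-8')
print(special_header, payload)
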
@@ -1351,7 +1354,7 @@ def test_domain_data_soft_limit(
     warcprox_.proxy.remote_connection_pool.clear()

     # novel, pushes stats over the limit
-    url = 'https://muh.XN--Zz-2Ka.locALHOst:{}/z/~'.format(https_daemon.server_port)
+    url = 'https://muh.XN--Zz-2Ka.locALHOst:{}/z/%7E'.format(https_daemon.server_port)
     response = requests.get(
         url, proxies=archiving_proxies, headers=headers, stream=True,
         verify=False)

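The URL tweak above pairs with that handler change: '~' is an RFC 3986 unreserved character, and different client stacks disagree on whether to percent-encode it, which is presumably what made the test flaky. A small standard-library sketch of the equivalence (not warcprox code):

from urllib.parse import quote, unquote

# Requesting the percent-encoded form pins down what goes on the wire, and the
# test server now unquotes it, so both spellings name the same resource.
assert unquote('/z/%7E') == '/z/~'
# Depending on the Python version, quoting may or may not touch '~'.
assert quote('/z/~') in ('/z/~', '/z/%7E')
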
@@ -37,6 +37,7 @@ import doublethink
 import importlib
 import queue
 import socket
+import os

 class Factory:
     @staticmethod

@@ -266,6 +266,9 @@ class CdxServerDedup(DedupDb):
         performance optimisation to handle that. limit < 0 is very inefficient
         in general. Maybe it could be configurable in the future.

+        Skip dedup for URLs with session params. These URLs are certainly
+        unique and highly volatile, we cannot dedup them.
+
         :param digest_key: b'sha1:<KEY-VALUE>' (prefix is optional).
             Example: b'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'
         :param url: Target URL string

@@ -274,6 +277,8 @@ class CdxServerDedup(DedupDb):
         """
         u = url.decode("utf-8") if isinstance(url, bytes) else url
         try:
+            if any(s in u for s in ('JSESSIONID=', 'session=', 'sess=')):
+                return None
             result = self.http_pool.request('GET', self.cdx_url, fields=dict(
                 url=u, fl="timestamp,digest", filter="!mimetype:warc/revisit",
                 limit=-1))

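A standalone sketch of the check added above, pulled out of CdxServerDedup.lookup for illustration (the helper name is made up; the token list comes from the diff):

VOLATILE_TOKENS = ('JSESSIONID=', 'session=', 'sess=')

def has_session_params(url):
    # URLs carrying per-session identifiers are effectively unique, so a CDX
    # lookup would never return a usable revisit record; skipping the request
    # saves a round trip to the CDX server.
    return any(token in url for token in VOLATILE_TOKENS)

assert has_session_params('http://example.com/cart;JSESSIONID=A1B2C3')
assert has_session_params('http://example.com/page?session=42')
assert not has_session_params('http://example.com/static/logo.png')
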
@@ -762,7 +762,7 @@ class PooledMitmProxy(PooledMixIn, MitmProxy):
         Abort active connections to remote servers to achieve prompt shutdown.
         '''
         self.shutting_down = True
-        for sock in self.remote_server_socks:
+        for sock in list(self.remote_server_socks):
             self.shutdown_request(sock)

 class SingleThreadedMitmProxy(http_server.HTTPServer):

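Why iterate over list(self.remote_server_socks)? The set is shared with worker threads that add and remove sockets, and CPython refuses to keep iterating a set whose size changed mid-loop. A minimal demonstration with a plain set (not warcprox code):

socks = {101, 102, 103}
try:
    for s in socks:
        socks.discard(s)          # simulates another thread removing entries
except RuntimeError as exc:
    print('iterating the live set fails:', exc)

socks = {101, 102, 103}
for s in list(socks):             # snapshot survives removals during the loop
    socks.discard(s)
print('snapshot iteration drained the set:', socks)
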
@@ -780,7 +780,7 @@ class SingleThreadedMitmProxy(http_server.HTTPServer):
         self.bad_hostnames_ports_lock = RLock()

         self.remote_connection_pool = PoolManager(
-            num_pools=max((options.max_threads or 0) // 6, 400))
+            num_pools=max((options.max_threads or 0) // 6, 400), maxsize=6)

         if options.onion_tor_socks_proxy:
             try:

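For reference, maxsize here is urllib3's per-host pool size: num_pools bounds how many host pools the PoolManager keeps, while maxsize bounds how many connections each of those pools keeps open for reuse (urllib3's default is 1). A rough usage sketch with illustrative numbers, not warcprox's sizing logic:

from urllib3 import PoolManager

# Up to 400 per-host pools, each allowed to keep 6 reusable connections alive.
pool = PoolManager(num_pools=400, maxsize=6)
resp = pool.request('GET', 'https://example.com/')
print(resp.status, len(resp.data))
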
@@ -167,14 +167,17 @@ class WarcWriter:
             if self.open_suffix == '':
                 try:
                     fcntl.lockf(self.f, fcntl.LOCK_UN)
-                except IOError as exc:
+                except Exception as exc:
                     self.logger.error(
                             'could not unlock file %s (%s)', self.path, exc)
-            self.f.close()
-            finalpath = os.path.sep.join(
-                    [self.directory, self.finalname])
-            os.rename(self.path, finalpath)
+            try:
+                self.f.close()
+                finalpath = os.path.sep.join(
+                        [self.directory, self.finalname])
+                os.rename(self.path, finalpath)
+            except Exception as exc:
+                self.logger.error(
+                        'could not close and rename file %s (%s)', self.path, exc)

             self.path = None
             self.f = None

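The wider try/except above guards the close-and-rename step as a whole. Per the merged commit messages, the motivating failures were a ValueError when closing a WARC file and another exception during the rename. A small sketch of the kinds of errors involved, using a throwaway temp file rather than the actual warcprox call sites:

import os
import tempfile

fd, path = tempfile.mkstemp(suffix='.warc.gz.open')
os.close(fd)

f = open(path, 'wb')
f.close()
try:
    f.write(b'WARC/1.0')          # ValueError: write to closed file
except Exception as exc:
    print('using an already-closed file object:', exc)

os.remove(path)
try:
    os.rename(path, path[:-5])    # FileNotFoundError: source already gone
except Exception as exc:
    print('renaming a missing file:', exc)
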