mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
Merge branch 'do_not_archive' into qa
This commit is contained in:
commit
f202f12bc5
2
setup.py
2
setup.py
@ -40,7 +40,7 @@ except:
|
||||
|
||||
setuptools.setup(
|
||||
name='warcprox',
|
||||
version='2.4b2.dev149',
|
||||
version='2.4b2.dev150',
|
||||
description='WARC writing MITM HTTP/S proxy',
|
||||
url='https://github.com/internetarchive/warcprox',
|
||||
author='Noah Levitt',
|
||||
|
@ -181,6 +181,12 @@ class _TestHttpRequestHandler(http_server.BaseHTTPRequestHandler):
|
||||
+ b'Content-Type: text/plain\r\n'
|
||||
+ b'\r\n')
|
||||
payload = b'This response is missing a Content-Length http header.'
|
||||
elif self.path == '/300k-content':
|
||||
payload = b'0123456789' * 30000
|
||||
headers = (b'HTTP/1.1 200 OK\r\n'
|
||||
+ b'Content-Type: text/plain\r\n'
|
||||
+ b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n'
|
||||
+ b'\r\n')
|
||||
elif self.path.startswith('/test_payload_digest-'):
|
||||
content_body = (
|
||||
b'Hello. How are you. I am the test_payload_digest '
|
||||
@ -365,7 +371,8 @@ def warcprox_(request):
|
||||
'--playback-port=0',
|
||||
'--onion-tor-socks-proxy=localhost:9050',
|
||||
'--crawl-log-dir=crawl-logs',
|
||||
'--socket-timeout=4']
|
||||
'--socket-timeout=4',
|
||||
'--max-resource-size=200000']
|
||||
if request.config.getoption('--rethinkdb-dedup-url'):
|
||||
argv.append('--rethinkdb-dedup-url=%s' % request.config.getoption('--rethinkdb-dedup-url'))
|
||||
# test these here only
|
||||
@ -1211,6 +1218,23 @@ def test_missing_content_length(archiving_proxies, http_daemon, https_daemon, wa
|
||||
# wait for postfetch chain
|
||||
wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 2)
|
||||
|
||||
def test_limit_large_resource(archiving_proxies, http_daemon, warcprox_):
|
||||
"""We try to load a 300k response but we use --max-resource-size=200000 in
|
||||
`warcprox_` so it will be truncated. We expect it to limit the result as
|
||||
soon as it passes the 200000 limit. As warcprox read() chunk size is 65536,
|
||||
the expected result size is 65536*4=262144.
|
||||
"""
|
||||
urls_before = warcprox_.proxy.running_stats.urls
|
||||
|
||||
url = 'http://localhost:%s/300k-content' % http_daemon.server_port
|
||||
response = requests.get(
|
||||
url, proxies=archiving_proxies, verify=False, timeout=10)
|
||||
assert len(response.content) == 262144
|
||||
|
||||
# wait for processing of this url to finish so that it doesn't interfere
|
||||
# with subsequent tests
|
||||
wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1)
|
||||
|
||||
def test_method_filter(
|
||||
warcprox_, https_daemon, http_daemon, archiving_proxies,
|
||||
playback_proxies):
|
||||
|
@ -166,6 +166,9 @@ def _build_arg_parser(prog='warcprox'):
|
||||
arg_parser.add_argument(
|
||||
'--socket-timeout', dest='socket_timeout', type=float,
|
||||
default=None, help=argparse.SUPPRESS)
|
||||
arg_parser.add_argument(
|
||||
'--max-resource-size', dest='max_resource_size', type=int,
|
||||
default=None, help='maximum resource size limit in bytes')
|
||||
arg_parser.add_argument(
|
||||
'--crawl-log-dir', dest='crawl_log_dir', default=None, help=(
|
||||
'if specified, write crawl log files in the specified '
|
||||
|
@ -161,6 +161,7 @@ class ProxyingRecordingHTTPResponse(http_client.HTTPResponse):
|
||||
self.fp = self.recorder
|
||||
|
||||
self.payload_digest = None
|
||||
self.truncated = None
|
||||
|
||||
def begin(self, extra_response_headers={}):
|
||||
http_client.HTTPResponse.begin(self) # reads status line, headers
|
||||
@ -207,6 +208,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
||||
'''
|
||||
logger = logging.getLogger("warcprox.mitmproxy.MitmProxyHandler")
|
||||
_socket_timeout = 60
|
||||
_max_resource_size = None
|
||||
|
||||
def __init__(self, request, client_address, server):
|
||||
threading.current_thread().name = 'MitmProxyHandler(tid={},started={},client={}:{})'.format(warcprox.gettid(), datetime.datetime.utcnow().isoformat(), client_address[0], client_address[1])
|
||||
@ -431,6 +433,13 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
||||
buf = prox_rec_res.read(65536)
|
||||
while buf != b'':
|
||||
buf = prox_rec_res.read(65536)
|
||||
if self._max_resource_size:
|
||||
if prox_rec_res.recorder.len > self._max_resource_size:
|
||||
prox_rec_res.truncated = b'length'
|
||||
self.logger.error(
|
||||
'Max resource size %d bytes exceeded for URL %s',
|
||||
self._max_resource_size, self.url)
|
||||
break
|
||||
|
||||
self.log_request(prox_rec_res.status, prox_rec_res.recorder.len)
|
||||
finally:
|
||||
|
1
warcprox/version.txt
Normal file
1
warcprox/version.txt
Normal file
@ -0,0 +1 @@
|
||||
1.4-20160105052702-f79e744
|
@ -67,7 +67,8 @@ class WarcRecordBuilder:
|
||||
content_type=hanzo.httptools.ResponseMessage.CONTENT_TYPE,
|
||||
remote_ip=recorded_url.remote_ip,
|
||||
payload_digest=warcprox.digest_str(
|
||||
recorded_url.payload_digest, self.base32))
|
||||
recorded_url.payload_digest, self.base32),
|
||||
truncated=recorded_url.truncated)
|
||||
|
||||
def build_warc_records(self, recorded_url):
|
||||
"""Returns a tuple of hanzo.warctools.warc.WarcRecord (principal_record, ...)"""
|
||||
@ -91,7 +92,7 @@ class WarcRecordBuilder:
|
||||
def build_warc_record(self, url, warc_date=None, recorder=None, data=None,
|
||||
concurrent_to=None, warc_type=None, content_type=None, remote_ip=None,
|
||||
profile=None, refers_to=None, refers_to_target_uri=None,
|
||||
refers_to_date=None, payload_digest=None):
|
||||
refers_to_date=None, payload_digest=None, truncated=None):
|
||||
|
||||
if warc_date is None:
|
||||
warc_date = warctools.warc.warc_datetime_str(datetime.datetime.utcnow())
|
||||
@ -120,6 +121,9 @@ class WarcRecordBuilder:
|
||||
headers.append((warctools.WarcRecord.CONTENT_TYPE, content_type))
|
||||
if payload_digest is not None:
|
||||
headers.append((warctools.WarcRecord.PAYLOAD_DIGEST, payload_digest))
|
||||
# truncated value may be 'length' or 'time'
|
||||
if truncated is not None:
|
||||
headers.append((b'WARC-Truncated', truncated))
|
||||
|
||||
if recorder is not None:
|
||||
headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(recorder)).encode('latin1')))
|
||||
|
@ -221,7 +221,8 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
|
||||
timestamp=timestamp, host=self.hostname,
|
||||
duration=datetime.datetime.utcnow()-timestamp,
|
||||
referer=self.headers.get('referer'),
|
||||
payload_digest=prox_rec_res.payload_digest)
|
||||
payload_digest=prox_rec_res.payload_digest,
|
||||
truncated=prox_rec_res.truncated)
|
||||
self.server.recorded_url_q.put(recorded_url)
|
||||
|
||||
return recorded_url
|
||||
@ -330,7 +331,8 @@ class RecordedUrl:
|
||||
warcprox_meta=None, content_type=None, custom_type=None,
|
||||
status=None, size=None, client_ip=None, method=None,
|
||||
timestamp=None, host=None, duration=None, referer=None,
|
||||
payload_digest=None, warc_records=None, do_not_archive=False):
|
||||
payload_digest=None, truncated=None, warc_records=None,
|
||||
do_not_archive=False):
|
||||
# XXX should test what happens with non-ascii url (when does
|
||||
# url-encoding happen?)
|
||||
if type(url) is not bytes:
|
||||
@ -369,6 +371,7 @@ class RecordedUrl:
|
||||
self.duration = duration
|
||||
self.referer = referer
|
||||
self.payload_digest = payload_digest
|
||||
self.truncated = truncated
|
||||
self.warc_records = warc_records
|
||||
self.do_not_archive = do_not_archive
|
||||
|
||||
@ -400,6 +403,8 @@ class SingleThreadedWarcProxy(http_server.HTTPServer, object):
|
||||
|
||||
if options.socket_timeout:
|
||||
WarcProxyHandler._socket_timeout = options.socket_timeout
|
||||
if options.max_resource_size:
|
||||
WarcProxyHandler._max_resource_size = options.max_resource_size
|
||||
|
||||
http_server.HTTPServer.__init__(
|
||||
self, server_address, WarcProxyHandler, bind_and_activate=True)
|
||||
|
Loading…
x
Reference in New Issue
Block a user