Merge branch 'do_not_archive' into qa

Barbara Miller 2018-02-20 16:01:55 -08:00
commit f202f12bc5
7 changed files with 52 additions and 6 deletions

setup.py

@@ -40,7 +40,7 @@ except:
setuptools.setup(
name='warcprox',
-version='2.4b2.dev149',
+version='2.4b2.dev150',
description='WARC writing MITM HTTP/S proxy',
url='https://github.com/internetarchive/warcprox',
author='Noah Levitt',

tests/test_warcprox.py

@@ -181,6 +181,12 @@ class _TestHttpRequestHandler(http_server.BaseHTTPRequestHandler):
+ b'Content-Type: text/plain\r\n'
+ b'\r\n')
payload = b'This response is missing a Content-Length http header.'
+elif self.path == '/300k-content':
+payload = b'0123456789' * 30000
+headers = (b'HTTP/1.1 200 OK\r\n'
++ b'Content-Type: text/plain\r\n'
++ b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n'
++ b'\r\n')
elif self.path.startswith('/test_payload_digest-'):
content_body = (
b'Hello. How are you. I am the test_payload_digest '
@@ -365,7 +371,8 @@ def warcprox_(request):
'--playback-port=0',
'--onion-tor-socks-proxy=localhost:9050',
'--crawl-log-dir=crawl-logs',
-'--socket-timeout=4']
+'--socket-timeout=4',
+'--max-resource-size=200000']
if request.config.getoption('--rethinkdb-dedup-url'):
argv.append('--rethinkdb-dedup-url=%s' % request.config.getoption('--rethinkdb-dedup-url'))
# test these here only
@@ -1211,6 +1218,23 @@ def test_missing_content_length(archiving_proxies, http_daemon, https_daemon, wa
# wait for postfetch chain
wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 2)
+def test_limit_large_resource(archiving_proxies, http_daemon, warcprox_):
+"""We try to load a 300k response but we use --max-resource-size=200000 in
+`warcprox_` so it will be truncated. We expect it to limit the result as
+soon as it passes the 200000 limit. As warcprox read() chunk size is 65536,
+the expected result size is 65536*4=262144.
+"""
+urls_before = warcprox_.proxy.running_stats.urls
+url = 'http://localhost:%s/300k-content' % http_daemon.server_port
+response = requests.get(
+url, proxies=archiving_proxies, verify=False, timeout=10)
+assert len(response.content) == 262144
+# wait for processing of this url to finish so that it doesn't interfere
+# with subsequent tests
+wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1)
def test_method_filter(
warcprox_, https_daemon, http_daemon, archiving_proxies,
playback_proxies):
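
For reference, the 262144 in the new test's docstring follows directly from the chunked read: warcprox pulls the body in 65536-byte pieces and stops at the first chunk boundary past the 200000-byte limit. A minimal sketch of that arithmetic (plain Python, not part of the commit):

    CHUNK = 65536    # warcprox read() chunk size cited in the docstring
    LIMIT = 200000   # --max-resource-size passed by the warcprox_ fixture

    # smallest multiple of CHUNK strictly greater than LIMIT
    expected = (LIMIT // CHUNK + 1) * CHUNK
    assert expected == 65536 * 4 == 262144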

warcprox/main.py

@@ -166,6 +166,9 @@ def _build_arg_parser(prog='warcprox'):
arg_parser.add_argument(
'--socket-timeout', dest='socket_timeout', type=float,
default=None, help=argparse.SUPPRESS)
+arg_parser.add_argument(
+'--max-resource-size', dest='max_resource_size', type=int,
+default=None, help='maximum resource size limit in bytes')
arg_parser.add_argument(
'--crawl-log-dir', dest='crawl_log_dir', default=None, help=(
'if specified, write crawl log files in the specified '
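
The new flag is a plain argparse option: an int defaulting to None, so no limit is enforced unless the operator asks for one. A standalone sketch of how it parses (a bare ArgumentParser here, not warcprox's full _build_arg_parser):

    import argparse

    parser = argparse.ArgumentParser(prog='warcprox')
    parser.add_argument(
            '--max-resource-size', dest='max_resource_size', type=int,
            default=None, help='maximum resource size limit in bytes')

    args = parser.parse_args(['--max-resource-size=200000'])
    print(args.max_resource_size)   # 200000; stays None when the flag is omitted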

warcprox/mitmproxy.py

@@ -161,6 +161,7 @@ class ProxyingRecordingHTTPResponse(http_client.HTTPResponse):
self.fp = self.recorder
self.payload_digest = None
+self.truncated = None
def begin(self, extra_response_headers={}):
http_client.HTTPResponse.begin(self) # reads status line, headers
@@ -207,6 +208,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
'''
logger = logging.getLogger("warcprox.mitmproxy.MitmProxyHandler")
_socket_timeout = 60
+_max_resource_size = None
def __init__(self, request, client_address, server):
threading.current_thread().name = 'MitmProxyHandler(tid={},started={},client={}:{})'.format(warcprox.gettid(), datetime.datetime.utcnow().isoformat(), client_address[0], client_address[1])
@@ -431,6 +433,13 @@
buf = prox_rec_res.read(65536)
while buf != b'':
buf = prox_rec_res.read(65536)
+if self._max_resource_size:
+if prox_rec_res.recorder.len > self._max_resource_size:
+prox_rec_res.truncated = b'length'
+self.logger.error(
+'Max resource size %d bytes exceeded for URL %s',
+self._max_resource_size, self.url)
+break
self.log_request(prox_rec_res.status, prox_rec_res.recorder.len)
finally:
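
This hunk is the enforcement point: the proxy keeps relaying 65536-byte chunks and bails out as soon as the recorder's running length passes _max_resource_size, marking the response as truncated by length. A self-contained sketch of the pattern (toy code, not the real ProxyingRecordingHTTPResponse):

    import io

    CHUNK = 65536

    def read_with_limit(fp, max_resource_size):
        """Read fp in CHUNK-sized pieces; stop once the total passes the limit."""
        total, truncated = 0, None
        buf = fp.read(CHUNK)
        while buf != b'':
            total += len(buf)
            if max_resource_size and total > max_resource_size:
                truncated = b'length'   # mirrors prox_rec_res.truncated = b'length'
                break
            buf = fp.read(CHUNK)
        return total, truncated

    body = io.BytesIO(b'0123456789' * 30000)   # the 300,000-byte test payload
    print(read_with_limit(body, 200000))       # (262144, b'length')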

warcprox/version.txt (new file)

@@ -0,0 +1 @@
+1.4-20160105052702-f79e744

warcprox/warc.py

@@ -67,7 +67,8 @@ class WarcRecordBuilder:
content_type=hanzo.httptools.ResponseMessage.CONTENT_TYPE,
remote_ip=recorded_url.remote_ip,
payload_digest=warcprox.digest_str(
-recorded_url.payload_digest, self.base32))
+recorded_url.payload_digest, self.base32),
+truncated=recorded_url.truncated)
def build_warc_records(self, recorded_url):
"""Returns a tuple of hanzo.warctools.warc.WarcRecord (principal_record, ...)"""
@@ -91,7 +92,7 @@ class WarcRecordBuilder:
def build_warc_record(self, url, warc_date=None, recorder=None, data=None,
concurrent_to=None, warc_type=None, content_type=None, remote_ip=None,
profile=None, refers_to=None, refers_to_target_uri=None,
-refers_to_date=None, payload_digest=None):
+refers_to_date=None, payload_digest=None, truncated=None):
if warc_date is None:
warc_date = warctools.warc.warc_datetime_str(datetime.datetime.utcnow())
@@ -120,6 +121,9 @@
headers.append((warctools.WarcRecord.CONTENT_TYPE, content_type))
if payload_digest is not None:
headers.append((warctools.WarcRecord.PAYLOAD_DIGEST, payload_digest))
+# truncated value may be 'length' or 'time'
+if truncated is not None:
+headers.append((b'WARC-Truncated', truncated))
if recorder is not None:
headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(recorder)).encode('latin1')))
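
With this change a record cut short by the size limit carries a WARC-Truncated header next to the usual ones. A rough sketch of the resulting header list, using literal byte strings where build_warc_record uses the warctools.WarcRecord constants (the digest value is made up for illustration):

    def sketch_headers(payload_digest=None, truncated=None):
        headers = [(b'WARC-Type', b'response')]
        if payload_digest is not None:
            headers.append((b'WARC-Payload-Digest', payload_digest))
        # per the comment above, truncated is b'length' or b'time' when set
        if truncated is not None:
            headers.append((b'WARC-Truncated', truncated))
        return headers

    print(sketch_headers(payload_digest=b'sha1:...', truncated=b'length'))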

warcprox/warcproxy.py

@@ -221,7 +221,8 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
timestamp=timestamp, host=self.hostname,
duration=datetime.datetime.utcnow()-timestamp,
referer=self.headers.get('referer'),
-payload_digest=prox_rec_res.payload_digest)
+payload_digest=prox_rec_res.payload_digest,
+truncated=prox_rec_res.truncated)
self.server.recorded_url_q.put(recorded_url)
return recorded_url
@@ -330,7 +331,8 @@ class RecordedUrl:
warcprox_meta=None, content_type=None, custom_type=None,
status=None, size=None, client_ip=None, method=None,
timestamp=None, host=None, duration=None, referer=None,
-payload_digest=None, warc_records=None, do_not_archive=False):
+payload_digest=None, truncated=None, warc_records=None,
+do_not_archive=False):
# XXX should test what happens with non-ascii url (when does
# url-encoding happen?)
if type(url) is not bytes:
@@ -369,6 +371,7 @@
self.duration = duration
self.referer = referer
self.payload_digest = payload_digest
+self.truncated = truncated
self.warc_records = warc_records
self.do_not_archive = do_not_archive
@@ -400,6 +403,8 @@ class SingleThreadedWarcProxy(http_server.HTTPServer, object):
if options.socket_timeout:
WarcProxyHandler._socket_timeout = options.socket_timeout
+if options.max_resource_size:
+WarcProxyHandler._max_resource_size = options.max_resource_size
http_server.HTTPServer.__init__(
self, server_address, WarcProxyHandler, bind_and_activate=True)
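
As with --socket-timeout just above it, the option is applied by assigning to a class attribute on the handler, so every per-connection WarcProxyHandler instance created by the HTTPServer sees the same limit without extra plumbing. A toy illustration of that pattern (stand-in classes, not warcprox's):

    class ToyHandler:
        _socket_timeout = 60
        _max_resource_size = None   # class-level default shared by all instances

    class ToyOptions:
        socket_timeout = None
        max_resource_size = 200000

    options = ToyOptions()
    if options.socket_timeout:
        ToyHandler._socket_timeout = options.socket_timeout
    if options.max_resource_size:
        ToyHandler._max_resource_size = options.max_resource_size

    print(ToyHandler()._max_resource_size)   # 200000: instances read the class value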