Merge branch 'do_not_archive' into qa

This commit is contained in:
Barbara Miller 2018-02-20 16:01:55 -08:00
commit f202f12bc5
7 changed files with 52 additions and 6 deletions

View File

@ -40,7 +40,7 @@ except:
setuptools.setup( setuptools.setup(
name='warcprox', name='warcprox',
version='2.4b2.dev149', version='2.4b2.dev150',
description='WARC writing MITM HTTP/S proxy', description='WARC writing MITM HTTP/S proxy',
url='https://github.com/internetarchive/warcprox', url='https://github.com/internetarchive/warcprox',
author='Noah Levitt', author='Noah Levitt',

View File

@ -181,6 +181,12 @@ class _TestHttpRequestHandler(http_server.BaseHTTPRequestHandler):
+ b'Content-Type: text/plain\r\n' + b'Content-Type: text/plain\r\n'
+ b'\r\n') + b'\r\n')
payload = b'This response is missing a Content-Length http header.' payload = b'This response is missing a Content-Length http header.'
elif self.path == '/300k-content':
payload = b'0123456789' * 30000
headers = (b'HTTP/1.1 200 OK\r\n'
+ b'Content-Type: text/plain\r\n'
+ b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n'
+ b'\r\n')
elif self.path.startswith('/test_payload_digest-'): elif self.path.startswith('/test_payload_digest-'):
content_body = ( content_body = (
b'Hello. How are you. I am the test_payload_digest ' b'Hello. How are you. I am the test_payload_digest '
@ -365,7 +371,8 @@ def warcprox_(request):
'--playback-port=0', '--playback-port=0',
'--onion-tor-socks-proxy=localhost:9050', '--onion-tor-socks-proxy=localhost:9050',
'--crawl-log-dir=crawl-logs', '--crawl-log-dir=crawl-logs',
'--socket-timeout=4'] '--socket-timeout=4',
'--max-resource-size=200000']
if request.config.getoption('--rethinkdb-dedup-url'): if request.config.getoption('--rethinkdb-dedup-url'):
argv.append('--rethinkdb-dedup-url=%s' % request.config.getoption('--rethinkdb-dedup-url')) argv.append('--rethinkdb-dedup-url=%s' % request.config.getoption('--rethinkdb-dedup-url'))
# test these here only # test these here only
@ -1211,6 +1218,23 @@ def test_missing_content_length(archiving_proxies, http_daemon, https_daemon, wa
# wait for postfetch chain # wait for postfetch chain
wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 2) wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 2)
def test_limit_large_resource(archiving_proxies, http_daemon, warcprox_):
    """Fetch a 300k body through the proxy and check that it is truncated.

    The `warcprox_` fixture runs with --max-resource-size=200000, so the
    download is expected to stop as soon as the recorded size passes that
    limit. Because warcprox reads the upstream response in 65536-byte
    chunks, truncation lands on a chunk boundary: 4 * 65536 = 262144 bytes.
    """
    initial_url_count = warcprox_.proxy.running_stats.urls
    target = 'http://localhost:%s/300k-content' % http_daemon.server_port
    resp = requests.get(
        target, proxies=archiving_proxies, verify=False, timeout=10)
    assert len(resp.content) == 262144
    # Block until the postfetch chain has processed this URL so its stats
    # cannot bleed into subsequent tests.
    wait(lambda: warcprox_.proxy.running_stats.urls - initial_url_count == 1)
def test_method_filter( def test_method_filter(
warcprox_, https_daemon, http_daemon, archiving_proxies, warcprox_, https_daemon, http_daemon, archiving_proxies,
playback_proxies): playback_proxies):

View File

@ -166,6 +166,9 @@ def _build_arg_parser(prog='warcprox'):
arg_parser.add_argument( arg_parser.add_argument(
'--socket-timeout', dest='socket_timeout', type=float, '--socket-timeout', dest='socket_timeout', type=float,
default=None, help=argparse.SUPPRESS) default=None, help=argparse.SUPPRESS)
arg_parser.add_argument(
'--max-resource-size', dest='max_resource_size', type=int,
default=None, help='maximum resource size limit in bytes')
arg_parser.add_argument( arg_parser.add_argument(
'--crawl-log-dir', dest='crawl_log_dir', default=None, help=( '--crawl-log-dir', dest='crawl_log_dir', default=None, help=(
'if specified, write crawl log files in the specified ' 'if specified, write crawl log files in the specified '

View File

@ -161,6 +161,7 @@ class ProxyingRecordingHTTPResponse(http_client.HTTPResponse):
self.fp = self.recorder self.fp = self.recorder
self.payload_digest = None self.payload_digest = None
self.truncated = None
def begin(self, extra_response_headers={}): def begin(self, extra_response_headers={}):
http_client.HTTPResponse.begin(self) # reads status line, headers http_client.HTTPResponse.begin(self) # reads status line, headers
@ -207,6 +208,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
''' '''
logger = logging.getLogger("warcprox.mitmproxy.MitmProxyHandler") logger = logging.getLogger("warcprox.mitmproxy.MitmProxyHandler")
_socket_timeout = 60 _socket_timeout = 60
_max_resource_size = None
def __init__(self, request, client_address, server): def __init__(self, request, client_address, server):
threading.current_thread().name = 'MitmProxyHandler(tid={},started={},client={}:{})'.format(warcprox.gettid(), datetime.datetime.utcnow().isoformat(), client_address[0], client_address[1]) threading.current_thread().name = 'MitmProxyHandler(tid={},started={},client={}:{})'.format(warcprox.gettid(), datetime.datetime.utcnow().isoformat(), client_address[0], client_address[1])
@ -431,6 +433,13 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
buf = prox_rec_res.read(65536) buf = prox_rec_res.read(65536)
while buf != b'': while buf != b'':
buf = prox_rec_res.read(65536) buf = prox_rec_res.read(65536)
if self._max_resource_size:
if prox_rec_res.recorder.len > self._max_resource_size:
prox_rec_res.truncated = b'length'
self.logger.error(
'Max resource size %d bytes exceeded for URL %s',
self._max_resource_size, self.url)
break
self.log_request(prox_rec_res.status, prox_rec_res.recorder.len) self.log_request(prox_rec_res.status, prox_rec_res.recorder.len)
finally: finally:

1
warcprox/version.txt Normal file
View File

@ -0,0 +1 @@
1.4-20160105052702-f79e744

View File

@ -67,7 +67,8 @@ class WarcRecordBuilder:
content_type=hanzo.httptools.ResponseMessage.CONTENT_TYPE, content_type=hanzo.httptools.ResponseMessage.CONTENT_TYPE,
remote_ip=recorded_url.remote_ip, remote_ip=recorded_url.remote_ip,
payload_digest=warcprox.digest_str( payload_digest=warcprox.digest_str(
recorded_url.payload_digest, self.base32)) recorded_url.payload_digest, self.base32),
truncated=recorded_url.truncated)
def build_warc_records(self, recorded_url): def build_warc_records(self, recorded_url):
"""Returns a tuple of hanzo.warctools.warc.WarcRecord (principal_record, ...)""" """Returns a tuple of hanzo.warctools.warc.WarcRecord (principal_record, ...)"""
@ -91,7 +92,7 @@ class WarcRecordBuilder:
def build_warc_record(self, url, warc_date=None, recorder=None, data=None, def build_warc_record(self, url, warc_date=None, recorder=None, data=None,
concurrent_to=None, warc_type=None, content_type=None, remote_ip=None, concurrent_to=None, warc_type=None, content_type=None, remote_ip=None,
profile=None, refers_to=None, refers_to_target_uri=None, profile=None, refers_to=None, refers_to_target_uri=None,
refers_to_date=None, payload_digest=None): refers_to_date=None, payload_digest=None, truncated=None):
if warc_date is None: if warc_date is None:
warc_date = warctools.warc.warc_datetime_str(datetime.datetime.utcnow()) warc_date = warctools.warc.warc_datetime_str(datetime.datetime.utcnow())
@ -120,6 +121,9 @@ class WarcRecordBuilder:
headers.append((warctools.WarcRecord.CONTENT_TYPE, content_type)) headers.append((warctools.WarcRecord.CONTENT_TYPE, content_type))
if payload_digest is not None: if payload_digest is not None:
headers.append((warctools.WarcRecord.PAYLOAD_DIGEST, payload_digest)) headers.append((warctools.WarcRecord.PAYLOAD_DIGEST, payload_digest))
# truncated value may be 'length' or 'time'
if truncated is not None:
headers.append((b'WARC-Truncated', truncated))
if recorder is not None: if recorder is not None:
headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(recorder)).encode('latin1'))) headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(recorder)).encode('latin1')))

View File

@ -221,7 +221,8 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
timestamp=timestamp, host=self.hostname, timestamp=timestamp, host=self.hostname,
duration=datetime.datetime.utcnow()-timestamp, duration=datetime.datetime.utcnow()-timestamp,
referer=self.headers.get('referer'), referer=self.headers.get('referer'),
payload_digest=prox_rec_res.payload_digest) payload_digest=prox_rec_res.payload_digest,
truncated=prox_rec_res.truncated)
self.server.recorded_url_q.put(recorded_url) self.server.recorded_url_q.put(recorded_url)
return recorded_url return recorded_url
@ -330,7 +331,8 @@ class RecordedUrl:
warcprox_meta=None, content_type=None, custom_type=None, warcprox_meta=None, content_type=None, custom_type=None,
status=None, size=None, client_ip=None, method=None, status=None, size=None, client_ip=None, method=None,
timestamp=None, host=None, duration=None, referer=None, timestamp=None, host=None, duration=None, referer=None,
payload_digest=None, warc_records=None, do_not_archive=False): payload_digest=None, truncated=None, warc_records=None,
do_not_archive=False):
# XXX should test what happens with non-ascii url (when does # XXX should test what happens with non-ascii url (when does
# url-encoding happen?) # url-encoding happen?)
if type(url) is not bytes: if type(url) is not bytes:
@ -369,6 +371,7 @@ class RecordedUrl:
self.duration = duration self.duration = duration
self.referer = referer self.referer = referer
self.payload_digest = payload_digest self.payload_digest = payload_digest
self.truncated = truncated
self.warc_records = warc_records self.warc_records = warc_records
self.do_not_archive = do_not_archive self.do_not_archive = do_not_archive
@ -400,6 +403,8 @@ class SingleThreadedWarcProxy(http_server.HTTPServer, object):
if options.socket_timeout: if options.socket_timeout:
WarcProxyHandler._socket_timeout = options.socket_timeout WarcProxyHandler._socket_timeout = options.socket_timeout
if options.max_resource_size:
WarcProxyHandler._max_resource_size = options.max_resource_size
http_server.HTTPServer.__init__( http_server.HTTPServer.__init__(
self, server_address, WarcProxyHandler, bind_and_activate=True) self, server_address, WarcProxyHandler, bind_and_activate=True)