diff --git a/setup.py b/setup.py
index 66e8a07..31ee69f 100755
--- a/setup.py
+++ b/setup.py
@@ -40,7 +40,7 @@ except:
 setuptools.setup(
         name='warcprox',
-        version='2.4b2.dev149',
+        version='2.4b2.dev150',
         description='WARC writing MITM HTTP/S proxy',
         url='https://github.com/internetarchive/warcprox',
         author='Noah Levitt',
diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py
index da36df3..f054d69 100755
--- a/tests/test_warcprox.py
+++ b/tests/test_warcprox.py
@@ -181,6 +181,12 @@ class _TestHttpRequestHandler(http_server.BaseHTTPRequestHandler):
                     + b'Content-Type: text/plain\r\n'
                     + b'\r\n')
             payload = b'This response is missing a Content-Length http header.'
+        elif self.path == '/300k-content':
+            payload = b'0123456789' * 30000
+            headers = (b'HTTP/1.1 200 OK\r\n'
+                       + b'Content-Type: text/plain\r\n'
+                       + b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n'
+                       + b'\r\n')
         elif self.path.startswith('/test_payload_digest-'):
             content_body = (
                     b'Hello. How are you. I am the test_payload_digest '
@@ -365,7 +371,8 @@ def warcprox_(request):
             '--playback-port=0',
             '--onion-tor-socks-proxy=localhost:9050',
             '--crawl-log-dir=crawl-logs',
-            '--socket-timeout=4']
+            '--socket-timeout=4',
+            '--max-resource-size=200000']
     if request.config.getoption('--rethinkdb-dedup-url'):
         argv.append('--rethinkdb-dedup-url=%s' % request.config.getoption('--rethinkdb-dedup-url'))
     # test these here only
@@ -1211,6 +1218,23 @@ def test_missing_content_length(archiving_proxies, http_daemon, https_daemon, wa
     # wait for postfetch chain
     wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 2)
 
+def test_limit_large_resource(archiving_proxies, http_daemon, warcprox_):
+    """We try to load a 300k response but we use --max-resource-size=200000 in
+    `warcprox_` so it will be truncated. We expect it to limit the result as
+    soon as it passes the 200000 limit. As warcprox read() chunk size is 65536,
+    the expected result size is 65536*4=262144.
+    """
+    urls_before = warcprox_.proxy.running_stats.urls
+
+    url = 'http://localhost:%s/300k-content' % http_daemon.server_port
+    response = requests.get(
+        url, proxies=archiving_proxies, verify=False, timeout=10)
+    assert len(response.content) == 262144
+
+    # wait for processing of this url to finish so that it doesn't interfere
+    # with subsequent tests
+    wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 1)
+
 def test_method_filter(
         warcprox_, https_daemon, http_daemon, archiving_proxies,
         playback_proxies):
diff --git a/warcprox/main.py b/warcprox/main.py
index 8d16d3b..64d01c7 100644
--- a/warcprox/main.py
+++ b/warcprox/main.py
@@ -166,6 +166,9 @@ def _build_arg_parser(prog='warcprox'):
     arg_parser.add_argument(
             '--socket-timeout', dest='socket_timeout', type=float,
             default=None, help=argparse.SUPPRESS)
+    arg_parser.add_argument(
+            '--max-resource-size', dest='max_resource_size', type=int,
+            default=None, help='maximum resource size limit in bytes')
     arg_parser.add_argument(
             '--crawl-log-dir', dest='crawl_log_dir', default=None, help=(
                 'if specified, write crawl log files in the specified '
diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py
index 1c40968..f0f8a77 100644
--- a/warcprox/mitmproxy.py
+++ b/warcprox/mitmproxy.py
@@ -161,6 +161,7 @@ class ProxyingRecordingHTTPResponse(http_client.HTTPResponse):
         self.fp = self.recorder
 
         self.payload_digest = None
+        self.truncated = None
 
     def begin(self, extra_response_headers={}):
         http_client.HTTPResponse.begin(self) # reads status line, headers
@@ -207,6 +208,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
     '''
     logger = logging.getLogger("warcprox.mitmproxy.MitmProxyHandler")
     _socket_timeout = 60
+    _max_resource_size = None
 
     def __init__(self, request, client_address, server):
         threading.current_thread().name = 'MitmProxyHandler(tid={},started={},client={}:{})'.format(warcprox.gettid(), datetime.datetime.utcnow().isoformat(), client_address[0], client_address[1])
@@ -431,6 +433,13 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
             buf = prox_rec_res.read(65536)
             while buf != b'':
                 buf = prox_rec_res.read(65536)
+                if self._max_resource_size:
+                    if prox_rec_res.recorder.len > self._max_resource_size:
+                        prox_rec_res.truncated = b'length'
+                        self.logger.error(
+                            'Max resource size %d bytes exceeded for URL %s',
+                            self._max_resource_size, self.url)
+                        break
 
             self.log_request(prox_rec_res.status, prox_rec_res.recorder.len)
         finally:
diff --git a/warcprox/version.txt b/warcprox/version.txt
new file mode 100644
index 0000000..5c2dcd5
--- /dev/null
+++ b/warcprox/version.txt
@@ -0,0 +1 @@
+1.4-20160105052702-f79e744
diff --git a/warcprox/warc.py b/warcprox/warc.py
index a929a73..708366b 100644
--- a/warcprox/warc.py
+++ b/warcprox/warc.py
@@ -67,7 +67,8 @@ class WarcRecordBuilder:
                 content_type=hanzo.httptools.ResponseMessage.CONTENT_TYPE,
                 remote_ip=recorded_url.remote_ip,
                 payload_digest=warcprox.digest_str(
-                    recorded_url.payload_digest, self.base32))
+                    recorded_url.payload_digest, self.base32),
+                truncated=recorded_url.truncated)
 
     def build_warc_records(self, recorded_url):
         """Returns a tuple of hanzo.warctools.warc.WarcRecord (principal_record, ...)"""
@@ -91,7 +92,7 @@ class WarcRecordBuilder:
     def build_warc_record(self, url, warc_date=None, recorder=None, data=None,
             concurrent_to=None, warc_type=None, content_type=None, remote_ip=None,
             profile=None, refers_to=None, refers_to_target_uri=None,
-            refers_to_date=None, payload_digest=None):
+            refers_to_date=None, payload_digest=None, truncated=None):
 
         if warc_date is None:
             warc_date = warctools.warc.warc_datetime_str(datetime.datetime.utcnow())
@@ -120,6 +121,9 @@ class WarcRecordBuilder:
             headers.append((warctools.WarcRecord.CONTENT_TYPE, content_type))
         if payload_digest is not None:
             headers.append((warctools.WarcRecord.PAYLOAD_DIGEST, payload_digest))
+        # truncated value may be 'length' or 'time'
+        if truncated is not None:
+            headers.append((b'WARC-Truncated', truncated))
         if recorder is not None:
             headers.append((warctools.WarcRecord.CONTENT_LENGTH,
                 str(len(recorder)).encode('latin1')))
diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py
index e55b295..5b42655 100644
--- a/warcprox/warcproxy.py
+++ b/warcprox/warcproxy.py
@@ -221,7 +221,8 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
                 timestamp=timestamp, host=self.hostname,
                 duration=datetime.datetime.utcnow()-timestamp,
                 referer=self.headers.get('referer'),
-                payload_digest=prox_rec_res.payload_digest)
+                payload_digest=prox_rec_res.payload_digest,
+                truncated=prox_rec_res.truncated)
         self.server.recorded_url_q.put(recorded_url)
 
         return recorded_url
@@ -330,7 +331,8 @@ class RecordedUrl:
             warcprox_meta=None, content_type=None, custom_type=None,
             status=None, size=None, client_ip=None, method=None,
             timestamp=None, host=None, duration=None, referer=None,
-            payload_digest=None, warc_records=None, do_not_archive=False):
+            payload_digest=None, truncated=None, warc_records=None,
+            do_not_archive=False):
         # XXX should test what happens with non-ascii url (when does
         # url-encoding happen?)
         if type(url) is not bytes:
@@ -369,6 +371,7 @@ class RecordedUrl:
         self.duration = duration
         self.referer = referer
         self.payload_digest = payload_digest
+        self.truncated = truncated
         self.warc_records = warc_records
         self.do_not_archive = do_not_archive
 
@@ -400,6 +403,8 @@ class SingleThreadedWarcProxy(http_server.HTTPServer, object):
 
         if options.socket_timeout:
             WarcProxyHandler._socket_timeout = options.socket_timeout
+        if options.max_resource_size:
+            WarcProxyHandler._max_resource_size = options.max_resource_size
         http_server.HTTPServer.__init__(
                 self, server_address, WarcProxyHandler, bind_and_activate=True)
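
Editor's note (not part of the patch): a minimal standalone sketch of the truncation arithmetic that test_limit_large_resource relies on. The proxy drains the upstream response in 65536-byte chunks and stops as soon as the recorded length passes --max-resource-size, so a 300000-byte body with a 200000-byte limit is cut at the next chunk boundary, 65536*4 = 262144 bytes. The helper name read_limited and the use of io.BytesIO below are illustrative only and do not appear in warcprox.

    import io

    CHUNK = 65536  # same chunk size the proxy loop uses when draining the response

    def read_limited(fp, max_resource_size):
        # Read fp in CHUNK-sized pieces, stopping once the running total passes
        # the limit, loosely mirroring the check added to
        # MitmProxyHandler._proxy_request in the diff above.
        recorded = bytearray()
        truncated = None
        buf = fp.read(CHUNK)
        while buf != b'':
            recorded.extend(buf)
            if max_resource_size and len(recorded) > max_resource_size:
                truncated = b'length'  # value later emitted as the WARC-Truncated header
                break
            buf = fp.read(CHUNK)
        return bytes(recorded), truncated

    body, truncated = read_limited(io.BytesIO(b'0123456789' * 30000), 200000)
    assert len(body) == 262144 and truncated == b'length'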