diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py
index 791ebeb..f9c795d 100755
--- a/tests/test_warcprox.py
+++ b/tests/test_warcprox.py
@@ -293,6 +293,12 @@ class _TestHttpRequestHandler(http_server.BaseHTTPRequestHandler):
             payload = chunkify(
                 b'Server closes connection when client expects next chunk')
             payload = payload[:-7]
+        elif self.path == '/space_in_content_type':
+            payload = b'test'
+            headers = (b'HTTP/1.1 200 OK\r\n'
+                       + b'Content-Type: \r\n'
+                       + b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n'
+                       + b'\r\n')
         else:
             payload = b'404 Not Found\n'
             headers = (b'HTTP/1.1 404 Not Found\r\n'
@@ -1481,7 +1487,7 @@ def test_missing_content_length(archiving_proxies, http_daemon, https_daemon, wa
     assert not 'content-length' in response.headers

     # wait for postfetch chain
-    wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 2)
+    wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 2, timeout=20)

 def test_limit_large_resource(archiving_proxies, http_daemon, warcprox_):
     """We try to load a 300k response but we use --max-resource-size=200000 in
@@ -1993,6 +1999,114 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
     assert extra_info['contentSize'] == 38
     assert extra_info['method'] == 'WARCPROX_WRITE_RECORD'

+    # Content-Type containing only a space
+    url = 'http://localhost:%s/space_in_content_type' % http_daemon.server_port
+    headers = {'Warcprox-Meta': json.dumps({'warc-prefix': 'test_crawl_log_5'})}
+    response = requests.get(url, proxies=archiving_proxies, headers=headers)
+
+    # wait for postfetch chain
+    wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 6)
+
+    file = os.path.join(
+            warcprox_.options.crawl_log_dir,
+            'test_crawl_log_5-%s-%s.log' % (hostname, port))
+
+    assert os.path.exists(file)
+    crawl_log_5 = open(file, 'rb').read()
+    assert re.match(br'\A2[^\n]+\n\Z', crawl_log_5)
+    assert crawl_log_5[24:31] == b'   200 '
+    assert crawl_log_5[31:42] == b'         4 '
+    fields = crawl_log_5.split()
+    assert len(fields) == 13
+    assert fields[3].endswith(b'/space_in_content_type')
+    assert fields[4] == b'-'
+    assert fields[5] == b'-'
+    assert fields[6] == b'-'
+    assert fields[7] == b'-'
+    assert re.match(br'^\d{17}[+]\d{3}', fields[8])
+    assert fields[9] == b'sha1:a94a8fe5ccb19ba61c4c0873d391e987982fbbd3'
+    assert fields[10] == b'-'
+    assert fields[11] == b'-'
+    extra_info = json.loads(fields[12].decode('utf-8'))
+    assert set(extra_info.keys()) == {
+            'contentSize', 'warcFilename', 'warcFileOffset'}
+    assert extra_info['contentSize'] == 59
+
+
+    # fetch exception
+    url = 'http://localhost-doesnt-exist:%s/connection-error' % http_daemon.server_port
+    headers = {'Warcprox-Meta': json.dumps({'warc-prefix': 'test_crawl_log_6'})}
+    response = requests.get(url, proxies=archiving_proxies, headers=headers)
+
+    # verify the connection is cleaned up properly after the exception
+    url = 'http://localhost:%s/b/aa' % http_daemon.server_port
+    response = requests.get(url, proxies=archiving_proxies)
+    assert response.status_code == 200
+
+    # wait for postfetch chain
+    wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 7)
+
+    file = os.path.join(
+            warcprox_.options.crawl_log_dir,
+            'test_crawl_log_6-%s-%s.log' % (hostname, port))
+
+    assert os.path.exists(file)
+    crawl_log_6 = open(file, 'rb').read()
+    assert re.match(br'\A2[^\n]+\n\Z', crawl_log_6)
+
+    # artificial status code seems to vary depending on the environment
+    assert crawl_log_6[24:31] == b'    -6 ' or crawl_log_6[24:31] == b'    -2 '
+    assert crawl_log_6[31:42] == b'         0 '
+    fields = crawl_log_6.split()
+    assert len(fields) == 13
+    assert fields[3].endswith(b'/connection-error')
+    assert fields[4] == b'-'
+    assert fields[5] == b'-'
+    assert fields[6] == b'-'
+    assert fields[7] == b'-'
+    assert fields[8] == b'-'
+    assert fields[9] == b'-'
+    assert fields[10] == b'-'
+    assert fields[11] == b'-'
+    extra_info = json.loads(fields[12].decode('utf-8'))
+    assert set(extra_info.keys()) == {'exception'}
+
+    # hit the same bad server again to check for -404
+    url = 'http://localhost-doesnt-exist:%s/connection-error' % http_daemon.server_port
+    headers = {'Warcprox-Meta': json.dumps({'warc-prefix': 'test_crawl_log_7'})}
+    response = requests.get(url, proxies=archiving_proxies, headers=headers)
+
+    # verify the connection is cleaned up properly after the exception
+    url = 'http://localhost:%s/b/aa' % http_daemon.server_port
+    response = requests.get(url, proxies=archiving_proxies)
+    assert response.status_code == 200
+
+    # wait for postfetch chain
+    wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 8)
+
+    file = os.path.join(
+            warcprox_.options.crawl_log_dir,
+            'test_crawl_log_7-%s-%s.log' % (hostname, port))
+
+    assert os.path.exists(file)
+    crawl_log_7 = open(file, 'rb').read()
+    assert re.match(br'\A2[^\n]+\n\Z', crawl_log_7)
+    assert crawl_log_7[24:31] == b'  -404 '
+    assert crawl_log_7[31:42] == b'         0 '
+    fields = crawl_log_7.split()
+    assert len(fields) == 13
+    assert fields[3].endswith(b'/connection-error')
+    assert fields[4] == b'-'
+    assert fields[5] == b'-'
+    assert fields[6] == b'-'
+    assert fields[7] == b'-'
+    assert fields[8] == b'-'
+    assert fields[9] == b'-'
+    assert fields[10] == b'-'
+    assert fields[11] == b'-'
+    extra_info = json.loads(fields[12].decode('utf-8'))
+    assert set(extra_info.keys()) == {'exception'}
+
 def test_long_warcprox_meta(
         warcprox_, http_daemon, archiving_proxies, playback_proxies):
     urls_before = warcprox_.proxy.running_stats.urls
diff --git a/warcprox/crawl_log.py b/warcprox/crawl_log.py
index 6c847bb..4e67723 100644
--- a/warcprox/crawl_log.py
+++ b/warcprox/crawl_log.py
@@ -44,9 +44,9 @@ class CrawlLogger(object):
         status = self.get_artificial_status(recorded_url)
         extra_info = {'contentSize': recorded_url.size,} if recorded_url.size is not None and recorded_url.size > 0 else {}
         if hasattr(recorded_url, 'exception') and recorded_url.exception is not None:
-            extra_info['exception'] = str(recorded_url.exception)
+            extra_info['exception'] = str(recorded_url.exception).replace(" ", "_")
         if(hasattr(recorded_url, 'message') and recorded_url.message is not None):
-            extra_info['exceptionMessage'] = str(recorded_url.message)
+            extra_info['exceptionMessage'] = str(recorded_url.message).replace(" ", "_")
         if records:
             extra_info['warcFilename'] = records[0].warc_filename
             extra_info['warcFileOffset'] = records[0].offset
@@ -71,7 +71,7 @@ class CrawlLogger(object):
             recorded_url.url,
             '-', # hop path
             recorded_url.referer or '-',
-            recorded_url.mimetype if recorded_url.mimetype is not None else '-',
+            recorded_url.mimetype if recorded_url.mimetype is not None and recorded_url.mimetype.strip() else '-',
             '-',
             '{:%Y%m%d%H%M%S}{:03d}+{:03d}'.format(
                 recorded_url.timestamp,
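
Side note (not part of the patch): both crawl_log.py changes defend the same invariant that the tests above assert, namely that every crawl log line splits into exactly 13 whitespace-delimited fields. The sketch below is a standalone illustration of that invariant using hypothetical helper names, not warcprox's actual code:

    def normalize_mimetype(mimetype):
        # None or whitespace-only (e.g. the 'Content-Type: ' header
        # exercised by /space_in_content_type) collapses to '-'
        return mimetype if mimetype is not None and mimetype.strip() else '-'

    def normalize_exception_text(text):
        # spaces inside the exception text would split one logical
        # field into several when the line is parsed with split()
        return str(text).replace(' ', '_')

    fields = [
        '2019-09-01T00:00:00.000Z',     # log timestamp
        '-6',                           # artificial status for a fetch failure
        '0',                            # content size
        'http://localhost-doesnt-exist/connection-error',
        '-',                            # hop path
        '-',                            # referer
        normalize_mimetype(' '),        # whitespace-only Content-Type -> '-'
        '-', '-', '-', '-', '-',
        '{"exception":"%s"}' % normalize_exception_text(
            'Name or service not known'),
    ]
    line = ' '.join(fields)
    assert len(line.split()) == 13      # same invariant the tests assert

Replacing spaces with underscores keeps the JSON extra-info blob a single split() token, and mapping a whitespace-only mimetype to '-' keeps the mimetype column from vanishing and shifting every later field.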