From e0732ffaf497c757baedfa7c05b36acc09abc808 Mon Sep 17 00:00:00 2001
From: Adam Miller
Date: Mon, 29 Mar 2021 22:22:19 +0000
Subject: [PATCH 1/3] Checking for content type header consisting of only
 empty spaces and removing spaces from exception messages in JSON section

---
 warcprox/crawl_log.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/warcprox/crawl_log.py b/warcprox/crawl_log.py
index 6c847bb..4e67723 100644
--- a/warcprox/crawl_log.py
+++ b/warcprox/crawl_log.py
@@ -44,9 +44,9 @@ class CrawlLogger(object):
         status = self.get_artificial_status(recorded_url)
         extra_info = {'contentSize': recorded_url.size,} if recorded_url.size is not None and recorded_url.size > 0 else {}
         if hasattr(recorded_url, 'exception') and recorded_url.exception is not None:
-            extra_info['exception'] = str(recorded_url.exception)
+            extra_info['exception'] = str(recorded_url.exception).replace(" ", "_")
         if(hasattr(recorded_url, 'message') and recorded_url.message is not None):
-            extra_info['exceptionMessage'] = str(recorded_url.message)
+            extra_info['exceptionMessage'] = str(recorded_url.message).replace(" ", "_")
         if records:
             extra_info['warcFilename'] = records[0].warc_filename
             extra_info['warcFileOffset'] = records[0].offset
@@ -71,7 +71,7 @@ class CrawlLogger(object):
                 recorded_url.url,
                 '-', # hop path
                 recorded_url.referer or '-',
-                recorded_url.mimetype if recorded_url.mimetype is not None else '-',
+                recorded_url.mimetype if recorded_url.mimetype is not None and recorded_url.mimetype.strip() else '-',
                 '-',
                 '{:%Y%m%d%H%M%S}{:03d}+{:03d}'.format(
                     recorded_url.timestamp,

From 5f1c8c75fa251e0e98c1eb31c79890ef919c1bb8 Mon Sep 17 00:00:00 2001
From: Adam Miller
Date: Wed, 31 Mar 2021 23:22:04 +0000
Subject: [PATCH 2/3] Add test cases for space in content type header and
 exception messages

---
 tests/test_warcprox.py | 112 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 112 insertions(+)

diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py
index 447d2b8..c45e1e0 100755
--- a/tests/test_warcprox.py
+++ b/tests/test_warcprox.py
@@ -293,6 +293,12 @@ class _TestHttpRequestHandler(http_server.BaseHTTPRequestHandler):
             payload = chunkify(
                 b'Server closes connection when client expects next chunk')
             payload = payload[:-7]
+        elif self.path == '/space_in_content_type':
+            payload = b'test'
+            headers = (b'HTTP/1.1 200 OK\r\n'
+                       + b'Content-Type: \r\n'
+                       + b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n'
+                       + b'\r\n')
         else:
             payload = b'404 Not Found\n'
             headers = (b'HTTP/1.1 404 Not Found\r\n'
@@ -1992,6 +1998,112 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
     assert extra_info['contentSize'] == 38
     assert extra_info['method'] == 'WARCPROX_WRITE_RECORD'
 
+    # Empty space for Content-Type
+    url = 'http://localhost:%s/space_in_content_type' % http_daemon.server_port
+    headers = {'Warcprox-Meta': json.dumps({'warc-prefix': 'test_crawl_log_5'})}
+    response = requests.get(url, proxies=archiving_proxies, headers=headers)
+
+    # wait for postfetch chain
+    wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 6)
+
+    file = os.path.join(
+            warcprox_.options.crawl_log_dir,
+            'test_crawl_log_5-%s-%s.log' % (hostname, port))
+
+    assert os.path.exists(file)
+    crawl_log_5 = open(file, 'rb').read()
+    assert re.match(br'\A2[^\n]+\n\Z', crawl_log_5)
+    assert crawl_log_5[24:31] == b'   200 '
+    assert crawl_log_5[31:42] == b'         4 '
+    fields = crawl_log_5.split()
+    assert len(fields) == 13
+    assert fields[3].endswith(b'/space_in_content_type')
+    assert fields[4] == b'-'
+    assert fields[5] == b'-'
+    assert fields[6] == b'-'
+    assert fields[7] == b'-'
+    assert re.match(br'^\d{17}[+]\d{3}', fields[8])
+    assert fields[9] == b'sha1:a94a8fe5ccb19ba61c4c0873d391e987982fbbd3'
+    assert fields[10] == b'-'
+    assert fields[11] == b'-'
+    extra_info = json.loads(fields[12].decode('utf-8'))
+    assert set(extra_info.keys()) == {
+            'contentSize', 'warcFilename', 'warcFileOffset'}
+    assert extra_info['contentSize'] == 59
+
+
+    # Fetch Exception
+    url = 'http://localhost-doesnt-exist:%s/connection-error' % http_daemon.server_port
+    headers = {'Warcprox-Meta': json.dumps({'warc-prefix': 'test_crawl_log_6'})}
+    response = requests.get(url, proxies=archiving_proxies, headers=headers)
+
+    # Verify the connection is cleaned up properly after the exception
+    url = 'http://localhost:%s/b/aa' % http_daemon.server_port
+    response = requests.get(url, proxies=archiving_proxies)
+    assert response.status_code == 200
+
+    # wait for postfetch chain
+    wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 7)
+
+    file = os.path.join(
+            warcprox_.options.crawl_log_dir,
+            'test_crawl_log_6-%s-%s.log' % (hostname, port))
+
+    assert os.path.exists(file)
+    crawl_log_6 = open(file, 'rb').read()
+    assert re.match(br'\A2[^\n]+\n\Z', crawl_log_6)
+    assert crawl_log_6[24:31] == b'    -2 '
+    assert crawl_log_6[31:42] == b'         0 '
+    fields = crawl_log_6.split()
+    assert len(fields) == 13
+    assert fields[3].endswith(b'/connection-error')
+    assert fields[4] == b'-'
+    assert fields[5] == b'-'
+    assert fields[6] == b'-'
+    assert fields[7] == b'-'
+    assert fields[8] == b'-'
+    assert fields[9] == b'-'
+    assert fields[10] == b'-'
+    assert fields[11] == b'-'
+    extra_info = json.loads(fields[12].decode('utf-8'))
+    assert set(extra_info.keys()) == {'exception'}
+
+    # Test the same bad server to check for -404
+    url = 'http://localhost-doesnt-exist:%s/connection-error' % http_daemon.server_port
+    headers = {'Warcprox-Meta': json.dumps({'warc-prefix': 'test_crawl_log_7'})}
+    response = requests.get(url, proxies=archiving_proxies, headers=headers)
+
+    # Verify the connection is cleaned up properly after the exception
+    url = 'http://localhost:%s/b/aa' % http_daemon.server_port
+    response = requests.get(url, proxies=archiving_proxies)
+    assert response.status_code == 200
+
+    # wait for postfetch chain
+    wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 8)
+
+    file = os.path.join(
+            warcprox_.options.crawl_log_dir,
+            'test_crawl_log_7-%s-%s.log' % (hostname, port))
+
+    assert os.path.exists(file)
+    crawl_log_7 = open(file, 'rb').read()
+    assert re.match(br'\A2[^\n]+\n\Z', crawl_log_7)
+    assert crawl_log_7[24:31] == b'  -404 '
+    assert crawl_log_7[31:42] == b'         0 '
+    fields = crawl_log_7.split()
+    assert len(fields) == 13
+    assert fields[3].endswith(b'/connection-error')
+    assert fields[4] == b'-'
+    assert fields[5] == b'-'
+    assert fields[6] == b'-'
+    assert fields[7] == b'-'
+    assert fields[8] == b'-'
+    assert fields[9] == b'-'
+    assert fields[10] == b'-'
+    assert fields[11] == b'-'
+    extra_info = json.loads(fields[12].decode('utf-8'))
+    assert set(extra_info.keys()) == {'exception'}
+
 def test_long_warcprox_meta(
         warcprox_, http_daemon, archiving_proxies, playback_proxies):
     urls_before = warcprox_.proxy.running_stats.urls

From 7f406b794294248345c45faa5bb5bb2813f991e7 Mon Sep 17 00:00:00 2001
From: Adam Miller
Date: Thu, 1 Apr 2021 00:01:47 +0000
Subject: [PATCH 3/3] Trying to fix tests that only fail during CI

---
 tests/test_warcprox.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py
index c45e1e0..3a776ee 100755
--- a/tests/test_warcprox.py
+++ b/tests/test_warcprox.py
@@ -1487,7 +1487,7 @@ def test_missing_content_length(archiving_proxies, http_daemon, https_daemon, wa
     assert not 'content-length' in response.headers
 
     # wait for postfetch chain
-    wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 2)
+    wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 2, timeout=20)
 
 def test_limit_large_resource(archiving_proxies, http_daemon, warcprox_):
     """We try to load a 300k response but we use --max-resource-size=200000 in
@@ -2052,7 +2052,9 @@ def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
     assert os.path.exists(file)
     crawl_log_6 = open(file, 'rb').read()
     assert re.match(br'\A2[^\n]+\n\Z', crawl_log_6)
-    assert crawl_log_6[24:31] == b'    -2 '
+
+    # seems to vary depending on the environment
+    assert crawl_log_6[24:31] == b'    -6 ' or crawl_log_6[24:31] == b'    -2 '
     assert crawl_log_6[31:42] == b'         0 '
     fields = crawl_log_6.split()
     assert len(fields) == 13
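
Note, not part of the patches: the reason PATCH 1 replaces spaces with underscores is that a crawl log line is whitespace-delimited, which the len(fields) == 13 assertions in PATCH 2 rely on. A space inside the JSON extra-info field (e.g. in an exception message) would change the field count. The sketch below illustrates this with a hypothetical helper and example line; it is not code from the patches.

import json

def parse_crawl_log_line(line):
    # A crawl log line has exactly 13 whitespace-delimited fields; the
    # last one is the JSON "extra info" blob. Spaces inside exception
    # messages would break this split, hence the "_" replacement.
    fields = line.split()
    assert len(fields) == 13, 'expected 13 fields, got %d' % len(fields)
    return fields, json.loads(fields[12].decode('utf-8'))

# Hypothetical line resembling the test_crawl_log_6 case above:
line = (b'2021-03-31T23:22:04.000Z    -2         0'
        b' http://localhost-doesnt-exist:8000/connection-error'
        b' - - - - - - - -'
        b' {"exception":"Failed_to_establish_a_new_connection"}\n')
fields, extra_info = parse_crawl_log_line(line)
assert fields[3].endswith(b'/connection-error')
assert set(extra_info.keys()) == {'exception'}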
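
Likewise, PATCH 1 guards the mimetype column so that a Content-Type header that is missing or consists only of spaces is logged as '-'. The same guard, pulled out into a hypothetical standalone helper for illustration:

def mimetype_field(mimetype):
    # Same expression PATCH 1 adds inline in CrawlLogger: fall back to
    # '-' when the value is None or whitespace-only, so the column never
    # becomes an empty token in the space-delimited log line.
    return mimetype if mimetype is not None and mimetype.strip() else '-'

assert mimetype_field('text/plain') == 'text/plain'
assert mimetype_field(None) == '-'
assert mimetype_field('   ') == '-'  # the /space_in_content_type case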