deal with bad content-type header

We had bad data get into a crawl log because a URL returned a
Content-Type header value that contained spaces (but no semicolon).
This commit is contained in:
Noah Levitt 2019-04-24 10:40:22 -07:00
parent f207e32f50
commit 3298128e0c
2 changed files with 4 additions and 4 deletions

View File

@ -42,7 +42,7 @@ except:
setuptools.setup(
name='warcprox',
version='2.4.5',
version='2.4.6',
description='WARC writing MITM HTTP/S proxy',
url='https://github.com/internetarchive/warcprox',
author='Noah Levitt',

View File

@ -47,6 +47,7 @@ from urllib3 import PoolManager
import tempfile
import hashlib
import doublethink
import re
class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
'''
@ -387,9 +388,8 @@ class RecordedUrl:
self.mimetype = content_type
if self.mimetype:
n = self.mimetype.find(";")
if n >= 0:
self.mimetype = self.mimetype[:n]
# chop off any parameters (e.g. charset), and ensure there's no whitespace
self.mimetype = re.split(r'[;\s]', self.mimetype, 2)[0]
self.custom_type = custom_type
self.status = status