From 3298128e0ce21f73dfc242166ff1f06073f3282c Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 24 Apr 2019 10:40:22 -0700 Subject: [PATCH] deal with bad content-type header we had bad stuff get into a crawl log because of a url that returned a Content-Type header value with spaces in it (but no semicolon) --- setup.py | 2 +- warcprox/warcproxy.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index b635ff1..e762b13 100755 --- a/setup.py +++ b/setup.py @@ -42,7 +42,7 @@ except: setuptools.setup( name='warcprox', - version='2.4.5', + version='2.4.6', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 8066ace..4a5312e 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -47,6 +47,7 @@ from urllib3 import PoolManager import tempfile import hashlib import doublethink +import re class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): ''' @@ -387,9 +388,8 @@ class RecordedUrl: self.mimetype = content_type if self.mimetype: - n = self.mimetype.find(";") - if n >= 0: - self.mimetype = self.mimetype[:n] + # chop off parameters, and ensure there's no whitespace + self.mimetype = re.split(r'[;\s]', self.mimetype, 2)[0] self.custom_type = custom_type self.status = status