mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
Deal with bad Content-Type header
We had bad data get into a crawl log because a URL returned a Content-Type header value with spaces in it (but no semicolon).
This commit is contained in:
parent
f207e32f50
commit
3298128e0c
2
setup.py
2
setup.py
@ -42,7 +42,7 @@ except:
|
||||
|
||||
setuptools.setup(
|
||||
name='warcprox',
|
||||
version='2.4.5',
|
||||
version='2.4.6',
|
||||
description='WARC writing MITM HTTP/S proxy',
|
||||
url='https://github.com/internetarchive/warcprox',
|
||||
author='Noah Levitt',
|
||||
|
@ -47,6 +47,7 @@ from urllib3 import PoolManager
|
||||
import tempfile
|
||||
import hashlib
|
||||
import doublethink
|
||||
import re
|
||||
|
||||
class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
|
||||
'''
|
||||
@ -387,9 +388,8 @@ class RecordedUrl:
|
||||
|
||||
self.mimetype = content_type
|
||||
if self.mimetype:
|
||||
n = self.mimetype.find(";")
|
||||
if n >= 0:
|
||||
self.mimetype = self.mimetype[:n]
|
||||
# chop off any parameters (e.g. "; charset=..."), and ensure there's no whitespace
|
||||
self.mimetype = re.split(r'[;\s]', self.mimetype, 2)[0]
|
||||
|
||||
self.custom_type = custom_type
|
||||
self.status = status
|
||||
|
Loading…
x
Reference in New Issue
Block a user