mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
deal with bad content-type header
Bad data got into a crawl log because a URL returned a Content-Type header value that contained spaces (but no semicolon).
This commit is contained in:
parent
f207e32f50
commit
3298128e0c
2
setup.py
2
setup.py
@ -42,7 +42,7 @@ except:
|
|||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='warcprox',
|
name='warcprox',
|
||||||
version='2.4.5',
|
version='2.4.6',
|
||||||
description='WARC writing MITM HTTP/S proxy',
|
description='WARC writing MITM HTTP/S proxy',
|
||||||
url='https://github.com/internetarchive/warcprox',
|
url='https://github.com/internetarchive/warcprox',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
@ -47,6 +47,7 @@ from urllib3 import PoolManager
|
|||||||
import tempfile
|
import tempfile
|
||||||
import hashlib
|
import hashlib
|
||||||
import doublethink
|
import doublethink
|
||||||
|
import re
|
||||||
|
|
||||||
class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
|
class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
|
||||||
'''
|
'''
|
||||||
@ -387,9 +388,8 @@ class RecordedUrl:
|
|||||||
|
|
||||||
self.mimetype = content_type
|
self.mimetype = content_type
|
||||||
if self.mimetype:
|
if self.mimetype:
|
||||||
n = self.mimetype.find(";")
|
# chop off any parameters (e.g. "; charset=..."), and ensure there's no whitespace
|
||||||
if n >= 0:
|
self.mimetype = re.split(r'[;\s]', self.mimetype, 2)[0]
|
||||||
self.mimetype = self.mimetype[:n]
|
|
||||||
|
|
||||||
self.custom_type = custom_type
|
self.custom_type = custom_type
|
||||||
self.status = status
|
self.status = status
|
||||||
|
Loading…
x
Reference in New Issue
Block a user