1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

chardet optimization: using chardet feed() approach to avoid passing in entire buffer

This commit is contained in:
Ilya Kreymer 2014-03-17 20:53:42 -07:00
parent d1ad9b5e69
commit a6b4ae4c47

View File

@ -1,6 +1,8 @@
import chardet
#import chardet
import pkgutil
import yaml
from chardet.universaldetector import UniversalDetector
from io import BytesIO
from header_rewriter import RewrittenStatusAndHeaders
@ -151,11 +153,31 @@ class RewriteContent:
return buff
#def _detect_charset(self, stream):
# buff = stream.read(8192)
# result = chardet.detect(buff)
# print "chardet result: " + str(result)
# return (result['encoding'], buff)
def _detect_charset(self, stream):
# Guess the character encoding of the leading bytes of `stream` using chardet.
# Both variants below issue a single stream.read(8192) call and return a
# 2-tuple (detected_encoding, bytes_that_were_read), so the caller still has
# the bytes that were consumed from the stream during detection.
#
# NOTE(review): this listing has lost its indentation and its +/- diff
# markers. Judging by the hunk header (-151,11 +153,31) and the commit
# message, the next four lines appear to be the REMOVED one-shot
# chardet.detect() body and everything after them the ADDED incremental
# UniversalDetector body -- confirm against the repository before relying
# on this reading.
#
# One-shot variant: the whole 8192-byte read is handed to chardet.detect().
buff = stream.read(8192)
result = chardet.detect(buff)
# Python 2 print statement; writes the raw chardet result dict to stdout.
print "chardet result: " + str(result)
return (result['encoding'], buff)
# Incremental variant: read the same 8192 bytes once, then wrap them in an
# in-memory BytesIO so they can be fed to the detector in small pieces.
full_buff = stream.read(8192)
io_buff = BytesIO(full_buff)
detector = UniversalDetector()
try:
# Feed 256 bytes at a time; an empty read ends the loop once the buffer is
# exhausted.
buff = io_buff.read(256)
while buff:
detector.feed(buff)
# Stop as soon as the detector sets its `done` flag, so the rest of the
# buffer is never scanned.
if detector.done:
break
buff = io_buff.read(256)
finally:
# close() runs even if feed() raises; presumably this is what finalizes
# detector.result per the chardet API -- TODO confirm.
detector.close()
# Python 2 print statement; writes the raw detector result dict to stdout.
print "chardet result: " + str(detector.result)
# Return the complete bytes read from the stream (not just the last chunk
# fed to the detector).
return (detector.result['encoding'], full_buff)
# Create a generator reading from a stream,
# with optional rewriting and final read call