1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

chardet optimization: using chardet feed() approach to avoid passing in entire buffer

This commit is contained in:
Ilya Kreymer 2014-03-17 20:53:42 -07:00
parent d1ad9b5e69
commit a6b4ae4c47

View File

@ -1,6 +1,8 @@
import chardet #import chardet
import pkgutil import pkgutil
import yaml import yaml
from chardet.universaldetector import UniversalDetector
from io import BytesIO
from header_rewriter import RewrittenStatusAndHeaders from header_rewriter import RewrittenStatusAndHeaders
@ -151,11 +153,31 @@ class RewriteContent:
return buff return buff
#def _detect_charset(self, stream):
# buff = stream.read(8192)
# result = chardet.detect(buff)
# print "chardet result: " + str(result)
# return (result['encoding'], buff)
def _detect_charset(self, stream):
    """Guess the character encoding of the data at the head of *stream*.

    Reads up to 8192 bytes from *stream* and feeds them to a chardet
    ``UniversalDetector`` in 256-byte pieces, stopping as soon as the
    detector reports it is done, instead of handing it the whole buffer
    in one call.

    Returns a ``(encoding, buff)`` tuple:

    * ``encoding`` is the detector's ``result['encoding']`` value; it may
      be ``None`` (e.g. when the stream yields no data).
    * ``buff`` is everything that was read from *stream*.  Those bytes
      have been consumed from the stream, so the caller needs them back
      in order to process the full content.
    """
    # Function-scope import so this block does not depend on a
    # module-level ``import logging`` that may not exist in the file.
    import logging

    sample_size = 8192   # upper bound on bytes consumed from the stream
    chunk_size = 256     # bytes handed to the detector per feed() call

    full_buff = stream.read(sample_size)

    detector = UniversalDetector()
    try:
        # Slice the already-read buffer directly; an empty buffer simply
        # results in zero feed() calls.
        for start in range(0, len(full_buff), chunk_size):
            detector.feed(full_buff[start:start + chunk_size])
            if detector.done:
                break
    finally:
        # close() finalizes detector.result, so it must run even when
        # the loop exits early or feed() raises.
        detector.close()

    # Debug-level log instead of a print statement: library code should
    # not write to stdout on every call.
    logging.debug("chardet result: %s", detector.result)
    return (detector.result['encoding'], full_buff)
# Create a generator reading from a stream, # Create a generator reading from a stream,
# with optional rewriting and final read call # with optional rewriting and final read call