diff --git a/pywb/cdx/zipnum.py b/pywb/cdx/zipnum.py index 021026ac..66ced245 100644 --- a/pywb/cdx/zipnum.py +++ b/pywb/cdx/zipnum.py @@ -2,7 +2,7 @@ import os import collections import itertools import logging -from cStringIO import StringIO +from io import BytesIO import datetime from cdxsource import CDXSource @@ -189,7 +189,7 @@ class ZipNumCluster(CDXSource): def decompress_block(range_): decomp = gzip_decompressor() buff = decomp.decompress(reader.read(range_)) - return readline_to_iter(StringIO(buff)) + return readline_to_iter(BytesIO(buff)) iter_ = itertools.chain(*itertools.imap(decompress_block, ranges)) diff --git a/pywb/core/replay_views.py b/pywb/core/replay_views.py index 8ac99be2..b627a05a 100644 --- a/pywb/core/replay_views.py +++ b/pywb/core/replay_views.py @@ -1,4 +1,4 @@ -import StringIO +from io import BytesIO from pywb.rewrite.url_rewriter import UrlRewriter from pywb.utils.bufferedreaders import ChunkedDataReader @@ -149,11 +149,11 @@ class ReplayView: # Buffer rewrite iterator and return a response from a string def buffered_response(self, status_headers, iterator): - out = StringIO.StringIO() + out = BytesIO() try: for buff in iterator: - out.write(buff) + out.write(bytes(buff)) finally: content = out.getvalue() diff --git a/pywb/utils/bufferedreaders.py b/pywb/utils/bufferedreaders.py index 6be38b85..4ab72e5f 100644 --- a/pywb/utils/bufferedreaders.py +++ b/pywb/utils/bufferedreaders.py @@ -1,4 +1,4 @@ -import StringIO +from io import BytesIO import zlib @@ -44,12 +44,13 @@ class DecompressingBufferedReader(object): self.buff = None self.num_read = 0 + self.buff_size = 0 def _fillbuff(self, block_size=None): if not block_size: block_size = self.block_size - if not self.buff or self.buff.pos >= self.buff.len: + if not self.buff or self.buff.tell() == self.buff_size: data = self.stream.read(block_size) self._process_read(data) @@ -57,7 +58,7 @@ class DecompressingBufferedReader(object): data = self._decompress(data) self.buff_size = len(data) self.num_read += self.buff_size - self.buff = StringIO.StringIO(data) + self.buff = BytesIO(data) def _decompress(self, data): if self.decompressor and data: @@ -129,21 +130,21 @@ class ChunkedDataReader(DecompressingBufferedReader): assumed to not be chunked and no more dechunking occurs. Properly formatted chunked data: - >>> c = ChunkedDataReader(StringIO.StringIO("4\r\n1234\r\n0\r\n\r\n")); + >>> c = ChunkedDataReader(BytesIO("4\r\n1234\r\n0\r\n\r\n")); >>> c.read() + c.read() '1234' Non-chunked data: - >>> ChunkedDataReader(StringIO.StringIO("xyz123!@#")).read() + >>> ChunkedDataReader(BytesIO("xyz123!@#")).read() 'xyz123!@#' Starts like chunked data, but isn't: - >>> c = ChunkedDataReader(StringIO.StringIO("1\r\nxyz123!@#")); + >>> c = ChunkedDataReader(BytesIO("1\r\nxyz123!@#")); >>> c.read() + c.read() '1\r\nx123!@#' Chunked data cut off part way through: - >>> c = ChunkedDataReader(StringIO.StringIO("4\r\n1234\r\n4\r\n12")); + >>> c = ChunkedDataReader(BytesIO("4\r\n1234\r\n4\r\n12")); >>> c.read() + c.read() '123412' """ @@ -161,7 +162,7 @@ class ChunkedDataReader(DecompressingBufferedReader): if self.all_chunks_read: return - if not self.buff or self.buff.pos >= self.buff.len: + if not self.buff or self.buff.tell() >= self.buff_size: length_header = self.stream.readline(64) self._data = '' diff --git a/pywb/utils/loaders.py b/pywb/utils/loaders.py index ad1aeb12..ffb15c55 100644 --- a/pywb/utils/loaders.py +++ b/pywb/utils/loaders.py @@ -8,7 +8,7 @@ import hmac import urllib2 import time import pkg_resources - +from io import open #================================================================= def is_http(filename): diff --git a/pywb/utils/test/loaders_test.py b/pywb/utils/test/loaders_test.py index a8454816..f66cee01 100644 --- a/pywb/utils/test/loaders_test.py +++ b/pywb/utils/test/loaders_test.py @@ -1,13 +1,13 @@ #================================================================= """ # LimitReader Tests ->>> LimitReader(StringIO.StringIO('abcdefghjiklmnopqrstuvwxyz'), 10).read(26) +>>> LimitReader(BytesIO('abcdefghjiklmnopqrstuvwxyz'), 10).read(26) 'abcdefghji' ->>> LimitReader(StringIO.StringIO('abcdefghjiklmnopqrstuvwxyz'), 8).readline(26) +>>> LimitReader(BytesIO('abcdefghjiklmnopqrstuvwxyz'), 8).readline(26) 'abcdefgh' ->>> read_multiple(LimitReader(StringIO.StringIO('abcdefghjiklmnopqrstuvwxyz'), 10), [2, 2, 20]) +>>> read_multiple(LimitReader(BytesIO('abcdefghjiklmnopqrstuvwxyz'), 10), [2, 2, 20]) 'efghji' # BlockLoader Tests (includes LimitReader) @@ -30,6 +30,9 @@ >>> DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb')).readline() ' CDX N b a m s k r M S V g\\n' +>>> DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb'), decomp_type = 'gzip').readline() +' CDX N b a m s k r M S V g\\n' + #DecompressingBufferedReader readline() with decompression (zipnum file, no header) >>> DecompressingBufferedReader(open(test_zip_dir + 'zipnum-sample.cdx.gz', 'rb'), decomp_type = 'gzip').readline() 'com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz\\n' @@ -38,7 +41,7 @@ 'Example Domain' # test very small block size ->>> dbr = DecompressingBufferedReader(StringIO.StringIO('ABCDEFG\\nHIJKLMN\\nOPQR\\nXYZ'), block_size = 3) +>>> dbr = DecompressingBufferedReader(BytesIO('ABCDEFG\\nHIJKLMN\\nOPQR\\nXYZ'), block_size = 3) >>> dbr.readline(); dbr.readline(4); dbr.readline(); dbr.readline(); dbr.readline(2); dbr.readline(); dbr.readline() 'ABCDEFG\\n' 'HIJK' @@ -52,7 +55,7 @@ #================================================================= import os -import StringIO +from io import BytesIO, open from pywb.utils.loaders import BlockLoader, HMACCookieMaker from pywb.utils.loaders import LimitReader, SeekableTextFileReader from pywb.utils.bufferedreaders import DecompressingBufferedReader diff --git a/pywb/utils/test/statusandheaders_test.py b/pywb/utils/test/statusandheaders_test.py index 3473e71e..e52caa5e 100644 --- a/pywb/utils/test/statusandheaders_test.py +++ b/pywb/utils/test/statusandheaders_test.py @@ -1,17 +1,17 @@ """ ->>> StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO.StringIO(status_headers_1)) +>>> StatusAndHeadersParser(['HTTP/1.0']).parse(BytesIO(status_headers_1)) StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Content-Type', 'ABC'), ('Some', 'Value'), ('Multi-Line', 'Value1 Also This')]) ->>> StatusAndHeadersParser(['Other']).parse(StringIO.StringIO(status_headers_1)) +>>> StatusAndHeadersParser(['Other']).parse(BytesIO(status_headers_1)) Traceback (most recent call last): StatusAndHeadersParserException: Expected Status Line starting with ['Other'] - Found: HTTP/1.0 200 OK """ from pywb.utils.statusandheaders import StatusAndHeadersParser -import StringIO +from io import BytesIO status_headers_1 = "\