Mirror of https://github.com/webrecorder/pywb.git

py3: make pywb.utils work with python 3!

Ilya Kreymer 2016-02-16 14:52:20 -08:00
parent 7cf81935e1
commit 3c85f7b7ac
12 changed files with 169 additions and 117 deletions

View File

@@ -4,6 +4,13 @@ Utility functions for performing binary search over a sorted text file
 from collections import deque
 import itertools
+import six
+import sys
+
+if six.PY3:
+    def cmp(a, b):
+        return (a > b) - (a < b)
 
 #=================================================================
@@ -18,10 +25,10 @@ def binsearch_offset(reader, key, compare_func=cmp, block_size=8192):
     min_ = 0
     reader.seek(0, 2)
-    max_ = reader.tell() / block_size
+    max_ = int(reader.tell() / block_size)
 
     while max_ - min_ > 1:
-        mid = min_ + ((max_ - min_) / 2)
+        mid = int(min_ + ((max_ - min_) / 2))
         reader.seek(mid * block_size)
 
         if mid > 0:
@@ -135,7 +142,7 @@ def iter_prefix(reader, key):
 #=================================================================
-def iter_exact(reader, key, token=' '):
+def iter_exact(reader, key, token=b' '):
     """
     Create an iterator which iterates over lines where the first field matches
    the 'key', equivalent to token + sep prefix.
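
Note: Python 3 drops the cmp() builtin and turns / into true division, which is what the cmp shim and the int(...) wrappers above address. A minimal standalone sketch of the same pattern (assumes six is installed; not pywb's exact code):

    import six

    if six.PY3:
        def cmp(a, b):
            # three-way compare: -1, 0, or 1, matching the removed py2 builtin
            return (a > b) - (a < b)

    assert cmp(b'abc', b'abd') == -1
    assert int(7 / 2) == 7 // 2 == 3   # int(a / b) mirrors py2 integer division for non-negative sizes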

View File

@@ -120,7 +120,7 @@ class BufferedReader(object):
         call will fill buffer anew.
         """
         if length == 0:
-            return ''
+            return b''
 
         self._fillbuff()
         buff = self.buff.read(length)
@@ -134,13 +134,13 @@ class BufferedReader(object):
         at buffer boundary.
         """
         if length == 0:
-            return ''
+            return b''
 
         self._fillbuff()
         linebuff = self.buff.readline(length)
 
         # we may be at a boundary
-        while not linebuff.endswith('\n'):
+        while not linebuff.endswith(b'\n'):
             if length:
                 length -= len(linebuff)
                 if length <= 0:
@@ -195,7 +195,7 @@ class DecompressingBufferedReader(BufferedReader):
 #=================================================================
 class ChunkedDataException(Exception):
-    def __init__(self, msg, data=''):
+    def __init__(self, msg, data=b''):
         Exception.__init__(self, msg)
         self.data = data
 
@@ -249,19 +249,19 @@ class ChunkedDataReader(BufferedReader):
     def _try_decode(self, length_header):
         # decode length header
         try:
-            chunk_size = int(length_header.strip().split(';')[0], 16)
+            chunk_size = int(length_header.strip().split(b';')[0], 16)
         except ValueError:
-            raise ChunkedDataException("Couldn't decode length header " +
+            raise ChunkedDataException(b"Couldn't decode length header " +
                                        length_header)
 
         if not chunk_size:
             # chunk_size 0 indicates end of file
             self.all_chunks_read = True
-            self._process_read('')
+            self._process_read(b'')
             return
 
         data_len = 0
-        data = ''
+        data = b''
 
         # read chunk
         while data_len < chunk_size:
@@ -285,8 +285,8 @@ class ChunkedDataReader(BufferedReader):
         # it should end in \r\n
         if not self.all_chunks_read:
             clrf = self.stream.read(2)
-            if clrf != '\r\n':
-                raise ChunkedDataException("Chunk terminator not found.",
+            if clrf != b'\r\n':
+                raise ChunkedDataException(b"Chunk terminator not found.",
                                            data)
 
         # hand to base class for further processing
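
Note: the common thread in this file is that binary streams yield bytes on Python 3, so every literal that is compared with, split against, or returned as stream data becomes a b'' literal. A quick illustration using plain io, not pywb's reader classes:

    from io import BytesIO

    line = BytesIO(b'10; ext=1\r\npayload').readline()
    assert line.endswith(b'\r\n')                      # a str '\r\n' would never match on py3
    chunk_size = int(line.strip().split(b';')[0], 16)  # int() parses bytes as well as str
    assert chunk_size == 16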

View File

@@ -2,9 +2,9 @@
 """
 import surt
-import urlparse
+import six.moves.urllib.parse as urlparse
 
-from wbexception import BadRequestException
+from pywb.utils.wbexception import BadRequestException
 
 #=================================================================
@@ -128,11 +128,11 @@ def calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
     ('example.com/', 'example.com0')
 
     # errors: domain range not supported
-    >>> calc_search_range('http://example.com/path/file.html', 'domain', False)
+    >>> calc_search_range('http://example.com/path/file.html', 'domain', False)  # doctest: +IGNORE_EXCEPTION_DETAIL
     Traceback (most recent call last):
     UrlCanonicalizeException: matchType=domain unsupported for non-surt
 
-    >>> calc_search_range('http://example.com/path/file.html', 'blah', False)
+    >>> calc_search_range('http://example.com/path/file.html', 'blah', False)  # doctest: +IGNORE_EXCEPTION_DETAIL
     Traceback (most recent call last):
     UrlCanonicalizeException: Invalid match_type: blah
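
Note: six.moves gives one import path for the stdlib modules that were reorganized in Python 3 (py2's urlparse and parts of urllib became urllib.parse). A small sketch of the pattern:

    # resolves to urlparse on py2 and to urllib.parse on py3
    import six.moves.urllib.parse as urlparse

    parts = urlparse.urlsplit('http://example.com/path/file.html')
    assert (parts.netloc, parts.path) == ('example.com', '/path/file.html')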

View File

@@ -1,5 +1,5 @@
 import pkgutil
-from loaders import load_yaml_config
+from pywb.utils.loaders import load_yaml_config
 
 #=================================================================

View File

@@ -5,12 +5,15 @@ local and remote access
 import os
 import hmac
-import urllib
-#import urllib2
 import requests
-import urlparse
+import six
+import six.moves.urllib.request as urllib_req
+import six.moves.urllib.parse as urlparse
 import time
 import pkg_resources
 
 from io import open, BytesIO
 
 try:
@@ -30,7 +33,7 @@ def to_file_url(filename):
     """ Convert a filename to a file:// url
     """
     url = os.path.abspath(filename)
-    url = urlparse.urljoin('file:', urllib.pathname2url(url))
+    url = urlparse.urljoin('file:', urllib_req.pathname2url(url))
     return url
@@ -80,7 +83,7 @@ def extract_post_query(method, mime, length, stream, buffered_stream=None):
         buffered_stream.write(post_query)
         buffered_stream.seek(0)
 
-    post_query = urllib.unquote_plus(post_query)
+    post_query = urlparse.unquote_plus(post_query)
     return post_query
@@ -210,7 +213,7 @@ class LocalFileLoader(object):
         # convert to filename
         if url.startswith('file://'):
             file_only = True
-            url = urllib.url2pathname(url[len('file://'):])
+            url = urllib_req.url2pathname(url[len('file://'):])
 
         try:
             # first, try as file
@@ -253,7 +256,7 @@ class HttpLoader(object):
             headers['Range'] = BlockLoader._make_range_header(offset, length)
 
         if self.cookie_maker:
-            if isinstance(self.cookie_maker, basestring):
+            if isinstance(self.cookie_maker, six.string_types):
                 headers['Cookie'] = self.cookie_maker
             else:
                 headers['Cookie'] = self.cookie_maker.make()
@@ -311,14 +314,14 @@ class HMACCookieMaker(object):
         self.duration = duration
 
     def make(self, extra_id=''):
-        expire = str(long(time.time() + self.duration))
+        expire = str(int(time.time() + self.duration))
 
         if extra_id:
             msg = extra_id + '-' + expire
         else:
             msg = expire
 
-        hmacdigest = hmac.new(self.key, msg)
+        hmacdigest = hmac.new(self.key.encode('utf-8'), msg.encode('utf-8'))
         hexdigest = hmacdigest.hexdigest()
 
         if extra_id:
@@ -349,7 +352,7 @@ class LimitReader(object):
             length = self.limit
 
         if length == 0:
-            return ''
+            return b''
 
         buff = self.stream.read(length)
         self.limit -= len(buff)
@@ -362,7 +365,7 @@ class LimitReader(object):
             length = self.limit
 
         if length == 0:
-            return ''
+            return b''
 
         buff = self.stream.readline(length)
         self.limit -= len(buff)
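
Note: two Python 3 details drive the HMACCookieMaker change: long() is gone (int() covers the same range), and hmac.new() rejects str keys and messages. A standalone sketch; note that Python 3.8+ also demands an explicit digestmod, which this diff still leaves to the old MD5 default:

    import hashlib
    import hmac
    import time

    expire = str(int(time.time() + 30))        # int() replaces py2's long()
    digest = hmac.new('secret-key'.encode('utf-8'),
                      expire.encode('utf-8'),
                      digestmod=hashlib.md5).hexdigest()
    assert len(digest) == 32                   # hex-encoded 16-byte MD5 digest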

View File

@@ -4,6 +4,7 @@ Representation and parsing of HTTP-style status + headers
 import pprint
 from copy import copy
+from six.moves import range
 
 #=================================================================
@@ -36,7 +37,7 @@ class StatusAndHeaders(object):
         return old header value, if any
         """
         name_lower = name.lower()
-        for index in xrange(len(self.headers) - 1, -1, -1):
+        for index in range(len(self.headers) - 1, -1, -1):
             curr_name, curr_value = self.headers[index]
             if curr_name.lower() == name_lower:
                 self.headers[index] = (curr_name, value)
@@ -52,7 +53,7 @@ class StatusAndHeaders(object):
         """
         header_dict = copy(header_dict)
 
-        for index in xrange(len(self.headers) - 1, -1, -1):
+        for index in range(len(self.headers) - 1, -1, -1):
             curr_name, curr_value = self.headers[index]
             name_lower = curr_name.lower()
             if name_lower in header_dict:
@@ -68,7 +69,7 @@ class StatusAndHeaders(object):
         return True if header removed, False otherwise
         """
         name_lower = name.lower()
-        for index in xrange(len(self.headers) - 1, -1, -1):
+        for index in range(len(self.headers) - 1, -1, -1):
             if self.headers[index][0].lower() == name_lower:
                 del self.headers[index]
                 return True
@@ -93,7 +94,7 @@ class StatusAndHeaders(object):
             code = int(code)
             assert(code > 0)
             return True
-        except ValueError, AssertionError:
+        except(ValueError, AssertionError):
             self.statusline = valid_statusline
             return False
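
Note: the validate_statusline fix is more than a syntax update. In Python 2, `except ValueError, AssertionError:` caught only ValueError and bound the instance to the name AssertionError, so a failed assert escaped uncaught. The parenthesized tuple works on both interpreters and genuinely catches both types. A sketch:

    def check_code(text):
        try:
            code = int(text)
            assert code > 0
            return True
        except (ValueError, AssertionError):   # tuple form catches both exception types
            return False

    assert check_code('200') is True
    assert check_code('abc') is False          # ValueError
    assert check_code('-5') is False           # AssertionError, missed by the old py2 form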

View File

@@ -82,13 +82,13 @@ test_cdx_dir = get_test_dir() + 'cdx/'
 def print_binsearch_results(key, iter_func):
     with open(test_cdx_dir + 'iana.cdx', 'rb') as cdx:
-        for line in iter_func(cdx, key):
-            print line
+        for line in iter_func(cdx, key.encode('utf-8')):
+            print(line.decode('utf-8'))
 
 def print_binsearch_results_range(key, end_key, iter_func, prev_size=0):
     with open(test_cdx_dir + 'iana.cdx', 'rb') as cdx:
-        for line in iter_func(cdx, key, end_key, prev_size=prev_size):
-            print line
+        for line in iter_func(cdx, key.encode('utf-8'), end_key.encode('utf-8'), prev_size=prev_size):
+            print(line.decode('utf-8'))
 
 if __name__ == "__main__":
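
Note: because the CDX file is opened in binary mode, Python 3 hands back bytes, so the helpers now encode the search key on the way in, decode lines on the way out, and use print() as a function. Roughly this pattern (file name hypothetical):

    def print_matches(path, key):
        # files opened 'rb' yield bytes on py3: encode the key, decode the output
        with open(path, 'rb') as cdx:
            for line in cdx:
                if line.startswith(key.encode('utf-8')):
                    print(line.decode('utf-8').rstrip())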

View File

@@ -3,19 +3,19 @@ r"""
 #=================================================================
 # DecompressingBufferedReader readline()
->>> DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb')).readline()
+>>> print_str(DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb')).readline())
 ' CDX N b a m s k r M S V g\n'
 
 # detect not compressed
->>> DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb'), decomp_type = 'gzip').readline()
+>>> print_str(DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb'), decomp_type = 'gzip').readline())
 ' CDX N b a m s k r M S V g\n'
 
 # decompress with on the fly compression, default gzip compression
->>> DecompressingBufferedReader(BytesIO(compress('ABC\n1234\n'))).read()
+>>> print_str(DecompressingBufferedReader(BytesIO(compress('ABC\n1234\n'))).read())
 'ABC\n1234\n'
 
 # decompress with on the fly compression, default 'inflate' compression
->>> DecompressingBufferedReader(BytesIO(compress_alt('ABC\n1234\n')), decomp_type='deflate').read()
+>>> print_str(DecompressingBufferedReader(BytesIO(compress_alt('ABC\n1234\n')), decomp_type='deflate').read())
 'ABC\n1234\n'
 
 # error: invalid compress type
@@ -23,26 +23,18 @@ r"""
 Traceback (most recent call last):
 Exception: Decompression type not supported: bzip2
 
-# error: compressed member, followed by not compressed -- considered invalid
->>> x = DecompressingBufferedReader(BytesIO(compress('ABC') + '123'), decomp_type = 'gzip')
->>> b = x.read()
->>> b = x.read_next_member()
->>> x.read()
-Traceback (most recent call last):
-error: Error -3 while decompressing: incorrect header check
-
 # invalid output when reading compressed data as not compressed
->>> DecompressingBufferedReader(BytesIO(compress('ABC')), decomp_type = None).read() != 'ABC'
+>>> DecompressingBufferedReader(BytesIO(compress('ABC')), decomp_type = None).read() != b'ABC'
 True
 
 # DecompressingBufferedReader readline() with decompression (zipnum file, no header)
->>> DecompressingBufferedReader(open(test_zip_dir + 'zipnum-sample.cdx.gz', 'rb'), decomp_type = 'gzip').readline()
+>>> print_str(DecompressingBufferedReader(open(test_zip_dir + 'zipnum-sample.cdx.gz', 'rb'), decomp_type = 'gzip').readline())
 'com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz\n'
 
 # test very small block size
->>> dbr = DecompressingBufferedReader(BytesIO('ABCDEFG\nHIJKLMN\nOPQR\nXYZ'), block_size = 3)
->>> dbr.readline(); dbr.readline(4); dbr.readline(); dbr.readline(); dbr.readline(2); dbr.readline(); dbr.readline()
+>>> dbr = DecompressingBufferedReader(BytesIO(b'ABCDEFG\nHIJKLMN\nOPQR\nXYZ'), block_size = 3)
+>>> print_str(dbr.readline()); print_str(dbr.readline(4)); print_str(dbr.readline()); print_str(dbr.readline()); print_str(dbr.readline(2)); print_str(dbr.readline()); print_str(dbr.readline())
 'ABCDEFG\n'
 'HIJK'
 'LMN\n'
@@ -52,8 +44,8 @@ True
 ''
 
 # test zero length reads
->>> x = DecompressingBufferedReader(LimitReader(BytesIO('\r\n'), 1))
->>> x.readline(0); x.read(0)
+>>> x = DecompressingBufferedReader(LimitReader(BytesIO(b'\r\n'), 1))
+>>> print_str(x.readline(0)); print_str(x.read(0))
 ''
 ''
 
@@ -61,71 +53,69 @@ True
 #=================================================================
 Properly formatted chunked data:
->>> c = ChunkedDataReader(BytesIO("4\r\n1234\r\n0\r\n\r\n"));
->>> c.read() + c.read() + c.read()
+>>> c = ChunkedDataReader(BytesIO(b"4\r\n1234\r\n0\r\n\r\n"));
+>>> print_str(c.read() + c.read() + c.read())
 '1234'
 
 Non-chunked data:
->>> ChunkedDataReader(BytesIO("xyz123!@#")).read()
+>>> print_str(ChunkedDataReader(BytesIO(b"xyz123!@#")).read())
 'xyz123!@#'
 
 Non-chunked, compressed data, specify decomp_type
->>> ChunkedDataReader(BytesIO(compress('ABCDEF')), decomp_type='gzip').read()
+>>> print_str(ChunkedDataReader(BytesIO(compress('ABCDEF')), decomp_type='gzip').read())
 'ABCDEF'
 
 Non-chunked, compressed data, specify compression separately
->>> c = ChunkedDataReader(BytesIO(compress('ABCDEF'))); c.set_decomp('gzip'); c.read()
+>>> c = ChunkedDataReader(BytesIO(compress('ABCDEF'))); c.set_decomp('gzip'); print_str(c.read())
 'ABCDEF'
 
 Non-chunked, compressed data, wrap in DecompressingBufferedReader
->>> DecompressingBufferedReader(ChunkedDataReader(BytesIO(compress('\nABCDEF\nGHIJ')))).read()
+>>> print_str(DecompressingBufferedReader(ChunkedDataReader(BytesIO(compress('\nABCDEF\nGHIJ')))).read())
 '\nABCDEF\nGHIJ'
 
 Chunked compressed data
 Split compressed stream into 10-byte chunk and a remainder chunk
 >>> b = compress('ABCDEFGHIJKLMNOP')
 >>> l = len(b)
->>> in_ = format(10, 'x') + "\r\n" + b[:10] + "\r\n" + format(l - 10, 'x') + "\r\n" + b[10:] + "\r\n0\r\n\r\n"
+>>> in_ = format(10, 'x').encode('utf-8') + b"\r\n" + b[:10] + b"\r\n" + format(l - 10, 'x').encode('utf-8') + b"\r\n" + b[10:] + b"\r\n0\r\n\r\n"
 >>> c = ChunkedDataReader(BytesIO(in_), decomp_type='gzip')
->>> c.read()
+>>> print_str(c.read())
 'ABCDEFGHIJKLMNOP'
 
 Starts like chunked data, but isn't:
->>> c = ChunkedDataReader(BytesIO("1\r\nxyz123!@#"));
->>> c.read() + c.read()
+>>> c = ChunkedDataReader(BytesIO(b"1\r\nxyz123!@#"));
+>>> print_str(c.read() + c.read())
 '1\r\nx123!@#'
 
 Chunked data cut off part way through:
->>> c = ChunkedDataReader(BytesIO("4\r\n1234\r\n4\r\n12"));
->>> c.read() + c.read()
+>>> c = ChunkedDataReader(BytesIO(b"4\r\n1234\r\n4\r\n12"));
+>>> print_str(c.read() + c.read())
 '123412'
 
 Zero-Length chunk:
->>> ChunkedDataReader(BytesIO("0\r\n\r\n")).read()
+>>> print_str(ChunkedDataReader(BytesIO(b"0\r\n\r\n")).read())
 ''
 
-Chunked data cut off with exceptions
->>> c = ChunkedDataReader(BytesIO("4\r\n1234\r\n4\r\n12"), raise_exceptions=True)
->>> c.read() + c.read()
-Traceback (most recent call last):
-ChunkedDataException: Ran out of data before end of chunk
 """
 
 from io import BytesIO
-from pywb.utils.bufferedreaders import ChunkedDataReader
+from pywb.utils.bufferedreaders import ChunkedDataReader, ChunkedDataException
 from pywb.utils.bufferedreaders import DecompressingBufferedReader
 from pywb.utils.loaders import LimitReader
 from pywb import get_test_dir
 
+import six
 import zlib
+import pytest
 
 test_cdx_dir = get_test_dir() + 'cdx/'
 test_zip_dir = get_test_dir() + 'zipcdx/'
 
 def compress(buff):
+    buff = buff.encode('utf-8')
     compressobj = zlib.compressobj(6, zlib.DEFLATED, zlib.MAX_WBITS + 16)
     compressed = compressobj.compress(buff)
     compressed += compressobj.flush()
@@ -134,6 +124,7 @@ def compress(buff):
 # plain "inflate"
 def compress_alt(buff):
+    buff = buff.encode('utf-8')
     compressobj = zlib.compressobj(6, zlib.DEFLATED)
     compressed = compressobj.compress(buff)
     compressed += compressobj.flush()
@@ -142,6 +133,32 @@ def compress_alt(buff):
     return compressed
 
+
+# Errors
+def test_err_compress_mix():
+    # error: compressed member, followed by not compressed -- considered invalid
+    x = DecompressingBufferedReader(BytesIO(compress('ABC') + b'123'), decomp_type = 'gzip')
+    b = x.read()
+    b = x.read_next_member()
+    with pytest.raises(zlib.error):
+        x.read()
+    #error: Error -3 while decompressing: incorrect header check
+
+
+def test_err_chunk_cut_off():
+    # Chunked data cut off with exceptions
+    c = ChunkedDataReader(BytesIO(b"4\r\n1234\r\n4\r\n12"), raise_exceptions=True)
+    with pytest.raises(ChunkedDataException):
+        c.read() + c.read()
+    #ChunkedDataException: Ran out of data before end of chunk
+
+
+def print_str(string):
+    return string.decode('utf-8') if six.PY3 else string
+
+
 if __name__ == "__main__":
     import doctest
     doctest.testmod()
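
Note: the traceback doctests were moved into plain pytest functions because exception doctests are brittle across 2/3: Python 3 renders module-qualified exception names (zlib.error, pywb.utils.bufferedreaders.ChunkedDataException), so one expected-output line cannot match both interpreters. pytest.raises checks only the type. A minimal parallel example:

    import zlib

    import pytest

    def test_bad_gzip_header():
        # a doctest would need the exact traceback text; pytest.raises only
        # checks the exception type, which is stable across py2 and py3
        with pytest.raises(zlib.error):
            zlib.decompress(b'not a gzip stream')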

View File

@@ -1,30 +1,30 @@
 #=================================================================
 r"""
 # LimitReader Tests
->>> LimitReader(BytesIO('abcdefghjiklmnopqrstuvwxyz'), 10).read(26)
+>>> LimitReader(StringIO('abcdefghjiklmnopqrstuvwxyz'), 10).read(26)
 'abcdefghji'
 
->>> LimitReader(BytesIO('abcdefghjiklmnopqrstuvwxyz'), 8).readline(26)
+>>> LimitReader(StringIO('abcdefghjiklmnopqrstuvwxyz'), 8).readline(26)
 'abcdefgh'
 
->>> LimitReader.wrap_stream(LimitReader(BytesIO('abcdefghjiklmnopqrstuvwxyz'), 8), 4).readline(26)
+>>> LimitReader.wrap_stream(LimitReader(StringIO('abcdefghjiklmnopqrstuvwxyz'), 8), 4).readline(26)
 'abcd'
 
->>> read_multiple(LimitReader(BytesIO('abcdefghjiklmnopqrstuvwxyz'), 10), [2, 2, 20])
+>>> read_multiple(LimitReader(StringIO('abcdefghjiklmnopqrstuvwxyz'), 10), [2, 2, 20])
 'efghji'
 
 # zero-length read
->>> LimitReader(BytesIO('a'), 0).readline(0)
+>>> print_str(LimitReader(StringIO('a'), 0).readline(0))
 ''
 
 # don't wrap if invalid length
->>> b = BytesIO('b')
+>>> b = StringIO('b')
 >>> LimitReader.wrap_stream(b, 'abc') == b
 True
 
 # BlockLoader Tests (includes LimitReader)
 # Ensure attempt to read more than 100 bytes, reads exactly 100 bytes
->>> len(BlockLoader().load(test_cdx_dir + 'iana.cdx', 0, 100).read('400'))
+>>> len(BlockLoader().load(test_cdx_dir + 'iana.cdx', 0, 100).read(400))
 100
 
 # no length specified, read full amount requested
@@ -32,26 +32,26 @@ True
 400
 
 # no such file
->>> len(BlockLoader().load('_x_no_such_file_', 0, 100).read('400'))
+#>>> len(BlockLoader().load('_x_no_such_file_', 0, 100).read(400)) # doctest: +IGNORE_EXCEPTION_DETAIL
 Traceback (most recent call last):
 IOError: [Errno 2] No such file or directory: '_x_no_such_file_'
 
 # HMAC Cookie Maker
->>> BlockLoader(HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read()
+>>> print_str(BlockLoader(HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read())
 'Example Domain'
 
 # fixed cookie, range request
->>> BlockLoader('some=value').load('http://example.com', 41, 14).read()
+>>> print_str(BlockLoader('some=value').load('http://example.com', 41, 14).read())
 'Example Domain'
 
 # range request
->>> BlockLoader().load('http://example.com', 1262).read()
+>>> print_str(BlockLoader().load('http://example.com', 1262).read())
 '</html>\n'
 
 # unknown loader error
->>> BlockLoader().load('foo://example.com', 10).read()
-Traceback (most recent call last):
-IOError: No Loader for type: foo
+#>>> BlockLoader().load('foo://example.com', 10).read() # doctest: +IGNORE_EXCEPTION_DETAIL
+#Traceback (most recent call last):
+#IOError: No Loader for type: foo
 
 # test with extra id, ensure 4 parts of the A-B=C-D form are present
 >>> len(re.split('[-=]', HMACCookieMaker('test', 'test', 5).make('extra')))
@@ -84,42 +84,42 @@ IOError: No Loader for type: foo
 # correct POST data
 >>> post_data = 'foo=bar&dir=%2Fbaz'
->>> extract_post_query('POST', 'application/x-www-form-urlencoded', len(post_data), BytesIO(post_data))
+>>> extract_post_query('POST', 'application/x-www-form-urlencoded', len(post_data), StringIO(post_data))
 'foo=bar&dir=/baz'
 
 # unsupported method
->>> extract_post_query('PUT', 'application/x-www-form-urlencoded', len(post_data), BytesIO(post_data))
+>>> extract_post_query('PUT', 'application/x-www-form-urlencoded', len(post_data), StringIO(post_data))
 
 # unsupported type
->>> extract_post_query('POST', 'text/plain', len(post_data), BytesIO(post_data))
+>>> extract_post_query('POST', 'text/plain', len(post_data), StringIO(post_data))
 
 # invalid length
->>> extract_post_query('POST', 'application/x-www-form-urlencoded', 'abc', BytesIO(post_data))
->>> extract_post_query('POST', 'application/x-www-form-urlencoded', 0, BytesIO(post_data))
+>>> extract_post_query('POST', 'application/x-www-form-urlencoded', 'abc', StringIO(post_data))
+>>> extract_post_query('POST', 'application/x-www-form-urlencoded', 0, StringIO(post_data))
 
 # length too short
->>> extract_post_query('POST', 'application/x-www-form-urlencoded', len(post_data) - 4, BytesIO(post_data))
+>>> extract_post_query('POST', 'application/x-www-form-urlencoded', len(post_data) - 4, StringIO(post_data))
 'foo=bar&dir=%2'
 
 # length too long
->>> extract_post_query('POST', 'application/x-www-form-urlencoded', len(post_data) + 4, BytesIO(post_data))
+>>> extract_post_query('POST', 'application/x-www-form-urlencoded', len(post_data) + 4, StringIO(post_data))
 'foo=bar&dir=/baz'
 
 # test read_last_line
->>> read_last_line(BytesIO('A\nB\nC'))
+>>> print_str(read_last_line(BytesIO(b'A\nB\nC')))
 'C'
 
->>> read_last_line(BytesIO('Some Line\nLonger Line\nLongest Last Line LL'), offset=8)
+>>> print_str(read_last_line(BytesIO(b'Some Line\nLonger Line\nLongest Last Line LL'), offset=8))
 'Longest Last Line LL'
 
->>> read_last_line(BytesIO('A\nBC'))
+>>> print_str(read_last_line(BytesIO(b'A\nBC')))
 'BC'
 
->>> read_last_line(BytesIO('A\nBC\n'))
+>>> print_str(read_last_line(BytesIO(b'A\nBC\n')))
 'BC\n'
 
->>> read_last_line(BytesIO('ABC'))
+>>> print_str(read_last_line(BytesIO(b'ABC')))
 'ABC'
 """
@@ -130,7 +130,10 @@ import re
 import os
 import pytest
+import six
+from six import StringIO
 from io import BytesIO
 
 from pywb.utils.loaders import BlockLoader, HMACCookieMaker, to_file_url
 from pywb.utils.loaders import LimitReader, extract_client_cookie, extract_post_query
 from pywb.utils.loaders import append_post_query, read_last_line
@@ -165,8 +168,27 @@ def test_s3_read_1():
     assert len(buff) == 2526
 
     reader = DecompressingBufferedReader(BytesIO(buff))
-    assert reader.readline() == 'WARC/1.0\r\n'
-    assert reader.readline() == 'WARC-Type: response\r\n'
+    assert reader.readline() == b'WARC/1.0\r\n'
+    assert reader.readline() == b'WARC-Type: response\r\n'
 
+
+# Error
+def test_err_no_such_file():
+    # no such file
+    with pytest.raises(IOError):
+        len(BlockLoader().load('_x_no_such_file_', 0, 100).read('400'))
+
+
+def test_err_unknown_loader():
+    # unknown loader error
+    with pytest.raises(IOError):
+        BlockLoader().load('foo://example.com', 10).read()
+    #IOError: No Loader for type: foo
+
+
+def print_str(string):
+    return string.decode('utf-8') if six.PY3 else string
+
+
 if __name__ == "__main__":
     import doctest
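
Note: the split between StringIO and BytesIO in these tests mirrors the boundary the port draws: POST-body and cookie parsing stay textual (str), while block and stream loading is binary (bytes). The two wrappers behave identically except for the element type:

    from io import BytesIO

    from six import StringIO

    assert StringIO('foo=bar').read(3) == 'foo'        # text stream: str in, str out
    assert BytesIO(b'A\nBC').readlines()[-1] == b'BC'  # binary stream: bytes out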

View File

@@ -1,5 +1,5 @@
 """
->>> st1 = StatusAndHeadersParser(['HTTP/1.0']).parse(BytesIO(status_headers_1))
+>>> st1 = StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(status_headers_1))
 >>> st1
 StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Content-Type', 'ABC'),
   ('Some', 'Value'),
@@ -12,30 +12,30 @@ StatusAndHeaders(protocol = '', statusline = '206 Partial Content', headers = [
   ('Accept-Ranges', 'bytes')])
 
 # other protocol expected
->>> StatusAndHeadersParser(['Other']).parse(BytesIO(status_headers_1))
+>>> StatusAndHeadersParser(['Other']).parse(StringIO(status_headers_1)) # doctest: +IGNORE_EXCEPTION_DETAIL
 Traceback (most recent call last):
 StatusAndHeadersParserException: Expected Status Line starting with ['Other'] - Found: HTTP/1.0 200 OK
 
->>> StatusAndHeadersParser(['Other'], verify=False).parse(BytesIO(status_headers_1))
+>>> StatusAndHeadersParser(['Other'], verify=False).parse(StringIO(status_headers_1))
 StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '200 OK', headers = [ ('Content-Type', 'ABC'),
   ('Some', 'Value'),
   ('Multi-Line', 'Value1 Also This')])
 
 # verify protocol line
->>> StatusAndHeadersParser(['HTTP/1.0'], verify=True).parse(BytesIO(unknown_protocol_headers))
+>>> StatusAndHeadersParser(['HTTP/1.0'], verify=True).parse(StringIO(unknown_protocol_headers)) # doctest: +IGNORE_EXCEPTION_DETAIL
 Traceback (most recent call last):
 StatusAndHeadersParserException: Expected Status Line starting with ['HTTP/1.0'] - Found: OtherBlah
 
 # allow unexpected/invalid protocol line
->>> StatusAndHeadersParser(['HTTP/1.0'], verify=False).parse(BytesIO(unknown_protocol_headers))
+>>> StatusAndHeadersParser(['HTTP/1.0'], verify=False).parse(StringIO(unknown_protocol_headers))
 StatusAndHeaders(protocol = 'OtherBlah', statusline = 'OtherBlah', headers = [('Foo', 'Bar')])
 
 # test equality op
->>> st1 == StatusAndHeadersParser(['HTTP/1.0']).parse(BytesIO(status_headers_1))
+>>> st1 == StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(status_headers_1))
 True
 
 # replace header, print new headers
@@ -55,15 +55,15 @@ True
 False
 
 # empty
->>> st2 = StatusAndHeadersParser(['HTTP/1.0']).parse(BytesIO(status_headers_2)); x = st2.validate_statusline('204 No Content'); st2
+>>> st2 = StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(status_headers_2)); x = st2.validate_statusline('204 No Content'); st2
 StatusAndHeaders(protocol = '', statusline = '204 No Content', headers = [])
 
->>> StatusAndHeadersParser(['HTTP/1.0']).parse(BytesIO(status_headers_3))
+>>> StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(status_headers_3))
 StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '204 Empty', headers = [('Content-Type', 'Value'), ('Content-Length', '0')])
 
 # case-insensitive match
->>> StatusAndHeadersParser(['HTTP/1.0']).parse(BytesIO(status_headers_4))
+>>> StatusAndHeadersParser(['HTTP/1.0']).parse(StringIO(status_headers_4))
 StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '204 empty', headers = [('Content-Type', 'Value'), ('Content-Length', '0')])
@@ -71,7 +71,8 @@ StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '204 empty', headers = [('C
 from pywb.utils.statusandheaders import StatusAndHeadersParser, StatusAndHeaders
-from io import BytesIO
+#from io import StringIO
+from six import StringIO
 
 status_headers_1 = "\

View File

@@ -7,7 +7,7 @@ import re
 import time
 import datetime
 import calendar
-from itertools import imap
+from six.moves import map
 from email.utils import parsedate, formatdate
 
 #=================================================================
@@ -36,7 +36,7 @@ def iso_date_to_datetime(string):
     if nums[-1] == '':
         nums = nums[:-1]
 
-    the_datetime = datetime.datetime(*imap(int, nums))
+    the_datetime = datetime.datetime(*map(int, nums))
     return the_datetime
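
Note: itertools.imap disappeared in Python 3 because the builtin map is already lazy there; six.moves.map papers over the difference. A sketch of the iso-date use above (input format assumed):

    import datetime

    from six.moves import map   # itertools.imap on py2, the lazy builtin map on py3

    nums = '2016-02-16T14:52:20'.replace('T', '-').replace(':', '-').split('-')
    assert datetime.datetime(*map(int, nums)) == datetime.datetime(2016, 2, 16, 14, 52, 20)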

View File

@@ -47,7 +47,7 @@ setup(
     long_description=long_description,
     license='GPL',
     packages=find_packages(),
-    zip_safe=True,
+    zip_safe=False,
     provides=[
         'pywb',
         'pywb.utils',
@@ -73,11 +73,12 @@ setup(
         glob.glob('sample_archive/text_content/*')),
     ],
     install_requires=[
+        'six',
         'chardet',
         'requests',
         'redis',
         'jinja2',
-        'surt==0.2',
+        'surt==0.3b4',
         'pyyaml',
         'watchdog',
         'webencodings',
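
Note: one pitfall to watch in install_requires lists: Python concatenates adjacent string literals at compile time, so a missing comma between entries silently merges two requirements into one bogus name instead of raising an error. For illustration:

    requires = ['six'          # missing comma: concatenates with the next literal
                'chardet',
                'requests']
    assert requires == ['sixchardet', 'requests']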