1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

Remove SeekableTextFileReader; replace it with standard file-like objects,

using seek(0, 2) and tell() to get the file length.
This commit is contained in:
Ilya Kreymer 2014-05-06 20:54:42 -07:00
parent 46449ac188
commit e7957a5cae
7 changed files with 13 additions and 61 deletions

View File

@ -1,5 +1,4 @@
from pywb.utils.binsearch import iter_range from pywb.utils.binsearch import iter_range
from pywb.utils.loaders import SeekableTextFileReader
from pywb.utils.wbexception import AccessException, NotFoundException from pywb.utils.wbexception import AccessException, NotFoundException
from pywb.utils.wbexception import BadRequestException, WbException from pywb.utils.wbexception import BadRequestException, WbException
@ -29,7 +28,7 @@ class CDXFile(CDXSource):
self.filename = filename self.filename = filename
def load_cdx(self, query): def load_cdx(self, query):
source = SeekableTextFileReader(self.filename) source = open(self.filename)
return iter_range(source, query.key, query.end_key) return iter_range(source, query.key, query.end_key)
def __str__(self): def __str__(self):

View File

@ -9,7 +9,6 @@ from cdxsource import CDXSource
from cdxobject import IDXObject from cdxobject import IDXObject
from pywb.utils.loaders import BlockLoader from pywb.utils.loaders import BlockLoader
from pywb.utils.loaders import SeekableTextFileReader
from pywb.utils.bufferedreaders import gzip_decompressor from pywb.utils.bufferedreaders import gzip_decompressor
from pywb.utils.binsearch import iter_range, linearsearch from pywb.utils.binsearch import iter_range, linearsearch
@ -113,7 +112,7 @@ class ZipNumCluster(CDXSource):
def load_cdx(self, query): def load_cdx(self, query):
self.load_loc() self.load_loc()
reader = SeekableTextFileReader(self.summary) reader = open(self.summary)
idx_iter = iter_range(reader, idx_iter = iter_range(reader,
query.key, query.key,

View File

@ -16,7 +16,9 @@ def binsearch_offset(reader, key, compare_func=cmp, block_size=8192):
Optional compare_func may be specified Optional compare_func may be specified
""" """
min_ = 0 min_ = 0
max_ = reader.getsize() / block_size
reader.seek(0, 2)
max_ = reader.tell() / block_size
while max_ - min_ > 1: while max_ - min_ > 1:
mid = min_ + ((max_ - min_) / 2) mid = min_ + ((max_ - min_) / 2)

View File

@ -198,34 +198,3 @@ class LimitReader(object):
pass pass
return stream return stream
#=================================================================
# Local text file with known size -- used for binsearch
#=================================================================
class SeekableTextFileReader(object):
"""
A very simple file-like object wrapper that knows its total size,
via getsize()
Supports seek() operation.
Assumed to be a text file. Used for binsearch.
"""
def __init__(self, filename):
self.fh = open(filename, 'rb')
self.filename = filename
self.size = os.path.getsize(filename)
def getsize(self):
return self.size
def read(self, length=None):
return self.fh.read(length)
def readline(self, length=None):
return self.fh.readline(length)
def seek(self, offset):
return self.fh.seek(offset)
def close(self):
return self.fh.close()

View File

@ -59,7 +59,6 @@ org,iana)/about 20140126200706 http://www.iana.org/about text/html 200 6G77LZKFA
#================================================================= #=================================================================
import os import os
from pywb.utils.binsearch import iter_prefix, iter_exact, iter_range from pywb.utils.binsearch import iter_prefix, iter_exact, iter_range
from pywb.utils.loaders import SeekableTextFileReader
from pywb import get_test_dir from pywb import get_test_dir
@ -67,17 +66,14 @@ from pywb import get_test_dir
test_cdx_dir = get_test_dir() + 'cdx/' test_cdx_dir = get_test_dir() + 'cdx/'
def print_binsearch_results(key, iter_func): def print_binsearch_results(key, iter_func):
cdx = SeekableTextFileReader(test_cdx_dir + 'iana.cdx') with open(test_cdx_dir + 'iana.cdx') as cdx:
for line in iter_func(cdx, key):
for line in iter_func(cdx, key): print line
print line
def print_binsearch_results_range(key, end_key, iter_func, prev_size=0): def print_binsearch_results_range(key, end_key, iter_func, prev_size=0):
cdx = SeekableTextFileReader(test_cdx_dir + 'iana.cdx') with open(test_cdx_dir + 'iana.cdx') as cdx:
for line in iter_func(cdx, key, end_key, prev_size=prev_size):
for line in iter_func(cdx, key, end_key, prev_size=prev_size): print line
print line
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -39,18 +39,6 @@ True
# test with extra id, ensure 4 parts of the A-B=C-D form are present # test with extra id, ensure 4 parts of the A-B=C-D form are present
>>> len(re.split('[-=]', HMACCookieMaker('test', 'test', 5).make('extra'))) >>> len(re.split('[-=]', HMACCookieMaker('test', 'test', 5).make('extra')))
4 4
# SeekableTextFileReader Test
>>> sr = SeekableTextFileReader(test_cdx_dir + 'iana.cdx')
>>> sr.getsize()
30399
>>> seek_read_full(sr, 100)
'org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 34054 620049 iana.warc.gz\\n'
# seek, read, close
>>> r = sr.seek(0); sr.read(10); sr.close()
' CDX N b a'
""" """
@ -58,7 +46,7 @@ True
import re import re
from io import BytesIO from io import BytesIO
from pywb.utils.loaders import BlockLoader, HMACCookieMaker from pywb.utils.loaders import BlockLoader, HMACCookieMaker
from pywb.utils.loaders import LimitReader, SeekableTextFileReader from pywb.utils.loaders import LimitReader
from pywb import get_test_dir from pywb import get_test_dir

View File

@ -1,7 +1,6 @@
import redis import redis
from pywb.utils.binsearch import iter_exact from pywb.utils.binsearch import iter_exact
from pywb.utils.loaders import SeekableTextFileReader
import urlparse import urlparse
import os import os
@ -57,7 +56,7 @@ class RedisResolver:
class PathIndexResolver: class PathIndexResolver:
def __init__(self, pathindex_file): def __init__(self, pathindex_file):
self.pathindex_file = pathindex_file self.pathindex_file = pathindex_file
self.reader = SeekableTextFileReader(pathindex_file) self.reader = open(pathindex_file)
def __call__(self, filename): def __call__(self, filename):
result = iter_exact(self.reader, filename, '\t') result = iter_exact(self.reader, filename, '\t')