diff --git a/pywb/cdx/cdxsource.py b/pywb/cdx/cdxsource.py index 1bd3c158..71742041 100644 --- a/pywb/cdx/cdxsource.py +++ b/pywb/cdx/cdxsource.py @@ -1,5 +1,4 @@ from pywb.utils.binsearch import iter_range -from pywb.utils.loaders import SeekableTextFileReader from pywb.utils.wbexception import AccessException, NotFoundException from pywb.utils.wbexception import BadRequestException, WbException @@ -29,7 +28,7 @@ class CDXFile(CDXSource): self.filename = filename def load_cdx(self, query): - source = SeekableTextFileReader(self.filename) + source = open(self.filename) return iter_range(source, query.key, query.end_key) def __str__(self): diff --git a/pywb/cdx/zipnum.py b/pywb/cdx/zipnum.py index e282dfc0..071319a5 100644 --- a/pywb/cdx/zipnum.py +++ b/pywb/cdx/zipnum.py @@ -9,7 +9,6 @@ from cdxsource import CDXSource from cdxobject import IDXObject from pywb.utils.loaders import BlockLoader -from pywb.utils.loaders import SeekableTextFileReader from pywb.utils.bufferedreaders import gzip_decompressor from pywb.utils.binsearch import iter_range, linearsearch @@ -113,7 +112,7 @@ class ZipNumCluster(CDXSource): def load_cdx(self, query): self.load_loc() - reader = SeekableTextFileReader(self.summary) + reader = open(self.summary) idx_iter = iter_range(reader, query.key, diff --git a/pywb/utils/binsearch.py b/pywb/utils/binsearch.py index 7d939c18..4e3b506c 100644 --- a/pywb/utils/binsearch.py +++ b/pywb/utils/binsearch.py @@ -16,7 +16,9 @@ def binsearch_offset(reader, key, compare_func=cmp, block_size=8192): Optional compare_func may be specified """ min_ = 0 - max_ = reader.getsize() / block_size + + reader.seek(0, 2) + max_ = reader.tell() / block_size while max_ - min_ > 1: mid = min_ + ((max_ - min_) / 2) diff --git a/pywb/utils/loaders.py b/pywb/utils/loaders.py index f86e4072..a1d12d27 100644 --- a/pywb/utils/loaders.py +++ b/pywb/utils/loaders.py @@ -198,34 +198,3 @@ class LimitReader(object): pass return stream - - -#================================================================= -# Local text file with known size -- used for binsearch -#================================================================= -class SeekableTextFileReader(object): - """ - A very simple file-like object wrapper that knows it's total size, - via getsize() - Supports seek() operation. - Assumed to be a text file. Used for binsearch. - """ - def __init__(self, filename): - self.fh = open(filename, 'rb') - self.filename = filename - self.size = os.path.getsize(filename) - - def getsize(self): - return self.size - - def read(self, length=None): - return self.fh.read(length) - - def readline(self, length=None): - return self.fh.readline(length) - - def seek(self, offset): - return self.fh.seek(offset) - - def close(self): - return self.fh.close() diff --git a/pywb/utils/test/test_binsearch.py b/pywb/utils/test/test_binsearch.py index 40ea1f58..c599377e 100644 --- a/pywb/utils/test/test_binsearch.py +++ b/pywb/utils/test/test_binsearch.py @@ -59,7 +59,6 @@ org,iana)/about 20140126200706 http://www.iana.org/about text/html 200 6G77LZKFA #================================================================= import os from pywb.utils.binsearch import iter_prefix, iter_exact, iter_range -from pywb.utils.loaders import SeekableTextFileReader from pywb import get_test_dir @@ -67,17 +66,14 @@ from pywb import get_test_dir test_cdx_dir = get_test_dir() + 'cdx/' def print_binsearch_results(key, iter_func): - cdx = SeekableTextFileReader(test_cdx_dir + 'iana.cdx') - - for line in iter_func(cdx, key): - print line - + with open(test_cdx_dir + 'iana.cdx') as cdx: + for line in iter_func(cdx, key): + print line def print_binsearch_results_range(key, end_key, iter_func, prev_size=0): - cdx = SeekableTextFileReader(test_cdx_dir + 'iana.cdx') - - for line in iter_func(cdx, key, end_key, prev_size=prev_size): - print line + with open(test_cdx_dir + 'iana.cdx') as cdx: + for line in iter_func(cdx, key, end_key, prev_size=prev_size): + print line if __name__ == "__main__": diff --git a/pywb/utils/test/test_loaders.py b/pywb/utils/test/test_loaders.py index 88368146..b64f2419 100644 --- a/pywb/utils/test/test_loaders.py +++ b/pywb/utils/test/test_loaders.py @@ -39,18 +39,6 @@ True # test with extra id, ensure 4 parts of the A-B=C-D form are present >>> len(re.split('[-=]', HMACCookieMaker('test', 'test', 5).make('extra'))) 4 - -# SeekableTextFileReader Test ->>> sr = SeekableTextFileReader(test_cdx_dir + 'iana.cdx') ->>> sr.getsize() -30399 - ->>> seek_read_full(sr, 100) -'org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 34054 620049 iana.warc.gz\\n' - -# seek, read, close ->>> r = sr.seek(0); sr.read(10); sr.close() -' CDX N b a' """ @@ -58,7 +46,7 @@ True import re from io import BytesIO from pywb.utils.loaders import BlockLoader, HMACCookieMaker -from pywb.utils.loaders import LimitReader, SeekableTextFileReader +from pywb.utils.loaders import LimitReader from pywb import get_test_dir diff --git a/pywb/warc/pathresolvers.py b/pywb/warc/pathresolvers.py index 5419eeb9..469fbfb1 100644 --- a/pywb/warc/pathresolvers.py +++ b/pywb/warc/pathresolvers.py @@ -1,7 +1,6 @@ import redis from pywb.utils.binsearch import iter_exact -from pywb.utils.loaders import SeekableTextFileReader import urlparse import os @@ -57,7 +56,7 @@ class RedisResolver: class PathIndexResolver: def __init__(self, pathindex_file): self.pathindex_file = pathindex_file - self.reader = SeekableTextFileReader(pathindex_file) + self.reader = open(pathindex_file) def __call__(self, filename): result = iter_exact(self.reader, filename, '\t')