mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
Remove SeekableTextFileReader; replace it with standard file-like objects,
using seek(0, 2) and tell() to get the file length
This commit is contained in:
parent
46449ac188
commit
e7957a5cae
@ -1,5 +1,4 @@
|
|||||||
from pywb.utils.binsearch import iter_range
|
from pywb.utils.binsearch import iter_range
|
||||||
from pywb.utils.loaders import SeekableTextFileReader
|
|
||||||
|
|
||||||
from pywb.utils.wbexception import AccessException, NotFoundException
|
from pywb.utils.wbexception import AccessException, NotFoundException
|
||||||
from pywb.utils.wbexception import BadRequestException, WbException
|
from pywb.utils.wbexception import BadRequestException, WbException
|
||||||
@ -29,7 +28,7 @@ class CDXFile(CDXSource):
|
|||||||
self.filename = filename
|
self.filename = filename
|
||||||
|
|
||||||
def load_cdx(self, query):
|
def load_cdx(self, query):
|
||||||
source = SeekableTextFileReader(self.filename)
|
source = open(self.filename)
|
||||||
return iter_range(source, query.key, query.end_key)
|
return iter_range(source, query.key, query.end_key)
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
|
@ -9,7 +9,6 @@ from cdxsource import CDXSource
|
|||||||
from cdxobject import IDXObject
|
from cdxobject import IDXObject
|
||||||
|
|
||||||
from pywb.utils.loaders import BlockLoader
|
from pywb.utils.loaders import BlockLoader
|
||||||
from pywb.utils.loaders import SeekableTextFileReader
|
|
||||||
from pywb.utils.bufferedreaders import gzip_decompressor
|
from pywb.utils.bufferedreaders import gzip_decompressor
|
||||||
from pywb.utils.binsearch import iter_range, linearsearch
|
from pywb.utils.binsearch import iter_range, linearsearch
|
||||||
|
|
||||||
@ -113,7 +112,7 @@ class ZipNumCluster(CDXSource):
|
|||||||
def load_cdx(self, query):
|
def load_cdx(self, query):
|
||||||
self.load_loc()
|
self.load_loc()
|
||||||
|
|
||||||
reader = SeekableTextFileReader(self.summary)
|
reader = open(self.summary)
|
||||||
|
|
||||||
idx_iter = iter_range(reader,
|
idx_iter = iter_range(reader,
|
||||||
query.key,
|
query.key,
|
||||||
|
@ -16,7 +16,9 @@ def binsearch_offset(reader, key, compare_func=cmp, block_size=8192):
|
|||||||
Optional compare_func may be specified
|
Optional compare_func may be specified
|
||||||
"""
|
"""
|
||||||
min_ = 0
|
min_ = 0
|
||||||
max_ = reader.getsize() / block_size
|
|
||||||
|
reader.seek(0, 2)
|
||||||
|
max_ = reader.tell() / block_size
|
||||||
|
|
||||||
while max_ - min_ > 1:
|
while max_ - min_ > 1:
|
||||||
mid = min_ + ((max_ - min_) / 2)
|
mid = min_ + ((max_ - min_) / 2)
|
||||||
|
@ -198,34 +198,3 @@ class LimitReader(object):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
return stream
|
return stream
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
# Local text file with known size -- used for binsearch
|
|
||||||
#=================================================================
|
|
||||||
class SeekableTextFileReader(object):
|
|
||||||
"""
|
|
||||||
A very simple file-like object wrapper that knows its total size,
|
|
||||||
via getsize()
|
|
||||||
Supports seek() operation.
|
|
||||||
Assumed to be a text file. Used for binsearch.
|
|
||||||
"""
|
|
||||||
def __init__(self, filename):
|
|
||||||
self.fh = open(filename, 'rb')
|
|
||||||
self.filename = filename
|
|
||||||
self.size = os.path.getsize(filename)
|
|
||||||
|
|
||||||
def getsize(self):
|
|
||||||
return self.size
|
|
||||||
|
|
||||||
def read(self, length=None):
|
|
||||||
return self.fh.read(length)
|
|
||||||
|
|
||||||
def readline(self, length=None):
|
|
||||||
return self.fh.readline(length)
|
|
||||||
|
|
||||||
def seek(self, offset):
|
|
||||||
return self.fh.seek(offset)
|
|
||||||
|
|
||||||
def close(self):
|
|
||||||
return self.fh.close()
|
|
||||||
|
@ -59,7 +59,6 @@ org,iana)/about 20140126200706 http://www.iana.org/about text/html 200 6G77LZKFA
|
|||||||
#=================================================================
|
#=================================================================
|
||||||
import os
|
import os
|
||||||
from pywb.utils.binsearch import iter_prefix, iter_exact, iter_range
|
from pywb.utils.binsearch import iter_prefix, iter_exact, iter_range
|
||||||
from pywb.utils.loaders import SeekableTextFileReader
|
|
||||||
|
|
||||||
from pywb import get_test_dir
|
from pywb import get_test_dir
|
||||||
|
|
||||||
@ -67,17 +66,14 @@ from pywb import get_test_dir
|
|||||||
test_cdx_dir = get_test_dir() + 'cdx/'
|
test_cdx_dir = get_test_dir() + 'cdx/'
|
||||||
|
|
||||||
def print_binsearch_results(key, iter_func):
|
def print_binsearch_results(key, iter_func):
|
||||||
cdx = SeekableTextFileReader(test_cdx_dir + 'iana.cdx')
|
with open(test_cdx_dir + 'iana.cdx') as cdx:
|
||||||
|
for line in iter_func(cdx, key):
|
||||||
for line in iter_func(cdx, key):
|
print line
|
||||||
print line
|
|
||||||
|
|
||||||
|
|
||||||
def print_binsearch_results_range(key, end_key, iter_func, prev_size=0):
|
def print_binsearch_results_range(key, end_key, iter_func, prev_size=0):
|
||||||
cdx = SeekableTextFileReader(test_cdx_dir + 'iana.cdx')
|
with open(test_cdx_dir + 'iana.cdx') as cdx:
|
||||||
|
for line in iter_func(cdx, key, end_key, prev_size=prev_size):
|
||||||
for line in iter_func(cdx, key, end_key, prev_size=prev_size):
|
print line
|
||||||
print line
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
@ -39,18 +39,6 @@ True
|
|||||||
# test with extra id, ensure 4 parts of the A-B=C-D form are present
|
# test with extra id, ensure 4 parts of the A-B=C-D form are present
|
||||||
>>> len(re.split('[-=]', HMACCookieMaker('test', 'test', 5).make('extra')))
|
>>> len(re.split('[-=]', HMACCookieMaker('test', 'test', 5).make('extra')))
|
||||||
4
|
4
|
||||||
|
|
||||||
# SeekableTextFileReader Test
|
|
||||||
>>> sr = SeekableTextFileReader(test_cdx_dir + 'iana.cdx')
|
|
||||||
>>> sr.getsize()
|
|
||||||
30399
|
|
||||||
|
|
||||||
>>> seek_read_full(sr, 100)
|
|
||||||
'org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 34054 620049 iana.warc.gz\\n'
|
|
||||||
|
|
||||||
# seek, read, close
|
|
||||||
>>> r = sr.seek(0); sr.read(10); sr.close()
|
|
||||||
' CDX N b a'
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
@ -58,7 +46,7 @@ True
|
|||||||
import re
|
import re
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from pywb.utils.loaders import BlockLoader, HMACCookieMaker
|
from pywb.utils.loaders import BlockLoader, HMACCookieMaker
|
||||||
from pywb.utils.loaders import LimitReader, SeekableTextFileReader
|
from pywb.utils.loaders import LimitReader
|
||||||
|
|
||||||
from pywb import get_test_dir
|
from pywb import get_test_dir
|
||||||
|
|
||||||
|
@ -1,7 +1,6 @@
|
|||||||
import redis
|
import redis
|
||||||
|
|
||||||
from pywb.utils.binsearch import iter_exact
|
from pywb.utils.binsearch import iter_exact
|
||||||
from pywb.utils.loaders import SeekableTextFileReader
|
|
||||||
|
|
||||||
import urlparse
|
import urlparse
|
||||||
import os
|
import os
|
||||||
@ -57,7 +56,7 @@ class RedisResolver:
|
|||||||
class PathIndexResolver:
|
class PathIndexResolver:
|
||||||
def __init__(self, pathindex_file):
|
def __init__(self, pathindex_file):
|
||||||
self.pathindex_file = pathindex_file
|
self.pathindex_file = pathindex_file
|
||||||
self.reader = SeekableTextFileReader(pathindex_file)
|
self.reader = open(pathindex_file)
|
||||||
|
|
||||||
def __call__(self, filename):
|
def __call__(self, filename):
|
||||||
result = iter_exact(self.reader, filename, '\t')
|
result = iter_exact(self.reader, filename, '\t')
|
||||||
|
Loading…
x
Reference in New Issue
Block a user