
Combine FileLoader/HttpLoader into a single BlockLoader which delegates based on scheme
Ilya Kreymer 2014-02-22 11:08:46 -08:00
parent 434fd23a95
commit 1754f15831
4 changed files with 57 additions and 54 deletions
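
The change replaces the two scheme-specific loaders with a single BlockLoader whose load() picks the transport from the uri: http/https urls are fetched with a range request, anything else is opened as a local file. A minimal usage sketch of the new entry point, assuming this version of pywb is importable; the local path is a made-up placeholder, and the http call (taken from the doctest below) needs network access:

    from pywb.utils.loaders import BlockLoader

    loader = BlockLoader()

    # plain path or file:// uri: open, seek to offset, and cap the read
    # with a LimitReader when a positive length is given
    local = loader.load('/tmp/example.cdx', 0, 100)   # hypothetical path
    print(len(local.read(400)))   # 100, if the file holds at least 100 bytes

    # http(s) uri: sends a 'Range: bytes=41-54' request
    remote = loader.load('http://example.com', 41, 14)
    print(remote.read())          # 'Example Domain'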

View File

@@ -7,7 +7,7 @@ from cStringIO import StringIO
from cdxsource import CDXSource
from cdxobject import IDXObject
from pywb.utils.loaders import FileLoader, LimitReader
from pywb.utils.loaders import BlockLoader
from pywb.utils.loaders import SeekableTextFileReader
from pywb.utils.bufferedreaders import gzip_decompressor
from pywb.utils.binsearch import iter_range, linearsearch
@@ -101,7 +101,11 @@ class ZipNumCluster(CDXSource):
if blocks:
yield self.block_to_cdx_iter(blocks, ranges, params)
blocks = ZipBlocks(idx['part'], idx['offset'], idx['length'], 1)
blocks = ZipBlocks(idx['part'],
idx['offset'],
idx['length'],
1)
ranges = [blocks.length]
if blocks:
@@ -130,7 +134,7 @@ class ZipNumCluster(CDXSource):
msg = 'Loading {b.count} blocks from {loc}:{b.offset}+{b.length}'
logging.debug(msg.format(b=blocks, loc=location))
reader = FileLoader().load(location, blocks.offset, blocks.length)
reader = BlockLoader().load(location, blocks.offset, blocks.length)
def decompress_block(range_):
decomp = gzip_decompressor()
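
In this file only the call site changes: a compressed index block is still fetched by offset and length, now through the shared BlockLoader, and then gunzipped. A rough standalone sketch of that fetch-then-decompress step, using zlib directly rather than pywb's gzip_decompressor helper; the path, offset and length are invented placeholders:

    import zlib
    from pywb.utils.loaders import BlockLoader

    # read one gzipped block of a zipnum cluster (all values are placeholders)
    reader = BlockLoader().load('/tmp/cluster.cdx.gz', 0, 4096)
    block = zlib.decompressobj(16 + zlib.MAX_WBITS).decompress(reader.read(4096))
    print(block.splitlines()[0])   # first cdx line in the block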

View File

@@ -9,18 +9,50 @@ import urllib2
import time
def is_http(filename):
return any(filename.startswith(x) for x in ['http://', 'https://'])
#=================================================================
# load a reader from http
#=================================================================
class HttpLoader(object):
class BlockLoader(object):
"""
Load a file-like reader over http using range requests
and an optional cookie created via a cookie_maker
a loader which can stream blocks of content
given a uri, offset and optional length.
Currently supports: http/https and file/local file system
"""
def __init__(self, cookie_maker=None):
self.cookie_maker = cookie_maker
def load(self, url, offset, length):
"""
Determine loading method based on uri
"""
if is_http(url):
return self.load_http(url, offset, length)
else:
return self.load_file(url, offset, length)
def load_file(self, url, offset, length):
"""
Load a file-like reader from the local file system
"""
if url.startswith('file://'):
url = url[len('file://'):]
afile = open(url, 'rb')
afile.seek(offset)
if length > 0:
return LimitReader(afile, length)
else:
return afile
def load_http(self, url, offset, length):
"""
Load a file-like reader over http using range requests
and an optional cookie created via a cookie_maker
"""
if length > 0:
range_header = 'bytes={0}-{1}'.format(offset, offset + length - 1)
else:
@@ -71,27 +103,6 @@ class HMACCookieMaker(object):
return cookie
#=================================================================
# load a reader from local filesystem
#=================================================================
class FileLoader(object):
"""
Load a file-like reader from the local file system
"""
def load(self, url, offset, length):
if url.startswith('file://'):
url = url[len('file://'):]
afile = open(url, 'rb')
afile.seek(offset)
if length > 0:
return LimitReader(afile, length)
else:
return afile
#=================================================================
# Limit Reader
#=================================================================
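
To make the file branch concrete, here is a small self-contained check, assuming this version of pywb is installed, that the returned reader starts at the requested offset and that LimitReader caps it at the requested length:

    import tempfile
    from pywb.utils.loaders import BlockLoader

    with tempfile.NamedTemporaryFile(delete=False) as tmp:
        tmp.write(b'abcdefghijklmnopqrstuvwxyz')
        path = tmp.name

    reader = BlockLoader().load(path, 2, 5)   # offset=2, length=5
    print(reader.read(100))                   # 'cdefg', capped at 5 bytes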

View File

@@ -10,9 +10,9 @@
>>> read_multiple(LimitReader(StringIO.StringIO('abcdefghjiklmnopqrstuvwxyz'), 10), [2, 2, 20])
'efghji'
# FileLoader Tests (includes LimitReader)
# BlockLoader Tests (includes LimitReader)
# Ensure attempt to read more than 100 bytes, reads exactly 100 bytes
>>> len(FileLoader().load(test_cdx_dir + 'iana.cdx', 0, 100).read('400'))
>>> len(BlockLoader().load(test_cdx_dir + 'iana.cdx', 0, 100).read('400'))
100
# SeekableTextFileReader Test
@@ -34,7 +34,7 @@
>>> DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx.gz', 'rb'), decomp_type = 'gzip').readline()
' CDX N b a m s k r M S V g\\n'
>>> HttpLoader(HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read()
>>> BlockLoader(HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read()
'Example Domain'
# test very small block size
@@ -53,7 +53,7 @@
#=================================================================
import os
import StringIO
from pywb.utils.loaders import FileLoader, HttpLoader, HMACCookieMaker
from pywb.utils.loaders import BlockLoader, HMACCookieMaker
from pywb.utils.loaders import LimitReader, SeekableTextFileReader
from pywb.utils.bufferedreaders import DecompressingBufferedReader
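
The updated doctest also exercises the optional cookie_maker: HMACCookieMaker('test', 'test', 5) builds an HMAC-based cookie (per the class name) that load_http attaches to the range request, and offset 41 with length 14 becomes the header 'Range: bytes=41-54', which on example.com covers exactly the text 'Example Domain'. The same call outside the doctest; it needs network access, and the HMACCookieMaker arguments are simply the ones used in the test:

    from pywb.utils.loaders import BlockLoader, HMACCookieMaker

    loader = BlockLoader(HMACCookieMaker('test', 'test', 5))
    print(loader.load('http://example.com', 41, 14).read())   # 'Example Domain'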

View File

@@ -6,7 +6,7 @@ from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.statusandheaders import StatusAndHeadersParser
from pywb.utils.statusandheaders import StatusAndHeadersParserException
from pywb.utils.loaders import FileLoader, HttpLoader
from pywb.utils.loaders import BlockLoader
from pywb.utils.bufferedreaders import DecompressingBufferedReader
#=================================================================
@@ -32,23 +32,11 @@ class ArcWarcRecordLoader:
ARC_HEADERS = ["uri", "ip-address", "creation-date",
"content-type", "length"]
@staticmethod
def create_default_loaders(cookie_maker=None):
http = HttpLoader(cookie_maker)
file = FileLoader()
return {
'http': http,
'https': http,
'file': file,
'': file
}
def __init__(self, loaders={}, cookie_maker=None, block_size=8192):
self.loaders = loaders
if not self.loaders:
self.loaders = self.create_default_loaders(cookie_maker)
def __init__(self, loader=None, cookie_maker=None, block_size=8192):
if not loader:
loader = BlockLoader(cookie_maker)
self.loader = loader
self.block_size = block_size
self.arc_parser = ARCHeadersParser(self.ARC_HEADERS)
@@ -60,16 +48,16 @@ class ArcWarcRecordLoader:
def load(self, url, offset, length):
url_parts = urlparse.urlsplit(url)
loader = self.loaders.get(url_parts.scheme)
if not loader:
raise ArchiveLoadFailed('Unknown Protocol', url)
#loader = self.loaders.get(url_parts.scheme)
#if not loader:
# raise ArchiveLoadFailed('Unknown Protocol', url)
try:
length = int(length)
except:
length = -1
raw = loader.load(url, long(offset), length)
raw = self.loader.load(url, long(offset), length)
decomp_type = 'gzip'
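
With the scheme dispatch now inside BlockLoader, ArcWarcRecordLoader keeps a single loader instead of a dict keyed by scheme, and the old per-scheme lookup is commented out. A construction sketch; the module path pywb.warc.recordloader as well as the warc path, offset and length are assumptions for illustration only:

    from pywb.utils.loaders import BlockLoader
    from pywb.warc.recordloader import ArcWarcRecordLoader   # module path assumed

    # with no args the loader builds its own BlockLoader(); a custom loader
    # or a cookie_maker can still be passed in
    record_loader = ArcWarcRecordLoader(loader=BlockLoader())
    record = record_loader.load('/tmp/example.warc.gz', 0, -1)   # placeholders; -1 = length unknown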