1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

Combine FileLoader/HttpLoader into a single BlockLoader which delegates based on scheme
This commit is contained in:
Ilya Kreymer 2014-02-22 11:08:46 -08:00
parent 434fd23a95
commit 1754f15831
4 changed files with 57 additions and 54 deletions

View File

@ -7,7 +7,7 @@ from cStringIO import StringIO
from cdxsource import CDXSource from cdxsource import CDXSource
from cdxobject import IDXObject from cdxobject import IDXObject
from pywb.utils.loaders import FileLoader, LimitReader from pywb.utils.loaders import BlockLoader
from pywb.utils.loaders import SeekableTextFileReader from pywb.utils.loaders import SeekableTextFileReader
from pywb.utils.bufferedreaders import gzip_decompressor from pywb.utils.bufferedreaders import gzip_decompressor
from pywb.utils.binsearch import iter_range, linearsearch from pywb.utils.binsearch import iter_range, linearsearch
@ -101,7 +101,11 @@ class ZipNumCluster(CDXSource):
if blocks: if blocks:
yield self.block_to_cdx_iter(blocks, ranges, params) yield self.block_to_cdx_iter(blocks, ranges, params)
blocks = ZipBlocks(idx['part'], idx['offset'], idx['length'], 1) blocks = ZipBlocks(idx['part'],
idx['offset'],
idx['length'],
1)
ranges = [blocks.length] ranges = [blocks.length]
if blocks: if blocks:
@ -130,7 +134,7 @@ class ZipNumCluster(CDXSource):
msg = 'Loading {b.count} blocks from {loc}:{b.offset}+{b.length}' msg = 'Loading {b.count} blocks from {loc}:{b.offset}+{b.length}'
logging.debug(msg.format(b=blocks, loc=location)) logging.debug(msg.format(b=blocks, loc=location))
reader = FileLoader().load(location, blocks.offset, blocks.length) reader = BlockLoader().load(location, blocks.offset, blocks.length)
def decompress_block(range_): def decompress_block(range_):
decomp = gzip_decompressor() decomp = gzip_decompressor()

View File

@ -9,18 +9,50 @@ import urllib2
import time import time
def is_http(filename):
    """
    Return True if ``filename`` is an http:// or https:// url.

    The scheme is compared case-insensitively, since uri schemes are
    case-insensitive (RFC 3986, section 3.1), so 'HTTP://...' is
    recognized as well as 'http://...'.
    """
    # only the leading characters can hold the scheme, so avoid
    # lowercasing the entire (possibly long) url
    scheme_prefix = filename[:len('https://')].lower()
    return scheme_prefix.startswith(('http://', 'https://'))
#================================================================= #=================================================================
# load a reader from http class BlockLoader(object):
#=================================================================
class HttpLoader(object):
""" """
Load a file-like reader over http using range requests a loader which can stream blocks of content
and an optional cookie created via a cookie_maker given a uri, offset and optional length.
Currently supports: http/https and file/local file system
""" """
def __init__(self, cookie_maker=None): def __init__(self, cookie_maker=None):
self.cookie_maker = cookie_maker self.cookie_maker = cookie_maker
def load(self, url, offset, length): def load(self, url, offset, length):
"""
Determine loading method based on uri
"""
if is_http(url):
return self.load_http(url, offset, length)
else:
return self.load_file(url, offset, length)
def load_file(self, url, offset, length):
    """
    Load a file-like reader from the local file system

    :param url: local path, optionally prefixed with 'file://'
    :param offset: byte offset to seek to before any reading
    :param length: max number of bytes the returned reader provides;
                   if not positive, the plain file object is returned
    :return: the opened file, wrapped in a LimitReader when length > 0

    Errors from open() or seek() propagate to the caller.
    """
    file_prefix = 'file://'
    if url.startswith(file_prefix):
        url = url[len(file_prefix):]

    afile = open(url, 'rb')
    try:
        afile.seek(offset)
    except Exception:
        # the caller never receives the file object when seek fails,
        # so close it here rather than leaking the handle
        afile.close()
        raise

    if length > 0:
        return LimitReader(afile, length)

    return afile
def load_http(self, url, offset, length):
"""
Load a file-like reader over http using range requests
and an optional cookie created via a cookie_maker
"""
if length > 0: if length > 0:
range_header = 'bytes={0}-{1}'.format(offset, offset + length - 1) range_header = 'bytes={0}-{1}'.format(offset, offset + length - 1)
else: else:
@ -71,27 +103,6 @@ class HMACCookieMaker(object):
return cookie return cookie
#=================================================================
# load a reader from local filesystem
#=================================================================
class FileLoader(object):
    """
    Load a file-like reader from the local file system
    """

    def load(self, url, offset, length):
        """
        Open ``url`` for binary reading, positioned at ``offset``.

        :param url: local path, optionally prefixed with 'file://'
        :param offset: byte offset to seek to before any reading
        :param length: max number of bytes the returned reader provides;
                       if not positive, the plain file object is returned
        :return: the opened file, wrapped in a LimitReader when length > 0

        Errors from open() or seek() propagate to the caller.
        """
        file_prefix = 'file://'
        if url.startswith(file_prefix):
            url = url[len(file_prefix):]

        afile = open(url, 'rb')
        try:
            afile.seek(offset)
        except Exception:
            # the caller never receives the file object when seek fails,
            # so close it here rather than leaking the handle
            afile.close()
            raise

        if length > 0:
            return LimitReader(afile, length)

        return afile
#================================================================= #=================================================================
# Limit Reader # Limit Reader
#================================================================= #=================================================================

View File

@ -10,9 +10,9 @@
>>> read_multiple(LimitReader(StringIO.StringIO('abcdefghjiklmnopqrstuvwxyz'), 10), [2, 2, 20]) >>> read_multiple(LimitReader(StringIO.StringIO('abcdefghjiklmnopqrstuvwxyz'), 10), [2, 2, 20])
'efghji' 'efghji'
# FileLoader Tests (includes LimitReader) # BlockLoader Tests (includes LimitReader)
# Ensure attempt to read more than 100 bytes, reads exactly 100 bytes # Ensure attempt to read more than 100 bytes, reads exactly 100 bytes
>>> len(FileLoader().load(test_cdx_dir + 'iana.cdx', 0, 100).read('400')) >>> len(BlockLoader().load(test_cdx_dir + 'iana.cdx', 0, 100).read('400'))
100 100
# SeekableTextFileReader Test # SeekableTextFileReader Test
@ -34,7 +34,7 @@
>>> DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx.gz', 'rb'), decomp_type = 'gzip').readline() >>> DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx.gz', 'rb'), decomp_type = 'gzip').readline()
' CDX N b a m s k r M S V g\\n' ' CDX N b a m s k r M S V g\\n'
>>> HttpLoader(HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read() >>> BlockLoader(HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read()
'Example Domain' 'Example Domain'
# test very small block size # test very small block size
@ -53,7 +53,7 @@
#================================================================= #=================================================================
import os import os
import StringIO import StringIO
from pywb.utils.loaders import FileLoader, HttpLoader, HMACCookieMaker from pywb.utils.loaders import BlockLoader, HMACCookieMaker
from pywb.utils.loaders import LimitReader, SeekableTextFileReader from pywb.utils.loaders import LimitReader, SeekableTextFileReader
from pywb.utils.bufferedreaders import DecompressingBufferedReader from pywb.utils.bufferedreaders import DecompressingBufferedReader

View File

@ -6,7 +6,7 @@ from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.statusandheaders import StatusAndHeadersParser from pywb.utils.statusandheaders import StatusAndHeadersParser
from pywb.utils.statusandheaders import StatusAndHeadersParserException from pywb.utils.statusandheaders import StatusAndHeadersParserException
from pywb.utils.loaders import FileLoader, HttpLoader from pywb.utils.loaders import BlockLoader
from pywb.utils.bufferedreaders import DecompressingBufferedReader from pywb.utils.bufferedreaders import DecompressingBufferedReader
#================================================================= #=================================================================
@ -32,23 +32,11 @@ class ArcWarcRecordLoader:
ARC_HEADERS = ["uri", "ip-address", "creation-date", ARC_HEADERS = ["uri", "ip-address", "creation-date",
"content-type", "length"] "content-type", "length"]
@staticmethod def __init__(self, loader=None, cookie_maker=None, block_size=8192):
def create_default_loaders(cookie_maker=None): if not loader:
http = HttpLoader(cookie_maker) loader = BlockLoader(cookie_maker)
file = FileLoader()
return {
'http': http,
'https': http,
'file': file,
'': file
}
def __init__(self, loaders={}, cookie_maker=None, block_size=8192):
self.loaders = loaders
if not self.loaders:
self.loaders = self.create_default_loaders(cookie_maker)
self.loader = loader
self.block_size = block_size self.block_size = block_size
self.arc_parser = ARCHeadersParser(self.ARC_HEADERS) self.arc_parser = ARCHeadersParser(self.ARC_HEADERS)
@ -60,16 +48,16 @@ class ArcWarcRecordLoader:
def load(self, url, offset, length): def load(self, url, offset, length):
url_parts = urlparse.urlsplit(url) url_parts = urlparse.urlsplit(url)
loader = self.loaders.get(url_parts.scheme) #loader = self.loaders.get(url_parts.scheme)
if not loader: #if not loader:
raise ArchiveLoadFailed('Unknown Protocol', url) # raise ArchiveLoadFailed('Unknown Protocol', url)
try: try:
length = int(length) length = int(length)
except: except:
length = -1 length = -1
raw = loader.load(url, long(offset), length) raw = self.loader.load(url, long(offset), length)
decomp_type = 'gzip' decomp_type = 'gzip'