mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
Combine FileLoader/HttpLoader into a single BlockLoader which
delegates based on scheme
This commit is contained in:
parent
434fd23a95
commit
1754f15831
@ -7,7 +7,7 @@ from cStringIO import StringIO
|
|||||||
from cdxsource import CDXSource
|
from cdxsource import CDXSource
|
||||||
from cdxobject import IDXObject
|
from cdxobject import IDXObject
|
||||||
|
|
||||||
from pywb.utils.loaders import FileLoader, LimitReader
|
from pywb.utils.loaders import BlockLoader
|
||||||
from pywb.utils.loaders import SeekableTextFileReader
|
from pywb.utils.loaders import SeekableTextFileReader
|
||||||
from pywb.utils.bufferedreaders import gzip_decompressor
|
from pywb.utils.bufferedreaders import gzip_decompressor
|
||||||
from pywb.utils.binsearch import iter_range, linearsearch
|
from pywb.utils.binsearch import iter_range, linearsearch
|
||||||
@ -101,7 +101,11 @@ class ZipNumCluster(CDXSource):
|
|||||||
if blocks:
|
if blocks:
|
||||||
yield self.block_to_cdx_iter(blocks, ranges, params)
|
yield self.block_to_cdx_iter(blocks, ranges, params)
|
||||||
|
|
||||||
blocks = ZipBlocks(idx['part'], idx['offset'], idx['length'], 1)
|
blocks = ZipBlocks(idx['part'],
|
||||||
|
idx['offset'],
|
||||||
|
idx['length'],
|
||||||
|
1)
|
||||||
|
|
||||||
ranges = [blocks.length]
|
ranges = [blocks.length]
|
||||||
|
|
||||||
if blocks:
|
if blocks:
|
||||||
@ -130,7 +134,7 @@ class ZipNumCluster(CDXSource):
|
|||||||
msg = 'Loading {b.count} blocks from {loc}:{b.offset}+{b.length}'
|
msg = 'Loading {b.count} blocks from {loc}:{b.offset}+{b.length}'
|
||||||
logging.debug(msg.format(b=blocks, loc=location))
|
logging.debug(msg.format(b=blocks, loc=location))
|
||||||
|
|
||||||
reader = FileLoader().load(location, blocks.offset, blocks.length)
|
reader = BlockLoader().load(location, blocks.offset, blocks.length)
|
||||||
|
|
||||||
def decompress_block(range_):
|
def decompress_block(range_):
|
||||||
decomp = gzip_decompressor()
|
decomp = gzip_decompressor()
|
||||||
|
@ -9,18 +9,50 @@ import urllib2
|
|||||||
import time
|
import time
|
||||||
|
|
||||||
|
|
||||||
|
def is_http(filename):
|
||||||
|
return any(filename.startswith(x) for x in ['http://', 'https://'])
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
# load a reader from http
|
class BlockLoader(object):
|
||||||
#=================================================================
|
|
||||||
class HttpLoader(object):
|
|
||||||
"""
|
"""
|
||||||
Load a file-like reader over http using range requests
|
a loader which can stream blocks of content
|
||||||
and an optional cookie created via a cookie_maker
|
given a uri, offset and optional length.
|
||||||
|
Currently supports: http/https and file/local file system
|
||||||
"""
|
"""
|
||||||
def __init__(self, cookie_maker=None):
|
def __init__(self, cookie_maker=None):
|
||||||
self.cookie_maker = cookie_maker
|
self.cookie_maker = cookie_maker
|
||||||
|
|
||||||
def load(self, url, offset, length):
|
def load(self, url, offset, length):
|
||||||
|
"""
|
||||||
|
Determine loading method based on uri
|
||||||
|
"""
|
||||||
|
if is_http(url):
|
||||||
|
return self.load_http(url, offset, length)
|
||||||
|
else:
|
||||||
|
return self.load_file(url, offset, length)
|
||||||
|
|
||||||
|
def load_file(self, url, offset, length):
|
||||||
|
"""
|
||||||
|
Load a file-like reader from the local file system
|
||||||
|
"""
|
||||||
|
|
||||||
|
if url.startswith('file://'):
|
||||||
|
url = url[len('file://'):]
|
||||||
|
|
||||||
|
afile = open(url, 'rb')
|
||||||
|
afile.seek(offset)
|
||||||
|
|
||||||
|
if length > 0:
|
||||||
|
return LimitReader(afile, length)
|
||||||
|
else:
|
||||||
|
return afile
|
||||||
|
|
||||||
|
def load_http(self, url, offset, length):
|
||||||
|
"""
|
||||||
|
Load a file-like reader over http using range requests
|
||||||
|
and an optional cookie created via a cookie_maker
|
||||||
|
"""
|
||||||
if length > 0:
|
if length > 0:
|
||||||
range_header = 'bytes={0}-{1}'.format(offset, offset + length - 1)
|
range_header = 'bytes={0}-{1}'.format(offset, offset + length - 1)
|
||||||
else:
|
else:
|
||||||
@ -71,27 +103,6 @@ class HMACCookieMaker(object):
|
|||||||
return cookie
|
return cookie
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
# load a reader from local filesystem
|
|
||||||
#=================================================================
|
|
||||||
class FileLoader(object):
|
|
||||||
"""
|
|
||||||
Load a file-like reader from the local file system
|
|
||||||
"""
|
|
||||||
|
|
||||||
def load(self, url, offset, length):
|
|
||||||
if url.startswith('file://'):
|
|
||||||
url = url[len('file://'):]
|
|
||||||
|
|
||||||
afile = open(url, 'rb')
|
|
||||||
afile.seek(offset)
|
|
||||||
|
|
||||||
if length > 0:
|
|
||||||
return LimitReader(afile, length)
|
|
||||||
else:
|
|
||||||
return afile
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
# Limit Reader
|
# Limit Reader
|
||||||
#=================================================================
|
#=================================================================
|
||||||
|
@ -10,9 +10,9 @@
|
|||||||
>>> read_multiple(LimitReader(StringIO.StringIO('abcdefghjiklmnopqrstuvwxyz'), 10), [2, 2, 20])
|
>>> read_multiple(LimitReader(StringIO.StringIO('abcdefghjiklmnopqrstuvwxyz'), 10), [2, 2, 20])
|
||||||
'efghji'
|
'efghji'
|
||||||
|
|
||||||
# FileLoader Tests (includes LimitReader)
|
# BlockLoader Tests (includes LimitReader)
|
||||||
# Ensure attempt to read more than 100 bytes, reads exactly 100 bytes
|
# Ensure attempt to read more than 100 bytes, reads exactly 100 bytes
|
||||||
>>> len(FileLoader().load(test_cdx_dir + 'iana.cdx', 0, 100).read('400'))
|
>>> len(BlockLoader().load(test_cdx_dir + 'iana.cdx', 0, 100).read('400'))
|
||||||
100
|
100
|
||||||
|
|
||||||
# SeekableTextFileReader Test
|
# SeekableTextFileReader Test
|
||||||
@ -34,7 +34,7 @@
|
|||||||
>>> DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx.gz', 'rb'), decomp_type = 'gzip').readline()
|
>>> DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx.gz', 'rb'), decomp_type = 'gzip').readline()
|
||||||
' CDX N b a m s k r M S V g\\n'
|
' CDX N b a m s k r M S V g\\n'
|
||||||
|
|
||||||
>>> HttpLoader(HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read()
|
>>> BlockLoader(HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read()
|
||||||
'Example Domain'
|
'Example Domain'
|
||||||
|
|
||||||
# test very small block size
|
# test very small block size
|
||||||
@ -53,7 +53,7 @@
|
|||||||
#=================================================================
|
#=================================================================
|
||||||
import os
|
import os
|
||||||
import StringIO
|
import StringIO
|
||||||
from pywb.utils.loaders import FileLoader, HttpLoader, HMACCookieMaker
|
from pywb.utils.loaders import BlockLoader, HMACCookieMaker
|
||||||
from pywb.utils.loaders import LimitReader, SeekableTextFileReader
|
from pywb.utils.loaders import LimitReader, SeekableTextFileReader
|
||||||
from pywb.utils.bufferedreaders import DecompressingBufferedReader
|
from pywb.utils.bufferedreaders import DecompressingBufferedReader
|
||||||
|
|
||||||
|
@ -6,7 +6,7 @@ from pywb.utils.statusandheaders import StatusAndHeaders
|
|||||||
from pywb.utils.statusandheaders import StatusAndHeadersParser
|
from pywb.utils.statusandheaders import StatusAndHeadersParser
|
||||||
from pywb.utils.statusandheaders import StatusAndHeadersParserException
|
from pywb.utils.statusandheaders import StatusAndHeadersParserException
|
||||||
|
|
||||||
from pywb.utils.loaders import FileLoader, HttpLoader
|
from pywb.utils.loaders import BlockLoader
|
||||||
from pywb.utils.bufferedreaders import DecompressingBufferedReader
|
from pywb.utils.bufferedreaders import DecompressingBufferedReader
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
@ -32,23 +32,11 @@ class ArcWarcRecordLoader:
|
|||||||
ARC_HEADERS = ["uri", "ip-address", "creation-date",
|
ARC_HEADERS = ["uri", "ip-address", "creation-date",
|
||||||
"content-type", "length"]
|
"content-type", "length"]
|
||||||
|
|
||||||
@staticmethod
|
def __init__(self, loader=None, cookie_maker=None, block_size=8192):
|
||||||
def create_default_loaders(cookie_maker=None):
|
if not loader:
|
||||||
http = HttpLoader(cookie_maker)
|
loader = BlockLoader(cookie_maker)
|
||||||
file = FileLoader()
|
|
||||||
return {
|
|
||||||
'http': http,
|
|
||||||
'https': http,
|
|
||||||
'file': file,
|
|
||||||
'': file
|
|
||||||
}
|
|
||||||
|
|
||||||
def __init__(self, loaders={}, cookie_maker=None, block_size=8192):
|
|
||||||
self.loaders = loaders
|
|
||||||
|
|
||||||
if not self.loaders:
|
|
||||||
self.loaders = self.create_default_loaders(cookie_maker)
|
|
||||||
|
|
||||||
|
self.loader = loader
|
||||||
self.block_size = block_size
|
self.block_size = block_size
|
||||||
|
|
||||||
self.arc_parser = ARCHeadersParser(self.ARC_HEADERS)
|
self.arc_parser = ARCHeadersParser(self.ARC_HEADERS)
|
||||||
@ -60,16 +48,16 @@ class ArcWarcRecordLoader:
|
|||||||
def load(self, url, offset, length):
|
def load(self, url, offset, length):
|
||||||
url_parts = urlparse.urlsplit(url)
|
url_parts = urlparse.urlsplit(url)
|
||||||
|
|
||||||
loader = self.loaders.get(url_parts.scheme)
|
#loader = self.loaders.get(url_parts.scheme)
|
||||||
if not loader:
|
#if not loader:
|
||||||
raise ArchiveLoadFailed('Unknown Protocol', url)
|
# raise ArchiveLoadFailed('Unknown Protocol', url)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
length = int(length)
|
length = int(length)
|
||||||
except:
|
except:
|
||||||
length = -1
|
length = -1
|
||||||
|
|
||||||
raw = loader.load(url, long(offset), length)
|
raw = self.loader.load(url, long(offset), length)
|
||||||
|
|
||||||
decomp_type = 'gzip'
|
decomp_type = 'gzip'
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user