1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-16 00:24:48 +01:00

refactor: split BlockLoader support off into BlockArcWarcRecordLoader; the plain ArcWarcRecordLoader now only provides parse_record_stream() and no longer has load()

use BlockArcWarcRecordLoader() only when needed for replay
This commit is contained in:
Ilya Kreymer 2017-03-01 14:57:44 -08:00
parent 1213466afb
commit b7285b1a77
5 changed files with 42 additions and 35 deletions

@ -0,0 +1,33 @@
from pywb.utils.loaders import BlockLoader
from pywb.utils.bufferedreaders import DecompressingBufferedReader
from pywb.warc.recordloader import ArcWarcRecordLoader
#=================================================================
class BlockArcWarcRecordLoader(ArcWarcRecordLoader):
    """ Record loader that fetches a record's bytes through a block loader
    before handing them to the inherited parse_record_stream().
    """

    def __init__(self, loader=None, cookie_maker=None, block_size=8192, *args, **kwargs):
        """ Set up the block loader and read block size.

        :param loader: object providing load(url, offset, length); when
            falsy, a BlockLoader(cookie_maker=cookie_maker) is created
        :param cookie_maker: passed to the default BlockLoader only; it is
            not used when an explicit loader is supplied
        :param block_size: block size handed to DecompressingBufferedReader
        :param args, kwargs: forwarded unchanged to ArcWarcRecordLoader
        """
        if not loader:
            loader = BlockLoader(cookie_maker=cookie_maker)

        self.loader = loader
        self.block_size = block_size

        super(BlockArcWarcRecordLoader, self).__init__(*args, **kwargs)

    def load(self, url, offset, length, no_record_parse=False):
        """ Load a single record from given url at offset with length
        and parse as either warc or arc record

        :param url: location passed through to the underlying loader
        :param offset: record offset; must be convertible with int()
        :param length: record length; if it can not be converted with
            int() (e.g. None or a non-numeric string), -1 is used instead
        :param no_record_parse: forwarded to parse_record_stream()
        :return: whatever parse_record_stream() returns for the stream
        """
        try:
            length = int(length)
        except (TypeError, ValueError, OverflowError):
            # Only conversion failures fall back to -1; a bare except here
            # would also swallow KeyboardInterrupt / SystemExit.
            # NOTE(review): -1 presumably means "length unknown" to the
            # loader -- confirm against BlockLoader.load().
            length = -1

        stream = self.loader.load(url, int(offset), length)
        decomp_type = 'gzip'

        # Create decompressing stream
        stream = DecompressingBufferedReader(stream=stream,
                                             decomp_type=decomp_type,
                                             block_size=self.block_size)

        return self.parse_record_stream(stream, no_record_parse=no_record_parse)

@ -4,15 +4,13 @@ from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.statusandheaders import StatusAndHeadersParser
from pywb.utils.statusandheaders import StatusAndHeadersParserException
from pywb.utils.loaders import BlockLoader, LimitReader
from pywb.utils.loaders import LimitReader
from pywb.utils.loaders import to_native_str
from pywb.utils.bufferedreaders import DecompressingBufferedReader
from pywb.utils.wbexception import WbException
from pywb.utils.timeutils import timestamp_to_iso_date
from six.moves import zip
import six
#=================================================================
@ -56,14 +54,7 @@ class ArcWarcRecordLoader(object):
NON_HTTP_SCHEMES = ('dns:', 'whois:', 'ntp:')
HTTP_SCHEMES = ('http:', 'https:')
def __init__(self, loader=None, cookie_maker=None, block_size=8192,
verify_http=True, arc2warc=True):
if not loader:
loader = BlockLoader(cookie_maker=cookie_maker)
self.loader = loader
self.block_size = block_size
def __init__(self, verify_http=True, arc2warc=True):
if arc2warc:
self.arc_parser = ARC2WARCHeadersParser()
else:
@ -74,25 +65,6 @@ class ArcWarcRecordLoader(object):
self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS, verify_http)
def load(self, url, offset, length, no_record_parse=False):
""" Load a single record from given url at offset with length
and parse as either warc or arc record
"""
try:
length = int(length)
except:
length = -1
stream = self.loader.load(url, int(offset), length)
decomp_type = 'gzip'
# Create decompressing stream
stream = DecompressingBufferedReader(stream=stream,
decomp_type=decomp_type,
block_size=self.block_size)
return self.parse_record_stream(stream, no_record_parse=no_record_parse)
def parse_record_stream(self, stream,
statusline=None,
known_format=None,

@ -1,5 +1,6 @@
from pywb.utils.timeutils import iso_date_to_timestamp
from pywb.warc.recordloader import ArcWarcRecordLoader, ArchiveLoadFailed
from pywb.warc.blockrecordloader import BlockArcWarcRecordLoader
from pywb.warc.recordloader import ArchiveLoadFailed
from pywb.utils.wbexception import NotFoundException
import six
@ -9,7 +10,7 @@ import six
class ResolvingLoader(object):
MISSING_REVISIT_MSG = 'Original for revisit record could not be loaded'
def __init__(self, path_resolvers, record_loader=ArcWarcRecordLoader(), no_record_parse=False):
def __init__(self, path_resolvers, record_loader=BlockArcWarcRecordLoader(), no_record_parse=False):
self.path_resolvers = path_resolvers
self.record_loader = record_loader
self.no_record_parse = no_record_parse

@ -293,6 +293,7 @@ import pprint
import six
from pywb.warc.recordloader import ArcWarcRecordLoader, ArchiveLoadFailed
from pywb.warc.blockrecordloader import BlockArcWarcRecordLoader
from pywb.warc.resolvingloader import ResolvingLoader
from pywb.warc.pathresolvers import PathResolverMapper
from pywb.cdx.cdxobject import CDXObject
@ -326,7 +327,7 @@ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - \
def load_test_archive(test_file, offset, length):
path = test_warc_dir + test_file
testloader = ArcWarcRecordLoader()
testloader = BlockArcWarcRecordLoader()
archive = testloader.load(path, offset, length)

@ -12,7 +12,7 @@ from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.framework.basehandlers import BaseHandler, WbUrlHandler
from pywb.framework.wbrequestresponse import WbResponse
from pywb.warc.recordloader import ArcWarcRecordLoader
from pywb.warc.blockrecordloader import BlockArcWarcRecordLoader
from pywb.warc.resolvingloader import ResolvingLoader
from pywb.warc.pathresolvers import PathResolverMapper
@ -134,7 +134,7 @@ class WBHandler(SearchPageWbUrlHandler):
def _init_replay_view(self, config):
cookie_maker = config.get('cookie_maker')
record_loader = ArcWarcRecordLoader(cookie_maker=cookie_maker)
record_loader = BlockArcWarcRecordLoader(cookie_maker=cookie_maker)
paths = config.get('archive_paths')