1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

refactor: split off BlockLoader support into BlockArcWarcRecordLoader, plain ArcWarcRecordLoader only includes parse_record_stream(), no load()

use BlockArcWarcRecordLoader() only when needed for replay
This commit is contained in:
Ilya Kreymer 2017-03-01 14:57:44 -08:00
parent 1213466afb
commit b7285b1a77
5 changed files with 42 additions and 35 deletions

View File

@ -0,0 +1,33 @@
from pywb.utils.loaders import BlockLoader
from pywb.utils.bufferedreaders import DecompressingBufferedReader
from pywb.warc.recordloader import ArcWarcRecordLoader
#=================================================================
class BlockArcWarcRecordLoader(ArcWarcRecordLoader):
    """ Record loader that can also fetch the record bytes itself.

    Adds load() on top of the inherited parse_record_stream(): the raw
    record is read through a block loader, wrapped in a decompressing
    buffered reader and then handed to the parser.
    """

    def __init__(self, loader=None, cookie_maker=None, block_size=8192, *args, **kwargs):
        """ Set up the block loader and forward the rest to the base class

        :param loader: object whose load(url, offset, length) returns a
            stream; when falsy, a BlockLoader built with cookie_maker is used
        :param cookie_maker: passed to BlockLoader, only used when no
            loader is supplied
        :param block_size: block size given to DecompressingBufferedReader
        :param args: extra positional arguments for ArcWarcRecordLoader
        :param kwargs: extra keyword arguments for ArcWarcRecordLoader
        """
        if not loader:
            loader = BlockLoader(cookie_maker=cookie_maker)

        self.loader = loader
        self.block_size = block_size

        super(BlockArcWarcRecordLoader, self).__init__(*args, **kwargs)

    def load(self, url, offset, length, no_record_parse=False):
        """ Load a single record from given url at offset with length
        and parse as either warc or arc record

        :param url: location passed as-is to the underlying loader
        :param offset: start position of the record, converted with int()
        :param length: record length; any value int() can not convert
            is replaced by -1
        :param no_record_parse: forwarded to parse_record_stream()
        :return: whatever parse_record_stream() returns for the stream
        """
        try:
            length = int(length)
        except (TypeError, ValueError, OverflowError):
            # Only conversion failures fall back to -1; a bare except here
            # would also swallow KeyboardInterrupt and SystemExit.
            # NOTE(review): -1 presumably means "no length limit" to the
            # loader -- confirm against BlockLoader.load()
            length = -1

        stream = self.loader.load(url, int(offset), length)

        # Create decompressing stream
        stream = DecompressingBufferedReader(stream=stream,
                                             decomp_type='gzip',
                                             block_size=self.block_size)

        return self.parse_record_stream(stream, no_record_parse=no_record_parse)

View File

@ -4,15 +4,13 @@ from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.statusandheaders import StatusAndHeadersParser from pywb.utils.statusandheaders import StatusAndHeadersParser
from pywb.utils.statusandheaders import StatusAndHeadersParserException from pywb.utils.statusandheaders import StatusAndHeadersParserException
from pywb.utils.loaders import BlockLoader, LimitReader from pywb.utils.loaders import LimitReader
from pywb.utils.loaders import to_native_str from pywb.utils.loaders import to_native_str
from pywb.utils.bufferedreaders import DecompressingBufferedReader
from pywb.utils.wbexception import WbException from pywb.utils.wbexception import WbException
from pywb.utils.timeutils import timestamp_to_iso_date from pywb.utils.timeutils import timestamp_to_iso_date
from six.moves import zip from six.moves import zip
import six
#================================================================= #=================================================================
@ -56,14 +54,7 @@ class ArcWarcRecordLoader(object):
NON_HTTP_SCHEMES = ('dns:', 'whois:', 'ntp:') NON_HTTP_SCHEMES = ('dns:', 'whois:', 'ntp:')
HTTP_SCHEMES = ('http:', 'https:') HTTP_SCHEMES = ('http:', 'https:')
def __init__(self, loader=None, cookie_maker=None, block_size=8192, def __init__(self, verify_http=True, arc2warc=True):
verify_http=True, arc2warc=True):
if not loader:
loader = BlockLoader(cookie_maker=cookie_maker)
self.loader = loader
self.block_size = block_size
if arc2warc: if arc2warc:
self.arc_parser = ARC2WARCHeadersParser() self.arc_parser = ARC2WARCHeadersParser()
else: else:
@ -74,25 +65,6 @@ class ArcWarcRecordLoader(object):
self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS, verify_http) self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS, verify_http)
def load(self, url, offset, length, no_record_parse=False):
""" Load a single record from given url at offset with length
and parse as either warc or arc record
"""
try:
length = int(length)
except:
length = -1
stream = self.loader.load(url, int(offset), length)
decomp_type = 'gzip'
# Create decompressing stream
stream = DecompressingBufferedReader(stream=stream,
decomp_type=decomp_type,
block_size=self.block_size)
return self.parse_record_stream(stream, no_record_parse=no_record_parse)
def parse_record_stream(self, stream, def parse_record_stream(self, stream,
statusline=None, statusline=None,
known_format=None, known_format=None,

View File

@ -1,5 +1,6 @@
from pywb.utils.timeutils import iso_date_to_timestamp from pywb.utils.timeutils import iso_date_to_timestamp
from pywb.warc.recordloader import ArcWarcRecordLoader, ArchiveLoadFailed from pywb.warc.blockrecordloader import BlockArcWarcRecordLoader
from pywb.warc.recordloader import ArchiveLoadFailed
from pywb.utils.wbexception import NotFoundException from pywb.utils.wbexception import NotFoundException
import six import six
@ -9,7 +10,7 @@ import six
class ResolvingLoader(object): class ResolvingLoader(object):
MISSING_REVISIT_MSG = 'Original for revisit record could not be loaded' MISSING_REVISIT_MSG = 'Original for revisit record could not be loaded'
def __init__(self, path_resolvers, record_loader=ArcWarcRecordLoader(), no_record_parse=False): def __init__(self, path_resolvers, record_loader=BlockArcWarcRecordLoader(), no_record_parse=False):
self.path_resolvers = path_resolvers self.path_resolvers = path_resolvers
self.record_loader = record_loader self.record_loader = record_loader
self.no_record_parse = no_record_parse self.no_record_parse = no_record_parse

View File

@ -293,6 +293,7 @@ import pprint
import six import six
from pywb.warc.recordloader import ArcWarcRecordLoader, ArchiveLoadFailed from pywb.warc.recordloader import ArcWarcRecordLoader, ArchiveLoadFailed
from pywb.warc.blockrecordloader import BlockArcWarcRecordLoader
from pywb.warc.resolvingloader import ResolvingLoader from pywb.warc.resolvingloader import ResolvingLoader
from pywb.warc.pathresolvers import PathResolverMapper from pywb.warc.pathresolvers import PathResolverMapper
from pywb.cdx.cdxobject import CDXObject from pywb.cdx.cdxobject import CDXObject
@ -326,7 +327,7 @@ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - \
def load_test_archive(test_file, offset, length): def load_test_archive(test_file, offset, length):
path = test_warc_dir + test_file path = test_warc_dir + test_file
testloader = ArcWarcRecordLoader() testloader = BlockArcWarcRecordLoader()
archive = testloader.load(path, offset, length) archive = testloader.load(path, offset, length)

View File

@ -12,7 +12,7 @@ from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.framework.basehandlers import BaseHandler, WbUrlHandler from pywb.framework.basehandlers import BaseHandler, WbUrlHandler
from pywb.framework.wbrequestresponse import WbResponse from pywb.framework.wbrequestresponse import WbResponse
from pywb.warc.recordloader import ArcWarcRecordLoader from pywb.warc.blockrecordloader import BlockArcWarcRecordLoader
from pywb.warc.resolvingloader import ResolvingLoader from pywb.warc.resolvingloader import ResolvingLoader
from pywb.warc.pathresolvers import PathResolverMapper from pywb.warc.pathresolvers import PathResolverMapper
@ -134,7 +134,7 @@ class WBHandler(SearchPageWbUrlHandler):
def _init_replay_view(self, config): def _init_replay_view(self, config):
cookie_maker = config.get('cookie_maker') cookie_maker = config.get('cookie_maker')
record_loader = ArcWarcRecordLoader(cookie_maker=cookie_maker) record_loader = BlockArcWarcRecordLoader(cookie_maker=cookie_maker)
paths = config.get('archive_paths') paths = config.get('archive_paths')