mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-16 00:24:48 +01:00
refactor: split BlockLoader support off into BlockArcWarcRecordLoader; the plain ArcWarcRecordLoader now provides only parse_record_stream() and no longer has load()
Use BlockArcWarcRecordLoader() only where it is needed for replay
This commit is contained in:
parent
1213466afb
commit
b7285b1a77
33
pywb/warc/blockrecordloader.py
Normal file
33
pywb/warc/blockrecordloader.py
Normal file
@ -0,0 +1,33 @@
|
||||
from pywb.utils.loaders import BlockLoader
|
||||
from pywb.utils.bufferedreaders import DecompressingBufferedReader
|
||||
from pywb.warc.recordloader import ArcWarcRecordLoader
|
||||
|
||||
|
||||
#=================================================================
|
||||
class BlockArcWarcRecordLoader(ArcWarcRecordLoader):
    """ ArcWarcRecordLoader that can also fetch the record data itself.

    Adds load() on top of the inherited parse_record_stream(): the raw
    bytes are obtained through ``self.loader`` and wrapped in a
    DecompressingBufferedReader before being handed to the parser.
    """

    def __init__(self, loader=None, cookie_maker=None, block_size=8192, *args, **kwargs):
        """ Set up the block loader and forward remaining args to the base class.

        :param loader: object exposing ``load(url, offset, length)``; when
            falsy, a ``BlockLoader(cookie_maker=cookie_maker)`` is created.
        :param cookie_maker: only used when the default BlockLoader is built.
        :param block_size: block size passed to DecompressingBufferedReader
            for every stream opened by load().
        :param args: extra positional args for ArcWarcRecordLoader.__init__
        :param kwargs: extra keyword args for ArcWarcRecordLoader.__init__
        """
        if not loader:
            loader = BlockLoader(cookie_maker=cookie_maker)

        self.loader = loader
        self.block_size = block_size
        super(BlockArcWarcRecordLoader, self).__init__(*args, **kwargs)

    def load(self, url, offset, length, no_record_parse=False):
        """ Load a single record from given url at offset with length
        and parse as either warc or arc record

        :param url: location passed through unchanged to ``self.loader.load``
        :param offset: start position of the record; must be convertible
            with ``int()`` (a failure here propagates to the caller).
        :param length: record length; any value that ``int()`` rejects is
            replaced by -1.
            NOTE(review): -1 presumably means "no length limit" to the
            underlying loader -- confirm against BlockLoader.
        :param no_record_parse: forwarded to parse_record_stream()
        :return: whatever parse_record_stream() returns for the stream
        """
        try:
            length = int(length)
        except (TypeError, ValueError):
            # Only conversion failures fall back to -1; a bare ``except:``
            # would also hide KeyboardInterrupt / SystemExit.
            length = -1

        stream = self.loader.load(url, int(offset), length)
        decomp_type = 'gzip'

        # Create decompressing stream
        stream = DecompressingBufferedReader(stream=stream,
                                             decomp_type=decomp_type,
                                             block_size=self.block_size)

        return self.parse_record_stream(stream, no_record_parse=no_record_parse)
|
@ -4,15 +4,13 @@ from pywb.utils.statusandheaders import StatusAndHeaders
|
||||
from pywb.utils.statusandheaders import StatusAndHeadersParser
|
||||
from pywb.utils.statusandheaders import StatusAndHeadersParserException
|
||||
|
||||
from pywb.utils.loaders import BlockLoader, LimitReader
|
||||
from pywb.utils.loaders import LimitReader
|
||||
from pywb.utils.loaders import to_native_str
|
||||
from pywb.utils.bufferedreaders import DecompressingBufferedReader
|
||||
|
||||
from pywb.utils.wbexception import WbException
|
||||
from pywb.utils.timeutils import timestamp_to_iso_date
|
||||
|
||||
from six.moves import zip
|
||||
import six
|
||||
|
||||
|
||||
#=================================================================
|
||||
@ -56,14 +54,7 @@ class ArcWarcRecordLoader(object):
|
||||
NON_HTTP_SCHEMES = ('dns:', 'whois:', 'ntp:')
|
||||
HTTP_SCHEMES = ('http:', 'https:')
|
||||
|
||||
def __init__(self, loader=None, cookie_maker=None, block_size=8192,
|
||||
verify_http=True, arc2warc=True):
|
||||
if not loader:
|
||||
loader = BlockLoader(cookie_maker=cookie_maker)
|
||||
|
||||
self.loader = loader
|
||||
self.block_size = block_size
|
||||
|
||||
def __init__(self, verify_http=True, arc2warc=True):
|
||||
if arc2warc:
|
||||
self.arc_parser = ARC2WARCHeadersParser()
|
||||
else:
|
||||
@ -74,25 +65,6 @@ class ArcWarcRecordLoader(object):
|
||||
|
||||
self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS, verify_http)
|
||||
|
||||
def load(self, url, offset, length, no_record_parse=False):
|
||||
""" Load a single record from given url at offset with length
|
||||
and parse as either warc or arc record
|
||||
"""
|
||||
try:
|
||||
length = int(length)
|
||||
except:
|
||||
length = -1
|
||||
|
||||
stream = self.loader.load(url, int(offset), length)
|
||||
decomp_type = 'gzip'
|
||||
|
||||
# Create decompressing stream
|
||||
stream = DecompressingBufferedReader(stream=stream,
|
||||
decomp_type=decomp_type,
|
||||
block_size=self.block_size)
|
||||
|
||||
return self.parse_record_stream(stream, no_record_parse=no_record_parse)
|
||||
|
||||
def parse_record_stream(self, stream,
|
||||
statusline=None,
|
||||
known_format=None,
|
||||
|
@ -1,5 +1,6 @@
|
||||
from pywb.utils.timeutils import iso_date_to_timestamp
|
||||
from pywb.warc.recordloader import ArcWarcRecordLoader, ArchiveLoadFailed
|
||||
from pywb.warc.blockrecordloader import BlockArcWarcRecordLoader
|
||||
from pywb.warc.recordloader import ArchiveLoadFailed
|
||||
from pywb.utils.wbexception import NotFoundException
|
||||
|
||||
import six
|
||||
@ -9,7 +10,7 @@ import six
|
||||
class ResolvingLoader(object):
|
||||
MISSING_REVISIT_MSG = 'Original for revisit record could not be loaded'
|
||||
|
||||
def __init__(self, path_resolvers, record_loader=ArcWarcRecordLoader(), no_record_parse=False):
|
||||
def __init__(self, path_resolvers, record_loader=BlockArcWarcRecordLoader(), no_record_parse=False):
|
||||
self.path_resolvers = path_resolvers
|
||||
self.record_loader = record_loader
|
||||
self.no_record_parse = no_record_parse
|
||||
|
@ -293,6 +293,7 @@ import pprint
|
||||
import six
|
||||
|
||||
from pywb.warc.recordloader import ArcWarcRecordLoader, ArchiveLoadFailed
|
||||
from pywb.warc.blockrecordloader import BlockArcWarcRecordLoader
|
||||
from pywb.warc.resolvingloader import ResolvingLoader
|
||||
from pywb.warc.pathresolvers import PathResolverMapper
|
||||
from pywb.cdx.cdxobject import CDXObject
|
||||
@ -326,7 +327,7 @@ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - \
|
||||
def load_test_archive(test_file, offset, length):
|
||||
path = test_warc_dir + test_file
|
||||
|
||||
testloader = ArcWarcRecordLoader()
|
||||
testloader = BlockArcWarcRecordLoader()
|
||||
|
||||
archive = testloader.load(path, offset, length)
|
||||
|
||||
|
@ -12,7 +12,7 @@ from pywb.utils.statusandheaders import StatusAndHeaders
|
||||
from pywb.framework.basehandlers import BaseHandler, WbUrlHandler
|
||||
from pywb.framework.wbrequestresponse import WbResponse
|
||||
|
||||
from pywb.warc.recordloader import ArcWarcRecordLoader
|
||||
from pywb.warc.blockrecordloader import BlockArcWarcRecordLoader
|
||||
from pywb.warc.resolvingloader import ResolvingLoader
|
||||
from pywb.warc.pathresolvers import PathResolverMapper
|
||||
|
||||
@ -134,7 +134,7 @@ class WBHandler(SearchPageWbUrlHandler):
|
||||
|
||||
def _init_replay_view(self, config):
|
||||
cookie_maker = config.get('cookie_maker')
|
||||
record_loader = ArcWarcRecordLoader(cookie_maker=cookie_maker)
|
||||
record_loader = BlockArcWarcRecordLoader(cookie_maker=cookie_maker)
|
||||
|
||||
paths = config.get('archive_paths')
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user