mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
refactor: split off BlockLoader support into BlockArcWarcRecordLoader, plain ArcWarcRecordLoader only includes parse_record_stream(), no load()
use BlockArcWarcRecordLoader() only when needed for replay
This commit is contained in:
parent
1213466afb
commit
b7285b1a77
33
pywb/warc/blockrecordloader.py
Normal file
33
pywb/warc/blockrecordloader.py
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
from pywb.utils.loaders import BlockLoader
|
||||||
|
from pywb.utils.bufferedreaders import DecompressingBufferedReader
|
||||||
|
from pywb.warc.recordloader import ArcWarcRecordLoader
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
class BlockArcWarcRecordLoader(ArcWarcRecordLoader):
    """ Record loader that fetches the record bytes itself before parsing.

    Extends ArcWarcRecordLoader with a load() method which reads a byte
    range through ``self.loader``, wraps it in a DecompressingBufferedReader
    and passes the result to parse_record_stream().
    """

    def __init__(self, loader=None, cookie_maker=None, block_size=8192, *args, **kwargs):
        """ Set up the block loader, then initialize the base record loader.

        :param loader: object providing ``load(url, offset, length)``;
            if falsy, a ``BlockLoader(cookie_maker=cookie_maker)`` is created
        :param cookie_maker: only passed to the default BlockLoader
        :param block_size: block size given to DecompressingBufferedReader
        :param args, kwargs: forwarded unchanged to ArcWarcRecordLoader
        """
        if not loader:
            loader = BlockLoader(cookie_maker=cookie_maker)

        self.loader = loader
        self.block_size = block_size
        super(BlockArcWarcRecordLoader, self).__init__(*args, **kwargs)

    def load(self, url, offset, length, no_record_parse=False):
        """ Load a single record from given url at offset with length
        and parse as either warc or arc record

        :param url: location passed through to ``self.loader.load()``
        :param offset: start offset of the record, converted with int()
        :param length: record length; if it can not be converted to an int,
            -1 is passed to the loader instead
        :param no_record_parse: forwarded to parse_record_stream()
        :return: whatever parse_record_stream() returns for the stream
        """
        try:
            length = int(length)
        except (TypeError, ValueError):
            # Only conversion failures (e.g. None or a non-numeric string)
            # fall back to -1; anything else should propagate.
            # NOTE(review): -1 presumably means "no length limit" to the
            # underlying loader -- confirm against BlockLoader.load().
            length = -1

        stream = self.loader.load(url, int(offset), length)

        # The stream is always wrapped as gzip here.
        decomp_type = 'gzip'

        # Create decompressing stream
        stream = DecompressingBufferedReader(stream=stream,
                                             decomp_type=decomp_type,
                                             block_size=self.block_size)

        return self.parse_record_stream(stream, no_record_parse=no_record_parse)
|
@ -4,15 +4,13 @@ from pywb.utils.statusandheaders import StatusAndHeaders
|
|||||||
from pywb.utils.statusandheaders import StatusAndHeadersParser
|
from pywb.utils.statusandheaders import StatusAndHeadersParser
|
||||||
from pywb.utils.statusandheaders import StatusAndHeadersParserException
|
from pywb.utils.statusandheaders import StatusAndHeadersParserException
|
||||||
|
|
||||||
from pywb.utils.loaders import BlockLoader, LimitReader
|
from pywb.utils.loaders import LimitReader
|
||||||
from pywb.utils.loaders import to_native_str
|
from pywb.utils.loaders import to_native_str
|
||||||
from pywb.utils.bufferedreaders import DecompressingBufferedReader
|
|
||||||
|
|
||||||
from pywb.utils.wbexception import WbException
|
from pywb.utils.wbexception import WbException
|
||||||
from pywb.utils.timeutils import timestamp_to_iso_date
|
from pywb.utils.timeutils import timestamp_to_iso_date
|
||||||
|
|
||||||
from six.moves import zip
|
from six.moves import zip
|
||||||
import six
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
@ -56,14 +54,7 @@ class ArcWarcRecordLoader(object):
|
|||||||
NON_HTTP_SCHEMES = ('dns:', 'whois:', 'ntp:')
|
NON_HTTP_SCHEMES = ('dns:', 'whois:', 'ntp:')
|
||||||
HTTP_SCHEMES = ('http:', 'https:')
|
HTTP_SCHEMES = ('http:', 'https:')
|
||||||
|
|
||||||
def __init__(self, loader=None, cookie_maker=None, block_size=8192,
|
def __init__(self, verify_http=True, arc2warc=True):
|
||||||
verify_http=True, arc2warc=True):
|
|
||||||
if not loader:
|
|
||||||
loader = BlockLoader(cookie_maker=cookie_maker)
|
|
||||||
|
|
||||||
self.loader = loader
|
|
||||||
self.block_size = block_size
|
|
||||||
|
|
||||||
if arc2warc:
|
if arc2warc:
|
||||||
self.arc_parser = ARC2WARCHeadersParser()
|
self.arc_parser = ARC2WARCHeadersParser()
|
||||||
else:
|
else:
|
||||||
@ -74,25 +65,6 @@ class ArcWarcRecordLoader(object):
|
|||||||
|
|
||||||
self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS, verify_http)
|
self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS, verify_http)
|
||||||
|
|
||||||
def load(self, url, offset, length, no_record_parse=False):
|
|
||||||
""" Load a single record from given url at offset with length
|
|
||||||
and parse as either warc or arc record
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
length = int(length)
|
|
||||||
except:
|
|
||||||
length = -1
|
|
||||||
|
|
||||||
stream = self.loader.load(url, int(offset), length)
|
|
||||||
decomp_type = 'gzip'
|
|
||||||
|
|
||||||
# Create decompressing stream
|
|
||||||
stream = DecompressingBufferedReader(stream=stream,
|
|
||||||
decomp_type=decomp_type,
|
|
||||||
block_size=self.block_size)
|
|
||||||
|
|
||||||
return self.parse_record_stream(stream, no_record_parse=no_record_parse)
|
|
||||||
|
|
||||||
def parse_record_stream(self, stream,
|
def parse_record_stream(self, stream,
|
||||||
statusline=None,
|
statusline=None,
|
||||||
known_format=None,
|
known_format=None,
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
from pywb.utils.timeutils import iso_date_to_timestamp
|
from pywb.utils.timeutils import iso_date_to_timestamp
|
||||||
from pywb.warc.recordloader import ArcWarcRecordLoader, ArchiveLoadFailed
|
from pywb.warc.blockrecordloader import BlockArcWarcRecordLoader
|
||||||
|
from pywb.warc.recordloader import ArchiveLoadFailed
|
||||||
from pywb.utils.wbexception import NotFoundException
|
from pywb.utils.wbexception import NotFoundException
|
||||||
|
|
||||||
import six
|
import six
|
||||||
@ -9,7 +10,7 @@ import six
|
|||||||
class ResolvingLoader(object):
|
class ResolvingLoader(object):
|
||||||
MISSING_REVISIT_MSG = 'Original for revisit record could not be loaded'
|
MISSING_REVISIT_MSG = 'Original for revisit record could not be loaded'
|
||||||
|
|
||||||
def __init__(self, path_resolvers, record_loader=ArcWarcRecordLoader(), no_record_parse=False):
|
def __init__(self, path_resolvers, record_loader=BlockArcWarcRecordLoader(), no_record_parse=False):
|
||||||
self.path_resolvers = path_resolvers
|
self.path_resolvers = path_resolvers
|
||||||
self.record_loader = record_loader
|
self.record_loader = record_loader
|
||||||
self.no_record_parse = no_record_parse
|
self.no_record_parse = no_record_parse
|
||||||
|
@ -293,6 +293,7 @@ import pprint
|
|||||||
import six
|
import six
|
||||||
|
|
||||||
from pywb.warc.recordloader import ArcWarcRecordLoader, ArchiveLoadFailed
|
from pywb.warc.recordloader import ArcWarcRecordLoader, ArchiveLoadFailed
|
||||||
|
from pywb.warc.blockrecordloader import BlockArcWarcRecordLoader
|
||||||
from pywb.warc.resolvingloader import ResolvingLoader
|
from pywb.warc.resolvingloader import ResolvingLoader
|
||||||
from pywb.warc.pathresolvers import PathResolverMapper
|
from pywb.warc.pathresolvers import PathResolverMapper
|
||||||
from pywb.cdx.cdxobject import CDXObject
|
from pywb.cdx.cdxobject import CDXObject
|
||||||
@ -326,7 +327,7 @@ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - \
|
|||||||
def load_test_archive(test_file, offset, length):
|
def load_test_archive(test_file, offset, length):
|
||||||
path = test_warc_dir + test_file
|
path = test_warc_dir + test_file
|
||||||
|
|
||||||
testloader = ArcWarcRecordLoader()
|
testloader = BlockArcWarcRecordLoader()
|
||||||
|
|
||||||
archive = testloader.load(path, offset, length)
|
archive = testloader.load(path, offset, length)
|
||||||
|
|
||||||
|
@ -12,7 +12,7 @@ from pywb.utils.statusandheaders import StatusAndHeaders
|
|||||||
from pywb.framework.basehandlers import BaseHandler, WbUrlHandler
|
from pywb.framework.basehandlers import BaseHandler, WbUrlHandler
|
||||||
from pywb.framework.wbrequestresponse import WbResponse
|
from pywb.framework.wbrequestresponse import WbResponse
|
||||||
|
|
||||||
from pywb.warc.recordloader import ArcWarcRecordLoader
|
from pywb.warc.blockrecordloader import BlockArcWarcRecordLoader
|
||||||
from pywb.warc.resolvingloader import ResolvingLoader
|
from pywb.warc.resolvingloader import ResolvingLoader
|
||||||
from pywb.warc.pathresolvers import PathResolverMapper
|
from pywb.warc.pathresolvers import PathResolverMapper
|
||||||
|
|
||||||
@ -134,7 +134,7 @@ class WBHandler(SearchPageWbUrlHandler):
|
|||||||
|
|
||||||
def _init_replay_view(self, config):
|
def _init_replay_view(self, config):
|
||||||
cookie_maker = config.get('cookie_maker')
|
cookie_maker = config.get('cookie_maker')
|
||||||
record_loader = ArcWarcRecordLoader(cookie_maker=cookie_maker)
|
record_loader = BlockArcWarcRecordLoader(cookie_maker=cookie_maker)
|
||||||
|
|
||||||
paths = config.get('archive_paths')
|
paths = config.get('archive_paths')
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user