From b7285b1a77cf6982465a756e39b2d1041b92b876 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Wed, 1 Mar 2017 14:57:44 -0800
Subject: [PATCH] refactor: split off BlockLoader support into
 BlockArcWarcRecordLoader, plain ArcWarcRecordLoader only includes
 parse_record_stream(), no load() use BlockArcWarcRecordLoader() only when
 needed for replay

---
 pywb/warc/blockrecordloader.py | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 pywb/warc/recordloader.py      | 32 ++------------------------------
 pywb/warc/resolvingloader.py   |  5 +++--
 pywb/warc/test/test_loading.py |  3 ++-
 pywb/webapp/handlers.py        |  4 ++--
 5 files changed, 56 insertions(+), 35 deletions(-)
 create mode 100644 pywb/warc/blockrecordloader.py

diff --git a/pywb/warc/blockrecordloader.py b/pywb/warc/blockrecordloader.py
new file mode 100644
index 00000000..fea50c31
--- /dev/null
+++ b/pywb/warc/blockrecordloader.py
@@ -0,0 +1,47 @@
+from pywb.utils.loaders import BlockLoader
+from pywb.utils.bufferedreaders import DecompressingBufferedReader
+from pywb.warc.recordloader import ArcWarcRecordLoader
+
+
+#=================================================================
+class BlockArcWarcRecordLoader(ArcWarcRecordLoader):
+    """ ArcWarcRecordLoader which also fetches the record data:
+    load() reads a byte range via the block loader, wraps it in a
+    decompressing reader and passes it to parse_record_stream()
+    """
+    def __init__(self, loader=None, cookie_maker=None, block_size=8192, *args, **kwargs):
+        """ loader: object providing load(url, offset, length), if not set
+        a BlockLoader(cookie_maker=cookie_maker) is created
+        block_size: block size given to the decompressing reader
+        *args, **kwargs: passed on to ArcWarcRecordLoader.__init__()
+        """
+        if not loader:
+            loader = BlockLoader(cookie_maker=cookie_maker)
+
+        self.loader = loader
+        self.block_size = block_size
+        super(BlockArcWarcRecordLoader, self).__init__(*args, **kwargs)
+
+    def load(self, url, offset, length, no_record_parse=False):
+        """ Load a single record from given url at offset with length
+        and parse as either warc or arc record
+
+        If length can not be converted to an int, -1 is passed
+        to the loader instead
+        """
+        try:
+            length = int(length)
+        except (TypeError, ValueError):
+            # a missing or non-numeric length raises one of these; a bare
+            # except would also hide KeyboardInterrupt and SystemExit
+            length = -1
+
+        stream = self.loader.load(url, int(offset), length)
+        decomp_type = 'gzip'
+
+        # Create decompressing stream
+        stream = DecompressingBufferedReader(stream=stream,
+                                             decomp_type=decomp_type,
+                                             block_size=self.block_size)
+
+        return self.parse_record_stream(stream, no_record_parse=no_record_parse)
diff --git
a/pywb/warc/recordloader.py b/pywb/warc/recordloader.py index 751f3787..3becb294 100644 --- a/pywb/warc/recordloader.py +++ b/pywb/warc/recordloader.py @@ -4,15 +4,13 @@ from pywb.utils.statusandheaders import StatusAndHeaders from pywb.utils.statusandheaders import StatusAndHeadersParser from pywb.utils.statusandheaders import StatusAndHeadersParserException -from pywb.utils.loaders import BlockLoader, LimitReader +from pywb.utils.loaders import LimitReader from pywb.utils.loaders import to_native_str -from pywb.utils.bufferedreaders import DecompressingBufferedReader from pywb.utils.wbexception import WbException from pywb.utils.timeutils import timestamp_to_iso_date from six.moves import zip -import six #================================================================= @@ -56,14 +54,7 @@ class ArcWarcRecordLoader(object): NON_HTTP_SCHEMES = ('dns:', 'whois:', 'ntp:') HTTP_SCHEMES = ('http:', 'https:') - def __init__(self, loader=None, cookie_maker=None, block_size=8192, - verify_http=True, arc2warc=True): - if not loader: - loader = BlockLoader(cookie_maker=cookie_maker) - - self.loader = loader - self.block_size = block_size - + def __init__(self, verify_http=True, arc2warc=True): if arc2warc: self.arc_parser = ARC2WARCHeadersParser() else: @@ -74,25 +65,6 @@ class ArcWarcRecordLoader(object): self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS, verify_http) - def load(self, url, offset, length, no_record_parse=False): - """ Load a single record from given url at offset with length - and parse as either warc or arc record - """ - try: - length = int(length) - except: - length = -1 - - stream = self.loader.load(url, int(offset), length) - decomp_type = 'gzip' - - # Create decompressing stream - stream = DecompressingBufferedReader(stream=stream, - decomp_type=decomp_type, - block_size=self.block_size) - - return self.parse_record_stream(stream, no_record_parse=no_record_parse) - def parse_record_stream(self, stream, statusline=None, known_format=None, 
diff --git a/pywb/warc/resolvingloader.py b/pywb/warc/resolvingloader.py index ed0f8049..5b74a9c7 100644 --- a/pywb/warc/resolvingloader.py +++ b/pywb/warc/resolvingloader.py @@ -1,5 +1,6 @@ from pywb.utils.timeutils import iso_date_to_timestamp -from pywb.warc.recordloader import ArcWarcRecordLoader, ArchiveLoadFailed +from pywb.warc.blockrecordloader import BlockArcWarcRecordLoader +from pywb.warc.recordloader import ArchiveLoadFailed from pywb.utils.wbexception import NotFoundException import six @@ -9,7 +10,7 @@ import six class ResolvingLoader(object): MISSING_REVISIT_MSG = 'Original for revisit record could not be loaded' - def __init__(self, path_resolvers, record_loader=ArcWarcRecordLoader(), no_record_parse=False): + def __init__(self, path_resolvers, record_loader=BlockArcWarcRecordLoader(), no_record_parse=False): self.path_resolvers = path_resolvers self.record_loader = record_loader self.no_record_parse = no_record_parse diff --git a/pywb/warc/test/test_loading.py b/pywb/warc/test/test_loading.py index ba3f669c..bd629a4a 100644 --- a/pywb/warc/test/test_loading.py +++ b/pywb/warc/test/test_loading.py @@ -293,6 +293,7 @@ import pprint import six from pywb.warc.recordloader import ArcWarcRecordLoader, ArchiveLoadFailed +from pywb.warc.blockrecordloader import BlockArcWarcRecordLoader from pywb.warc.resolvingloader import ResolvingLoader from pywb.warc.pathresolvers import PathResolverMapper from pywb.cdx.cdxobject import CDXObject @@ -326,7 +327,7 @@ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - \ def load_test_archive(test_file, offset, length): path = test_warc_dir + test_file - testloader = ArcWarcRecordLoader() + testloader = BlockArcWarcRecordLoader() archive = testloader.load(path, offset, length) diff --git a/pywb/webapp/handlers.py b/pywb/webapp/handlers.py index b7411ccb..2317d5db 100644 --- a/pywb/webapp/handlers.py +++ b/pywb/webapp/handlers.py @@ -12,7 +12,7 @@ from pywb.utils.statusandheaders import StatusAndHeaders from 
pywb.framework.basehandlers import BaseHandler, WbUrlHandler from pywb.framework.wbrequestresponse import WbResponse -from pywb.warc.recordloader import ArcWarcRecordLoader +from pywb.warc.blockrecordloader import BlockArcWarcRecordLoader from pywb.warc.resolvingloader import ResolvingLoader from pywb.warc.pathresolvers import PathResolverMapper @@ -134,7 +134,7 @@ class WBHandler(SearchPageWbUrlHandler): def _init_replay_view(self, config): cookie_maker = config.get('cookie_maker') - record_loader = ArcWarcRecordLoader(cookie_maker=cookie_maker) + record_loader = BlockArcWarcRecordLoader(cookie_maker=cookie_maker) paths = config.get('archive_paths')