diff --git a/pywb/warc/recordloader.py b/pywb/warc/recordloader.py index 5fe5f64d..f1302458 100644 --- a/pywb/warc/recordloader.py +++ b/pywb/warc/recordloader.py @@ -13,8 +13,8 @@ from pywb.utils.wbexception import WbException #================================================================= -ArcWarcRecord = collections.namedtuple('ArchiveRecord', - 'type, rec_headers, ' + +ArcWarcRecord = collections.namedtuple('ArcWarcRecord', + 'format, rec_type, rec_headers, ' + 'stream, status_headers') @@ -32,7 +32,7 @@ class ArchiveLoadFailed(WbException): #================================================================= class ArcWarcRecordLoader: # Standard ARC headers - ARC_HEADERS = ["uri", "ip-address", "creation-date", + ARC_HEADERS = ["uri", "ip-address", "archive-date", "content-type", "length"] def __init__(self, loader=None, cookie_maker=None, block_size=8192): @@ -49,36 +49,38 @@ class ArcWarcRecordLoader: self.http_parser = StatusAndHeadersParser(['HTTP/1.0', 'HTTP/1.1']) def load(self, url, offset, length): - url_parts = urlparse.urlsplit(url) - - #loader = self.loaders.get(url_parts.scheme) - #if not loader: - # raise ArchiveLoadFailed('Unknown Protocol', url) - try: length = int(length) except: length = -1 - raw = self.loader.load(url, long(offset), length) - + stream = self.loader.load(url, long(offset), length) decomp_type = 'gzip' # Create decompressing stream - stream = DecompressingBufferedReader(stream=raw, + stream = DecompressingBufferedReader(stream=stream, decomp_type=decomp_type, block_size=self.block_size) + return self.parse_record_stream(stream) + + def parse_record_stream(self, stream): (the_format, rec_headers) = self._detect_type_load_headers(stream) if the_format == 'arc': - rec_type = 'response' - length = int(rec_headers.get_header('length')) - + if rec_headers.get_header('uri').startswith('filedesc://'): + rec_type = 'arc_header' + length = 0 + else: + rec_type = 'response' + length = int(rec_headers.get_header('length')) elif the_format == 'warc': rec_type = rec_headers.get_header('WARC-Type') length = int(rec_headers.get_header('Content-Length')) + # ================================================================ + # handle different types of records + # special case: empty w/arc record (hopefully a revisit) if length == 0: status_headers = StatusAndHeaders('204 No Content', []) @@ -91,6 +93,12 @@ class ArcWarcRecordLoader: status_headers = StatusAndHeaders('200 OK', content_type) + elif (rec_type == 'warcinfo' or + rec_type == 'arc_header' or + rec_type == 'request'): + # not parsing these for now + status_headers = StatusAndHeaders('204 No Content', []) + # special case: http 0.9 response, no status or headers #elif rec_type == 'response': # content_type = rec_headers.get_header('Content-Type') @@ -109,7 +117,7 @@ class ArcWarcRecordLoader: if remains > 0: stream = LimitReader.wrap_stream(stream, remains) - return ArcWarcRecord((the_format, rec_type), + return ArcWarcRecord(the_format, rec_type, rec_headers, stream, status_headers) def _detect_type_load_headers(self, stream): diff --git a/pywb/warc/test/test_loading.py b/pywb/warc/test/test_loading.py index 9979891a..f8d81f0f 100644 --- a/pywb/warc/test/test_loading.py +++ b/pywb/warc/test/test_loading.py @@ -256,7 +256,8 @@ def load_test_archive(test_file, offset, length): archive = testloader.load(path, offset, length) - pprint.pprint((archive.type, archive.rec_headers, archive.status_headers)) + pprint.pprint(((archive.format, archive.rec_type), + archive.rec_headers, archive.status_headers)) #==============================================================================