1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

warc: separate parse_record_loader() to enable direct parsing

of a file-like stream
detect and ignore warcinfo and arc header
This commit is contained in:
Ilya Kreymer 2014-03-29 15:58:03 -07:00
parent 99eadb3d4f
commit 7760b9b5a2
2 changed files with 26 additions and 17 deletions

View File

@ -13,8 +13,8 @@ from pywb.utils.wbexception import WbException
#=================================================================
ArcWarcRecord = collections.namedtuple('ArchiveRecord',
'type, rec_headers, ' +
ArcWarcRecord = collections.namedtuple('ArcWarcRecord',
'format, rec_type, rec_headers, ' +
'stream, status_headers')
@ -32,7 +32,7 @@ class ArchiveLoadFailed(WbException):
#=================================================================
class ArcWarcRecordLoader:
# Standard ARC headers
ARC_HEADERS = ["uri", "ip-address", "creation-date",
ARC_HEADERS = ["uri", "ip-address", "archive-date",
"content-type", "length"]
def __init__(self, loader=None, cookie_maker=None, block_size=8192):
@ -49,36 +49,38 @@ class ArcWarcRecordLoader:
self.http_parser = StatusAndHeadersParser(['HTTP/1.0', 'HTTP/1.1'])
def load(self, url, offset, length):
url_parts = urlparse.urlsplit(url)
#loader = self.loaders.get(url_parts.scheme)
#if not loader:
# raise ArchiveLoadFailed('Unknown Protocol', url)
try:
length = int(length)
except:
length = -1
raw = self.loader.load(url, long(offset), length)
stream = self.loader.load(url, long(offset), length)
decomp_type = 'gzip'
# Create decompressing stream
stream = DecompressingBufferedReader(stream=raw,
stream = DecompressingBufferedReader(stream=stream,
decomp_type=decomp_type,
block_size=self.block_size)
return self.parse_record_stream(stream)
def parse_record_stream(self, stream):
(the_format, rec_headers) = self._detect_type_load_headers(stream)
if the_format == 'arc':
rec_type = 'response'
length = int(rec_headers.get_header('length'))
if rec_headers.get_header('uri').startswith('filedesc://'):
rec_type = 'arc_header'
length = 0
else:
rec_type = 'response'
length = int(rec_headers.get_header('length'))
elif the_format == 'warc':
rec_type = rec_headers.get_header('WARC-Type')
length = int(rec_headers.get_header('Content-Length'))
# ================================================================
# handle different types of records
# special case: empty w/arc record (hopefully a revisit)
if length == 0:
status_headers = StatusAndHeaders('204 No Content', [])
@ -91,6 +93,12 @@ class ArcWarcRecordLoader:
status_headers = StatusAndHeaders('200 OK', content_type)
elif (rec_type == 'warcinfo' or
rec_type == 'arc_header' or
rec_type == 'request'):
# not parsing these for now
status_headers = StatusAndHeaders('204 No Content', [])
# special case: http 0.9 response, no status or headers
#elif rec_type == 'response':
# content_type = rec_headers.get_header('Content-Type')
@ -109,7 +117,7 @@ class ArcWarcRecordLoader:
if remains > 0:
stream = LimitReader.wrap_stream(stream, remains)
return ArcWarcRecord((the_format, rec_type),
return ArcWarcRecord(the_format, rec_type,
rec_headers, stream, status_headers)
def _detect_type_load_headers(self, stream):

View File

@ -256,7 +256,8 @@ def load_test_archive(test_file, offset, length):
archive = testloader.load(path, offset, length)
pprint.pprint((archive.type, archive.rec_headers, archive.status_headers))
pprint.pprint(((archive.format, archive.rec_type),
archive.rec_headers, archive.status_headers))
#==============================================================================