1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-28 00:25:21 +01:00

warc indexing: better handling of records with content-length too small, read first line to get to warc end (fixes indexing of warc in ikreymer/webarchiveplayer#14)

This commit is contained in:
Ilya Kreymer 2015-11-26 00:47:15 -08:00
parent 2922801b7c
commit 4303ce4ecb

View File

@ -43,7 +43,14 @@ class ArchiveIterator(object):
package which will create a properly chunked gzip file: package which will create a properly chunked gzip file:
warc2warc -Z myfile.{0} > myfile.{0}.gz warc2warc -Z myfile.{0} > myfile.{0}.gz
""" """
INC_RECORD = """\
WARNING: Record not followed by newline, perhaps Content-Length is invalid
Offset: {0}
Remainder: {1}
"""
def __init__(self, fileobj, no_record_parse=False, def __init__(self, fileobj, no_record_parse=False,
verify_http=False): verify_http=False):
@ -130,15 +137,30 @@ class ArchiveIterator(object):
count empty_size so that it can be substracted from count empty_size so that it can be substracted from
the record length for uncompressed the record length for uncompressed
if first line read is not blank, likely error in WARC/ARC,
display a warning
""" """
empty_size = 0 empty_size = 0
first_line = True
while True: while True:
line = self.reader.readline() line = self.reader.readline()
if len(line) == 0: if len(line) == 0:
return None, empty_size return None, empty_size
if line.rstrip() == '': stripped = line.rstrip()
if stripped == '' or first_line:
empty_size += len(line) empty_size += len(line)
if stripped != '':
# if first line is not blank,
# likely content-length was invalid, display warning
err_offset = self.fh.tell() - self.reader.rem_length() - empty_size
sys.stderr.write(self.INC_RECORD.format(err_offset, line))
first_line = False
continue continue
return line, empty_size return line, empty_size