1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

warc indexing: better handling of records with content-length to small, read first line to get to warc end (fixes indexing of warc in ikreymer/webarchiveplayer#14)

This commit is contained in:
Ilya Kreymer 2015-11-26 00:47:15 -08:00
parent 2922801b7c
commit 4303ce4ecb

View File

@ -43,7 +43,14 @@ class ArchiveIterator(object):
package which will create a properly chunked gzip file:
warc2warc -Z myfile.{0} > myfile.{0}.gz
"""
"""
INC_RECORD = """\
WARNING: Record not followed by newline, perhaps Content-Length is invalid
Offset: {0}
Remainder: {1}
"""
def __init__(self, fileobj, no_record_parse=False,
verify_http=False):
@ -130,15 +137,30 @@ class ArchiveIterator(object):
count empty_size so that it can be substracted from
the record length for uncompressed
if first line read is not blank, likely error in WARC/ARC,
display a warning
"""
empty_size = 0
first_line = True
while True:
line = self.reader.readline()
if len(line) == 0:
return None, empty_size
if line.rstrip() == '':
stripped = line.rstrip()
if stripped == '' or first_line:
empty_size += len(line)
if stripped != '':
# if first line is not blank,
# likely content-length was invalid, display warning
err_offset = self.fh.tell() - self.reader.rem_length() - empty_size
sys.stderr.write(self.INC_RECORD.format(err_offset, line))
first_line = False
continue
return line, empty_size