mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
warc indexing: better handling of records with content-length to small, read first line to get to warc end (fixes indexing of warc in ikreymer/webarchiveplayer#14)
This commit is contained in:
parent
2922801b7c
commit
4303ce4ecb
@ -43,7 +43,14 @@ class ArchiveIterator(object):
|
||||
package which will create a properly chunked gzip file:
|
||||
|
||||
warc2warc -Z myfile.{0} > myfile.{0}.gz
|
||||
"""
|
||||
"""
|
||||
|
||||
INC_RECORD = """\
|
||||
WARNING: Record not followed by newline, perhaps Content-Length is invalid
|
||||
Offset: {0}
|
||||
Remainder: {1}
|
||||
"""
|
||||
|
||||
|
||||
def __init__(self, fileobj, no_record_parse=False,
|
||||
verify_http=False):
|
||||
@ -130,15 +137,30 @@ class ArchiveIterator(object):
|
||||
|
||||
count empty_size so that it can be substracted from
|
||||
the record length for uncompressed
|
||||
|
||||
if first line read is not blank, likely error in WARC/ARC,
|
||||
display a warning
|
||||
"""
|
||||
empty_size = 0
|
||||
first_line = True
|
||||
|
||||
while True:
|
||||
line = self.reader.readline()
|
||||
if len(line) == 0:
|
||||
return None, empty_size
|
||||
|
||||
if line.rstrip() == '':
|
||||
stripped = line.rstrip()
|
||||
|
||||
if stripped == '' or first_line:
|
||||
empty_size += len(line)
|
||||
|
||||
if stripped != '':
|
||||
# if first line is not blank,
|
||||
# likely content-length was invalid, display warning
|
||||
err_offset = self.fh.tell() - self.reader.rem_length() - empty_size
|
||||
sys.stderr.write(self.INC_RECORD.format(err_offset, line))
|
||||
|
||||
first_line = False
|
||||
continue
|
||||
|
||||
return line, empty_size
|
||||
|
Loading…
x
Reference in New Issue
Block a user