From 4303ce4ecbdf852a9dc762a90bc145b7895e76fd Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 26 Nov 2015 00:47:15 -0800 Subject: [PATCH] warc indexing: better handling of records with content-length too small, read first line to get to warc end (fixes indexing of warc in ikreymer/webarchiveplayer#14) --- pywb/warc/archiveiterator.py | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/pywb/warc/archiveiterator.py b/pywb/warc/archiveiterator.py index e8925eb8..b6341860 100644 --- a/pywb/warc/archiveiterator.py +++ b/pywb/warc/archiveiterator.py @@ -43,7 +43,14 @@ class ArchiveIterator(object): package which will create a properly chunked gzip file: warc2warc -Z myfile.{0} > myfile.{0}.gz - """ +""" + + INC_RECORD = """\ + WARNING: Record not followed by newline, perhaps Content-Length is invalid + Offset: {0} + Remainder: {1} +""" + def __init__(self, fileobj, no_record_parse=False, verify_http=False): @@ -130,15 +137,30 @@ class ArchiveIterator(object): count empty_size so that it can be substracted from the record length for uncompressed + + if first line read is not blank, likely error in WARC/ARC, + display a warning """ empty_size = 0 + first_line = True + while True: line = self.reader.readline() if len(line) == 0: return None, empty_size - if line.rstrip() == '': + stripped = line.rstrip() + + if stripped == '' or first_line: empty_size += len(line) + + if stripped != '': + # if first line is not blank, + # likely content-length was invalid, display warning + err_offset = self.fh.tell() - self.reader.rem_length() - empty_size + sys.stderr.write(self.INC_RECORD.format(err_offset, line)) + + first_line = False continue return line, empty_size