diff --git a/pywb/warc/archiveiterator.py b/pywb/warc/archiveiterator.py index e8925eb8..b6341860 100644 --- a/pywb/warc/archiveiterator.py +++ b/pywb/warc/archiveiterator.py @@ -43,7 +43,14 @@ class ArchiveIterator(object): package which will create a properly chunked gzip file: warc2warc -Z myfile.{0} > myfile.{0}.gz - """ +""" + + INC_RECORD = """\ + WARNING: Record not followed by newline, perhaps Content-Length is invalid + Offset: {0} + Remainder: {1} +""" + def __init__(self, fileobj, no_record_parse=False, verify_http=False): @@ -130,15 +137,30 @@ class ArchiveIterator(object): count empty_size so that it can be substracted from the record length for uncompressed + + if first line read is not blank, likely error in WARC/ARC, + display a warning """ empty_size = 0 + first_line = True + while True: line = self.reader.readline() if len(line) == 0: return None, empty_size - if line.rstrip() == '': + stripped = line.rstrip() + + if stripped == '' or first_line: empty_size += len(line) + + if stripped != '': + # if first line is not blank, + # likely content-length was invalid, display warning + err_offset = self.fh.tell() - self.reader.rem_length() - empty_size + sys.stderr.write(self.INC_RECORD.format(err_offset, line)) + + first_line = False continue return line, empty_size