warc indexing: better handling of records with content-length too small, read first line to get to warc end (fixes indexing of warc in ikreymer/webarchiveplayer#14)

2025-03-15 00:03:28 +01:00 · 2015-11-26 00:47:15 -08:00 · 2015-11-26 00:47:15 -08:00 · 4303ce4ecb
commit 4303ce4ecb
parent 2922801b7c
1 changed files with 24 additions and 2 deletions
--- a/pywb/warc/archiveiterator.py
+++ b/pywb/warc/archiveiterator.py
@ -43,7 +43,14 @@ class ArchiveIterator(object):
    package which will create a properly chunked gzip file:

    warc2warc -Z myfile.{0} > myfile.{0}.gz
-    """
+"""
+
+    INC_RECORD = """\
+    WARNING: Record not followed by newline, perhaps Content-Length is invalid
+    Offset: {0}
+    Remainder: {1}
+"""
+

    def __init__(self, fileobj, no_record_parse=False,
                 verify_http=False):
@ -130,15 +137,30 @@ class ArchiveIterator(object):

          count empty_size so that it can be substracted from
          the record length for uncompressed
+
+          if first line read is not blank, likely error in WARC/ARC,
+          display a warning
        """
        empty_size = 0
+        first_line = True
+
        while True:
            line = self.reader.readline()
            if len(line) == 0:
                return None, empty_size

-            if line.rstrip() == '':
+            stripped = line.rstrip()
+
+            if stripped == '' or first_line:
                empty_size += len(line)
+
+                if stripped != '':
+                    # if first line is not blank,
+                    # likely content-length was invalid, display warning
+                    err_offset = self.fh.tell() - self.reader.rem_length() - empty_size
+                    sys.stderr.write(self.INC_RECORD.format(err_offset, line))
+
+                first_line = False
                continue

            return line, empty_size