1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

archiveiterator/cdxindexer: cleaner load path for compressed and

uncompressed, ability to distinguish between chunked and non-chunked
warcs/arcs
Raise error for non-chunked gzip warcs as they cannot be indexed for
replay, addressing #48
add 'bad' non-chunked gzip file for testing, using custom ext
This commit is contained in:
Ilya Kreymer 2014-11-06 01:29:14 -08:00
parent 044792f99f
commit 49e98e0cdc
3 changed files with 74 additions and 20 deletions

View File

@ -21,6 +21,25 @@ class ArchiveIterator(object):
"""
GZIP_ERR_MSG = """
ERROR: Non-chunked gzip file detected, gzip block continues
beyond single record.
This file is probably not a multi-chunk gzip but a single gzip file.
To allow seek, a gzipped {1} must have each record compressed into
a single gzip chunk and concatenated together.
This file is likely still valid and you can use it by decompressing it:
gunzip myfile.{0}.gz
You can then also use the 'warc2warc' tool from the 'warc-tools'
package which will create a properly chunked gzip file:
warc2warc -Z myfile.{0} > myfile.{0}.gz
"""
def __init__(self, fileobj):
self.fh = fileobj
@ -42,27 +61,34 @@ class ArchiveIterator(object):
block_size=block_size)
self.offset = self.fh.tell()
next_line = None
self.next_line = None
is_valid = True
while True:
try:
record = self._next_record(next_line)
record = self._next_record(self.next_line)
if not is_valid:
self._raise_err()
yield record
except EOFError:
break
self.read_to_end(record)
# for non-compressed, consume blank lines here
if not self.reader.decompressor:
next_line = self._consume_blanklines()
if next_line is None:
# at end of file
break
if self.reader.decompressor:
is_valid = self.reader.read_next_member()
# reset reader for next member
else:
self.reader.read_next_member()
def _raise_err(self):
    """Abort iteration of a non-chunked gzip archive with an explanatory error.

    Raises:
        Exception: always; message is GZIP_ERR_MSG filled in with the
            archive format (lowercase and uppercase variants).
    """
    # Fall back to a generic 'warc/arc' label when the format is unknown.
    frmt = self.known_format if self.known_format else 'warc/arc'
    # GZIP_ERR_MSG uses {0} for the lowercase name, {1} for the uppercase one.
    raise Exception(self.GZIP_ERR_MSG.format(frmt, frmt.upper()))
def _consume_blanklines(self):
""" Consume blank lines that are between records
@ -72,25 +98,31 @@ class ArchiveIterator(object):
and are included in record length which is the full gzip envelope
- For uncompressed, they are between records and so are NOT part of
the record length
count empty_size so that it can be subtracted from
the record length for uncompressed
"""
empty_size = 0
while True:
line = self.reader.readline()
if len(line) == 0:
return None
return None, empty_size
if line.rstrip() == '':
self.offset = self.fh.tell() - self.reader.rem_length()
empty_size += len(line)
continue
return line
return line, empty_size
def read_to_end(self, record, compute_digest=False):
""" Read remainder of the stream
If a digester is included, update it
with the data read
"""
# already at end of this record, don't read until it is consumed
if self.member_info:
return self.member_info
return None
if compute_digest:
digester = hashlib.sha1()
@ -114,19 +146,29 @@ class ArchiveIterator(object):
- For uncompressed files, blank lines are read later,
and not included in the record length
"""
if self.reader.decompressor:
self._consume_blanklines()
#if self.reader.decompressor:
self.next_line, empty_size = self._consume_blanklines()
self.offset = self.fh.tell() - self.reader.rem_length()
#if self.offset < 0:
# raise Exception('Not Gzipped Properly')
if self.next_line:
self.offset -= len(self.next_line)
length = self.offset - curr_offset
if not self.reader.decompressor:
length -= empty_size
if compute_digest:
digest = base64.b32encode(digester.digest())
else:
digest = None
self.member_info = (curr_offset, length, digest)
return self.member_info
#return self.member_info
#return next_line
def _next_record(self, next_line):
""" Use loader to parse the record from the reader stream
@ -250,7 +292,9 @@ def create_record_iter(arcv_iter, options):
entry.post_query = post_query
entry.set_rec_info(*arcv_iter.read_to_end(record, compute_digest))
#entry.set_rec_info(*arcv_iter.read_to_end(record, compute_digest))
arcv_iter.read_to_end(record, compute_digest)
entry.set_rec_info(*arcv_iter.member_info)
entry.record = record
yield entry

View File

@ -144,7 +144,6 @@ Total: 4
from pywb import get_test_dir
#from pywb.warc.archiveindexer import ArchiveIndexer, main, cdx_filename
from pywb.warc.cdxindexer import write_cdx_index, main, cdx_filename
from io import BytesIO
@ -154,6 +153,9 @@ import os
import shutil
import tempfile
from pytest import raises
TEST_CDX_DIR = get_test_dir() + 'cdx/'
TEST_WARC_DIR = get_test_dir() + 'warcs/'
@ -231,3 +233,11 @@ def cli_lines_with_dir(input_):
print('Total: ' + str(len(lines)))
def test_non_chunked_gzip_err():
    """Indexing a gzip WARC that is not chunked per-record must raise (issue #48).

    The '.bad' extension marks a fixture that is a single-stream gzip file
    rather than one gzip member per record, which cannot be seeked for replay.
    """
    with raises(Exception):
        print_cdx_index('example-bad.warc.gz.bad')
if __name__ == "__main__":
    # When run as a script, execute the module's doctests.
    import doctest
    doctest.testmod()

Binary file not shown.