Mirror of https://github.com/webrecorder/pywb.git — synced 2025-03-24 06:59:52 +01:00

warc: make ArchiveIterator an actual iterator

warc indexing test: add test for reading warc with interspersed empty gzip records, ensure they are ignored
This commit is contained in:
Ilya Kreymer 2017-03-01 12:48:06 -08:00
parent 114ef2a637
commit c66d251a90
2 changed files with 69 additions and 38 deletions

View File

@ -7,6 +7,7 @@ from pywb.warc.recordloader import ArcWarcRecordLoader
import hashlib import hashlib
import base64 import base64
import six
import re import re
import sys import sys
@ -17,8 +18,11 @@ except ImportError: # pragma: no cover
from ordereddict import OrderedDict from ordereddict import OrderedDict
#================================================================= # ============================================================================
class ArchiveIterator(object): BUFF_SIZE = 16384
class ArchiveIterator(six.Iterator):
""" Iterate over records in WARC and ARC files, both gzip chunk """ Iterate over records in WARC and ARC files, both gzip chunk
compressed and uncompressed compressed and uncompressed
@ -52,9 +56,9 @@ class ArchiveIterator(object):
Remainder: {1} Remainder: {1}
""" """
def __init__(self, fileobj, no_record_parse=False, def __init__(self, fileobj, no_record_parse=False,
verify_http=False, arc2warc=False): verify_http=False, arc2warc=False, block_size=BUFF_SIZE):
self.fh = fileobj self.fh = fileobj
self.loader = ArcWarcRecordLoader(verify_http=verify_http, self.loader = ArcWarcRecordLoader(verify_http=verify_http,
@ -69,55 +73,59 @@ class ArchiveIterator(object):
self.member_info = None self.member_info = None
self.no_record_parse = no_record_parse self.no_record_parse = no_record_parse
def __call__(self, block_size=16384):
""" iterate over each record
"""
decomp_type = 'gzip'
self.reader = DecompressingBufferedReader(self.fh, self.reader = DecompressingBufferedReader(self.fh,
block_size=block_size) block_size=block_size)
self.offset = self.fh.tell() self.offset = self.fh.tell()
self.next_line = None self.next_line = None
raise_invalid_gzip = False self._raise_invalid_gzip = False
empty_record = False self._is_empty = False
record = None self._is_first = True
self.last_record = None
def __iter__(self):
return self
def __next__(self):
while True: while True:
if not self._is_first:
self._finish_record()
self._is_first = False
try: try:
curr_offset = self.fh.tell() self.last_record = self._next_record(self.next_line)
record = self._next_record(self.next_line) if self._raise_invalid_gzip:
if raise_invalid_gzip:
self._raise_invalid_gzip_err() self._raise_invalid_gzip_err()
yield record return self.last_record
except EOFError: except EOFError:
empty_record = True self._is_empty = True
if record: def _finish_record(self):
self.read_to_end(record) if self.last_record:
self.read_to_end(self.last_record)
if self.reader.decompressor: if self.reader.decompressor:
# if another gzip member, continue # if another gzip member, continue
if self.reader.read_next_member(): if self.reader.read_next_member():
continue return
# if empty record, then we're done # if empty record, then we're done
elif empty_record: elif self._is_empty:
break raise StopIteration()
# otherwise, probably a gzip # otherwise, probably a gzip
# containing multiple non-chunked records # containing multiple non-chunked records
# raise this as an error # raise this as an error
else: else:
raise_invalid_gzip = True self._raise_invalid_gzip = True
# non-gzip, so we're done # non-gzip, so we're done
elif empty_record: elif self._is_empty:
break raise StopIteration()
def _raise_invalid_gzip_err(self): def _raise_invalid_gzip_err(self):
""" A gzip file with multiple ARC/WARC records, non-chunked """ A gzip file with multiple ARC/WARC records, non-chunked
@ -185,7 +193,7 @@ class ArchiveIterator(object):
curr_offset = self.offset curr_offset = self.offset
while True: while True:
b = record.stream.read(8192) b = record.stream.read(BUFF_SIZE)
if not b: if not b:
break break
num += len(b) num += len(b)
@ -349,7 +357,6 @@ class DefaultRecordParser(object):
def create_record_iter(self, raw_iter): def create_record_iter(self, raw_iter):
append_post = self.options.get('append_post') append_post = self.options.get('append_post')
include_all = self.options.get('include_all') include_all = self.options.get('include_all')
block_size = self.options.get('block_size', 16384)
surt_ordered = self.options.get('surt_ordered', True) surt_ordered = self.options.get('surt_ordered', True)
minimal = self.options.get('minimal') minimal = self.options.get('minimal')
@ -357,7 +364,7 @@ class DefaultRecordParser(object):
raise Exception('Sorry, minimal index option and ' + raise Exception('Sorry, minimal index option and ' +
'append POST options can not be used together') 'append POST options can not be used together')
for record in raw_iter(block_size): for record in raw_iter:
entry = None entry = None
if not include_all and not minimal and (record.status_headers.get_statuscode() == '-'): if not include_all and not minimal and (record.status_headers.get_statuscode() == '-'):

View File

@ -370,6 +370,30 @@ def test_cdxj_empty():
assert buff.getvalue() == b'' assert buff.getvalue() == b''
def test_cdxj_middle_empty_records():
    """Index a WARC containing interspersed empty gzip members.

    Builds an in-memory warc.gz consisting of: one empty gzip record,
    the full example2.warc.gz, two more empty gzip records, and the
    example warc again.  The empty members must be silently skipped, so
    the resulting CDXJ index has exactly one line per real record.
    """
    # A valid but empty (zero payload) gzip member.
    empty_gzip_record = b'\x1f\x8b\x08\x00\x00\x00\x00\x00\x00\x03\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00'

    with open(TEST_WARC_DIR + 'example2.warc.gz', 'rb') as fh:
        warc_payload = fh.read()

    # Interleave empty members before, between, and around two copies
    # of the real warc content.
    new_warc = BytesIO()
    for chunk in (empty_gzip_record, warc_payload,
                  empty_gzip_record, empty_gzip_record, warc_payload):
        new_warc.write(chunk)

    buff = BytesIO()
    new_warc.seek(0)
    write_cdx_index(buff, new_warc, 'empty.warc.gz', **dict(cdxj=True))

    lines = buff.getvalue().rstrip().split(b'\n')
    # Two copies of the warc -> two index entries; empty members ignored.
    assert len(lines) == 2, lines
if __name__ == "__main__": if __name__ == "__main__":
import doctest import doctest