mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
warc indexing: refactor to add create_payload_buffer() which can be overridden in custom iterators to create a file-like object
that will receive write() calls to buffer the payload when indexing. Default implementation does not buffer the payload
This commit is contained in:
parent
98843a2551
commit
d1c0bfac10
@ -166,7 +166,7 @@ class ArchiveIterator(object):
|
|||||||
|
|
||||||
return line, empty_size
|
return line, empty_size
|
||||||
|
|
||||||
def read_to_end(self, record, compute_digest=False):
|
def read_to_end(self, record, payload_callback=None):
|
||||||
""" Read remainder of the stream
|
""" Read remainder of the stream
|
||||||
If a digester is included, update it
|
If a digester is included, update it
|
||||||
with the data read
|
with the data read
|
||||||
@ -176,11 +176,6 @@ class ArchiveIterator(object):
|
|||||||
if self.member_info:
|
if self.member_info:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
if compute_digest:
|
|
||||||
digester = hashlib.sha1()
|
|
||||||
else:
|
|
||||||
digester = None
|
|
||||||
|
|
||||||
num = 0
|
num = 0
|
||||||
curr_offset = self.offset
|
curr_offset = self.offset
|
||||||
|
|
||||||
@ -189,8 +184,8 @@ class ArchiveIterator(object):
|
|||||||
if not b:
|
if not b:
|
||||||
break
|
break
|
||||||
num += len(b)
|
num += len(b)
|
||||||
if digester:
|
if payload_callback:
|
||||||
digester.update(b)
|
payload_callback(b)
|
||||||
|
|
||||||
"""
|
"""
|
||||||
- For compressed files, blank lines are consumed
|
- For compressed files, blank lines are consumed
|
||||||
@ -213,12 +208,7 @@ class ArchiveIterator(object):
|
|||||||
if not self.reader.decompressor:
|
if not self.reader.decompressor:
|
||||||
length -= empty_size
|
length -= empty_size
|
||||||
|
|
||||||
if compute_digest:
|
self.member_info = (curr_offset, length)
|
||||||
digest = base64.b32encode(digester.digest())
|
|
||||||
else:
|
|
||||||
digest = None
|
|
||||||
|
|
||||||
self.member_info = (curr_offset, length, digest)
|
|
||||||
#return self.member_info
|
#return self.member_info
|
||||||
#return next_line
|
#return next_line
|
||||||
|
|
||||||
@ -243,9 +233,16 @@ class ArchiveIterator(object):
|
|||||||
class ArchiveIndexEntryMixin(object):
|
class ArchiveIndexEntryMixin(object):
|
||||||
MIME_RE = re.compile('[; ]')
|
MIME_RE = re.compile('[; ]')
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
super(ArchiveIndexEntryMixin, self).__init__()
|
||||||
|
self.reset_entry()
|
||||||
|
|
||||||
def reset_entry(self):
|
def reset_entry(self):
|
||||||
self['urlkey'] = ''
|
self['urlkey'] = ''
|
||||||
self['metadata'] = ''
|
self['metadata'] = ''
|
||||||
|
self.buffer = None
|
||||||
|
self.record = None
|
||||||
|
|
||||||
|
|
||||||
def extract_mime(self, mime, def_mime='unk'):
|
def extract_mime(self, mime, def_mime='unk'):
|
||||||
""" Utility function to extract mimetype only
|
""" Utility function to extract mimetype only
|
||||||
@ -264,10 +261,7 @@ class ArchiveIndexEntryMixin(object):
|
|||||||
elif self['status'] == '204' and 'Error' in status_headers.statusline:
|
elif self['status'] == '204' and 'Error' in status_headers.statusline:
|
||||||
self['status'] = '-'
|
self['status'] = '-'
|
||||||
|
|
||||||
def set_rec_info(self, offset, length, digest):
|
def set_rec_info(self, offset, length):
|
||||||
if digest:
|
|
||||||
self['digest'] = digest
|
|
||||||
|
|
||||||
self['length'] = str(length)
|
self['length'] = str(length)
|
||||||
self['offset'] = str(offset)
|
self['offset'] = str(offset)
|
||||||
|
|
||||||
@ -300,6 +294,8 @@ class DefaultRecordIter(object):
|
|||||||
def __init__(self, **options):
|
def __init__(self, **options):
|
||||||
self.options = options
|
self.options = options
|
||||||
self.entry_cache = {}
|
self.entry_cache = {}
|
||||||
|
self.digester = None
|
||||||
|
self.buff = None
|
||||||
|
|
||||||
def _create_index_entry(self, rec_type):
|
def _create_index_entry(self, rec_type):
|
||||||
try:
|
try:
|
||||||
@ -315,13 +311,37 @@ class DefaultRecordIter(object):
|
|||||||
|
|
||||||
return entry
|
return entry
|
||||||
|
|
||||||
|
def begin_payload(self, compute_digest, entry):
|
||||||
|
if compute_digest:
|
||||||
|
self.digester = hashlib.sha1()
|
||||||
|
else:
|
||||||
|
self.digester = None
|
||||||
|
|
||||||
|
self.entry = entry
|
||||||
|
entry.buffer = self.create_payload_buffer(entry)
|
||||||
|
|
||||||
|
def handle_payload(self, buff):
|
||||||
|
if self.digester:
|
||||||
|
self.digester.update(buff)
|
||||||
|
|
||||||
|
if self.entry and self.entry.buffer:
|
||||||
|
self.entry.buffer.write(buff)
|
||||||
|
|
||||||
|
def end_payload(self, entry):
|
||||||
|
if self.digester:
|
||||||
|
entry['digest'] = base64.b32encode(self.digester.digest())
|
||||||
|
|
||||||
|
self.entry = None
|
||||||
|
|
||||||
|
def create_payload_buffer(self, entry):
|
||||||
|
return None
|
||||||
|
|
||||||
def create_record_iter(self, arcv_iter):
|
def create_record_iter(self, arcv_iter):
|
||||||
append_post = self.options.get('append_post')
|
append_post = self.options.get('append_post')
|
||||||
include_all = self.options.get('include_all')
|
include_all = self.options.get('include_all')
|
||||||
block_size = self.options.get('block_size', 16384)
|
block_size = self.options.get('block_size', 16384)
|
||||||
surt_ordered = self.options.get('surt_ordered', True)
|
surt_ordered = self.options.get('surt_ordered', True)
|
||||||
minimal = self.options.get('minimal')
|
minimal = self.options.get('minimal')
|
||||||
append_post = self.options.get('append_post')
|
|
||||||
|
|
||||||
if append_post and minimal:
|
if append_post and minimal:
|
||||||
raise Exception('Sorry, minimal index option and ' +
|
raise Exception('Sorry, minimal index option and ' +
|
||||||
@ -371,10 +391,14 @@ class DefaultRecordIter(object):
|
|||||||
|
|
||||||
entry['_post_query'] = post_query
|
entry['_post_query'] = post_query
|
||||||
|
|
||||||
arcv_iter.read_to_end(record, compute_digest)
|
|
||||||
entry.set_rec_info(*arcv_iter.member_info)
|
|
||||||
entry.record = record
|
entry.record = record
|
||||||
|
|
||||||
|
self.begin_payload(compute_digest, entry)
|
||||||
|
arcv_iter.read_to_end(record, self.handle_payload)
|
||||||
|
|
||||||
|
entry.set_rec_info(*arcv_iter.member_info)
|
||||||
|
self.end_payload(entry)
|
||||||
|
|
||||||
yield entry
|
yield entry
|
||||||
|
|
||||||
def join_request_records(self, entry_iter):
|
def join_request_records(self, entry_iter):
|
||||||
|
@ -122,7 +122,7 @@ class ArcWarcRecordLoader:
|
|||||||
length = int(length) - sub_len
|
length = int(length) - sub_len
|
||||||
if length < 0:
|
if length < 0:
|
||||||
is_err = True
|
is_err = True
|
||||||
except ValueError:
|
except (ValueError, TypeError):
|
||||||
is_err = True
|
is_err = True
|
||||||
|
|
||||||
# err condition
|
# err condition
|
||||||
|
Loading…
x
Reference in New Issue
Block a user