From 7a8fed2681871f71041184ac88893cf9029950ec Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 10 Mar 2017 10:05:39 -0800 Subject: [PATCH] update to warcio 1.1 archiveindexer: explicitly consume content for each record --- pywb/warc/archiveindexer.py | 11 ++++++++++- requirements.txt | 2 +- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/pywb/warc/archiveindexer.py b/pywb/warc/archiveindexer.py index e28271f2..d8059d45 100644 --- a/pywb/warc/archiveindexer.py +++ b/pywb/warc/archiveindexer.py @@ -1,6 +1,8 @@ from pywb.utils.canonicalize import canonicalize from pywb.utils.loaders import extract_post_query, append_post_query +from pywb.webagg.utils import BUFF_SIZE + from warcio.timeutils import iso_date_to_timestamp from warcio.archiveiterator import ArchiveIterator @@ -188,7 +190,14 @@ class DefaultRecordParser(object): entry.record = record self.begin_payload(compute_digest, entry) - raw_iter.read_to_end(record, self.handle_payload) + + while True: + buff = record.raw_stream.read(BUFF_SIZE) + if not buff: + break + self.handle_payload(buff) + + raw_iter.read_to_end(record) entry.set_rec_info(*raw_iter.member_info) self.end_payload(entry) diff --git a/requirements.txt b/requirements.txt index 93bb9d11..96453edb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ six -warcio +warcio==1.1 chardet requests redis