From 147c3217dd87cc0f402b0e00c800760ca91a46b7 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 1 May 2017 21:50:39 -0700 Subject: [PATCH] update to warcio==1.3 recorder: use ArcWarcRecordLoader() for parsing response record multifilewarcwriter: ensure digest is computed before trying to lookup revisits --- pywb/recorder/multifilewarcwriter.py | 3 +++ pywb/recorder/recorderapp.py | 4 +++- requirements.txt | 2 +- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/pywb/recorder/multifilewarcwriter.py b/pywb/recorder/multifilewarcwriter.py index 9c0396ef..06f980bb 100644 --- a/pywb/recorder/multifilewarcwriter.py +++ b/pywb/recorder/multifilewarcwriter.py @@ -44,6 +44,9 @@ class MultiFileWARCWriter(BaseWARCWriter): if not self.dedup_index or record.rec_type != 'response': return record + # ensure payload digest is computed at this point + self.ensure_digest(record, block=False, payload=True) + try: url = record.rec_headers.get_header('WARC-Target-URI') digest = record.rec_headers.get_header('WARC-Payload-Digest') diff --git a/pywb/recorder/recorderapp.py b/pywb/recorder/recorderapp.py index 8641fc61..56d897ef 100644 --- a/pywb/recorder/recorderapp.py +++ b/pywb/recorder/recorderapp.py @@ -2,6 +2,8 @@ from pywb.webagg.utils import StreamIter, BUFF_SIZE from pywb.webagg.utils import ParamFormatter, res_template from pywb.webagg.inputrequest import DirectWSGIInputRequest +from warcio.recordloader import ArcWarcRecordLoader + from pywb.recorder.filters import SkipRangeRequestFilter, CollectionFilter from six.moves.urllib.parse import parse_qsl @@ -69,7 +71,7 @@ class RecorderApp(object): resp_length = resp_pay.tell() resp_pay.seek(0) - resp = self.writer.create_record_from_stream(resp_pay, resp_length) + resp = ArcWarcRecordLoader().parse_record_stream(resp_pay) if resp.rec_type == 'response': uri = resp.rec_headers.get_header('WARC-Target-Uri') diff --git a/requirements.txt b/requirements.txt index 5e6367b9..61b4653a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ six -warcio==1.2 +warcio==1.3 chardet requests redis