From 09295747b7f4ebcb90111cfee83c84c748196187 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Fri, 13 Oct 2017 17:13:55 +0200 Subject: [PATCH] Extract WARC field "WARC-Identified-Payload-Type" (#251) and add it as field "mime-detected" to index entry --- pywb/indexer/archiveindexer.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pywb/indexer/archiveindexer.py b/pywb/indexer/archiveindexer.py index 1b5f1760..b3ff8f35 100644 --- a/pywb/indexer/archiveindexer.py +++ b/pywb/indexer/archiveindexer.py @@ -265,6 +265,9 @@ class DefaultRecordParser(object): entry.extract_mime(record.http_headers. get_header('Content-Type'), def_mime) + # detected mime from WARC-Identified-Payload-Type + entry['mime-detected'] = record.rec_headers.get_header( + 'WARC-Identified-Payload-Type') # status -- only for response records (by convention): if record.rec_type == 'response' and not self.options.get('minimal'):