From dfc30331174c17f45a5c13fac28a4fd2a37b15d0 Mon Sep 17 00:00:00 2001
From: John Berlin
Date: Mon, 20 Aug 2018 18:04:09 -0400
Subject: [PATCH] Added skipping of metadata records with mime = text/anvl to cdxindexer.py. (#366)

Updated test_indexing.py to include a test for no-indexing metadata records with mime == text/anvl

Fixes https://github.com/webrecorder/webrecorderplayer-electron/issues/63.
---
NOTE(review): this copy of the patch had been collapsed onto two lines; the
line structure and the Python indentation below were reconstructed from the
hunk headers and the diffstat. Both WARC-Record-ID header values (and the
author e-mail) are empty in this copy -- presumably angle-bracketed values
were stripped during extraction. Confirm against the upstream commit before
applying: the "310" field in the expected CDX line presumably depends on the
exact byte length of the first record.

 pywb/indexer/cdxindexer.py         | 11 +++++++-
 pywb/indexer/test/test_indexing.py | 41 ++++++++++++++++++++++++++++++
 2 files changed, 51 insertions(+), 1 deletion(-)

diff --git a/pywb/indexer/cdxindexer.py b/pywb/indexer/cdxindexer.py
index 325b2279..a947f142 100644
--- a/pywb/indexer/cdxindexer.py
+++ b/pywb/indexer/cdxindexer.py
@@ -38,6 +38,10 @@ import six
 
 #=================================================================
 class BaseCDXWriter(object):
+    # To ensure we do not index metadata mime types
+    # from older WARC specs (Heritrix 1.x) that collide with response records
+    METADATA_NO_INDEX_TYPES = ('text/anvl', )
+
     def __init__(self, out):
         self.out = codecs.getwriter('utf-8')(out)
         #self.out = out
@@ -50,11 +54,16 @@ class BaseCDXWriter(object):
         if not entry.get('url') or not entry.get('urlkey'):
             return
 
-        if entry.record.rec_type == 'warcinfo':
+        if self._is_skipped(entry):
             return
 
         self.write_cdx_line(self.out, entry, filename)
 
+    def _is_skipped(self, entry):
+        if entry.record.rec_type == 'warcinfo':
+            return True
+        return entry.record.rec_type == 'metadata' and entry['mime'] in self.METADATA_NO_INDEX_TYPES
+
     def __exit__(self, *args):
         return False
 
diff --git a/pywb/indexer/test/test_indexing.py b/pywb/indexer/test/test_indexing.py
index 80a9f8d9..86e4f7d8 100644
--- a/pywb/indexer/test/test_indexing.py
+++ b/pywb/indexer/test/test_indexing.py
@@ -422,6 +422,47 @@ com,example)/%c3%83%c2%a9 20000101000000 http://example.com/\xc3\x83\xc2\xa9 tex
 """
 
 
+def test_no_index_metadata_mime_textanvl():
+    test_data = b'\
+WARC/0.18\r\n\
+WARC-Type: response\r\n\
+WARC-Record-ID: \r\n\
+WARC-Target-URI: http://example.com/xyz.pdf\r\n\
+WARC-Date: 2014-04-01T05:20:11Z\r\n\
+WARC-Payload-Digest: sha1:EDIYL6WNHDY62TPKUCPSEWMOAAGYTOAS\r\n\
+Content-Type: application/http; msgtype=response\r\n\
+Content-Length: 4\r\n\
+\r\n\
+ABCD\r\n\
+\r\n\
+\r\n\
+\r\n\
+WARC/0.18\r\n\
+WARC-Type: metadata\r\n\
+WARC-Record-ID: \r\n\
+WARC-Target-URI: http://example.com/xyz.pdf\r\n\
+WARC-Date: 2014-04-01T05:20:11Z\r\n\
+WARC-Payload-Digest: sha1:EDIYL6WNHDY62TPKUCPSEWMOAAGYTOAS\r\n\
+Content-Type: text/anvl\r\n\
+Content-Length: 4\r\n\
+\r\n\
+ABCD\r\n\
+\r\n\
+'
+    options = dict(include_all=True)
+
+    buff = BytesIO()
+
+    test_record = BytesIO(test_data)
+
+    write_cdx_index(buff, test_record, 'test.warc.gz', **options)
+
+    assert buff.getvalue() == b"""\
+ CDX N b a m s k r M S V g
+com,example)/xyz.pdf 20140401052011 http://example.com/xyz.pdf application/http 200 EDIYL6WNHDY62TPKUCPSEWMOAAGYTOAS - - 310 0 test.warc.gz
+"""
+
+
 if __name__ == "__main__":
     import doctest
     doctest.testmod()