1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Added skipping of metadata records with mime = text/anvl to cdxindexer.py. (#366)

Updated test_indexing.py to include a test verifying that metadata records with mime == text/anvl are not indexed.
Fixes https://github.com/webrecorder/webrecorderplayer-electron/issues/63.
This commit is contained in:
John Berlin 2018-08-20 18:04:09 -04:00 committed by Ilya Kreymer
parent d62ab14914
commit dfc3033117
2 changed files with 51 additions and 1 deletions

View File

@ -38,6 +38,10 @@ import six
#=================================================================
class BaseCDXWriter(object):
# To ensure we do not index metadata mime types
# from older WARC specs (Heritrix 1.x) that collide with response records
METADATA_NO_INDEX_TYPES = ('text/anvl', )
def __init__(self, out):
self.out = codecs.getwriter('utf-8')(out)
#self.out = out
@ -50,11 +54,16 @@ class BaseCDXWriter(object):
if not entry.get('url') or not entry.get('urlkey'):
return
if entry.record.rec_type == 'warcinfo':
if self._is_skipped(entry):
return
self.write_cdx_line(self.out, entry, filename)
def _is_skipped(self, entry):
if entry.record.rec_type == 'warcinfo':
return True
return entry.record.rec_type == 'metadata' and entry['mime'] in self.METADATA_NO_INDEX_TYPES
def __exit__(self, *args):
return False

View File

@ -422,6 +422,47 @@ com,example)/%c3%83%c2%a9 20000101000000 http://example.com/\xc3\x83\xc2\xa9 tex
"""
def test_no_index_metadata_mime_textanvl():
    """A metadata record with mime text/anvl must not appear in the CDX output.

    The input stream holds a response record followed by a metadata record
    for the same target URI.  Indexing runs with ``include_all=True`` and the
    expected output contains a single line, for the response record only.
    """
    response_record = (
        b'WARC/0.18\r\n'
        b'WARC-Type: response\r\n'
        b'WARC-Record-ID: <urn:uuid:1fd7789c-9cd5-47ea-b7ba-2a97dc06680b>\r\n'
        b'WARC-Target-URI: http://example.com/xyz.pdf\r\n'
        b'WARC-Date: 2014-04-01T05:20:11Z\r\n'
        b'WARC-Payload-Digest: sha1:EDIYL6WNHDY62TPKUCPSEWMOAAGYTOAS\r\n'
        b'Content-Type: application/http; msgtype=response\r\n'
        b'Content-Length: 4\r\n'
        b'\r\n'
        b'ABCD\r\n'
        b'\r\n'
        b'\r\n'
        b'\r\n'
    )
    metadata_record = (
        b'WARC/0.18\r\n'
        b'WARC-Type: metadata\r\n'
        b'WARC-Record-ID: <urn:uuid:0735267f-5749-4c02-b08b-955af5d76032>\r\n'
        b'WARC-Target-URI: http://example.com/xyz.pdf\r\n'
        b'WARC-Date: 2014-04-01T05:20:11Z\r\n'
        b'WARC-Payload-Digest: sha1:EDIYL6WNHDY62TPKUCPSEWMOAAGYTOAS\r\n'
        b'Content-Type: text/anvl\r\n'
        b'Content-Length: 4\r\n'
        b'\r\n'
        b'ABCD\r\n'
        b'\r\n'
    )

    warc_stream = BytesIO(response_record + metadata_record)
    cdx_out = BytesIO()

    write_cdx_index(cdx_out, warc_stream, 'test.warc.gz', include_all=True)

    # NOTE(review): CDX header lines conventionally begin with a single
    # space (' CDX ...'); confirm no leading whitespace was lost here.
    expected_cdx = (
        b'CDX N b a m s k r M S V g\n'
        b'com,example)/xyz.pdf 20140401052011 http://example.com/xyz.pdf '
        b'application/http 200 EDIYL6WNHDY62TPKUCPSEWMOAAGYTOAS - - 310 0 '
        b'test.warc.gz\n'
    )
    assert cdx_out.getvalue() == expected_cdx
if __name__ == "__main__":
    # When executed directly as a script, run the doctests in this module.
    import doctest

    doctest.testmod()