1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-26 07:49:24 +01:00
pywb/pywb/recorder/test/test_recorder.py
Ilya Kreymer bcb5bef39d Windows Build Fixes/Appveyor CI (#225)
windows build fixes: all tests should pass, ci with appveyor
- add appveyor.yml
- path fixes for windows, use os.path.join
- templates_dir: use '/' always for jinja2 paths
- auto colls: ensure chdir before deleting dir
- recorder: ensure warc writer is always closed
- recorder: disable locking in warcwriter on windows for now (read access not avail, shared
lock seems to not be working)
- zipnum: ensure block is closed after read!
- cached dir test: wait before adding file
- tests: adjust timeout tests to allow more leeway in timing
2017-08-05 17:12:16 -07:00

642 lines
24 KiB
Python

from gevent import monkey; monkey.patch_all()
import gevent
from pywb.warcserver.test.testutils import TempDirTests, LiveServerTests, BaseTestClass, to_path
from pywb.warcserver.test.testutils import FakeRedisTests
import os
import webtest
from pytest import raises
from fakeredis import FakeStrictRedis
from pywb.recorder.recorderapp import RecorderApp
from pywb.recorder.redisindexer import WritableRedisIndexer
from pywb.recorder.multifilewarcwriter import PerRecordWARCWriter, MultiFileWARCWriter
from pywb.recorder.filters import ExcludeSpecificHeaders, ExcludeHttpOnlyCookieHeaders
from pywb.recorder.filters import SkipDupePolicy, WriteDupePolicy, WriteRevisitDupePolicy
from pywb.utils.memento import MementoUtils
from pywb.warcserver.index.cdxobject import CDXObject
from warcio.statusandheaders import StatusAndHeadersParser
from warcio.bufferedreaders import DecompressingBufferedReader
from warcio.recordloader import ArcWarcRecordLoader
from warcio.archiveiterator import ArchiveIterator
from pywb.indexer.cdxindexer import write_cdx_index
from six.moves.urllib.parse import quote, unquote, urlencode
from io import BytesIO
import time
import json
# Browser User-Agent value embedded in every canned request below.
UA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'

# Raw HTTP/1.1 GET request template. ``{path}`` and ``{host}`` are left as
# ``str.format`` placeholders for the caller; the User-agent line is filled
# in here with UA.
general_req_data = (
    "GET {path} HTTP/1.1\r\n"
    "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8\r\n"
    "User-agent: %s\r\n"
    "X-Other: foo\r\n"
    "Host: {host}\r\n"
    "Cookie: boo=far\r\n"
    "\r\n"
) % UA
class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass):
@classmethod
def setup_class(cls):
    """Run the inherited class setup, then prepare shared recorder fixtures.

    Creates the ``warcs`` directory under ``cls.root_dir`` and stores the
    upstream URL, built from the port of ``cls.server``.
    """
    super(TestRecorder, cls).setup_class()

    warcs_dir = to_path(cls.root_dir + '/warcs')
    cls.warcs_dir = warcs_dir
    os.makedirs(warcs_dir)

    port = cls.server.port
    cls.upstream_url = 'http://localhost:{0}'.format(port)
def _get_dedup_index(self, dupe_policy=None, user=True):
    """Build a ``WritableRedisIndexer`` pointing at redis db 2.

    :param dupe_policy: duplicate-handling policy passed to the indexer.
        When omitted (``None``), a fresh ``WriteRevisitDupePolicy`` is
        created for this call rather than sharing one instance built at
        definition time.
    :param user: if true, redis keys are templated as ``{user}:{coll}:...``;
        otherwise as ``{coll}:...``.
    :return: the configured ``WritableRedisIndexer``.
    """
    if dupe_policy is None:
        dupe_policy = WriteRevisitDupePolicy()

    # Both the cdxj key and the warc file key share the same prefix.
    key_prefix = '{user}:{coll}' if user else '{coll}'

    return WritableRedisIndexer(
        redis_url='redis://localhost/2/' + key_prefix + ':cdxj',
        file_key_template=key_prefix + ':warc',
        rel_path_template=to_path(self.root_dir + '/warcs/'),
        dupe_policy=dupe_policy)
def _test_warc_write(self, recorder_app, host, path, other_params='', link_url=''):
    """POST the canned request for ``host`` + ``path`` through ``recorder_app``.

    :param recorder_app: WSGI app wrapped in a ``webtest.TestApp``.
    :param host: value substituted for ``{host}`` and used to build the url.
    :param path: value substituted for ``{path}`` and appended to the url.
    :param other_params: extra query string appended verbatim to the request url.
    :param link_url: expected original url in the ``Link`` header; defaults
        to the unquoted request url when empty.
    :return: the webtest response, after its headers have been checked.
    """
    target_url = 'http://' + host + path
    request_path = '/live/resource/postreq?url=' + target_url + other_params

    client = webtest.TestApp(recorder_app)
    payload = general_req_data.format(host=host, path=path).encode('utf-8')
    resp = client.post(request_path, payload)

    # Process one queued item, if any, before checking the response.
    if not recorder_app.write_queue.empty():
        recorder_app._write_one()

    headers = resp.headers
    assert headers['WebAgg-Source-Coll'] == 'live'

    expected_link = link_url if link_url else unquote(target_url)
    assert headers['Link'] == MementoUtils.make_link(expected_link, 'original')
    assert headers['Memento-Datetime'] != ''

    return resp
def _test_all_warcs(self, dirname, num):
    """Check that ``dirname`` (relative to ``root_dir``) holds ``num`` WARC files.

    :param dirname: directory path appended to ``self.root_dir``.
    :param num: expected number of files, or ``None`` to assert that the
        directory does not exist.
    :return: ``(files, coll_dir)`` when ``num`` is not ``None``; otherwise ``None``.
    """
    coll_dir = to_path(self.root_dir + dirname)

    # Identity check: the directory should exist exactly when a count is given.
    expect_dir = num is not None
    assert os.path.isdir(coll_dir) == expect_dir
    if not expect_dir:
        return

    files = [x for x in os.listdir(coll_dir)
             if os.path.isfile(os.path.join(coll_dir, x))]
    assert len(files) == num
    assert all(x.endswith('.warc.gz') for x in files)

    self._verify_content_len(coll_dir, files)
    return files, coll_dir
def _load_resp_req(self, base_path):
    """Load the request and response records from the only WARC in ``base_path``.

    Asserts that the directory contains exactly one entry and that both a
    ``request`` and a ``response`` record were found. If several records of
    one type exist, the last one read is returned.

    :return: ``(stored_req, stored_resp)``
    """
    warc_names = os.listdir(base_path)
    assert len(warc_names) == 1

    found = {'response': None, 'request': None}
    with open(os.path.join(base_path, warc_names[0]), 'rb') as fh:
        for rec in ArchiveIterator(fh):
            if rec.rec_type in found:
                found[rec.rec_type] = rec

    assert found['response'] is not None
    assert found['request'] is not None
    return found['request'], found['response']
def _get_http_only_cookies(self, record):
    """Split the ``Set-Cookie`` headers of ``record`` by HttpOnly flag.

    A header counts as HttpOnly when ``ExcludeHttpOnlyCookieHeaders.HTTPONLY_RX``
    matches its lower-cased value.

    :return: ``(non_http_only, http_only)``: the last header seen of each
        kind, or ``None`` where no such header was found.
    """
    non_http_only = http_only = None

    for header in record.http_headers.headers:
        if header[0].lower() != 'set-cookie':
            continue

        if ExcludeHttpOnlyCookieHeaders.HTTPONLY_RX.search(header[1].lower()):
            http_only = header
        else:
            non_http_only = header

    return non_http_only, http_only
def _verify_content_len(self, base_dir, files):
    """Check each record's ``Content-Length`` against its actual raw length.

    Every file in ``files`` (relative to ``base_dir``) is iterated with
    ``no_record_parse=True``. For each record, the declared Content-Length,
    ``record.length`` and the number of bytes read from ``raw_stream`` must
    all agree.
    """
    for filename in files:
        full_path = os.path.join(base_dir, filename)
        with open(full_path, 'rb') as fh:
            for record in ArchiveIterator(fh, no_record_parse=True):
                # Identity check, so the header object's own __eq__ is
                # never invoked with None.
                assert record.http_headers is None

                declared_len = int(record.rec_headers.get_header('Content-Length'))
                assert declared_len == record.length
                assert record.length == len(record.raw_stream.read())
def test_record_warc_1(self):
    """Record one request with a default ``RecorderApp``; expect 1 WARC in ``/warcs/``."""
    writer = PerRecordWARCWriter(to_path(self.root_dir + '/warcs/'))
    recorder_app = RecorderApp(self.upstream_url, writer)

    resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar')
    body = resp.body
    assert b'HTTP/1.1 200 OK' in body
    assert b'"foo": "bar"' in body

    self._test_all_warcs('/warcs/', 1)
def test_record_warc_2(self):
    """Record again with ``accept_colls='live'``; expect 2 WARCs in ``/warcs/``."""
    writer = PerRecordWARCWriter(to_path(self.root_dir + '/warcs/'))
    recorder_app = RecorderApp(self.upstream_url, writer, accept_colls='live')

    resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar')
    body = resp.body
    assert b'HTTP/1.1 200 OK' in body
    assert b'"foo": "bar"' in body

    self._test_all_warcs('/warcs/', 2)
def test_error_url(self):
    """A request through a wrong upstream URL returns a 400 JSON error and writes nothing.

    The upstream URL is altered by appending ``'01'`` to it; the WARC count
    in ``/warcs/`` must stay at 2.
    """
    bad_upstream = self.upstream_url + '01'
    writer = PerRecordWARCWriter(to_path(self.root_dir + '/warcs/'))
    recorder_app = RecorderApp(bad_upstream, writer, accept_colls='live')

    client = webtest.TestApp(recorder_app)
    resp = client.get('/live/resource?url=http://example.com/', status=400)
    assert resp.json['error'] != ''

    self._test_all_warcs('/warcs/', 2)
def test_record_cookies_header(self):
    """Without a header filter, cookie headers appear both in the live response and in the stored WARC."""
    base_path = to_path(self.root_dir + '/warcs/cookiecheck/')
    writer = PerRecordWARCWriter(base_path)
    recorder_app = RecorderApp(self.upstream_url, writer, accept_colls='live')

    resp = self._test_warc_write(recorder_app, 'httpbin.org', '/cookies/set%3Fname%3Dvalue%26foo%3Dbar')
    assert b'HTTP/1.1 302' in resp.body

    set_cookies = [('Set-Cookie', 'name=value; Path=/'),
                   ('Set-Cookie', 'foo=bar; Path=/')]

    # Headers as returned to the client
    live_record = ArcWarcRecordLoader().parse_record_stream(BytesIO(resp.body))
    for cookie in set_cookies:
        assert cookie in live_record.http_headers.headers

    # Headers as written to the WARC
    stored_req, stored_resp = self._load_resp_req(base_path)
    for cookie in set_cookies:
        assert cookie in stored_resp.http_headers.headers

    assert ('X-Other', 'foo') in stored_req.http_headers.headers
    assert ('Cookie', 'boo=far') in stored_req.http_headers.headers

    self._test_all_warcs('/warcs/cookiecheck/', 1)
def test_record_skip_all_cookies_header(self):
    """With ``ExcludeSpecificHeaders(['Set-Cookie', 'Cookie'])``, cookies reach the client but are absent from the stored WARC."""
    warc_path = to_path(self.root_dir + '/warcs/cookieskip/')
    writer = PerRecordWARCWriter(
        warc_path,
        header_filter=ExcludeSpecificHeaders(['Set-Cookie', 'Cookie']))
    recorder_app = RecorderApp(self.upstream_url, writer, accept_colls='live')

    resp = self._test_warc_write(recorder_app, 'httpbin.org', '/cookies/set%3Fname%3Dvalue%26foo%3Dbar')
    assert b'HTTP/1.1 302' in resp.body

    set_cookies = [('Set-Cookie', 'name=value; Path=/'),
                   ('Set-Cookie', 'foo=bar; Path=/')]

    # The live response is unfiltered
    live_record = ArcWarcRecordLoader().parse_record_stream(BytesIO(resp.body))
    for cookie in set_cookies:
        assert cookie in live_record.http_headers.headers

    # The stored records have the cookie headers removed
    stored_req, stored_resp = self._load_resp_req(warc_path)
    for cookie in set_cookies:
        assert cookie not in stored_resp.http_headers.headers

    assert ('X-Other', 'foo') in stored_req.http_headers.headers
    assert ('Cookie', 'boo=far') not in stored_req.http_headers.headers

    self._test_all_warcs('/warcs/cookieskip/', 1)
def test_record_skip_http_only_cookies_header(self):
    """With ``ExcludeHttpOnlyCookieHeaders``, HttpOnly cookies are dropped from the stored response.

    The live response must carry both HttpOnly and non-HttpOnly
    ``Set-Cookie`` headers. The stored response must keep only the
    non-HttpOnly kind, and the stored request must have no ``Cookie`` header.
    """
    warc_path = to_path(self.root_dir + '/warcs/cookieskip_httponly/')
    header_filter = ExcludeHttpOnlyCookieHeaders()
    recorder_app = RecorderApp(self.upstream_url,
                               PerRecordWARCWriter(warc_path, header_filter=header_filter),
                               accept_colls='live')

    resp = self._test_warc_write(recorder_app, 'www.google.com', '/')
    assert b'HTTP/1.1 302' in resp.body

    buff = BytesIO(resp.body)
    record = ArcWarcRecordLoader().parse_record_stream(buff)
    non_http_only, http_only = self._get_http_only_cookies(record)

    # both httponly and other cookies
    assert http_only is not None
    assert non_http_only is not None

    stored_req, stored_resp = self._load_resp_req(warc_path)
    non_http_only, http_only = self._get_http_only_cookies(stored_resp)

    # no httponly cookies
    assert http_only is None
    assert non_http_only is not None

    assert ('X-Other', 'foo') in stored_req.http_headers.headers
    assert ('Cookie', 'boo=far') not in stored_req.http_headers.headers

    self._test_all_warcs('/warcs/cookieskip_httponly/', 1)
def test_record_skip_wrong_coll(self):
    """With ``accept_colls='not-live'`` the response is still served but no new WARC is written."""
    warc_writer = PerRecordWARCWriter(to_path(self.root_dir + '/warcs/'))
    recorder_app = RecorderApp(self.upstream_url,
                               writer=warc_writer,
                               accept_colls='not-live')

    resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar')
    body = resp.body
    assert b'HTTP/1.1 200 OK' in body
    assert b'"foo": "bar"' in body

    # Count unchanged from the earlier tests
    self._test_all_warcs('/warcs/', 2)
def test_record_param_user_coll(self):
    """Record into a ``{user}/{coll}`` templated path and check the redis cdxj and warc keys."""
    warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')
    dedup_index = self._get_dedup_index()
    writer = PerRecordWARCWriter(warc_path, dedup_index=dedup_index)
    recorder_app = RecorderApp(self.upstream_url, writer)

    # The collection directory must not exist before the first write
    self._test_all_warcs('/warcs/USER/COLL/', None)

    resp = self._test_warc_write(recorder_app, 'httpbin.org', '/user-agent',
                                 '&param.recorder.user=USER&param.recorder.coll=COLL')
    assert '"user-agent": "{0}"'.format(UA) in resp.text

    self._test_all_warcs('/warcs/USER/COLL/', 1)

    # Test Redis CDX
    redis = FakeStrictRedis.from_url('redis://localhost/2')
    entries = redis.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
    assert len(entries) == 1

    cdx = CDXObject(entries[0])
    assert cdx['urlkey'] == 'org,httpbin)/user-agent'
    assert cdx['mime'] == 'application/json'
    assert cdx['offset'] == '0'

    filename = cdx['filename']
    assert filename.startswith(to_path('USER/COLL/'))
    assert filename.endswith('.warc.gz')

    # The warc hash maps the relative filename to its full path
    warcs = redis.hgetall('USER:COLL:warc')
    full_path = to_path(self.root_dir + '/warcs/' + filename)
    assert warcs == {filename.encode('utf-8'): full_path.encode('utf-8')}
def test_record_param_user_coll_same_dir(self):
    """Two collections written with a ``{user}:{coll}`` key template share one directory, ``/warcs2``."""
    warc_path = to_path(self.root_dir + '/warcs2/')
    dedup_index = self._get_dedup_index()
    writer = PerRecordWARCWriter(warc_path,
                                 dedup_index=dedup_index,
                                 key_template='{user}:{coll}')
    recorder_app = RecorderApp(self.upstream_url, writer)

    # Same user and url, two different collections
    for coll in ('COLL2', 'COLL3'):
        resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar',
                                     '&param.recorder.user=USER2&param.recorder.coll=' + coll)
        assert b'HTTP/1.1 200 OK' in resp.body
        assert b'"foo": "bar"' in resp.body

    self._test_all_warcs('/warcs2', 2)
def test_record_param_user_coll_revisit(self):
    """Re-recording the same url under USER/COLL writes a ``revisit`` record.

    Expects the WARC count to go from 1 to 2, a ``warc/revisit`` cdx entry
    in redis, and refers-to headers in the new WARC.
    """
    warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')
    dedup_index = self._get_dedup_index()
    writer = PerRecordWARCWriter(warc_path, dedup_index=dedup_index)
    recorder_app = RecorderApp(self.upstream_url, writer)

    self._test_all_warcs('/warcs/USER/COLL/', 1)

    resp = self._test_warc_write(recorder_app, 'httpbin.org', '/user-agent',
                                 '&param.recorder.user=USER&param.recorder.coll=COLL')
    assert '"user-agent": "{0}"'.format(UA) in resp.text

    self._test_all_warcs('/warcs/USER/COLL/', 2)

    # Test Redis CDX
    redis = FakeStrictRedis.from_url('redis://localhost/2')
    entries = redis.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
    assert len(entries) == 2

    # Pick whichever of the two entries is the revisit
    revisit_line = entries[0] if b'warc/revisit' in entries[0] else entries[1]
    cdx = CDXObject(revisit_line)

    assert cdx['urlkey'] == 'org,httpbin)/user-agent'
    assert cdx['mime'] == 'warc/revisit'
    assert cdx['offset'] == '0'

    filename = cdx['filename']
    assert filename.startswith(to_path('USER/COLL/'))
    assert filename.endswith('.warc.gz')

    fullwarc = os.path.join(self.root_dir, 'warcs', filename)

    warcs = redis.hgetall('USER:COLL:warc')
    assert len(warcs) == 2
    assert warcs[filename.encode('utf-8')] == fullwarc.encode('utf-8')

    with open(fullwarc, 'rb') as fh:
        decomp = DecompressingBufferedReader(fh)

        # Test refers-to headers
        warc_headers = StatusAndHeadersParser(['WARC/1.0']).parse(decomp)
        assert warc_headers.get_header('WARC-Type') == 'revisit'
        assert warc_headers.get_header('WARC-Target-URI') == 'http://httpbin.org/user-agent'
        assert warc_headers.get_header('WARC-Date') != ''
        assert warc_headers.get_header('WARC-Refers-To-Target-URI') == 'http://httpbin.org/user-agent'
        assert warc_headers.get_header('WARC-Refers-To-Date') != ''
def test_record_param_user_coll_skip(self):
    """With ``SkipDupePolicy``, re-recording the same url adds neither a WARC nor a cdx entry."""
    warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')
    dedup_index = self._get_dedup_index(dupe_policy=SkipDupePolicy())
    writer = PerRecordWARCWriter(warc_path, dedup_index=dedup_index)
    recorder_app = RecorderApp(self.upstream_url, writer)

    # Count before the request
    self._test_all_warcs('/warcs/USER/COLL/', 2)

    resp = self._test_warc_write(recorder_app, 'httpbin.org', '/user-agent',
                                 '&param.recorder.user=USER&param.recorder.coll=COLL')
    assert '"user-agent": "{0}"'.format(UA) in resp.text

    # No new entries written
    self._test_all_warcs('/warcs/USER/COLL/', 2)

    # Test Redis CDX
    redis = FakeStrictRedis.from_url('redis://localhost/2')
    entries = redis.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
    assert len(entries) == 2
def test_record_param_user_coll_write_dupe_no_revisit(self):
    """With ``WriteDupePolicy`` a new record is written as a full response, not a revisit.

    Expects 3 WARCs and 3 cdx entries for USER/COLL (two
    ``application/json``, one ``warc/revisit``) and an empty writer
    file-handle cache afterwards.
    """
    warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')
    writer = PerRecordWARCWriter(
        warc_path,
        dedup_index=self._get_dedup_index(dupe_policy=WriteDupePolicy()))
    recorder_app = RecorderApp(self.upstream_url, writer)

    resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar',
                                 '&param.recorder.user=USER&param.recorder.coll=COLL')
    body = resp.body
    assert b'HTTP/1.1 200 OK' in body
    assert b'"foo": "bar"' in body

    self._test_all_warcs('/warcs/USER/COLL/', 3)

    redis = FakeStrictRedis.from_url('redis://localhost/2')
    entries = redis.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
    assert len(entries) == 3

    mimes = sorted(CDXObject(line)['mime'] for line in entries)
    assert mimes == ['application/json', 'application/json', 'warc/revisit']

    assert len(writer.fh_cache) == 0
# Keep Open
def test_record_file_warc_keep_open(self):
    """A ``MultiFileWARCWriter`` keeps its file handle cached until ``close()`` is called."""
    warc_file = to_path(self.root_dir + '/warcs/A.warc.gz')
    writer = MultiFileWARCWriter(warc_file)
    recorder_app = RecorderApp(self.upstream_url, writer)

    resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar')
    body = resp.body
    assert b'HTTP/1.1 200 OK' in body
    assert b'"foo": "bar"' in body

    assert os.path.isfile(warc_file)

    # One handle cached after the write, none after closing
    assert len(writer.fh_cache) == 1
    writer.close()
    assert len(writer.fh_cache) == 0
def test_record_multiple_writes_keep_open(self):
    """Two writes to collection FOO go into one open WARC; after closing, a third write opens a new file.

    Also checks that the cdx entries stored in redis equal the index
    generated by ``write_cdx_index`` for the WARC file.
    """
    warc_path = to_path(self.root_dir + '/warcs/FOO/ABC-{hostname}-{timestamp}.warc.gz')
    rel_path = to_path(self.root_dir + '/warcs/')

    dedup_index = self._get_dedup_index(user=False)

    writer = MultiFileWARCWriter(warc_path, dedup_index=dedup_index)
    recorder_app = RecorderApp(self.upstream_url, writer)

    # First Record
    resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                 '/get?foo=bar', '&param.recorder.coll=FOO')
    assert b'HTTP/1.1 200 OK' in resp.body
    assert b'"foo": "bar"' in resp.body

    # Second Record
    resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                 '/get?boo=far', '&param.recorder.coll=FOO')
    assert b'HTTP/1.1 200 OK' in resp.body
    assert b'"boo": "far"' in resp.body

    self._test_all_warcs('/warcs/FOO/', 1)

    # Check two records in WARC
    r = FakeStrictRedis.from_url('redis://localhost/2')
    res = r.zrangebylex('FOO:cdxj', '[org,httpbin)/', '(org,httpbin,')
    assert len(res) == 2

    files, coll_dir = self._test_all_warcs('/warcs/FOO/', 1)

    # Join instead of concatenating, so the result does not depend on
    # coll_dir ending with a path separator.
    fullname = os.path.join(coll_dir, files[0])

    cdxout = BytesIO()
    with open(fullname, 'rb') as fh:
        filename = os.path.relpath(fullname, rel_path)
        write_cdx_index(cdxout, fh, filename,
                        cdxj=True, append_post=True, sort=True)

    res = [CDXObject(x) for x in res]

    cdxres = cdxout.getvalue().strip()
    cdxres = cdxres.split(b'\n')
    cdxres = [CDXObject(x) for x in cdxres]

    assert cdxres == res

    assert len(writer.fh_cache) == 1

    writer.close_key(to_path(self.root_dir + '/warcs/FOO/'))

    assert len(writer.fh_cache) == 0

    writer.close()

    # A write after the close must go to a second WARC file
    resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                 '/get?boo=far', '&param.recorder.coll=FOO')

    self._test_all_warcs('/warcs/FOO/', 2)

    warcs = r.hgetall('FOO:warc')
    assert len(warcs) == 2

    writer.close()
    assert len(writer.fh_cache) == 0
def test_record_multiple_writes_rollover_idle(self):
    """After the idle timeout and ``close_idle_files()``, the next write goes to a new WARC.

    The writer uses ``max_idle_secs=0.9`` and the test sleeps 1.0s between
    the second and third record; the file count for GOO goes from 1 to 2.
    """
    warc_path = to_path(self.root_dir + '/warcs/GOO/ABC-{hostname}-{timestamp}.warc.gz')

    dedup_index = self._get_dedup_index(user=False)

    writer = MultiFileWARCWriter(warc_path, dedup_index=dedup_index, max_idle_secs=0.9)
    recorder_app = RecorderApp(self.upstream_url, writer)

    # First Record
    resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                 '/get?foo=bar', '&param.recorder.coll=GOO')
    assert b'HTTP/1.1 200 OK' in resp.body
    assert b'"foo": "bar"' in resp.body

    # Second Record
    resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                 '/get?boo=far', '&param.recorder.coll=GOO')
    assert b'HTTP/1.1 200 OK' in resp.body
    assert b'"boo": "far"' in resp.body

    self._test_all_warcs('/warcs/GOO/', 1)

    # Wait longer than max_idle_secs, then close idle files
    time.sleep(1.0)
    writer.close_idle_files()

    # Third Record
    resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                 '/get?goo=bar', '&param.recorder.coll=GOO')
    assert b'HTTP/1.1 200 OK' in resp.body
    assert b'"goo": "bar"' in resp.body

    self._test_all_warcs('/warcs/GOO/', 2)

    writer.close()
    assert len(writer.fh_cache) == 0
def test_record_custom_record(self):
    """PUT a custom ``resource`` record and check the WARC and HTTP headers written to disk."""
    dedup_index = self._get_dedup_index(user=False)
    warc_path = to_path(self.root_dir + '/warcs/meta/meta.warc.gz')
    writer = MultiFileWARCWriter(warc_path, dedup_index=dedup_index)
    recorder_app = RecorderApp(self.upstream_url, writer)

    req_url = '/live/resource/postreq?url=custom://httpbin.org&param.recorder.coll=META&put_record=resource'
    payload = b'Some Data'
    payload_len = str(len(payload))

    client = webtest.TestApp(recorder_app)
    put_headers = {'content-type': 'text/plain',
                   'WARC-Custom': 'foo'}
    resp = client.put(req_url, headers=put_headers, params=payload)

    assert resp.json['success'] == 'true'
    assert resp.json['WARC-Date'] != ''

    self._test_all_warcs('/warcs/meta', 1)

    redis = FakeStrictRedis.from_url('redis://localhost/2')
    warcs = redis.hgetall('META:warc')
    assert len(warcs) == 1

    warc_key = os.path.join('meta', 'meta.warc.gz').encode('utf-8')
    with open(warcs[warc_key], 'rb') as fh:
        decomp = DecompressingBufferedReader(fh)
        record = ArcWarcRecordLoader().parse_record_stream(decomp, ensure_http_headers=True)

        # WARC-level headers
        rec_headers = record.rec_headers
        assert len(rec_headers.headers) == 9
        assert rec_headers.get_header('WARC-Type') == 'resource'
        assert rec_headers.get_header('WARC-Target-URI') == 'custom://httpbin.org'
        assert rec_headers.get_header('WARC-Record-ID') != ''
        assert rec_headers.get_header('WARC-Date') != ''
        assert rec_headers.get_header('WARC-Block-Digest') != ''
        assert rec_headers.get_header('WARC-Block-Digest') == rec_headers.get_header('WARC-Payload-Digest')
        assert rec_headers.get_header('Content-Type') == 'text/plain'
        assert rec_headers.get_header('Content-Length') == payload_len
        assert rec_headers.get_header('WARC-Custom') == 'foo'

        assert record.raw_stream.read() == payload

        # HTTP headers requested via ensure_http_headers=True
        http_headers = record.http_headers
        assert len(http_headers.headers) == 2
        assert http_headers.get_header('Content-Type') == 'text/plain'
        assert http_headers.get_header('Content-Length') == payload_len

    writer.close()
    assert len(writer.fh_cache) == 0
def test_record_video_metadata(self):
    """Request with a youtube-dl formats ``content_type`` and expect a ``metadata`` WARC record."""
    warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')
    dedup_index = self._get_dedup_index()
    writer = PerRecordWARCWriter(warc_path, dedup_index=dedup_index)
    recorder_app = RecorderApp(self.upstream_url, writer)

    params = {'param.recorder.user': 'USER',
              'param.recorder.coll': 'VIDEO',
              'content_type': 'application/vnd.youtube-dl_formats+json'}
    query = '&' + urlencode(params)

    resp = self._test_warc_write(recorder_app,
                                 'www.youtube.com', '/v/BfBgWtAIbRc', query,
                                 link_url='metadata://www.youtube.com/v/BfBgWtAIbRc')

    redis = FakeStrictRedis.from_url('redis://localhost/2')
    warcs = redis.hgetall('USER:VIDEO:warc')
    assert len(warcs) == 1

    # The single hash value is the path of the WARC that was written
    warc_file = list(warcs.values())[0]
    with open(warc_file, 'rb') as fh:
        decomp = DecompressingBufferedReader(fh)
        record = ArcWarcRecordLoader().parse_record_stream(decomp)

        rec_headers = record.rec_headers
        assert rec_headers.get_header('WARC-Type') == 'metadata'
        assert rec_headers.get_header('Content-Type') == 'application/vnd.youtube-dl_formats+json'
        assert rec_headers.get_header('WARC-Block-Digest') != ''
        assert rec_headers.get_header('WARC-Block-Digest') == rec_headers.get_header('WARC-Payload-Digest')