1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-26 07:49:24 +01:00

rename WARCRecorder -> WARCWriter, add optional max_size to single warc recorder

per-record recorder combines http response/req into single file
This commit is contained in:
Ilya Kreymer 2016-03-18 19:49:14 -07:00
parent b64be0dff1
commit e81457df5f
3 changed files with 111 additions and 88 deletions

View File

@ -6,9 +6,6 @@ from pywb.utils.statusandheaders import StatusAndHeadersParser
from pywb.warc.recordloader import ArcWarcRecord from pywb.warc.recordloader import ArcWarcRecord
from pywb.warc.recordloader import ArcWarcRecordLoader from pywb.warc.recordloader import ArcWarcRecordLoader
from recorder.warcrecorder import SingleFileWARCRecorder, PerRecordWARCRecorder
from recorder.redisindexer import WritableRedisIndexer
from six.moves.urllib.parse import parse_qsl from six.moves.urllib.parse import parse_qsl
import json import json

View File

@ -12,7 +12,7 @@ from pytest import raises
from recorder.recorderapp import RecorderApp from recorder.recorderapp import RecorderApp
from recorder.redisindexer import WritableRedisIndexer from recorder.redisindexer import WritableRedisIndexer
from recorder.warcrecorder import PerRecordWARCRecorder, SingleFileWARCRecorder from recorder.warcwriter import PerRecordWARCWriter, SingleFileWARCWriter
from recorder.filters import ExcludeSpecificHeaders, SkipDupePolicy, WriteDupePolicy from recorder.filters import ExcludeSpecificHeaders, SkipDupePolicy, WriteDupePolicy
from webagg.utils import MementoUtils from webagg.utils import MementoUtils
@ -70,10 +70,21 @@ class TestRecorder(LiveServerTests, TempDirTests, BaseTestClass):
files = [x for x in os.listdir(coll_dir) if os.path.isfile(os.path.join(coll_dir, x))] files = [x for x in os.listdir(coll_dir) if os.path.isfile(os.path.join(coll_dir, x))]
assert len(files) == num assert len(files) == num
assert all(x.endswith('.warc.gz') for x in files) assert all(x.endswith('.warc.gz') for x in files)
return files, coll_dir
def test_record_warc_1(self): def test_record_warc_1(self):
recorder_app = RecorderApp(self.upstream_url, recorder_app = RecorderApp(self.upstream_url,
PerRecordWARCRecorder(to_path(self.root_dir + '/warcs/'))) PerRecordWARCWriter(to_path(self.root_dir + '/warcs/')))
resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar')
assert b'HTTP/1.1 200 OK' in resp.body
assert b'"foo": "bar"' in resp.body
self._test_all_warcs('/warcs/', 1)
def test_record_warc_2(self):
recorder_app = RecorderApp(self.upstream_url,
PerRecordWARCWriter(to_path(self.root_dir + '/warcs/')), accept_colls='live')
resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar') resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar')
assert b'HTTP/1.1 200 OK' in resp.body assert b'HTTP/1.1 200 OK' in resp.body
@ -81,19 +92,9 @@ class TestRecorder(LiveServerTests, TempDirTests, BaseTestClass):
self._test_all_warcs('/warcs/', 2) self._test_all_warcs('/warcs/', 2)
def test_record_warc_2(self):
recorder_app = RecorderApp(self.upstream_url,
PerRecordWARCRecorder(to_path(self.root_dir + '/warcs/')), accept_colls='live')
resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar')
assert b'HTTP/1.1 200 OK' in resp.body
assert b'"foo": "bar"' in resp.body
self._test_all_warcs('/warcs/', 4)
def test_error_url(self): def test_error_url(self):
recorder_app = RecorderApp(self.upstream_url + '01', recorder_app = RecorderApp(self.upstream_url + '01',
PerRecordWARCRecorder(to_path(self.root_dir + '/warcs/')), accept_colls='live') PerRecordWARCWriter(to_path(self.root_dir + '/warcs/')), accept_colls='live')
testapp = webtest.TestApp(recorder_app) testapp = webtest.TestApp(recorder_app)
@ -101,12 +102,12 @@ class TestRecorder(LiveServerTests, TempDirTests, BaseTestClass):
assert resp.json['error'] != '' assert resp.json['error'] != ''
self._test_all_warcs('/warcs/', 4) self._test_all_warcs('/warcs/', 2)
def test_record_cookies_header(self): def test_record_cookies_header(self):
base_path = to_path(self.root_dir + '/warcs/cookiecheck/') base_path = to_path(self.root_dir + '/warcs/cookiecheck/')
recorder_app = RecorderApp(self.upstream_url, recorder_app = RecorderApp(self.upstream_url,
PerRecordWARCRecorder(base_path), accept_colls='live') PerRecordWARCWriter(base_path), accept_colls='live')
resp = self._test_warc_write(recorder_app, 'httpbin.org', '/cookies/set%3Fname%3Dvalue%26foo%3Dbar') resp = self._test_warc_write(recorder_app, 'httpbin.org', '/cookies/set%3Fname%3Dvalue%26foo%3Dbar')
assert b'HTTP/1.1 302' in resp.body assert b'HTTP/1.1 302' in resp.body
@ -134,7 +135,7 @@ class TestRecorder(LiveServerTests, TempDirTests, BaseTestClass):
base_path = to_path(self.root_dir + '/warcs/cookieskip/') base_path = to_path(self.root_dir + '/warcs/cookieskip/')
header_filter = ExcludeSpecificHeaders(['Set-Cookie', 'Cookie']) header_filter = ExcludeSpecificHeaders(['Set-Cookie', 'Cookie'])
recorder_app = RecorderApp(self.upstream_url, recorder_app = RecorderApp(self.upstream_url,
PerRecordWARCRecorder(base_path, header_filter=header_filter), PerRecordWARCWriter(base_path, header_filter=header_filter),
accept_colls='live') accept_colls='live')
resp = self._test_warc_write(recorder_app, 'httpbin.org', '/cookies/set%3Fname%3Dvalue%26foo%3Dbar') resp = self._test_warc_write(recorder_app, 'httpbin.org', '/cookies/set%3Fname%3Dvalue%26foo%3Dbar')
@ -162,13 +163,13 @@ class TestRecorder(LiveServerTests, TempDirTests, BaseTestClass):
def test_record_skip_wrong_coll(self): def test_record_skip_wrong_coll(self):
recorder_app = RecorderApp(self.upstream_url, recorder_app = RecorderApp(self.upstream_url,
writer=PerRecordWARCRecorder(to_path(self.root_dir + '/warcs/')), accept_colls='not-live') writer=PerRecordWARCWriter(to_path(self.root_dir + '/warcs/')), accept_colls='not-live')
resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar') resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar')
assert b'HTTP/1.1 200 OK' in resp.body assert b'HTTP/1.1 200 OK' in resp.body
assert b'"foo": "bar"' in resp.body assert b'"foo": "bar"' in resp.body
self._test_all_warcs('/warcs/', 4) self._test_all_warcs('/warcs/', 2)
@patch('redis.StrictRedis', FakeStrictRedis) @patch('redis.StrictRedis', FakeStrictRedis)
def test_record_param_user_coll(self): def test_record_param_user_coll(self):
@ -180,16 +181,16 @@ class TestRecorder(LiveServerTests, TempDirTests, BaseTestClass):
rel_path_template=self.root_dir + '/warcs/') rel_path_template=self.root_dir + '/warcs/')
recorder_app = RecorderApp(self.upstream_url, recorder_app = RecorderApp(self.upstream_url,
PerRecordWARCRecorder(warc_path, dedup_index=dedup_index)) PerRecordWARCWriter(warc_path, dedup_index=dedup_index))
self._test_all_warcs('/warcs/', 4) self._test_all_warcs('/warcs/', 2)
resp = self._test_warc_write(recorder_app, 'httpbin.org', resp = self._test_warc_write(recorder_app, 'httpbin.org',
'/get?foo=bar', '&param.recorder.user=USER&param.recorder.coll=COLL') '/get?foo=bar', '&param.recorder.user=USER&param.recorder.coll=COLL')
assert b'HTTP/1.1 200 OK' in resp.body assert b'HTTP/1.1 200 OK' in resp.body
assert b'"foo": "bar"' in resp.body assert b'"foo": "bar"' in resp.body
self._test_all_warcs('/warcs/USER/COLL/', 2) self._test_all_warcs('/warcs/USER/COLL/', 1)
r = FakeStrictRedis.from_url('redis://localhost/2') r = FakeStrictRedis.from_url('redis://localhost/2')
@ -213,16 +214,16 @@ class TestRecorder(LiveServerTests, TempDirTests, BaseTestClass):
rel_path_template=self.root_dir + '/warcs/') rel_path_template=self.root_dir + '/warcs/')
recorder_app = RecorderApp(self.upstream_url, recorder_app = RecorderApp(self.upstream_url,
PerRecordWARCRecorder(warc_path, dedup_index=dedup_index)) PerRecordWARCWriter(warc_path, dedup_index=dedup_index))
self._test_all_warcs('/warcs/', 4) self._test_all_warcs('/warcs/', 2)
resp = self._test_warc_write(recorder_app, 'httpbin.org', resp = self._test_warc_write(recorder_app, 'httpbin.org',
'/get?foo=bar', '&param.recorder.user=USER&param.recorder.coll=COLL') '/get?foo=bar', '&param.recorder.user=USER&param.recorder.coll=COLL')
assert b'HTTP/1.1 200 OK' in resp.body assert b'HTTP/1.1 200 OK' in resp.body
assert b'"foo": "bar"' in resp.body assert b'"foo": "bar"' in resp.body
self._test_all_warcs('/warcs/USER/COLL/', 4) self._test_all_warcs('/warcs/USER/COLL/', 2)
# Test Redis CDX # Test Redis CDX
r = FakeStrictRedis.from_url('redis://localhost/2') r = FakeStrictRedis.from_url('redis://localhost/2')
@ -259,17 +260,17 @@ class TestRecorder(LiveServerTests, TempDirTests, BaseTestClass):
dupe_policy=SkipDupePolicy()) dupe_policy=SkipDupePolicy())
recorder_app = RecorderApp(self.upstream_url, recorder_app = RecorderApp(self.upstream_url,
PerRecordWARCRecorder(warc_path, dedup_index=dedup_index)) PerRecordWARCWriter(warc_path, dedup_index=dedup_index))
# No new entries written # No new entries written
self._test_all_warcs('/warcs/', 4) self._test_all_warcs('/warcs/', 2)
resp = self._test_warc_write(recorder_app, 'httpbin.org', resp = self._test_warc_write(recorder_app, 'httpbin.org',
'/get?foo=bar', '&param.recorder.user=USER&param.recorder.coll=COLL') '/get?foo=bar', '&param.recorder.user=USER&param.recorder.coll=COLL')
assert b'HTTP/1.1 200 OK' in resp.body assert b'HTTP/1.1 200 OK' in resp.body
assert b'"foo": "bar"' in resp.body assert b'"foo": "bar"' in resp.body
self._test_all_warcs('/warcs/USER/COLL/', 4) self._test_all_warcs('/warcs/USER/COLL/', 2)
# Test Redis CDX # Test Redis CDX
r = FakeStrictRedis.from_url('redis://localhost/2') r = FakeStrictRedis.from_url('redis://localhost/2')
@ -288,14 +289,14 @@ class TestRecorder(LiveServerTests, TempDirTests, BaseTestClass):
dupe_policy=WriteDupePolicy()) dupe_policy=WriteDupePolicy())
recorder_app = RecorderApp(self.upstream_url, recorder_app = RecorderApp(self.upstream_url,
PerRecordWARCRecorder(warc_path, dedup_index=dedup_index)) PerRecordWARCWriter(warc_path, dedup_index=dedup_index))
resp = self._test_warc_write(recorder_app, 'httpbin.org', resp = self._test_warc_write(recorder_app, 'httpbin.org',
'/get?foo=bar', '&param.recorder.user=USER&param.recorder.coll=COLL') '/get?foo=bar', '&param.recorder.user=USER&param.recorder.coll=COLL')
assert b'HTTP/1.1 200 OK' in resp.body assert b'HTTP/1.1 200 OK' in resp.body
assert b'"foo": "bar"' in resp.body assert b'"foo": "bar"' in resp.body
self._test_all_warcs('/warcs/USER/COLL/', 6) self._test_all_warcs('/warcs/USER/COLL/', 3)
r = FakeStrictRedis.from_url('redis://localhost/2') r = FakeStrictRedis.from_url('redis://localhost/2')
@ -310,7 +311,7 @@ class TestRecorder(LiveServerTests, TempDirTests, BaseTestClass):
def test_record_single_file_warc_1(self): def test_record_single_file_warc_1(self):
path = to_path(self.root_dir + '/warcs/A.warc.gz') path = to_path(self.root_dir + '/warcs/A.warc.gz')
recorder_app = RecorderApp(self.upstream_url, recorder_app = RecorderApp(self.upstream_url,
SingleFileWARCRecorder(path)) SingleFileWARCWriter(path))
resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar') resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar')
assert b'HTTP/1.1 200 OK' in resp.body assert b'HTTP/1.1 200 OK' in resp.body
@ -321,14 +322,14 @@ class TestRecorder(LiveServerTests, TempDirTests, BaseTestClass):
@patch('redis.StrictRedis', FakeStrictRedis) @patch('redis.StrictRedis', FakeStrictRedis)
def test_record_single_file_multiple_writes(self): def test_record_single_file_multiple_writes(self):
warc_path = to_path(self.root_dir + '/warcs/FOO/rec-test.warc.gz') warc_path = to_path(self.root_dir + '/warcs/FOO/rec-{hostname}-{timestamp}.warc.gz')
rel_path = self.root_dir + '/warcs/' rel_path = self.root_dir + '/warcs/'
dedup_index = WritableRedisIndexer('redis://localhost/2/{coll}:cdxj', dedup_index = WritableRedisIndexer('redis://localhost/2/{coll}:cdxj',
rel_path_template=rel_path) rel_path_template=rel_path)
writer = SingleFileWARCRecorder(warc_path, dedup_index=dedup_index) writer = SingleFileWARCWriter(warc_path, dedup_index=dedup_index)
recorder_app = RecorderApp(self.upstream_url, writer) recorder_app = RecorderApp(self.upstream_url, writer)
# First Record # First Record
@ -352,11 +353,12 @@ class TestRecorder(LiveServerTests, TempDirTests, BaseTestClass):
res = r.zrangebylex('FOO:cdxj', '[org,httpbin)/', '(org,httpbin,') res = r.zrangebylex('FOO:cdxj', '[org,httpbin)/', '(org,httpbin,')
assert len(res) == 2 assert len(res) == 2
assert os.path.isfile(warc_path) files, coll_dir = self._test_all_warcs('/warcs/FOO/', 1)
fullname = coll_dir + files[0]
cdxout = BytesIO() cdxout = BytesIO()
with open(warc_path, 'rb') as fh: with open(fullname, 'rb') as fh:
filename = os.path.relpath(warc_path, rel_path) filename = os.path.relpath(fullname, rel_path)
write_cdx_index(cdxout, fh, filename, write_cdx_index(cdxout, fh, filename,
cdxj=True, append_post=True, sort=True) cdxj=True, append_post=True, sort=True)
@ -368,10 +370,10 @@ class TestRecorder(LiveServerTests, TempDirTests, BaseTestClass):
assert cdxres == res assert cdxres == res
# close this file
writer.close() writer.close()
with raises(OSError):
resp = self._test_warc_write(recorder_app, 'httpbin.org', resp = self._test_warc_write(recorder_app, 'httpbin.org',
'/get?boo=far', '&param.recorder.coll=FOO') '/get?boo=far', '&param.recorder.coll=FOO')
self._test_all_warcs('/warcs/FOO/', 2)

View File

@ -27,7 +27,7 @@ from recorder.filters import ExcludeNone
# ============================================================================ # ============================================================================
class BaseWARCRecorder(object): class BaseWARCWriter(object):
WARC_RECORDS = {'warcinfo': 'application/warc-fields', WARC_RECORDS = {'warcinfo': 'application/warc-fields',
'response': 'application/http; msgtype=response', 'response': 'application/http; msgtype=response',
'revisit': 'application/http; msgtype=response', 'revisit': 'application/http; msgtype=response',
@ -45,6 +45,7 @@ class BaseWARCRecorder(object):
self.dedup_index = dedup_index self.dedup_index = dedup_index
self.rec_source_name = name self.rec_source_name = name
self.header_filter = header_filter self.header_filter = header_filter
self.hostname = gethostname()
def ensure_digest(self, record): def ensure_digest(self, record):
block_digest = record.rec_headers.get('WARC-Block-Digest') block_digest = record.rec_headers.get('WARC-Block-Digest')
@ -135,7 +136,7 @@ class BaseWARCRecorder(object):
def _write_warc_record(self, out, record): def _write_warc_record(self, out, record):
if self.gzip: if self.gzip:
out = GzippingWriter(out) out = GzippingWrapper(out)
self._line(out, b'WARC/1.0') self._line(out, b'WARC/1.0')
@ -196,7 +197,7 @@ class BaseWARCRecorder(object):
# ============================================================================ # ============================================================================
class GzippingWriter(object): class GzippingWrapper(object):
def __init__(self, out): def __init__(self, out):
self.compressor = zlib.compressobj(9, zlib.DEFLATED, zlib.MAX_WBITS + 16) self.compressor = zlib.compressobj(9, zlib.DEFLATED, zlib.MAX_WBITS + 16)
self.out = out self.out = out
@ -227,11 +228,63 @@ class Digester(object):
# ============================================================================ # ============================================================================
class SingleFileWARCRecorder(BaseWARCRecorder): class PerRecordWARCWriter(BaseWARCWriter):
def __init__(self, filename, *args, **kwargs): DEF_TEMPLATE = 'rec-{timestamp}-{hostname}.warc.gz'
super(SingleFileWARCRecorder, self).__init__(*args, **kwargs)
self.filename = filename.format(timestamp=timestamp20_now(), def __init__(self, warcdir, filename_template=None, *args, **kwargs):
host=gethostname()) super(PerRecordWARCWriter, self).__init__(*args, **kwargs)
if not filename_template:
filename_template = self.DEF_TEMPLATE
self.filename_template = warcdir + filename_template
def _do_write_req_resp(self, req, resp, params, formatter):
#resp_uuid = resp.rec_headers['WARC-Record-ID'].split(':')[-1].strip('<> ')
#req_uuid = req.rec_headers['WARC-Record-ID'].split(':')[-1].strip('<> ')
timestamp = timestamp20_now()
filename = formatter.format(self.filename_template,
hostname=self.hostname,
timestamp=timestamp)
path, name = os.path.split(filename)
try:
os.makedirs(path)
except:
pass
url = resp.rec_headers.get('WARC-Target-Uri')
print('Writing resp/req for {0} to {1}'.format(url, filename))
with open(filename, 'a+b') as out:
start = out.tell()
self._write_warc_record(out, resp)
self._write_warc_record(out, req)
out.flush()
out.seek(start)
if self.dedup_index:
self.dedup_index.index_records(out, params, filename=filename)
# ============================================================================
class SingleFileWARCWriter(BaseWARCWriter):
def __init__(self, filename_template, dir_prefix='', max_size=0, *args, **kwargs):
super(SingleFileWARCWriter, self).__init__(*args, **kwargs)
self.dir_prefix = dir_prefix
self.filename_template = filename_template
self.max_size = max_size
self._open_file()
def _open_file(self):
timestamp = timestamp20_now()
filename = self.filename_template.format(hostname=self.hostname,
timestamp=timestamp)
self.filename = self.dir_prefix + filename
try: try:
os.makedirs(os.path.dirname(self.filename)) os.makedirs(os.path.dirname(self.filename))
@ -246,9 +299,10 @@ class SingleFileWARCRecorder(BaseWARCRecorder):
url = resp.rec_headers.get('WARC-Target-Uri') url = resp.rec_headers.get('WARC-Target-Uri')
print('Writing {0} to {1} '.format(url, self.filename)) print('Writing {0} to {1} '.format(url, self.filename))
if not self._fh:
self._open_file()
out = self._fh out = self._fh
if not out:
raise IOError('Already closed')
start = out.tell() start = out.tell()
@ -256,11 +310,18 @@ class SingleFileWARCRecorder(BaseWARCRecorder):
self._write_warc_record(out, req) self._write_warc_record(out, req)
out.flush() out.flush()
new_size = out.tell()
out.seek(start) out.seek(start)
if self.dedup_index: if self.dedup_index:
self.dedup_index.index_records(out, params, filename=self.filename) self.dedup_index.index_records(out, params, filename=self.filename)
# check for rollover
if self.max_size and new_size > self.max_size:
self.close()
def close(self): def close(self):
if not self._fh: if not self._fh:
return None return None
@ -269,40 +330,3 @@ class SingleFileWARCRecorder(BaseWARCRecorder):
self._fh.close() self._fh.close()
self._fh = None self._fh = None
# ============================================================================
class PerRecordWARCRecorder(BaseWARCRecorder):
def __init__(self, warcdir, *args, **kwargs):
super(PerRecordWARCRecorder, self).__init__(*args, **kwargs)
self.warcdir = warcdir
def _do_write_req_resp(self, req, resp, params, formatter):
resp_uuid = resp.rec_headers['WARC-Record-ID'].split(':')[-1].strip('<> ')
req_uuid = req.rec_headers['WARC-Record-ID'].split(':')[-1].strip('<> ')
full_dir = formatter.format(self.warcdir)
try:
os.makedirs(full_dir)
except:
pass
resp_filename = os.path.join(full_dir, resp_uuid + '.warc.gz')
req_filename = os.path.join(full_dir, req_uuid + '.warc.gz')
url = resp.rec_headers.get('WARC-Target-Uri')
print('Writing request for {0} to {1}'.format(url, req_filename))
print('Writing response for {0} to {1}'.format(url, resp_filename))
self._write_and_index(resp_filename, resp, params, True)
self._write_and_index(req_filename, req, params, False)
def _write_and_index(self, filename, rec, params, index=False):
with open(filename, 'w+b') as out:
self._write_warc_record(out, rec)
if index and self.dedup_index:
out.seek(0)
self.dedup_index.index_records(out, params, filename=filename)