mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-26 07:49:24 +01:00
rename WARCRecorder -> WARCWriter, add optional max_size to single warc recorder
per-record recorder combines http response/req into single file
This commit is contained in:
parent
b64be0dff1
commit
e81457df5f
@ -6,9 +6,6 @@ from pywb.utils.statusandheaders import StatusAndHeadersParser
|
|||||||
from pywb.warc.recordloader import ArcWarcRecord
|
from pywb.warc.recordloader import ArcWarcRecord
|
||||||
from pywb.warc.recordloader import ArcWarcRecordLoader
|
from pywb.warc.recordloader import ArcWarcRecordLoader
|
||||||
|
|
||||||
from recorder.warcrecorder import SingleFileWARCRecorder, PerRecordWARCRecorder
|
|
||||||
from recorder.redisindexer import WritableRedisIndexer
|
|
||||||
|
|
||||||
from six.moves.urllib.parse import parse_qsl
|
from six.moves.urllib.parse import parse_qsl
|
||||||
|
|
||||||
import json
|
import json
|
||||||
|
@ -12,7 +12,7 @@ from pytest import raises
|
|||||||
|
|
||||||
from recorder.recorderapp import RecorderApp
|
from recorder.recorderapp import RecorderApp
|
||||||
from recorder.redisindexer import WritableRedisIndexer
|
from recorder.redisindexer import WritableRedisIndexer
|
||||||
from recorder.warcrecorder import PerRecordWARCRecorder, SingleFileWARCRecorder
|
from recorder.warcwriter import PerRecordWARCWriter, SingleFileWARCWriter
|
||||||
from recorder.filters import ExcludeSpecificHeaders, SkipDupePolicy, WriteDupePolicy
|
from recorder.filters import ExcludeSpecificHeaders, SkipDupePolicy, WriteDupePolicy
|
||||||
|
|
||||||
from webagg.utils import MementoUtils
|
from webagg.utils import MementoUtils
|
||||||
@ -70,10 +70,21 @@ class TestRecorder(LiveServerTests, TempDirTests, BaseTestClass):
|
|||||||
files = [x for x in os.listdir(coll_dir) if os.path.isfile(os.path.join(coll_dir, x))]
|
files = [x for x in os.listdir(coll_dir) if os.path.isfile(os.path.join(coll_dir, x))]
|
||||||
assert len(files) == num
|
assert len(files) == num
|
||||||
assert all(x.endswith('.warc.gz') for x in files)
|
assert all(x.endswith('.warc.gz') for x in files)
|
||||||
|
return files, coll_dir
|
||||||
|
|
||||||
def test_record_warc_1(self):
|
def test_record_warc_1(self):
|
||||||
recorder_app = RecorderApp(self.upstream_url,
|
recorder_app = RecorderApp(self.upstream_url,
|
||||||
PerRecordWARCRecorder(to_path(self.root_dir + '/warcs/')))
|
PerRecordWARCWriter(to_path(self.root_dir + '/warcs/')))
|
||||||
|
|
||||||
|
resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar')
|
||||||
|
assert b'HTTP/1.1 200 OK' in resp.body
|
||||||
|
assert b'"foo": "bar"' in resp.body
|
||||||
|
|
||||||
|
self._test_all_warcs('/warcs/', 1)
|
||||||
|
|
||||||
|
def test_record_warc_2(self):
|
||||||
|
recorder_app = RecorderApp(self.upstream_url,
|
||||||
|
PerRecordWARCWriter(to_path(self.root_dir + '/warcs/')), accept_colls='live')
|
||||||
|
|
||||||
resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar')
|
resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar')
|
||||||
assert b'HTTP/1.1 200 OK' in resp.body
|
assert b'HTTP/1.1 200 OK' in resp.body
|
||||||
@ -81,19 +92,9 @@ class TestRecorder(LiveServerTests, TempDirTests, BaseTestClass):
|
|||||||
|
|
||||||
self._test_all_warcs('/warcs/', 2)
|
self._test_all_warcs('/warcs/', 2)
|
||||||
|
|
||||||
def test_record_warc_2(self):
|
|
||||||
recorder_app = RecorderApp(self.upstream_url,
|
|
||||||
PerRecordWARCRecorder(to_path(self.root_dir + '/warcs/')), accept_colls='live')
|
|
||||||
|
|
||||||
resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar')
|
|
||||||
assert b'HTTP/1.1 200 OK' in resp.body
|
|
||||||
assert b'"foo": "bar"' in resp.body
|
|
||||||
|
|
||||||
self._test_all_warcs('/warcs/', 4)
|
|
||||||
|
|
||||||
def test_error_url(self):
|
def test_error_url(self):
|
||||||
recorder_app = RecorderApp(self.upstream_url + '01',
|
recorder_app = RecorderApp(self.upstream_url + '01',
|
||||||
PerRecordWARCRecorder(to_path(self.root_dir + '/warcs/')), accept_colls='live')
|
PerRecordWARCWriter(to_path(self.root_dir + '/warcs/')), accept_colls='live')
|
||||||
|
|
||||||
|
|
||||||
testapp = webtest.TestApp(recorder_app)
|
testapp = webtest.TestApp(recorder_app)
|
||||||
@ -101,12 +102,12 @@ class TestRecorder(LiveServerTests, TempDirTests, BaseTestClass):
|
|||||||
|
|
||||||
assert resp.json['error'] != ''
|
assert resp.json['error'] != ''
|
||||||
|
|
||||||
self._test_all_warcs('/warcs/', 4)
|
self._test_all_warcs('/warcs/', 2)
|
||||||
|
|
||||||
def test_record_cookies_header(self):
|
def test_record_cookies_header(self):
|
||||||
base_path = to_path(self.root_dir + '/warcs/cookiecheck/')
|
base_path = to_path(self.root_dir + '/warcs/cookiecheck/')
|
||||||
recorder_app = RecorderApp(self.upstream_url,
|
recorder_app = RecorderApp(self.upstream_url,
|
||||||
PerRecordWARCRecorder(base_path), accept_colls='live')
|
PerRecordWARCWriter(base_path), accept_colls='live')
|
||||||
|
|
||||||
resp = self._test_warc_write(recorder_app, 'httpbin.org', '/cookies/set%3Fname%3Dvalue%26foo%3Dbar')
|
resp = self._test_warc_write(recorder_app, 'httpbin.org', '/cookies/set%3Fname%3Dvalue%26foo%3Dbar')
|
||||||
assert b'HTTP/1.1 302' in resp.body
|
assert b'HTTP/1.1 302' in resp.body
|
||||||
@ -134,7 +135,7 @@ class TestRecorder(LiveServerTests, TempDirTests, BaseTestClass):
|
|||||||
base_path = to_path(self.root_dir + '/warcs/cookieskip/')
|
base_path = to_path(self.root_dir + '/warcs/cookieskip/')
|
||||||
header_filter = ExcludeSpecificHeaders(['Set-Cookie', 'Cookie'])
|
header_filter = ExcludeSpecificHeaders(['Set-Cookie', 'Cookie'])
|
||||||
recorder_app = RecorderApp(self.upstream_url,
|
recorder_app = RecorderApp(self.upstream_url,
|
||||||
PerRecordWARCRecorder(base_path, header_filter=header_filter),
|
PerRecordWARCWriter(base_path, header_filter=header_filter),
|
||||||
accept_colls='live')
|
accept_colls='live')
|
||||||
|
|
||||||
resp = self._test_warc_write(recorder_app, 'httpbin.org', '/cookies/set%3Fname%3Dvalue%26foo%3Dbar')
|
resp = self._test_warc_write(recorder_app, 'httpbin.org', '/cookies/set%3Fname%3Dvalue%26foo%3Dbar')
|
||||||
@ -162,13 +163,13 @@ class TestRecorder(LiveServerTests, TempDirTests, BaseTestClass):
|
|||||||
|
|
||||||
def test_record_skip_wrong_coll(self):
|
def test_record_skip_wrong_coll(self):
|
||||||
recorder_app = RecorderApp(self.upstream_url,
|
recorder_app = RecorderApp(self.upstream_url,
|
||||||
writer=PerRecordWARCRecorder(to_path(self.root_dir + '/warcs/')), accept_colls='not-live')
|
writer=PerRecordWARCWriter(to_path(self.root_dir + '/warcs/')), accept_colls='not-live')
|
||||||
|
|
||||||
resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar')
|
resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar')
|
||||||
assert b'HTTP/1.1 200 OK' in resp.body
|
assert b'HTTP/1.1 200 OK' in resp.body
|
||||||
assert b'"foo": "bar"' in resp.body
|
assert b'"foo": "bar"' in resp.body
|
||||||
|
|
||||||
self._test_all_warcs('/warcs/', 4)
|
self._test_all_warcs('/warcs/', 2)
|
||||||
|
|
||||||
@patch('redis.StrictRedis', FakeStrictRedis)
|
@patch('redis.StrictRedis', FakeStrictRedis)
|
||||||
def test_record_param_user_coll(self):
|
def test_record_param_user_coll(self):
|
||||||
@ -180,16 +181,16 @@ class TestRecorder(LiveServerTests, TempDirTests, BaseTestClass):
|
|||||||
rel_path_template=self.root_dir + '/warcs/')
|
rel_path_template=self.root_dir + '/warcs/')
|
||||||
|
|
||||||
recorder_app = RecorderApp(self.upstream_url,
|
recorder_app = RecorderApp(self.upstream_url,
|
||||||
PerRecordWARCRecorder(warc_path, dedup_index=dedup_index))
|
PerRecordWARCWriter(warc_path, dedup_index=dedup_index))
|
||||||
|
|
||||||
self._test_all_warcs('/warcs/', 4)
|
self._test_all_warcs('/warcs/', 2)
|
||||||
|
|
||||||
resp = self._test_warc_write(recorder_app, 'httpbin.org',
|
resp = self._test_warc_write(recorder_app, 'httpbin.org',
|
||||||
'/get?foo=bar', '&param.recorder.user=USER&param.recorder.coll=COLL')
|
'/get?foo=bar', '&param.recorder.user=USER&param.recorder.coll=COLL')
|
||||||
assert b'HTTP/1.1 200 OK' in resp.body
|
assert b'HTTP/1.1 200 OK' in resp.body
|
||||||
assert b'"foo": "bar"' in resp.body
|
assert b'"foo": "bar"' in resp.body
|
||||||
|
|
||||||
self._test_all_warcs('/warcs/USER/COLL/', 2)
|
self._test_all_warcs('/warcs/USER/COLL/', 1)
|
||||||
|
|
||||||
r = FakeStrictRedis.from_url('redis://localhost/2')
|
r = FakeStrictRedis.from_url('redis://localhost/2')
|
||||||
|
|
||||||
@ -213,16 +214,16 @@ class TestRecorder(LiveServerTests, TempDirTests, BaseTestClass):
|
|||||||
rel_path_template=self.root_dir + '/warcs/')
|
rel_path_template=self.root_dir + '/warcs/')
|
||||||
|
|
||||||
recorder_app = RecorderApp(self.upstream_url,
|
recorder_app = RecorderApp(self.upstream_url,
|
||||||
PerRecordWARCRecorder(warc_path, dedup_index=dedup_index))
|
PerRecordWARCWriter(warc_path, dedup_index=dedup_index))
|
||||||
|
|
||||||
self._test_all_warcs('/warcs/', 4)
|
self._test_all_warcs('/warcs/', 2)
|
||||||
|
|
||||||
resp = self._test_warc_write(recorder_app, 'httpbin.org',
|
resp = self._test_warc_write(recorder_app, 'httpbin.org',
|
||||||
'/get?foo=bar', '&param.recorder.user=USER&param.recorder.coll=COLL')
|
'/get?foo=bar', '&param.recorder.user=USER&param.recorder.coll=COLL')
|
||||||
assert b'HTTP/1.1 200 OK' in resp.body
|
assert b'HTTP/1.1 200 OK' in resp.body
|
||||||
assert b'"foo": "bar"' in resp.body
|
assert b'"foo": "bar"' in resp.body
|
||||||
|
|
||||||
self._test_all_warcs('/warcs/USER/COLL/', 4)
|
self._test_all_warcs('/warcs/USER/COLL/', 2)
|
||||||
|
|
||||||
# Test Redis CDX
|
# Test Redis CDX
|
||||||
r = FakeStrictRedis.from_url('redis://localhost/2')
|
r = FakeStrictRedis.from_url('redis://localhost/2')
|
||||||
@ -259,17 +260,17 @@ class TestRecorder(LiveServerTests, TempDirTests, BaseTestClass):
|
|||||||
dupe_policy=SkipDupePolicy())
|
dupe_policy=SkipDupePolicy())
|
||||||
|
|
||||||
recorder_app = RecorderApp(self.upstream_url,
|
recorder_app = RecorderApp(self.upstream_url,
|
||||||
PerRecordWARCRecorder(warc_path, dedup_index=dedup_index))
|
PerRecordWARCWriter(warc_path, dedup_index=dedup_index))
|
||||||
|
|
||||||
# No new entries written
|
# No new entries written
|
||||||
self._test_all_warcs('/warcs/', 4)
|
self._test_all_warcs('/warcs/', 2)
|
||||||
|
|
||||||
resp = self._test_warc_write(recorder_app, 'httpbin.org',
|
resp = self._test_warc_write(recorder_app, 'httpbin.org',
|
||||||
'/get?foo=bar', '&param.recorder.user=USER&param.recorder.coll=COLL')
|
'/get?foo=bar', '&param.recorder.user=USER&param.recorder.coll=COLL')
|
||||||
assert b'HTTP/1.1 200 OK' in resp.body
|
assert b'HTTP/1.1 200 OK' in resp.body
|
||||||
assert b'"foo": "bar"' in resp.body
|
assert b'"foo": "bar"' in resp.body
|
||||||
|
|
||||||
self._test_all_warcs('/warcs/USER/COLL/', 4)
|
self._test_all_warcs('/warcs/USER/COLL/', 2)
|
||||||
|
|
||||||
# Test Redis CDX
|
# Test Redis CDX
|
||||||
r = FakeStrictRedis.from_url('redis://localhost/2')
|
r = FakeStrictRedis.from_url('redis://localhost/2')
|
||||||
@ -288,14 +289,14 @@ class TestRecorder(LiveServerTests, TempDirTests, BaseTestClass):
|
|||||||
dupe_policy=WriteDupePolicy())
|
dupe_policy=WriteDupePolicy())
|
||||||
|
|
||||||
recorder_app = RecorderApp(self.upstream_url,
|
recorder_app = RecorderApp(self.upstream_url,
|
||||||
PerRecordWARCRecorder(warc_path, dedup_index=dedup_index))
|
PerRecordWARCWriter(warc_path, dedup_index=dedup_index))
|
||||||
|
|
||||||
resp = self._test_warc_write(recorder_app, 'httpbin.org',
|
resp = self._test_warc_write(recorder_app, 'httpbin.org',
|
||||||
'/get?foo=bar', '&param.recorder.user=USER&param.recorder.coll=COLL')
|
'/get?foo=bar', '&param.recorder.user=USER&param.recorder.coll=COLL')
|
||||||
assert b'HTTP/1.1 200 OK' in resp.body
|
assert b'HTTP/1.1 200 OK' in resp.body
|
||||||
assert b'"foo": "bar"' in resp.body
|
assert b'"foo": "bar"' in resp.body
|
||||||
|
|
||||||
self._test_all_warcs('/warcs/USER/COLL/', 6)
|
self._test_all_warcs('/warcs/USER/COLL/', 3)
|
||||||
|
|
||||||
r = FakeStrictRedis.from_url('redis://localhost/2')
|
r = FakeStrictRedis.from_url('redis://localhost/2')
|
||||||
|
|
||||||
@ -310,7 +311,7 @@ class TestRecorder(LiveServerTests, TempDirTests, BaseTestClass):
|
|||||||
def test_record_single_file_warc_1(self):
|
def test_record_single_file_warc_1(self):
|
||||||
path = to_path(self.root_dir + '/warcs/A.warc.gz')
|
path = to_path(self.root_dir + '/warcs/A.warc.gz')
|
||||||
recorder_app = RecorderApp(self.upstream_url,
|
recorder_app = RecorderApp(self.upstream_url,
|
||||||
SingleFileWARCRecorder(path))
|
SingleFileWARCWriter(path))
|
||||||
|
|
||||||
resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar')
|
resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar')
|
||||||
assert b'HTTP/1.1 200 OK' in resp.body
|
assert b'HTTP/1.1 200 OK' in resp.body
|
||||||
@ -321,14 +322,14 @@ class TestRecorder(LiveServerTests, TempDirTests, BaseTestClass):
|
|||||||
|
|
||||||
@patch('redis.StrictRedis', FakeStrictRedis)
|
@patch('redis.StrictRedis', FakeStrictRedis)
|
||||||
def test_record_single_file_multiple_writes(self):
|
def test_record_single_file_multiple_writes(self):
|
||||||
warc_path = to_path(self.root_dir + '/warcs/FOO/rec-test.warc.gz')
|
warc_path = to_path(self.root_dir + '/warcs/FOO/rec-{hostname}-{timestamp}.warc.gz')
|
||||||
|
|
||||||
rel_path = self.root_dir + '/warcs/'
|
rel_path = self.root_dir + '/warcs/'
|
||||||
|
|
||||||
dedup_index = WritableRedisIndexer('redis://localhost/2/{coll}:cdxj',
|
dedup_index = WritableRedisIndexer('redis://localhost/2/{coll}:cdxj',
|
||||||
rel_path_template=rel_path)
|
rel_path_template=rel_path)
|
||||||
|
|
||||||
writer = SingleFileWARCRecorder(warc_path, dedup_index=dedup_index)
|
writer = SingleFileWARCWriter(warc_path, dedup_index=dedup_index)
|
||||||
recorder_app = RecorderApp(self.upstream_url, writer)
|
recorder_app = RecorderApp(self.upstream_url, writer)
|
||||||
|
|
||||||
# First Record
|
# First Record
|
||||||
@ -352,11 +353,12 @@ class TestRecorder(LiveServerTests, TempDirTests, BaseTestClass):
|
|||||||
res = r.zrangebylex('FOO:cdxj', '[org,httpbin)/', '(org,httpbin,')
|
res = r.zrangebylex('FOO:cdxj', '[org,httpbin)/', '(org,httpbin,')
|
||||||
assert len(res) == 2
|
assert len(res) == 2
|
||||||
|
|
||||||
assert os.path.isfile(warc_path)
|
files, coll_dir = self._test_all_warcs('/warcs/FOO/', 1)
|
||||||
|
fullname = coll_dir + files[0]
|
||||||
|
|
||||||
cdxout = BytesIO()
|
cdxout = BytesIO()
|
||||||
with open(warc_path, 'rb') as fh:
|
with open(fullname, 'rb') as fh:
|
||||||
filename = os.path.relpath(warc_path, rel_path)
|
filename = os.path.relpath(fullname, rel_path)
|
||||||
write_cdx_index(cdxout, fh, filename,
|
write_cdx_index(cdxout, fh, filename,
|
||||||
cdxj=True, append_post=True, sort=True)
|
cdxj=True, append_post=True, sort=True)
|
||||||
|
|
||||||
@ -368,10 +370,10 @@ class TestRecorder(LiveServerTests, TempDirTests, BaseTestClass):
|
|||||||
|
|
||||||
assert cdxres == res
|
assert cdxres == res
|
||||||
|
|
||||||
|
# close this file
|
||||||
writer.close()
|
writer.close()
|
||||||
|
|
||||||
with raises(OSError):
|
|
||||||
resp = self._test_warc_write(recorder_app, 'httpbin.org',
|
resp = self._test_warc_write(recorder_app, 'httpbin.org',
|
||||||
'/get?boo=far', '&param.recorder.coll=FOO')
|
'/get?boo=far', '&param.recorder.coll=FOO')
|
||||||
|
|
||||||
|
self._test_all_warcs('/warcs/FOO/', 2)
|
||||||
|
@ -27,7 +27,7 @@ from recorder.filters import ExcludeNone
|
|||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
class BaseWARCRecorder(object):
|
class BaseWARCWriter(object):
|
||||||
WARC_RECORDS = {'warcinfo': 'application/warc-fields',
|
WARC_RECORDS = {'warcinfo': 'application/warc-fields',
|
||||||
'response': 'application/http; msgtype=response',
|
'response': 'application/http; msgtype=response',
|
||||||
'revisit': 'application/http; msgtype=response',
|
'revisit': 'application/http; msgtype=response',
|
||||||
@ -45,6 +45,7 @@ class BaseWARCRecorder(object):
|
|||||||
self.dedup_index = dedup_index
|
self.dedup_index = dedup_index
|
||||||
self.rec_source_name = name
|
self.rec_source_name = name
|
||||||
self.header_filter = header_filter
|
self.header_filter = header_filter
|
||||||
|
self.hostname = gethostname()
|
||||||
|
|
||||||
def ensure_digest(self, record):
|
def ensure_digest(self, record):
|
||||||
block_digest = record.rec_headers.get('WARC-Block-Digest')
|
block_digest = record.rec_headers.get('WARC-Block-Digest')
|
||||||
@ -135,7 +136,7 @@ class BaseWARCRecorder(object):
|
|||||||
|
|
||||||
def _write_warc_record(self, out, record):
|
def _write_warc_record(self, out, record):
|
||||||
if self.gzip:
|
if self.gzip:
|
||||||
out = GzippingWriter(out)
|
out = GzippingWrapper(out)
|
||||||
|
|
||||||
self._line(out, b'WARC/1.0')
|
self._line(out, b'WARC/1.0')
|
||||||
|
|
||||||
@ -196,7 +197,7 @@ class BaseWARCRecorder(object):
|
|||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
class GzippingWriter(object):
|
class GzippingWrapper(object):
|
||||||
def __init__(self, out):
|
def __init__(self, out):
|
||||||
self.compressor = zlib.compressobj(9, zlib.DEFLATED, zlib.MAX_WBITS + 16)
|
self.compressor = zlib.compressobj(9, zlib.DEFLATED, zlib.MAX_WBITS + 16)
|
||||||
self.out = out
|
self.out = out
|
||||||
@ -227,11 +228,63 @@ class Digester(object):
|
|||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
class SingleFileWARCRecorder(BaseWARCRecorder):
|
class PerRecordWARCWriter(BaseWARCWriter):
|
||||||
def __init__(self, filename, *args, **kwargs):
|
DEF_TEMPLATE = 'rec-{timestamp}-{hostname}.warc.gz'
|
||||||
super(SingleFileWARCRecorder, self).__init__(*args, **kwargs)
|
|
||||||
self.filename = filename.format(timestamp=timestamp20_now(),
|
def __init__(self, warcdir, filename_template=None, *args, **kwargs):
|
||||||
host=gethostname())
|
super(PerRecordWARCWriter, self).__init__(*args, **kwargs)
|
||||||
|
if not filename_template:
|
||||||
|
filename_template = self.DEF_TEMPLATE
|
||||||
|
self.filename_template = warcdir + filename_template
|
||||||
|
|
||||||
|
def _do_write_req_resp(self, req, resp, params, formatter):
|
||||||
|
#resp_uuid = resp.rec_headers['WARC-Record-ID'].split(':')[-1].strip('<> ')
|
||||||
|
#req_uuid = req.rec_headers['WARC-Record-ID'].split(':')[-1].strip('<> ')
|
||||||
|
timestamp = timestamp20_now()
|
||||||
|
|
||||||
|
filename = formatter.format(self.filename_template,
|
||||||
|
hostname=self.hostname,
|
||||||
|
timestamp=timestamp)
|
||||||
|
|
||||||
|
path, name = os.path.split(filename)
|
||||||
|
|
||||||
|
try:
|
||||||
|
os.makedirs(path)
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
url = resp.rec_headers.get('WARC-Target-Uri')
|
||||||
|
print('Writing resp/req for {0} to {1}'.format(url, filename))
|
||||||
|
|
||||||
|
with open(filename, 'a+b') as out:
|
||||||
|
start = out.tell()
|
||||||
|
|
||||||
|
self._write_warc_record(out, resp)
|
||||||
|
self._write_warc_record(out, req)
|
||||||
|
|
||||||
|
out.flush()
|
||||||
|
out.seek(start)
|
||||||
|
|
||||||
|
if self.dedup_index:
|
||||||
|
self.dedup_index.index_records(out, params, filename=filename)
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
class SingleFileWARCWriter(BaseWARCWriter):
|
||||||
|
def __init__(self, filename_template, dir_prefix='', max_size=0, *args, **kwargs):
|
||||||
|
super(SingleFileWARCWriter, self).__init__(*args, **kwargs)
|
||||||
|
self.dir_prefix = dir_prefix
|
||||||
|
self.filename_template = filename_template
|
||||||
|
self.max_size = max_size
|
||||||
|
self._open_file()
|
||||||
|
|
||||||
|
def _open_file(self):
|
||||||
|
timestamp = timestamp20_now()
|
||||||
|
|
||||||
|
filename = self.filename_template.format(hostname=self.hostname,
|
||||||
|
timestamp=timestamp)
|
||||||
|
|
||||||
|
self.filename = self.dir_prefix + filename
|
||||||
|
|
||||||
try:
|
try:
|
||||||
os.makedirs(os.path.dirname(self.filename))
|
os.makedirs(os.path.dirname(self.filename))
|
||||||
@ -246,9 +299,10 @@ class SingleFileWARCRecorder(BaseWARCRecorder):
|
|||||||
url = resp.rec_headers.get('WARC-Target-Uri')
|
url = resp.rec_headers.get('WARC-Target-Uri')
|
||||||
print('Writing {0} to {1} '.format(url, self.filename))
|
print('Writing {0} to {1} '.format(url, self.filename))
|
||||||
|
|
||||||
|
if not self._fh:
|
||||||
|
self._open_file()
|
||||||
|
|
||||||
out = self._fh
|
out = self._fh
|
||||||
if not out:
|
|
||||||
raise IOError('Already closed')
|
|
||||||
|
|
||||||
start = out.tell()
|
start = out.tell()
|
||||||
|
|
||||||
@ -256,11 +310,18 @@ class SingleFileWARCRecorder(BaseWARCRecorder):
|
|||||||
self._write_warc_record(out, req)
|
self._write_warc_record(out, req)
|
||||||
|
|
||||||
out.flush()
|
out.flush()
|
||||||
|
|
||||||
|
new_size = out.tell()
|
||||||
|
|
||||||
out.seek(start)
|
out.seek(start)
|
||||||
|
|
||||||
if self.dedup_index:
|
if self.dedup_index:
|
||||||
self.dedup_index.index_records(out, params, filename=self.filename)
|
self.dedup_index.index_records(out, params, filename=self.filename)
|
||||||
|
|
||||||
|
# check for rollover
|
||||||
|
if self.max_size and new_size > self.max_size:
|
||||||
|
self.close()
|
||||||
|
|
||||||
def close(self):
|
def close(self):
|
||||||
if not self._fh:
|
if not self._fh:
|
||||||
return None
|
return None
|
||||||
@ -269,40 +330,3 @@ class SingleFileWARCRecorder(BaseWARCRecorder):
|
|||||||
|
|
||||||
self._fh.close()
|
self._fh.close()
|
||||||
self._fh = None
|
self._fh = None
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
|
||||||
class PerRecordWARCRecorder(BaseWARCRecorder):
|
|
||||||
def __init__(self, warcdir, *args, **kwargs):
|
|
||||||
super(PerRecordWARCRecorder, self).__init__(*args, **kwargs)
|
|
||||||
self.warcdir = warcdir
|
|
||||||
|
|
||||||
def _do_write_req_resp(self, req, resp, params, formatter):
|
|
||||||
resp_uuid = resp.rec_headers['WARC-Record-ID'].split(':')[-1].strip('<> ')
|
|
||||||
req_uuid = req.rec_headers['WARC-Record-ID'].split(':')[-1].strip('<> ')
|
|
||||||
|
|
||||||
full_dir = formatter.format(self.warcdir)
|
|
||||||
|
|
||||||
try:
|
|
||||||
os.makedirs(full_dir)
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
|
|
||||||
resp_filename = os.path.join(full_dir, resp_uuid + '.warc.gz')
|
|
||||||
req_filename = os.path.join(full_dir, req_uuid + '.warc.gz')
|
|
||||||
|
|
||||||
url = resp.rec_headers.get('WARC-Target-Uri')
|
|
||||||
print('Writing request for {0} to {1}'.format(url, req_filename))
|
|
||||||
print('Writing response for {0} to {1}'.format(url, resp_filename))
|
|
||||||
|
|
||||||
self._write_and_index(resp_filename, resp, params, True)
|
|
||||||
self._write_and_index(req_filename, req, params, False)
|
|
||||||
|
|
||||||
def _write_and_index(self, filename, rec, params, index=False):
|
|
||||||
with open(filename, 'w+b') as out:
|
|
||||||
self._write_warc_record(out, rec)
|
|
||||||
if index and self.dedup_index:
|
|
||||||
out.seek(0)
|
|
||||||
self.dedup_index.index_records(out, params, filename=filename)
|
|
||||||
|
|
||||||
|
|
Loading…
x
Reference in New Issue
Block a user