mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
warcwriter: add create_warcinfo_record() for creating a warcinfo and a SimpleTempWARCWriter for writing records to temp buff/file
This commit is contained in:
parent
fd76030cb3
commit
d40edfc22d
@ -12,7 +12,7 @@ from fakeredis import FakeStrictRedis
|
|||||||
|
|
||||||
from recorder.recorderapp import RecorderApp
|
from recorder.recorderapp import RecorderApp
|
||||||
from recorder.redisindexer import WritableRedisIndexer
|
from recorder.redisindexer import WritableRedisIndexer
|
||||||
from recorder.warcwriter import PerRecordWARCWriter, MultiFileWARCWriter
|
from recorder.warcwriter import PerRecordWARCWriter, MultiFileWARCWriter, SimpleTempWARCWriter
|
||||||
from recorder.filters import ExcludeSpecificHeaders
|
from recorder.filters import ExcludeSpecificHeaders
|
||||||
from recorder.filters import SkipDupePolicy, WriteDupePolicy, WriteRevisitDupePolicy
|
from recorder.filters import SkipDupePolicy, WriteDupePolicy, WriteRevisitDupePolicy
|
||||||
|
|
||||||
@ -27,6 +27,7 @@ from pywb.warc.cdxindexer import write_cdx_index
|
|||||||
from six.moves.urllib.parse import quote, unquote
|
from six.moves.urllib.parse import quote, unquote
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
import time
|
import time
|
||||||
|
import json
|
||||||
|
|
||||||
general_req_data = "\
|
general_req_data = "\
|
||||||
GET {path} HTTP/1.1\r\n\
|
GET {path} HTTP/1.1\r\n\
|
||||||
@ -180,7 +181,6 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
|
|||||||
|
|
||||||
self._test_all_warcs('/warcs/', 2)
|
self._test_all_warcs('/warcs/', 2)
|
||||||
|
|
||||||
#@patch('redis.StrictRedis', FakeStrictRedis)
|
|
||||||
def test_record_param_user_coll(self):
|
def test_record_param_user_coll(self):
|
||||||
|
|
||||||
warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')
|
warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')
|
||||||
@ -216,7 +216,6 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
|
|||||||
assert warcs == {cdx['filename'].encode('utf-8'): full_path.encode('utf-8')}
|
assert warcs == {cdx['filename'].encode('utf-8'): full_path.encode('utf-8')}
|
||||||
|
|
||||||
|
|
||||||
#@patch('redis.StrictRedis', FakeStrictRedis)
|
|
||||||
def test_record_param_user_coll_revisit(self):
|
def test_record_param_user_coll_revisit(self):
|
||||||
warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')
|
warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')
|
||||||
|
|
||||||
@ -263,7 +262,6 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
|
|||||||
assert status_headers.get_header('WARC-Refers-To-Target-URI') == 'http://httpbin.org/get?foo=bar'
|
assert status_headers.get_header('WARC-Refers-To-Target-URI') == 'http://httpbin.org/get?foo=bar'
|
||||||
assert status_headers.get_header('WARC-Refers-To-Date') != ''
|
assert status_headers.get_header('WARC-Refers-To-Date') != ''
|
||||||
|
|
||||||
#@patch('redis.StrictRedis', FakeStrictRedis)
|
|
||||||
def test_record_param_user_coll_skip(self):
|
def test_record_param_user_coll_skip(self):
|
||||||
warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')
|
warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')
|
||||||
|
|
||||||
@ -288,7 +286,6 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
|
|||||||
res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
|
res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
|
||||||
assert len(res) == 2
|
assert len(res) == 2
|
||||||
|
|
||||||
#@patch('redis.StrictRedis', FakeStrictRedis)
|
|
||||||
def test_record_param_user_coll_write_dupe_no_revisit(self):
|
def test_record_param_user_coll_write_dupe_no_revisit(self):
|
||||||
|
|
||||||
warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')
|
warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')
|
||||||
@ -329,7 +326,6 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
|
|||||||
assert os.path.isfile(path)
|
assert os.path.isfile(path)
|
||||||
assert len(writer.fh_cache) == 1
|
assert len(writer.fh_cache) == 1
|
||||||
|
|
||||||
#@patch('redis.StrictRedis', FakeStrictRedis)
|
|
||||||
def test_record_multiple_writes_keep_open(self):
|
def test_record_multiple_writes_keep_open(self):
|
||||||
warc_path = to_path(self.root_dir + '/warcs/FOO/ABC-{hostname}-{timestamp}.warc.gz')
|
warc_path = to_path(self.root_dir + '/warcs/FOO/ABC-{hostname}-{timestamp}.warc.gz')
|
||||||
|
|
||||||
@ -397,3 +393,31 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
|
|||||||
|
|
||||||
warcs = r.hgetall('FOO:warc')
|
warcs = r.hgetall('FOO:warc')
|
||||||
assert len(warcs) == 2
|
assert len(warcs) == 2
|
||||||
|
|
||||||
|
def test_warcinfo_record(self):
|
||||||
|
simplewriter = SimpleTempWARCWriter(gzip=False)
|
||||||
|
params = {'software': 'recorder test',
|
||||||
|
'format': 'WARC File Format 1.0',
|
||||||
|
'json-metadata': json.dumps({'foo': 'bar'})}
|
||||||
|
|
||||||
|
record = simplewriter.create_warcinfo_record('testfile.warc.gz', **params)
|
||||||
|
simplewriter.write_record(record)
|
||||||
|
buff = simplewriter.get_buffer()
|
||||||
|
assert isinstance(buff, bytes)
|
||||||
|
|
||||||
|
buff = BytesIO(buff)
|
||||||
|
parsed_record = ArcWarcRecordLoader().parse_record_stream(buff)
|
||||||
|
|
||||||
|
assert parsed_record.rec_headers.get_header('WARC-Type') == 'warcinfo'
|
||||||
|
assert parsed_record.rec_headers.get_header('WARC-Filename') == 'testfile.warc.gz'
|
||||||
|
|
||||||
|
buff = parsed_record.stream.read().decode('utf-8')
|
||||||
|
|
||||||
|
length = parsed_record.rec_headers.get_header('Content-Length')
|
||||||
|
|
||||||
|
assert len(buff) == int(length)
|
||||||
|
|
||||||
|
assert 'json-metadata: {"foo": "bar"}\r\n' in buff
|
||||||
|
assert 'format: WARC File Format 1.0\r\n' in buff
|
||||||
|
assert 'json-metadata: {"foo": "bar"}\r\n' in buff
|
||||||
|
|
||||||
|
@ -13,12 +13,15 @@ import traceback
|
|||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
|
|
||||||
from socket import gethostname
|
from socket import gethostname
|
||||||
|
from io import BytesIO
|
||||||
|
|
||||||
import fcntl
|
import fcntl
|
||||||
|
|
||||||
from pywb.utils.loaders import LimitReader, to_native_str
|
from pywb.utils.loaders import LimitReader, to_native_str
|
||||||
from pywb.utils.bufferedreaders import BufferedReader
|
from pywb.utils.bufferedreaders import BufferedReader
|
||||||
from pywb.utils.timeutils import timestamp20_now
|
from pywb.utils.timeutils import timestamp20_now, datetime_to_iso_date
|
||||||
|
|
||||||
|
from pywb.warc.recordloader import ArcWarcRecord
|
||||||
|
|
||||||
from webagg.utils import ParamFormatter, res_template
|
from webagg.utils import ParamFormatter, res_template
|
||||||
|
|
||||||
@ -110,6 +113,25 @@ class BaseWARCWriter(object):
|
|||||||
params['_formatter'] = ParamFormatter(params, name=self.rec_source_name)
|
params['_formatter'] = ParamFormatter(params, name=self.rec_source_name)
|
||||||
self._do_write_req_resp(req, resp, params)
|
self._do_write_req_resp(req, resp, params)
|
||||||
|
|
||||||
|
def create_warcinfo_record(self, filename, **kwargs):
|
||||||
|
headers = {}
|
||||||
|
headers['WARC-Record_ID'] = self._make_warc_id()
|
||||||
|
headers['WARC-Type'] = 'warcinfo'
|
||||||
|
if filename:
|
||||||
|
headers['WARC-Filename'] = filename
|
||||||
|
headers['WARC-Date'] = datetime_to_iso_date(datetime.datetime.utcnow())
|
||||||
|
|
||||||
|
warcinfo = BytesIO()
|
||||||
|
for n, v in six.iteritems(kwargs):
|
||||||
|
self._header(warcinfo, n, v)
|
||||||
|
|
||||||
|
warcinfo.seek(0)
|
||||||
|
|
||||||
|
record = ArcWarcRecord('warc', 'warcinfo', headers, warcinfo,
|
||||||
|
None, '', len(warcinfo.getbuffer()))
|
||||||
|
|
||||||
|
return record
|
||||||
|
|
||||||
def _check_revisit(self, record, params):
|
def _check_revisit(self, record, params):
|
||||||
if not self.dedup_index:
|
if not self.dedup_index:
|
||||||
return record
|
return record
|
||||||
@ -159,7 +181,9 @@ class BaseWARCWriter(object):
|
|||||||
http_headers_only = False
|
http_headers_only = False
|
||||||
|
|
||||||
if record.length:
|
if record.length:
|
||||||
actual_len = len(record.status_headers.headers_buff)
|
actual_len = 0
|
||||||
|
if record.status_headers:
|
||||||
|
actual_len = len(record.status_headers.headers_buff)
|
||||||
|
|
||||||
if not http_headers_only:
|
if not http_headers_only:
|
||||||
diff = record.stream.tell() - actual_len
|
diff = record.stream.tell() - actual_len
|
||||||
@ -170,8 +194,9 @@ class BaseWARCWriter(object):
|
|||||||
# add empty line
|
# add empty line
|
||||||
self._line(out, b'')
|
self._line(out, b'')
|
||||||
|
|
||||||
# write headers and buffer
|
# write headers buffer, if any
|
||||||
out.write(record.status_headers.headers_buff)
|
if record.status_headers:
|
||||||
|
out.write(record.status_headers.headers_buff)
|
||||||
|
|
||||||
if not http_headers_only:
|
if not http_headers_only:
|
||||||
out.write(record.stream.read())
|
out.write(record.stream.read())
|
||||||
@ -351,3 +376,26 @@ class PerRecordWARCWriter(MultiFileWARCWriter):
|
|||||||
kwargs['max_size'] = 1
|
kwargs['max_size'] = 1
|
||||||
super(PerRecordWARCWriter, self).__init__(*args, **kwargs)
|
super(PerRecordWARCWriter, self).__init__(*args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
class SimpleTempWARCWriter(BaseWARCWriter):
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
super(SimpleTempWARCWriter, self).__init__(*args, **kwargs)
|
||||||
|
self.out = self._create_buffer()
|
||||||
|
|
||||||
|
def _create_buffer(self):
|
||||||
|
return tempfile.SpooledTemporaryFile(max_size=512*1024)
|
||||||
|
|
||||||
|
def _do_write_req_resp(self, req, resp, params):
|
||||||
|
self._write_warc_record(self.out, resp)
|
||||||
|
self._write_warc_record(self.out, req)
|
||||||
|
|
||||||
|
def write_record(self, record):
|
||||||
|
self._write_warc_record(self.out, record)
|
||||||
|
|
||||||
|
def get_buffer(self):
|
||||||
|
pos = self.out.tell()
|
||||||
|
self.out.seek(0)
|
||||||
|
buff = self.out.read()
|
||||||
|
self.out.seek(pos)
|
||||||
|
return buff
|
||||||
|
Loading…
x
Reference in New Issue
Block a user