mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 08:04:49 +01:00
warcwriter: add create_warcinfo_record() for creating a warcinfo and a SimpleTempWARCWriter for writing records to temp buff/file
This commit is contained in:
parent
fd76030cb3
commit
d40edfc22d
@ -12,7 +12,7 @@ from fakeredis import FakeStrictRedis
|
||||
|
||||
from recorder.recorderapp import RecorderApp
|
||||
from recorder.redisindexer import WritableRedisIndexer
|
||||
from recorder.warcwriter import PerRecordWARCWriter, MultiFileWARCWriter
|
||||
from recorder.warcwriter import PerRecordWARCWriter, MultiFileWARCWriter, SimpleTempWARCWriter
|
||||
from recorder.filters import ExcludeSpecificHeaders
|
||||
from recorder.filters import SkipDupePolicy, WriteDupePolicy, WriteRevisitDupePolicy
|
||||
|
||||
@ -27,6 +27,7 @@ from pywb.warc.cdxindexer import write_cdx_index
|
||||
from six.moves.urllib.parse import quote, unquote
|
||||
from io import BytesIO
|
||||
import time
|
||||
import json
|
||||
|
||||
general_req_data = "\
|
||||
GET {path} HTTP/1.1\r\n\
|
||||
@ -180,7 +181,6 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
|
||||
|
||||
self._test_all_warcs('/warcs/', 2)
|
||||
|
||||
#@patch('redis.StrictRedis', FakeStrictRedis)
|
||||
def test_record_param_user_coll(self):
|
||||
|
||||
warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')
|
||||
@ -216,7 +216,6 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
|
||||
assert warcs == {cdx['filename'].encode('utf-8'): full_path.encode('utf-8')}
|
||||
|
||||
|
||||
#@patch('redis.StrictRedis', FakeStrictRedis)
|
||||
def test_record_param_user_coll_revisit(self):
|
||||
warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')
|
||||
|
||||
@ -263,7 +262,6 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
|
||||
assert status_headers.get_header('WARC-Refers-To-Target-URI') == 'http://httpbin.org/get?foo=bar'
|
||||
assert status_headers.get_header('WARC-Refers-To-Date') != ''
|
||||
|
||||
#@patch('redis.StrictRedis', FakeStrictRedis)
|
||||
def test_record_param_user_coll_skip(self):
|
||||
warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')
|
||||
|
||||
@ -288,7 +286,6 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
|
||||
res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
|
||||
assert len(res) == 2
|
||||
|
||||
#@patch('redis.StrictRedis', FakeStrictRedis)
|
||||
def test_record_param_user_coll_write_dupe_no_revisit(self):
|
||||
|
||||
warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')
|
||||
@ -329,7 +326,6 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
|
||||
assert os.path.isfile(path)
|
||||
assert len(writer.fh_cache) == 1
|
||||
|
||||
#@patch('redis.StrictRedis', FakeStrictRedis)
|
||||
def test_record_multiple_writes_keep_open(self):
|
||||
warc_path = to_path(self.root_dir + '/warcs/FOO/ABC-{hostname}-{timestamp}.warc.gz')
|
||||
|
||||
@ -397,3 +393,31 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
|
||||
|
||||
warcs = r.hgetall('FOO:warc')
|
||||
assert len(warcs) == 2
|
||||
|
||||
def test_warcinfo_record(self):
|
||||
simplewriter = SimpleTempWARCWriter(gzip=False)
|
||||
params = {'software': 'recorder test',
|
||||
'format': 'WARC File Format 1.0',
|
||||
'json-metadata': json.dumps({'foo': 'bar'})}
|
||||
|
||||
record = simplewriter.create_warcinfo_record('testfile.warc.gz', **params)
|
||||
simplewriter.write_record(record)
|
||||
buff = simplewriter.get_buffer()
|
||||
assert isinstance(buff, bytes)
|
||||
|
||||
buff = BytesIO(buff)
|
||||
parsed_record = ArcWarcRecordLoader().parse_record_stream(buff)
|
||||
|
||||
assert parsed_record.rec_headers.get_header('WARC-Type') == 'warcinfo'
|
||||
assert parsed_record.rec_headers.get_header('WARC-Filename') == 'testfile.warc.gz'
|
||||
|
||||
buff = parsed_record.stream.read().decode('utf-8')
|
||||
|
||||
length = parsed_record.rec_headers.get_header('Content-Length')
|
||||
|
||||
assert len(buff) == int(length)
|
||||
|
||||
assert 'json-metadata: {"foo": "bar"}\r\n' in buff
|
||||
assert 'format: WARC File Format 1.0\r\n' in buff
|
||||
assert 'json-metadata: {"foo": "bar"}\r\n' in buff
|
||||
|
||||
|
@ -13,12 +13,15 @@ import traceback
|
||||
from collections import OrderedDict
|
||||
|
||||
from socket import gethostname
|
||||
from io import BytesIO
|
||||
|
||||
import fcntl
|
||||
|
||||
from pywb.utils.loaders import LimitReader, to_native_str
|
||||
from pywb.utils.bufferedreaders import BufferedReader
|
||||
from pywb.utils.timeutils import timestamp20_now
|
||||
from pywb.utils.timeutils import timestamp20_now, datetime_to_iso_date
|
||||
|
||||
from pywb.warc.recordloader import ArcWarcRecord
|
||||
|
||||
from webagg.utils import ParamFormatter, res_template
|
||||
|
||||
@ -110,6 +113,25 @@ class BaseWARCWriter(object):
|
||||
params['_formatter'] = ParamFormatter(params, name=self.rec_source_name)
|
||||
self._do_write_req_resp(req, resp, params)
|
||||
|
||||
def create_warcinfo_record(self, filename, **kwargs):
|
||||
headers = {}
|
||||
headers['WARC-Record_ID'] = self._make_warc_id()
|
||||
headers['WARC-Type'] = 'warcinfo'
|
||||
if filename:
|
||||
headers['WARC-Filename'] = filename
|
||||
headers['WARC-Date'] = datetime_to_iso_date(datetime.datetime.utcnow())
|
||||
|
||||
warcinfo = BytesIO()
|
||||
for n, v in six.iteritems(kwargs):
|
||||
self._header(warcinfo, n, v)
|
||||
|
||||
warcinfo.seek(0)
|
||||
|
||||
record = ArcWarcRecord('warc', 'warcinfo', headers, warcinfo,
|
||||
None, '', len(warcinfo.getbuffer()))
|
||||
|
||||
return record
|
||||
|
||||
def _check_revisit(self, record, params):
|
||||
if not self.dedup_index:
|
||||
return record
|
||||
@ -159,7 +181,9 @@ class BaseWARCWriter(object):
|
||||
http_headers_only = False
|
||||
|
||||
if record.length:
|
||||
actual_len = len(record.status_headers.headers_buff)
|
||||
actual_len = 0
|
||||
if record.status_headers:
|
||||
actual_len = len(record.status_headers.headers_buff)
|
||||
|
||||
if not http_headers_only:
|
||||
diff = record.stream.tell() - actual_len
|
||||
@ -170,8 +194,9 @@ class BaseWARCWriter(object):
|
||||
# add empty line
|
||||
self._line(out, b'')
|
||||
|
||||
# write headers and buffer
|
||||
out.write(record.status_headers.headers_buff)
|
||||
# write headers buffer, if any
|
||||
if record.status_headers:
|
||||
out.write(record.status_headers.headers_buff)
|
||||
|
||||
if not http_headers_only:
|
||||
out.write(record.stream.read())
|
||||
@ -351,3 +376,26 @@ class PerRecordWARCWriter(MultiFileWARCWriter):
|
||||
kwargs['max_size'] = 1
|
||||
super(PerRecordWARCWriter, self).__init__(*args, **kwargs)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class SimpleTempWARCWriter(BaseWARCWriter):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super(SimpleTempWARCWriter, self).__init__(*args, **kwargs)
|
||||
self.out = self._create_buffer()
|
||||
|
||||
def _create_buffer(self):
|
||||
return tempfile.SpooledTemporaryFile(max_size=512*1024)
|
||||
|
||||
def _do_write_req_resp(self, req, resp, params):
|
||||
self._write_warc_record(self.out, resp)
|
||||
self._write_warc_record(self.out, req)
|
||||
|
||||
def write_record(self, record):
|
||||
self._write_warc_record(self.out, record)
|
||||
|
||||
def get_buffer(self):
|
||||
pos = self.out.tell()
|
||||
self.out.seek(0)
|
||||
buff = self.out.read()
|
||||
self.out.seek(pos)
|
||||
return buff
|
||||
|
Loading…
x
Reference in New Issue
Block a user