1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 08:04:49 +01:00

warcwriter: add create_warcinfo_record() for creating a warcinfo and a SimpleTempWARCWriter for writing records to temp buff/file

This commit is contained in:
Ilya Kreymer 2016-04-03 12:19:54 -07:00
parent fd76030cb3
commit d40edfc22d
2 changed files with 82 additions and 10 deletions

View File

@ -12,7 +12,7 @@ from fakeredis import FakeStrictRedis
from recorder.recorderapp import RecorderApp
from recorder.redisindexer import WritableRedisIndexer
from recorder.warcwriter import PerRecordWARCWriter, MultiFileWARCWriter
from recorder.warcwriter import PerRecordWARCWriter, MultiFileWARCWriter, SimpleTempWARCWriter
from recorder.filters import ExcludeSpecificHeaders
from recorder.filters import SkipDupePolicy, WriteDupePolicy, WriteRevisitDupePolicy
@ -27,6 +27,7 @@ from pywb.warc.cdxindexer import write_cdx_index
from six.moves.urllib.parse import quote, unquote
from io import BytesIO
import time
import json
general_req_data = "\
GET {path} HTTP/1.1\r\n\
@ -180,7 +181,6 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
self._test_all_warcs('/warcs/', 2)
#@patch('redis.StrictRedis', FakeStrictRedis)
def test_record_param_user_coll(self):
warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')
@ -216,7 +216,6 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
assert warcs == {cdx['filename'].encode('utf-8'): full_path.encode('utf-8')}
#@patch('redis.StrictRedis', FakeStrictRedis)
def test_record_param_user_coll_revisit(self):
warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')
@ -263,7 +262,6 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
assert status_headers.get_header('WARC-Refers-To-Target-URI') == 'http://httpbin.org/get?foo=bar'
assert status_headers.get_header('WARC-Refers-To-Date') != ''
#@patch('redis.StrictRedis', FakeStrictRedis)
def test_record_param_user_coll_skip(self):
warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')
@ -288,7 +286,6 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
assert len(res) == 2
#@patch('redis.StrictRedis', FakeStrictRedis)
def test_record_param_user_coll_write_dupe_no_revisit(self):
warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')
@ -329,7 +326,6 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
assert os.path.isfile(path)
assert len(writer.fh_cache) == 1
#@patch('redis.StrictRedis', FakeStrictRedis)
def test_record_multiple_writes_keep_open(self):
warc_path = to_path(self.root_dir + '/warcs/FOO/ABC-{hostname}-{timestamp}.warc.gz')
@ -397,3 +393,31 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
warcs = r.hgetall('FOO:warc')
assert len(warcs) == 2
def test_warcinfo_record(self):
simplewriter = SimpleTempWARCWriter(gzip=False)
params = {'software': 'recorder test',
'format': 'WARC File Format 1.0',
'json-metadata': json.dumps({'foo': 'bar'})}
record = simplewriter.create_warcinfo_record('testfile.warc.gz', **params)
simplewriter.write_record(record)
buff = simplewriter.get_buffer()
assert isinstance(buff, bytes)
buff = BytesIO(buff)
parsed_record = ArcWarcRecordLoader().parse_record_stream(buff)
assert parsed_record.rec_headers.get_header('WARC-Type') == 'warcinfo'
assert parsed_record.rec_headers.get_header('WARC-Filename') == 'testfile.warc.gz'
buff = parsed_record.stream.read().decode('utf-8')
length = parsed_record.rec_headers.get_header('Content-Length')
assert len(buff) == int(length)
assert 'json-metadata: {"foo": "bar"}\r\n' in buff
assert 'format: WARC File Format 1.0\r\n' in buff
assert 'json-metadata: {"foo": "bar"}\r\n' in buff

View File

@ -13,12 +13,15 @@ import traceback
from collections import OrderedDict
from socket import gethostname
from io import BytesIO
import fcntl
from pywb.utils.loaders import LimitReader, to_native_str
from pywb.utils.bufferedreaders import BufferedReader
from pywb.utils.timeutils import timestamp20_now
from pywb.utils.timeutils import timestamp20_now, datetime_to_iso_date
from pywb.warc.recordloader import ArcWarcRecord
from webagg.utils import ParamFormatter, res_template
@ -110,6 +113,25 @@ class BaseWARCWriter(object):
params['_formatter'] = ParamFormatter(params, name=self.rec_source_name)
self._do_write_req_resp(req, resp, params)
def create_warcinfo_record(self, filename, **kwargs):
headers = {}
headers['WARC-Record_ID'] = self._make_warc_id()
headers['WARC-Type'] = 'warcinfo'
if filename:
headers['WARC-Filename'] = filename
headers['WARC-Date'] = datetime_to_iso_date(datetime.datetime.utcnow())
warcinfo = BytesIO()
for n, v in six.iteritems(kwargs):
self._header(warcinfo, n, v)
warcinfo.seek(0)
record = ArcWarcRecord('warc', 'warcinfo', headers, warcinfo,
None, '', len(warcinfo.getbuffer()))
return record
def _check_revisit(self, record, params):
if not self.dedup_index:
return record
@ -159,7 +181,9 @@ class BaseWARCWriter(object):
http_headers_only = False
if record.length:
actual_len = len(record.status_headers.headers_buff)
actual_len = 0
if record.status_headers:
actual_len = len(record.status_headers.headers_buff)
if not http_headers_only:
diff = record.stream.tell() - actual_len
@ -170,8 +194,9 @@ class BaseWARCWriter(object):
# add empty line
self._line(out, b'')
# write headers and buffer
out.write(record.status_headers.headers_buff)
# write headers buffer, if any
if record.status_headers:
out.write(record.status_headers.headers_buff)
if not http_headers_only:
out.write(record.stream.read())
@ -351,3 +376,26 @@ class PerRecordWARCWriter(MultiFileWARCWriter):
kwargs['max_size'] = 1
super(PerRecordWARCWriter, self).__init__(*args, **kwargs)
# ============================================================================
class SimpleTempWARCWriter(BaseWARCWriter):
def __init__(self, *args, **kwargs):
super(SimpleTempWARCWriter, self).__init__(*args, **kwargs)
self.out = self._create_buffer()
def _create_buffer(self):
return tempfile.SpooledTemporaryFile(max_size=512*1024)
def _do_write_req_resp(self, req, resp, params):
self._write_warc_record(self.out, resp)
self._write_warc_record(self.out, req)
def write_record(self, record):
self._write_warc_record(self.out, record)
def get_buffer(self):
pos = self.out.tell()
self.out.seek(0)
buff = self.out.read()
self.out.seek(pos)
return buff