From d40edfc22db1dd107d584ddf8d6e9ac7532e08ad Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 3 Apr 2016 12:19:54 -0700 Subject: [PATCH] warcwriter: add create_warcinfo_record() for creating a warcinfo and a SimpleTempWARCWriter for writing records to temp buff/file --- recorder/test/test_recorder.py | 36 ++++++++++++++++++---- recorder/warcwriter.py | 56 +++++++++++++++++++++++++++++++--- 2 files changed, 82 insertions(+), 10 deletions(-) diff --git a/recorder/test/test_recorder.py b/recorder/test/test_recorder.py index d763953f..9a07075f 100644 --- a/recorder/test/test_recorder.py +++ b/recorder/test/test_recorder.py @@ -12,7 +12,7 @@ from fakeredis import FakeStrictRedis from recorder.recorderapp import RecorderApp from recorder.redisindexer import WritableRedisIndexer -from recorder.warcwriter import PerRecordWARCWriter, MultiFileWARCWriter +from recorder.warcwriter import PerRecordWARCWriter, MultiFileWARCWriter, SimpleTempWARCWriter from recorder.filters import ExcludeSpecificHeaders from recorder.filters import SkipDupePolicy, WriteDupePolicy, WriteRevisitDupePolicy @@ -27,6 +27,7 @@ from pywb.warc.cdxindexer import write_cdx_index from six.moves.urllib.parse import quote, unquote from io import BytesIO import time +import json general_req_data = "\ GET {path} HTTP/1.1\r\n\ @@ -180,7 +181,6 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass) self._test_all_warcs('/warcs/', 2) - #@patch('redis.StrictRedis', FakeStrictRedis) def test_record_param_user_coll(self): warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/') @@ -216,7 +216,6 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass) assert warcs == {cdx['filename'].encode('utf-8'): full_path.encode('utf-8')} - #@patch('redis.StrictRedis', FakeStrictRedis) def test_record_param_user_coll_revisit(self): warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/') @@ -263,7 +262,6 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass) assert status_headers.get_header('WARC-Refers-To-Target-URI') == 'http://httpbin.org/get?foo=bar' assert status_headers.get_header('WARC-Refers-To-Date') != '' - #@patch('redis.StrictRedis', FakeStrictRedis) def test_record_param_user_coll_skip(self): warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/') @@ -288,7 +286,6 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass) res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,') assert len(res) == 2 - #@patch('redis.StrictRedis', FakeStrictRedis) def test_record_param_user_coll_write_dupe_no_revisit(self): warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/') @@ -329,7 +326,6 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass) assert os.path.isfile(path) assert len(writer.fh_cache) == 1 - #@patch('redis.StrictRedis', FakeStrictRedis) def test_record_multiple_writes_keep_open(self): warc_path = to_path(self.root_dir + '/warcs/FOO/ABC-{hostname}-{timestamp}.warc.gz') @@ -397,3 +393,31 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass) warcs = r.hgetall('FOO:warc') assert len(warcs) == 2 + + def test_warcinfo_record(self): + simplewriter = SimpleTempWARCWriter(gzip=False) + params = {'software': 'recorder test', + 'format': 'WARC File Format 1.0', + 'json-metadata': json.dumps({'foo': 'bar'})} + + record = simplewriter.create_warcinfo_record('testfile.warc.gz', **params) + simplewriter.write_record(record) + buff = simplewriter.get_buffer() + assert isinstance(buff, bytes) + + buff = BytesIO(buff) + parsed_record = ArcWarcRecordLoader().parse_record_stream(buff) + + assert parsed_record.rec_headers.get_header('WARC-Type') == 'warcinfo' + assert parsed_record.rec_headers.get_header('WARC-Filename') == 'testfile.warc.gz' + + buff = parsed_record.stream.read().decode('utf-8') + + length = parsed_record.rec_headers.get_header('Content-Length') + + assert len(buff) == int(length) + + assert 'json-metadata: {"foo": "bar"}\r\n' in buff + assert 'format: WARC File Format 1.0\r\n' in buff + assert 'json-metadata: {"foo": "bar"}\r\n' in buff + diff --git a/recorder/warcwriter.py b/recorder/warcwriter.py index db88e45e..92ba5bce 100644 --- a/recorder/warcwriter.py +++ b/recorder/warcwriter.py @@ -13,12 +13,15 @@ import traceback from collections import OrderedDict from socket import gethostname +from io import BytesIO import fcntl from pywb.utils.loaders import LimitReader, to_native_str from pywb.utils.bufferedreaders import BufferedReader -from pywb.utils.timeutils import timestamp20_now +from pywb.utils.timeutils import timestamp20_now, datetime_to_iso_date + +from pywb.warc.recordloader import ArcWarcRecord from webagg.utils import ParamFormatter, res_template @@ -110,6 +113,25 @@ class BaseWARCWriter(object): params['_formatter'] = ParamFormatter(params, name=self.rec_source_name) self._do_write_req_resp(req, resp, params) + def create_warcinfo_record(self, filename, **kwargs): + headers = {} + headers['WARC-Record_ID'] = self._make_warc_id() + headers['WARC-Type'] = 'warcinfo' + if filename: + headers['WARC-Filename'] = filename + headers['WARC-Date'] = datetime_to_iso_date(datetime.datetime.utcnow()) + + warcinfo = BytesIO() + for n, v in six.iteritems(kwargs): + self._header(warcinfo, n, v) + + warcinfo.seek(0) + + record = ArcWarcRecord('warc', 'warcinfo', headers, warcinfo, + None, '', len(warcinfo.getbuffer())) + + return record + def _check_revisit(self, record, params): if not self.dedup_index: return record @@ -159,7 +181,9 @@ class BaseWARCWriter(object): http_headers_only = False if record.length: - actual_len = len(record.status_headers.headers_buff) + actual_len = 0 + if record.status_headers: + actual_len = len(record.status_headers.headers_buff) if not http_headers_only: diff = record.stream.tell() - actual_len @@ -170,8 +194,9 @@ class BaseWARCWriter(object): # add empty line self._line(out, b'') - # write headers and buffer - out.write(record.status_headers.headers_buff) + # write headers buffer, if any + if record.status_headers: + out.write(record.status_headers.headers_buff) if not http_headers_only: out.write(record.stream.read()) @@ -351,3 +376,26 @@ class PerRecordWARCWriter(MultiFileWARCWriter): kwargs['max_size'] = 1 super(PerRecordWARCWriter, self).__init__(*args, **kwargs) + +# ============================================================================ +class SimpleTempWARCWriter(BaseWARCWriter): + def __init__(self, *args, **kwargs): + super(SimpleTempWARCWriter, self).__init__(*args, **kwargs) + self.out = self._create_buffer() + + def _create_buffer(self): + return tempfile.SpooledTemporaryFile(max_size=512*1024) + + def _do_write_req_resp(self, req, resp, params): + self._write_warc_record(self.out, resp) + self._write_warc_record(self.out, req) + + def write_record(self, record): + self._write_warc_record(self.out, record) + + def get_buffer(self): + pos = self.out.tell() + self.out.seek(0) + buff = self.out.read() + self.out.seek(pos) + return buff