1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

warcwriter: add create_warcinfo_record() for creating a warcinfo and a SimpleTempWARCWriter for writing records to temp buff/file

This commit is contained in:
Ilya Kreymer 2016-04-03 12:19:54 -07:00
parent fd76030cb3
commit d40edfc22d
2 changed files with 82 additions and 10 deletions

View File

@ -12,7 +12,7 @@ from fakeredis import FakeStrictRedis
from recorder.recorderapp import RecorderApp from recorder.recorderapp import RecorderApp
from recorder.redisindexer import WritableRedisIndexer from recorder.redisindexer import WritableRedisIndexer
from recorder.warcwriter import PerRecordWARCWriter, MultiFileWARCWriter from recorder.warcwriter import PerRecordWARCWriter, MultiFileWARCWriter, SimpleTempWARCWriter
from recorder.filters import ExcludeSpecificHeaders from recorder.filters import ExcludeSpecificHeaders
from recorder.filters import SkipDupePolicy, WriteDupePolicy, WriteRevisitDupePolicy from recorder.filters import SkipDupePolicy, WriteDupePolicy, WriteRevisitDupePolicy
@ -27,6 +27,7 @@ from pywb.warc.cdxindexer import write_cdx_index
from six.moves.urllib.parse import quote, unquote from six.moves.urllib.parse import quote, unquote
from io import BytesIO from io import BytesIO
import time import time
import json
general_req_data = "\ general_req_data = "\
GET {path} HTTP/1.1\r\n\ GET {path} HTTP/1.1\r\n\
@ -180,7 +181,6 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
self._test_all_warcs('/warcs/', 2) self._test_all_warcs('/warcs/', 2)
#@patch('redis.StrictRedis', FakeStrictRedis)
def test_record_param_user_coll(self): def test_record_param_user_coll(self):
warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/') warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')
@ -216,7 +216,6 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
assert warcs == {cdx['filename'].encode('utf-8'): full_path.encode('utf-8')} assert warcs == {cdx['filename'].encode('utf-8'): full_path.encode('utf-8')}
#@patch('redis.StrictRedis', FakeStrictRedis)
def test_record_param_user_coll_revisit(self): def test_record_param_user_coll_revisit(self):
warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/') warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')
@ -263,7 +262,6 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
assert status_headers.get_header('WARC-Refers-To-Target-URI') == 'http://httpbin.org/get?foo=bar' assert status_headers.get_header('WARC-Refers-To-Target-URI') == 'http://httpbin.org/get?foo=bar'
assert status_headers.get_header('WARC-Refers-To-Date') != '' assert status_headers.get_header('WARC-Refers-To-Date') != ''
#@patch('redis.StrictRedis', FakeStrictRedis)
def test_record_param_user_coll_skip(self): def test_record_param_user_coll_skip(self):
warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/') warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')
@ -288,7 +286,6 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,') res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
assert len(res) == 2 assert len(res) == 2
#@patch('redis.StrictRedis', FakeStrictRedis)
def test_record_param_user_coll_write_dupe_no_revisit(self): def test_record_param_user_coll_write_dupe_no_revisit(self):
warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/') warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')
@ -329,7 +326,6 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
assert os.path.isfile(path) assert os.path.isfile(path)
assert len(writer.fh_cache) == 1 assert len(writer.fh_cache) == 1
#@patch('redis.StrictRedis', FakeStrictRedis)
def test_record_multiple_writes_keep_open(self): def test_record_multiple_writes_keep_open(self):
warc_path = to_path(self.root_dir + '/warcs/FOO/ABC-{hostname}-{timestamp}.warc.gz') warc_path = to_path(self.root_dir + '/warcs/FOO/ABC-{hostname}-{timestamp}.warc.gz')
@ -397,3 +393,31 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
warcs = r.hgetall('FOO:warc') warcs = r.hgetall('FOO:warc')
assert len(warcs) == 2 assert len(warcs) == 2
def test_warcinfo_record(self):
simplewriter = SimpleTempWARCWriter(gzip=False)
params = {'software': 'recorder test',
'format': 'WARC File Format 1.0',
'json-metadata': json.dumps({'foo': 'bar'})}
record = simplewriter.create_warcinfo_record('testfile.warc.gz', **params)
simplewriter.write_record(record)
buff = simplewriter.get_buffer()
assert isinstance(buff, bytes)
buff = BytesIO(buff)
parsed_record = ArcWarcRecordLoader().parse_record_stream(buff)
assert parsed_record.rec_headers.get_header('WARC-Type') == 'warcinfo'
assert parsed_record.rec_headers.get_header('WARC-Filename') == 'testfile.warc.gz'
buff = parsed_record.stream.read().decode('utf-8')
length = parsed_record.rec_headers.get_header('Content-Length')
assert len(buff) == int(length)
assert 'json-metadata: {"foo": "bar"}\r\n' in buff
assert 'format: WARC File Format 1.0\r\n' in buff
assert 'json-metadata: {"foo": "bar"}\r\n' in buff

View File

@ -13,12 +13,15 @@ import traceback
from collections import OrderedDict from collections import OrderedDict
from socket import gethostname from socket import gethostname
from io import BytesIO
import fcntl import fcntl
from pywb.utils.loaders import LimitReader, to_native_str from pywb.utils.loaders import LimitReader, to_native_str
from pywb.utils.bufferedreaders import BufferedReader from pywb.utils.bufferedreaders import BufferedReader
from pywb.utils.timeutils import timestamp20_now from pywb.utils.timeutils import timestamp20_now, datetime_to_iso_date
from pywb.warc.recordloader import ArcWarcRecord
from webagg.utils import ParamFormatter, res_template from webagg.utils import ParamFormatter, res_template
@ -110,6 +113,25 @@ class BaseWARCWriter(object):
params['_formatter'] = ParamFormatter(params, name=self.rec_source_name) params['_formatter'] = ParamFormatter(params, name=self.rec_source_name)
self._do_write_req_resp(req, resp, params) self._do_write_req_resp(req, resp, params)
def create_warcinfo_record(self, filename, **kwargs):
headers = {}
headers['WARC-Record_ID'] = self._make_warc_id()
headers['WARC-Type'] = 'warcinfo'
if filename:
headers['WARC-Filename'] = filename
headers['WARC-Date'] = datetime_to_iso_date(datetime.datetime.utcnow())
warcinfo = BytesIO()
for n, v in six.iteritems(kwargs):
self._header(warcinfo, n, v)
warcinfo.seek(0)
record = ArcWarcRecord('warc', 'warcinfo', headers, warcinfo,
None, '', len(warcinfo.getbuffer()))
return record
def _check_revisit(self, record, params): def _check_revisit(self, record, params):
if not self.dedup_index: if not self.dedup_index:
return record return record
@ -159,7 +181,9 @@ class BaseWARCWriter(object):
http_headers_only = False http_headers_only = False
if record.length: if record.length:
actual_len = len(record.status_headers.headers_buff) actual_len = 0
if record.status_headers:
actual_len = len(record.status_headers.headers_buff)
if not http_headers_only: if not http_headers_only:
diff = record.stream.tell() - actual_len diff = record.stream.tell() - actual_len
@ -170,8 +194,9 @@ class BaseWARCWriter(object):
# add empty line # add empty line
self._line(out, b'') self._line(out, b'')
# write headers and buffer # write headers buffer, if any
out.write(record.status_headers.headers_buff) if record.status_headers:
out.write(record.status_headers.headers_buff)
if not http_headers_only: if not http_headers_only:
out.write(record.stream.read()) out.write(record.stream.read())
@ -351,3 +376,26 @@ class PerRecordWARCWriter(MultiFileWARCWriter):
kwargs['max_size'] = 1 kwargs['max_size'] = 1
super(PerRecordWARCWriter, self).__init__(*args, **kwargs) super(PerRecordWARCWriter, self).__init__(*args, **kwargs)
# ============================================================================
class SimpleTempWARCWriter(BaseWARCWriter):
def __init__(self, *args, **kwargs):
super(SimpleTempWARCWriter, self).__init__(*args, **kwargs)
self.out = self._create_buffer()
def _create_buffer(self):
return tempfile.SpooledTemporaryFile(max_size=512*1024)
def _do_write_req_resp(self, req, resp, params):
self._write_warc_record(self.out, resp)
self._write_warc_record(self.out, req)
def write_record(self, record):
self._write_warc_record(self.out, record)
def get_buffer(self):
pos = self.out.tell()
self.out.seek(0)
buff = self.out.read()
self.out.seek(pos)
return buff