1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

recorder: add tests for single file writer, including file locking

dedup policy: support customizable dedup/skip/write policy plugins and add tests
This commit is contained in:
Ilya Kreymer 2016-03-18 15:28:24 -07:00
parent cba8e4ee3a
commit b64be0dff1
5 changed files with 229 additions and 54 deletions

View File

@ -1,5 +1,8 @@
from pywb.utils.timeutils import timestamp_to_datetime, datetime_to_iso_date
# ============================================================================
# Header Exclusions
# ============================================================================ # ============================================================================
class ExcludeNone(object): class ExcludeNone(object):
def __call__(self, record): def __call__(self, record):
@ -15,3 +18,23 @@ class ExcludeSpecificHeaders(object):
return self.exclude_headers return self.exclude_headers
# ============================================================================
# Revisit Policy
# ============================================================================
class WriteRevisitDupePolicy(object):
def __call__(self, cdx):
dt = timestamp_to_datetime(cdx['timestamp'])
return ('revisit', cdx['url'], datetime_to_iso_date(dt))
# ============================================================================
class SkipDupePolicy(object):
def __call__(self, cdx):
return 'skip'
# ============================================================================
class WriteDupePolicy(object):
def __call__(self, cdx):
return 'write'

View File

@ -39,7 +39,10 @@ class RecorderApp(object):
def _write_loop(self): def _write_loop(self):
while True: while True:
self._write_one() try:
self._write_one()
except:
traceback.print_exc()
def _write_one(self): def _write_one(self):
req = None req = None
@ -56,8 +59,6 @@ class RecorderApp(object):
resp = self._create_resp_record(resp_head, resp_pay, 'response') resp = self._create_resp_record(resp_head, resp_pay, 'response')
self.writer.write_req_resp(req, resp, params) self.writer.write_req_resp(req, resp, params)
except:
traceback.print_exc()
finally: finally:
try: try:

View File

@ -1,8 +1,7 @@
from pywb.utils.canonicalize import calc_search_range from pywb.utils.canonicalize import calc_search_range
from pywb.cdx.cdxobject import CDXObject from pywb.cdx.cdxobject import CDXObject
from pywb.warc.cdxindexer import write_cdx_index from pywb.warc.cdxindexer import write_cdx_index
from pywb.utils.timeutils import timestamp_to_datetime from pywb.utils.timeutils import iso_date_to_timestamp
from pywb.utils.timeutils import datetime_to_iso_date, iso_date_to_timestamp
from io import BytesIO from io import BytesIO
import os import os
@ -11,24 +10,25 @@ from webagg.indexsource import RedisIndexSource
from webagg.aggregator import SimpleAggregator from webagg.aggregator import SimpleAggregator
from webagg.utils import res_template from webagg.utils import res_template
from recorder.filters import WriteRevisitDupePolicy
#============================================================================== #==============================================================================
class WritableRedisIndexer(RedisIndexSource): class WritableRedisIndexer(RedisIndexSource):
def __init__(self, redis_url, rel_path_template='', name='recorder'): def __init__(self, redis_url, rel_path_template='', name='recorder',
dupe_policy=WriteRevisitDupePolicy()):
super(WritableRedisIndexer, self).__init__(redis_url) super(WritableRedisIndexer, self).__init__(redis_url)
self.cdx_lookup = SimpleAggregator({name: self}) self.cdx_lookup = SimpleAggregator({name: self})
self.rel_path_template = rel_path_template self.rel_path_template = rel_path_template
self.dupe_policy = dupe_policy
def add_record(self, stream, params, filename=None): def index_records(self, stream, params, filename=None):
if not filename and hasattr(stream, 'name'):
filename = stream.name
rel_path = res_template(self.rel_path_template, params) rel_path = res_template(self.rel_path_template, params)
filename = os.path.relpath(filename, rel_path) filename = os.path.relpath(filename, rel_path)
cdxout = BytesIO() cdxout = BytesIO()
write_cdx_index(cdxout, stream, filename, write_cdx_index(cdxout, stream, filename,
cdxj=True, append_post=True, rel_root=rel_path) cdxj=True, append_post=True)
z_key = res_template(self.redis_key_template, params) z_key = res_template(self.redis_key_template, params)
@ -55,8 +55,8 @@ class WritableRedisIndexer(RedisIndexSource):
cdx_iter, errs = self.cdx_lookup(params) cdx_iter, errs = self.cdx_lookup(params)
for cdx in cdx_iter: for cdx in cdx_iter:
dt = timestamp_to_datetime(cdx['timestamp']) res = self.dupe_policy(cdx)
return ('revisit', cdx['url'], if res:
datetime_to_iso_date(dt)) return res
return None return None

View File

@ -8,11 +8,12 @@ import webtest
from fakeredis import FakeStrictRedis from fakeredis import FakeStrictRedis
from mock import patch from mock import patch
from pytest import raises
from recorder.recorderapp import RecorderApp from recorder.recorderapp import RecorderApp
from recorder.redisindexer import WritableRedisIndexer from recorder.redisindexer import WritableRedisIndexer
from recorder.warcrecorder import PerRecordWARCRecorder from recorder.warcrecorder import PerRecordWARCRecorder, SingleFileWARCRecorder
from recorder.filters import ExcludeSpecificHeaders from recorder.filters import ExcludeSpecificHeaders, SkipDupePolicy, WriteDupePolicy
from webagg.utils import MementoUtils from webagg.utils import MementoUtils
@ -20,9 +21,11 @@ from pywb.cdx.cdxobject import CDXObject
from pywb.utils.statusandheaders import StatusAndHeadersParser from pywb.utils.statusandheaders import StatusAndHeadersParser
from pywb.utils.bufferedreaders import DecompressingBufferedReader from pywb.utils.bufferedreaders import DecompressingBufferedReader
from pywb.warc.recordloader import ArcWarcRecordLoader from pywb.warc.recordloader import ArcWarcRecordLoader
from pywb.warc.cdxindexer import write_cdx_index
from six.moves.urllib.parse import quote, unquote from six.moves.urllib.parse import quote, unquote
from io import BytesIO from io import BytesIO
import time
general_req_data = "\ general_req_data = "\
GET {path} HTTP/1.1\r\n\ GET {path} HTTP/1.1\r\n\
@ -45,7 +48,7 @@ class TestRecorder(LiveServerTests, TempDirTests, BaseTestClass):
cls.upstream_url = 'http://localhost:{0}'.format(cls.server.port) cls.upstream_url = 'http://localhost:{0}'.format(cls.server.port)
def _test_per_warc(self, recorder_app, host, path, other_params=''): def _test_warc_write(self, recorder_app, host, path, other_params=''):
url = 'http://' + host + path url = 'http://' + host + path
req_url = '/live/resource/postreq?url=' + url + other_params req_url = '/live/resource/postreq?url=' + url + other_params
testapp = webtest.TestApp(recorder_app) testapp = webtest.TestApp(recorder_app)
@ -72,7 +75,7 @@ class TestRecorder(LiveServerTests, TempDirTests, BaseTestClass):
recorder_app = RecorderApp(self.upstream_url, recorder_app = RecorderApp(self.upstream_url,
PerRecordWARCRecorder(to_path(self.root_dir + '/warcs/'))) PerRecordWARCRecorder(to_path(self.root_dir + '/warcs/')))
resp = self._test_per_warc(recorder_app, 'httpbin.org', '/get?foo=bar') resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar')
assert b'HTTP/1.1 200 OK' in resp.body assert b'HTTP/1.1 200 OK' in resp.body
assert b'"foo": "bar"' in resp.body assert b'"foo": "bar"' in resp.body
@ -82,7 +85,7 @@ class TestRecorder(LiveServerTests, TempDirTests, BaseTestClass):
recorder_app = RecorderApp(self.upstream_url, recorder_app = RecorderApp(self.upstream_url,
PerRecordWARCRecorder(to_path(self.root_dir + '/warcs/')), accept_colls='live') PerRecordWARCRecorder(to_path(self.root_dir + '/warcs/')), accept_colls='live')
resp = self._test_per_warc(recorder_app, 'httpbin.org', '/get?foo=bar') resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar')
assert b'HTTP/1.1 200 OK' in resp.body assert b'HTTP/1.1 200 OK' in resp.body
assert b'"foo": "bar"' in resp.body assert b'"foo": "bar"' in resp.body
@ -105,7 +108,7 @@ class TestRecorder(LiveServerTests, TempDirTests, BaseTestClass):
recorder_app = RecorderApp(self.upstream_url, recorder_app = RecorderApp(self.upstream_url,
PerRecordWARCRecorder(base_path), accept_colls='live') PerRecordWARCRecorder(base_path), accept_colls='live')
resp = self._test_per_warc(recorder_app, 'httpbin.org', '/cookies/set%3Fname%3Dvalue%26foo%3Dbar') resp = self._test_warc_write(recorder_app, 'httpbin.org', '/cookies/set%3Fname%3Dvalue%26foo%3Dbar')
assert b'HTTP/1.1 302' in resp.body assert b'HTTP/1.1 302' in resp.body
buff = BytesIO(resp.body) buff = BytesIO(resp.body)
@ -134,7 +137,7 @@ class TestRecorder(LiveServerTests, TempDirTests, BaseTestClass):
PerRecordWARCRecorder(base_path, header_filter=header_filter), PerRecordWARCRecorder(base_path, header_filter=header_filter),
accept_colls='live') accept_colls='live')
resp = self._test_per_warc(recorder_app, 'httpbin.org', '/cookies/set%3Fname%3Dvalue%26foo%3Dbar') resp = self._test_warc_write(recorder_app, 'httpbin.org', '/cookies/set%3Fname%3Dvalue%26foo%3Dbar')
assert b'HTTP/1.1 302' in resp.body assert b'HTTP/1.1 302' in resp.body
buff = BytesIO(resp.body) buff = BytesIO(resp.body)
@ -161,7 +164,7 @@ class TestRecorder(LiveServerTests, TempDirTests, BaseTestClass):
recorder_app = RecorderApp(self.upstream_url, recorder_app = RecorderApp(self.upstream_url,
writer=PerRecordWARCRecorder(to_path(self.root_dir + '/warcs/')), accept_colls='not-live') writer=PerRecordWARCRecorder(to_path(self.root_dir + '/warcs/')), accept_colls='not-live')
resp = self._test_per_warc(recorder_app, 'httpbin.org', '/get?foo=bar') resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar')
assert b'HTTP/1.1 200 OK' in resp.body assert b'HTTP/1.1 200 OK' in resp.body
assert b'"foo": "bar"' in resp.body assert b'"foo": "bar"' in resp.body
@ -181,7 +184,7 @@ class TestRecorder(LiveServerTests, TempDirTests, BaseTestClass):
self._test_all_warcs('/warcs/', 4) self._test_all_warcs('/warcs/', 4)
resp = self._test_per_warc(recorder_app, 'httpbin.org', resp = self._test_warc_write(recorder_app, 'httpbin.org',
'/get?foo=bar', '&param.recorder.user=USER&param.recorder.coll=COLL') '/get?foo=bar', '&param.recorder.user=USER&param.recorder.coll=COLL')
assert b'HTTP/1.1 200 OK' in resp.body assert b'HTTP/1.1 200 OK' in resp.body
assert b'"foo": "bar"' in resp.body assert b'"foo": "bar"' in resp.body
@ -190,7 +193,7 @@ class TestRecorder(LiveServerTests, TempDirTests, BaseTestClass):
r = FakeStrictRedis.from_url('redis://localhost/2') r = FakeStrictRedis.from_url('redis://localhost/2')
res = r.zrange('USER:COLL:cdxj', 0, -1) res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
assert len(res) == 1 assert len(res) == 1
cdx = CDXObject(res[0]) cdx = CDXObject(res[0])
@ -214,7 +217,7 @@ class TestRecorder(LiveServerTests, TempDirTests, BaseTestClass):
self._test_all_warcs('/warcs/', 4) self._test_all_warcs('/warcs/', 4)
resp = self._test_per_warc(recorder_app, 'httpbin.org', resp = self._test_warc_write(recorder_app, 'httpbin.org',
'/get?foo=bar', '&param.recorder.user=USER&param.recorder.coll=COLL') '/get?foo=bar', '&param.recorder.user=USER&param.recorder.coll=COLL')
assert b'HTTP/1.1 200 OK' in resp.body assert b'HTTP/1.1 200 OK' in resp.body
assert b'"foo": "bar"' in resp.body assert b'"foo": "bar"' in resp.body
@ -224,7 +227,7 @@ class TestRecorder(LiveServerTests, TempDirTests, BaseTestClass):
# Test Redis CDX # Test Redis CDX
r = FakeStrictRedis.from_url('redis://localhost/2') r = FakeStrictRedis.from_url('redis://localhost/2')
res = r.zrange('USER:COLL:cdxj', 0, -1) res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
assert len(res) == 2 assert len(res) == 2
cdx = CDXObject(res[1]) cdx = CDXObject(res[1])
@ -246,5 +249,129 @@ class TestRecorder(LiveServerTests, TempDirTests, BaseTestClass):
assert status_headers.get_header('WARC-Refers-To-Target-URI') == 'http://httpbin.org/get?foo=bar' assert status_headers.get_header('WARC-Refers-To-Target-URI') == 'http://httpbin.org/get?foo=bar'
assert status_headers.get_header('WARC-Refers-To-Date') != '' assert status_headers.get_header('WARC-Refers-To-Date') != ''
@patch('redis.StrictRedis', FakeStrictRedis)
def test_record_param_user_coll_skip(self):
warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')
dedup_index = WritableRedisIndexer('redis://localhost/2/{user}:{coll}:cdxj',
rel_path_template=self.root_dir + '/warcs/',
dupe_policy=SkipDupePolicy())
recorder_app = RecorderApp(self.upstream_url,
PerRecordWARCRecorder(warc_path, dedup_index=dedup_index))
# No new entries written
self._test_all_warcs('/warcs/', 4)
resp = self._test_warc_write(recorder_app, 'httpbin.org',
'/get?foo=bar', '&param.recorder.user=USER&param.recorder.coll=COLL')
assert b'HTTP/1.1 200 OK' in resp.body
assert b'"foo": "bar"' in resp.body
self._test_all_warcs('/warcs/USER/COLL/', 4)
# Test Redis CDX
r = FakeStrictRedis.from_url('redis://localhost/2')
res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
assert len(res) == 2
@patch('redis.StrictRedis', FakeStrictRedis)
def test_record_param_user_coll_write_dupe_no_revisit(self):
warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')
dedup_index = WritableRedisIndexer('redis://localhost/2/{user}:{coll}:cdxj',
rel_path_template=self.root_dir + '/warcs/',
dupe_policy=WriteDupePolicy())
recorder_app = RecorderApp(self.upstream_url,
PerRecordWARCRecorder(warc_path, dedup_index=dedup_index))
resp = self._test_warc_write(recorder_app, 'httpbin.org',
'/get?foo=bar', '&param.recorder.user=USER&param.recorder.coll=COLL')
assert b'HTTP/1.1 200 OK' in resp.body
assert b'"foo": "bar"' in resp.body
self._test_all_warcs('/warcs/USER/COLL/', 6)
r = FakeStrictRedis.from_url('redis://localhost/2')
res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
assert len(res) == 3
mimes = [CDXObject(x)['mime'] for x in res]
assert sorted(mimes) == ['application/json', 'application/json', 'warc/revisit']
# Single File
def test_record_single_file_warc_1(self):
path = to_path(self.root_dir + '/warcs/A.warc.gz')
recorder_app = RecorderApp(self.upstream_url,
SingleFileWARCRecorder(path))
resp = self._test_warc_write(recorder_app, 'httpbin.org', '/get?foo=bar')
assert b'HTTP/1.1 200 OK' in resp.body
assert b'"foo": "bar"' in resp.body
assert os.path.isfile(path)
@patch('redis.StrictRedis', FakeStrictRedis)
def test_record_single_file_multiple_writes(self):
warc_path = to_path(self.root_dir + '/warcs/FOO/rec-test.warc.gz')
rel_path = self.root_dir + '/warcs/'
dedup_index = WritableRedisIndexer('redis://localhost/2/{coll}:cdxj',
rel_path_template=rel_path)
writer = SingleFileWARCRecorder(warc_path, dedup_index=dedup_index)
recorder_app = RecorderApp(self.upstream_url, writer)
# First Record
resp = self._test_warc_write(recorder_app, 'httpbin.org',
'/get?foo=bar', '&param.recorder.coll=FOO')
assert b'HTTP/1.1 200 OK' in resp.body
assert b'"foo": "bar"' in resp.body
# Second Record
resp = self._test_warc_write(recorder_app, 'httpbin.org',
'/get?boo=far', '&param.recorder.coll=FOO')
assert b'HTTP/1.1 200 OK' in resp.body
assert b'"boo": "far"' in resp.body
self._test_all_warcs('/warcs/FOO/', 1)
r = FakeStrictRedis.from_url('redis://localhost/2')
res = r.zrangebylex('FOO:cdxj', '[org,httpbin)/', '(org,httpbin,')
assert len(res) == 2
assert os.path.isfile(warc_path)
cdxout = BytesIO()
with open(warc_path, 'rb') as fh:
filename = os.path.relpath(warc_path, rel_path)
write_cdx_index(cdxout, fh, filename,
cdxj=True, append_post=True, sort=True)
res = [CDXObject(x) for x in res]
cdxres = cdxout.getvalue().strip()
cdxres = cdxres.split(b'\n')
cdxres = [CDXObject(x) for x in cdxres]
assert cdxres == res
writer.close()
with raises(OSError):
resp = self._test_warc_write(recorder_app, 'httpbin.org',
'/get?boo=far', '&param.recorder.coll=FOO')

View File

@ -8,12 +8,18 @@ import sys
import os import os
import six import six
import traceback import traceback
from collections import OrderedDict from collections import OrderedDict
from socket import gethostname
import fcntl
from pywb.utils.loaders import LimitReader, to_native_str from pywb.utils.loaders import LimitReader, to_native_str
from pywb.utils.bufferedreaders import BufferedReader from pywb.utils.bufferedreaders import BufferedReader
from pywb.utils.timeutils import timestamp20_now
from webagg.utils import ParamFormatter from webagg.utils import ParamFormatter
@ -99,14 +105,15 @@ class BaseWARCRecorder(object):
print('Skipping due to dedup') print('Skipping due to dedup')
return return
self._do_write_req_resp(req, resp, params) formatter = ParamFormatter(params, name=self.rec_source_name)
self._do_write_req_resp(req, resp, params, formatter)
def _check_revisit(self, record, params): def _check_revisit(self, record, params):
if not self.dedup_index: if not self.dedup_index:
return record return record
try: try:
url = record.rec_headers.get('WARC-Target-URI') url = record.rec_headers.get('WARC-Target-Uri')
digest = record.rec_headers.get('WARC-Payload-Digest') digest = record.rec_headers.get('WARC-Payload-Digest')
iso_dt = record.rec_headers.get('WARC-Date') iso_dt = record.rec_headers.get('WARC-Date')
result = self.dedup_index.lookup_revisit(params, digest, url, iso_dt) result = self.dedup_index.lookup_revisit(params, digest, url, iso_dt)
@ -221,33 +228,47 @@ class Digester(object):
# ============================================================================ # ============================================================================
class SingleFileWARCRecorder(BaseWARCRecorder): class SingleFileWARCRecorder(BaseWARCRecorder):
def __init__(self, warcfilename, *args, **kwargs): def __init__(self, filename, *args, **kwargs):
super(SingleFileWARCRecorder, self).__init__(*args, **kwargs) super(SingleFileWARCRecorder, self).__init__(*args, **kwargs)
self.warcfilename = warcfilename self.filename = filename.format(timestamp=timestamp20_now(),
host=gethostname())
def _do_write_req_resp(self, req, resp, params): try:
print('Writing {0} to {1} '.format(url, self.warcfilename)) os.makedirs(os.path.dirname(self.filename))
with open(self.warcfilename, 'a+b') as out: except:
start = out.tell() pass
self._write_warc_record(out, resp) self._fh = open(self.filename, 'a+b')
self._write_warc_record(out, req)
out.flush() fcntl.flock(self._fh, fcntl.LOCK_EX | fcntl.LOCK_NB)
out.seek(start)
if self.dedup_index: def _do_write_req_resp(self, req, resp, params, formatter):
self.dedup_index.add_record(out, params, filename=self.warcfilename) url = resp.rec_headers.get('WARC-Target-Uri')
print('Writing {0} to {1} '.format(url, self.filename))
def add_user_record(self, url, content_type, data): out = self._fh
with open(self.warcfilename, 'a+b') as out: if not out:
start = out.tell() raise IOError('Already closed')
self._write_warc_metadata(out, url, content_type, data)
out.flush()
#out.seek(start) start = out.tell()
#if self.indexer:
# self.indexer.add_record(out, self.warcfilename) self._write_warc_record(out, resp)
self._write_warc_record(out, req)
out.flush()
out.seek(start)
if self.dedup_index:
self.dedup_index.index_records(out, params, filename=self.filename)
def close(self):
if not self._fh:
return None
fcntl.flock(self._fh, fcntl.LOCK_UN)
self._fh.close()
self._fh = None
# ============================================================================ # ============================================================================
@ -256,11 +277,10 @@ class PerRecordWARCRecorder(BaseWARCRecorder):
super(PerRecordWARCRecorder, self).__init__(*args, **kwargs) super(PerRecordWARCRecorder, self).__init__(*args, **kwargs)
self.warcdir = warcdir self.warcdir = warcdir
def _do_write_req_resp(self, req, resp, params): def _do_write_req_resp(self, req, resp, params, formatter):
resp_uuid = resp.rec_headers['WARC-Record-ID'].split(':')[-1].strip('<> ') resp_uuid = resp.rec_headers['WARC-Record-ID'].split(':')[-1].strip('<> ')
req_uuid = req.rec_headers['WARC-Record-ID'].split(':')[-1].strip('<> ') req_uuid = req.rec_headers['WARC-Record-ID'].split(':')[-1].strip('<> ')
formatter = ParamFormatter(params, name=self.rec_source_name)
full_dir = formatter.format(self.warcdir) full_dir = formatter.format(self.warcdir)
try: try:
@ -271,14 +291,18 @@ class PerRecordWARCRecorder(BaseWARCRecorder):
resp_filename = os.path.join(full_dir, resp_uuid + '.warc.gz') resp_filename = os.path.join(full_dir, resp_uuid + '.warc.gz')
req_filename = os.path.join(full_dir, req_uuid + '.warc.gz') req_filename = os.path.join(full_dir, req_uuid + '.warc.gz')
self._write_record(resp_filename, resp, params, True) url = resp.rec_headers.get('WARC-Target-Uri')
self._write_record(req_filename, req, params, False) print('Writing request for {0} to {1}'.format(url, req_filename))
print('Writing response for {0} to {1}'.format(url, resp_filename))
def _write_record(self, filename, rec, params, index=False): self._write_and_index(resp_filename, resp, params, True)
self._write_and_index(req_filename, req, params, False)
def _write_and_index(self, filename, rec, params, index=False):
with open(filename, 'w+b') as out: with open(filename, 'w+b') as out:
self._write_warc_record(out, rec) self._write_warc_record(out, rec)
if index and self.dedup_index: if index and self.dedup_index:
out.seek(0) out.seek(0)
self.dedup_index.add_record(out, params, filename=filename) self.dedup_index.index_records(out, params, filename=filename)