1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 08:04:49 +01:00

recorder: ensure filename is also tracked by the indexer, add tests

for redis file mapping
This commit is contained in:
Ilya Kreymer 2016-03-19 10:24:28 -07:00
parent 3452cf39e0
commit c96e419341
3 changed files with 53 additions and 24 deletions

View File

@ -15,13 +15,23 @@ from recorder.filters import WriteRevisitDupePolicy
#==============================================================================
class WritableRedisIndexer(RedisIndexSource):
def __init__(self, redis_url, rel_path_template='', name='recorder',
def __init__(self, redis_url, rel_path_template='',
file_key_template='', name='recorder',
dupe_policy=WriteRevisitDupePolicy()):
super(WritableRedisIndexer, self).__init__(redis_url)
self.cdx_lookup = SimpleAggregator({name: self})
self.rel_path_template = rel_path_template
self.file_key_template = file_key_template
self.dupe_policy = dupe_policy
def add_warc_file(self, full_filename, params):
rel_path = res_template(self.rel_path_template, params)
filename = os.path.relpath(full_filename, rel_path)
file_key = res_template(self.file_key_template, params)
self.redis.hset(file_key, filename, full_filename)
def index_records(self, stream, params, filename=None):
rel_path = res_template(self.rel_path_template, params)
filename = os.path.relpath(filename, rel_path)

View File

@ -13,7 +13,8 @@ from pytest import raises
from recorder.recorderapp import RecorderApp
from recorder.redisindexer import WritableRedisIndexer
from recorder.warcwriter import PerRecordWARCWriter, MultiFileWARCWriter
from recorder.filters import ExcludeSpecificHeaders, SkipDupePolicy, WriteDupePolicy
from recorder.filters import ExcludeSpecificHeaders
from recorder.filters import SkipDupePolicy, WriteDupePolicy, WriteRevisitDupePolicy
from webagg.utils import MementoUtils
@ -47,6 +48,13 @@ class TestRecorder(LiveServerTests, TempDirTests, BaseTestClass):
cls.upstream_url = 'http://localhost:{0}'.format(cls.server.port)
def _get_dedup_index(self, dupe_policy=WriteRevisitDupePolicy()):
dedup_index = WritableRedisIndexer('redis://localhost/2/{user}:{coll}:cdxj',
file_key_template='{user}:{coll}:warc',
rel_path_template=self.root_dir + '/warcs/',
dupe_policy=dupe_policy)
return dedup_index
def _test_warc_write(self, recorder_app, host, path, other_params=''):
url = 'http://' + host + path
@ -176,9 +184,7 @@ class TestRecorder(LiveServerTests, TempDirTests, BaseTestClass):
warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')
dedup_index = WritableRedisIndexer('redis://localhost/2/{user}:{coll}:cdxj',
rel_path_template=self.root_dir + '/warcs/')
dedup_index = self._get_dedup_index()
recorder_app = RecorderApp(self.upstream_url,
PerRecordWARCWriter(warc_path, dedup_index=dedup_index))
@ -204,14 +210,16 @@ class TestRecorder(LiveServerTests, TempDirTests, BaseTestClass):
assert cdx['filename'].startswith('USER/COLL/')
assert cdx['filename'].endswith('.warc.gz')
warcs = r.hgetall('USER:COLL:warc')
full_path = self.root_dir + '/warcs/' + cdx['filename']
assert warcs == {cdx['filename'].encode('utf-8'): full_path.encode('utf-8')}
@patch('redis.StrictRedis', FakeStrictRedis)
def test_record_param_user_coll_revisit(self):
warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')
dedup_index = WritableRedisIndexer('redis://localhost/2/{user}:{coll}:cdxj',
rel_path_template=self.root_dir + '/warcs/')
dedup_index = self._get_dedup_index()
recorder_app = RecorderApp(self.upstream_url,
PerRecordWARCWriter(warc_path, dedup_index=dedup_index))
@ -240,6 +248,10 @@ class TestRecorder(LiveServerTests, TempDirTests, BaseTestClass):
fullwarc = os.path.join(self.root_dir, 'warcs', cdx['filename'])
warcs = r.hgetall('USER:COLL:warc')
assert len(warcs) == 2
assert warcs[cdx['filename'].encode('utf-8')] == fullwarc.encode('utf-8')
with open(fullwarc, 'rb') as fh:
decomp = DecompressingBufferedReader(fh)
# Test refers-to headers
@ -254,10 +266,7 @@ class TestRecorder(LiveServerTests, TempDirTests, BaseTestClass):
def test_record_param_user_coll_skip(self):
warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')
dedup_index = WritableRedisIndexer('redis://localhost/2/{user}:{coll}:cdxj',
rel_path_template=self.root_dir + '/warcs/',
dupe_policy=SkipDupePolicy())
dedup_index = self._get_dedup_index(dupe_policy=SkipDupePolicy())
recorder_app = RecorderApp(self.upstream_url,
PerRecordWARCWriter(warc_path, dedup_index=dedup_index))
@ -283,10 +292,7 @@ class TestRecorder(LiveServerTests, TempDirTests, BaseTestClass):
warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')
dedup_index = WritableRedisIndexer('redis://localhost/2/{user}:{coll}:cdxj',
rel_path_template=self.root_dir + '/warcs/',
dupe_policy=WriteDupePolicy())
dedup_index = self._get_dedup_index(dupe_policy=WriteDupePolicy())
writer = PerRecordWARCWriter(warc_path, dedup_index=dedup_index)
recorder_app = RecorderApp(self.upstream_url, writer)
@ -329,8 +335,10 @@ class TestRecorder(LiveServerTests, TempDirTests, BaseTestClass):
rel_path = self.root_dir + '/warcs/'
dedup_index = WritableRedisIndexer('redis://localhost/2/{coll}:cdxj',
file_key_template='{coll}:warc',
rel_path_template=rel_path)
writer = MultiFileWARCWriter(warc_path, dedup_index=dedup_index)
recorder_app = RecorderApp(self.upstream_url, writer)
@ -382,3 +390,8 @@ class TestRecorder(LiveServerTests, TempDirTests, BaseTestClass):
resp = self._test_warc_write(recorder_app, 'httpbin.org',
'/get?boo=far', '&param.recorder.coll=FOO')
self._test_all_warcs('/warcs/FOO/', 2)
warcs = r.hgetall('FOO:warc')
assert len(warcs) == 2

View File

@ -20,7 +20,7 @@ from pywb.utils.loaders import LimitReader, to_native_str
from pywb.utils.bufferedreaders import BufferedReader
from pywb.utils.timeutils import timestamp20_now
from webagg.utils import ParamFormatter
from webagg.utils import ParamFormatter, res_template
from recorder.filters import ExcludeNone
@ -107,8 +107,8 @@ class BaseWARCWriter(object):
print('Skipping due to dedup')
return
formatter = ParamFormatter(params, name=self.rec_source_name)
self._do_write_req_resp(req, resp, params, formatter)
params['_formatter'] = ParamFormatter(params, name=self.rec_source_name)
self._do_write_req_resp(req, resp, params)
def _check_revisit(self, record, params):
if not self.dedup_index:
@ -248,18 +248,24 @@ class MultiFileWARCWriter(BaseWARCWriter):
self.fh_cache = {}
def _open_file(self, dir_):
def _open_file(self, dir_, params):
timestamp = timestamp20_now()
filename = dir_ + self.filename_template.format(hostname=self.hostname,
timestamp=timestamp)
path, name = os.path.split(filename)
try:
os.makedirs(os.path.dirname(filename))
os.makedirs(path)
except:
pass
fh = open(filename, 'a+b')
if self.dedup_index:
self.dedup_index.add_warc_file(filename, params)
return fh, filename
def _close_file(self, fh):
@ -272,8 +278,8 @@ class MultiFileWARCWriter(BaseWARCWriter):
out, filename = result
self._close_file(out)
def _do_write_req_resp(self, req, resp, params, formatter):
full_dir = formatter.format(self.dir_template)
def _do_write_req_resp(self, req, resp, params):
full_dir = res_template(self.dir_template, params)
result = self.fh_cache.get(full_dir)
@ -283,7 +289,7 @@ class MultiFileWARCWriter(BaseWARCWriter):
out, filename = result
is_new = False
else:
out, filename = self._open_file(full_dir)
out, filename = self._open_file(full_dir, params)
is_new = True
try: