1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Recorder Pending count (#637)

* recorder: add a pending counter (in Redis) when using the Redis-based dedup system; supports webrecorder/browsertrix#44
This commit is contained in:
Ilya Kreymer 2021-04-28 16:10:39 -07:00 committed by GitHub
parent 626da99899
commit abb76911f5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 42 additions and 4 deletions

View File

@ -11,7 +11,7 @@ from wsgiprox.wsgiprox import WSGIProxMiddleware
from pywb.recorder.multifilewarcwriter import MultiFileWARCWriter
from pywb.recorder.recorderapp import RecorderApp
from pywb.recorder.filters import SkipDupePolicy, WriteDupePolicy, WriteRevisitDupePolicy
from pywb.recorder.redisindexer import WritableRedisIndexer
from pywb.recorder.redisindexer import WritableRedisIndexer, RedisPendingCounterTempBuffer
from pywb.utils.loaders import load_yaml_config
from pywb.utils.geventserver import GeventServer
@ -244,8 +244,16 @@ class FrontEndApp(object):
dedup_index=dedup_index,
dedup_by_url=dedup_by_url)
if dedup_policy:
pending_counter = self.warcserver.dedup_index_url.replace(':cdxj', ':pending')
pending_timeout = recorder_config.get('pending_timeout', 30)
create_buff_func = lambda params, name: RedisPendingCounterTempBuffer(512 * 1024, pending_counter, params, name, pending_timeout)
else:
create_buff_func = None
self.recorder = RecorderApp(self.RECORD_SERVER % str(self.warcserver_server.port), warc_writer,
accept_colls=recorder_config.get('source_filter'))
accept_colls=recorder_config.get('source_filter'),
create_buff_func=create_buff_func)
recorder_server = GeventServer(self.recorder, port=0)

View File

@ -24,8 +24,7 @@ class RecorderApp(object):
self.rec_source_name = kwargs.get('name', 'recorder')
self.create_buff_func = kwargs.get('create_buff_func',
self.default_create_buffer)
self.create_buff_func = kwargs.get('create_buff_func') or self.default_create_buffer
self.write_queue = gevent.queue.Queue()
gevent.spawn(self._write_loop)

View File

@ -2,6 +2,7 @@ from warcio.timeutils import iso_date_to_timestamp
from io import BytesIO
import os
import tempfile
from pywb.utils.canonicalize import calc_search_range
from pywb.utils.format import res_template
@ -101,3 +102,29 @@ class WritableRedisIndexer(RedisIndexSource):
return res
return None
# ============================================================================
class RedisPendingCounterTempBuffer(tempfile.SpooledTemporaryFile):
    """Spooled temp buffer that tracks itself in a Redis "pending" counter.

    Creating the buffer increments the counter stored at the Redis key
    derived from ``redis_url``; closing it decrements the counter again.
    Every write (as well as creation and close) refreshes the key's expiry
    to ``timeout`` seconds.

    The counter is released at most once per buffer, so calling ``close()``
    repeatedly (for example an explicit close followed by a close during
    object finalization) cannot drive the count below its true value.
    """

    def __init__(self, max_size, redis_url, params, name, timeout=30):
        """Create the buffer and register it in the pending counter.

        :param max_size: forwarded to ``SpooledTemporaryFile`` as ``max_size``
        :param redis_url: redis url template, expanded with ``params`` via
            ``res_template`` before being parsed
        :param params: values substituted into ``redis_url``
        :param name: accepted for compatibility with the buffer factory
            signature; not used by this class
        :param timeout: expiry (seconds) applied to the counter key
        """
        redis_url = res_template(redis_url, params)
        super(RedisPendingCounterTempBuffer, self).__init__(max_size=max_size)

        # Initialised before any redis work so close() stays safe even if
        # the setup below raises part-way through.
        self._pending_counted = False

        # parse_redis_url is unpacked as (client, key) -- assumed from usage.
        self.redis, self.key = RedisIndexSource.parse_redis_url(redis_url)
        self.timeout = timeout

        self.redis.incrby(self.key, 1)
        self._pending_counted = True
        self.redis.expire(self.key, self.timeout)

    def write(self, buf):
        """Write ``buf`` to the buffer and refresh the counter key's expiry.

        :return: whatever the underlying ``SpooledTemporaryFile.write``
            returns (the number of bytes written)
        """
        written = super(RedisPendingCounterTempBuffer, self).write(buf)
        self.redis.expire(self.key, self.timeout)
        return written

    def close(self):
        """Close the buffer and release its slot in the pending counter.

        Errors raised while closing the underlying file are printed and
        swallowed (best effort); the counter is released regardless.
        """
        try:
            super(RedisPendingCounterTempBuffer, self).close()
        except Exception:
            # Local import: the module's import block may not provide it.
            import traceback
            traceback.print_exc()
        finally:
            self._release_pending()

    def _release_pending(self):
        """Decrement the pending counter once; later calls are no-ops."""
        if not getattr(self, '_pending_counted', False):
            return

        # Flip the flag first so a failure in redis cannot cause a second
        # decrement on a later close().
        self._pending_counted = False
        self.redis.incrby(self.key, -1)
        self.redis.expire(self.key, self.timeout)

View File

@ -50,3 +50,7 @@ class TestRecordDedup(HttpBinLiveTests, CollsDirMixin, BaseConfigTest, FakeRedis
# ensure only one response/request pair written
assert records == ['response', 'request']
def test_redis_pending_count(self):
    """The pending counter key in Redis should hold zero."""
    assert self.redis.get("pywb:test-dedup:pending") == b'0'