1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

work on patching support!

This commit is contained in:
Ilya Kreymer 2020-08-10 08:10:42 -07:00
parent 54d8bccf4a
commit e3de917d47
6 changed files with 46 additions and 7 deletions

View File

@ -2,6 +2,8 @@
# ========================================
#
dedup_index: redis://localhost:6379/0/c:{coll}:cdxj
collections:
all: $all
pywb:

View File

@ -10,6 +10,8 @@ from wsgiprox.wsgiprox import WSGIProxMiddleware
from pywb.recorder.multifilewarcwriter import MultiFileWARCWriter
from pywb.recorder.recorderapp import RecorderApp
from pywb.recorder.redisindexer import WritableRedisIndexer
from pywb.recorder.filters import SkipDupePolicy, WriteRevisitDupePolicy, WriteDupePolicy
from pywb.utils.loaders import load_yaml_config
from pywb.utils.geventserver import GeventServer
@ -207,8 +209,25 @@ class FrontEndApp(object):
else:
recorder_coll = recorder_config['source_coll']
# TODO: support dedup
dedup_index = None
if self.warcserver.dedup_index:
policy = self.warcserver.config.get('dedup_policy')
if policy == 'skip':
dedup_policy = SkipDupePolicy()
elif policy == 'revisit':
dedup_policy = WriteRevisitDupePolicy()
elif policy == 'keep':
dedup_policy = WriteDupePolicy()
else:
dedup_policy = WriteRevisitDupePolicy()
print('Recorder Dedup: {0} policy via dedup index {1}'.format(policy, self.warcserver.dedup_index))
dedup_index = WritableRedisIndexer(
redis_url=self.warcserver.dedup_index,
rel_path_template=self.warcserver.archive_paths,
dupe_policy=dedup_policy)
warc_writer = MultiFileWARCWriter(self.warcserver.archive_paths,
max_size=int(recorder_config.get('rollover_size', 1000000000)),
max_idle_secs=int(recorder_config.get('rollover_idle_secs', 600)),
@ -393,8 +412,8 @@ class FrontEndApp(object):
:return: WbResponse containing the contents of the record/URL
:rtype: WbResponse
"""
if coll in self.warcserver.list_fixed_routes():
return WbResponse.text_response('Error: Can Not Record Into Custom Collection "{0}"'.format(coll))
#if coll in self.warcserver.list_fixed_routes():
# return WbResponse.text_response('Error: Can Not Record Into Custom Collection "{0}"'.format(coll))
return self.serve_content(environ, coll, url, record=True)
@ -625,8 +644,8 @@ class FrontEndApp(object):
if proxy_config.get('recording'):
logging.info('Proxy recording into collection "{0}"'.format(proxy_coll))
if proxy_coll in self.warcserver.list_fixed_routes():
raise Exception('Can not record into fixed collection')
#if proxy_coll in self.warcserver.list_fixed_routes():
#raise Exception('Can not record into fixed collection')
proxy_route = proxy_coll + self.RECORD_ROUTE
if not config.get('recorder'):

View File

@ -167,9 +167,11 @@ class SortedCDXWriter(BaseCDXWriter):
super(SortedCDXWriter, self).write(entry, filename)
line = self.out.getvalue()
if line:
insort(self.sortlist, line)
self.sortlist.append(line)
#insort(self.sortlist, line)
def __exit__(self, *exc_info):
    """Flush the buffered lines to the real output in sorted order.

    Sorts ``self.sortlist`` in place, writes all of its entries to
    ``self.actual_out`` as one concatenated string, and returns ``False``
    so any exception raised inside the ``with`` body is not suppressed.
    """
    buffered = self.sortlist
    buffered.sort()

    joined = ''.join(buffered)
    self.actual_out.write(joined)

    return False

View File

@ -195,6 +195,8 @@ class RecorderApp(object):
req_is_wrapped = False
print('PARAMS', params)
if not skipping:
req_stream = ReqWrapper(input_buff,
headers,

View File

@ -48,14 +48,18 @@ class WritableRedisIndexer(RedisIndexSource):
return base_name
def add_warc_file(self, full_filename, params):
    """Record a WARC file's load path in the Redis file hash.

    The hash key is produced by ``res_template`` from
    ``self.file_key_template`` and ``params``; when that key is falsy
    nothing is written. The hash field is the name returned by
    ``self._get_rel_or_base_name`` and the stored value is
    ``self.full_warc_prefix + full_filename``.

    :param full_filename: path of the WARC file to register
    :param params: values handed to the key template and the name helper
        (presumably the recording/collection params -- confirm with callers)
    :return: None
    """
    # Function-scope import keeps this block self-contained.
    import logging

    # Diagnostic output goes through logging rather than a bare print, so
    # library code does not write to stdout on every registered WARC file.
    logging.debug('add_warc_file params: %s', params)

    base_filename = self._get_rel_or_base_name(full_filename, params)

    file_key = res_template(self.file_key_template, params)
    if not file_key:
        return

    full_load_path = self.full_warc_prefix + full_filename
    self.redis.hset(file_key, base_filename, full_load_path)
def add_urls_to_index(self, stream, params, filename, length):
print('PARAMS U', params)
base_filename = self._get_rel_or_base_name(filename, params)
cdxout = BytesIO()
@ -64,6 +68,7 @@ class WritableRedisIndexer(RedisIndexSource):
writer_cls=params.get('writer_cls'))
z_key = res_template(self.redis_key_template, params)
print('KEY', z_key, self.redis_key_template, params)
cdx_list = cdxout.getvalue().rstrip().split(b'\n')

View File

@ -60,6 +60,7 @@ class WarcServer(BaseWarcServer):
super(WarcServer, self).__init__(debug=config.get('debug', False))
self.config = config
self.dedup_index = self.config.get('dedup_index')
self.root_dir = self.config.get('collections_root', '')
self.index_paths = self.init_paths('index_paths')
@ -113,7 +114,14 @@ class WarcServer(BaseWarcServer):
access_checker = AccessChecker(CacheDirectoryAccessSource(self.acl_paths),
self.default_access)
return DefaultResourceHandler(dir_source, self.archive_paths,
if self.dedup_index:
source = SimpleAggregator({'dedup': RedisMultiKeyIndexSource(self.dedup_index),
'dir': dir_source})
else:
source = dir_source
return DefaultResourceHandler(source, self.archive_paths,
rules_file=self.rules_file,
access_checker=access_checker)
@ -243,6 +251,7 @@ def init_index_source(value, source_list=None):
return source
else:
print(value)
raise Exception('Source config must be string or dict')
raise Exception('No Index Source Found for: ' + str(value))