mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
work on patching support!
This commit is contained in:
parent
54d8bccf4a
commit
e3de917d47
@ -2,6 +2,8 @@
|
||||
# ========================================
|
||||
#
|
||||
|
||||
dedup_index: redis://localhost:6379/0/c:{coll}:cdxj
|
||||
|
||||
collections:
|
||||
all: $all
|
||||
pywb:
|
||||
|
@ -10,6 +10,8 @@ from wsgiprox.wsgiprox import WSGIProxMiddleware
|
||||
|
||||
from pywb.recorder.multifilewarcwriter import MultiFileWARCWriter
|
||||
from pywb.recorder.recorderapp import RecorderApp
|
||||
from pywb.recorder.redisindexer import WritableRedisIndexer
|
||||
from pywb.recorder.filters import SkipDupePolicy, WriteRevisitDupePolicy, WriteDupePolicy
|
||||
|
||||
from pywb.utils.loaders import load_yaml_config
|
||||
from pywb.utils.geventserver import GeventServer
|
||||
@ -207,8 +209,25 @@ class FrontEndApp(object):
|
||||
else:
|
||||
recorder_coll = recorder_config['source_coll']
|
||||
|
||||
# TODO: support dedup
|
||||
dedup_index = None
|
||||
if self.warcserver.dedup_index:
|
||||
policy = self.warcserver.config.get('dedup_policy')
|
||||
if policy == 'skip':
|
||||
dedup_policy = SkipDupePolicy()
|
||||
elif policy == 'revisit':
|
||||
dedup_policy = WriteRevisitDupePolicy()
|
||||
elif policy == 'keep':
|
||||
dedup_policy = WriteDupePolicy()
|
||||
else:
|
||||
dedup_policy = WriteRevisitDupePolicy()
|
||||
|
||||
print('Recorder Dedup: {0} policy via dedup index {1}'.format(policy, self.warcserver.dedup_index))
|
||||
|
||||
dedup_index = WritableRedisIndexer(
|
||||
redis_url=self.warcserver.dedup_index,
|
||||
rel_path_template=self.warcserver.archive_paths,
|
||||
dupe_policy=dedup_policy)
|
||||
|
||||
warc_writer = MultiFileWARCWriter(self.warcserver.archive_paths,
|
||||
max_size=int(recorder_config.get('rollover_size', 1000000000)),
|
||||
max_idle_secs=int(recorder_config.get('rollover_idle_secs', 600)),
|
||||
@ -393,8 +412,8 @@ class FrontEndApp(object):
|
||||
:return: WbResponse containing the contents of the record/URL
|
||||
:rtype: WbResponse
|
||||
"""
|
||||
if coll in self.warcserver.list_fixed_routes():
|
||||
return WbResponse.text_response('Error: Can Not Record Into Custom Collection "{0}"'.format(coll))
|
||||
#if coll in self.warcserver.list_fixed_routes():
|
||||
# return WbResponse.text_response('Error: Can Not Record Into Custom Collection "{0}"'.format(coll))
|
||||
|
||||
return self.serve_content(environ, coll, url, record=True)
|
||||
|
||||
@ -625,8 +644,8 @@ class FrontEndApp(object):
|
||||
|
||||
if proxy_config.get('recording'):
|
||||
logging.info('Proxy recording into collection "{0}"'.format(proxy_coll))
|
||||
if proxy_coll in self.warcserver.list_fixed_routes():
|
||||
raise Exception('Can not record into fixed collection')
|
||||
#if proxy_coll in self.warcserver.list_fixed_routes():
|
||||
#raise Exception('Can not record into fixed collection')
|
||||
|
||||
proxy_route = proxy_coll + self.RECORD_ROUTE
|
||||
if not config.get('recorder'):
|
||||
|
@ -167,9 +167,11 @@ class SortedCDXWriter(BaseCDXWriter):
|
||||
super(SortedCDXWriter, self).write(entry, filename)
|
||||
line = self.out.getvalue()
|
||||
if line:
|
||||
insort(self.sortlist, line)
|
||||
self.sortlist.append(line)
|
||||
#insort(self.sortlist, line)
|
||||
|
||||
def __exit__(self, *args):
|
||||
self.sortlist.sort()
|
||||
self.actual_out.write(''.join(self.sortlist))
|
||||
return False
|
||||
|
||||
|
@ -195,6 +195,8 @@ class RecorderApp(object):
|
||||
|
||||
req_is_wrapped = False
|
||||
|
||||
print('PARAMS', params)
|
||||
|
||||
if not skipping:
|
||||
req_stream = ReqWrapper(input_buff,
|
||||
headers,
|
||||
|
@ -48,14 +48,18 @@ class WritableRedisIndexer(RedisIndexSource):
|
||||
return base_name
|
||||
|
||||
def add_warc_file(self, full_filename, params):
|
||||
print('PARAMS W', params)
|
||||
base_filename = self._get_rel_or_base_name(full_filename, params)
|
||||
file_key = res_template(self.file_key_template, params)
|
||||
if not file_key:
|
||||
return
|
||||
|
||||
full_load_path = self.full_warc_prefix + full_filename
|
||||
|
||||
self.redis.hset(file_key, base_filename, full_load_path)
|
||||
|
||||
def add_urls_to_index(self, stream, params, filename, length):
|
||||
print('PARAMS U', params)
|
||||
base_filename = self._get_rel_or_base_name(filename, params)
|
||||
|
||||
cdxout = BytesIO()
|
||||
@ -64,6 +68,7 @@ class WritableRedisIndexer(RedisIndexSource):
|
||||
writer_cls=params.get('writer_cls'))
|
||||
|
||||
z_key = res_template(self.redis_key_template, params)
|
||||
print('KEY', z_key, self.redis_key_template, params)
|
||||
|
||||
cdx_list = cdxout.getvalue().rstrip().split(b'\n')
|
||||
|
||||
|
@ -60,6 +60,7 @@ class WarcServer(BaseWarcServer):
|
||||
|
||||
super(WarcServer, self).__init__(debug=config.get('debug', False))
|
||||
self.config = config
|
||||
self.dedup_index = self.config.get('dedup_index')
|
||||
|
||||
self.root_dir = self.config.get('collections_root', '')
|
||||
self.index_paths = self.init_paths('index_paths')
|
||||
@ -113,7 +114,14 @@ class WarcServer(BaseWarcServer):
|
||||
access_checker = AccessChecker(CacheDirectoryAccessSource(self.acl_paths),
|
||||
self.default_access)
|
||||
|
||||
return DefaultResourceHandler(dir_source, self.archive_paths,
|
||||
if self.dedup_index:
|
||||
source = SimpleAggregator({'dedup': RedisMultiKeyIndexSource(self.dedup_index),
|
||||
'dir': dir_source})
|
||||
|
||||
else:
|
||||
source = dir_source
|
||||
|
||||
return DefaultResourceHandler(source, self.archive_paths,
|
||||
rules_file=self.rules_file,
|
||||
access_checker=access_checker)
|
||||
|
||||
@ -243,6 +251,7 @@ def init_index_source(value, source_list=None):
|
||||
return source
|
||||
|
||||
else:
|
||||
print(value)
|
||||
raise Exception('Source config must be string or dict')
|
||||
|
||||
raise Exception('No Index Source Found for: ' + str(value))
|
||||
|
Loading…
x
Reference in New Issue
Block a user