mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
work on patching support!
This commit is contained in:
parent
54d8bccf4a
commit
e3de917d47
@ -2,6 +2,8 @@
|
|||||||
# ========================================
|
# ========================================
|
||||||
#
|
#
|
||||||
|
|
||||||
|
dedup_index: redis://localhost:6379/0/c:{coll}:cdxj
|
||||||
|
|
||||||
collections:
|
collections:
|
||||||
all: $all
|
all: $all
|
||||||
pywb:
|
pywb:
|
||||||
|
@ -10,6 +10,8 @@ from wsgiprox.wsgiprox import WSGIProxMiddleware
|
|||||||
|
|
||||||
from pywb.recorder.multifilewarcwriter import MultiFileWARCWriter
|
from pywb.recorder.multifilewarcwriter import MultiFileWARCWriter
|
||||||
from pywb.recorder.recorderapp import RecorderApp
|
from pywb.recorder.recorderapp import RecorderApp
|
||||||
|
from pywb.recorder.redisindexer import WritableRedisIndexer
|
||||||
|
from pywb.recorder.filters import SkipDupePolicy, WriteRevisitDupePolicy, WriteDupePolicy
|
||||||
|
|
||||||
from pywb.utils.loaders import load_yaml_config
|
from pywb.utils.loaders import load_yaml_config
|
||||||
from pywb.utils.geventserver import GeventServer
|
from pywb.utils.geventserver import GeventServer
|
||||||
@ -207,8 +209,25 @@ class FrontEndApp(object):
|
|||||||
else:
|
else:
|
||||||
recorder_coll = recorder_config['source_coll']
|
recorder_coll = recorder_config['source_coll']
|
||||||
|
|
||||||
# TODO: support dedup
|
|
||||||
dedup_index = None
|
dedup_index = None
|
||||||
|
if self.warcserver.dedup_index:
|
||||||
|
policy = self.warcserver.config.get('dedup_policy')
|
||||||
|
if policy == 'skip':
|
||||||
|
dedup_policy = SkipDupePolicy()
|
||||||
|
elif policy == 'revisit':
|
||||||
|
dedup_policy = WriteRevisitDupePolicy()
|
||||||
|
elif policy == 'keep':
|
||||||
|
dedup_policy = WriteDupePolicy()
|
||||||
|
else:
|
||||||
|
dedup_policy = WriteRevisitDupePolicy()
|
||||||
|
|
||||||
|
print('Recorder Dedup: {0} policy via dedup index {1}'.format(policy, self.warcserver.dedup_index))
|
||||||
|
|
||||||
|
dedup_index = WritableRedisIndexer(
|
||||||
|
redis_url=self.warcserver.dedup_index,
|
||||||
|
rel_path_template=self.warcserver.archive_paths,
|
||||||
|
dupe_policy=dedup_policy)
|
||||||
|
|
||||||
warc_writer = MultiFileWARCWriter(self.warcserver.archive_paths,
|
warc_writer = MultiFileWARCWriter(self.warcserver.archive_paths,
|
||||||
max_size=int(recorder_config.get('rollover_size', 1000000000)),
|
max_size=int(recorder_config.get('rollover_size', 1000000000)),
|
||||||
max_idle_secs=int(recorder_config.get('rollover_idle_secs', 600)),
|
max_idle_secs=int(recorder_config.get('rollover_idle_secs', 600)),
|
||||||
@ -393,8 +412,8 @@ class FrontEndApp(object):
|
|||||||
:return: WbResponse containing the contents of the record/URL
|
:return: WbResponse containing the contents of the record/URL
|
||||||
:rtype: WbResponse
|
:rtype: WbResponse
|
||||||
"""
|
"""
|
||||||
if coll in self.warcserver.list_fixed_routes():
|
#if coll in self.warcserver.list_fixed_routes():
|
||||||
return WbResponse.text_response('Error: Can Not Record Into Custom Collection "{0}"'.format(coll))
|
# return WbResponse.text_response('Error: Can Not Record Into Custom Collection "{0}"'.format(coll))
|
||||||
|
|
||||||
return self.serve_content(environ, coll, url, record=True)
|
return self.serve_content(environ, coll, url, record=True)
|
||||||
|
|
||||||
@ -625,8 +644,8 @@ class FrontEndApp(object):
|
|||||||
|
|
||||||
if proxy_config.get('recording'):
|
if proxy_config.get('recording'):
|
||||||
logging.info('Proxy recording into collection "{0}"'.format(proxy_coll))
|
logging.info('Proxy recording into collection "{0}"'.format(proxy_coll))
|
||||||
if proxy_coll in self.warcserver.list_fixed_routes():
|
#if proxy_coll in self.warcserver.list_fixed_routes():
|
||||||
raise Exception('Can not record into fixed collection')
|
#raise Exception('Can not record into fixed collection')
|
||||||
|
|
||||||
proxy_route = proxy_coll + self.RECORD_ROUTE
|
proxy_route = proxy_coll + self.RECORD_ROUTE
|
||||||
if not config.get('recorder'):
|
if not config.get('recorder'):
|
||||||
|
@ -167,9 +167,11 @@ class SortedCDXWriter(BaseCDXWriter):
|
|||||||
super(SortedCDXWriter, self).write(entry, filename)
|
super(SortedCDXWriter, self).write(entry, filename)
|
||||||
line = self.out.getvalue()
|
line = self.out.getvalue()
|
||||||
if line:
|
if line:
|
||||||
insort(self.sortlist, line)
|
self.sortlist.append(line)
|
||||||
|
#insort(self.sortlist, line)
|
||||||
|
|
||||||
def __exit__(self, *args):
|
def __exit__(self, *args):
|
||||||
|
self.sortlist.sort()
|
||||||
self.actual_out.write(''.join(self.sortlist))
|
self.actual_out.write(''.join(self.sortlist))
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
@ -195,6 +195,8 @@ class RecorderApp(object):
|
|||||||
|
|
||||||
req_is_wrapped = False
|
req_is_wrapped = False
|
||||||
|
|
||||||
|
print('PARAMS', params)
|
||||||
|
|
||||||
if not skipping:
|
if not skipping:
|
||||||
req_stream = ReqWrapper(input_buff,
|
req_stream = ReqWrapper(input_buff,
|
||||||
headers,
|
headers,
|
||||||
|
@ -48,14 +48,18 @@ class WritableRedisIndexer(RedisIndexSource):
|
|||||||
return base_name
|
return base_name
|
||||||
|
|
||||||
def add_warc_file(self, full_filename, params):
|
def add_warc_file(self, full_filename, params):
|
||||||
|
print('PARAMS W', params)
|
||||||
base_filename = self._get_rel_or_base_name(full_filename, params)
|
base_filename = self._get_rel_or_base_name(full_filename, params)
|
||||||
file_key = res_template(self.file_key_template, params)
|
file_key = res_template(self.file_key_template, params)
|
||||||
|
if not file_key:
|
||||||
|
return
|
||||||
|
|
||||||
full_load_path = self.full_warc_prefix + full_filename
|
full_load_path = self.full_warc_prefix + full_filename
|
||||||
|
|
||||||
self.redis.hset(file_key, base_filename, full_load_path)
|
self.redis.hset(file_key, base_filename, full_load_path)
|
||||||
|
|
||||||
def add_urls_to_index(self, stream, params, filename, length):
|
def add_urls_to_index(self, stream, params, filename, length):
|
||||||
|
print('PARAMS U', params)
|
||||||
base_filename = self._get_rel_or_base_name(filename, params)
|
base_filename = self._get_rel_or_base_name(filename, params)
|
||||||
|
|
||||||
cdxout = BytesIO()
|
cdxout = BytesIO()
|
||||||
@ -64,6 +68,7 @@ class WritableRedisIndexer(RedisIndexSource):
|
|||||||
writer_cls=params.get('writer_cls'))
|
writer_cls=params.get('writer_cls'))
|
||||||
|
|
||||||
z_key = res_template(self.redis_key_template, params)
|
z_key = res_template(self.redis_key_template, params)
|
||||||
|
print('KEY', z_key, self.redis_key_template, params)
|
||||||
|
|
||||||
cdx_list = cdxout.getvalue().rstrip().split(b'\n')
|
cdx_list = cdxout.getvalue().rstrip().split(b'\n')
|
||||||
|
|
||||||
|
@ -60,6 +60,7 @@ class WarcServer(BaseWarcServer):
|
|||||||
|
|
||||||
super(WarcServer, self).__init__(debug=config.get('debug', False))
|
super(WarcServer, self).__init__(debug=config.get('debug', False))
|
||||||
self.config = config
|
self.config = config
|
||||||
|
self.dedup_index = self.config.get('dedup_index')
|
||||||
|
|
||||||
self.root_dir = self.config.get('collections_root', '')
|
self.root_dir = self.config.get('collections_root', '')
|
||||||
self.index_paths = self.init_paths('index_paths')
|
self.index_paths = self.init_paths('index_paths')
|
||||||
@ -113,7 +114,14 @@ class WarcServer(BaseWarcServer):
|
|||||||
access_checker = AccessChecker(CacheDirectoryAccessSource(self.acl_paths),
|
access_checker = AccessChecker(CacheDirectoryAccessSource(self.acl_paths),
|
||||||
self.default_access)
|
self.default_access)
|
||||||
|
|
||||||
return DefaultResourceHandler(dir_source, self.archive_paths,
|
if self.dedup_index:
|
||||||
|
source = SimpleAggregator({'dedup': RedisMultiKeyIndexSource(self.dedup_index),
|
||||||
|
'dir': dir_source})
|
||||||
|
|
||||||
|
else:
|
||||||
|
source = dir_source
|
||||||
|
|
||||||
|
return DefaultResourceHandler(source, self.archive_paths,
|
||||||
rules_file=self.rules_file,
|
rules_file=self.rules_file,
|
||||||
access_checker=access_checker)
|
access_checker=access_checker)
|
||||||
|
|
||||||
@ -243,6 +251,7 @@ def init_index_source(value, source_list=None):
|
|||||||
return source
|
return source
|
||||||
|
|
||||||
else:
|
else:
|
||||||
|
print(value)
|
||||||
raise Exception('Source config must be string or dict')
|
raise Exception('Source config must be string or dict')
|
||||||
|
|
||||||
raise Exception('No Index Source Found for: ' + str(value))
|
raise Exception('No Index Source Found for: ' + str(value))
|
||||||
|
Loading…
x
Reference in New Issue
Block a user