mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

work on patching support!

This commit is contained in:
Ilya Kreymer 2020-08-10 08:10:42 -07:00
parent 54d8bccf4a
commit e3de917d47
6 changed files with 46 additions and 7 deletions

View File

@@ -2,6 +2,8 @@
 # ========================================
 #
+dedup_index: redis://localhost:6379/0/c:{coll}:cdxj
+
 collections:
     all: $all
     pywb:
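
The new dedup_index setting points recording at a per-collection CDXJ index in Redis; the {coll} placeholder is filled with the collection name at lookup time. A minimal sketch of inspecting such an index with redis-py (the collection name 'my-coll' and the direct redis calls are illustrative only, not part of this commit):

# Illustrative only: resolve the templated key for one collection and peek at it,
# assuming (as pywb's Redis index sources do) the CDXJ lines live in a sorted set.
import redis

key_template = 'c:{coll}:cdxj'              # key portion of the dedup_index URL above
key = key_template.format(coll='my-coll')   # hypothetical collection name

r = redis.StrictRedis.from_url('redis://localhost:6379/0', decode_responses=True)
print(r.zcard(key))         # number of CDXJ entries recorded so far
print(r.zrange(key, 0, 4))  # first few CDXJ lines, if any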

View File

@@ -10,6 +10,8 @@ from wsgiprox.wsgiprox import WSGIProxMiddleware
 
 from pywb.recorder.multifilewarcwriter import MultiFileWARCWriter
 from pywb.recorder.recorderapp import RecorderApp
+from pywb.recorder.redisindexer import WritableRedisIndexer
+from pywb.recorder.filters import SkipDupePolicy, WriteRevisitDupePolicy, WriteDupePolicy
 
 from pywb.utils.loaders import load_yaml_config
 from pywb.utils.geventserver import GeventServer
@@ -207,8 +209,25 @@ class FrontEndApp(object):
             else:
                 recorder_coll = recorder_config['source_coll']
 
-            # TODO: support dedup
             dedup_index = None
+            if self.warcserver.dedup_index:
+                policy = self.warcserver.config.get('dedup_policy')
+                if policy == 'skip':
+                    dedup_policy = SkipDupePolicy()
+                elif policy == 'revisit':
+                    dedup_policy = WriteRevisitDupePolicy()
+                elif policy == 'keep':
+                    dedup_policy = WriteDupePolicy()
+                else:
+                    dedup_policy = WriteRevisitDupePolicy()
+
+                print('Recorder Dedup: {0} policy via dedup index {1}'.format(policy, self.warcserver.dedup_index))
+
+                dedup_index = WritableRedisIndexer(
+                    redis_url=self.warcserver.dedup_index,
+                    rel_path_template=self.warcserver.archive_paths,
+                    dupe_policy=dedup_policy)
+
             warc_writer = MultiFileWARCWriter(self.warcserver.archive_paths,
                                               max_size=int(recorder_config.get('rollover_size', 1000000000)),
                                               max_idle_secs=int(recorder_config.get('rollover_idle_secs', 600)),
@@ -393,8 +412,8 @@ class FrontEndApp(object):
         :return: WbResponse containing the contents of the record/URL
         :rtype: WbResponse
         """
-        if coll in self.warcserver.list_fixed_routes():
-            return WbResponse.text_response('Error: Can Not Record Into Custom Collection "{0}"'.format(coll))
+        #if coll in self.warcserver.list_fixed_routes():
+        #    return WbResponse.text_response('Error: Can Not Record Into Custom Collection "{0}"'.format(coll))
 
         return self.serve_content(environ, coll, url, record=True)
 
@@ -625,8 +644,8 @@ class FrontEndApp(object):
         if proxy_config.get('recording'):
             logging.info('Proxy recording into collection "{0}"'.format(proxy_coll))
-            if proxy_coll in self.warcserver.list_fixed_routes():
-                raise Exception('Can not record into fixed collection')
+            #if proxy_coll in self.warcserver.list_fixed_routes():
+            #    raise Exception('Can not record into fixed collection')
 
             proxy_route = proxy_coll + self.RECORD_ROUTE
 
             if not config.get('recorder'):
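
The new block in init_recorder maps the dedup_policy config value onto one of the recorder's duplicate-handling filters, defaulting to revisit records when the value is missing or unrecognized. The same selection expressed as a lookup table (a functionally equivalent sketch, not code from the commit):

from pywb.recorder.filters import (SkipDupePolicy, WriteDupePolicy,
                                   WriteRevisitDupePolicy)

# 'skip'    -> don't write the duplicate payload at all
# 'revisit' -> write a WARC revisit record pointing at the original capture
# 'keep'    -> write the full record even though it is a duplicate
DEDUP_POLICIES = {
    'skip': SkipDupePolicy,
    'revisit': WriteRevisitDupePolicy,
    'keep': WriteDupePolicy,
}

def choose_dedup_policy(name):
    # default mirrors the commit: unknown or empty values fall back to revisits
    return DEDUP_POLICIES.get(name, WriteRevisitDupePolicy)()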

View File

@@ -167,9 +167,11 @@ class SortedCDXWriter(BaseCDXWriter):
         super(SortedCDXWriter, self).write(entry, filename)
 
         line = self.out.getvalue()
         if line:
-            insort(self.sortlist, line)
+            self.sortlist.append(line)
+            #insort(self.sortlist, line)
 
     def __exit__(self, *args):
+        self.sortlist.sort()
         self.actual_out.write(''.join(self.sortlist))
         return False
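
SortedCDXWriter now appends lines as they are written and sorts the whole list once in __exit__, instead of keeping the list ordered with bisect.insort on every write; the output is identical, but a single final sort avoids repeated ordered insertions. A toy comparison of the two patterns (not pywb code):

from bisect import insort

lines = [b'c ...', b'a ...', b'b ...']

# old approach: keep the list sorted after every insert
sorted_incrementally = []
for line in lines:
    insort(sorted_incrementally, line)

# new approach: append everything, sort once at the end
sorted_at_exit = list(lines)
sorted_at_exit.sort()

assert sorted_incrementally == sorted_at_exit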

View File

@@ -195,6 +195,8 @@ class RecorderApp(object):
         req_is_wrapped = False
 
+        print('PARAMS', params)
+
         if not skipping:
             req_stream = ReqWrapper(input_buff,
                                     headers,

View File

@@ -48,14 +48,18 @@ class WritableRedisIndexer(RedisIndexSource):
         return base_name
 
     def add_warc_file(self, full_filename, params):
+        print('PARAMS W', params)
         base_filename = self._get_rel_or_base_name(full_filename, params)
         file_key = res_template(self.file_key_template, params)
+        if not file_key:
+            return
 
         full_load_path = self.full_warc_prefix + full_filename
 
         self.redis.hset(file_key, base_filename, full_load_path)
 
     def add_urls_to_index(self, stream, params, filename, length):
+        print('PARAMS U', params)
         base_filename = self._get_rel_or_base_name(filename, params)
 
         cdxout = BytesIO()
@@ -64,6 +68,7 @@ class WritableRedisIndexer(RedisIndexSource):
                                 writer_cls=params.get('writer_cls'))
 
         z_key = res_template(self.redis_key_template, params)
+        print('KEY', z_key, self.redis_key_template, params)
 
         cdx_list = cdxout.getvalue().rstrip().split(b'\n')
 
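
For reference, WritableRedisIndexer keeps two structures per collection: a hash mapping each WARC filename to its full load path (file_key), and the CDXJ index consulted for dedup lookups (z_key). A rough sketch of inspecting both after recording, assuming a file key template like 'c:{coll}:warc' to match the sample config (the template and collection name are assumptions, not shown in the diff):

# Illustrative inspection of the two Redis structures; key names are assumptions.
import redis

r = redis.StrictRedis.from_url('redis://localhost:6379/0', decode_responses=True)

# file_key: WARC filename -> full path used to load records back
print(r.hgetall('c:my-coll:warc'))

# z_key: CDXJ lines consulted by the dedup policy on the next capture
print(r.zrange('c:my-coll:cdxj', 0, 4))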

View File

@@ -60,6 +60,7 @@ class WarcServer(BaseWarcServer):
         super(WarcServer, self).__init__(debug=config.get('debug', False))
         self.config = config
 
+        self.dedup_index = self.config.get('dedup_index')
         self.root_dir = self.config.get('collections_root', '')
         self.index_paths = self.init_paths('index_paths')
@@ -113,7 +114,14 @@ class WarcServer(BaseWarcServer):
         access_checker = AccessChecker(CacheDirectoryAccessSource(self.acl_paths),
                                        self.default_access)
 
-        return DefaultResourceHandler(dir_source, self.archive_paths,
+        if self.dedup_index:
+            source = SimpleAggregator({'dedup': RedisMultiKeyIndexSource(self.dedup_index),
+                                       'dir': dir_source})
+
+        else:
+            source = dir_source
+
+        return DefaultResourceHandler(source, self.archive_paths,
                                       rules_file=self.rules_file,
                                       access_checker=access_checker)
 
@@ -243,6 +251,7 @@ def init_index_source(value, source_list=None):
                 return source
 
     else:
+        print(value)
        raise Exception('Source config must be string or dict')
 
     raise Exception('No Index Source Found for: ' + str(value))
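
With dedup_index set, the per-collection directory source is wrapped together with a Redis index source so that lookups, and therefore dedup checks, consult both. A minimal standalone sketch of the same aggregation, assuming the usual pywb module locations for these classes (the module paths and file layout are not shown in the diff):

from pywb.warcserver.index.aggregator import SimpleAggregator
from pywb.warcserver.index.indexsource import RedisMultiKeyIndexSource, FileIndexSource

# Same shape as the WarcServer change above: a named pair of index sources.
source = SimpleAggregator({
    'dedup': RedisMultiKeyIndexSource('redis://localhost:6379/0/c:{coll}:cdxj'),
    'dir': FileIndexSource('./collections/{coll}/indexes/index.cdxj'),
})
# Each query hits both sources and merges the results, so URLs already
# recorded into the Redis index resolve as duplicates for new captures.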