From e3de917d47dbfa92956b4ff2361c636ac6bbd971 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 10 Aug 2020 08:10:42 -0700 Subject: [PATCH] work on patching support! --- config.yaml | 2 ++ pywb/apps/frontendapp.py | 29 ++++++++++++++++++++++++----- pywb/indexer/cdxindexer.py | 4 +++- pywb/recorder/recorderapp.py | 2 ++ pywb/recorder/redisindexer.py | 5 +++++ pywb/warcserver/warcserver.py | 11 ++++++++++- 6 files changed, 46 insertions(+), 7 deletions(-) diff --git a/config.yaml b/config.yaml index 01827eb2..6bab6661 100644 --- a/config.yaml +++ b/config.yaml @@ -2,6 +2,8 @@ # ======================================== # +dedup_index: redis://localhost:6379/0/c:{coll}:cdxj + collections: all: $all pywb: diff --git a/pywb/apps/frontendapp.py b/pywb/apps/frontendapp.py index eab2e67f..aed75e88 100644 --- a/pywb/apps/frontendapp.py +++ b/pywb/apps/frontendapp.py @@ -10,6 +10,8 @@ from wsgiprox.wsgiprox import WSGIProxMiddleware from pywb.recorder.multifilewarcwriter import MultiFileWARCWriter from pywb.recorder.recorderapp import RecorderApp +from pywb.recorder.redisindexer import WritableRedisIndexer +from pywb.recorder.filters import SkipDupePolicy, WriteRevisitDupePolicy, WriteDupePolicy from pywb.utils.loaders import load_yaml_config from pywb.utils.geventserver import GeventServer @@ -207,8 +209,25 @@ class FrontEndApp(object): else: recorder_coll = recorder_config['source_coll'] - # TODO: support dedup dedup_index = None + if self.warcserver.dedup_index: + policy = self.warcserver.config.get('dedup_policy') + if policy == 'skip': + dedup_policy = SkipDupePolicy() + elif policy == 'revisit': + dedup_policy = WriteRevisitDupePolicy() + elif policy == 'keep': + dedup_policy = WriteDupePolicy() + else: + dedup_policy = WriteRevisitDupePolicy() + + print('Recorder Dedup: {0} policy via dedup index {1}'.format(policy, self.warcserver.dedup_index)) + + dedup_index = WritableRedisIndexer( + redis_url=self.warcserver.dedup_index, + rel_path_template=self.warcserver.archive_paths, + dupe_policy=dedup_policy) + warc_writer = MultiFileWARCWriter(self.warcserver.archive_paths, max_size=int(recorder_config.get('rollover_size', 1000000000)), max_idle_secs=int(recorder_config.get('rollover_idle_secs', 600)), @@ -393,8 +412,8 @@ class FrontEndApp(object): :return: WbResponse containing the contents of the record/URL :rtype: WbResponse """ - if coll in self.warcserver.list_fixed_routes(): - return WbResponse.text_response('Error: Can Not Record Into Custom Collection "{0}"'.format(coll)) + #if coll in self.warcserver.list_fixed_routes(): + # return WbResponse.text_response('Error: Can Not Record Into Custom Collection "{0}"'.format(coll)) return self.serve_content(environ, coll, url, record=True) @@ -625,8 +644,8 @@ class FrontEndApp(object): if proxy_config.get('recording'): logging.info('Proxy recording into collection "{0}"'.format(proxy_coll)) - if proxy_coll in self.warcserver.list_fixed_routes(): - raise Exception('Can not record into fixed collection') + #if proxy_coll in self.warcserver.list_fixed_routes(): + #raise Exception('Can not record into fixed collection') proxy_route = proxy_coll + self.RECORD_ROUTE if not config.get('recorder'): diff --git a/pywb/indexer/cdxindexer.py b/pywb/indexer/cdxindexer.py index 3ce26d0d..bcf2534b 100644 --- a/pywb/indexer/cdxindexer.py +++ b/pywb/indexer/cdxindexer.py @@ -167,9 +167,11 @@ class SortedCDXWriter(BaseCDXWriter): super(SortedCDXWriter, self).write(entry, filename) line = self.out.getvalue() if line: - insort(self.sortlist, line) + self.sortlist.append(line) + #insort(self.sortlist, line) def __exit__(self, *args): + self.sortlist.sort() self.actual_out.write(''.join(self.sortlist)) return False diff --git a/pywb/recorder/recorderapp.py b/pywb/recorder/recorderapp.py index 689d4171..dc95e505 100644 --- a/pywb/recorder/recorderapp.py +++ b/pywb/recorder/recorderapp.py @@ -195,6 +195,8 @@ class RecorderApp(object): req_is_wrapped = False + print('PARAMS', params) + if not skipping: req_stream = ReqWrapper(input_buff, headers, diff --git a/pywb/recorder/redisindexer.py b/pywb/recorder/redisindexer.py index c1984010..c4e70db9 100644 --- a/pywb/recorder/redisindexer.py +++ b/pywb/recorder/redisindexer.py @@ -48,14 +48,18 @@ class WritableRedisIndexer(RedisIndexSource): return base_name def add_warc_file(self, full_filename, params): + print('PARAMS W', params) base_filename = self._get_rel_or_base_name(full_filename, params) file_key = res_template(self.file_key_template, params) + if not file_key: + return full_load_path = self.full_warc_prefix + full_filename self.redis.hset(file_key, base_filename, full_load_path) def add_urls_to_index(self, stream, params, filename, length): + print('PARAMS U', params) base_filename = self._get_rel_or_base_name(filename, params) cdxout = BytesIO() @@ -64,6 +68,7 @@ class WritableRedisIndexer(RedisIndexSource): writer_cls=params.get('writer_cls')) z_key = res_template(self.redis_key_template, params) + print('KEY', z_key, self.redis_key_template, params) cdx_list = cdxout.getvalue().rstrip().split(b'\n') diff --git a/pywb/warcserver/warcserver.py b/pywb/warcserver/warcserver.py index d417c0fa..c25dd246 100644 --- a/pywb/warcserver/warcserver.py +++ b/pywb/warcserver/warcserver.py @@ -60,6 +60,7 @@ class WarcServer(BaseWarcServer): super(WarcServer, self).__init__(debug=config.get('debug', False)) self.config = config + self.dedup_index = self.config.get('dedup_index') self.root_dir = self.config.get('collections_root', '') self.index_paths = self.init_paths('index_paths') @@ -113,7 +114,14 @@ class WarcServer(BaseWarcServer): access_checker = AccessChecker(CacheDirectoryAccessSource(self.acl_paths), self.default_access) - return DefaultResourceHandler(dir_source, self.archive_paths, + if self.dedup_index: + source = SimpleAggregator({'dedup': RedisMultiKeyIndexSource(self.dedup_index), + 'dir': dir_source}) + + else: + source = dir_source + + return DefaultResourceHandler(source, self.archive_paths, rules_file=self.rules_file, access_checker=access_checker) @@ -243,6 +251,7 @@ def init_index_source(value, source_list=None): return source else: + print(value) raise Exception('Source config must be string or dict') raise Exception('No Index Source Found for: ' + str(value))