mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
patching/dedup work:
- enable dedup mode if dedup_policy is set in 'recorder' - support for 'dedup_by_url: true' ignoring digest, dedup by url only - support for 'cache: always' - use default 'dedup_index' setting if not specified - remove prints - bump version to 2.5.0
This commit is contained in:
parent
e3de917d47
commit
08832300b8
@ -2,8 +2,6 @@
|
|||||||
# ========================================
|
# ========================================
|
||||||
#
|
#
|
||||||
|
|
||||||
dedup_index: redis://localhost:6379/0/c:{coll}:cdxj
|
|
||||||
|
|
||||||
collections:
|
collections:
|
||||||
all: $all
|
all: $all
|
||||||
pywb:
|
pywb:
|
||||||
|
@ -110,7 +110,7 @@ class BaseCli(object):
|
|||||||
self.extra_config['debug'] = True
|
self.extra_config['debug'] = True
|
||||||
|
|
||||||
if self.r.record:
|
if self.r.record:
|
||||||
self.extra_config['recorder'] = 'live'
|
self.extra_config['recorder'] = {'source_coll': 'live'}
|
||||||
|
|
||||||
def run(self):
|
def run(self):
|
||||||
"""Start the application"""
|
"""Start the application"""
|
||||||
|
@ -28,6 +28,7 @@ from pywb.apps.wbrequestresponse import WbResponse
|
|||||||
|
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
|
import sys
|
||||||
|
|
||||||
import traceback
|
import traceback
|
||||||
import requests
|
import requests
|
||||||
@ -209,17 +210,25 @@ class FrontEndApp(object):
|
|||||||
else:
|
else:
|
||||||
recorder_coll = recorder_config['source_coll']
|
recorder_coll = recorder_config['source_coll']
|
||||||
|
|
||||||
dedup_index = None
|
self.rec_cache_mode = recorder_config.get('cache')
|
||||||
if self.warcserver.dedup_index:
|
|
||||||
policy = self.warcserver.config.get('dedup_policy')
|
dedup_by_url = False
|
||||||
if policy == 'skip':
|
|
||||||
dedup_policy = SkipDupePolicy()
|
policy = recorder_config.get('dedup_policy')
|
||||||
elif policy == 'revisit':
|
if policy == 'skip':
|
||||||
dedup_policy = WriteRevisitDupePolicy()
|
dedup_policy = SkipDupePolicy()
|
||||||
elif policy == 'keep':
|
dedup_by_url = True
|
||||||
dedup_policy = WriteDupePolicy()
|
elif policy == 'revisit':
|
||||||
else:
|
dedup_policy = WriteRevisitDupePolicy()
|
||||||
dedup_policy = WriteRevisitDupePolicy()
|
elif policy == 'keep':
|
||||||
|
dedup_policy = WriteDupePolicy()
|
||||||
|
else:
|
||||||
|
dedup_policy = None
|
||||||
|
|
||||||
|
if dedup_policy:
|
||||||
|
if not self.warcserver.dedup_index:
|
||||||
|
print('dedup_index in the root of the config must also be specified when using dedup_policy')
|
||||||
|
sys.exit(2)
|
||||||
|
|
||||||
print('Recorder Dedup: {0} policy via dedup index {1}'.format(policy, self.warcserver.dedup_index))
|
print('Recorder Dedup: {0} policy via dedup index {1}'.format(policy, self.warcserver.dedup_index))
|
||||||
|
|
||||||
@ -232,7 +241,8 @@ class FrontEndApp(object):
|
|||||||
max_size=int(recorder_config.get('rollover_size', 1000000000)),
|
max_size=int(recorder_config.get('rollover_size', 1000000000)),
|
||||||
max_idle_secs=int(recorder_config.get('rollover_idle_secs', 600)),
|
max_idle_secs=int(recorder_config.get('rollover_idle_secs', 600)),
|
||||||
filename_template=recorder_config.get('filename_template'),
|
filename_template=recorder_config.get('filename_template'),
|
||||||
dedup_index=dedup_index)
|
dedup_index=dedup_index,
|
||||||
|
dedup_by_url=dedup_by_url)
|
||||||
|
|
||||||
self.recorder = RecorderApp(self.RECORD_SERVER % str(self.warcserver_server.port), warc_writer,
|
self.recorder = RecorderApp(self.RECORD_SERVER % str(self.warcserver_server.port), warc_writer,
|
||||||
accept_colls=recorder_config.get('source_filter'))
|
accept_colls=recorder_config.get('source_filter'))
|
||||||
@ -258,7 +268,6 @@ class FrontEndApp(object):
|
|||||||
if not os.path.isdir(indexer.root_path):
|
if not os.path.isdir(indexer.root_path):
|
||||||
msg = 'No managed directory "{0}" for auto-indexing'
|
msg = 'No managed directory "{0}" for auto-indexing'
|
||||||
logging.error(msg.format(indexer.root_path))
|
logging.error(msg.format(indexer.root_path))
|
||||||
import sys
|
|
||||||
sys.exit(2)
|
sys.exit(2)
|
||||||
|
|
||||||
msg = 'Auto-Indexing Enabled on "{0}", checking every {1} secs'
|
msg = 'Auto-Indexing Enabled on "{0}", checking every {1} secs'
|
||||||
@ -447,6 +456,7 @@ class FrontEndApp(object):
|
|||||||
coll_config = self.get_coll_config(coll)
|
coll_config = self.get_coll_config(coll)
|
||||||
if record:
|
if record:
|
||||||
coll_config['type'] = 'record'
|
coll_config['type'] = 'record'
|
||||||
|
coll_config['cache'] = self.rec_cache_mode
|
||||||
|
|
||||||
if timemap_output:
|
if timemap_output:
|
||||||
coll_config['output'] = timemap_output
|
coll_config['output'] = timemap_output
|
||||||
|
@ -574,6 +574,9 @@ class RewriterApp(object):
|
|||||||
if is_proxy and environ.get('HTTP_ORIGIN'):
|
if is_proxy and environ.get('HTTP_ORIGIN'):
|
||||||
response.add_access_control_headers(environ)
|
response.add_access_control_headers(environ)
|
||||||
|
|
||||||
|
if r.status_code == 200 and kwargs.get('cache') == 'always' and environ.get('HTTP_REFERER'):
|
||||||
|
response.status_headers['Cache-Control'] = 'public, max-age=31536000, immutable'
|
||||||
|
|
||||||
return response
|
return response
|
||||||
|
|
||||||
def format_response(self, response, wb_url, full_prefix, is_timegate, is_proxy, timegate_closest_ts=None):
|
def format_response(self, response, wb_url, full_prefix, is_timegate, is_proxy, timegate_closest_ts=None):
|
||||||
|
@ -30,6 +30,7 @@ class MultiFileWARCWriter(BaseWARCWriter):
|
|||||||
self.dir_template = dir_template
|
self.dir_template = dir_template
|
||||||
self.key_template = kwargs.get('key_template', self.dir_template)
|
self.key_template = kwargs.get('key_template', self.dir_template)
|
||||||
self.dedup_index = kwargs.get('dedup_index')
|
self.dedup_index = kwargs.get('dedup_index')
|
||||||
|
self.dedup_by_url = kwargs.get('dedup_by_url')
|
||||||
self.filename_template = filename_template
|
self.filename_template = filename_template
|
||||||
self.max_size = max_size
|
self.max_size = max_size
|
||||||
if max_idle_secs > 0:
|
if max_idle_secs > 0:
|
||||||
@ -48,7 +49,7 @@ class MultiFileWARCWriter(BaseWARCWriter):
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
url = record.rec_headers.get_header('WARC-Target-URI')
|
url = record.rec_headers.get_header('WARC-Target-URI')
|
||||||
digest = record.rec_headers.get_header('WARC-Payload-Digest')
|
digest = record.rec_headers.get_header('WARC-Payload-Digest') if not self.dedup_by_url else None
|
||||||
iso_dt = record.rec_headers.get_header('WARC-Date')
|
iso_dt = record.rec_headers.get_header('WARC-Date')
|
||||||
result = self.dedup_index.lookup_revisit(params, digest, url, iso_dt)
|
result = self.dedup_index.lookup_revisit(params, digest, url, iso_dt)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
@ -48,7 +48,6 @@ class WritableRedisIndexer(RedisIndexSource):
|
|||||||
return base_name
|
return base_name
|
||||||
|
|
||||||
def add_warc_file(self, full_filename, params):
|
def add_warc_file(self, full_filename, params):
|
||||||
print('PARAMS W', params)
|
|
||||||
base_filename = self._get_rel_or_base_name(full_filename, params)
|
base_filename = self._get_rel_or_base_name(full_filename, params)
|
||||||
file_key = res_template(self.file_key_template, params)
|
file_key = res_template(self.file_key_template, params)
|
||||||
if not file_key:
|
if not file_key:
|
||||||
@ -59,7 +58,6 @@ class WritableRedisIndexer(RedisIndexSource):
|
|||||||
self.redis.hset(file_key, base_filename, full_load_path)
|
self.redis.hset(file_key, base_filename, full_load_path)
|
||||||
|
|
||||||
def add_urls_to_index(self, stream, params, filename, length):
|
def add_urls_to_index(self, stream, params, filename, length):
|
||||||
print('PARAMS U', params)
|
|
||||||
base_filename = self._get_rel_or_base_name(filename, params)
|
base_filename = self._get_rel_or_base_name(filename, params)
|
||||||
|
|
||||||
cdxout = BytesIO()
|
cdxout = BytesIO()
|
||||||
@ -68,7 +66,6 @@ class WritableRedisIndexer(RedisIndexSource):
|
|||||||
writer_cls=params.get('writer_cls'))
|
writer_cls=params.get('writer_cls'))
|
||||||
|
|
||||||
z_key = res_template(self.redis_key_template, params)
|
z_key = res_template(self.redis_key_template, params)
|
||||||
print('KEY', z_key, self.redis_key_template, params)
|
|
||||||
|
|
||||||
cdx_list = cdxout.getvalue().rstrip().split(b'\n')
|
cdx_list = cdxout.getvalue().rstrip().split(b'\n')
|
||||||
|
|
||||||
|
File diff suppressed because one or more lines are too long
@ -1,4 +1,4 @@
|
|||||||
__version__ = '2.4.2'
|
__version__ = '2.5.0'
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
print(__version__)
|
print(__version__)
|
||||||
|
@ -130,6 +130,9 @@ class FuzzyMatcher(object):
|
|||||||
if key not in self.FUZZY_SKIP_PARAMS:
|
if key not in self.FUZZY_SKIP_PARAMS:
|
||||||
fuzzy_params[key] = params[key]
|
fuzzy_params[key] = params[key]
|
||||||
|
|
||||||
|
if 'graphql' in url:
|
||||||
|
print(matched_rule, fuzzy_params)
|
||||||
|
|
||||||
return matched_rule, fuzzy_params
|
return matched_rule, fuzzy_params
|
||||||
|
|
||||||
def make_regex(self, config):
|
def make_regex(self, config):
|
||||||
|
@ -39,6 +39,8 @@ SOURCE_LIST = [LiveIndexSource,
|
|||||||
class WarcServer(BaseWarcServer):
|
class WarcServer(BaseWarcServer):
|
||||||
AUTO_COLL_TEMPL = '{coll}'
|
AUTO_COLL_TEMPL = '{coll}'
|
||||||
|
|
||||||
|
DEFAULT_REDIS_DEDUP_INDEX = 'redis://localhost:6379/0/pywb:{coll}:cdxj'
|
||||||
|
|
||||||
def __init__(self, config_file='./config.yaml', custom_config=None):
|
def __init__(self, config_file='./config.yaml', custom_config=None):
|
||||||
config = load_yaml_config(DEFAULT_CONFIG)
|
config = load_yaml_config(DEFAULT_CONFIG)
|
||||||
|
|
||||||
@ -56,11 +58,19 @@ class WarcServer(BaseWarcServer):
|
|||||||
custom_config['collections'].update(config['collections'])
|
custom_config['collections'].update(config['collections'])
|
||||||
if 'proxy' in custom_config and 'proxy' in config:
|
if 'proxy' in custom_config and 'proxy' in config:
|
||||||
custom_config['proxy'].update(config['proxy'])
|
custom_config['proxy'].update(config['proxy'])
|
||||||
|
if 'recorder' in custom_config and 'recorder' in config:
|
||||||
|
custom_config['recorder'].update(config['recorder'])
|
||||||
|
|
||||||
config.update(custom_config)
|
config.update(custom_config)
|
||||||
|
|
||||||
super(WarcServer, self).__init__(debug=config.get('debug', False))
|
super(WarcServer, self).__init__(debug=config.get('debug', False))
|
||||||
self.config = config
|
self.config = config
|
||||||
self.dedup_index = self.config.get('dedup_index')
|
|
||||||
|
recorder_config = self.config.get('recorder') or {}
|
||||||
|
if isinstance(recorder_config, dict) and recorder_config.get('dedup_policy'):
|
||||||
|
self.dedup_index = self.config.get('dedup_index', WarcServer.DEFAULT_REDIS_DEDUP_INDEX)
|
||||||
|
else:
|
||||||
|
self.dedup_index = None
|
||||||
|
|
||||||
self.root_dir = self.config.get('collections_root', '')
|
self.root_dir = self.config.get('collections_root', '')
|
||||||
self.index_paths = self.init_paths('index_paths')
|
self.index_paths = self.init_paths('index_paths')
|
||||||
|
Loading…
x
Reference in New Issue
Block a user