1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

patching/dedup work:

- enable dedup mode if dedup_policy is set in 'recorder'
- support for 'dedup_by_url: true', which ignores the digest and dedups by url only
- support for 'cache: always'
- use default 'dedup_index' setting if not specified
- remove prints
- bump version to 2.5.0
This commit is contained in:
Ilya Kreymer 2020-10-08 19:03:48 -07:00
parent e3de917d47
commit 08832300b8
10 changed files with 45 additions and 23 deletions

View File

@ -2,8 +2,6 @@
# ========================================
#
dedup_index: redis://localhost:6379/0/c:{coll}:cdxj
collections:
all: $all
pywb:

View File

@ -110,7 +110,7 @@ class BaseCli(object):
self.extra_config['debug'] = True
if self.r.record:
self.extra_config['recorder'] = 'live'
self.extra_config['recorder'] = {'source_coll': 'live'}
def run(self):
"""Start the application"""

View File

@ -28,6 +28,7 @@ from pywb.apps.wbrequestresponse import WbResponse
import os
import re
import sys
import traceback
import requests
@ -209,17 +210,25 @@ class FrontEndApp(object):
else:
recorder_coll = recorder_config['source_coll']
dedup_index = None
if self.warcserver.dedup_index:
policy = self.warcserver.config.get('dedup_policy')
if policy == 'skip':
dedup_policy = SkipDupePolicy()
elif policy == 'revisit':
dedup_policy = WriteRevisitDupePolicy()
elif policy == 'keep':
dedup_policy = WriteDupePolicy()
else:
dedup_policy = WriteRevisitDupePolicy()
self.rec_cache_mode = recorder_config.get('cache')
dedup_by_url = False
policy = recorder_config.get('dedup_policy')
if policy == 'skip':
dedup_policy = SkipDupePolicy()
dedup_by_url = True
elif policy == 'revisit':
dedup_policy = WriteRevisitDupePolicy()
elif policy == 'keep':
dedup_policy = WriteDupePolicy()
else:
dedup_policy = None
if dedup_policy:
if not self.warcserver.dedup_index:
print('dedup_index in the root of the config must also be specified when using dedup_policy')
sys.exit(2)
print('Recorder Dedup: {0} policy via dedup index {1}'.format(policy, self.warcserver.dedup_index))
@ -232,7 +241,8 @@ class FrontEndApp(object):
max_size=int(recorder_config.get('rollover_size', 1000000000)),
max_idle_secs=int(recorder_config.get('rollover_idle_secs', 600)),
filename_template=recorder_config.get('filename_template'),
dedup_index=dedup_index)
dedup_index=dedup_index,
dedup_by_url=dedup_by_url)
self.recorder = RecorderApp(self.RECORD_SERVER % str(self.warcserver_server.port), warc_writer,
accept_colls=recorder_config.get('source_filter'))
@ -258,7 +268,6 @@ class FrontEndApp(object):
if not os.path.isdir(indexer.root_path):
msg = 'No managed directory "{0}" for auto-indexing'
logging.error(msg.format(indexer.root_path))
import sys
sys.exit(2)
msg = 'Auto-Indexing Enabled on "{0}", checking every {1} secs'
@ -447,6 +456,7 @@ class FrontEndApp(object):
coll_config = self.get_coll_config(coll)
if record:
coll_config['type'] = 'record'
coll_config['cache'] = self.rec_cache_mode
if timemap_output:
coll_config['output'] = timemap_output

View File

@ -574,6 +574,9 @@ class RewriterApp(object):
if is_proxy and environ.get('HTTP_ORIGIN'):
response.add_access_control_headers(environ)
if r.status_code == 200 and kwargs.get('cache') == 'always' and environ.get('HTTP_REFERER'):
response.status_headers['Cache-Control'] = 'public, max-age=31536000, immutable'
return response
def format_response(self, response, wb_url, full_prefix, is_timegate, is_proxy, timegate_closest_ts=None):

View File

@ -30,6 +30,7 @@ class MultiFileWARCWriter(BaseWARCWriter):
self.dir_template = dir_template
self.key_template = kwargs.get('key_template', self.dir_template)
self.dedup_index = kwargs.get('dedup_index')
self.dedup_by_url = kwargs.get('dedup_by_url')
self.filename_template = filename_template
self.max_size = max_size
if max_idle_secs > 0:
@ -48,7 +49,7 @@ class MultiFileWARCWriter(BaseWARCWriter):
try:
url = record.rec_headers.get_header('WARC-Target-URI')
digest = record.rec_headers.get_header('WARC-Payload-Digest')
digest = record.rec_headers.get_header('WARC-Payload-Digest') if not self.dedup_by_url else None
iso_dt = record.rec_headers.get_header('WARC-Date')
result = self.dedup_index.lookup_revisit(params, digest, url, iso_dt)
except Exception as e:

View File

@ -48,7 +48,6 @@ class WritableRedisIndexer(RedisIndexSource):
return base_name
def add_warc_file(self, full_filename, params):
print('PARAMS W', params)
base_filename = self._get_rel_or_base_name(full_filename, params)
file_key = res_template(self.file_key_template, params)
if not file_key:
@ -59,7 +58,6 @@ class WritableRedisIndexer(RedisIndexSource):
self.redis.hset(file_key, base_filename, full_load_path)
def add_urls_to_index(self, stream, params, filename, length):
print('PARAMS U', params)
base_filename = self._get_rel_or_base_name(filename, params)
cdxout = BytesIO()
@ -68,7 +66,6 @@ class WritableRedisIndexer(RedisIndexSource):
writer_cls=params.get('writer_cls'))
z_key = res_template(self.redis_key_template, params)
print('KEY', z_key, self.redis_key_template, params)
cdx_list = cdxout.getvalue().rstrip().split(b'\n')

File diff suppressed because one or more lines are too long

View File

@ -1,4 +1,4 @@
__version__ = '2.4.2'
__version__ = '2.5.0'
if __name__ == '__main__':
print(__version__)

View File

@ -130,6 +130,9 @@ class FuzzyMatcher(object):
if key not in self.FUZZY_SKIP_PARAMS:
fuzzy_params[key] = params[key]
if 'graphql' in url:
print(matched_rule, fuzzy_params)
return matched_rule, fuzzy_params
def make_regex(self, config):

View File

@ -39,6 +39,8 @@ SOURCE_LIST = [LiveIndexSource,
class WarcServer(BaseWarcServer):
AUTO_COLL_TEMPL = '{coll}'
DEFAULT_REDIS_DEDUP_INDEX = 'redis://localhost:6379/0/pywb:{coll}:cdxj'
def __init__(self, config_file='./config.yaml', custom_config=None):
config = load_yaml_config(DEFAULT_CONFIG)
@ -56,11 +58,19 @@ class WarcServer(BaseWarcServer):
custom_config['collections'].update(config['collections'])
if 'proxy' in custom_config and 'proxy' in config:
custom_config['proxy'].update(config['proxy'])
if 'recorder' in custom_config and 'recorder' in config:
custom_config['recorder'].update(config['recorder'])
config.update(custom_config)
super(WarcServer, self).__init__(debug=config.get('debug', False))
self.config = config
self.dedup_index = self.config.get('dedup_index')
recorder_config = self.config.get('recorder') or {}
if isinstance(recorder_config, dict) and recorder_config.get('dedup_policy'):
self.dedup_index = self.config.get('dedup_index', WarcServer.DEFAULT_REDIS_DEDUP_INDEX)
else:
self.dedup_index = None
self.root_dir = self.config.get('collections_root', '')
self.index_paths = self.init_paths('index_paths')