From 9da5bd10832452f3cbc0d92853f13074ea9bf206 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 29 Mar 2018 13:42:00 -0700 Subject: [PATCH] Decoding and Recorder Fixes (#313) * redisindex: use decode_responses=True for redisindex * recorder: close_file(): return true if closed, close_key() return filename if closed * logging: if debug=True, log warc load failures * appveyor build fix: add pypiwin32 as dependency for windows build --- appveyor.yml | 1 + pywb/recorder/multifilewarcwriter.py | 6 ++++-- pywb/warcserver/handlers.py | 7 +++++++ pywb/warcserver/index/aggregator.py | 3 +-- pywb/warcserver/index/indexsource.py | 12 +++++++----- pywb/warcserver/resource/pathresolvers.py | 4 +--- pywb/warcserver/resource/test/test_pathresolvers.py | 4 ++-- 7 files changed, 23 insertions(+), 14 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index ed6ac610..97a1764b 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -17,6 +17,7 @@ install: - "pip install coverage pytest-cov" - "pip install cffi" - "pip install pyopenssl" + - "pip install pypiwin32" - "pip install certauth boto3 youtube-dl pysocks" - "pip install codecov" diff --git a/pywb/recorder/multifilewarcwriter.py b/pywb/recorder/multifilewarcwriter.py index f458acc9..dbfce96a 100644 --- a/pywb/recorder/multifilewarcwriter.py +++ b/pywb/recorder/multifilewarcwriter.py @@ -100,8 +100,10 @@ class MultiFileWARCWriter(BaseWARCWriter): if os.name != 'nt': portalocker.lock(fh, portalocker.LOCK_UN) fh.close() + return True except Exception as e: print(e) + return False def get_dir_key(self, params): return res_template(self.key_template, params) @@ -115,8 +117,8 @@ class MultiFileWARCWriter(BaseWARCWriter): return out, filename = result - self._close_file(out) - return filename + if self._close_file(out): + return filename def close_file(self, match_filename): for dir_key, out, filename in self.iter_open_files(): diff --git a/pywb/warcserver/handlers.py b/pywb/warcserver/handlers.py index 9b77df65..cf1bdf4a 100644 --- 
a/pywb/warcserver/handlers.py +++ b/pywb/warcserver/handlers.py @@ -8,6 +8,11 @@ from pywb.warcserver.index.fuzzymatcher import FuzzyMatcher from pywb.warcserver.resource.responseloader import WARCPathLoader, LiveWebLoader, VideoLoader import six +import logging +import traceback + + +logger = logging.getLogger('warcserver') #============================================================================= @@ -123,6 +128,8 @@ class ResourceHandler(IndexHandler): return out_headers, resp, errs except (WbException, ArchiveLoadFailed) as e: last_exc = e + if logger.isEnabledFor(logging.DEBUG): + traceback.print_exc() errs[str(loader)] = str(e) if last_exc: diff --git a/pywb/warcserver/index/aggregator.py b/pywb/warcserver/index/aggregator.py index 7b63299c..fd3c806b 100644 --- a/pywb/warcserver/index/aggregator.py +++ b/pywb/warcserver/index/aggregator.py @@ -370,12 +370,11 @@ class BaseRedisMultiKeyIndexSource(BaseAggregator, RedisIndexSource): redis_key_pattern = res_template(self.redis_key_template, params) if '*' not in redis_key_pattern: - keys = [redis_key_pattern.encode('utf-8')] + keys = [redis_key_pattern] else: keys = self.scan_keys(redis_key_pattern, params) for key in keys: - key = key.decode('utf-8') res = self._get_source_for_key(key) if res: yield key, res diff --git a/pywb/warcserver/index/indexsource.py b/pywb/warcserver/index/indexsource.py index 51931ebd..1250f42c 100644 --- a/pywb/warcserver/index/indexsource.py +++ b/pywb/warcserver/index/indexsource.py @@ -282,7 +282,7 @@ class RedisIndexSource(BaseIndexSource): redis_key_template = key_prefix if not redis_: - redis_ = redis.StrictRedis.from_url(redis_url) + redis_ = redis.StrictRedis.from_url(redis_url, decode_responses=True) return redis_, key_prefix def scan_keys(self, match_templ, params, member_key=None): @@ -301,18 +301,18 @@ class RedisIndexSource(BaseIndexSource): keys = self._load_key_set(key) params[scan_key] = keys - match_templ = match_templ.encode('utf-8') + #match_templ = 
match_templ.encode('utf-8') - return [match_templ.replace(b'*', key) for key in keys] + return [match_templ.replace('*', key) for key in keys] def _load_key_set(self, key): if not self.member_key_type: self.member_key_type = self.redis.type(key) - if self.member_key_type == b'set': + if self.member_key_type == 'set': return self.redis.smembers(key) - elif self.member_key_type == b'hash': + elif self.member_key_type == 'hash': return self.redis.hvals(key) # don't cache if any other type @@ -332,6 +332,8 @@ class RedisIndexSource(BaseIndexSource): def do_load(index_list): for line in index_list: + if isinstance(line, str): + line = line.encode('utf-8') yield CDXObject(line) return do_load(index_list) diff --git a/pywb/warcserver/resource/pathresolvers.py b/pywb/warcserver/resource/pathresolvers.py index 608f091b..5f9834ce 100644 --- a/pywb/warcserver/resource/pathresolvers.py +++ b/pywb/warcserver/resource/pathresolvers.py @@ -82,15 +82,13 @@ class RedisResolver(RedisIndexSource): if '*' in redis_key: for key in self.scan_keys(redis_key, params): - #key = key.decode('utf-8') res = self.redis.hget(key, filename) if res: break else: res = self.redis.hget(redis_key, filename) - if res and six.PY3: - res = res.decode('utf-8') + res = to_native_str(res, 'utf-8') return res diff --git a/pywb/warcserver/resource/test/test_pathresolvers.py b/pywb/warcserver/resource/test/test_pathresolvers.py index dcb24abf..a9ab6747 100644 --- a/pywb/warcserver/resource/test/test_pathresolvers.py +++ b/pywb/warcserver/resource/test/test_pathresolvers.py @@ -110,7 +110,7 @@ class TestPathIndex(object): assert resolver('example.warc.gz', cdx) == 'some_path/example.warc.gz' assert resolver('example-2.warc.gz', cdx) == 'some_path/example-2.warc.gz' - assert resolver.member_key_type == b'set' + assert resolver.member_key_type == 'set' @patch('redis.StrictRedis', FakeStrictRedis) def test_redis_resolver_multi_key_with_member_hash(self): @@ -135,7 +135,7 @@ class TestPathIndex(object): assert 
resolver('example.warc.gz', cdx) == 'some_path/example.warc.gz' assert resolver('example-2.warc.gz', cdx) == 'some_path/example-2.warc.gz' - assert resolver.member_key_type == b'hash' + assert resolver.member_key_type == 'hash' def test_make_best_resolver_http(self): res = DefaultResolverMixin.make_best_resolver('http://myhost.example.com/warcs/')