1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Decoding and Recorder Fixes (#313)

* redisindex: use decode_responses=True for redisindex
* recorder: close_file(): return True if closed; close_key(): return filename if closed
* logging: if debug=True, log warc load failures
* appveyor build fix: add pypiwin32 as dependency for windows build
This commit is contained in:
Ilya Kreymer 2018-03-29 13:42:00 -07:00 committed by GitHub
parent a9cbdc1bd6
commit 9da5bd1083
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 23 additions and 14 deletions

View File

@ -17,6 +17,7 @@ install:
- "pip install coverage pytest-cov" - "pip install coverage pytest-cov"
- "pip install cffi" - "pip install cffi"
- "pip install pyopenssl" - "pip install pyopenssl"
- "pip install pypiwin32"
- "pip install certauth boto3 youtube-dl pysocks" - "pip install certauth boto3 youtube-dl pysocks"
- "pip install codecov" - "pip install codecov"

View File

@ -100,8 +100,10 @@ class MultiFileWARCWriter(BaseWARCWriter):
if os.name != 'nt': if os.name != 'nt':
portalocker.lock(fh, portalocker.LOCK_UN) portalocker.lock(fh, portalocker.LOCK_UN)
fh.close() fh.close()
return True
except Exception as e: except Exception as e:
print(e) print(e)
return False
def get_dir_key(self, params): def get_dir_key(self, params):
return res_template(self.key_template, params) return res_template(self.key_template, params)
@ -115,8 +117,8 @@ class MultiFileWARCWriter(BaseWARCWriter):
return return
out, filename = result out, filename = result
self._close_file(out) if self._close_file(out):
return filename return filename
def close_file(self, match_filename): def close_file(self, match_filename):
for dir_key, out, filename in self.iter_open_files(): for dir_key, out, filename in self.iter_open_files():

View File

@ -8,6 +8,11 @@ from pywb.warcserver.index.fuzzymatcher import FuzzyMatcher
from pywb.warcserver.resource.responseloader import WARCPathLoader, LiveWebLoader, VideoLoader from pywb.warcserver.resource.responseloader import WARCPathLoader, LiveWebLoader, VideoLoader
import six import six
import logging
import traceback
logger = logging.getLogger('warcserver')
#============================================================================= #=============================================================================
@ -123,6 +128,8 @@ class ResourceHandler(IndexHandler):
return out_headers, resp, errs return out_headers, resp, errs
except (WbException, ArchiveLoadFailed) as e: except (WbException, ArchiveLoadFailed) as e:
last_exc = e last_exc = e
if logger.isEnabledFor(logging.DEBUG):
traceback.print_exc()
errs[str(loader)] = str(e) errs[str(loader)] = str(e)
if last_exc: if last_exc:

View File

@ -370,12 +370,11 @@ class BaseRedisMultiKeyIndexSource(BaseAggregator, RedisIndexSource):
redis_key_pattern = res_template(self.redis_key_template, params) redis_key_pattern = res_template(self.redis_key_template, params)
if '*' not in redis_key_pattern: if '*' not in redis_key_pattern:
keys = [redis_key_pattern.encode('utf-8')] keys = [redis_key_pattern]
else: else:
keys = self.scan_keys(redis_key_pattern, params) keys = self.scan_keys(redis_key_pattern, params)
for key in keys: for key in keys:
key = key.decode('utf-8')
res = self._get_source_for_key(key) res = self._get_source_for_key(key)
if res: if res:
yield key, res yield key, res

View File

@ -282,7 +282,7 @@ class RedisIndexSource(BaseIndexSource):
redis_key_template = key_prefix redis_key_template = key_prefix
if not redis_: if not redis_:
redis_ = redis.StrictRedis.from_url(redis_url) redis_ = redis.StrictRedis.from_url(redis_url, decode_responses=True)
return redis_, key_prefix return redis_, key_prefix
def scan_keys(self, match_templ, params, member_key=None): def scan_keys(self, match_templ, params, member_key=None):
@ -301,18 +301,18 @@ class RedisIndexSource(BaseIndexSource):
keys = self._load_key_set(key) keys = self._load_key_set(key)
params[scan_key] = keys params[scan_key] = keys
match_templ = match_templ.encode('utf-8') #match_templ = match_templ.encode('utf-8')
return [match_templ.replace(b'*', key) for key in keys] return [match_templ.replace('*', key) for key in keys]
def _load_key_set(self, key): def _load_key_set(self, key):
if not self.member_key_type: if not self.member_key_type:
self.member_key_type = self.redis.type(key) self.member_key_type = self.redis.type(key)
if self.member_key_type == b'set': if self.member_key_type == 'set':
return self.redis.smembers(key) return self.redis.smembers(key)
elif self.member_key_type == b'hash': elif self.member_key_type == 'hash':
return self.redis.hvals(key) return self.redis.hvals(key)
# don't cache if any other type # don't cache if any other type
@ -332,6 +332,8 @@ class RedisIndexSource(BaseIndexSource):
def do_load(index_list): def do_load(index_list):
for line in index_list: for line in index_list:
if isinstance(line, str):
line = line.encode('utf-8')
yield CDXObject(line) yield CDXObject(line)
return do_load(index_list) return do_load(index_list)

View File

@ -82,15 +82,13 @@ class RedisResolver(RedisIndexSource):
if '*' in redis_key: if '*' in redis_key:
for key in self.scan_keys(redis_key, params): for key in self.scan_keys(redis_key, params):
#key = key.decode('utf-8')
res = self.redis.hget(key, filename) res = self.redis.hget(key, filename)
if res: if res:
break break
else: else:
res = self.redis.hget(redis_key, filename) res = self.redis.hget(redis_key, filename)
if res and six.PY3: res = to_native_str(res, 'utf-8')
res = res.decode('utf-8')
return res return res

View File

@ -110,7 +110,7 @@ class TestPathIndex(object):
assert resolver('example.warc.gz', cdx) == 'some_path/example.warc.gz' assert resolver('example.warc.gz', cdx) == 'some_path/example.warc.gz'
assert resolver('example-2.warc.gz', cdx) == 'some_path/example-2.warc.gz' assert resolver('example-2.warc.gz', cdx) == 'some_path/example-2.warc.gz'
assert resolver.member_key_type == b'set' assert resolver.member_key_type == 'set'
@patch('redis.StrictRedis', FakeStrictRedis) @patch('redis.StrictRedis', FakeStrictRedis)
def test_redis_resolver_multi_key_with_member_hash(self): def test_redis_resolver_multi_key_with_member_hash(self):
@ -135,7 +135,7 @@ class TestPathIndex(object):
assert resolver('example.warc.gz', cdx) == 'some_path/example.warc.gz' assert resolver('example.warc.gz', cdx) == 'some_path/example.warc.gz'
assert resolver('example-2.warc.gz', cdx) == 'some_path/example-2.warc.gz' assert resolver('example-2.warc.gz', cdx) == 'some_path/example-2.warc.gz'
assert resolver.member_key_type == b'hash' assert resolver.member_key_type == 'hash'
def test_make_best_resolver_http(self): def test_make_best_resolver_http(self):
res = DefaultResolverMixin.make_best_resolver('http://myhost.example.com/warcs/') res = DefaultResolverMixin.make_best_resolver('http://myhost.example.com/warcs/')