1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Windows Build Fixes/Appveyor CI (#225)

windows build fixes: all tests should pass, ci with appveyor
- add appveyor.yml
- path fixes for windows, use os.path.join
- templates_dir: use '/' always for jinja2 paths
- auto colls: ensure chdir before deleting dir
- recorder: ensure warc writer is always closed
- recorder: disable locking in warcwriter on windows for now (read access not available, shared
lock does not seem to be working)
- zipnum: ensure block is closed after read!
- cached dir test: wait before adding file
- tests: adjust timeout tests to allow more leeway in timing
This commit is contained in:
Ilya Kreymer 2017-08-05 17:12:16 -07:00 committed by GitHub
parent a6ab167dd3
commit bcb5bef39d
12 changed files with 110 additions and 50 deletions

28
appveyor.yml Normal file
View File

@@ -0,0 +1,28 @@
environment:
  global:
    # Helper from obvious-ci: configures the MSVC build environment for the
    # selected Python before running a command.
    CMD_IN_ENV: "cmd /E:ON /V:ON /C obvci_appveyor_python_build_env.cmd"

  # Build matrix: 32- and 64-bit CPython 2.7, 3.5 and 3.6 (preinstalled on AppVeyor).
  matrix:
    - PYTHON: "C:\\Python27"
    - PYTHON: "C:\\Python27-x64"
    - PYTHON: "C:\\Python35"
    - PYTHON: "C:\\Python35-x64"
    - PYTHON: "C:\\Python36"
    - PYTHON: "C:\\Python36-x64"

install:
  # Put the matrix-selected Python first on PATH so python/pip resolve to it.
  - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%"
  - "pip install --disable-pip-version-check --user --upgrade pip"
  - "pip install -U setuptools"
  - "pip install coverage pytest-cov coveralls"
  - "pip install cffi"
  - "pip install pyopenssl"
  - "pip install certauth boto youtube-dl"

build_script:
  - "python setup.py install"

test_script:
  - "python setup.py test"

View File

@@ -1,6 +1,5 @@
certauth
youtube-dl
boto
uwsgi
git+https://github.com/t0m/pyamf.git@python3
git+https://github.com/esnme/ultrajson.git

View File

@@ -142,9 +142,10 @@ class FrontEndApp(object):
if not coll or not self.warcserver.root_dir:
return
environ['pywb.templates_dir'] = os.path.join(self.warcserver.root_dir,
# jinja2 template paths always use '/' as separator
environ['pywb.templates_dir'] = '/'.join([self.warcserver.root_dir,
coll,
self.templates_dir)
self.templates_dir])
def serve_listing(self, environ):
result = {'fixed': self.warcserver.list_fixed_routes(),

View File

@@ -97,6 +97,7 @@ class MultiFileWARCWriter(BaseWARCWriter):
def _close_file(self, fh):
try:
if os.name != 'nt':
portalocker.lock(fh, portalocker.LOCK_UN)
fh.close()
except Exception as e:
@@ -222,6 +223,7 @@ class MultiFileWARCWriter(BaseWARCWriter):
self.fh_cache.pop(dir_key, None)
elif is_new:
if os.name != 'nt':
portalocker.lock(out, portalocker.LOCK_EX | portalocker.LOCK_NB)
self.fh_cache[dir_key] = (out, filename)

View File

@@ -66,7 +66,7 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
dedup_index = WritableRedisIndexer(redis_url=redis_url,
file_key_template=file_key_template,
rel_path_template=self.root_dir + '/warcs/',
rel_path_template=to_path(self.root_dir + '/warcs/'),
dupe_policy=dupe_policy)
return dedup_index
@@ -293,11 +293,11 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
assert cdx['urlkey'] == 'org,httpbin)/user-agent'
assert cdx['mime'] == 'application/json'
assert cdx['offset'] == '0'
assert cdx['filename'].startswith('USER/COLL/')
assert cdx['filename'].startswith(to_path('USER/COLL/'))
assert cdx['filename'].endswith('.warc.gz')
warcs = r.hgetall('USER:COLL:warc')
full_path = self.root_dir + '/warcs/' + cdx['filename']
full_path = to_path(self.root_dir + '/warcs/' + cdx['filename'])
assert warcs == {cdx['filename'].encode('utf-8'): full_path.encode('utf-8')}
def test_record_param_user_coll_same_dir(self):
@@ -353,7 +353,7 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
assert cdx['urlkey'] == 'org,httpbin)/user-agent'
assert cdx['mime'] == 'warc/revisit'
assert cdx['offset'] == '0'
assert cdx['filename'].startswith('USER/COLL/')
assert cdx['filename'].startswith(to_path('USER/COLL/'))
assert cdx['filename'].endswith('.warc.gz')
fullwarc = os.path.join(self.root_dir, 'warcs', cdx['filename'])
@@ -436,10 +436,13 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
assert os.path.isfile(path)
assert len(writer.fh_cache) == 1
writer.close()
assert len(writer.fh_cache) == 0
def test_record_multiple_writes_keep_open(self):
warc_path = to_path(self.root_dir + '/warcs/FOO/ABC-{hostname}-{timestamp}.warc.gz')
rel_path = self.root_dir + '/warcs/'
rel_path = to_path(self.root_dir + '/warcs/')
dedup_index = self._get_dedup_index(user=False)
@@ -487,7 +490,7 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
assert len(writer.fh_cache) == 1
writer.close_key(self.root_dir + '/warcs/FOO/')
writer.close_key(to_path(self.root_dir + '/warcs/FOO/'))
assert len(writer.fh_cache) == 0
@@ -501,10 +504,13 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
warcs = r.hgetall('FOO:warc')
assert len(warcs) == 2
writer.close()
assert len(writer.fh_cache) == 0
def test_record_multiple_writes_rollover_idle(self):
warc_path = to_path(self.root_dir + '/warcs/GOO/ABC-{hostname}-{timestamp}.warc.gz')
rel_path = self.root_dir + '/warcs/'
rel_path = to_path(self.root_dir + '/warcs/')
dedup_index = self._get_dedup_index(user=False)
@@ -539,13 +545,16 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
self._test_all_warcs('/warcs/GOO/', 2)
writer.close()
assert len(writer.fh_cache) == 0
def test_record_custom_record(self):
dedup_index = self._get_dedup_index(user=False)
warc_path = to_path(self.root_dir + '/warcs/meta/meta.warc.gz')
recorder_app = RecorderApp(self.upstream_url,
MultiFileWARCWriter(warc_path, dedup_index=dedup_index))
writer = MultiFileWARCWriter(warc_path, dedup_index=dedup_index)
recorder_app = RecorderApp(self.upstream_url, writer)
req_url = '/live/resource/postreq?url=custom://httpbin.org&param.recorder.coll=META&put_record=resource'
@@ -568,7 +577,9 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
warcs = r.hgetall('META:warc')
assert len(warcs) == 1
with open(warcs[b'meta/meta.warc.gz'], 'rb') as fh:
warc_key = os.path.join('meta', 'meta.warc.gz').encode('utf-8')
with open(warcs[warc_key], 'rb') as fh:
decomp = DecompressingBufferedReader(fh)
record = ArcWarcRecordLoader().parse_record_stream(decomp, ensure_http_headers=True)
@@ -592,6 +603,9 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass)
assert status_headers.get_header('Content-Type') == 'text/plain'
assert status_headers.get_header('Content-Length') == str(len(buff))
writer.close()
assert len(writer.fh_cache) == 0
def test_record_video_metadata(self):
warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')

View File

@@ -119,10 +119,13 @@ class BaseInsertView(object):
template_path = env.get('pywb.templates_dir')
if template_path:
template_path = os.path.join(template_path, self.insert_file)
# jinja paths are not os paths, always use '/' as separator
# https://github.com/pallets/jinja/issues/411
template_path = template_path + '/' + self.insert_file
try:
template = self.jenv.jinja_env.get_template(template_path)
except TemplateNotFound:
except TemplateNotFound as te:
pass
if not template:

View File

@@ -274,7 +274,7 @@ class BaseDirectoryIndexSource(BaseAggregator):
if rel_path == '.':
full_name = name
else:
full_name = rel_path + '/' + name
full_name = os.path.join(rel_path, name)
yield full_name, FileIndexSource(filename)
@@ -294,6 +294,8 @@ class BaseDirectoryIndexSource(BaseAggregator):
@classmethod
def init_from_string(cls, value):
if os.path.sep != '/':
value = value.replace('/', os.path.sep)
if '://' not in value and os.path.isdir(value):
return cls(value)

View File

@@ -57,7 +57,7 @@ class TestDirAgg(TempDirTests, BaseTestClass):
def test_agg_collA_found(self):
res, errs = self.dir_loader({'url': 'example.com/', 'param.coll': 'A'})
exp = [{'source': 'colls/A/indexes/example2.cdxj', 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'}]
exp = [{'source': to_path('colls/A/indexes/example2.cdxj'), 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'}]
assert(to_json_list(res) == exp)
assert(errs == {})
@@ -73,7 +73,7 @@ class TestDirAgg(TempDirTests, BaseTestClass):
def test_agg_collB_found(self):
res, errs = self.dir_loader({'url': 'iana.org/', 'param.coll': 'B'})
exp = [{'source': 'colls/B/indexes/iana.cdxj', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}]
exp = [{'source': to_path('colls/B/indexes/iana.cdxj'), 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}]
assert(to_json_list(res) == exp)
assert(errs == {})
@@ -83,7 +83,7 @@ class TestDirAgg(TempDirTests, BaseTestClass):
agg_source = SimpleAggregator({'dir': self.dir_loader})
res, errs = agg_source({'url': 'iana.org/', 'param.coll': 'B'})
exp = [{'source': 'dir:colls/B/indexes/iana.cdxj', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}]
exp = [{'source': to_path('dir:colls/B/indexes/iana.cdxj'), 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}]
assert(to_json_list(res) == exp)
assert(errs == {})
@@ -93,9 +93,9 @@ class TestDirAgg(TempDirTests, BaseTestClass):
res, errs = self.dir_loader({'url': 'iana.org/', 'param.coll': '*'})
exp = [
{'source': 'colls/B/indexes/iana.cdxj', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'},
{'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171238', 'filename': 'dupes.warc.gz'},
{'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171238', 'filename': 'dupes.warc.gz'},
{'source': to_path('colls/B/indexes/iana.cdxj'), 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'},
{'source': to_path('colls/C/indexes/dupes.cdxj'), 'timestamp': '20140127171238', 'filename': 'dupes.warc.gz'},
{'source': to_path('colls/C/indexes/dupes.cdxj'), 'timestamp': '20140127171238', 'filename': 'dupes.warc.gz'},
]
assert(to_json_list(res) == exp)
@@ -106,9 +106,9 @@ class TestDirAgg(TempDirTests, BaseTestClass):
res, errs = self.dir_loader({'url': 'example.com/', 'param.coll': '*'})
exp = [
{'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
{'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
{'source': 'colls/A/indexes/example2.cdxj', 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'}
{'source': to_path('colls/C/indexes/dupes.cdxj'), 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
{'source': to_path('colls/C/indexes/dupes.cdxj'), 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
{'source': to_path('colls/A/indexes/example2.cdxj'), 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'}
]
assert(to_json_list(res) == exp)
@@ -126,9 +126,9 @@ class TestDirAgg(TempDirTests, BaseTestClass):
{'source': 'ia', 'timestamp': '20100514231857', 'load_url': 'http://web.archive.org/web/20100514231857id_/http://example.com/'},
{'source': 'ia', 'timestamp': '20100519202418', 'load_url': 'http://web.archive.org/web/20100519202418id_/http://example.com/'},
{'source': 'ia', 'timestamp': '20100501123414', 'load_url': 'http://web.archive.org/web/20100501123414id_/http://example.com/'},
{'source': 'local:colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
{'source': 'local:colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
{'source': 'local:colls/A/indexes/example2.cdxj', 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'}
{'source': to_path('local:colls/C/indexes/dupes.cdxj'), 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
{'source': to_path('local:colls/C/indexes/dupes.cdxj'), 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
{'source': to_path('local:colls/A/indexes/example2.cdxj'), 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'}
]
assert(to_json_list(res) == exp)
@@ -156,9 +156,9 @@ class TestDirAgg(TempDirTests, BaseTestClass):
def test_agg_dir_sources_1(self):
res = self.dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'})
exp = {'sources': {'colls/A/indexes/example2.cdxj': 'file',
'colls/B/indexes/iana.cdxj': 'file',
'colls/C/indexes/dupes.cdxj': 'file'}
exp = {'sources': {to_path('colls/A/indexes/example2.cdxj'): 'file',
to_path('colls/B/indexes/iana.cdxj'): 'file',
to_path('colls/C/indexes/dupes.cdxj'): 'file'}
}
assert(res == exp)
@@ -166,8 +166,8 @@ class TestDirAgg(TempDirTests, BaseTestClass):
def test_agg_dir_sources_2(self):
res = self.dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '[A,C]'})
exp = {'sources': {'colls/A/indexes/example2.cdxj': 'file',
'colls/C/indexes/dupes.cdxj': 'file'}
exp = {'sources': {to_path('colls/A/indexes/example2.cdxj'): 'file',
to_path('colls/C/indexes/dupes.cdxj'): 'file'}
}
assert(res == exp)
@@ -193,9 +193,9 @@ class TestDirAgg(TempDirTests, BaseTestClass):
def test_cache_dir_sources_1(self):
exp = {'sources': {'colls/A/indexes/example2.cdxj': 'file',
'colls/B/indexes/iana.cdxj': 'file',
'colls/C/indexes/dupes.cdxj': 'file'}
exp = {'sources': {to_path('colls/A/indexes/example2.cdxj'): 'file',
to_path('colls/B/indexes/iana.cdxj'): 'file',
to_path('colls/C/indexes/dupes.cdxj'): 'file'}
}
res = self.cache_dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'})
@@ -204,7 +204,10 @@ class TestDirAgg(TempDirTests, BaseTestClass):
res = self.cache_dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'})
assert(res == exp)
new_file = os.path.join(self.root_dir, 'colls/C/indexes/empty.cdxj')
new_file = os.path.join(self.root_dir, to_path('colls/C/indexes/empty.cdxj'))
# ensure new file is created at least a second later
time.sleep(1.0)
with open(new_file, 'a') as fh:
os.utime(new_file, None)
@@ -212,5 +215,5 @@ class TestDirAgg(TempDirTests, BaseTestClass):
res = self.cache_dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'})
# New File Included
exp['sources']['colls/C/indexes/empty.cdxj'] = 'file'
exp['sources'][to_path('colls/C/indexes/empty.cdxj')] = 'file'
assert(res == exp)

View File

@@ -36,7 +36,7 @@ aggs_inv = {'simple': SimpleAggregator(sources, invert_sources=True),
agg_tm = {'gevent': GeventTimeoutAggregator(sources, timeout=0.0)}
nf = {'notfound': FileIndexSource(to_path('testdata/not-found-x'))}
nf = {'notfound': FileIndexSource('testdata/not-found-x')}
agg_nf = {'simple': SimpleAggregator(nf),
'gevent': GeventTimeoutAggregator(nf, timeout=5.0),
}

View File

@@ -47,7 +47,7 @@ def test_timeout_long_all_pass():
def test_timeout_slower_skipped_1():
agg = GeventTimeoutAggregator(sources, timeout=0.49)
agg = GeventTimeoutAggregator(sources, timeout=0.40)
res, errs = agg(dict(url='http://example.com/'))
@@ -58,8 +58,8 @@ def test_timeout_slower_skipped_1():
assert(errs == {'slower': 'timeout'})
def test_timeout_slower_skipped_2():
agg = GeventTimeoutAggregator(sources, timeout=0.19)
def test_timeout_slower_all_skipped():
agg = GeventTimeoutAggregator(sources, timeout=0.10)
res, errs = agg(dict(url='http://example.com/'))
@@ -74,8 +74,8 @@ def test_timeout_skipping():
assert(sources['slow'].calls == 3)
assert(sources['slower'].calls == 3)
agg = GeventTimeoutAggregator(sources, timeout=0.49,
t_count=2, t_duration=2.0)
agg = GeventTimeoutAggregator(sources, timeout=0.40,
t_count=2, t_duration=1.0)
exp = [{'source': 'slow', 'timestamp': '20160225042329'}]
@@ -107,7 +107,7 @@ def test_timeout_skipping():
assert(errs == {})
time.sleep(2.01)
time.sleep(1.5)
res, errs = agg(dict(url='http://example.com/'))
assert(to_json_list(res, fields=['source', 'timestamp']) == exp)

View File

@@ -351,7 +351,15 @@ class ZipNumIndexSource(BaseIndexSource):
for line in BytesIO(buff):
yield line
iter_ = itertools.chain(*map(decompress_block, ranges))
def iter_blocks(reader):
try:
for r in ranges:
yield decompress_block(r)
finally:
reader.close()
# iterate over all blocks
iter_ = itertools.chain.from_iterable(iter_blocks(reader))
# start bound
iter_ = linearsearch(iter_, query.key)

View File

@@ -51,8 +51,8 @@ class TestManagedColls(TempDirTests, BaseTestClass):
@classmethod
def teardown_class(cls):
super(TestManagedColls, cls).teardown_class()
os.chdir(cls.orig_cwd)
super(TestManagedColls, cls).teardown_class()
def _check_dirs(self, base, dirlist):
for dir_ in dirlist: