diff --git a/appveyor.yml b/appveyor.yml new file mode 100644 index 00000000..57957ae1 --- /dev/null +++ b/appveyor.yml @@ -0,0 +1,28 @@ +environment: + global: + CMD_IN_ENV: "cmd /E:ON /V:ON /C obvci_appveyor_python_build_env.cmd" + + matrix: + - PYTHON: "C:\\Python27" + - PYTHON: "C:\\Python27-x64" + - PYTHON: "C:\\Python35" + - PYTHON: "C:\\Python35-x64" + - PYTHON: "C:\\Python36" + - PYTHON: "C:\\Python36-x64" + +install: + - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%" + - "pip install --disable-pip-version-check --user --upgrade pip" + - "pip install -U setuptools" + - "pip install coverage pytest-cov coveralls" + - "pip install cffi" + - "pip install pyopenssl" + - "pip install certauth boto youtube-dl" + +build_script: + - "python setup.py install" + +test_script: + - "python setup.py test" + + diff --git a/extra_requirements.txt b/extra_requirements.txt index 9e60f57b..3530e2b4 100644 --- a/extra_requirements.txt +++ b/extra_requirements.txt @@ -1,6 +1,5 @@ certauth youtube-dl boto -uwsgi git+https://github.com/t0m/pyamf.git@python3 git+https://github.com/esnme/ultrajson.git diff --git a/pywb/apps/frontendapp.py b/pywb/apps/frontendapp.py index f8ef6dff..f28e92a3 100644 --- a/pywb/apps/frontendapp.py +++ b/pywb/apps/frontendapp.py @@ -142,9 +142,10 @@ class FrontEndApp(object): if not coll or not self.warcserver.root_dir: return - environ['pywb.templates_dir'] = os.path.join(self.warcserver.root_dir, - coll, - self.templates_dir) + # jinja2 template paths always use '/' as separator + environ['pywb.templates_dir'] = '/'.join([self.warcserver.root_dir, + coll, + self.templates_dir]) def serve_listing(self, environ): result = {'fixed': self.warcserver.list_fixed_routes(), diff --git a/pywb/recorder/multifilewarcwriter.py b/pywb/recorder/multifilewarcwriter.py index 41e9a363..82ae9d24 100644 --- a/pywb/recorder/multifilewarcwriter.py +++ b/pywb/recorder/multifilewarcwriter.py @@ -97,7 +97,8 @@ class MultiFileWARCWriter(BaseWARCWriter): def _close_file(self, 
fh): try: - portalocker.lock(fh, portalocker.LOCK_UN) + if os.name != 'nt': + portalocker.lock(fh, portalocker.LOCK_UN) fh.close() except Exception as e: print(e) @@ -222,7 +223,8 @@ class MultiFileWARCWriter(BaseWARCWriter): self.fh_cache.pop(dir_key, None) elif is_new: - portalocker.lock(out, portalocker.LOCK_EX | portalocker.LOCK_NB) + if os.name != 'nt': + portalocker.lock(out, portalocker.LOCK_EX | portalocker.LOCK_NB) self.fh_cache[dir_key] = (out, filename) def iter_open_files(self): diff --git a/pywb/recorder/test/test_recorder.py b/pywb/recorder/test/test_recorder.py index 5c9deace..1ffc77c7 100644 --- a/pywb/recorder/test/test_recorder.py +++ b/pywb/recorder/test/test_recorder.py @@ -66,7 +66,7 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass) dedup_index = WritableRedisIndexer(redis_url=redis_url, file_key_template=file_key_template, - rel_path_template=self.root_dir + '/warcs/', + rel_path_template=to_path(self.root_dir + '/warcs/'), dupe_policy=dupe_policy) return dedup_index @@ -293,11 +293,11 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass) assert cdx['urlkey'] == 'org,httpbin)/user-agent' assert cdx['mime'] == 'application/json' assert cdx['offset'] == '0' - assert cdx['filename'].startswith('USER/COLL/') + assert cdx['filename'].startswith(to_path('USER/COLL/')) assert cdx['filename'].endswith('.warc.gz') warcs = r.hgetall('USER:COLL:warc') - full_path = self.root_dir + '/warcs/' + cdx['filename'] + full_path = to_path(self.root_dir + '/warcs/' + cdx['filename']) assert warcs == {cdx['filename'].encode('utf-8'): full_path.encode('utf-8')} def test_record_param_user_coll_same_dir(self): @@ -353,7 +353,7 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass) assert cdx['urlkey'] == 'org,httpbin)/user-agent' assert cdx['mime'] == 'warc/revisit' assert cdx['offset'] == '0' - assert cdx['filename'].startswith('USER/COLL/') + assert 
cdx['filename'].startswith(to_path('USER/COLL/')) assert cdx['filename'].endswith('.warc.gz') fullwarc = os.path.join(self.root_dir, 'warcs', cdx['filename']) @@ -436,10 +436,13 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass) assert os.path.isfile(path) assert len(writer.fh_cache) == 1 + writer.close() + assert len(writer.fh_cache) == 0 + def test_record_multiple_writes_keep_open(self): warc_path = to_path(self.root_dir + '/warcs/FOO/ABC-{hostname}-{timestamp}.warc.gz') - rel_path = self.root_dir + '/warcs/' + rel_path = to_path(self.root_dir + '/warcs/') dedup_index = self._get_dedup_index(user=False) @@ -487,7 +490,7 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass) assert len(writer.fh_cache) == 1 - writer.close_key(self.root_dir + '/warcs/FOO/') + writer.close_key(to_path(self.root_dir + '/warcs/FOO/')) assert len(writer.fh_cache) == 0 @@ -501,10 +504,13 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass) warcs = r.hgetall('FOO:warc') assert len(warcs) == 2 + writer.close() + assert len(writer.fh_cache) == 0 + def test_record_multiple_writes_rollover_idle(self): warc_path = to_path(self.root_dir + '/warcs/GOO/ABC-{hostname}-{timestamp}.warc.gz') - rel_path = self.root_dir + '/warcs/' + rel_path = to_path(self.root_dir + '/warcs/') dedup_index = self._get_dedup_index(user=False) @@ -539,13 +545,16 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass) self._test_all_warcs('/warcs/GOO/', 2) + writer.close() + assert len(writer.fh_cache) == 0 + def test_record_custom_record(self): dedup_index = self._get_dedup_index(user=False) warc_path = to_path(self.root_dir + '/warcs/meta/meta.warc.gz') - recorder_app = RecorderApp(self.upstream_url, - MultiFileWARCWriter(warc_path, dedup_index=dedup_index)) + writer = MultiFileWARCWriter(warc_path, dedup_index=dedup_index) + recorder_app = RecorderApp(self.upstream_url, writer) req_url = 
'/live/resource/postreq?url=custom://httpbin.org&param.recorder.coll=META&put_record=resource' @@ -568,7 +577,9 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass) warcs = r.hgetall('META:warc') assert len(warcs) == 1 - with open(warcs[b'meta/meta.warc.gz'], 'rb') as fh: + warc_key = os.path.join('meta', 'meta.warc.gz').encode('utf-8') + + with open(warcs[warc_key], 'rb') as fh: decomp = DecompressingBufferedReader(fh) record = ArcWarcRecordLoader().parse_record_stream(decomp, ensure_http_headers=True) @@ -592,6 +603,9 @@ class TestRecorder(LiveServerTests, FakeRedisTests, TempDirTests, BaseTestClass) assert status_headers.get_header('Content-Type') == 'text/plain' assert status_headers.get_header('Content-Length') == str(len(buff)) + writer.close() + assert len(writer.fh_cache) == 0 + def test_record_video_metadata(self): warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/') diff --git a/pywb/rewrite/templateview.py b/pywb/rewrite/templateview.py index 5e5ed259..4a7973b8 100644 --- a/pywb/rewrite/templateview.py +++ b/pywb/rewrite/templateview.py @@ -119,10 +119,13 @@ class BaseInsertView(object): template_path = env.get('pywb.templates_dir') if template_path: - template_path = os.path.join(template_path, self.insert_file) + # jinja paths are not os paths, always use '/' as separator + # https://github.com/pallets/jinja/issues/411 + template_path = template_path + '/' + self.insert_file + try: template = self.jenv.jinja_env.get_template(template_path) - except TemplateNotFound: + except TemplateNotFound as te: pass if not template: diff --git a/pywb/warcserver/index/aggregator.py b/pywb/warcserver/index/aggregator.py index db5cf435..c323d943 100644 --- a/pywb/warcserver/index/aggregator.py +++ b/pywb/warcserver/index/aggregator.py @@ -274,7 +274,7 @@ class BaseDirectoryIndexSource(BaseAggregator): if rel_path == '.': full_name = name else: - full_name = rel_path + '/' + name + full_name = os.path.join(rel_path, name) yield 
full_name, FileIndexSource(filename) @@ -294,6 +294,8 @@ class BaseDirectoryIndexSource(BaseAggregator): @classmethod def init_from_string(cls, value): + if os.path.sep != '/': + value = value.replace('/', os.path.sep) if '://' not in value and os.path.isdir(value): return cls(value) diff --git a/pywb/warcserver/index/test/test_dir_agg.py b/pywb/warcserver/index/test/test_dir_agg.py index b56e6806..b5e615fe 100644 --- a/pywb/warcserver/index/test/test_dir_agg.py +++ b/pywb/warcserver/index/test/test_dir_agg.py @@ -57,7 +57,7 @@ class TestDirAgg(TempDirTests, BaseTestClass): def test_agg_collA_found(self): res, errs = self.dir_loader({'url': 'example.com/', 'param.coll': 'A'}) - exp = [{'source': 'colls/A/indexes/example2.cdxj', 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'}] + exp = [{'source': to_path('colls/A/indexes/example2.cdxj'), 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'}] assert(to_json_list(res) == exp) assert(errs == {}) @@ -73,7 +73,7 @@ class TestDirAgg(TempDirTests, BaseTestClass): def test_agg_collB_found(self): res, errs = self.dir_loader({'url': 'iana.org/', 'param.coll': 'B'}) - exp = [{'source': 'colls/B/indexes/iana.cdxj', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}] + exp = [{'source': to_path('colls/B/indexes/iana.cdxj'), 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}] assert(to_json_list(res) == exp) assert(errs == {}) @@ -83,7 +83,7 @@ class TestDirAgg(TempDirTests, BaseTestClass): agg_source = SimpleAggregator({'dir': self.dir_loader}) res, errs = agg_source({'url': 'iana.org/', 'param.coll': 'B'}) - exp = [{'source': 'dir:colls/B/indexes/iana.cdxj', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}] + exp = [{'source': to_path('dir:colls/B/indexes/iana.cdxj'), 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}] assert(to_json_list(res) == exp) assert(errs == {}) @@ -93,9 +93,9 @@ class TestDirAgg(TempDirTests, BaseTestClass): res, errs = self.dir_loader({'url': 
'iana.org/', 'param.coll': '*'}) exp = [ - {'source': 'colls/B/indexes/iana.cdxj', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}, - {'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171238', 'filename': 'dupes.warc.gz'}, - {'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171238', 'filename': 'dupes.warc.gz'}, + {'source': to_path('colls/B/indexes/iana.cdxj'), 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}, + {'source': to_path('colls/C/indexes/dupes.cdxj'), 'timestamp': '20140127171238', 'filename': 'dupes.warc.gz'}, + {'source': to_path('colls/C/indexes/dupes.cdxj'), 'timestamp': '20140127171238', 'filename': 'dupes.warc.gz'}, ] assert(to_json_list(res) == exp) @@ -106,9 +106,9 @@ class TestDirAgg(TempDirTests, BaseTestClass): res, errs = self.dir_loader({'url': 'example.com/', 'param.coll': '*'}) exp = [ - {'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'}, - {'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'}, - {'source': 'colls/A/indexes/example2.cdxj', 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'} + {'source': to_path('colls/C/indexes/dupes.cdxj'), 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'}, + {'source': to_path('colls/C/indexes/dupes.cdxj'), 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'}, + {'source': to_path('colls/A/indexes/example2.cdxj'), 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'} ] assert(to_json_list(res) == exp) @@ -126,9 +126,9 @@ class TestDirAgg(TempDirTests, BaseTestClass): {'source': 'ia', 'timestamp': '20100514231857', 'load_url': 'http://web.archive.org/web/20100514231857id_/http://example.com/'}, {'source': 'ia', 'timestamp': '20100519202418', 'load_url': 'http://web.archive.org/web/20100519202418id_/http://example.com/'}, {'source': 'ia', 'timestamp': '20100501123414', 'load_url': 
'http://web.archive.org/web/20100501123414id_/http://example.com/'}, - {'source': 'local:colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'}, - {'source': 'local:colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'}, - {'source': 'local:colls/A/indexes/example2.cdxj', 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'} + {'source': to_path('local:colls/C/indexes/dupes.cdxj'), 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'}, + {'source': to_path('local:colls/C/indexes/dupes.cdxj'), 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'}, + {'source': to_path('local:colls/A/indexes/example2.cdxj'), 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'} ] assert(to_json_list(res) == exp) @@ -156,9 +156,9 @@ class TestDirAgg(TempDirTests, BaseTestClass): def test_agg_dir_sources_1(self): res = self.dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'}) - exp = {'sources': {'colls/A/indexes/example2.cdxj': 'file', - 'colls/B/indexes/iana.cdxj': 'file', - 'colls/C/indexes/dupes.cdxj': 'file'} + exp = {'sources': {to_path('colls/A/indexes/example2.cdxj'): 'file', + to_path('colls/B/indexes/iana.cdxj'): 'file', + to_path('colls/C/indexes/dupes.cdxj'): 'file'} } assert(res == exp) @@ -166,8 +166,8 @@ class TestDirAgg(TempDirTests, BaseTestClass): def test_agg_dir_sources_2(self): res = self.dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '[A,C]'}) - exp = {'sources': {'colls/A/indexes/example2.cdxj': 'file', - 'colls/C/indexes/dupes.cdxj': 'file'} + exp = {'sources': {to_path('colls/A/indexes/example2.cdxj'): 'file', + to_path('colls/C/indexes/dupes.cdxj'): 'file'} } assert(res == exp) @@ -193,9 +193,9 @@ class TestDirAgg(TempDirTests, BaseTestClass): def test_cache_dir_sources_1(self): - exp = {'sources': {'colls/A/indexes/example2.cdxj': 'file', - 'colls/B/indexes/iana.cdxj': 'file', - 'colls/C/indexes/dupes.cdxj': 'file'} + exp = 
{'sources': {to_path('colls/A/indexes/example2.cdxj'): 'file', + to_path('colls/B/indexes/iana.cdxj'): 'file', + to_path('colls/C/indexes/dupes.cdxj'): 'file'} } res = self.cache_dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'}) @@ -204,7 +204,10 @@ class TestDirAgg(TempDirTests, BaseTestClass): res = self.cache_dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'}) assert(res == exp) - new_file = os.path.join(self.root_dir, 'colls/C/indexes/empty.cdxj') + new_file = os.path.join(self.root_dir, to_path('colls/C/indexes/empty.cdxj')) + + # ensure new file is created at least a second later + time.sleep(1.0) with open(new_file, 'a') as fh: os.utime(new_file, None) @@ -212,5 +215,5 @@ class TestDirAgg(TempDirTests, BaseTestClass): res = self.cache_dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'}) # New File Included - exp['sources']['colls/C/indexes/empty.cdxj'] = 'file' + exp['sources'][to_path('colls/C/indexes/empty.cdxj')] = 'file' assert(res == exp) diff --git a/pywb/warcserver/index/test/test_memento_agg.py b/pywb/warcserver/index/test/test_memento_agg.py index a27a6b83..20cf4838 100644 --- a/pywb/warcserver/index/test/test_memento_agg.py +++ b/pywb/warcserver/index/test/test_memento_agg.py @@ -36,7 +36,7 @@ aggs_inv = {'simple': SimpleAggregator(sources, invert_sources=True), agg_tm = {'gevent': GeventTimeoutAggregator(sources, timeout=0.0)} -nf = {'notfound': FileIndexSource(to_path('testdata/not-found-x'))} +nf = {'notfound': FileIndexSource('testdata/not-found-x')} agg_nf = {'simple': SimpleAggregator(nf), 'gevent': GeventTimeoutAggregator(nf, timeout=5.0), } diff --git a/pywb/warcserver/index/test/test_timeouts.py b/pywb/warcserver/index/test/test_timeouts.py index f50e83d0..1560d7b0 100644 --- a/pywb/warcserver/index/test/test_timeouts.py +++ b/pywb/warcserver/index/test/test_timeouts.py @@ -47,7 +47,7 @@ def test_timeout_long_all_pass(): def test_timeout_slower_skipped_1(): - agg = 
GeventTimeoutAggregator(sources, timeout=0.49) + agg = GeventTimeoutAggregator(sources, timeout=0.40) res, errs = agg(dict(url='http://example.com/')) @@ -58,8 +58,8 @@ def test_timeout_slower_skipped_1(): assert(errs == {'slower': 'timeout'}) -def test_timeout_slower_skipped_2(): - agg = GeventTimeoutAggregator(sources, timeout=0.19) +def test_timeout_slower_all_skipped(): + agg = GeventTimeoutAggregator(sources, timeout=0.10) res, errs = agg(dict(url='http://example.com/')) @@ -74,8 +74,8 @@ def test_timeout_skipping(): assert(sources['slow'].calls == 3) assert(sources['slower'].calls == 3) - agg = GeventTimeoutAggregator(sources, timeout=0.49, - t_count=2, t_duration=2.0) + agg = GeventTimeoutAggregator(sources, timeout=0.40, + t_count=2, t_duration=1.0) exp = [{'source': 'slow', 'timestamp': '20160225042329'}] @@ -107,7 +107,7 @@ def test_timeout_skipping(): assert(errs == {}) - time.sleep(2.01) + time.sleep(1.5) res, errs = agg(dict(url='http://example.com/')) assert(to_json_list(res, fields=['source', 'timestamp']) == exp) diff --git a/pywb/warcserver/index/zipnum.py b/pywb/warcserver/index/zipnum.py index 607ebfd0..84d9ec02 100644 --- a/pywb/warcserver/index/zipnum.py +++ b/pywb/warcserver/index/zipnum.py @@ -351,7 +351,15 @@ class ZipNumIndexSource(BaseIndexSource): for line in BytesIO(buff): yield line - iter_ = itertools.chain(*map(decompress_block, ranges)) + def iter_blocks(reader): + try: + for r in ranges: + yield decompress_block(r) + finally: + reader.close() + + # iterate over all blocks + iter_ = itertools.chain.from_iterable(iter_blocks(reader)) # start bound iter_ = linearsearch(iter_, query.key) diff --git a/tests/test_auto_colls.py b/tests/test_auto_colls.py index 3354d864..4c8acbad 100644 --- a/tests/test_auto_colls.py +++ b/tests/test_auto_colls.py @@ -51,8 +51,8 @@ class TestManagedColls(TempDirTests, BaseTestClass): @classmethod def teardown_class(cls): - super(TestManagedColls, cls).teardown_class() os.chdir(cls.orig_cwd) + 
super(TestManagedColls, cls).teardown_class() def _check_dirs(self, base, dirlist): for dir_ in dirlist: