From 02f8fa9ff34b9587179565c50037025a351e5096 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 28 Sep 2017 08:37:04 -0700 Subject: [PATCH] windows: fix file path to/from file:// url conversion, add from_file_url() and use to_file_url() more consistently resolvers: make_best_resolver() handles file:// urls, but not PrefixResolver itself --- pywb/utils/loaders.py | 19 ++++++++++++++----- pywb/warcserver/resource/pathresolvers.py | 9 +++------ .../resource/test/test_pathresolvers.py | 4 ++-- tests/test_record_replay.py | 4 ++-- 4 files changed, 21 insertions(+), 15 deletions(-) diff --git a/pywb/utils/loaders.py b/pywb/utils/loaders.py index 3b80f3e9..4b6c4b74 100644 --- a/pywb/utils/loaders.py +++ b/pywb/utils/loaders.py @@ -11,7 +11,6 @@ import requests import yaml import six -from six.moves.urllib.request import pathname2url, url2pathname from six.moves.urllib.parse import urljoin, unquote_plus, urlsplit, urlencode import time @@ -38,8 +37,17 @@ def is_http(filename): def to_file_url(filename): """ Convert a filename to a file:// url """ - url = os.path.abspath(filename) - url = urljoin('file:', pathname2url(url)) + url = 'file://' + os.path.abspath(filename).replace(os.path.sep, '/') + return url + + +#================================================================= +def from_file_url(url): + """ Convert from file:// url to file path + """ + if url.startswith('file://'): + url = url[len('file://'):].replace('/', os.path.sep) + return url @@ -259,9 +267,10 @@ class LocalFileLoader(PackageLoader): file_only = url.startswith(('/', '.')) # convert to filename - if url.startswith('file://'): + filename = from_file_url(url) + if filename != url: file_only = True - url = url2pathname(url[len('file://'):]) + url = filename try: # first, try as file diff --git a/pywb/warcserver/resource/pathresolvers.py b/pywb/warcserver/resource/pathresolvers.py index 3fa69c7b..f6f6ad61 100644 --- a/pywb/warcserver/resource/pathresolvers.py +++ b/pywb/warcserver/resource/pathresolvers.py @@ -5,7 +5,7 @@ from pywb.utils.binsearch import iter_exact from pywb.warcserver.index.indexsource import RedisIndexSource -from six.moves.urllib.request import url2pathname +from pywb.utils.loaders import from_file_url import six import os @@ -40,9 +40,7 @@ class PrefixResolver(object): if '*' not in path: return path - if path.startswith('file://'): - path = path[7:] - elif '://' in path: + if '://' in path: return path paths = glob.glob(path) @@ -112,8 +110,7 @@ class DefaultResolverMixin(object): if path.startswith('redis://'): return RedisResolver(path) - if path.startswith('file://'): - path = url2pathname(path[len('file://'):]) + path = from_file_url(path) if os.path.isfile(path): return PathIndexResolver(path) diff --git a/pywb/warcserver/resource/test/test_pathresolvers.py b/pywb/warcserver/resource/test/test_pathresolvers.py index 03ab3e85..9fc14128 100644 --- a/pywb/warcserver/resource/test/test_pathresolvers.py +++ b/pywb/warcserver/resource/test/test_pathresolvers.py @@ -31,8 +31,8 @@ class TestPathIndex(object): assert res[0] == os.path.join(get_test_dir(), 'warcs', 'example.warc.gz') def test_resolver_dir_wildcard_as_file_url(self): - url = to_file_url(get_test_dir()) + os.path.sep + '*' + os.path.sep - resolver = PrefixResolver(url) + url = to_file_url(get_test_dir()) + '/*/' + resolver = DefaultResolverMixin.make_best_resolver(url) cdx = CDXObject() res = resolver('example.warc.gz', cdx) diff --git a/tests/test_record_replay.py b/tests/test_record_replay.py index da35a863..2b39db89 100644 --- a/tests/test_record_replay.py +++ b/tests/test_record_replay.py @@ -104,7 +104,7 @@ class TestRecordReplay(CollsDirMixin, BaseConfigTest): link_lines = res.text.rstrip().split('\n') assert len(link_lines) == 5 - assert '_test_colls:test2/indexes/autoindex.cdxj' in link_lines[3] - assert '_test_colls:test/indexes/autoindex.cdxj' in link_lines[4] + assert to_path('_test_colls:test2/indexes/autoindex.cdxj') in link_lines[3] + assert to_path('_test_colls:test/indexes/autoindex.cdxj') in link_lines[4]