diff --git a/pywb/utils/loaders.py b/pywb/utils/loaders.py
index da4d8116..6d721de3 100644
--- a/pywb/utils/loaders.py
+++ b/pywb/utils/loaders.py
@@ -404,23 +404,40 @@ class S3Loader(BaseLoader):
 
 # =================================================================
 class WebHDFSLoader(HttpLoader):
-    HTTP_URL = 'http://{host}/webhdfs/v1{path}?op=OPEN&offset={offset}'
-    LENGTH_PARAM = '&length={length}'
+    HTTP_URL = 'http://{host}/webhdfs/v1{path}?'
 
     def load(self, url, offset, length):
+        """Fetch a webhdfs:// url through the WebHDFS http OPEN operation.
+
+        offset is always sent; length only when > 0. If set and non-empty,
+        WEBHDFS_USER is sent as user.name and WEBHDFS_TOKEN as delegation.
+        The range is carried in the query, so the http load gets (0, -1).
+        """
         parts = urlsplit(url)
 
-        http_url = self.HTTP_URL
+        http_url = self.HTTP_URL.format(host=parts.netloc,
+                                        path=parts.path)
+
+        params = {'op': 'OPEN',
+                  'offset': str(offset)
+                 }
 
         if length > 0:
-            http_url += self.LENGTH_PARAM
+            params['length'] = str(length)
 
-        full_url = http_url.format(host=parts.netloc,
-                                   path=parts.path,
-                                   offset=offset,
-                                   length=length)
+        # read each variable once; unset and empty both mean "not configured"
+        user = os.environ.get('WEBHDFS_USER')
+        if user:
+            params['user.name'] = user
 
-        return super(WebHDFSLoader, self).load(full_url, 0, -1)
+        token = os.environ.get('WEBHDFS_TOKEN')
+        if token:
+            params['delegation'] = token
+
+        # urlencode() escapes the values (e.g. the token) for use in a query
+        http_url += urlencode(params)
+
+        return super(WebHDFSLoader, self).load(http_url, 0, -1)
 
 
 # =================================================================
diff --git a/pywb/utils/test/test_loaders.py b/pywb/utils/test/test_loaders.py
index eef7217a..819390b3 100644
--- a/pywb/utils/test/test_loaders.py
+++ b/pywb/utils/test/test_loaders.py
@@ -85,6 +85,8 @@ from pywb.utils.loaders import BlockLoader, HMACCookieMaker, to_file_url
 from pywb.utils.loaders import extract_client_cookie
 from pywb.utils.loaders import read_last_line
 
+from pywb.utils.canonicalize import canonicalize
+
 from mock import patch
 
 from warcio.bufferedreaders import DecompressingBufferedReader
@@ -119,20 +121,44 @@ def test_s3_read_2():
     reader = DecompressingBufferedReader(BytesIO(buff))
     assert reader.readline() == b'\n'
 
-def test_mock_webhdfs_load():
-    def mock_load(expected):
-        def mock(self, url, offset, length):
-            assert url == expected
-            assert offset == 0
-            assert length == -1
-            return None
+def mock_load(expected):
+    """Return a stand-in for HttpLoader.load that checks the url it is given."""
+    def mock(self, url, offset, length):
+        # compared via canonicalize() rather than ==, presumably so that the
+        # order of the urlencode()d query params does not matter
+        assert canonicalize(url) == canonicalize(expected)
+        assert offset == 0
+        assert length == -1
+        return None
 
-        return mock
+    return mock
 
-    with patch('pywb.utils.loaders.HttpLoader.load', mock_load('http://remote-host:1234/webhdfs/v1/some/file.warc.gz?op=OPEN&offset=10&length=50')):
+def webhdfs_env(user='', token=''):
+    """Pin WEBHDFS_USER / WEBHDFS_TOKEN for one test ('' means not configured).
+
+    patch.dict puts os.environ back on exit, so the tests neither leak
+    settings into each other nor depend on the caller's real environment.
+    """
+    return patch.dict(os.environ, {'WEBHDFS_USER': user, 'WEBHDFS_TOKEN': token})
+
+def test_mock_webhdfs_load_1():
+    expected = 'http://remote-host:1234/webhdfs/v1/some/file.warc.gz?op=OPEN&offset=10&length=50'
+    with webhdfs_env(), patch('pywb.utils.loaders.HttpLoader.load', mock_load(expected)):
         res = BlockLoader().load('webhdfs://remote-host:1234/some/file.warc.gz', 10, 50)
 
-    with patch('pywb.utils.loaders.HttpLoader.load', mock_load('http://remote-host/webhdfs/v1/some/file.warc.gz?op=OPEN&offset=10')):
+def test_mock_webhdfs_load_2():
+    expected = 'http://remote-host/webhdfs/v1/some/file.warc.gz?op=OPEN&offset=10'
+    with webhdfs_env(), patch('pywb.utils.loaders.HttpLoader.load', mock_load(expected)):
+        res = BlockLoader().load('webhdfs://remote-host/some/file.warc.gz', 10, -1)
+
+def test_mock_webhdfs_load_3_username():
+    expected = 'http://remote-host/webhdfs/v1/some/file.warc.gz?op=OPEN&offset=10&user.name=someuser'
+    with webhdfs_env(user='someuser'), patch('pywb.utils.loaders.HttpLoader.load', mock_load(expected)):
+        res = BlockLoader().load('webhdfs://remote-host/some/file.warc.gz', 10, -1)
+
+def test_mock_webhdfs_load_4_token():
+    expected = 'http://remote-host/webhdfs/v1/some/file.warc.gz?op=OPEN&offset=10&delegation=ATOKEN'
+    with webhdfs_env(token='ATOKEN'), patch('pywb.utils.loaders.HttpLoader.load', mock_load(expected)):
         res = BlockLoader().load('webhdfs://remote-host/some/file.warc.gz', 10, -1)
 
 