From 94eb4ad20617dca613cb1210a5e9cc5f197e3492 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 2 Feb 2018 12:47:49 -0800 Subject: [PATCH] loaders: add WebHDFSLoader loader to support handling 'webhdfs://' scheme to load over http from WebHDFS (ukwa/ukwa-pywb#3) tests: add basic test for WebHDFSLoader api format --- pywb/utils/loaders.py | 22 ++++++++++++++++++++++ pywb/utils/test/test_loaders.py | 17 +++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/pywb/utils/loaders.py b/pywb/utils/loaders.py index d3fc7be7..da4d8116 100644 --- a/pywb/utils/loaders.py +++ b/pywb/utils/loaders.py @@ -225,6 +225,7 @@ class BlockLoader(BaseLoader): BlockLoader.loaders['s3'] = S3Loader BlockLoader.loaders['file'] = LocalFileLoader BlockLoader.loaders['pkg'] = PackageLoader + BlockLoader.loaders['webhdfs'] = WebHDFSLoader @staticmethod def set_profile_loader(src): @@ -401,6 +402,27 @@ class S3Loader(BaseLoader): return obj['Body'] +# ================================================================= +class WebHDFSLoader(HttpLoader): + HTTP_URL = 'http://{host}/webhdfs/v1{path}?op=OPEN&offset={offset}' + LENGTH_PARAM = '&length={length}' + + def load(self, url, offset, length): + parts = urlsplit(url) + + http_url = self.HTTP_URL + + if length > 0: + http_url += self.LENGTH_PARAM + + full_url = http_url.format(host=parts.netloc, + path=parts.path, + offset=offset, + length=length) + + return super(WebHDFSLoader, self).load(full_url, 0, -1) + + # ================================================================= # Signed Cookie-Maker # ================================================================= diff --git a/pywb/utils/test/test_loaders.py b/pywb/utils/test/test_loaders.py index 4a217616..eef7217a 100644 --- a/pywb/utils/test/test_loaders.py +++ b/pywb/utils/test/test_loaders.py @@ -85,6 +85,8 @@ from pywb.utils.loaders import BlockLoader, HMACCookieMaker, to_file_url from pywb.utils.loaders import extract_client_cookie from pywb.utils.loaders import 
read_last_line +from mock import patch + from warcio.bufferedreaders import DecompressingBufferedReader from pywb import get_test_dir @@ -117,6 +119,21 @@ def test_s3_read_2(): reader = DecompressingBufferedReader(BytesIO(buff)) assert reader.readline() == b'\n' +def test_mock_webhdfs_load(): + def mock_load(expected): + def mock(self, url, offset, length): + assert url == expected + assert offset == 0 + assert length == -1 + return None + + return mock + + with patch('pywb.utils.loaders.HttpLoader.load', mock_load('http://remote-host:1234/webhdfs/v1/some/file.warc.gz?op=OPEN&offset=10&length=50')): + res = BlockLoader().load('webhdfs://remote-host:1234/some/file.warc.gz', 10, 50) + + with patch('pywb.utils.loaders.HttpLoader.load', mock_load('http://remote-host/webhdfs/v1/some/file.warc.gz?op=OPEN&offset=10')): + res = BlockLoader().load('webhdfs://remote-host/some/file.warc.gz', 10, -1) # Error