1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

loaders: add WebHDFSLoader to handle the 'webhdfs://' scheme by loading over HTTP from WebHDFS (ukwa/ukwa-pywb#3)

tests: add basic test for WebHDFSLoader API format
This commit is contained in:
Ilya Kreymer 2018-02-02 12:47:49 -08:00 committed by John Berlin
parent c1f0f7517a
commit 94eb4ad206
No known key found for this signature in database
GPG Key ID: 6EF5E4B442011B02
2 changed files with 39 additions and 0 deletions

View File

@ -225,6 +225,7 @@ class BlockLoader(BaseLoader):
BlockLoader.loaders['s3'] = S3Loader
BlockLoader.loaders['file'] = LocalFileLoader
BlockLoader.loaders['pkg'] = PackageLoader
BlockLoader.loaders['webhdfs'] = WebHDFSLoader
@staticmethod
def set_profile_loader(src):
@ -401,6 +402,27 @@ class S3Loader(BaseLoader):
return obj['Body']
# =================================================================
class WebHDFSLoader(HttpLoader):
    """Loader that reads a file through the WebHDFS REST API over http.

    The host and path of the incoming url are substituted into a WebHDFS
    ``op=OPEN`` request url, which is then fetched by the parent loader.
    """

    # Request template; the offset query parameter is always included
    HTTP_URL = 'http://{host}/webhdfs/v1{path}?op=OPEN&offset={offset}'

    # Query-string suffix, appended only when a positive length is given
    LENGTH_PARAM = '&length={length}'

    def load(self, url, offset, length):
        """Build the WebHDFS http url for ``url`` and load it via the parent.

        :param url: url whose netloc and path name the WebHDFS host and file
        :param offset: value placed in the ``offset`` query parameter
        :param length: value placed in the ``length`` query parameter;
                       the parameter is omitted unless length is > 0
        :return: the result of the parent loader's ``load`` for the built url
        """
        split_url = urlsplit(url)

        if length > 0:
            template = self.HTTP_URL + self.LENGTH_PARAM
        else:
            template = self.HTTP_URL

        fields = dict(host=split_url.netloc,
                      path=split_url.path,
                      offset=offset,
                      length=length)

        request_url = template.format(**fields)

        # offset/length are already carried in the query string, so the
        # parent is invoked with offset 0 and length -1
        # NOTE(review): presumably this stops the parent from applying its
        # own range handling -- confirm against HttpLoader.load
        return super(WebHDFSLoader, self).load(request_url, 0, -1)
# =================================================================
# Signed Cookie-Maker
# =================================================================

View File

@ -85,6 +85,8 @@ from pywb.utils.loaders import BlockLoader, HMACCookieMaker, to_file_url
from pywb.utils.loaders import extract_client_cookie
from pywb.utils.loaders import read_last_line
from mock import patch
from warcio.bufferedreaders import DecompressingBufferedReader
from pywb import get_test_dir
@ -117,6 +119,21 @@ def test_s3_read_2():
reader = DecompressingBufferedReader(BytesIO(buff))
assert reader.readline() == b'<!DOCTYPE html>\n'
def test_mock_webhdfs_load():
    """Verify the http url that a 'webhdfs://' load passes to HttpLoader.load.

    HttpLoader.load is patched with a checker, so nothing is fetched; the
    checker asserts the url, offset and length it receives.
    """
    def make_checker(expected_url):
        # Replacement for HttpLoader.load bound to one expected url
        def checker(self, url, offset, length):
            assert url == expected_url
            # the range is expected in the url itself, not in these arguments
            assert offset == 0
            assert length == -1
            return None

        return checker

    # (webhdfs url, offset, length, expected http url)
    cases = (
        ('webhdfs://remote-host:1234/some/file.warc.gz', 10, 50,
         'http://remote-host:1234/webhdfs/v1/some/file.warc.gz?op=OPEN&offset=10&length=50'),
        ('webhdfs://remote-host/some/file.warc.gz', 10, -1,
         'http://remote-host/webhdfs/v1/some/file.warc.gz?op=OPEN&offset=10'),
    )

    for webhdfs_url, offset, length, expected_http_url in cases:
        with patch('pywb.utils.loaders.HttpLoader.load', make_checker(expected_http_url)):
            BlockLoader().load(webhdfs_url, offset, length)
# Error