mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
loaders: add WebHDFSLoader loader to support handling 'webhdfs://' scheme to load over http from WebHDFS (ukwa/ukwa-pywb#3)
tests: add basic test for WebHDFSLoader api format
This commit is contained in:
parent
c1f0f7517a
commit
94eb4ad206
@ -225,6 +225,7 @@ class BlockLoader(BaseLoader):
|
||||
BlockLoader.loaders['s3'] = S3Loader
|
||||
BlockLoader.loaders['file'] = LocalFileLoader
|
||||
BlockLoader.loaders['pkg'] = PackageLoader
|
||||
BlockLoader.loaders['webhdfs'] = WebHDFSLoader
|
||||
|
||||
@staticmethod
|
||||
def set_profile_loader(src):
|
||||
@ -401,6 +402,27 @@ class S3Loader(BaseLoader):
|
||||
return obj['Body']
|
||||
|
||||
|
||||
# =================================================================
|
||||
class WebHDFSLoader(HttpLoader):
    """Loader for ``webhdfs://`` urls, fetched over http.

    The incoming url is rewritten into an ``http://`` url of the form
    described by ``HTTP_URL`` and then handed to ``HttpLoader.load``.
    """

    # Template for the rewritten http url; offset is always included.
    HTTP_URL = 'http://{host}/webhdfs/v1{path}?op=OPEN&offset={offset}'

    # Suffix appended to HTTP_URL only when a positive length is requested.
    LENGTH_PARAM = '&length={length}'

    def load(self, url, offset, length):
        """Rewrite ``url`` to its http form and load it via the parent.

        :param url: source url; its netloc and path are reused as the
            host and path of the generated http url
        :param offset: value placed in the ``offset`` query parameter
        :param length: value placed in the ``length`` query parameter;
            the parameter is left out unless ``length > 0``
        :return: whatever ``HttpLoader.load`` returns for the
            rewritten url
        """
        target = urlsplit(url)

        # The length suffix is only part of the template for positive lengths.
        template = (self.HTTP_URL + self.LENGTH_PARAM
                    if length > 0 else self.HTTP_URL)

        request_url = template.format(host=target.netloc,
                                      path=target.path,
                                      offset=offset,
                                      length=length)

        # offset/length are already encoded in the query string, so the
        # parent loader is called with offset 0 and length -1.
        return super(WebHDFSLoader, self).load(request_url, 0, -1)
|
||||
|
||||
|
||||
# =================================================================
|
||||
# Signed Cookie-Maker
|
||||
# =================================================================
|
||||
|
@ -85,6 +85,8 @@ from pywb.utils.loaders import BlockLoader, HMACCookieMaker, to_file_url
|
||||
from pywb.utils.loaders import extract_client_cookie
|
||||
from pywb.utils.loaders import read_last_line
|
||||
|
||||
from mock import patch
|
||||
|
||||
from warcio.bufferedreaders import DecompressingBufferedReader
|
||||
|
||||
from pywb import get_test_dir
|
||||
@ -117,6 +119,21 @@ def test_s3_read_2():
|
||||
reader = DecompressingBufferedReader(BytesIO(buff))
|
||||
assert reader.readline() == b'<!DOCTYPE html>\n'
|
||||
|
||||
def test_mock_webhdfs_load():
    """Check the http url that a ``webhdfs://`` load hands to HttpLoader.load.

    ``HttpLoader.load`` is patched with a stub that asserts on the url,
    offset and length it receives, so no network access takes place.
    """
    def make_stub(expected_url):
        # Build a stand-in for HttpLoader.load that verifies its arguments.
        def stub_load(self, url, offset, length):
            assert url == expected_url
            assert offset == 0
            assert length == -1
            return None

        return stub_load

    # (source url, offset, length, http url expected by the stub)
    cases = (
        ('webhdfs://remote-host:1234/some/file.warc.gz', 10, 50,
         'http://remote-host:1234/webhdfs/v1/some/file.warc.gz?op=OPEN&offset=10&length=50'),
        ('webhdfs://remote-host/some/file.warc.gz', 10, -1,
         'http://remote-host/webhdfs/v1/some/file.warc.gz?op=OPEN&offset=10'),
    )

    for source_url, start, size, http_url in cases:
        with patch('pywb.utils.loaders.HttpLoader.load', make_stub(http_url)):
            BlockLoader().load(source_url, start, size)
|
||||
|
||||
|
||||
# Error
|
||||
|
Loading…
x
Reference in New Issue
Block a user