mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
loaders: add WebHDFSLoader loader to support handling 'webhdfs://' scheme to load over http from WebHDFS (ukwa/ukwa-pywb#3)
tests: add basic test for WebHDFSLoader api format
This commit is contained in:
parent
c1f0f7517a
commit
94eb4ad206
@ -225,6 +225,7 @@ class BlockLoader(BaseLoader):
|
|||||||
BlockLoader.loaders['s3'] = S3Loader
|
BlockLoader.loaders['s3'] = S3Loader
|
||||||
BlockLoader.loaders['file'] = LocalFileLoader
|
BlockLoader.loaders['file'] = LocalFileLoader
|
||||||
BlockLoader.loaders['pkg'] = PackageLoader
|
BlockLoader.loaders['pkg'] = PackageLoader
|
||||||
|
BlockLoader.loaders['webhdfs'] = WebHDFSLoader
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def set_profile_loader(src):
|
def set_profile_loader(src):
|
||||||
@ -401,6 +402,27 @@ class S3Loader(BaseLoader):
|
|||||||
return obj['Body']
|
return obj['Body']
|
||||||
|
|
||||||
|
|
||||||
|
# =================================================================
|
||||||
|
class WebHDFSLoader(HttpLoader):
    """Loader for ``webhdfs://`` urls, served over plain http.

    A ``webhdfs://host[:port]/path`` url is rewritten into an http url
    against the ``/webhdfs/v1`` endpoint on the same host, using the
    ``OPEN`` operation, and then fetched through ``HttpLoader``.
    """

    # Template for the http url; the requested offset is always included.
    HTTP_URL = 'http://{host}/webhdfs/v1{path}?op=OPEN&offset={offset}'

    # Extra query parameter, appended only when a positive length is given.
    LENGTH_PARAM = '&length={length}'

    def load(self, url, offset, length):
        """Load the content referenced by a ``webhdfs://`` url.

        :param url: the ``webhdfs://`` url; its network location and path
            are substituted into the http url
        :param offset: starting offset, passed as the ``offset`` query param
        :param length: number of bytes to read; only sent (as the ``length``
            query param) when greater than zero
        :return: whatever ``HttpLoader.load`` returns for the rewritten url
        """
        split_url = urlsplit(url)

        # The length parameter is optional: omit it unless a positive
        # length was requested.
        length_suffix = self.LENGTH_PARAM if length > 0 else ''
        template = self.HTTP_URL + length_suffix

        webhdfs_http_url = template.format(
            host=split_url.netloc,
            path=split_url.path,
            offset=offset,
            length=length,
        )

        # The range is already encoded in the query string, so the parent
        # loader is called with offset 0 and length -1.
        return super(WebHDFSLoader, self).load(webhdfs_http_url, 0, -1)
|
||||||
|
|
||||||
|
|
||||||
# =================================================================
|
# =================================================================
|
||||||
# Signed Cookie-Maker
|
# Signed Cookie-Maker
|
||||||
# =================================================================
|
# =================================================================
|
||||||
|
@ -85,6 +85,8 @@ from pywb.utils.loaders import BlockLoader, HMACCookieMaker, to_file_url
|
|||||||
from pywb.utils.loaders import extract_client_cookie
|
from pywb.utils.loaders import extract_client_cookie
|
||||||
from pywb.utils.loaders import read_last_line
|
from pywb.utils.loaders import read_last_line
|
||||||
|
|
||||||
|
from mock import patch
|
||||||
|
|
||||||
from warcio.bufferedreaders import DecompressingBufferedReader
|
from warcio.bufferedreaders import DecompressingBufferedReader
|
||||||
|
|
||||||
from pywb import get_test_dir
|
from pywb import get_test_dir
|
||||||
@ -117,6 +119,21 @@ def test_s3_read_2():
|
|||||||
reader = DecompressingBufferedReader(BytesIO(buff))
|
reader = DecompressingBufferedReader(BytesIO(buff))
|
||||||
assert reader.readline() == b'<!DOCTYPE html>\n'
|
assert reader.readline() == b'<!DOCTYPE html>\n'
|
||||||
|
|
||||||
|
def test_mock_webhdfs_load():
    """Check the http url that a ``webhdfs://`` load is translated into.

    ``HttpLoader.load`` is patched with a stand-in that asserts on the
    arguments it receives, so no network access takes place.
    """
    def make_checker(expected_url):
        # Build a replacement for HttpLoader.load that verifies its arguments.
        def checker(self, url, offset, length):
            assert url == expected_url
            assert offset == 0
            assert length == -1
            return None

        return checker

    # (webhdfs url, offset, length, expected http url)
    cases = [
        # explicit port and positive length: length param is appended
        ('webhdfs://remote-host:1234/some/file.warc.gz', 10, 50,
         'http://remote-host:1234/webhdfs/v1/some/file.warc.gz?op=OPEN&offset=10&length=50'),
        # no port and length of -1: length param is omitted
        ('webhdfs://remote-host/some/file.warc.gz', 10, -1,
         'http://remote-host/webhdfs/v1/some/file.warc.gz?op=OPEN&offset=10'),
    ]

    for webhdfs_url, offset, length, expected_url in cases:
        with patch('pywb.utils.loaders.HttpLoader.load', make_checker(expected_url)):
            res = BlockLoader().load(webhdfs_url, offset, length)
|
||||||
|
|
||||||
|
|
||||||
# Error
|
# Error
|
||||||
|
Loading…
x
Reference in New Issue
Block a user