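"""Tests for fsspec's WebHDFS filesystem implementation.

These tests expect a disposable single-node Hadoop cluster managed by the
``htcluster`` command-line tool, with the namenode's WebHDFS endpoint
reachable on localhost:50070; they are skipped when ``htcluster`` is not
available.
"""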
import pickle
import shlex
import subprocess
import time

import pytest

import fsspec

requests = pytest.importorskip("requests")

from fsspec.implementations.webhdfs import WebHDFS  # noqa: E402


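# Module-scoped fixture: boots the test cluster once, waits for WebHDFS to
# answer, yields the namenode host, and shuts the cluster down afterwards.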
@pytest.fixture(scope="module")
def hdfs_cluster():
    cmd0 = shlex.split("htcluster shutdown")
    try:
        subprocess.check_output(cmd0, stderr=subprocess.STDOUT)
    except FileNotFoundError:
        pytest.skip("htcluster not found")
    except subprocess.CalledProcessError as ex:
        pytest.skip(f"htcluster failed: {ex.output.decode()}")
    cmd1 = shlex.split("htcluster startup --image base")
    subprocess.check_output(cmd1)
    try:
        # Allow up to ~90 failed probes (~90s) for the namenode to come up;
        # the counter must live outside the loop or it never decreases.
        t = 90
        while True:
            try:
                requests.get("http://localhost:50070/webhdfs/v1/?op=LISTSTATUS")
            except:  # noqa: E722
                t -= 1
                assert t > 0, "Timeout waiting for HDFS"
                time.sleep(1)
                continue
            break
        time.sleep(7)
        yield "localhost"
    finally:
        subprocess.check_output(cmd0)


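# Filesystem instances must survive a pickle round-trip so they can be
# shipped to remote workers (e.g. by Dask, a common fsspec consumer).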
def test_pickle(hdfs_cluster):
    w = WebHDFS(hdfs_cluster, user="testuser")
    w2 = pickle.loads(pickle.dumps(w))
    assert w == w2


def test_simple(hdfs_cluster):
    w = WebHDFS(hdfs_cluster, user="testuser")
    home = w.home_directory()
    assert home == "/user/testuser"
    with pytest.raises(PermissionError):
        w.mkdir("/root")


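# The data_proxy mapping rewrites datanode hostnames in WebHDFS redirect
# URLs; here the cluster's internal "worker.example.com" is substituted with
# "localhost" so data transfers work from outside the cluster's network.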
def test_url(hdfs_cluster):
    url = "webhdfs://testuser@localhost:50070/user/testuser/myfile"
    fo = fsspec.open(url, "wb", data_proxy={"worker.example.com": "localhost"})
    with fo as f:
        f.write(b"hello")
    fo = fsspec.open(url, "rb", data_proxy={"worker.example.com": "localhost"})
    with fo as f:
        assert f.read() == b"hello"


def test_workflow(hdfs_cluster):
    w = WebHDFS(
        hdfs_cluster, user="testuser", data_proxy={"worker.example.com": "localhost"}
    )
    fn = "/user/testuser/testrun/afile"
    w.mkdir("/user/testuser/testrun")
    with w.open(fn, "wb") as f:
        f.write(b"hello")
    assert w.exists(fn)
    info = w.info(fn)
    assert info["size"] == 5
    assert w.isfile(fn)
    assert w.cat(fn) == b"hello"
    w.rm("/user/testuser/testrun", recursive=True)
    assert not w.exists(fn)


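# WebHDFS file objects behave like ordinary binary file objects, so they
# compose with stdlib wrappers such as gzip.GzipFile.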
def test_with_gzip(hdfs_cluster):
    from gzip import GzipFile

    w = WebHDFS(
        hdfs_cluster, user="testuser", data_proxy={"worker.example.com": "localhost"}
    )
    fn = "/user/testuser/gzfile"
    with w.open(fn, "wb") as f:
        gf = GzipFile(fileobj=f, mode="w")
        gf.write(b"hello")
        gf.close()
    with w.open(fn, "rb") as f:
        gf = GzipFile(fileobj=f, mode="r")
        assert gf.read() == b"hello"


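# Inside a transaction, writes are staged and only become visible once the
# transaction commits on exiting the `with` block.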
def test_workflow_transaction(hdfs_cluster):
    w = WebHDFS(
        hdfs_cluster, user="testuser", data_proxy={"worker.example.com": "localhost"}
    )
    fn = "/user/testuser/testrun/afile"
    w.mkdirs("/user/testuser/testrun")
    with w.transaction:
        with w.open(fn, "wb") as f:
            f.write(b"hello")
        assert not w.exists(fn)
    assert w.exists(fn)
    assert w.ukey(fn)
    files = w.ls("/user/testuser/testrun", True)
    summ = w.content_summary("/user/testuser/testrun")
    assert summ["length"] == files[0]["size"]
    assert summ["fileCount"] == 1

    w.rm("/user/testuser/testrun", recursive=True)
    assert not w.exists(fn)


def test_webhdfs_cp_file(hdfs_cluster):
    fs = WebHDFS(
        hdfs_cluster, user="testuser", data_proxy={"worker.example.com": "localhost"}
    )

    src, dst = "/user/testuser/testrun/f1", "/user/testuser/testrun/f2"

    fs.mkdir("/user/testuser/testrun")

    with fs.open(src, "wb") as f:
        f.write(b"hello")

    fs.cp_file(src, dst)

    assert fs.exists(src)
    assert fs.exists(dst)
    assert fs.cat(src) == fs.cat(dst)


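# The tests below guard against mishandling of "=" in paths (common in
# Hive-style partition directories such as datestamp=2023-11-11), which
# must survive URL encoding in WebHDFS requests.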
def test_path_with_equals(hdfs_cluster):
    fs = WebHDFS(
        hdfs_cluster, user="testuser", data_proxy={"worker.example.com": "localhost"}
    )
    path_with_equals = "/user/testuser/some_table/datestamp=2023-11-11"

    fs.mkdir(path_with_equals)

    result = fs.ls(path_with_equals)
    assert result is not None
    assert fs.exists(path_with_equals)


def test_error_handling_with_equals_in_path(hdfs_cluster):
    fs = WebHDFS(hdfs_cluster, user="testuser")
    invalid_path_with_equals = (
        "/user/testuser/some_table/invalid_path=datestamp=2023-11-11"
    )

    with pytest.raises(FileNotFoundError):
        fs.ls(invalid_path_with_equals)


def test_create_and_touch_file_with_equals(hdfs_cluster):
    fs = WebHDFS(
        hdfs_cluster,
        user="testuser",
        data_proxy={"worker.example.com": "localhost"},
    )
    base_path = "/user/testuser/some_table/datestamp=2023-11-11"
    file_path = f"{base_path}/testfile.txt"

    fs.mkdir(base_path)
    # touch() takes a truncate flag as its second argument, not a mode string.
    fs.touch(file_path, truncate=True)
    assert fs.exists(file_path)


def test_write_read_verify_file_with_equals(hdfs_cluster):
    fs = WebHDFS(
        hdfs_cluster,
        user="testuser",
        data_proxy={"worker.example.com": "localhost"},
    )
    base_path = "/user/testuser/some_table/datestamp=2023-11-11"
    file_path = f"{base_path}/testfile.txt"
    content = b"This is some content!"

    fs.mkdir(base_path)
    with fs.open(file_path, "wb") as f:
        f.write(content)

    with fs.open(file_path, "rb") as f:
        assert f.read() == content

    file_info = fs.ls(base_path, detail=True)
    assert len(file_info) == 1
    assert file_info[0]["name"] == file_path
    assert file_info[0]["size"] == len(content)