# 578 lines · 17 KiB · Python
import asyncio
|
|
import io
|
|
import json
|
|
import os
|
|
import sys
|
|
import time
|
|
|
|
import aiohttp
|
|
import pytest
|
|
|
|
import fsspec.asyn
|
|
import fsspec.utils
|
|
from fsspec.implementations.http import HTTPStreamFile
|
|
from fsspec.tests.conftest import data, reset_files, server, win # noqa: F401
|
|
|
|
|
|
def test_list(server):
    """Globbing the index directory returns its single real file."""
    http_fs = fsspec.filesystem("http")
    expected = [server + "/index/realfile"]
    assert http_fs.glob(server + "/index/*") == expected
|
|
|
|
|
|
def test_list_invalid_args(server):
    """An unknown filesystem keyword argument surfaces as ``TypeError``."""
    pattern = server + "/index/*"
    # Construction and first use both stay inside the context manager: the
    # test does not pin down which of the two calls raises.
    with pytest.raises(TypeError):
        bad_fs = fsspec.filesystem("http", use_foobar=True)
        bad_fs.glob(pattern)
|
|
|
|
|
|
def test_list_cache(server):
    """Globbing works with the listings cache switched on."""
    cached_fs = fsspec.filesystem("http", use_listings_cache=True)
    listing = cached_fs.glob(server + "/index/*")
    expected = [server + "/index/realfile"]
    assert listing == expected
|
|
|
|
|
|
def test_list_cache_with_expiry_time_cached(server):
    """Globbing with a 30 second listings expiry populates the cache once."""
    cached_fs = fsspec.filesystem(
        "http", use_listings_cache=True, listings_expiry_time=30
    )
    pattern = server + "/index/*"
    expected = [server + "/index/realfile"]

    # Nothing has been listed yet, so the directory cache is empty.
    assert not cached_fs.dircache

    # The first glob fills the cache as a side effect of
    # "use_listings_cache=True" ...
    assert cached_fs.glob(pattern) == expected

    # ... leaving exactly one cached entry behind.
    assert len(cached_fs.dircache) == 1

    # Asking again gives the same answer.
    assert cached_fs.glob(pattern) == expected
|
|
|
|
|
|
def test_list_cache_with_expiry_time_purged(server):
    """A cached listing is gone after its 0.3 s expiry and can be re-cached."""
    ttl_fs = fsspec.filesystem(
        "http", use_listings_cache=True, listings_expiry_time=0.3
    )
    index_dir = server + "/index/"
    pattern = server + "/index/*"
    expected = [server + "/index/realfile"]

    # Nothing has been listed yet, so the directory cache is empty.
    assert not ttl_fs.dircache

    # The first glob fills the cache as a side effect of
    # "use_listings_cache=True".
    assert ttl_fs.glob(pattern) == expected
    assert len(ttl_fs.dircache) == 1

    # The cached entry is keyed by the directory URL and holds one item.
    assert index_dir in ttl_fs.dircache
    assert len(ttl_fs.dircache.get(index_dir)) == 1

    # Sleep slightly longer than the expiry time.
    time.sleep(0.31)

    # The expired entry must no longer be returned.
    assert ttl_fs.dircache.get(index_dir) is None

    # A fresh glob repopulates the cache.
    assert ttl_fs.glob(pattern) == expected
    refreshed = ttl_fs.dircache.get(index_dir)
    assert len(refreshed) == 1
|
|
|
|
|
|
def test_list_cache_reuse(server):
    """Cache content is seen only by instances created with identical options."""
    caching_fs = fsspec.filesystem(
        "http", use_listings_cache=True, listings_expiry_time=5
    )

    # Nothing has been listed yet, so the directory cache is empty.
    assert not caching_fs.dircache

    # The first glob fills the cache as a side effect of
    # "use_listings_cache=True".
    listing = caching_fs.glob(server + "/index/*")
    assert listing == [server + "/index/realfile"]
    assert len(caching_fs.dircache) == 1

    # An instance created with caching disabled has no cache content.
    uncached_fs = fsspec.filesystem("http", use_listings_cache=False)
    assert not uncached_fs.dircache

    # An instance created with the same caching options as the first one
    # sees the cached entry again.
    same_options_fs = fsspec.filesystem(
        "http", use_listings_cache=True, listings_expiry_time=5
    )
    assert len(same_options_fs.dircache) == 1

    # A different expiry time means no valid cache content.
    other_expiry_fs = fsspec.filesystem(
        "http", use_listings_cache=True, listings_expiry_time=666
    )
    assert len(other_expiry_fs.dircache) == 0
|
|
|
|
|
|
def test_ls_raises_filenotfound(server):
    """Listing a missing path raises ``FileNotFoundError``."""
    http_fs = fsspec.filesystem("http")
    missing = server + "/not-a-key"

    with pytest.raises(FileNotFoundError):
        http_fs.ls(missing)
|
|
|
|
|
|
def test_list_cache_with_max_paths(server):
    """Globbing works when the listings cache is created with ``max_paths=5``."""
    bounded_fs = fsspec.filesystem("http", use_listings_cache=True, max_paths=5)
    listing = bounded_fs.glob(server + "/index/*")
    expected = [server + "/index/realfile"]
    assert listing == expected
|
|
|
|
|
|
def test_list_cache_with_skip_instance_cache(server):
    """Globbing works with listings caching on and instance caching skipped."""
    fresh_fs = fsspec.filesystem(
        "http", use_listings_cache=True, skip_instance_cache=True
    )
    listing = fresh_fs.glob(server + "/index/*")
    expected = [server + "/index/realfile"]
    assert listing == expected
|
|
|
|
|
|
def test_glob_return_subfolders(server):
    """A glob over ``/simple/*`` yields both the sub-directory and the file."""
    http_fs = fsspec.filesystem("http")
    found = set(http_fs.glob(server + "/simple/*"))
    expected = {server + "/simple/dir/", server + "/simple/file"}
    assert found == expected
|
|
|
|
|
|
def test_isdir(server):
    """``isdir`` is true for the index directory and false for the other URLs."""
    http_fs = fsspec.filesystem("http")

    index_dir = server + "/index/"
    assert http_fs.isdir(index_dir)

    real_file = server + "/index/realfile"
    assert not http_fs.isdir(real_file)

    # NOTE(review): no "/" between the server address and the name, unlike
    # the other URLs in these tests - confirm whether that is intentional.
    bogus = server + "doesnotevenexist"
    assert not http_fs.isdir(bogus)
|
|
|
|
|
|
def test_policy_arg(server):
    """Globbing works when the filesystem is created with ``size_policy="get"``."""
    policy_fs = fsspec.filesystem("http", size_policy="get")
    listing = policy_fs.glob(server + "/index/*")
    expected = [server + "/index/realfile"]
    assert listing == expected
|
|
|
|
|
|
def test_exists(server):
    """A missing URL does not exist, and reading it raises ``FileNotFoundError``."""
    http_fs = fsspec.filesystem("http")
    present = http_fs.exists(server + "/notafile")
    assert not present
    with pytest.raises(FileNotFoundError):
        http_fs.cat(server + "/notafile")
|
|
|
|
|
|
def test_read(server):
    """The whole file can be read with default and zero block sizes, and in two parts."""
    http_fs = fsspec.filesystem("http")
    url = server + "/index/realfile"

    # Full read: first with default options, then with block_size=0.
    for open_kwargs in ({}, {"block_size": 0}):
        with http_fs.open(url, "rb", **open_kwargs) as remote:
            assert remote.read() == data

    # A partial read followed by reading the remainder gives the same bytes.
    with http_fs.open(url, "rb") as remote:
        head = remote.read(100)
        tail = remote.read()
        assert head + tail == data
|
|
|
|
|
|
def test_file_pickle(server):
    """Open HTTP file objects survive a pickle round trip and stay readable."""
    import pickle

    length_headers = {"give_length": "true", "head_ok": "true"}
    url = server + "/index/realfile"

    # via HTTPFile
    sized_fs = fsspec.filesystem(
        "http", headers={"give_length": "true", "head_ok": "true"}
    )

    with fsspec.open(url, headers=length_headers) as opened:
        clone = pickle.loads(pickle.dumps(opened))
        assert clone.read() == data

    with sized_fs.open(url, "rb") as opened:
        # Pickle before reading; the original must remain usable afterwards.
        payload = pickle.dumps(opened)
        assert opened.read() == data
    with pickle.loads(payload) as restored:
        assert restored.read() == data

    # via HTTPStreamFile
    plain_fs = fsspec.filesystem("http")
    url = server + "/index/realfile"
    with plain_fs.open(url, "rb") as opened:
        payload = pickle.dumps(opened)
        assert opened.read() == data
    with pickle.loads(payload) as restored:
        assert restored.read() == data
|
|
|
|
|
|
def test_methods(server):
    """``exists`` and ``cat`` both work for the real file."""
    http_fs = fsspec.filesystem("http")
    target = server + "/index/realfile"
    assert http_fs.exists(target)
    content = http_fs.cat(target)
    assert content == data
|
|
|
|
|
|
@pytest.mark.parametrize(
    "headers",
    [
        {},
        {"give_length": "true"},
        {"give_length": "true", "head_ok": "true"},
        {"give_range": "true"},
        {"give_length": "true", "head_not_auth": "true"},
        {"give_range": "true", "head_not_auth": "true"},
        {"use_206": "true", "head_ok": "true", "head_give_length": "true"},
        {"use_206": "true", "give_length": "true"},
        {"use_206": "true", "give_range": "true"},
    ],
)
def test_random_access(server, headers):
    """Relative seeks work for every non-empty header set and raise without headers."""
    http_fs = fsspec.filesystem("http", headers=headers)
    target = server + "/index/realfile"
    with http_fs.open(target, "rb") as remote:
        if headers:
            assert remote.size == len(data)
        first = remote.read(5)
        assert first == data[:5]

        if not headers:
            # With no request headers a relative seek must raise.
            with pytest.raises(ValueError):
                remote.seek(5, 1)
        else:
            # Skip five bytes forward from position 5, then read 10..15.
            remote.seek(5, 1)
            assert remote.read(5) == data[10:15]
    assert remote.closed
|
|
|
|
|
|
@pytest.mark.parametrize(
    "headers",
    [
        {"ignore_range": "true", "head_ok": "true", "head_give_length": "true"},
        {"ignore_range": "true", "give_length": "true"},
        {"ignore_range": "true", "give_range": "true"},
    ],
)
def test_no_range_support(server, headers):
    """With ``ignore_range`` set, only reads from the start of the file succeed."""
    http_fs = fsspec.filesystem("http", headers=headers)
    target = server + "/index/realfile"
    with http_fs.open(target, "rb") as remote:
        # Random access is not possible if the server doesn't respect Range:
        # the seek itself is accepted, the read at that offset raises.
        remote.seek(5)
        with pytest.raises(ValueError):
            remote.read(10)

        # Reading from the beginning should still work
        remote.seek(0)
        head = remote.read(10)
        assert head == data[:10]
|
|
|
|
|
|
def test_stream_seek(server):
    """Seeks that do not move the position are accepted while reading."""
    http_fs = fsspec.filesystem("http")
    target = server + "/index/realfile"
    with http_fs.open(target, "rb") as stream:
        stream.seek(0)  # is OK
        head = stream.read(5)
        assert len(head) == 5
        # Both seeks target the position the stream is already at.
        stream.seek(5)
        stream.seek(0, 1)
        rest = stream.read()
        assert head + rest == data
|
|
|
|
|
|
def test_mapper_url(server):
    """Mappers built via the filesystem and via ``fsspec.get_mapper`` agree."""
    index_url = server + "/index/"

    via_fs = fsspec.filesystem("http").get_mapper(index_url)
    assert via_fs.root.startswith("http:")
    assert list(via_fs)

    via_module = fsspec.get_mapper(index_url)
    assert via_module.root.startswith("http:")
    assert list(via_fs) == list(via_module)
|
|
|
|
|
|
def test_content_length_zero(server):
    """The full file is still read when the ``zero_length`` header is sent."""
    request_headers = {"give_length": "true", "zero_length": "true"}
    http_fs = fsspec.filesystem("http", headers=request_headers)
    target = server + "/index/realfile"

    with http_fs.open(target, "rb") as remote:
        content = remote.read()
        assert content == data
|
|
|
|
|
|
def test_content_encoding_gzip(server):
    """With ``gzip_encoding`` the file is an ``HTTPStreamFile`` of unknown size."""
    request_headers = {"give_length": "true", "gzip_encoding": "true"}
    http_fs = fsspec.filesystem("http", headers=request_headers)
    target = server + "/index/realfile"

    with http_fs.open(target, "rb") as remote:
        assert isinstance(remote, HTTPStreamFile)
        assert remote.size is None
        content = remote.read()
        assert content == data
|
|
|
|
|
|
def test_download(server, tmpdir):
    """``get`` downloads a remote file to a local path with identical content."""
    h = fsspec.filesystem("http", headers={"give_length": "true", "head_ok": "true "})
    url = server + "/index/realfile"
    fn = os.path.join(tmpdir, "afile")
    h.get(url, fn)
    # Read back through a context manager so the handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(fn, "rb") as downloaded:
        assert downloaded.read() == data
|
|
|
|
|
|
def test_multi_download(server, tmpdir):
    """``get`` with URL and path lists downloads every file with identical content."""
    h = fsspec.filesystem("http", headers={"give_length": "true", "head_ok": "true "})
    urla = server + "/index/realfile"
    urlb = server + "/index/otherfile"
    fna = os.path.join(tmpdir, "afile")
    fnb = os.path.join(tmpdir, "bfile")
    h.get([urla, urlb], [fna, fnb])
    # Context managers close each handle deterministically; the bare
    # open(...).read() form leaked both of them.
    for local_path in (fna, fnb):
        with open(local_path, "rb") as downloaded:
            assert downloaded.read() == data
|
|
|
|
|
|
def test_ls(server):
    """Listing the data directory gives 11 file entries, one of them the ``.nc4`` file."""
    http_fs = fsspec.filesystem("http")
    directory = server + "/data/20020401/"
    nc_file = server + "/data/20020401/GRACEDADM_CLSM0125US_7D.A20020401.030.nc4"

    names = http_fs.ls(directory, detail=False)
    assert nc_file in names
    assert len(names) == 11

    # The detailed listing reports every entry as a plain file.
    details = http_fs.ls(directory)
    assert all(entry["type"] == "file" for entry in details)

    matches = http_fs.glob(server + "/data/20020401/*.nc4")
    assert matches == [nc_file]
|
|
|
|
|
|
def test_mcat(server):
    """``cat`` on a list of URLs returns a mapping of URL to content."""
    request_headers = {"give_length": "true", "head_ok": "true "}
    http_fs = fsspec.filesystem("http", headers=request_headers)
    real_url = server + "/index/realfile"
    other_url = server + "/index/otherfile"
    fetched = http_fs.cat([real_url, other_url])
    expected = {real_url: data, other_url: data}
    assert fetched == expected
|
|
|
|
|
|
def test_cat_file_range(server):
    """``cat`` with ``start``/``end`` behaves like slicing, negatives included."""
    request_headers = {"give_length": "true", "head_ok": "true "}
    http_fs = fsspec.filesystem("http", headers=request_headers)
    target = server + "/index/realfile"

    # Each entry pairs the range keywords with the equivalent slice of data.
    cases = [
        ({"start": 1, "end": 10}, data[1:10]),
        ({"start": 1}, data[1:]),
        ({"start": -10}, data[-10:]),
        ({"start": -10, "end": -2}, data[-10:-2]),
        ({"end": -10}, data[:-10]),
    ]
    for range_kwargs, expected in cases:
        assert http_fs.cat(target, **range_kwargs) == expected
|
|
|
|
|
|
def test_cat_file_range_numpy(server):
    """Range reads accept numpy integer scalars and arrays as bounds."""
    np = pytest.importorskip("numpy")
    request_headers = {"give_length": "true", "head_ok": "true "}
    http_fs = fsspec.filesystem("http", headers=request_headers)
    target = server + "/index/realfile"

    single = http_fs.cat(target, start=np.int8(1), end=np.int8(10))
    assert single == data[1:10]

    starts = np.array([1, 5])
    ends = np.array([10, 15])
    several = http_fs.cat_ranges([target, target], starts=starts, ends=ends)
    assert several == [data[1:10], data[5:15]]
|
|
|
|
|
|
def test_mcat_cache(server):
    """Multi-file ``cat`` works through a ``simplecache`` layer over HTTP."""
    real_url = server + "/index/realfile"
    other_url = server + "/index/otherfile"
    cache_fs = fsspec.filesystem("simplecache", target_protocol="http")
    fetched = cache_fs.cat([real_url, other_url])
    assert fetched == {real_url: data, other_url: data}
|
|
|
|
|
|
def test_mcat_expand(server):
    """``cat`` on a glob pattern returns a mapping for the matched file."""
    request_headers = {"give_length": "true", "head_ok": "true "}
    http_fs = fsspec.filesystem("http", headers=request_headers)
    fetched = http_fs.cat(server + "/index/*")
    expected = {server + "/index/realfile": data}
    assert fetched == expected
|
|
|
|
|
|
def test_info(server):
    """``info`` reports the ETag, the mimetype and the post-redirect URL."""
    real_url = server + "/index/realfile"

    etag_fs = fsspec.filesystem(
        "http", headers={"give_etag": "true", "head_ok": "true"}
    )
    assert etag_fs.info(real_url)["ETag"] == "xxx"

    mime_fs = fsspec.filesystem("http", headers={"give_mimetype": "true"})
    assert mime_fs.info(real_url)["mimetype"] == "text/html"

    # Asking about the redirecting URL reports the URL it resolves to.
    redirect_fs = fsspec.filesystem("http", headers={"redirect": "true"})
    details = redirect_fs.info(server + "/redirectme")
    assert details["url"] == server + "/index/realfile"
|
|
|
|
|
|
@pytest.mark.parametrize("method", ["POST", "PUT"])
def test_put_file(server, tmp_path, method, reset_files):
    """Uploads from a path, an open file and a ``BytesIO`` can all be read back."""
    upload_path = tmp_path / "file_1"
    upload_path.write_bytes(data)
    download_path = tmp_path / "down_1"

    http_fs = fsspec.filesystem(
        "http", headers={"head_ok": "true", "give_length": "true"}
    )
    first_url = server + "/hey"

    # The target must not exist before the upload.
    with pytest.raises(FileNotFoundError):
        http_fs.info(first_url)

    # Upload from a local path, then check the size and a round trip.
    http_fs.put_file(upload_path, first_url, method=method)
    assert http_fs.info(first_url)["size"] == len(data)

    http_fs.get_file(first_url, download_path)
    assert download_path.read_bytes() == data

    # Upload from an already open binary file object.
    upload_path.write_bytes(b"xxx")
    with open(upload_path, "rb") as handle:
        http_fs.put_file(handle, server + "/hey_2", method=method)
    assert http_fs.cat(server + "/hey_2") == b"xxx"

    # Upload from an in-memory buffer.
    buffer = io.BytesIO(b"yyy")
    http_fs.put_file(buffer, server + "/hey_3", method=method)
    assert http_fs.cat(server + "/hey_3") == b"yyy"
|
|
|
|
|
|
async def get_aiohttp():
    """Return a newly constructed ``aiohttp.ClientSession``."""
    from aiohttp import ClientSession as make_session

    session = make_session()
    return session
|
|
|
|
|
|
async def get_proxy():
    """Return an instance of an empty, locally defined ``ProxyClient`` class."""

    class ProxyClient:
        """Client stand-in that defines no attributes or methods of its own."""

    stand_in = ProxyClient()
    return stand_in
|
|
|
|
|
|
@pytest.mark.xfail(
    condition=sys.flags.optimize > 1, reason="no docstrings when optimised"
)
def test_docstring():
    """The generated ``pipe`` method carries a non-empty docstring."""
    http_fs = fsspec.filesystem("http")
    # most methods have empty docstrings and draw from base class, but this
    # one is generated
    generated_doc = http_fs.pipe.__doc__
    assert generated_doc
|
|
|
|
|
|
def test_async_other_thread(server):
    """Drive an asynchronous filesystem on an event loop running in another thread."""
    import threading

    # asyncio.get_event_loop() is deprecated when no loop is running (and
    # errors on newer Python versions), so create a dedicated loop explicitly.
    loop = asyncio.new_event_loop()
    th = threading.Thread(target=loop.run_forever, daemon=True)
    th.start()
    try:
        fs = fsspec.filesystem("http", asynchronous=True, loop=loop)
        session = asyncio.run_coroutine_threadsafe(
            fs.set_session(), loop=loop
        ).result()
        try:
            url = server + "/index/realfile"
            cor = fs._cat([url])
            fut = asyncio.run_coroutine_threadsafe(cor, loop=loop)
            assert fut.result() == {url: data}
        finally:
            # Close the client session on the loop that owns it, even when
            # the assertion above fails.
            asyncio.run_coroutine_threadsafe(session.close(), loop=loop).result()
    finally:
        # Always stop the loop and wait for the worker thread so a failing
        # test does not leave a running loop behind.
        loop.call_soon_threadsafe(loop.stop)
        th.join()
        loop.close()
|
|
|
|
|
|
def test_async_this_thread(server):
    """Inside a running loop the sync API raises while the coroutine API works."""

    async def _():
        fs = fsspec.filesystem("http", asynchronous=True)

        session = await fs.set_session()  # creates client
        try:
            url = server + "/index/realfile"
            # The blocking API must refuse to run inside the event loop.
            with pytest.raises((NotImplementedError, RuntimeError)):
                fs.cat([url])
            out = await fs._cat([url])
            del fs
            assert out == {url: data}
        finally:
            # Close the session even when an assertion fails, so a failing
            # test does not leak an open client session.
            await session.close()

    asyncio.run(_())
|
|
|
|
|
|
def _inner_pass(fs, q, fn):
    """Fetch ``fn`` with a newly requested HTTP filesystem and put the result on ``q``.

    The ``fs`` argument is accepted but deliberately not used; on any
    exception the formatted traceback is put on the queue instead.
    """
    # pass the FS instance, but don't use it; in new process, the instance
    # cache should be skipped to make a new instance
    import traceback

    try:
        fresh_fs = fsspec.filesystem("http")
        payload = fresh_fs.cat(fn)
        q.put(payload)
    except Exception:
        report = traceback.format_exc()
        q.put(report)
|
|
|
|
|
|
@pytest.mark.parametrize("method", ["spawn", "forkserver"])
def test_processes(server, method):
    """A child process can fetch the same content as the parent process."""
    import multiprocessing as mp

    if win and method != "spawn":
        pytest.skip("Windows can only spawn")
    ctx = mp.get_context(method)
    fn = server + "/index/realfile"
    fs = fsspec.filesystem("http")

    q = ctx.Queue()
    p = ctx.Process(target=_inner_pass, args=(fs, q, fn))
    p.start()
    try:
        # Bound the wait: if the child dies before putting anything on the
        # queue, an unbounded get() would hang the whole test run.
        out = q.get(timeout=60)
        assert out == fs.cat(fn)
    finally:
        # Reap the child even when the assertion fails; terminate it if it
        # is still running after a grace period.
        p.join(timeout=10)
        if p.is_alive():
            p.terminate()
            p.join()
|
|
|
|
|
|
@pytest.mark.parametrize("get_client", [get_aiohttp, get_proxy])
def test_close(get_client):
    """``close_session`` accepts a ``None`` loop together with either client kind."""
    http_fs = fsspec.filesystem("http", skip_instance_cache=True)
    client = asyncio.run(get_client())
    http_fs.close_session(None, client)
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_async_file(server):
    """An async file object supports a partial read followed by reading the rest."""
    fs = fsspec.filesystem("http", asynchronous=True, skip_instance_cache=True)
    # Create the session up front so it can be closed in ``finally`` even if
    # opening or reading fails part-way through.
    session = await fs.set_session()
    try:
        fn = server + "/index/realfile"
        of = await fs.open_async(fn)
        async with of as f:
            out1 = await f.read(10)
            assert data.startswith(out1)
            out2 = await f.read()
            assert data == out1 + out2
    finally:
        await session.close()
|
|
|
|
|
|
def test_encoded(server):
    """``encoded`` controls whether URLs are sent verbatim or percent-quoted."""
    path_headers = {"give_path": "true"}

    # encoded=True: the pre-quoted URL arrives unchanged ...
    verbatim_fs = fsspec.filesystem("http", encoded=True)
    reply = verbatim_fs.cat(server + "/Hello%3A%20G%C3%BCnter", headers=path_headers)
    assert json.loads(reply)["path"] == "/Hello%3A%20G%C3%BCnter"
    # ... while an unquoted URL is rejected with a client error.
    with pytest.raises(aiohttp.client_exceptions.ClientError):
        verbatim_fs.cat(server + "/Hello: Günter", headers={"give_path": "true"})

    # encoded=False: the raw URL is quoted before it reaches the server.
    quoting_fs = fsspec.filesystem("http", encoded=False)
    reply = quoting_fs.cat(server + "/Hello: Günter", headers={"give_path": "true"})
    assert json.loads(reply)["path"] == "/Hello:%20G%C3%BCnter"
|
|
|
|
|
|
def test_with_cache(server):
    """Reading through a ``blockcache`` wrapper matches its own ``cat`` result."""
    request_headers = {"head_ok": "true", "give_length": "true"}
    http_fs = fsspec.filesystem("http", headers=request_headers)
    target = server + "/index/realfile"
    block_fs = fsspec.filesystem("blockcache", fs=http_fs)
    with block_fs.open(target, "rb") as cached:
        streamed = cached.read()
    assert streamed == block_fs.cat(target)
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_async_expand_path(server):
    """``_expand_path`` honours ``maxdepth=1`` and rejects ``maxdepth=0``."""
    fs = fsspec.filesystem("http", asynchronous=True, skip_instance_cache=True)
    # Create the session up front so it can be closed in ``finally`` even
    # when one of the assertions below fails.
    session = await fs.set_session()
    try:
        # maxdepth=1
        assert await fs._expand_path(
            server + "/index", recursive=True, maxdepth=1
        ) == [
            server + "/index",
            server + "/index/realfile",
        ]

        # maxdepth=0
        with pytest.raises(ValueError):
            await fs._expand_path(server + "/index", maxdepth=0)
        with pytest.raises(ValueError):
            await fs._expand_path(server + "/index", recursive=True, maxdepth=0)
    finally:
        await session.close()
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_async_walk(server):
    """``_walk`` yields the index directory once and rejects ``maxdepth=0``."""
    fs = fsspec.filesystem("http", asynchronous=True, skip_instance_cache=True)
    # Create the session up front so it can be closed in ``finally`` even
    # when one of the assertions below fails.
    session = await fs.set_session()
    try:
        # No maxdepth
        res = [entry async for entry in fs._walk(server + "/index")]
        assert res == [(server + "/index", [], ["realfile"])]

        # maxdepth=0
        with pytest.raises(ValueError):
            async for _ in fs._walk(server + "/index", maxdepth=0):
                pass
    finally:
        await session.close()
|