# Tests for fsspec's HTTP filesystem implementation
# (fsspec/implementations/tests/test_http.py)
import asyncio
import io
import json
import os
import sys
import time
import aiohttp
import pytest
import fsspec.asyn
import fsspec.utils
from fsspec.implementations.http import HTTPStreamFile
from fsspec.tests.conftest import data, reset_files, server, win # noqa: F401
def test_list(server):
    """Globbing an HTML directory listing yields exactly the real file."""
    fs = fsspec.filesystem("http")
    assert fs.glob(server + "/index/*") == [server + "/index/realfile"]
def test_list_invalid_args(server):
    """Unknown constructor keyword arguments must raise TypeError."""
    with pytest.raises(TypeError):
        fs = fsspec.filesystem("http", use_foobar=True)
        fs.glob(server + "/index/*")
def test_list_cache(server):
    """Listing works identically when the listings cache is enabled."""
    fs = fsspec.filesystem("http", use_listings_cache=True)
    assert fs.glob(server + "/index/*") == [server + "/index/realfile"]
def test_list_cache_with_expiry_time_cached(server):
    """A listing populates the dircache and repeat queries still succeed."""
    fs = fsspec.filesystem("http", use_listings_cache=True, listings_expiry_time=30)
    # The directory cache starts out empty.
    assert not fs.dircache
    # The first glob populates the cache as a side effect.
    assert fs.glob(server + "/index/*") == [server + "/index/realfile"]
    assert len(fs.dircache) == 1
    # A second identical query (served from cache) gives the same answer.
    assert fs.glob(server + "/index/*") == [server + "/index/realfile"]
def test_list_cache_with_expiry_time_purged(server):
    """Cached listings must be purged once the expiry time has elapsed."""
    fs = fsspec.filesystem("http", use_listings_cache=True, listings_expiry_time=0.3)
    # The directory cache starts out empty.
    assert not fs.dircache
    # The first glob populates the cache as a side effect.
    assert fs.glob(server + "/index/*") == [server + "/index/realfile"]
    assert len(fs.dircache) == 1
    # The cache holds a single entry for the listed directory.
    assert server + "/index/" in fs.dircache
    assert len(fs.dircache.get(server + "/index/")) == 1
    # Sleep past the TTL so the entry expires.
    time.sleep(0.31)
    # The stale entry is gone on the next lookup.
    assert fs.dircache.get(server + "/index/") is None
    # A fresh listing repopulates the cache.
    assert fs.glob(server + "/index/*") == [server + "/index/realfile"]
    assert len(fs.dircache.get(server + "/index/")) == 1
def test_list_cache_reuse(server):
    """Listings caches are shared only between instances with matching settings."""
    fs = fsspec.filesystem("http", use_listings_cache=True, listings_expiry_time=5)
    # Cache starts empty and is filled by the first listing.
    assert not fs.dircache
    assert fs.glob(server + "/index/*") == [server + "/index/realfile"]
    assert len(fs.dircache) == 1
    # An instance without caching does not share that content.
    fs = fsspec.filesystem("http", use_listings_cache=False)
    assert not fs.dircache
    # Same caching settings -> same (shared) cache content.
    fs = fsspec.filesystem("http", use_listings_cache=True, listings_expiry_time=5)
    assert len(fs.dircache) == 1
    # A different expiry time yields a distinct, still-empty cache.
    fs = fsspec.filesystem("http", use_listings_cache=True, listings_expiry_time=666)
    assert len(fs.dircache) == 0
def test_ls_raises_filenotfound(server):
    """Listing a nonexistent key must raise FileNotFoundError."""
    fs = fsspec.filesystem("http")
    with pytest.raises(FileNotFoundError):
        fs.ls(server + "/not-a-key")
def test_list_cache_with_max_paths(server):
    """Bounding the cache with max_paths does not affect listing results."""
    fs = fsspec.filesystem("http", use_listings_cache=True, max_paths=5)
    assert fs.glob(server + "/index/*") == [server + "/index/realfile"]
def test_list_cache_with_skip_instance_cache(server):
    """skip_instance_cache combined with listings cache still lists correctly."""
    fs = fsspec.filesystem("http", use_listings_cache=True, skip_instance_cache=True)
    assert fs.glob(server + "/index/*") == [server + "/index/realfile"]
def test_glob_return_subfolders(server):
    """Glob results include subdirectories as well as plain files."""
    fs = fsspec.filesystem("http")
    found = fs.glob(server + "/simple/*")
    expected = {server + "/simple/dir/", server + "/simple/file"}
    assert set(found) == expected
def test_isdir(server):
    """isdir is true for listable directories only, never files or missing paths."""
    fs = fsspec.filesystem("http")
    assert fs.isdir(server + "/index/")
    assert not fs.isdir(server + "/index/realfile")
    assert not fs.isdir(server + "doesnotevenexist")
def test_policy_arg(server):
    """The size_policy keyword is accepted and does not break listings."""
    fs = fsspec.filesystem("http", size_policy="get")
    assert fs.glob(server + "/index/*") == [server + "/index/realfile"]
def test_exists(server):
    """exists() is False for a missing file and cat() on it raises."""
    fs = fsspec.filesystem("http")
    missing = server + "/notafile"
    assert not fs.exists(missing)
    with pytest.raises(FileNotFoundError):
        fs.cat(missing)
def test_read(server):
    """Whole-file, unbuffered, and split reads all reproduce the data."""
    fs = fsspec.filesystem("http")
    url = server + "/index/realfile"
    # Plain sequential read.
    with fs.open(url, "rb") as f:
        assert f.read() == data
    # block_size=0 forces the streaming file type.
    with fs.open(url, "rb", block_size=0) as f:
        assert f.read() == data
    # A partial read followed by read-to-end covers the full content.
    with fs.open(url, "rb") as f:
        assert f.read(100) + f.read() == data
def test_file_pickle(server):
    """Open HTTP file objects must survive pickle round-trips."""
    import pickle

    url = server + "/index/realfile"

    # With length/head headers the filesystem hands out HTTPFile objects.
    fs = fsspec.filesystem("http", headers={"give_length": "true", "head_ok": "true"})
    with fsspec.open(url, headers={"give_length": "true", "head_ok": "true"}) as f:
        clone = pickle.loads(pickle.dumps(f))
        assert clone.read() == data
    with fs.open(url, "rb") as f:
        payload = pickle.dumps(f)
        assert f.read() == data
    # A copy pickled before reading starts again from the beginning.
    with pickle.loads(payload) as f:
        assert f.read() == data

    # Without those headers we get an HTTPStreamFile instead.
    fs = fsspec.filesystem("http")
    with fs.open(url, "rb") as f:
        payload = pickle.dumps(f)
        assert f.read() == data
    with pickle.loads(payload) as f:
        assert f.read() == data
def test_methods(server):
    """Basic filesystem methods (exists, cat) work against the test server."""
    fs = fsspec.filesystem("http")
    target = server + "/index/realfile"
    assert fs.exists(target)
    assert fs.cat(target) == data
@pytest.mark.parametrize(
    "headers",
    [
        {},
        {"give_length": "true"},
        {"give_length": "true", "head_ok": "true"},
        {"give_range": "true"},
        {"give_length": "true", "head_not_auth": "true"},
        {"give_range": "true", "head_not_auth": "true"},
        {"use_206": "true", "head_ok": "true", "head_give_length": "true"},
        {"use_206": "true", "give_length": "true"},
        {"use_206": "true", "give_range": "true"},
    ],
)
def test_random_access(server, headers):
    """Seeking only works when the server advertises size/range information."""
    fs = fsspec.filesystem("http", headers=headers)
    url = server + "/index/realfile"
    f = fs.open(url, "rb")
    try:
        if headers:
            # With any informative header the size is known up front.
            assert f.size == len(data)
        assert f.read(5) == data[:5]
        if headers:
            f.seek(5, 1)
            assert f.read(5) == data[10:15]
        else:
            # A pure streaming file cannot seek relative to its position.
            with pytest.raises(ValueError):
                f.seek(5, 1)
    finally:
        f.close()
    assert f.closed
@pytest.mark.parametrize(
    "headers",
    [
        {"ignore_range": "true", "head_ok": "true", "head_give_length": "true"},
        {"ignore_range": "true", "give_length": "true"},
        {"ignore_range": "true", "give_range": "true"},
    ],
)
def test_no_range_support(server, headers):
    """When the server ignores Range headers, only reads from offset 0 work."""
    fs = fsspec.filesystem("http", headers=headers)
    url = server + "/index/realfile"
    with fs.open(url, "rb") as f:
        # Reading after a seek must fail: the server returns the wrong bytes.
        f.seek(5)
        with pytest.raises(ValueError):
            f.read(10)
        # Rewinding to the start restores normal reading.
        f.seek(0)
        assert f.read(10) == data[:10]
def test_stream_seek(server):
    """Streaming files allow no-op seeks (to 0, or to the current offset)."""
    fs = fsspec.filesystem("http")
    url = server + "/index/realfile"
    with fs.open(url, "rb") as f:
        f.seek(0)  # seeking to the very start is always permitted
        head = f.read(5)
        assert len(head) == 5
        # Seeking to the current position (absolute or relative) is a no-op.
        f.seek(5)
        f.seek(0, 1)
        tail = f.read()
        assert head + tail == data
def test_mapper_url(server):
    """Key-value mappers built from HTTP URLs agree regardless of entry point."""
    fs = fsspec.filesystem("http")
    m1 = fs.get_mapper(server + "/index/")
    assert m1.root.startswith("http:")
    assert list(m1)
    # The module-level helper must build an equivalent mapper.
    m2 = fsspec.get_mapper(server + "/index/")
    assert m2.root.startswith("http:")
    assert list(m1) == list(m2)
def test_content_length_zero(server):
    """A bogus Content-Length of zero must not truncate the read."""
    fs = fsspec.filesystem(
        "http", headers={"give_length": "true", "zero_length": "true"}
    )
    with fs.open(server + "/index/realfile", "rb") as f:
        assert f.read() == data
def test_content_encoding_gzip(server):
    """Gzip content encoding forces streaming mode with unknown size."""
    fs = fsspec.filesystem(
        "http", headers={"give_length": "true", "gzip_encoding": "true"}
    )
    with fs.open(server + "/index/realfile", "rb") as f:
        # Compressed transfer means the decoded size cannot be trusted.
        assert isinstance(f, HTTPStreamFile)
        assert f.size is None
        assert f.read() == data
def test_download(server, tmpdir):
    """get() downloads the remote file to a local path intact.

    Fixes: the "head_ok" header value had a stray trailing space ("true "),
    inconsistent with every other test in this file, and the downloaded file
    was read without closing the handle.
    """
    fs = fsspec.filesystem("http", headers={"give_length": "true", "head_ok": "true"})
    url = server + "/index/realfile"
    fn = os.path.join(tmpdir, "afile")
    fs.get(url, fn)
    # Context manager so the local file handle is not leaked.
    with open(fn, "rb") as f:
        assert f.read() == data
def test_multi_download(server, tmpdir):
    """get() with lists downloads multiple files in one call.

    Fixes: the "head_ok" header value had a stray trailing space ("true "),
    and both downloaded files were read without closing the handles.
    """
    fs = fsspec.filesystem("http", headers={"give_length": "true", "head_ok": "true"})
    urla = server + "/index/realfile"
    urlb = server + "/index/otherfile"
    fna = os.path.join(tmpdir, "afile")
    fnb = os.path.join(tmpdir, "bfile")
    fs.get([urla, urlb], [fna, fnb])
    # Context managers so the local file handles are not leaked.
    with open(fna, "rb") as f:
        assert f.read() == data
    with open(fnb, "rb") as f:
        assert f.read() == data
def test_ls(server):
    """ls() and glob() agree on the contents of a data directory.

    Fixes: the listing was bound to the single-letter name ``l`` (ambiguous,
    flake8 E741); renamed to ``listing``.
    """
    fs = fsspec.filesystem("http")
    listing = fs.ls(server + "/data/20020401/", detail=False)
    nc = server + "/data/20020401/GRACEDADM_CLSM0125US_7D.A20020401.030.nc4"
    assert nc in listing
    assert len(listing) == 11
    # Every entry in the detailed listing is reported as a file.
    assert all(u["type"] == "file" for u in fs.ls(server + "/data/20020401/"))
    assert fs.glob(server + "/data/20020401/*.nc4") == [nc]
def test_mcat(server):
    """cat() with a list of URLs returns a url->bytes mapping.

    Fixes: the "head_ok" header value had a stray trailing space ("true "),
    inconsistent with every other test in this file.
    """
    fs = fsspec.filesystem("http", headers={"give_length": "true", "head_ok": "true"})
    urla = server + "/index/realfile"
    urlb = server + "/index/otherfile"
    assert fs.cat([urla, urlb]) == {urla: data, urlb: data}
def test_cat_file_range(server):
    """cat() honours start/end offsets, including negative (from-end) values.

    Fixes: the "head_ok" header value had a stray trailing space ("true "),
    inconsistent with every other test in this file.
    """
    fs = fsspec.filesystem("http", headers={"give_length": "true", "head_ok": "true"})
    url = server + "/index/realfile"
    assert fs.cat(url, start=1, end=10) == data[1:10]
    assert fs.cat(url, start=1) == data[1:]
    assert fs.cat(url, start=-10) == data[-10:]
    assert fs.cat(url, start=-10, end=-2) == data[-10:-2]
    assert fs.cat(url, end=-10) == data[:-10]
def test_cat_file_range_numpy(server):
    """Numpy integer offsets are accepted by cat() and cat_ranges().

    Fixes: the "head_ok" header value had a stray trailing space ("true "),
    inconsistent with every other test in this file.
    """
    np = pytest.importorskip("numpy")
    fs = fsspec.filesystem("http", headers={"give_length": "true", "head_ok": "true"})
    url = server + "/index/realfile"
    assert fs.cat(url, start=np.int8(1), end=np.int8(10)) == data[1:10]
    out = fs.cat_ranges([url, url], starts=np.array([1, 5]), ends=np.array([10, 15]))
    assert out == [data[1:10], data[5:15]]
def test_mcat_cache(server):
    """Multi-cat works through the simplecache wrapper filesystem."""
    urla = server + "/index/realfile"
    urlb = server + "/index/otherfile"
    cached = fsspec.filesystem("simplecache", target_protocol="http")
    assert cached.cat([urla, urlb]) == {urla: data, urlb: data}
def test_mcat_expand(server):
    """cat() expands glob patterns before fetching.

    Fixes: the "head_ok" header value had a stray trailing space ("true "),
    inconsistent with every other test in this file.
    """
    fs = fsspec.filesystem("http", headers={"give_length": "true", "head_ok": "true"})
    assert fs.cat(server + "/index/*") == {server + "/index/realfile": data}
def test_info(server):
    """info() surfaces ETag, mimetype and redirect-resolved URLs."""
    # ETag is passed through from the response headers.
    fs = fsspec.filesystem("http", headers={"give_etag": "true", "head_ok": "true"})
    assert fs.info(server + "/index/realfile")["ETag"] == "xxx"
    # Mimetype is reported when the server sends one.
    fs = fsspec.filesystem("http", headers={"give_mimetype": "true"})
    assert fs.info(server + "/index/realfile")["mimetype"] == "text/html"
    # Redirects are followed; the final URL is reported.
    fs = fsspec.filesystem("http", headers={"redirect": "true"})
    assert fs.info(server + "/redirectme")["url"] == server + "/index/realfile"
@pytest.mark.parametrize("method", ["POST", "PUT"])
def test_put_file(server, tmp_path, method, reset_files):
    """Uploading via POST/PUT from a path, a stream, and a BytesIO all work."""
    source = tmp_path / "file_1"
    source.write_bytes(data)
    downloaded = tmp_path / "down_1"
    fs = fsspec.filesystem("http", headers={"head_ok": "true", "give_length": "true"})

    # The target must not exist before the upload.
    with pytest.raises(FileNotFoundError):
        fs.info(server + "/hey")

    # Upload from a local path, then round-trip it back down.
    fs.put_file(source, server + "/hey", method=method)
    assert fs.info(server + "/hey")["size"] == len(data)
    fs.get_file(server + "/hey", downloaded)
    assert downloaded.read_bytes() == data

    # Upload from an open binary stream.
    source.write_bytes(b"xxx")
    with open(source, "rb") as stream:
        fs.put_file(stream, server + "/hey_2", method=method)
    assert fs.cat(server + "/hey_2") == b"xxx"

    # Upload from an in-memory buffer.
    fs.put_file(io.BytesIO(b"yyy"), server + "/hey_3", method=method)
    assert fs.cat(server + "/hey_3") == b"yyy"
async def get_aiohttp():
    """Build a real aiohttp client session (fixture helper for test_close)."""
    from aiohttp import ClientSession

    session = ClientSession()
    return session
async def get_proxy():
    """Build a dummy stand-in client (fixture helper for test_close)."""

    class ProxyClient:
        pass

    client = ProxyClient()
    return client
@pytest.mark.xfail(
    condition=sys.flags.optimize > 1, reason="no docstrings when optimised"
)
def test_docstring():
    """Dynamically generated methods must carry a docstring."""
    fs = fsspec.filesystem("http")
    # `pipe` is synthesized from the async implementation, so its docstring
    # is generated rather than inherited empty from the base class.
    assert fs.pipe.__doc__
def test_async_other_thread(server):
    """An asynchronous filesystem can be driven from a background loop thread.

    Fixes: ``asyncio.get_event_loop()`` is deprecated when no loop is running
    (Python 3.10+); a dedicated loop is created instead, and the worker thread
    is joined after the loop is stopped so shutdown is deterministic.
    """
    import threading

    loop = asyncio.new_event_loop()
    th = threading.Thread(target=loop.run_forever)
    th.daemon = True
    th.start()
    fs = fsspec.filesystem("http", asynchronous=True, loop=loop)
    # The HTTP client session must be created on the loop that will use it.
    asyncio.run_coroutine_threadsafe(fs.set_session(), loop=loop).result()
    url = server + "/index/realfile"
    cor = fs._cat([url])
    fut = asyncio.run_coroutine_threadsafe(cor, loop=loop)
    assert fut.result() == {url: data}
    # Stop the loop from this thread and wait for the worker to exit.
    loop.call_soon_threadsafe(loop.stop)
    th.join()
def test_async_this_thread(server):
    """Async filesystems reject sync calls but serve their _-prefixed coroutines."""

    async def body():
        fs = fsspec.filesystem("http", asynchronous=True)
        session = await fs.set_session()  # creates client
        url = server + "/index/realfile"
        # Calling the synchronous wrapper from inside a running loop must fail.
        with pytest.raises((NotImplementedError, RuntimeError)):
            fs.cat([url])
        result = await fs._cat([url])
        del fs
        assert result == {url: data}
        await session.close()

    asyncio.run(body())
def _inner_pass(fs, q, fn):
    """Child-process worker: build a fresh HTTP fs and cat ``fn`` into ``q``.

    The passed-in ``fs`` is deliberately ignored; in a new process the
    instance cache should be skipped so a brand-new instance is created.
    """
    import traceback

    try:
        result = fsspec.filesystem("http").cat(fn)
    except Exception:
        # Report the traceback text so the parent test can show the failure.
        q.put(traceback.format_exc())
    else:
        q.put(result)
@pytest.mark.parametrize("method", ["spawn", "forkserver"])
def test_processes(server, method):
    """HTTP filesystems work in child processes started by spawn/forkserver."""
    import multiprocessing as mp

    if win and method != "spawn":
        pytest.skip("Windows can only spawn")
    ctx = mp.get_context(method)
    url = server + "/index/realfile"
    fs = fsspec.filesystem("http")
    queue = ctx.Queue()
    proc = ctx.Process(target=_inner_pass, args=(fs, queue, url))
    proc.start()
    # The child either puts the file bytes or a traceback string.
    assert queue.get() == fs.cat(url)
    proc.join()
@pytest.mark.parametrize("get_client", [get_aiohttp, get_proxy])
def test_close(get_client):
    """close_session copes with both real aiohttp sessions and foreign clients."""
    fs = fsspec.filesystem("http", skip_instance_cache=True)
    client = asyncio.run(get_client())
    fs.close_session(None, client)
@pytest.mark.asyncio
async def test_async_file(server):
    """open_async yields a file whose reads reassemble the full content."""
    fs = fsspec.filesystem("http", asynchronous=True, skip_instance_cache=True)
    url = server + "/index/realfile"
    handle = await fs.open_async(url)
    async with handle as f:
        head = await f.read(10)
        assert data.startswith(head)
        rest = await f.read()
        assert data == head + rest
    await fs._session.close()
def test_encoded(server):
    """encoded=True sends URLs verbatim; encoded=False percent-quotes them."""
    fs = fsspec.filesystem("http", encoded=True)
    resp = fs.cat(server + "/Hello%3A%20G%C3%BCnter", headers={"give_path": "true"})
    assert json.loads(resp)["path"] == "/Hello%3A%20G%C3%BCnter"
    # A raw (unquoted) URL passed through untouched is rejected by the server.
    with pytest.raises(aiohttp.client_exceptions.ClientError):
        fs.cat(server + "/Hello: Günter", headers={"give_path": "true"})
    fs = fsspec.filesystem("http", encoded=False)
    resp = fs.cat(server + "/Hello: Günter", headers={"give_path": "true"})
    assert json.loads(resp)["path"] == "/Hello:%20G%C3%BCnter"
def test_with_cache(server):
    """blockcache wrapped around HTTP returns the same bytes as a direct read."""
    base = fsspec.filesystem("http", headers={"head_ok": "true", "give_length": "true"})
    url = server + "/index/realfile"
    cached = fsspec.filesystem("blockcache", fs=base)
    with cached.open(url, "rb") as f:
        content = f.read()
    assert content == cached.cat(url)
@pytest.mark.asyncio
async def test_async_expand_path(server):
    """_expand_path honours maxdepth and rejects maxdepth=0."""
    fs = fsspec.filesystem("http", asynchronous=True, skip_instance_cache=True)
    # Recursive expansion limited to one level returns dir plus its file.
    expanded = await fs._expand_path(server + "/index", recursive=True, maxdepth=1)
    assert expanded == [
        server + "/index",
        server + "/index/realfile",
    ]
    # maxdepth=0 is invalid, with or without recursion.
    with pytest.raises(ValueError):
        await fs._expand_path(server + "/index", maxdepth=0)
    with pytest.raises(ValueError):
        await fs._expand_path(server + "/index", recursive=True, maxdepth=0)
    await fs._session.close()
@pytest.mark.asyncio
async def test_async_walk(server):
    """_walk yields (root, dirs, files) tuples and rejects maxdepth=0."""
    fs = fsspec.filesystem("http", asynchronous=True, skip_instance_cache=True)
    # Unbounded walk over /index yields a single (root, dirs, files) entry.
    visited = []
    async for entry in fs._walk(server + "/index"):
        visited.append(entry)  # noqa: PERF402
    assert visited == [(server + "/index", [], ["realfile"])]
    # maxdepth=0 is invalid.
    with pytest.raises(ValueError):
        async for entry in fs._walk(server + "/index", maxdepth=0):
            pass
    await fs._session.close()