Traktor/myenv/Lib/site-packages/fsspec/tests/test_caches.py
2024-05-26 05:12:46 +02:00

256 lines
8.2 KiB
Python

import pickle
import string
import pytest
from fsspec.caching import (
BlockCache,
FirstChunkCache,
ReadAheadCache,
caches,
register_cache,
)
from fsspec.implementations.cached import WholeFileCacheFileSystem
def test_cache_getitem(Cache_imp):
    """Read the first four letters via explicit, ``None`` and offset starts."""
    cache = Cache_imp(4, letters_fetcher, len(string.ascii_letters))
    # (start, expected bytes) for reads that all end at offset 4.
    reads = [
        (0, b"abcd"),
        (None, b"abcd"),
        (2, b"cd"),
    ]
    for start, expected in reads:
        assert cache._fetch(start, 4) == expected
def test_block_cache_lru():
    """Walk a two-block BlockCache through misses, hits and one eviction.

    BlockCache is a cache that stores blocks of data and uses LRU to evict.
    """
    blk = 4
    lru = BlockCache(blk, letters_fetcher, len(string.ascii_letters), maxblocks=2)

    def check_lru(misses, currsize):
        # Counters reported by the cache's own ``cache_info()``.
        assert lru.cache_info().misses == misses
        assert lru.cache_info().currsize == currsize

    def check_requested_bytes():
        # Requested bytes are expected to equal one full block per miss.
        assert lru.total_requested_bytes == blk * lru.miss_count

    # miss
    lru._fetch(0, 2)
    check_lru(misses=1, currsize=1)
    check_requested_bytes()
    assert lru.size == 52

    # hit
    lru._fetch(0, 2)
    check_lru(misses=1, currsize=1)
    check_requested_bytes()

    # hit
    lru._fetch(0, 2)
    check_lru(misses=1, currsize=1)
    # usable as a plain counter here because every read came from the cache
    assert lru.hit_count == 3
    assert lru.miss_count == 1
    # only 4 bytes have been pulled through range requests up to this point
    check_requested_bytes()

    # miss
    lru._fetch(4, 6)
    check_lru(misses=2, currsize=2)
    check_requested_bytes()

    # miss & evict: currsize stays at the maxblocks limit
    lru._fetch(12, 13)
    check_lru(misses=3, currsize=2)
    assert lru.hit_count == 5
    assert lru.miss_count == 3
    check_requested_bytes()
def test_first_cache():
    """
    FirstChunkCache only caches the first chunk of data, and only once
    part of that first block has been requested.
    """
    block_size = 5
    first = FirstChunkCache(block_size, letters_fetcher, len(string.ascii_letters))
    assert first.cache is None

    # A read that does not touch the first block: counted as a miss and
    # nothing is stored.
    assert first._fetch(12, 15) == letters_fetcher(12, 15)
    assert first.miss_count == 1
    assert first.hit_count == 0
    assert first.cache is None
    expected_bytes = 15 - 12
    assert first.total_requested_bytes == expected_bytes

    # This range overlaps the cached region, so the first block gets cached.
    assert first._fetch(3, 10) == letters_fetcher(3, 10)
    assert first.miss_count == 2
    assert first.hit_count == 0
    # the first 5 bytes are read, then the remainder
    expected_bytes += block_size + 5
    assert first.total_requested_bytes == expected_bytes

    # Same range again: a partial hit.
    assert first._fetch(3, 10) == letters_fetcher(3, 10)
    assert first.miss_count == 2
    assert first.hit_count == 1
    # the first 5 bytes are already cached
    expected_bytes += 10 - 5
    assert first.total_requested_bytes == expected_bytes
    assert first.cache == letters_fetcher(0, 5)

    # A read inside the first block.
    assert first._fetch(0, 4) == letters_fetcher(0, 4)
    assert first.hit_count == 2
    assert first.miss_count == 2
    assert first.total_requested_bytes == 18
def test_readahead_cache():
    """
    ReadAheadCache reads ahead of the requested range, which makes it very
    inefficient when the access pattern is not sequential.
    """
    block_size = 5
    ahead = ReadAheadCache(block_size, letters_fetcher, len(string.ascii_letters))

    # First read: a miss.
    assert ahead._fetch(12, 15) == letters_fetcher(12, 15)
    assert ahead.miss_count == 1
    assert ahead.hit_count == 0
    expected_bytes = 15 - 12 + block_size
    assert ahead.total_requested_bytes == expected_bytes

    # Read of a different range: another miss.
    assert ahead._fetch(3, 10) == letters_fetcher(3, 10)
    assert ahead.miss_count == 2
    assert ahead.hit_count == 0
    assert len(ahead.cache) == 12
    expected_bytes += (10 - 3) + block_size
    assert ahead.total_requested_bytes == expected_bytes

    # cache hit on the repeated range; nothing new is requested
    assert ahead._fetch(3, 10) == letters_fetcher(3, 10)
    assert ahead.miss_count == 2
    assert ahead.hit_count == 1
    assert len(ahead.cache) == 12
    assert ahead.total_requested_bytes == expected_bytes
    assert ahead.cache == letters_fetcher(3, 15)

    # cache miss
    assert ahead._fetch(0, 4) == letters_fetcher(0, 4)
    assert ahead.hit_count == 1
    assert ahead.miss_count == 3
    assert len(ahead.cache) == 9
    expected_bytes += (4 - 0) + block_size
    assert ahead.total_requested_bytes == expected_bytes
def _fetcher(start, end):
return b"0" * (end - start)
def letters_fetcher(start, end):
    """Return ``string.ascii_letters[start:end]`` encoded to bytes."""
    window = slice(start, end)
    return string.ascii_letters[window].encode()
# Every registered cache except "parts"; that one is exercised separately in
# test_known, where it is constructed with pre-supplied byte ranges.
not_parts_caches = {
    name: cache_cls for name, cache_cls in caches.items() if name != "parts"
}
@pytest.fixture(
    params=list(not_parts_caches.values()),
    ids=list(not_parts_caches.keys()),
)
def Cache_imp(request):
    """Parametrized fixture: one cache class from ``not_parts_caches`` per run."""
    cache_cls = request.param
    return cache_cls
def test_cache_empty_file(Cache_imp):
    """A zero-length read against a zero-length file yields empty bytes."""
    # Positional arguments: blocksize 5, dummy fetcher, file size 0.
    empty = Cache_imp(5, _fetcher, 0)
    assert empty._fetch(0, 0) == b""
def test_cache_pickleable(Cache_imp):
    """A populated cache survives a pickle round trip and can still fetch."""
    blocksize, size = 5, 100
    original = Cache_imp(blocksize, _fetcher, size)
    original._fetch(0, 5)  # populate the cache before serialising

    payload = pickle.dumps(original)
    restored = pickle.loads(payload)

    assert isinstance(restored, Cache_imp)
    assert restored.blocksize == blocksize
    assert restored.size == size
    # The restored object must still be able to serve reads.
    assert restored._fetch(0, 10) == b"0" * 10
@pytest.mark.parametrize(
    "size_requests",
    [
        [(0, 30), (0, 35), (51, 52)],
        [(0, 1), (1, 11), (1, 52)],
        [(0, 52), (11, 15)],
    ],
)
@pytest.mark.parametrize("blocksize", [1, 10, 52, 100])
def test_cache_basic(Cache_imp, blocksize, size_requests):
    """Each read in a sequence returns exactly the requested slice of letters."""
    letters = string.ascii_letters
    cache = Cache_imp(blocksize, letters_fetcher, len(letters))
    for lo, hi in size_requests:
        fetched = cache._fetch(lo, hi)
        assert fetched == letters[lo:hi].encode()
@pytest.mark.parametrize("strict", [True, False])
@pytest.mark.parametrize("sort", [True, False])
def test_known(sort, strict):
    """Check the "parts" cache with contiguous ranges given in either order."""
    # Three adjacent 10-byte ranges, deliberately listed out of order.
    ranges = {(10, 20): b"1" * 10, (20, 30): b"2" * 10, (0, 10): b"0" * 10}
    if sort:
        ranges = dict(sorted(ranges.items()))
    known = caches["parts"](None, None, 100, ranges, strict=strict)

    assert (0, 30) in known.data  # got consolidated
    # Reads that straddle the original range boundaries.
    assert known._fetch(5, 15) == b"0" * 5 + b"1" * 5
    assert known._fetch(15, 25) == b"1" * 5 + b"2" * 5

    if not strict:
        # Over-read will be zero-padded
        assert known._fetch(25, 35) == b"2" * 5 + b"\x00" * 5
        return

    # Over-read will raise error
    with pytest.raises(ValueError):
        # tries to call None fetcher
        known._fetch(25, 35)
def test_background(server, monkeypatch):
    """Check that the "background" cache loads a block on a second thread.

    The cache's ``_fetch_block`` is wrapped to record the ident of every
    thread that calls it. After a one-byte read, a second thread ident is
    expected to show up while the following block is loaded.

    Parameters
    ----------
    server : fixture
        Base URL of the test HTTP server; ``/index/realfile`` is appended.
    monkeypatch : fixture
        Requested for signature compatibility; not used in the body.
    """
    import threading
    import time

    import fsspec

    head = {"head_ok": "true", "head_give_length": "true"}
    urla = server + "/index/realfile"
    h = fsspec.filesystem("http", headers=head)
    # Start with only the test's own thread recorded.
    thread_ids = {threading.current_thread().ident}
    f = h.open(urla, block_size=5, cache_type="background")
    orig = f.cache._fetch_block

    def wrapped(*a, **kw):
        # Record which thread performs each block fetch, then delegate.
        thread_ids.add(threading.current_thread().ident)
        return orig(*a, **kw)

    f.cache._fetch_block = wrapped
    assert len(thread_ids) == 1
    f.read(1)

    # second block is loading: poll with a bounded deadline rather than a
    # single fixed sleep, so a slow machine does not fail the test spuriously
    # while a background fetch that never starts still fails it.
    deadline = time.monotonic() + 5.0
    while len(thread_ids) < 2 and time.monotonic() < deadline:
        time.sleep(0.01)
    assert len(thread_ids) == 2
def test_register_cache():
    """Re-registering a cache raises unless ``clobber=True`` is passed."""
    # The registry is expected to be populated already, so a plain re-add
    # of BlockCache must be rejected.
    with pytest.raises(ValueError):
        register_cache(BlockCache)

    # Explicitly overriding the existing entry is allowed.
    register_cache(BlockCache, clobber=True)
def test_cache_kwargs(mocker):
    """Kwargs given to ``open`` reach the target filesystem's ``put`` on commit."""
    fs = WholeFileCacheFileSystem(target_protocol="memory")
    fs.touch("test")

    # Replace the target filesystem's ``put`` so the call can be inspected.
    put_mock = mocker.MagicMock()
    fs.fs.put = put_mock

    with fs.open("test", "wb", overwrite=True) as handle:
        handle.write(b"foo")

    # The first positional argument is a random location that cannot be
    # predicted, so it is read back from the recorded call. What matters is
    # that the 'overwrite' kwarg was forwarded.
    local_path = put_mock.call_args[0][0]
    put_mock.assert_called_with(local_path, "/test", overwrite=True)