951 lines
31 KiB
Python
951 lines
31 KiB
Python
|
from __future__ import annotations
|
||
|
|
||
|
import collections
|
||
|
import functools
|
||
|
import logging
|
||
|
import math
|
||
|
import os
|
||
|
import threading
|
||
|
import warnings
|
||
|
from concurrent.futures import Future, ThreadPoolExecutor
|
||
|
from typing import (
|
||
|
TYPE_CHECKING,
|
||
|
Any,
|
||
|
Callable,
|
||
|
ClassVar,
|
||
|
Generic,
|
||
|
NamedTuple,
|
||
|
OrderedDict,
|
||
|
TypeVar,
|
||
|
)
|
||
|
|
||
|
if TYPE_CHECKING:
|
||
|
import mmap
|
||
|
|
||
|
from typing_extensions import ParamSpec
|
||
|
|
||
|
P = ParamSpec("P")
|
||
|
else:
|
||
|
P = TypeVar("P")
|
||
|
|
||
|
T = TypeVar("T")
|
||
|
|
||
|
|
||
|
logger = logging.getLogger("fsspec")
|
||
|
|
||
|
Fetcher = Callable[[int, int], bytes] # Maps (start, end) to bytes
|
||
|
|
||
|
|
||
|
class BaseCache:
    """Pass-through cache: doesn't keep anything, calls every time

    Acts as base class for other cachers

    Parameters
    ----------
    blocksize: int
        How far to read ahead in numbers of bytes
    fetcher: func
        Function of the form f(start, end) which gets bytes from remote as
        specified
    size: int
        How big this file is
    """

    name: ClassVar[str] = "none"

    def __init__(self, blocksize: int, fetcher: Fetcher, size: int) -> None:
        self.blocksize = blocksize
        self.nblocks = 0
        self.fetcher = fetcher
        self.size = size
        # hit/miss statistics, reported by _log_stats()
        self.hit_count = 0
        self.miss_count = 0
        # the bytes that we actually requested
        self.total_requested_bytes = 0

    def _fetch(self, start: int | None, stop: int | None) -> bytes:
        """Fetch [start, stop) with no caching: delegate straight to the fetcher."""
        # normalise open-ended ranges
        start = 0 if start is None else start
        stop = self.size if stop is None else stop
        if start >= self.size or start >= stop:
            # past EOF, or an empty/inverted range: nothing to read
            return b""
        return self.fetcher(start, stop)

    def _reset_stats(self) -> None:
        """Reset hit and miss counts for a more granular report e.g. by file."""
        self.hit_count = 0
        self.miss_count = 0
        self.total_requested_bytes = 0

    def _log_stats(self) -> str:
        """Return a formatted string of the cache statistics."""
        if self.hit_count == 0 and self.miss_count == 0:
            # a cache that does nothing, this is for logs only
            return ""
        template = " , %s: %d hits, %d misses, %d total requested bytes"
        return template % (
            self.name,
            self.hit_count,
            self.miss_count,
            self.total_requested_bytes,
        )

    def __repr__(self) -> str:
        # TODO: use rich for better formatting
        return f"""
        <{self.__class__.__name__}:
            block size  :   {self.blocksize}
            block count :   {self.nblocks}
            file size   :   {self.size}
            cache hits  :   {self.hit_count}
            cache misses:   {self.miss_count}
            total requested bytes: {self.total_requested_bytes}>
        """
|
||
|
|
||
|
|
||
|
class MMapCache(BaseCache):
    """memory-mapped sparse file cache

    Opens temporary file, which is filled blocks-wise when data is requested.
    Ensure there is enough disc space in the temporary location.

    This cache method might only work on posix

    Parameters
    ----------
    location: str | None
        Path of an on-disk cache file to create or reuse; if None, an
        anonymous ``tempfile.TemporaryFile`` is used instead.
    blocks: set[int] | None
        Block numbers already present in the file at ``location`` (e.g. from
        a previous session); discarded when no location is given.
    """

    name = "mmap"

    def __init__(
        self,
        blocksize: int,
        fetcher: Fetcher,
        size: int,
        location: str | None = None,
        blocks: set[int] | None = None,
    ) -> None:
        super().__init__(blocksize, fetcher, size)
        # set of block numbers whose bytes are already present in the mmap
        self.blocks = set() if blocks is None else blocks
        self.location = location
        self.cache = self._makefile()

    def _makefile(self) -> mmap.mmap | bytearray:
        """Create (or reopen) the backing file and return a memory-map of it.

        Returns an empty ``bytearray`` for zero-length files, since mmap
        cannot map zero bytes.
        """
        import mmap
        import tempfile

        if self.size == 0:
            return bytearray()

        # posix version
        if self.location is None or not os.path.exists(self.location):
            if self.location is None:
                fd = tempfile.TemporaryFile()
                # anonymous file is always fresh: any caller-supplied block
                # numbers cannot refer to it, so reset them
                self.blocks = set()
            else:
                fd = open(self.location, "wb+")
            # write one byte at offset size-1 to extend the sparse file to
            # the full file size before mapping it
            fd.seek(self.size - 1)
            fd.write(b"1")
            fd.flush()
        else:
            # reuse an existing cache file (possibly partially filled)
            fd = open(self.location, "r+b")

        return mmap.mmap(fd.fileno(), self.size)

    def _fetch(self, start: int | None, end: int | None) -> bytes:
        """Read [start, end), fetching any missing blocks into the mmap first."""
        logger.debug(f"MMap cache fetching {start}-{end}")
        if start is None:
            start = 0
        if end is None:
            end = self.size
        if start >= self.size or start >= end:
            return b""
        start_block = start // self.blocksize
        end_block = end // self.blocksize
        # blocks we still have to download vs. blocks already in the file
        need = [i for i in range(start_block, end_block + 1) if i not in self.blocks]
        hits = [i for i in range(start_block, end_block + 1) if i in self.blocks]
        self.miss_count += len(need)
        self.hit_count += len(hits)
        while need:
            # TODO: not a for loop so we can consolidate blocks later to
            # make fewer fetch calls; this could be parallel
            i = need.pop(0)

            sstart = i * self.blocksize
            send = min(sstart + self.blocksize, self.size)
            self.total_requested_bytes += send - sstart
            logger.debug(f"MMap get block #{i} ({sstart}-{send})")
            self.cache[sstart:send] = self.fetcher(sstart, send)
            self.blocks.add(i)

        return self.cache[start:end]

    def __getstate__(self) -> dict[str, Any]:
        state = self.__dict__.copy()
        # Remove the unpicklable entries.
        del state["cache"]
        return state

    def __setstate__(self, state: dict[str, Any]) -> None:
        # Restore instance attributes
        self.__dict__.update(state)
        # re-open/re-map the backing file; self.blocks records what is valid
        self.cache = self._makefile()
|
||
|
|
||
|
|
||
|
class ReadAheadCache(BaseCache):
    """Cache which reads only when we get beyond a block of data

    This is a much simpler version of BytesCache, and does not attempt to
    fill holes in the cache or keep fragments alive. It is best suited to
    many small reads in a sequential order (e.g., reading lines from a file).
    """

    name = "readahead"

    def __init__(self, blocksize: int, fetcher: Fetcher, size: int) -> None:
        super().__init__(blocksize, fetcher, size)
        # single buffered window, covering bytes [self.start, self.end)
        self.cache = b""
        self.start = 0
        self.end = 0

    def _fetch(self, start: int | None, end: int | None) -> bytes:
        """Read [start, end), serving from the buffer when possible and
        otherwise replacing it with a fresh read-ahead window."""
        start = 0 if start is None else start
        if end is None or end > self.size:
            end = self.size
        if start >= self.size or start >= end:
            return b""
        nbytes = end - start

        if self.start <= start and end <= self.end:
            # full hit: the whole request lies inside the buffer
            self.hit_count += 1
            return self.cache[start - self.start : end - self.start]

        if self.start <= start < self.end:
            # partial hit: keep the buffer tail, fetch only the remainder
            self.miss_count += 1
            prefix = self.cache[start - self.start :]
            nbytes -= len(prefix)
            start = self.end
        else:
            # complete miss
            self.miss_count += 1
            prefix = b""

        # fetch the remainder plus up to one blocksize of read-ahead
        end = min(self.size, end + self.blocksize)
        self.total_requested_bytes += end - start
        self.cache = self.fetcher(start, end)  # new block replaces old
        self.start = start
        self.end = self.start + len(self.cache)
        return prefix + self.cache[:nbytes]
|
||
|
|
||
|
|
||
|
class FirstChunkCache(BaseCache):
    """Caches the first block of a file only

    This may be useful for file types where the metadata is stored in the header,
    but is randomly accessed.
    """

    name = "first"

    def __init__(self, blocksize: int, fetcher: Fetcher, size: int) -> None:
        if blocksize > size:
            # this will buffer the whole thing
            blocksize = size
        super().__init__(blocksize, fetcher, size)
        # lazily-populated copy of bytes [0, blocksize)
        self.cache: bytes | None = None

    def _fetch(self, start: int | None, end: int | None) -> bytes:
        """Read [start, end), serving the leading ``blocksize`` bytes from cache.

        Parameters
        ----------
        start, end: int | None
            Byte range; ``None`` means start of file / end of file respectively.
        """
        start = start or 0
        if start > self.size:
            logger.debug("FirstChunkCache: requested start > file size")
            return b""

        # BUGFIX: ``end`` may be None (read to EOF), and ``min(None, int)``
        # raises TypeError; normalise first, as the sibling caches do.
        end = self.size if end is None else min(end, self.size)

        if start < self.blocksize:
            if self.cache is None:
                # first access: populate the header cache
                self.miss_count += 1
                if end > self.blocksize:
                    # request extends past the header; fetch it all in one
                    # call and keep only the first block
                    self.total_requested_bytes += end
                    data = self.fetcher(0, end)
                    self.cache = data[: self.blocksize]
                    return data[start:]
                self.cache = self.fetcher(0, self.blocksize)
                self.total_requested_bytes += self.blocksize
            part = self.cache[start:end]
            if end > self.blocksize:
                # tail beyond the cached header is fetched fresh every time
                self.total_requested_bytes += end - self.blocksize
                part += self.fetcher(self.blocksize, end)
            self.hit_count += 1
            return part
        else:
            # entirely outside the cached header: pass straight through
            self.miss_count += 1
            self.total_requested_bytes += end - start
            return self.fetcher(start, end)
|
||
|
|
||
|
|
||
|
class BlockCache(BaseCache):
    """
    Cache holding memory as a set of blocks.

    Requests are only ever made ``blocksize`` at a time, and are
    stored in an LRU cache. The least recently accessed block is
    discarded when more than ``maxblocks`` are stored.

    Parameters
    ----------
    blocksize : int
        The number of bytes to store in each block.
        Requests are only ever made for ``blocksize``, so this
        should balance the overhead of making a request against
        the granularity of the blocks.
    fetcher : Callable
    size : int
        The total size of the file being cached.
    maxblocks : int
        The maximum number of blocks to cache for. The maximum memory
        use for this cache is then ``blocksize * maxblocks``.
    """

    name = "blockcache"

    def __init__(
        self, blocksize: int, fetcher: Fetcher, size: int, maxblocks: int = 32
    ) -> None:
        super().__init__(blocksize, fetcher, size)
        self.nblocks = math.ceil(size / blocksize)
        self.maxblocks = maxblocks
        # per-instance LRU memoization of block fetches, keyed on block number
        self._fetch_block_cached = functools.lru_cache(maxblocks)(self._fetch_block)

    def cache_info(self):
        """
        The statistics on the block cache.

        Returns
        -------
        NamedTuple
            Returned directly from the LRU Cache used internally.
        """
        return self._fetch_block_cached.cache_info()

    def __getstate__(self) -> dict[str, Any]:
        # BUGFIX: copy before deleting. ``state = self.__dict__`` aliased the
        # live attribute dict, so merely pickling the object stripped
        # ``_fetch_block_cached`` from the instance and broke later reads
        # (compare MMapCache.__getstate__, which copies).
        state = self.__dict__.copy()
        # the lru_cache wrapper is not picklable; rebuilt in __setstate__
        del state["_fetch_block_cached"]
        return state

    def __setstate__(self, state: dict[str, Any]) -> None:
        self.__dict__.update(state)
        self._fetch_block_cached = functools.lru_cache(state["maxblocks"])(
            self._fetch_block
        )

    def _fetch(self, start: int | None, end: int | None) -> bytes:
        """Read [start, end) by pulling whole blocks through the LRU cache."""
        if start is None:
            start = 0
        if end is None:
            end = self.size
        if start >= self.size or start >= end:
            return b""

        # byte position -> block numbers
        start_block_number = start // self.blocksize
        end_block_number = end // self.blocksize

        # these are cached, so safe to do multiple calls for the same start and end.
        for block_number in range(start_block_number, end_block_number + 1):
            self._fetch_block_cached(block_number)

        return self._read_cache(
            start,
            end,
            start_block_number=start_block_number,
            end_block_number=end_block_number,
        )

    def _fetch_block(self, block_number: int) -> bytes:
        """
        Fetch the block of data for `block_number`.

        Only called via the LRU wrapper, so each block is fetched at most
        once while it remains cached.
        """
        if block_number > self.nblocks:
            raise ValueError(
                f"'block_number={block_number}' is greater than "
                f"the number of blocks ({self.nblocks})"
            )

        start = block_number * self.blocksize
        end = start + self.blocksize
        self.total_requested_bytes += end - start
        self.miss_count += 1
        logger.info("BlockCache fetching block %d", block_number)
        block_contents = super()._fetch(start, end)
        return block_contents

    def _read_cache(
        self, start: int, end: int, start_block_number: int, end_block_number: int
    ) -> bytes:
        """
        Read from our block cache.

        Parameters
        ----------
        start, end : int
            The start and end byte positions.
        start_block_number, end_block_number : int
            The start and end block numbers.
        """
        start_pos = start % self.blocksize
        end_pos = end % self.blocksize

        self.hit_count += 1
        if start_block_number == end_block_number:
            block: bytes = self._fetch_block_cached(start_block_number)
            return block[start_pos:end_pos]

        else:
            # read from the initial
            out = [self._fetch_block_cached(start_block_number)[start_pos:]]

            # intermediate blocks
            # Note: it'd be nice to combine these into one big request. However
            # that doesn't play nicely with our LRU cache.
            out.extend(
                map(
                    self._fetch_block_cached,
                    range(start_block_number + 1, end_block_number),
                )
            )

            # final block
            out.append(self._fetch_block_cached(end_block_number)[:end_pos])

            return b"".join(out)
|
||
|
|
||
|
|
||
|
class BytesCache(BaseCache):
    """Cache which holds data in a in-memory bytes object

    Implements read-ahead by the block size, for semi-random reads progressing
    through the file.

    Parameters
    ----------
    trim: bool
        As we read more data, whether to discard the start of the buffer when
        we are more than a blocksize ahead of it.
    """

    name: ClassVar[str] = "bytes"

    def __init__(
        self, blocksize: int, fetcher: Fetcher, size: int, trim: bool = True
    ) -> None:
        super().__init__(blocksize, fetcher, size)
        # single contiguous buffer covering bytes [self.start, self.end);
        # start/end are None until the first fetch
        self.cache = b""
        self.start: int | None = None
        self.end: int | None = None
        self.trim = trim

    def _fetch(self, start: int | None, end: int | None) -> bytes:
        """Read [start, end), growing the contiguous buffer to cover it.

        The buffer may be extended at its front, at its back, or replaced
        entirely, depending on where the request falls relative to the
        currently-held range.
        """
        # TODO: only set start/end after fetch, in case it fails?
        # is this where retry logic might go?
        if start is None:
            start = 0
        if end is None:
            end = self.size
        if start >= self.size or start >= end:
            return b""
        if (
            self.start is not None
            and start >= self.start
            and self.end is not None
            and end < self.end
        ):
            # cache hit: we have all the required data
            offset = start - self.start
            self.hit_count += 1
            return self.cache[offset : offset + end - start]

        # read-ahead: extend the requested range by one blocksize (if any)
        if self.blocksize:
            bend = min(self.size, end + self.blocksize)
        else:
            bend = end

        if bend == start or start > self.size:
            return b""

        if (self.start is None or start < self.start) and (
            self.end is None or end > self.end
        ):
            # First read, or extending both before and after
            self.total_requested_bytes += bend - start
            self.miss_count += 1
            self.cache = self.fetcher(start, bend)
            self.start = start
        else:
            assert self.start is not None
            assert self.end is not None
            self.miss_count += 1

            if start < self.start:
                # extend (or replace) at the front of the buffer
                if self.end is None or self.end - end > self.blocksize:
                    # gap too large to be worth keeping the old data
                    self.total_requested_bytes += bend - start
                    self.cache = self.fetcher(start, bend)
                    self.start = start
                else:
                    # prepend just the missing head
                    self.total_requested_bytes += self.start - start
                    new = self.fetcher(start, self.start)
                    self.start = start
                    self.cache = new + self.cache
            elif self.end is not None and bend > self.end:
                # extend (or replace) at the back of the buffer
                if self.end > self.size:
                    # already buffered to EOF; nothing more to fetch
                    pass
                elif end - self.end > self.blocksize:
                    # gap too large to be worth keeping the old data
                    self.total_requested_bytes += bend - start
                    self.cache = self.fetcher(start, bend)
                    self.start = start
                else:
                    # append just the missing tail
                    self.total_requested_bytes += bend - self.end
                    new = self.fetcher(self.end, bend)
                    self.cache = self.cache + new

        self.end = self.start + len(self.cache)
        offset = start - self.start
        out = self.cache[offset : offset + end - start]
        if self.trim:
            # drop whole blocks from the front once the buffer has grown
            # beyond roughly two blocksizes
            num = (self.end - self.start) // (self.blocksize + 1)
            if num > 1:
                self.start += self.blocksize * num
                self.cache = self.cache[self.blocksize * num :]
        return out

    def __len__(self) -> int:
        # number of bytes currently held in the buffer
        return len(self.cache)
|
||
|
|
||
|
|
||
|
class AllBytes(BaseCache):
    """Cache entire contents of the file"""

    name: ClassVar[str] = "all"

    def __init__(
        self,
        blocksize: int | None = None,
        fetcher: Fetcher | None = None,
        size: int | None = None,
        data: bytes | None = None,
    ) -> None:
        super().__init__(blocksize, fetcher, size)  # type: ignore[arg-type]
        # use the supplied content if given, otherwise read the whole file now
        self.data = self._read_whole_file() if data is None else data

    def _read_whole_file(self) -> bytes:
        # one-shot fetch of the entire file; counted as a single miss
        self.miss_count += 1
        self.total_requested_bytes += self.size
        return self.fetcher(0, self.size)

    def _fetch(self, start: int | None, stop: int | None) -> bytes:
        """Slice the fully-buffered content; every read is a hit."""
        self.hit_count += 1
        return self.data[start:stop]
|
||
|
|
||
|
|
||
|
class KnownPartsOfAFile(BaseCache):
    """
    Cache holding known file parts.

    Parameters
    ----------
    blocksize: int
        How far to read ahead in numbers of bytes
    fetcher: func
        Function of the form f(start, end) which gets bytes from remote as
        specified
    size: int
        How big this file is
    data: dict or None
        A dictionary mapping explicit `(start, stop)` file-offset tuples
        with known bytes. Note: consolidation pops entries from this dict.
    strict: bool, default True
        Whether to fetch reads that go beyond a known byte-range boundary.
        If `False`, any read that ends outside a known part will be zero
        padded. Note that zero padding will not be used for reads that
        begin outside a known byte-range.
    """

    name: ClassVar[str] = "parts"

    def __init__(
        self,
        blocksize: int,
        fetcher: Fetcher,
        size: int,
        data: dict[tuple[int, int], bytes] | None = None,
        strict: bool = True,
        **_: Any,
    ):
        super().__init__(blocksize, fetcher, size)
        self.strict = strict

        # BUGFIX: the default used to be a shared mutable ``{}``, which was
        # stored as ``self.data`` and therefore aliased between instances;
        # use the None sentinel instead (backward compatible).
        if data is None:
            data = {}

        # simple consolidation of contiguous blocks
        if data:
            old_offsets = sorted(data.keys())
            offsets = [old_offsets[0]]
            blocks = [data.pop(old_offsets[0])]
            for start, stop in old_offsets[1:]:
                start0, stop0 = offsets[-1]
                if start == stop0:
                    # adjacent to the previous range: merge into one entry
                    offsets[-1] = (start0, stop)
                    blocks[-1] += data.pop((start, stop))
                else:
                    offsets.append((start, stop))
                    blocks.append(data.pop((start, stop)))

            self.data = dict(zip(offsets, blocks))
        else:
            self.data = data

    def _fetch(self, start: int | None, stop: int | None) -> bytes:
        """Read [start, stop) from the known parts, padding or fetching
        (depending on ``strict``/``fetcher``) for bytes outside them."""
        if start is None:
            start = 0
        if stop is None:
            stop = self.size

        out = b""
        for (loc0, loc1), data in self.data.items():
            # If self.strict=False, use zero-padded data
            # for reads beyond the end of a "known" buffer
            if loc0 <= start < loc1:
                off = start - loc0
                out = data[off : off + stop - start]
                if not self.strict or loc0 <= stop <= loc1:
                    # The request is within a known range, or
                    # it begins within a known range, and we
                    # are allowed to pad reads beyond the
                    # buffer with zero
                    out += b"\x00" * (stop - start - len(out))
                    self.hit_count += 1
                    return out
                else:
                    # The request ends outside a known range,
                    # and we are being "strict" about reads
                    # beyond the buffer
                    start = loc1
                    break

        # We only get here if there is a request outside the
        # known parts of the file. In an ideal world, this
        # should never happen
        if self.fetcher is None:
            # We cannot fetch the data, so raise an error
            raise ValueError(f"Read is outside the known file parts: {(start, stop)}. ")
        # We can fetch the data, but should warn the user
        # that this may be slow
        warnings.warn(
            f"Read is outside the known file parts: {(start, stop)}. "
            f"IO/caching performance may be poor!"
        )
        logger.debug(f"KnownPartsOfAFile cache fetching {start}-{stop}")
        self.total_requested_bytes += stop - start
        self.miss_count += 1
        return out + super()._fetch(start, stop)
|
||
|
|
||
|
|
||
|
class UpdatableLRU(Generic[P, T]):
    """
    Custom implementation of LRU cache that allows updating keys

    Used by BackgroundBlockCache
    """

    class CacheInfo(NamedTuple):
        hits: int
        misses: int
        maxsize: int
        currsize: int

    def __init__(self, func: Callable[P, T], max_size: int = 128) -> None:
        # ordered mapping: oldest entry first, most recently used last
        self._cache: OrderedDict[Any, T] = collections.OrderedDict()
        self._func = func
        self._max_size = max_size
        self._hits = 0
        self._misses = 0
        self._lock = threading.Lock()

    def __call__(self, *args: P.args, **kwargs: P.kwargs) -> T:
        """Return the cached value for ``args``, computing it on a miss."""
        if kwargs:
            raise TypeError(f"Got unexpected keyword argument {kwargs.keys()}")
        with self._lock:
            try:
                value = self._cache[args]
            except KeyError:
                pass
            else:
                self._cache.move_to_end(args)
                self._hits += 1
                return value

        # compute outside the lock so concurrent callers are not serialised
        value = self._func(*args, **kwargs)

        with self._lock:
            self._cache[args] = value
            self._misses += 1
            if len(self._cache) > self._max_size:
                # evict the least recently used entry
                self._cache.popitem(last=False)

        return value

    def is_key_cached(self, *args: Any) -> bool:
        """Whether a value for ``args`` is currently held in the cache."""
        with self._lock:
            return args in self._cache

    def add_key(self, result: T, *args: Any) -> None:
        """Insert/overwrite the cached value for ``args`` directly."""
        with self._lock:
            self._cache[args] = result
            if len(self._cache) > self._max_size:
                self._cache.popitem(last=False)

    def cache_info(self) -> UpdatableLRU.CacheInfo:
        """Snapshot of hit/miss counters and current cache occupancy."""
        with self._lock:
            return self.CacheInfo(
                hits=self._hits,
                misses=self._misses,
                maxsize=self._max_size,
                currsize=len(self._cache),
            )
|
||
|
|
||
|
|
||
|
class BackgroundBlockCache(BaseCache):
    """
    Cache holding memory as a set of blocks with pre-loading of
    the next block in the background.

    Requests are only ever made ``blocksize`` at a time, and are
    stored in an LRU cache. The least recently accessed block is
    discarded when more than ``maxblocks`` are stored. If the
    next block is not in cache, it is loaded in a separate thread
    in non-blocking way.

    Parameters
    ----------
    blocksize : int
        The number of bytes to store in each block.
        Requests are only ever made for ``blocksize``, so this
        should balance the overhead of making a request against
        the granularity of the blocks.
    fetcher : Callable
    size : int
        The total size of the file being cached.
    maxblocks : int
        The maximum number of blocks to cache for. The maximum memory
        use for this cache is then ``blocksize * maxblocks``.
    """

    name: ClassVar[str] = "background"

    def __init__(
        self, blocksize: int, fetcher: Fetcher, size: int, maxblocks: int = 32
    ) -> None:
        super().__init__(blocksize, fetcher, size)
        self.nblocks = math.ceil(size / blocksize)
        self.maxblocks = maxblocks
        self._fetch_block_cached = UpdatableLRU(self._fetch_block, maxblocks)

        # single worker: at most one speculative block fetch in flight
        self._thread_executor = ThreadPoolExecutor(max_workers=1)
        self._fetch_future_block_number: int | None = None
        self._fetch_future: Future[bytes] | None = None
        # guards the two _fetch_future* attributes above
        self._fetch_future_lock = threading.Lock()

    def cache_info(self) -> UpdatableLRU.CacheInfo:
        """
        The statistics on the block cache.

        Returns
        -------
        NamedTuple
            Returned directly from the LRU Cache used internally.
        """
        return self._fetch_block_cached.cache_info()

    def __getstate__(self) -> dict[str, Any]:
        # BUGFIX: copy before deleting. ``state = self.__dict__`` aliased the
        # live attribute dict, so pickling destroyed the instance's cache,
        # executor and lock attributes (compare MMapCache.__getstate__).
        state = self.__dict__.copy()
        # none of these are picklable; all rebuilt in __setstate__
        del state["_fetch_block_cached"]
        del state["_thread_executor"]
        del state["_fetch_future_block_number"]
        del state["_fetch_future"]
        del state["_fetch_future_lock"]
        return state

    def __setstate__(self, state) -> None:
        self.__dict__.update(state)
        self._fetch_block_cached = UpdatableLRU(self._fetch_block, state["maxblocks"])
        self._thread_executor = ThreadPoolExecutor(max_workers=1)
        self._fetch_future_block_number = None
        self._fetch_future = None
        self._fetch_future_lock = threading.Lock()

    def _fetch(self, start: int | None, end: int | None) -> bytes:
        """Read [start, end) through the LRU cache, joining or scheduling
        the single background prefetch as appropriate."""
        if start is None:
            start = 0
        if end is None:
            end = self.size
        if start >= self.size or start >= end:
            return b""

        # byte position -> block numbers
        start_block_number = start // self.blocksize
        end_block_number = end // self.blocksize

        fetch_future_block_number = None
        fetch_future = None
        with self._fetch_future_lock:
            # Background thread is running. Check whether we can or must join it.
            if self._fetch_future is not None:
                assert self._fetch_future_block_number is not None
                if self._fetch_future.done():
                    logger.info("BlockCache joined background fetch without waiting.")
                    self._fetch_block_cached.add_key(
                        self._fetch_future.result(), self._fetch_future_block_number
                    )
                    # Cleanup the fetch variables. Done with fetching the block.
                    self._fetch_future_block_number = None
                    self._fetch_future = None
                else:
                    # Must join if we need the block for the current fetch
                    must_join = bool(
                        start_block_number
                        <= self._fetch_future_block_number
                        <= end_block_number
                    )
                    if must_join:
                        # Copy to the local variables to release lock
                        # before waiting for result
                        fetch_future_block_number = self._fetch_future_block_number
                        fetch_future = self._fetch_future

                        # Cleanup the fetch variables. Have a local copy.
                        self._fetch_future_block_number = None
                        self._fetch_future = None

        # Need to wait for the future for the current read
        if fetch_future is not None:
            logger.info("BlockCache waiting for background fetch.")
            # Wait until result and put it in cache
            self._fetch_block_cached.add_key(
                fetch_future.result(), fetch_future_block_number
            )

        # these are cached, so safe to do multiple calls for the same start and end.
        for block_number in range(start_block_number, end_block_number + 1):
            self._fetch_block_cached(block_number)

        # fetch next block in the background if nothing is running in the background,
        # the block is within file and it is not already cached
        end_block_plus_1 = end_block_number + 1
        with self._fetch_future_lock:
            if (
                self._fetch_future is None
                # BUGFIX: valid block numbers are 0..nblocks-1, so ``<=``
                # scheduled a useless prefetch of an empty block past EOF
                and end_block_plus_1 < self.nblocks
                and not self._fetch_block_cached.is_key_cached(end_block_plus_1)
            ):
                self._fetch_future_block_number = end_block_plus_1
                self._fetch_future = self._thread_executor.submit(
                    self._fetch_block, end_block_plus_1, "async"
                )

        return self._read_cache(
            start,
            end,
            start_block_number=start_block_number,
            end_block_number=end_block_number,
        )

    def _fetch_block(self, block_number: int, log_info: str = "sync") -> bytes:
        """
        Fetch the block of data for `block_number`.

        ``log_info`` only tags the log line ("sync" caller thread,
        "async" background prefetch).
        """
        if block_number > self.nblocks:
            raise ValueError(
                f"'block_number={block_number}' is greater than "
                f"the number of blocks ({self.nblocks})"
            )

        start = block_number * self.blocksize
        end = start + self.blocksize
        logger.info("BlockCache fetching block (%s) %d", log_info, block_number)
        self.total_requested_bytes += end - start
        self.miss_count += 1
        block_contents = super()._fetch(start, end)
        return block_contents

    def _read_cache(
        self, start: int, end: int, start_block_number: int, end_block_number: int
    ) -> bytes:
        """
        Read from our block cache.

        Parameters
        ----------
        start, end : int
            The start and end byte positions.
        start_block_number, end_block_number : int
            The start and end block numbers.
        """
        start_pos = start % self.blocksize
        end_pos = end % self.blocksize

        # kind of pointless to count this as a hit, but it is
        self.hit_count += 1

        if start_block_number == end_block_number:
            block = self._fetch_block_cached(start_block_number)
            return block[start_pos:end_pos]

        else:
            # read from the initial
            out = [self._fetch_block_cached(start_block_number)[start_pos:]]

            # intermediate blocks
            # Note: it'd be nice to combine these into one big request. However
            # that doesn't play nicely with our LRU cache.
            out.extend(
                map(
                    self._fetch_block_cached,
                    range(start_block_number + 1, end_block_number),
                )
            )

            # final block
            out.append(self._fetch_block_cached(end_block_number)[:end_pos])

            return b"".join(out)
|
||
|
|
||
|
|
||
|
# Registry mapping a cache's ``name`` to its implementation class;
# populated below via register_cache().
caches: dict[str | None, type[BaseCache]] = {
    # one custom case
    None: BaseCache,
}
|
||
|
|
||
|
|
||
|
def register_cache(cls: type[BaseCache], clobber: bool = False) -> None:
    """'Register' cache implementation.

    Adds ``cls`` to the module-level ``caches`` registry under ``cls.name``.

    Parameters
    ----------
    cls: type[BaseCache]
        The cache implementation to register; its ``name`` attribute is
        used as the registry key.
    clobber: bool, optional
        If set to True (default is False) - allow to overwrite existing
        entry.

    Raises
    ------
    ValueError
        If an entry for ``cls.name`` already exists and ``clobber`` is False.
    """
    name = cls.name
    if name in caches and not clobber:
        raise ValueError(f"Cache with name {name!r} is already known: {caches[name]}")
    caches[name] = cls
|
||
|
|
||
|
|
||
|
# Register every built-in cache implementation under its ``name`` key.
for c in (
    BaseCache,
    MMapCache,
    BytesCache,
    ReadAheadCache,
    BlockCache,
    FirstChunkCache,
    AllBytes,
    KnownPartsOfAFile,
    BackgroundBlockCache,
):
    register_cache(c)
|