Inzynierka/Lib/site-packages/pandas/core/interchange/column.py

378 lines
13 KiB
Python
Raw Normal View History

2023-06-02 12:51:02 +02:00
from __future__ import annotations
from typing import Any
import numpy as np
from pandas._libs.lib import infer_dtype
from pandas._libs.tslibs import iNaT
from pandas.errors import NoBufferPresent
from pandas.util._decorators import cache_readonly
import pandas as pd
from pandas.api.types import (
is_categorical_dtype,
is_string_dtype,
)
from pandas.core.interchange.buffer import PandasBuffer
from pandas.core.interchange.dataframe_protocol import (
Column,
ColumnBuffers,
ColumnNullType,
DtypeKind,
)
from pandas.core.interchange.utils import (
ArrowCTypes,
Endianness,
dtype_to_arrow_c_fmt,
)
# Lookup from a NumPy dtype ``kind`` character to the interchange protocol's
# ``DtypeKind``. Both datetime64 ("M") and timedelta64 ("m") are reported as
# DATETIME. Kinds that are absent here are rejected by the column's dtype
# conversion.
_NP_KINDS = dict(
    i=DtypeKind.INT,
    u=DtypeKind.UINT,
    f=DtypeKind.FLOAT,
    b=DtypeKind.BOOL,
    U=DtypeKind.STRING,
    M=DtypeKind.DATETIME,
    m=DtypeKind.DATETIME,
)
# For every supported ``DtypeKind``: the pair ``(null representation, value)``
# that the column reports from ``describe_null``.
_NULL_DESCRIPTION = {
    DtypeKind.FLOAT: (ColumnNullType.USE_NAN, None),
    DtypeKind.DATETIME: (ColumnNullType.USE_SENTINEL, iNaT),
    # These three kinds share one entry: they are declared non-nullable.
    **dict.fromkeys(
        (DtypeKind.INT, DtypeKind.UINT, DtypeKind.BOOL),
        (ColumnNullType.NON_NULLABLE, None),
    ),
    # Missing categorical entries are stored as the sentinel code `-1`
    # in the category data (e.g., `col.values.codes` is an int8 np.ndarray).
    DtypeKind.CATEGORICAL: (ColumnNullType.USE_SENTINEL, -1),
    # Same convention as Arrow: 1 marks a valid value, 0 a missing/null one.
    DtypeKind.STRING: (ColumnNullType.USE_BYTEMASK, 0),
}
# Null representations that never come with a separate validity mask, each
# paired with the opening clause of the message used when a mask is requested
# anyway.
_NO_VALIDITY_BUFFER = {
    ColumnNullType.NON_NULLABLE: "This column is non-nullable",
    ColumnNullType.USE_NAN: "This column uses NaN as null",
    ColumnNullType.USE_SENTINEL: "This column uses a sentinel value",
}
class PandasColumn(Column):
    """
    A column object, with only the methods and properties required by the
    interchange protocol defined.

    A column can contain one or more chunks. Each chunk can contain up to three
    buffers - a data buffer, a mask buffer (depending on null representation),
    and an offsets buffer (if variable-size binary; e.g., variable-length
    strings).

    Note: this Column object can only be produced by ``__dataframe__``, so
    doesn't need its own version or ``__column__`` protocol.
    """

    def __init__(self, column: pd.Series, allow_copy: bool = True) -> None:
        """
        Wrap ``column`` for export through the interchange protocol.

        Note: doesn't deal with extension arrays yet, just assume a regular
        Series/ndarray for now.

        Parameters
        ----------
        column : pd.Series
            The data to expose.
        allow_copy : bool, default True
            Forwarded to the buffers and chunks created from this column.

        Raises
        ------
        NotImplementedError
            If ``column`` is not a ``pd.Series``.
        """
        if not isinstance(column, pd.Series):
            raise NotImplementedError(f"Columns of type {type(column)} not handled yet")

        # Store the column as a private attribute
        self._col = column
        self._allow_copy = allow_copy

    def size(self) -> int:
        """
        Size of the column, in elements.
        """
        return self._col.size

    @property
    def offset(self) -> int:
        """
        Offset of first element. Always zero.
        """
        # TODO: chunks are implemented now, probably this should return something
        return 0

    @cache_readonly
    def dtype(self) -> tuple[DtypeKind, int, str, str]:
        """
        Dtype description as a tuple ``(kind, bit-width, format string, endianness)``.

        - Categorical columns report ``DtypeKind.CATEGORICAL`` together with
          the bit-width and format string of their integer codes.
        - String columns report ``DtypeKind.STRING`` with a bit-width of 8.
        - Every other dtype is translated by ``_dtype_from_pandasdtype``.

        Raises
        ------
        NotImplementedError
            If the column passes ``is_string_dtype`` but its inferred dtype is
            not ``"string"``.
        ValueError
            If the dtype kind is not supported (see ``_dtype_from_pandasdtype``).
        """
        dtype = self._col.dtype

        if is_categorical_dtype(dtype):
            codes = self._col.values.codes
            # Only the width and format of the codes are reused; the kind and
            # endianness are overridden below.
            _, bitwidth, c_arrow_dtype_f_str, _ = self._dtype_from_pandasdtype(
                codes.dtype
            )
            return (
                DtypeKind.CATEGORICAL,
                bitwidth,
                c_arrow_dtype_f_str,
                Endianness.NATIVE,
            )
        elif is_string_dtype(dtype):
            if infer_dtype(self._col) == "string":
                return (
                    DtypeKind.STRING,
                    8,
                    dtype_to_arrow_c_fmt(dtype),
                    Endianness.NATIVE,
                )
            raise NotImplementedError("Non-string object dtypes are not supported yet")
        else:
            return self._dtype_from_pandasdtype(dtype)

    def _dtype_from_pandasdtype(self, dtype) -> tuple[DtypeKind, int, str, str]:
        """
        Translate ``dtype`` into the protocol's dtype tuple.

        See `self.dtype` for details on the returned tuple.

        Raises
        ------
        ValueError
            If ``dtype.kind`` has no entry in ``_NP_KINDS``.
        """
        # Note: 'c' (complex) not handled yet (not in array spec v1).
        #       'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) not handled
        #       datetime and timedelta both map to datetime (is timedelta handled?)
        kind = _NP_KINDS.get(dtype.kind, None)
        if kind is None:
            # Not a NumPy dtype. Check if it's a categorical maybe
            raise ValueError(f"Data type {dtype} not supported by interchange protocol")

        return kind, dtype.itemsize * 8, dtype_to_arrow_c_fmt(dtype), dtype.byteorder

    @property
    def describe_categorical(self):
        """
        If the dtype is categorical, there are two options:
        - There are only values in the data buffer.
        - There is a separate non-categorical Column encoding for categorical values.

        Raises TypeError if the dtype is not categorical

        Content of returned dict:
            - "is_ordered" : bool, whether the ordering of dictionary indices is
                             semantically meaningful.
            - "is_dictionary" : bool, whether a dictionary-style mapping of
                                categorical values to other objects exists
            - "categories" : Column representing the (implicit) mapping of indices to
                             category values (e.g. an array of cat1, cat2, ...).
                             None if not a dictionary-style categorical.
        """
        if self.dtype[0] != DtypeKind.CATEGORICAL:
            raise TypeError(
                "describe_categorical only works on a column with categorical dtype!"
            )

        return {
            "is_ordered": self._col.cat.ordered,
            "is_dictionary": True,
            "categories": PandasColumn(pd.Series(self._col.cat.categories)),
        }

    @property
    def describe_null(self):
        """
        Return ``(null representation, value)`` for this column's dtype kind,
        looked up in ``_NULL_DESCRIPTION``.

        Raises
        ------
        NotImplementedError
            If the dtype kind has no entry in ``_NULL_DESCRIPTION``.
        """
        kind = self.dtype[0]
        try:
            null, value = _NULL_DESCRIPTION[kind]
        except KeyError as err:
            raise NotImplementedError(f"Data type {kind} not yet supported") from err

        return null, value

    @cache_readonly
    def null_count(self) -> int:
        """
        Number of null elements. Should always be known.
        """
        return self._col.isna().sum().item()

    @property
    def metadata(self) -> dict[str, pd.Index]:
        """
        Store specific metadata of the column.
        """
        return {"pandas.index": self._col.index}

    def num_chunks(self) -> int:
        """
        Return the number of chunks the column consists of.
        """
        return 1

    def get_chunks(self, n_chunks: int | None = None):
        """
        Return an iterator yielding the chunks.

        See `DataFrame.get_chunks` for details on ``n_chunks``.

        When ``n_chunks`` is greater than 1 exactly ``n_chunks`` columns are
        yielded; trailing ones are empty if the column has fewer elements than
        requested chunks (including when the column itself is empty).
        Otherwise the column yields itself as its single chunk.
        """
        if n_chunks and n_chunks > 1:
            size = len(self._col)
            # Ceiling division gives the per-chunk length. Clamp it to 1 so an
            # empty column does not hand ``range`` a zero step (ValueError) and
            # instead produces ``n_chunks`` empty chunks.
            step = max(-(-size // n_chunks), 1)
            for start in range(0, step * n_chunks, step):
                yield PandasColumn(
                    self._col.iloc[start : start + step], self._allow_copy
                )
        else:
            yield self

    def get_buffers(self) -> ColumnBuffers:
        """
        Return a dictionary containing the underlying buffers.

        The returned dictionary has the following contents:
            - "data": a two-element tuple whose first element is a buffer
                      containing the data and whose second element is the data
                      buffer's associated dtype.
            - "validity": a two-element tuple whose first element is a buffer
                          containing mask values indicating missing data and
                          whose second element is the mask value buffer's
                          associated dtype. None if the null representation is
                          not a bit or byte mask.
            - "offsets": a two-element tuple whose first element is a buffer
                         containing the offset values for variable-size binary
                         data (e.g., variable-length strings) and whose second
                         element is the offsets buffer's associated dtype. None
                         if the data buffer does not have an associated offsets
                         buffer.
        """
        buffers: ColumnBuffers = {
            "data": self._get_data_buffer(),
            "validity": None,
            "offsets": None,
        }

        # A missing validity/offsets buffer is an expected outcome, not an
        # error: the corresponding entry simply stays ``None``.
        try:
            buffers["validity"] = self._get_validity_buffer()
        except NoBufferPresent:
            pass

        try:
            buffers["offsets"] = self._get_offsets_buffer()
        except NoBufferPresent:
            pass

        return buffers

    def _get_data_buffer(
        self,
    ) -> tuple[PandasBuffer, Any]:  # Any is for self.dtype tuple
        """
        Return the buffer containing the data and the buffer's associated dtype.

        Raises
        ------
        NotImplementedError
            If the column's dtype kind is not one of the handled kinds.
        """
        kind = self.dtype[0]
        if kind in (
            DtypeKind.INT,
            DtypeKind.UINT,
            DtypeKind.FLOAT,
            DtypeKind.BOOL,
            DtypeKind.DATETIME,
        ):
            buffer = PandasBuffer(self._col.to_numpy(), allow_copy=self._allow_copy)
            dtype = self.dtype
        elif kind == DtypeKind.CATEGORICAL:
            # The data buffer of a categorical holds its integer codes.
            codes = self._col.values._codes
            buffer = PandasBuffer(codes, allow_copy=self._allow_copy)
            dtype = self._dtype_from_pandasdtype(codes.dtype)
        elif kind == DtypeKind.STRING:
            # Marshal the strings from a NumPy object array into a byte array
            buf = self._col.to_numpy()
            b = bytearray()

            # TODO: this for-loop is slow; can be implemented in Cython/C/C++ later
            for obj in buf:
                # Non-str entries (missing values) contribute no bytes.
                if isinstance(obj, str):
                    b.extend(obj.encode(encoding="utf-8"))

            # Convert the byte array to a Pandas "buffer" using
            # a NumPy array as the backing store
            buffer = PandasBuffer(np.frombuffer(b, dtype="uint8"))

            # Define the dtype for the returned buffer
            dtype = (
                DtypeKind.STRING,
                8,
                ArrowCTypes.STRING,
                Endianness.NATIVE,
            )  # note: currently only support native endianness
        else:
            raise NotImplementedError(f"Data type {self._col.dtype} not handled yet")

        return buffer, dtype

    def _get_validity_buffer(self) -> tuple[PandasBuffer, Any]:
        """
        Return the buffer containing the mask values indicating missing data and
        the buffer's associated dtype.

        Raises NoBufferPresent if null representation is not a bit or byte mask.
        Raises NotImplementedError for a non-string column whose null
        representation has no entry in ``_NO_VALIDITY_BUFFER``.
        """
        null, invalid = self.describe_null

        if self.dtype[0] == DtypeKind.STRING:
            # For now, use byte array as the mask.
            # TODO: maybe store as bit array to save space?..
            buf = self._col.to_numpy()

            # Determine the encoding for valid values
            valid = invalid == 0
            invalid = not valid

            mask = np.zeros(shape=(len(buf),), dtype=np.bool_)
            for i, obj in enumerate(buf):
                mask[i] = valid if isinstance(obj, str) else invalid

            # Convert the mask array to a Pandas "buffer" using
            # a NumPy array as the backing store
            buffer = PandasBuffer(mask)

            # Define the dtype of the returned buffer
            dtype = (DtypeKind.BOOL, 8, ArrowCTypes.BOOL, Endianness.NATIVE)

            return buffer, dtype

        try:
            msg = f"{_NO_VALIDITY_BUFFER[null]} so does not have a separate mask"
        except KeyError as err:
            # TODO: implement for other bit/byte masks?
            raise NotImplementedError("See self.describe_null") from err

        raise NoBufferPresent(msg)

    def _get_offsets_buffer(self) -> tuple[PandasBuffer, Any]:
        """
        Return the buffer containing the offset values for variable-size binary
        data (e.g., variable-length strings) and the buffer's associated dtype.

        Raises NoBufferPresent if the data buffer does not have an associated
        offsets buffer.
        """
        if self.dtype[0] == DtypeKind.STRING:
            # For each string, we need to manually determine the next offset
            values = self._col.to_numpy()
            ptr = 0
            # One more entry than there are values: entry 0 stays 0 and
            # entry i + 1 is the end position of value i.
            offsets = np.zeros(shape=(len(values) + 1,), dtype=np.int64)
            for i, v in enumerate(values):
                # For missing values (in this case, `np.nan` values)
                # we don't increment the pointer
                if isinstance(v, str):
                    b = v.encode(encoding="utf-8")
                    ptr += len(b)

                offsets[i + 1] = ptr

            # Convert the offsets to a Pandas "buffer" using
            # the NumPy array as the backing store
            buffer = PandasBuffer(offsets)

            # Assemble the buffer dtype info
            dtype = (
                DtypeKind.INT,
                64,
                ArrowCTypes.INT64,
                Endianness.NATIVE,
            )  # note: currently only support native endianness
        else:
            raise NoBufferPresent(
                "This column has a fixed-length dtype so "
                "it does not have an offsets buffer"
            )

        return buffer, dtype